use existing T5 models

2026-07-31 17:37:57 +08:00 · 2024-08-06 04:15:40 +03:00 · 2024-08-06 04:15:40 +03:00 · b787b9a8fa
commit b787b9a8fa
parent d56e14ec1e
2 changed files with 295 additions and 179 deletions
--- a/examples/example_01.json
+++ b/examples/example_01.json
@@ -1,145 +1,37 @@
 {
-  "last_node_id": 12,
-  "last_link_id": 23,
+  "last_node_id": 31,
+  "last_link_id": 57,
  "nodes": [
    {
-      "id": 11,
-      "type": "CogVideoDecode",
-      "pos": [
-        1301,
-        352
-      ],
-      "size": {
-        "0": 210,
-        "1": 46
-      },
-      "flags": {},
-      "order": 3,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 21
-        },
-        {
-          "name": "samples",
-          "type": "LATENT",
-          "link": 22
-        }
-      ],
-      "outputs": [
-        {
-          "name": "images",
-          "type": "IMAGE",
-          "links": [
-            23
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoDecode"
-      }
-    },
-    {
-      "id": 2,
-      "type": "CogVideoEncodePrompt",
-      "pos": [
-        459,
-        485
-      ],
-      "size": [
-        408.03107827615304,
-        315.59645204258936
-      ],
-      "flags": {},
-      "order": 1,
-      "mode": 0,
-      "inputs": [
-        {
-          "name": "pipeline",
-          "type": "COGVIDEOPIPE",
-          "link": 1
-        }
-      ],
-      "outputs": [
-        {
-          "name": "embeds",
-          "type": "COGEMBEDS",
-          "links": [
-            16
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "CogVideoEncodePrompt"
-      },
-      "widgets_values": [
-        "A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.",
-        ""
-      ]
-    },
-    {
-      "id": 1,
-      "type": "DownloadAndLoadCogVideoModel",
-      "pos": [
-        460,
-        354
-      ],
-      "size": {
-        "0": 315,
-        "1": 58
-      },
-      "flags": {},
-      "order": 0,
-      "mode": 0,
-      "outputs": [
-        {
-          "name": "cogvideo_pipe",
-          "type": "COGVIDEOPIPE",
-          "links": [
-            1,
-            15
-          ],
-          "shape": 3,
-          "slot_index": 0
-        }
-      ],
-      "properties": {
-        "Node name for S&R": "DownloadAndLoadCogVideoModel"
-      },
-      "widgets_values": [
-        "fp16"
-      ]
-    },
-    {
-      "id": 10,
+      "id": 22,
      "type": "CogVideoSampler",
      "pos": [
-        920,
-        353
+        1041,
+        342
      ],
      "size": {
        "0": 315,
-        "1": 246
+        "1": 266
      },
      "flags": {},
-      "order": 2,
+      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
-          "link": 15
+          "link": 36
        },
        {
-          "name": "embeds",
-          "type": "COGEMBEDS",
-          "link": 16
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 55,
+          "slot_index": 1
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 57
        }
      ],
      "outputs": [
@@ -147,7 +39,7 @@
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
-            21
+            37
          ],
          "shape": 3
        },
@@ -155,7 +47,7 @@
          "name": "samples",
          "type": "LATENT",
          "links": [
-            22
+            38
          ],
          "shape": 3
        }
@@ -166,33 +58,75 @@
      "widgets_values": [
        480,
        720,
-        48,
+        16,
        8,
-        30,
+        25,
        6,
-        867121661458558,
+        806286757407561,
        "fixed"
      ]
    },
    {
-      "id": 12,
+      "id": 11,
+      "type": "CogVideoDecode",
+      "pos": [
+        1142,
+        658
+      ],
+      "size": {
+        "0": 210,
+        "1": 46
+      },
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "pipeline",
+          "type": "COGVIDEOPIPE",
+          "link": 37
+        },
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 38
+        }
+      ],
+      "outputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "links": [
+            51
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoDecode"
+      }
+    },
+    {
+      "id": 28,
      "type": "VHS_VideoCombine",
      "pos": [
-        1563,
-        353
+        1432,
+        150
      ],
      "size": [
-        315,
-        520.6666666666666
+        667.752197265625,
+        755.8347981770833
      ],
      "flags": {},
-      "order": 4,
+      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
-          "link": 23
+          "link": 51,
+          "slot_index": 0
        },
        {
          "name": "audio",
@@ -235,7 +169,7 @@
          "hidden": false,
          "paused": false,
          "params": {
-            "filename": "AnimateDiff_00003.mp4",
+            "filename": "AnimateDiff_00001.mp4",
            "subfolder": "",
            "type": "temp",
            "format": "video/h264-mp4",
@@ -243,66 +177,226 @@
          }
        }
      }
+    },
+    {
+      "id": 30,
+      "type": "CogVideoTextEncode",
+      "pos": [
+        500,
+        308
+      ],
+      "size": [
+        474.84501511852204,
+        164.74235966960538
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 54
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            55
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
+      ]
+    },
+    {
+      "id": 20,
+      "type": "CLIPLoader",
+      "pos": [
+        -59,
+        397
+      ],
+      "size": {
+        "0": 451.30548095703125,
+        "1": 82
+      },
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [
+            54,
+            56
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CLIPLoader"
+      },
+      "widgets_values": [
+        "t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
+        "sd3"
+      ]
+    },
+    {
+      "id": 31,
+      "type": "CogVideoTextEncode",
+      "pos": [
+        503,
+        521
+      ],
+      "size": [
+        463.01251866466464,
+        98.10446321574796
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 56
+        }
+      ],
+      "outputs": [
+        {
+          "name": "conditioning",
+          "type": "CONDITIONING",
+          "links": [
+            57
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CogVideoTextEncode"
+      },
+      "widgets_values": [
+        ""
+      ]
+    },
+    {
+      "id": 1,
+      "type": "DownloadAndLoadCogVideoModel",
+      "pos": [
+        649,
+        182
+      ],
+      "size": {
+        "0": 315,
+        "1": 58
+      },
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "outputs": [
+        {
+          "name": "cogvideo_pipe",
+          "type": "COGVIDEOPIPE",
+          "links": [
+            36
+          ],
+          "shape": 3,
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DownloadAndLoadCogVideoModel"
+      },
+      "widgets_values": [
+        "fp16"
+      ]
    }
  ],
  "links": [
    [
-      1,
+      36,
      1,
      0,
-      2,
+      22,
      0,
      "COGVIDEOPIPE"
    ],
    [
-      15,
-      1,
-      0,
-      10,
-      0,
-      "COGVIDEOPIPE"
-    ],
-    [
-      16,
-      2,
-      0,
-      10,
-      1,
-      "COGEMBEDS"
-    ],
-    [
-      21,
-      10,
+      37,
+      22,
      0,
      11,
      0,
      "COGVIDEOPIPE"
    ],
    [
+      38,
      22,
-      10,
      1,
      11,
      1,
      "LATENT"
    ],
    [
-      23,
+      51,
      11,
      0,
-      12,
+      28,
      0,
      "IMAGE"
+    ],
+    [
+      54,
+      20,
+      0,
+      30,
+      0,
+      "CLIP"
+    ],
+    [
+      55,
+      30,
+      0,
+      22,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      56,
+      20,
+      0,
+      31,
+      0,
+      "CLIP"
+    ],
+    [
+      57,
+      31,
+      0,
+      22,
+      2,
+      "CONDITIONING"
    ]
  ],
  "groups": [],
  "config": {},
  "extra": {
    "ds": {
-      "scale": 1,
+      "scale": 0.6830134553650706,
      "offset": [
-        -281.3644522995906,
-        -67.92982606602688
+        359.4381777891929,
+        334.95283678425216
      ]
    }
  },
--- a/nodes.py
+++ b/nodes.py
@@ -48,12 +48,13 @@ class DownloadAndLoadCogVideoModel:

            snapshot_download(
                repo_id="THUDM/CogVideoX-2b",
-                #ignore_patterns=["*sd-image-variations-encoder-fp16.safetensors", "fye_motion_module-fp16.safetensors"],
+                ignore_patterns=["*text_encoder*"],
                local_dir=base_path,
                local_dir_use_symlinks=False,
            )

        pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
+        

        pipeline = {
            "pipe": pipe,
@@ -72,8 +73,8 @@ class CogVideoEncodePrompt:
            }
        }

-    RETURN_TYPES = ("COGEMBEDS",)
-    RETURN_NAMES = ("embeds",)
+    RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
+    RETURN_NAMES = ("positive", "negative")
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

@@ -86,7 +87,7 @@ class CogVideoEncodePrompt:
        pipe.text_encoder.to(device)
        pipe.transformer.to(offload_device)
        
-        pos_embeds, neg_embeds = pipe.encode_prompt(
+        positive, negative = pipe.encode_prompt(
            prompt=prompt,
            negative_prompt=negative_prompt,
            do_classifier_free_guidance=True,
@@ -96,11 +97,30 @@ class CogVideoEncodePrompt:
            dtype=dtype,
        )
        pipe.text_encoder.to(offload_device)
-        embeds = {
-            "positive": pos_embeds,
-            "negative": neg_embeds,
+
+        return (positive, negative)
+    
+class CogVideoTextEncode:
+    @classmethod
+    def INPUT_TYPES(s):
+        return {"required": {
+            "clip": ("CLIP",),
+            "prompt": ("STRING", {"default": "", "multiline": True} ),
+            }
        }

+    RETURN_TYPES = ("CONDITIONING",)
+    RETURN_NAMES = ("conditioning",)
+    FUNCTION = "process"
+    CATEGORY = "CogVideoWrapper"
+
+    def process(self, clip, prompt):
+        clip.tokenizer.t5xxl.pad_to_max_length = True
+        clip.tokenizer.t5xxl.max_length = 226
+        tokens = clip.tokenize(prompt, return_word_ids=True)
+
+        embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)
+
        return (embeds, )

 class CogVideoSampler:
@@ -108,7 +128,8 @@ class CogVideoSampler:
    def INPUT_TYPES(s):
        return {"required": {
            "pipeline": ("COGVIDEOPIPE",),
-            "embeds": ("COGEMBEDS", ),
+            "positive": ("CONDITIONING", ),
+            "negative": ("CONDITIONING", ),
            "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
            "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
            "num_frames": ("INT", {"default": 48, "min": 1, "max": 100, "step": 1}),
@@ -124,11 +145,12 @@ class CogVideoSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

-    def process(self, pipeline, embeds, fps, steps, cfg, seed, height, width, num_frames):
+    def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames):
        mm.soft_empty_cache()
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        pipe = pipeline["pipe"]
+        dtype = pipeline["dtype"]

        pipe.transformer.to(device)
        generator = torch.Generator(device=device).manual_seed(seed)
@@ -140,8 +162,8 @@ class CogVideoSampler:
            num_frames = num_frames,
            fps = fps,
            guidance_scale=cfg,
-            prompt_embeds=embeds["positive"],
-            negative_prompt_embeds=embeds["negative"],
+            prompt_embeds=positive.to(dtype).to(device),
+            negative_prompt_embeds=negative.to(dtype).to(device),
            #negative_prompt_embeds=torch.zeros_like(embeds),
            generator=generator,
            output_type="latents",
@@ -206,12 +228,12 @@ class CogVideoDecode:
 NODE_CLASS_MAPPINGS = {
    "DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel,
    "CogVideoSampler": CogVideoSampler,
-    "CogVideoEncodePrompt": CogVideoEncodePrompt,
-    "CogVideoDecode": CogVideoDecode
+    "CogVideoDecode": CogVideoDecode,
+    "CogVideoTextEncode": CogVideoTextEncode
 }
 NODE_DISPLAY_NAME_MAPPINGS = {
-    "DownloadAndLoadCogVideoModel": "DownloadAndLoadCogVideoModel",
+    "DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model",
    "CogVideoSampler": "CogVideo Sampler",
-    "CogVideoEncodePrompt": "CogVideo EncodePrompt",
    "CogVideoDecode": "CogVideo Decode",
+    "CogVideoTextEncode": "CogVideo TextEncode"
    }