use existing T5 models

This commit is contained in:
kijai 2024-08-06 04:15:40 +03:00
parent d56e14ec1e
commit b787b9a8fa
2 changed files with 295 additions and 179 deletions

View File

@ -1,145 +1,37 @@
{ {
"last_node_id": 12, "last_node_id": 31,
"last_link_id": 23, "last_link_id": 57,
"nodes": [ "nodes": [
{ {
"id": 11, "id": 22,
"type": "CogVideoDecode",
"pos": [
1301,
352
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 21
},
{
"name": "samples",
"type": "LATENT",
"link": 22
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
23
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 2,
"type": "CogVideoEncodePrompt",
"pos": [
459,
485
],
"size": [
408.03107827615304,
315.59645204258936
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 1
}
],
"outputs": [
{
"name": "embeds",
"type": "COGEMBEDS",
"links": [
16
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoEncodePrompt"
},
"widgets_values": [
"A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.",
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
460,
354
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
1,
15
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"fp16"
]
},
{
"id": 10,
"type": "CogVideoSampler", "type": "CogVideoSampler",
"pos": [ "pos": [
920, 1041,
353 342
], ],
"size": { "size": {
"0": 315, "0": 315,
"1": 246 "1": 266
}, },
"flags": {}, "flags": {},
"order": 2, "order": 4,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "pipeline", "name": "pipeline",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"link": 15 "link": 36
}, },
{ {
"name": "embeds", "name": "positive",
"type": "COGEMBEDS", "type": "CONDITIONING",
"link": 16 "link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
} }
], ],
"outputs": [ "outputs": [
@ -147,7 +39,7 @@
"name": "cogvideo_pipe", "name": "cogvideo_pipe",
"type": "COGVIDEOPIPE", "type": "COGVIDEOPIPE",
"links": [ "links": [
21 37
], ],
"shape": 3 "shape": 3
}, },
@ -155,7 +47,7 @@
"name": "samples", "name": "samples",
"type": "LATENT", "type": "LATENT",
"links": [ "links": [
22 38
], ],
"shape": 3 "shape": 3
} }
@ -166,33 +58,75 @@
"widgets_values": [ "widgets_values": [
480, 480,
720, 720,
48, 16,
8, 8,
30, 25,
6, 6,
867121661458558, 806286757407561,
"fixed" "fixed"
] ]
}, },
{ {
"id": 12, "id": 11,
"type": "CogVideoDecode",
"pos": [
1142,
658
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
51
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 28,
"type": "VHS_VideoCombine", "type": "VHS_VideoCombine",
"pos": [ "pos": [
1563, 1432,
353 150
], ],
"size": [ "size": [
315, 667.752197265625,
520.6666666666666 755.8347981770833
], ],
"flags": {}, "flags": {},
"order": 4, "order": 6,
"mode": 0, "mode": 0,
"inputs": [ "inputs": [
{ {
"name": "images", "name": "images",
"type": "IMAGE", "type": "IMAGE",
"link": 23 "link": 51,
"slot_index": 0
}, },
{ {
"name": "audio", "name": "audio",
@ -235,7 +169,7 @@
"hidden": false, "hidden": false,
"paused": false, "paused": false,
"params": { "params": {
"filename": "AnimateDiff_00003.mp4", "filename": "AnimateDiff_00001.mp4",
"subfolder": "", "subfolder": "",
"type": "temp", "type": "temp",
"format": "video/h264-mp4", "format": "video/h264-mp4",
@ -243,66 +177,226 @@
} }
} }
} }
},
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": [
500,
308
],
"size": [
474.84501511852204,
164.74235966960538
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
55
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
]
},
{
"id": 20,
"type": "CLIPLoader",
"pos": [
-59,
397
],
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": [
463.01251866466464,
98.10446321574796
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
57
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
649,
182
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
36
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"fp16"
]
} }
], ],
"links": [ "links": [
[ [
1, 36,
1, 1,
0, 0,
2, 22,
0, 0,
"COGVIDEOPIPE" "COGVIDEOPIPE"
], ],
[ [
15, 37,
1, 22,
0,
10,
0,
"COGVIDEOPIPE"
],
[
16,
2,
0,
10,
1,
"COGEMBEDS"
],
[
21,
10,
0, 0,
11, 11,
0, 0,
"COGVIDEOPIPE" "COGVIDEOPIPE"
], ],
[ [
38,
22, 22,
10,
1, 1,
11, 11,
1, 1,
"LATENT" "LATENT"
], ],
[ [
23, 51,
11, 11,
0, 0,
12, 28,
0, 0,
"IMAGE" "IMAGE"
],
[
54,
20,
0,
30,
0,
"CLIP"
],
[
55,
30,
0,
22,
1,
"CONDITIONING"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
57,
31,
0,
22,
2,
"CONDITIONING"
] ]
], ],
"groups": [], "groups": [],
"config": {}, "config": {},
"extra": { "extra": {
"ds": { "ds": {
"scale": 1, "scale": 0.6830134553650706,
"offset": [ "offset": [
-281.3644522995906, 359.4381777891929,
-67.92982606602688 334.95283678425216
] ]
} }
}, },

View File

@ -48,12 +48,13 @@ class DownloadAndLoadCogVideoModel:
snapshot_download( snapshot_download(
repo_id="THUDM/CogVideoX-2b", repo_id="THUDM/CogVideoX-2b",
#ignore_patterns=["*sd-image-variations-encoder-fp16.safetensors", "fye_motion_module-fp16.safetensors"], ignore_patterns=["*text_encoder*"],
local_dir=base_path, local_dir=base_path,
local_dir_use_symlinks=False, local_dir_use_symlinks=False,
) )
pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device) pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
pipeline = { pipeline = {
"pipe": pipe, "pipe": pipe,
@ -72,8 +73,8 @@ class CogVideoEncodePrompt:
} }
} }
RETURN_TYPES = ("COGEMBEDS",) RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
RETURN_NAMES = ("embeds",) RETURN_NAMES = ("positive", "negative")
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
@ -86,7 +87,7 @@ class CogVideoEncodePrompt:
pipe.text_encoder.to(device) pipe.text_encoder.to(device)
pipe.transformer.to(offload_device) pipe.transformer.to(offload_device)
pos_embeds, neg_embeds = pipe.encode_prompt( positive, negative = pipe.encode_prompt(
prompt=prompt, prompt=prompt,
negative_prompt=negative_prompt, negative_prompt=negative_prompt,
do_classifier_free_guidance=True, do_classifier_free_guidance=True,
@ -96,11 +97,30 @@ class CogVideoEncodePrompt:
dtype=dtype, dtype=dtype,
) )
pipe.text_encoder.to(offload_device) pipe.text_encoder.to(offload_device)
embeds = {
"positive": pos_embeds, return (positive, negative)
"negative": neg_embeds,
class CogVideoTextEncode:
@classmethod
def INPUT_TYPES(s):
return {"required": {
"clip": ("CLIP",),
"prompt": ("STRING", {"default": "", "multiline": True} ),
}
} }
RETURN_TYPES = ("CONDITIONING",)
RETURN_NAMES = ("conditioning",)
FUNCTION = "process"
CATEGORY = "CogVideoWrapper"
def process(self, clip, prompt):
clip.tokenizer.t5xxl.pad_to_max_length = True
clip.tokenizer.t5xxl.max_length = 226
tokens = clip.tokenize(prompt, return_word_ids=True)
embeds = clip.encode_from_tokens(tokens, return_pooled=False, return_dict=False)
return (embeds, ) return (embeds, )
class CogVideoSampler: class CogVideoSampler:
@ -108,7 +128,8 @@ class CogVideoSampler:
def INPUT_TYPES(s): def INPUT_TYPES(s):
return {"required": { return {"required": {
"pipeline": ("COGVIDEOPIPE",), "pipeline": ("COGVIDEOPIPE",),
"embeds": ("COGEMBEDS", ), "positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}), "height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}), "width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"num_frames": ("INT", {"default": 48, "min": 1, "max": 100, "step": 1}), "num_frames": ("INT", {"default": 48, "min": 1, "max": 100, "step": 1}),
@ -124,11 +145,12 @@ class CogVideoSampler:
FUNCTION = "process" FUNCTION = "process"
CATEGORY = "CogVideoWrapper" CATEGORY = "CogVideoWrapper"
def process(self, pipeline, embeds, fps, steps, cfg, seed, height, width, num_frames): def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames):
mm.soft_empty_cache() mm.soft_empty_cache()
device = mm.get_torch_device() device = mm.get_torch_device()
offload_device = mm.unet_offload_device() offload_device = mm.unet_offload_device()
pipe = pipeline["pipe"] pipe = pipeline["pipe"]
dtype = pipeline["dtype"]
pipe.transformer.to(device) pipe.transformer.to(device)
generator = torch.Generator(device=device).manual_seed(seed) generator = torch.Generator(device=device).manual_seed(seed)
@ -140,8 +162,8 @@ class CogVideoSampler:
num_frames = num_frames, num_frames = num_frames,
fps = fps, fps = fps,
guidance_scale=cfg, guidance_scale=cfg,
prompt_embeds=embeds["positive"], prompt_embeds=positive.to(dtype).to(device),
negative_prompt_embeds=embeds["negative"], negative_prompt_embeds=negative.to(dtype).to(device),
#negative_prompt_embeds=torch.zeros_like(embeds), #negative_prompt_embeds=torch.zeros_like(embeds),
generator=generator, generator=generator,
output_type="latents", output_type="latents",
@ -206,12 +228,12 @@ class CogVideoDecode:
NODE_CLASS_MAPPINGS = { NODE_CLASS_MAPPINGS = {
"DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel, "DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel,
"CogVideoSampler": CogVideoSampler, "CogVideoSampler": CogVideoSampler,
"CogVideoEncodePrompt": CogVideoEncodePrompt, "CogVideoDecode": CogVideoDecode,
"CogVideoDecode": CogVideoDecode "CogVideoTextEncode": CogVideoTextEncode
} }
NODE_DISPLAY_NAME_MAPPINGS = { NODE_DISPLAY_NAME_MAPPINGS = {
"DownloadAndLoadCogVideoModel": "DownloadAndLoadCogVideoModel", "DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model",
"CogVideoSampler": "CogVideo Sampler", "CogVideoSampler": "CogVideo Sampler",
"CogVideoEncodePrompt": "CogVideo EncodePrompt",
"CogVideoDecode": "CogVideo Decode", "CogVideoDecode": "CogVideo Decode",
"CogVideoTextEncode": "CogVideo TextEncode"
} }