use existing T5 models

This commit is contained in:
kijai 2024-08-06 04:15:40 +03:00
parent d56e14ec1e
commit b787b9a8fa
2 changed files with 295 additions and 179 deletions

View File

@ -1,145 +1,37 @@
{
"last_node_id": 12,
"last_link_id": 23,
"last_node_id": 31,
"last_link_id": 57,
"nodes": [
{
"id": 11,
"type": "CogVideoDecode",
"pos": [
1301,
352
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 21
},
{
"name": "samples",
"type": "LATENT",
"link": 22
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
23
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 2,
"type": "CogVideoEncodePrompt",
"pos": [
459,
485
],
"size": [
408.03107827615304,
315.59645204258936
],
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 1
}
],
"outputs": [
{
"name": "embeds",
"type": "COGEMBEDS",
"links": [
16
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoEncodePrompt"
},
"widgets_values": [
"A detailed wooden toy ship with intricately carved masts and sails is seen gliding smoothly over a plush, blue carpet that mimics the waves of the sea. The ship's hull is painted a rich brown, with tiny windows. The carpet, soft and textured, provides a perfect backdrop, resembling an oceanic expanse. Surrounding the ship are various other toys and children's items, hinting at a playful environment. The scene captures the innocence and imagination of childhood, with the toy ship's journey symbolizing endless adventures in a whimsical, indoor setting.",
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
460,
354
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
1,
15
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"fp16"
]
},
{
"id": 10,
"id": 22,
"type": "CogVideoSampler",
"pos": [
920,
353
1041,
342
],
"size": {
"0": 315,
"1": 246
"1": 266
},
"flags": {},
"order": 2,
"order": 4,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 15
"link": 36
},
{
"name": "embeds",
"type": "COGEMBEDS",
"link": 16
"name": "positive",
"type": "CONDITIONING",
"link": 55,
"slot_index": 1
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 57
}
],
"outputs": [
@ -147,7 +39,7 @@
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
21
37
],
"shape": 3
},
@ -155,7 +47,7 @@
"name": "samples",
"type": "LATENT",
"links": [
22
38
],
"shape": 3
}
@ -166,33 +58,75 @@
"widgets_values": [
480,
720,
48,
16,
8,
30,
25,
6,
867121661458558,
806286757407561,
"fixed"
]
},
{
"id": 12,
"id": 11,
"type": "CogVideoDecode",
"pos": [
1142,
658
],
"size": {
"0": 210,
"1": 46
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 37
},
{
"name": "samples",
"type": "LATENT",
"link": 38
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
51
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
}
},
{
"id": 28,
"type": "VHS_VideoCombine",
"pos": [
1563,
353
1432,
150
],
"size": [
315,
520.6666666666666
667.752197265625,
755.8347981770833
],
"flags": {},
"order": 4,
"order": 6,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 23
"link": 51,
"slot_index": 0
},
{
"name": "audio",
@ -235,7 +169,7 @@
"hidden": false,
"paused": false,
"params": {
"filename": "AnimateDiff_00003.mp4",
"filename": "AnimateDiff_00001.mp4",
"subfolder": "",
"type": "temp",
"format": "video/h264-mp4",
@ -243,66 +177,226 @@
}
}
}
},
{
"id": 30,
"type": "CogVideoTextEncode",
"pos": [
500,
308
],
"size": [
474.84501511852204,
164.74235966960538
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 54
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
55
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. The panda's fluffy paws strum a miniature\nacoustic guitar, producing soft, melodic tunes. Nearby, a few other pandas gather, watching curiously and some clapping in rhythm. Sunlight filters\nthrough the tall bamboo, casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. The\nbackground includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical atmosphere of this unique musical\nperformance."
]
},
{
"id": 20,
"type": "CLIPLoader",
"pos": [
-59,
397
],
"size": {
"0": 451.30548095703125,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
54,
56
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5\\google_t5-v1_1-xxl_encoderonly-fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 31,
"type": "CogVideoTextEncode",
"pos": [
503,
521
],
"size": [
463.01251866466464,
98.10446321574796
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 56
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
57
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
""
]
},
{
"id": 1,
"type": "DownloadAndLoadCogVideoModel",
"pos": [
649,
182
],
"size": {
"0": 315,
"1": 58
},
"flags": {},
"order": 1,
"mode": 0,
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
36
],
"shape": 3,
"slot_index": 0
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"fp16"
]
}
],
"links": [
[
1,
36,
1,
0,
2,
22,
0,
"COGVIDEOPIPE"
],
[
15,
1,
0,
10,
0,
"COGVIDEOPIPE"
],
[
16,
2,
0,
10,
1,
"COGEMBEDS"
],
[
21,
10,
37,
22,
0,
11,
0,
"COGVIDEOPIPE"
],
[
38,
22,
10,
1,
11,
1,
"LATENT"
],
[
23,
51,
11,
0,
12,
28,
0,
"IMAGE"
],
[
54,
20,
0,
30,
0,
"CLIP"
],
[
55,
30,
0,
22,
1,
"CONDITIONING"
],
[
56,
20,
0,
31,
0,
"CLIP"
],
[
57,
31,
0,
22,
2,
"CONDITIONING"
]
],
"groups": [],
"config": {},
"extra": {
"ds": {
"scale": 1,
"scale": 0.6830134553650706,
"offset": [
-281.3644522995906,
-67.92982606602688
359.4381777891929,
334.95283678425216
]
}
},

View File

@ -48,12 +48,13 @@ class DownloadAndLoadCogVideoModel:
snapshot_download(
repo_id="THUDM/CogVideoX-2b",
#ignore_patterns=["*sd-image-variations-encoder-fp16.safetensors", "fye_motion_module-fp16.safetensors"],
ignore_patterns=["*text_encoder*"],
local_dir=base_path,
local_dir_use_symlinks=False,
)
pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
pipeline = {
"pipe": pipe,
@ -72,8 +73,8 @@ class CogVideoEncodePrompt:
}
}
RETURN_TYPES = ("COGEMBEDS",)
RETURN_NAMES = ("embeds",)
RETURN_TYPES = ("CONDITIONING", "CONDITIONING")
RETURN_NAMES = ("positive", "negative")
FUNCTION = "process"
CATEGORY = "CogVideoWrapper"
@ -86,7 +87,7 @@ class CogVideoEncodePrompt:
pipe.text_encoder.to(device)
pipe.transformer.to(offload_device)
pos_embeds, neg_embeds = pipe.encode_prompt(
positive, negative = pipe.encode_prompt(
prompt=prompt,
negative_prompt=negative_prompt,
do_classifier_free_guidance=True,
@ -96,11 +97,30 @@ class CogVideoEncodePrompt:
dtype=dtype,
)
pipe.text_encoder.to(offload_device)
embeds = {
"positive": pos_embeds,
"negative": neg_embeds,
return (positive, negative)
class CogVideoTextEncode:
    """ComfyUI node that encodes a prompt with an already-loaded CLIP/T5 object.

    The node takes a ``CLIP`` input and a prompt string and returns the
    embeddings produced by ``clip.encode_from_tokens`` as a single
    ``CONDITIONING`` output.
    """

    # Token length the T5 tokenizer is padded to while encoding.
    # NOTE(review): presumably the text sequence length CogVideoX expects --
    # confirm against the pipeline configuration.
    MAX_TOKEN_LENGTH = 226

    @classmethod
    def INPUT_TYPES(s):
        """Declare the node inputs: a CLIP object and a multiline prompt."""
        return {
            "required": {
                "clip": ("CLIP",),
                "prompt": ("STRING", {"default": "", "multiline": True}),
            }
        }

    RETURN_TYPES = ("CONDITIONING",)
    RETURN_NAMES = ("conditioning",)
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

    def process(self, clip, prompt):
        """Tokenize and encode ``prompt`` with padding forced to the fixed length.

        Args:
            clip: Object exposing ``tokenizer.t5xxl``, ``tokenize`` and
                ``encode_from_tokens``.
            prompt: Text to encode.

        Returns:
            A one-element tuple holding the value returned by
            ``clip.encode_from_tokens``.

        The tokenizer's ``pad_to_max_length`` / ``max_length`` settings are
        overridden only for the duration of this call and restored afterwards,
        so other nodes sharing the same ``clip`` object are not affected.
        """
        t5_tokenizer = clip.tokenizer.t5xxl
        unset = object()  # marks attributes that did not exist beforehand
        overrides = {
            "pad_to_max_length": True,
            "max_length": self.MAX_TOKEN_LENGTH,
        }
        previous = {
            name: getattr(t5_tokenizer, name, unset) for name in overrides
        }

        try:
            for name, value in overrides.items():
                setattr(t5_tokenizer, name, value)
            tokens = clip.tokenize(prompt, return_word_ids=True)
            embeds = clip.encode_from_tokens(
                tokens, return_pooled=False, return_dict=False
            )
        finally:
            # Put the shared tokenizer back exactly as it was found, even if
            # tokenizing or encoding raised.
            for name, value in previous.items():
                if value is unset:
                    if hasattr(t5_tokenizer, name):
                        delattr(t5_tokenizer, name)
                else:
                    setattr(t5_tokenizer, name, value)

        return (embeds,)
class CogVideoSampler:
@ -108,7 +128,8 @@ class CogVideoSampler:
def INPUT_TYPES(s):
return {"required": {
"pipeline": ("COGVIDEOPIPE",),
"embeds": ("COGEMBEDS", ),
"positive": ("CONDITIONING", ),
"negative": ("CONDITIONING", ),
"height": ("INT", {"default": 480, "min": 128, "max": 2048, "step": 8}),
"width": ("INT", {"default": 720, "min": 128, "max": 2048, "step": 8}),
"num_frames": ("INT", {"default": 48, "min": 1, "max": 100, "step": 1}),
@ -124,11 +145,12 @@ class CogVideoSampler:
FUNCTION = "process"
CATEGORY = "CogVideoWrapper"
def process(self, pipeline, embeds, fps, steps, cfg, seed, height, width, num_frames):
def process(self, pipeline, positive, negative, fps, steps, cfg, seed, height, width, num_frames):
mm.soft_empty_cache()
device = mm.get_torch_device()
offload_device = mm.unet_offload_device()
pipe = pipeline["pipe"]
dtype = pipeline["dtype"]
pipe.transformer.to(device)
generator = torch.Generator(device=device).manual_seed(seed)
@ -140,8 +162,8 @@ class CogVideoSampler:
num_frames = num_frames,
fps = fps,
guidance_scale=cfg,
prompt_embeds=embeds["positive"],
negative_prompt_embeds=embeds["negative"],
prompt_embeds=positive.to(dtype).to(device),
negative_prompt_embeds=negative.to(dtype).to(device),
#negative_prompt_embeds=torch.zeros_like(embeds),
generator=generator,
output_type="latents",
@ -206,12 +228,12 @@ class CogVideoDecode:
NODE_CLASS_MAPPINGS = {
"DownloadAndLoadCogVideoModel": DownloadAndLoadCogVideoModel,
"CogVideoSampler": CogVideoSampler,
"CogVideoEncodePrompt": CogVideoEncodePrompt,
"CogVideoDecode": CogVideoDecode
"CogVideoDecode": CogVideoDecode,
"CogVideoTextEncode": CogVideoTextEncode
}
NODE_DISPLAY_NAME_MAPPINGS = {
"DownloadAndLoadCogVideoModel": "DownloadAndLoadCogVideoModel",
"DownloadAndLoadCogVideoModel": "(Down)load CogVideo Model",
"CogVideoSampler": "CogVideo Sampler",
"CogVideoEncodePrompt": "CogVideo EncodePrompt",
"CogVideoDecode": "CogVideo Decode",
"CogVideoTextEncode": "CogVideo TextEncode"
}