fun text2vid

2026-03-16 11:57:15 +08:00 · 2024-09-19 01:07:18 +03:00 · 2024-09-19 01:07:18 +03:00 · a125f61aad
commit a125f61aad
parent 09f53ce0c0
2 changed files with 17 additions and 8 deletions
--- a/cogvideox_fun/pipeline_cogvideox_inpaint.py
+++ b/cogvideox_fun/pipeline_cogvideox_inpaint.py
@@ -674,7 +674,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
            height,
            width,
            video_length,
-            prompt_embeds.dtype,
+            self.vae.dtype,
            device,
            generator,
            latents,
@@ -721,7 +721,7 @@ class CogVideoX_Fun_Pipeline_Inpaint(DiffusionPipeline):
                        batch_size,
                        height,
                        width,
-                        prompt_embeds.dtype,
+                        self.vae.dtype,
                        device,
                        generator,
                        do_classifier_free_guidance,
--- a/nodes.py
+++ b/nodes.py
@@ -525,6 +525,7 @@ class CogVideoXFunSampler:
            "optional":{
                "start_img": ("IMAGE",),
                "end_img": ("IMAGE",),
+                "opt_empty_latent": ("LATENT",),
            },
        }
    
@@ -533,7 +534,8 @@ class CogVideoXFunSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

-    def process(self, pipeline,  positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler, start_img=None, end_img=None):
+    def process(self, pipeline,  positive, negative, video_length, base_resolution, seed, steps, cfg, scheduler, 
+                start_img=None, end_img=None, opt_empty_latent=None):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        pipe = pipeline["pipe"]
@@ -543,14 +545,21 @@ class CogVideoXFunSampler:

        mm.soft_empty_cache()

-        start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
-        end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
-        # Count most suitable height and width
-        aspect_ratio_sample_size    = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
-        original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
+        aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
+
+        if start_img is not None:
+            start_img = [to_pil(_start_img) for _start_img in start_img] if start_img is not None else None
+            end_img = [to_pil(_end_img) for _end_img in end_img] if end_img is not None else None
+            # Count most suitable height and width
+            original_width, original_height = start_img[0].size if type(start_img) is list else Image.open(start_img).size
+        else:
+            original_width = opt_empty_latent["samples"][0].shape[-1] * 8
+            original_height = opt_empty_latent["samples"][0].shape[-2] * 8
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
+        print(f"Closest size: {width}:{height}")
        
+            
        base_path = pipeline["base_path"]

        # Load Sampler