Modify CogVideoXfun pipeline to fix Vid2Vid sampler issues

2026-05-13 17:24:48 +08:00 · 2024-10-26 07:38:14 +08:00 · 2024-10-26 07:38:14 +08:00 · c2950dfb47
commit c2950dfb47
parent 25f16462aa
3 changed files with 756 additions and 116 deletions
--- a/cogvideox_fun/pipeline_cogvideox_inpaint.py
+++ b/cogvideox_fun/pipeline_cogvideox_inpaint.py
@ -262,111 +262,111 @@ class CogVideoX_Fun_Pipeline_Inpaint(VideoSysPipeline):
            set_pab_manager(pab_config)
    def prepare_latents(
-        self, 
+            self, 
        batch_size,
        num_channels_latents,
        height,
        width,
        video_length,
        dtype,
        device,
        generator,
        latents=None,
        video=None,
        timestep=None,
        is_strength_max=True,
        return_noise=False,
        return_video_latents=False,
        context_size=None,
        context_overlap=None,
        freenoise=False,
    ):
        shape = (
            batch_size,
            (video_length - 1) // self.vae_scale_factor_temporal + 1,
            num_channels_latents,
-            height // self.vae_scale_factor_spatial,
+            height,
-            width // self.vae_scale_factor_spatial,
+            width,
-        )
+            video_length,
-        if isinstance(generator, list) and len(generator) != batch_size:
+            dtype,
-            raise ValueError(
+            device,
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+            generator,
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            latents=None,
            video=None,
            timestep=None,
            is_strength_max=True,
            return_noise=False,
            return_video_latents=False,
            context_size=None,
            context_overlap=None,
            freenoise=False,
        ):
            shape = (
                batch_size,
                (video_length - 1) // self.vae_scale_factor_temporal + 1,
                num_channels_latents,
                height // self.vae_scale_factor_spatial,
                width // self.vae_scale_factor_spatial,
            )
            if isinstance(generator, list) and len(generator) != batch_size:
                raise ValueError(
                    f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                    f" size of {batch_size}. Make sure the batch size matches the length of the generators."
                )
-        if return_video_latents or (latents is None and not is_strength_max):
+            if return_video_latents or (latents is None and not is_strength_max):
-            video = video.to(device=device, dtype=self.vae.dtype)
+                video = video.to(device=device, dtype=self.vae.dtype)
-            
+                
-            bs = 1
+                bs = 1
-            new_video = []
+                new_video = []
-            for i in range(0, video.shape[0], bs):
+                for i in range(0, video.shape[0], bs):
-                video_bs = video[i : i + bs]
+                    video_bs = video[i : i + bs]
-                video_bs = self.vae.encode(video_bs)[0]
+                    video_bs = self.vae.encode(video_bs)[0]
-                video_bs = video_bs.sample()
+                    video_bs = video_bs.sample()
-                new_video.append(video_bs)
+                    new_video.append(video_bs)
-            video = torch.cat(new_video, dim = 0)
+                video = torch.cat(new_video, dim = 0)
-            video = video * self.vae.config.scaling_factor
+                video = video * self.vae.config.scaling_factor
-            video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
+                video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
-            video_latents = video_latents.to(device=device, dtype=dtype)
+                video_latents = video_latents.to(device=device, dtype=dtype)
-            video_latents = rearrange(video_latents, "b c f h w -> b f c h w")
+                video_latents = rearrange(video_latents, "b c f h w -> b f c h w")
-        if latents is None:
+            if latents is None:
-            noise = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=dtype)
+                noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            if freenoise:
+                if freenoise:
-                print("Applying FreeNoise")
+                    print("Applying FreeNoise")
-                # code and comments from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved)
+                    # code and comments from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved)
-                video_length = video_length // 4
+                    video_length_adjusted = video_length // 4
-                delta = context_size - context_overlap
+                    delta = context_size - context_overlap
-                for start_idx in range(0, video_length-context_size, delta):
+                    for start_idx in range(0, video_length_adjusted - context_size, delta):
-                    # start_idx corresponds to the beginning of a context window
+                        # start_idx corresponds to the beginning of a context window
-                    # goal: place shuffled in the delta region right after the end of the context window
+                        # goal: place shuffled in the delta region right after the end of the context window
-                    #       if space after context window is not enough to place the noise, adjust and finish
+                        #       if space after context window is not enough to place the noise, adjust and finish
-                    place_idx = start_idx + context_size
+                        place_idx = start_idx + context_size
-                    # if place_idx is outside the valid indexes, we are already finished
+                        # if place_idx is outside the valid indexes, we are already finished
-                    if place_idx >= video_length:
+                        if place_idx >= video_length_adjusted:
-                        break
+                            break
-                    end_idx = place_idx - 1
+                        end_idx = place_idx - 1
-                    #print("video_length:", video_length, "start_idx:", start_idx, "end_idx:", end_idx, "place_idx:", place_idx, "delta:", delta)
+                        #print("video_length:", video_length, "start_idx:", start_idx, "end_idx:", end_idx, "place_idx:", place_idx, "delta:", delta)
-                    # if there is not enough room to copy delta amount of indexes, copy limited amount and finish
+                        # if there is not enough room to copy delta amount of indexes, copy limited amount and finish
-                    if end_idx + delta >= video_length:
+                        if end_idx + delta >= video_length_adjusted:
-                        final_delta = video_length - place_idx
+                            final_delta = video_length_adjusted - place_idx
-                        # generate list of indexes in final delta region
+                            # generate list of indexes in final delta region
-                        list_idx = torch.tensor(list(range(start_idx,start_idx+final_delta)), device=torch.device("cpu"), dtype=torch.long)
+                            list_idx = torch.tensor(list(range(start_idx, start_idx + final_delta)), device=device, dtype=torch.long)
                            # shuffle list
                            list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
                            # apply shuffled indexes
                            noise[:, place_idx:place_idx + final_delta, :, :, :] = noise[:, list_idx, :, :, :]
                            break
                        # otherwise, do normal behavior
                        # generate list of indexes in delta region
                        list_idx = torch.tensor(list(range(start_idx, start_idx + delta)), device=device, dtype=torch.long)
                        # shuffle list
-                        list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
+                        list_idx = list_idx[torch.randperm(delta, generator=generator)]
                        # apply shuffled indexes
-                        noise[:, place_idx:place_idx + final_delta, :, :, :] = noise[:, list_idx, :, :, :]
+                        #print("place_idx:", place_idx, "delta:", delta, "list_idx:", list_idx)
-                        break
+                        noise[:, place_idx:place_idx + delta, :, :, :] = noise[:, list_idx, :, :, :]
                    # otherwise, do normal behavior
                    # generate list of indexes in delta region
                    list_idx = torch.tensor(list(range(start_idx,start_idx+delta)), device=torch.device("cpu"), dtype=torch.long)
                    # shuffle list
                    list_idx = list_idx[torch.randperm(delta, generator=generator)]
                    # apply shuffled indexes
                    #print("place_idx:", place_idx, "delta:", delta, "list_idx:", list_idx)
                    noise[:, place_idx:place_idx + delta, :, :, :] = noise[:, list_idx, :, :, :]
-            # if strength is 1. then initialise the latents to noise, else initial to image + noise
+                # if strength is 1. then initialise the latents to noise, else initial to image + noise
-            latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
+                latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
-            # if pure noise then scale the initial latents by the  Scheduler's init sigma
+                # if pure noise then scale the initial latents by the Scheduler's init sigma
-            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
+                latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
-            latents = latents.to(device)
+                latents = latents.to(device)
-        else:
+            else:
-            noise = latents.to(device)
+                noise = latents.to(device)
-            latents = noise * self.scheduler.init_noise_sigma
+                latents = noise * self.scheduler.init_noise_sigma
-        # scale the initial noise by the standard deviation required by the scheduler
+            # scale the initial noise by the standard deviation required by the scheduler
-        outputs = (latents,)
+            outputs = (latents,)
-        if return_noise:
+            if return_noise:
-            outputs += (noise,)
+                outputs += (noise,)
-        if return_video_latents:
+            if return_video_latents:
-            outputs += (video_latents,)
+                outputs += (video_latents,)
-        return outputs
+            return outputs
    def prepare_mask_latents(
        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance, noise_aug_strength
--- a/examples/cogvideox_fun_V2V_example_01.json
+++ b/examples/cogvideox_fun_V2V_example_01.json
@ -0,0 +1,614 @@
 {
  "last_node_id": 224,
  "last_link_id": 333,
  "nodes": [
    {
      "id": 216,
      "type": "CogVideoTextEncode",
      "pos": {
        "0": 1320,
        "1": 2224
      },
      "size": [
        400,
        200
      ],
      "flags": {},
      "order": 4,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 315
        }
      ],
      "outputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
            314
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
        "",
        1,
        true
      ]
    },
    {
      "id": 217,
      "type": "CLIPLoader",
      "pos": {
        "0": 937,
        "1": 2245
      },
      "size": {
        "0": 315,
        "1": 82
      },
      "flags": {},
      "order": 0,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "CLIP",
          "type": "CLIP",
          "links": [
            315,
            317
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CLIPLoader"
      },
      "widgets_values": [
        "t5xxl_fp8_e4m3fn.safetensors",
        "sd3"
      ]
    },
    {
      "id": 206,
      "type": "VHS_VideoCombine",
      "pos": {
        "0": 2648,
        "1": 2268
      },
      "size": [
        595.7279663085938,
        652.9874093191964
      ],
      "flags": {
        "collapsed": false
      },
      "order": 8,
      "mode": 0,
      "inputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "link": 333,
          "label": "images"
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "link": null,
          "label": "audio"
        },
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null,
          "label": "meta_batch"
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null,
          "label": "vae"
        }
      ],
      "outputs": [
        {
          "name": "Filenames",
          "type": "VHS_FILENAMES",
          "links": null,
          "shape": 3,
          "label": "Filenames"
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_VideoCombine",
        "ttNbgOverride": {
          "color": "#223",
          "bgcolor": "#335",
          "groupcolor": "#88A"
        }
      },
      "widgets_values": {
        "frame_rate": 24,
        "loop_count": 0,
        "filename_prefix": "1009/",
        "format": "video/h264-mp4",
        "pix_fmt": "yuv420p",
        "crf": 19,
        "save_metadata": false,
        "pingpong": false,
        "save_output": true,
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "filename": "1009_00373.mp4",
            "subfolder": "",
            "type": "output",
            "format": "video/h264-mp4",
            "frame_rate": 24
          },
          "muted": false
        }
      },
      "color": "#223",
      "bgcolor": "#335"
    },
    {
      "id": 220,
      "type": "CogVideoDecode",
      "pos": {
        "0": 2242.513671875,
        "1": 2250.128662109375
      },
      "size": {
        "0": 315,
        "1": 198
      },
      "flags": {},
      "order": 7,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 320
        },
        {
          "name": "samples",
          "type": "LATENT",
          "link": 321
        }
      ],
      "outputs": [
        {
          "name": "images",
          "type": "IMAGE",
          "links": [
            333
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoDecode"
      },
      "widgets_values": [
        false,
        240,
        360,
        0.2,
        0.2,
        true
      ]
    },
    {
      "id": 218,
      "type": "CogVideoTextEncode",
      "pos": {
        "0": 1318,
        "1": 2492
      },
      "size": {
        "0": 400,
        "1": 200
      },
      "flags": {},
      "order": 5,
      "mode": 0,
      "inputs": [
        {
          "name": "clip",
          "type": "CLIP",
          "link": 317
        }
      ],
      "outputs": [
        {
          "name": "conditioning",
          "type": "CONDITIONING",
          "links": [
            318
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoTextEncode"
      },
      "widgets_values": [
        "The video is not of a high quality, it has a low resolution. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, blurry camera, shaking camera. Deformation, blurry, ugly, distortion. ",
        1,
        true
      ]
    },
    {
      "id": 215,
      "type": "DownloadAndLoadCogVideoModel",
      "pos": {
        "0": 1388,
        "1": 1986
      },
      "size": {
        "0": 315,
        "1": 194
      },
      "flags": {},
      "order": 1,
      "mode": 0,
      "inputs": [
        {
          "name": "pab_config",
          "type": "PAB_CONFIG",
          "link": null
        },
        {
          "name": "block_edit",
          "type": "TRANSFORMERBLOCKS",
          "link": null
        },
        {
          "name": "lora",
          "type": "COGLORA",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            313
          ],
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "DownloadAndLoadCogVideoModel"
      },
      "widgets_values": [
        "kijai/CogVideoX-Fun-5b",
        "bf16",
        "disabled",
        "disabled",
        false
      ]
    },
    {
      "id": 214,
      "type": "CogVideoXFunVid2VidSampler",
      "pos": {
        "0": 1823,
        "1": 2249
      },
      "size": {
        "0": 380.4000244140625,
        "1": 306
      },
      "flags": {},
      "order": 6,
      "mode": 0,
      "inputs": [
        {
          "name": "pipeline",
          "type": "COGVIDEOPIPE",
          "link": 313
        },
        {
          "name": "positive",
          "type": "CONDITIONING",
          "link": 314
        },
        {
          "name": "negative",
          "type": "CONDITIONING",
          "link": 318
        },
        {
          "name": "validation_video",
          "type": "IMAGE",
          "link": 332
        },
        {
          "name": "context_options",
          "type": "COGCONTEXT",
          "link": 327
        }
      ],
      "outputs": [
        {
          "name": "cogvideo_pipe",
          "type": "COGVIDEOPIPE",
          "links": [
            320
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "samples",
          "type": "LATENT",
          "links": [
            321
          ],
          "slot_index": 1,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoXFunVid2VidSampler"
      },
      "widgets_values": [
        49,
        768,
        15861381752719,
        "fixed",
        30,
        7,
        "DDIM",
        0.5
      ]
    },
    {
      "id": 222,
      "type": "CogVideoContextOptions",
      "pos": {
        "0": 1379,
        "1": 2771
      },
      "size": [
        315,
        154
      ],
      "flags": {},
      "order": 2,
      "mode": 0,
      "inputs": [],
      "outputs": [
        {
          "name": "context_options",
          "type": "COGCONTEXT",
          "links": [
            327
          ],
          "slot_index": 0,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "CogVideoContextOptions"
      },
      "widgets_values": [
        "uniform_standard",
        48,
        4,
        4,
        true
      ]
    },
    {
      "id": 209,
      "type": "VHS_LoadVideo",
      "pos": {
        "0": 286,
        "1": 2110
      },
      "size": [
        426.63287353515625,
        508.3616420200893
      ],
      "flags": {},
      "order": 3,
      "mode": 0,
      "inputs": [
        {
          "name": "meta_batch",
          "type": "VHS_BatchManager",
          "link": null
        },
        {
          "name": "vae",
          "type": "VAE",
          "link": null
        }
      ],
      "outputs": [
        {
          "name": "IMAGE",
          "type": "IMAGE",
          "links": [
            332
          ],
          "slot_index": 0,
          "shape": 3
        },
        {
          "name": "frame_count",
          "type": "INT",
          "links": null,
          "shape": 3
        },
        {
          "name": "audio",
          "type": "AUDIO",
          "links": null,
          "shape": 3
        },
        {
          "name": "video_info",
          "type": "VHS_VIDEOINFO",
          "links": null,
          "shape": 3
        }
      ],
      "properties": {
        "Node name for S&R": "VHS_LoadVideo"
      },
      "widgets_values": {
        "video": "CogVideoX-Fun_00006.mp4",
        "force_rate": 0,
        "force_size": "Disabled",
        "custom_width": 512,
        "custom_height": 512,
        "frame_load_cap": 0,
        "skip_first_frames": 0,
        "select_every_nth": 3,
        "choose video to upload": "image",
        "videopreview": {
          "hidden": false,
          "paused": false,
          "params": {
            "frame_load_cap": 0,
            "skip_first_frames": 0,
            "force_rate": 0,
            "filename": "CogVideoX-Fun_00006.mp4",
            "type": "input",
            "format": "video/mp4",
            "select_every_nth": 3
          },
          "muted": false
        }
      },
      "color": "#223",
      "bgcolor": "#335"
    }
  ],
  "links": [
    [
      313,
      215,
      0,
      214,
      0,
      "COGVIDEOPIPE"
    ],
    [
      314,
      216,
      0,
      214,
      1,
      "CONDITIONING"
    ],
    [
      315,
      217,
      0,
      216,
      0,
      "CLIP"
    ],
    [
      317,
      217,
      0,
      218,
      0,
      "CLIP"
    ],
    [
      318,
      218,
      0,
      214,
      2,
      "CONDITIONING"
    ],
    [
      320,
      214,
      0,
      220,
      0,
      "COGVIDEOPIPE"
    ],
    [
      321,
      214,
      1,
      220,
      1,
      "LATENT"
    ],
    [
      327,
      222,
      0,
      214,
      4,
      "COGCONTEXT"
    ],
    [
      332,
      209,
      0,
      214,
      3,
      "IMAGE"
    ],
    [
      333,
      220,
      0,
      206,
      0,
      "IMAGE"
    ]
  ],
  "groups": [
    {
      "title": "cogvideoxfun v2v sampler",
      "bounding": [
        78,
        1820,
        3385,
        1603
      ],
      "color": "#3f789e",
      "font_size": 24,
      "flags": {}
    }
  ],
  "config": {},
  "extra": {
    "ds": {
      "scale": 0.3797498335832498,
      "offset": [
        590.2813176508071,
        -1550.4283526080162
      ]
    }
  },
  "version": 0.4
 }
--- a/nodes.py
+++ b/nodes.py
@ -1538,6 +1538,9 @@ class CogVideoXFunVid2VidSampler:
                "denoise_strength": ("FLOAT", {"default": 0.70, "min": 0.05, "max": 1.00, "step": 0.01}),
                "validation_video": ("IMAGE",),
            },
            "optional": {
                "context_options": ("COGCONTEXT", ),
            },
        }
    RETURN_TYPES = ("COGVIDEOPIPE", "LATENT",)
@ -1545,8 +1548,7 @@ class CogVideoXFunVid2VidSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"
-    def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler, 
+    def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler, validation_video, context_options=None):
                validation_video):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        pipe = pipeline["pipe"]
@ -1562,15 +1564,20 @@ class CogVideoXFunVid2VidSampler:
        mm.soft_empty_cache()
        # Count most suitable height and width
-        aspect_ratio_sample_size    = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
+        aspect_ratio_sample_size = {key: [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(validation_video[0]).size
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")
        # Handle context_options and adjust scheduler if needed
        if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
            logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
            scheduler = "CogVideoXDDIM"
        # Load Sampler
        scheduler_config = pipeline["scheduler_config"]
        if scheduler in scheduler_mapping:
            noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
@ -1578,7 +1585,15 @@ class CogVideoXFunVid2VidSampler:
        else:
            raise ValueError(f"Unknown scheduler: {scheduler}")
-        generator = torch.Generator(device=torch.device("cpu")).manual_seed(seed)
+        if context_options is not None:
            context_frames = context_options["context_frames"] // 4
            context_stride = context_options["context_stride"] // 4
            context_overlap = context_options["context_overlap"] // 4
        else:
            context_frames, context_stride, context_overlap = None, None, None
        # Create generator on the correct device
        generator = torch.Generator(device=device).manual_seed(seed)
        autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
        autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
@ -1586,12 +1601,33 @@ class CogVideoXFunVid2VidSampler:
            video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
            input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))
-            # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
+            # Move tensors to the correct device
-            #     pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
+            input_video = input_video.to(device)
            input_video_mask = input_video_mask.to(device)
            if clip_image is not None:
                clip_image = clip_image.to(device)
            # Move positive and negative prompts to the correct device
            positive = positive.to(dtype).to(device)
            negative = negative.to(dtype).to(device)
            # Add print statements to debug devices
            print(f"Device Information:")
            print(f"  device: {device}")
            print(f"  positive.device: {positive.device}")
            print(f"  negative.device: {negative.device}")
            print(f"  input_video.device: {input_video.device}")
            print(f"  input_video_mask.device: {input_video_mask.device}")
            if clip_image is not None:
                print(f"  clip_image.device: {clip_image.device}")
            print(f"  generator device: {generator.device}")
            print(f"  pipe.device: {device}")
            print(f"  pipe.vae.device: {next(pipe.vae.parameters()).device}")
            print(f"  pipe.transformer.device: {next(pipe.transformer.parameters()).device}")
            common_params = {
-                "prompt_embeds": positive.to(dtype).to(device),
+                "prompt_embeds": positive,
-                "negative_prompt_embeds": negative.to(dtype).to(device),
+                "negative_prompt_embeds": negative,
                "num_frames": video_length,
                "height": height,
                "width": width,
@ -1605,25 +1641,15 @@ class CogVideoXFunVid2VidSampler:
                **common_params,
                video=input_video,
                mask_video=input_video_mask,
-                strength=float(denoise_strength)
+                strength=float(denoise_strength),
                context_schedule=context_options["context_schedule"] if context_options is not None else None,
                context_frames=context_frames,
                context_stride=context_stride,
                context_overlap=context_overlap,
                freenoise=context_options["freenoise"] if context_options is not None else None
            )
            # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
            #     pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
        return (pipeline, {"samples": latents})
 def add_noise_to_reference_video(image, ratio=None):
    """Return ``image`` with per-sample Gaussian noise added.

    Each entry along the first axis gets its own noise scale. When ``ratio``
    is None the scale is ``exp(s)`` with ``s`` drawn from a normal
    distribution (mean -3.0, std 0.5); otherwise every sample uses ``ratio``.
    Elements of ``image`` that equal -1 exactly receive no noise.

    Args:
        image: Tensor to perturb. The scale is broadcast as
            ``(N, 1, 1, 1, 1)``, so this is presumably a 5-D tensor with
            the batch axis first -- TODO confirm against callers.
        ratio: Optional fixed noise scale applied to every sample.

    Returns:
        A new tensor equal to ``image`` plus the masked noise.
    """
    batch = image.shape[0]

    if ratio is not None:
        # Fixed scale shared by every sample in the batch.
        scale = torch.ones((batch,)).to(image.device, image.dtype) * ratio
    else:
        # Random log-normal scale, sampled once per batch entry.
        log_scale = torch.normal(mean=-3.0, std=0.5, size=(batch,)).to(image.device)
        scale = torch.exp(log_scale).to(image.dtype)

    perturbation = torch.randn_like(image) * scale[:, None, None, None, None]

    # Positions holding exactly -1 are left untouched.
    untouched = image == -1
    perturbation = torch.where(untouched, torch.zeros_like(image), perturbation)

    return image + perturbation
 class CogVideoControlImageEncode:
    @classmethod
    def INPUT_TYPES(s):