Modify CogVideoXfun pipeline to fix Vid2Vid sampler issues

This commit is contained in:
wangxi 2024-10-26 07:38:14 +08:00
parent 25f16462aa
commit c2950dfb47
3 changed files with 756 additions and 116 deletions

View File

@@ -262,111 +262,111 @@ class CogVideoX_Fun_Pipeline_Inpaint(VideoSysPipeline):
         set_pab_manager(pab_config)

    def prepare_latents(
        self,
        batch_size,
        num_channels_latents,
        height,
        width,
        video_length,
        dtype,
        device,
        generator,
        latents=None,
        video=None,
        timestep=None,
        is_strength_max=True,
        return_noise=False,
        return_video_latents=False,
        context_size=None,
        context_overlap=None,
        freenoise=False,
    ):
        shape = (
            batch_size,
            (video_length - 1) // self.vae_scale_factor_temporal + 1,
            num_channels_latents,
            height // self.vae_scale_factor_spatial,
            width // self.vae_scale_factor_spatial,
        )
        if isinstance(generator, list) and len(generator) != batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
            )

        if return_video_latents or (latents is None and not is_strength_max):
            video = video.to(device=device, dtype=self.vae.dtype)
            bs = 1
            new_video = []
            for i in range(0, video.shape[0], bs):
                video_bs = video[i : i + bs]
                video_bs = self.vae.encode(video_bs)[0]
                video_bs = video_bs.sample()
                new_video.append(video_bs)
            video = torch.cat(new_video, dim=0)
            video = video * self.vae.config.scaling_factor

            video_latents = video.repeat(batch_size // video.shape[0], 1, 1, 1, 1)
            video_latents = video_latents.to(device=device, dtype=dtype)
            video_latents = rearrange(video_latents, "b c f h w -> b f c h w")

        if latents is None:
-           noise = randn_tensor(shape, generator=generator, device=torch.device("cpu"), dtype=dtype)
+           noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
            if freenoise:
                print("Applying FreeNoise")
                # code and comments from AnimateDiff-Evolved by Kosinkadink (https://github.com/Kosinkadink/ComfyUI-AnimateDiff-Evolved)
-               video_length = video_length // 4
+               video_length_adjusted = video_length // 4
                delta = context_size - context_overlap
-               for start_idx in range(0, video_length-context_size, delta):
+               for start_idx in range(0, video_length_adjusted - context_size, delta):
                    # start_idx corresponds to the beginning of a context window
                    # goal: place shuffled in the delta region right after the end of the context window
                    # if space after context window is not enough to place the noise, adjust and finish
                    place_idx = start_idx + context_size
                    # if place_idx is outside the valid indexes, we are already finished
-                   if place_idx >= video_length:
+                   if place_idx >= video_length_adjusted:
                        break
                    end_idx = place_idx - 1
                    #print("video_length:", video_length, "start_idx:", start_idx, "end_idx:", end_idx, "place_idx:", place_idx, "delta:", delta)
                    # if there is not enough room to copy delta amount of indexes, copy limited amount and finish
-                   if end_idx + delta >= video_length:
-                       final_delta = video_length - place_idx
+                   if end_idx + delta >= video_length_adjusted:
+                       final_delta = video_length_adjusted - place_idx
                        # generate list of indexes in final delta region
-                       list_idx = torch.tensor(list(range(start_idx,start_idx+final_delta)), device=torch.device("cpu"), dtype=torch.long)
+                       list_idx = torch.tensor(list(range(start_idx, start_idx + final_delta)), device=device, dtype=torch.long)
                        # shuffle list
                        list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
                        # apply shuffled indexes
                        noise[:, place_idx:place_idx + final_delta, :, :, :] = noise[:, list_idx, :, :, :]
                        break
                    # otherwise, do normal behavior
                    # generate list of indexes in delta region
-                   list_idx = torch.tensor(list(range(start_idx,start_idx+delta)), device=torch.device("cpu"), dtype=torch.long)
+                   list_idx = torch.tensor(list(range(start_idx, start_idx + delta)), device=device, dtype=torch.long)
                    # shuffle list
-                   list_idx = list_idx[torch.randperm(final_delta, generator=generator)]
+                   list_idx = list_idx[torch.randperm(delta, generator=generator)]
                    # apply shuffled indexes
                    #print("place_idx:", place_idx, "delta:", delta, "list_idx:", list_idx)
                    noise[:, place_idx:place_idx + delta, :, :, :] = noise[:, list_idx, :, :, :]

            # if strength is 1. then initialise the latents to noise, else initial to image + noise
            latents = noise if is_strength_max else self.scheduler.add_noise(video_latents, noise, timestep)
            # if pure noise then scale the initial latents by the Scheduler's init sigma
            latents = latents * self.scheduler.init_noise_sigma if is_strength_max else latents
            latents = latents.to(device)
        else:
            noise = latents.to(device)
            # scale the initial noise by the standard deviation required by the scheduler
            latents = noise * self.scheduler.init_noise_sigma

        outputs = (latents,)

        if return_noise:
            outputs += (noise,)

        if return_video_latents:
            outputs += (video_latents,)

        return outputs

    def prepare_mask_latents(
        self, mask, masked_image, batch_size, height, width, dtype, device, generator, do_classifier_free_guidance, noise_aug_strength
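
As an aside, the FreeNoise behavior patched above is easier to see in isolation: each context window's trailing delta region is overwritten with a shuffled copy of frames from that window's start, so overlapping windows sample correlated noise. A minimal, self-contained sketch with assumed toy dimensions (not code from this commit):

import torch

# Toy sizes (assumptions for illustration): 13 latent frames, windows of 6 with overlap 2.
latent_frames, context_size, context_overlap = 13, 6, 2
noise = torch.randn(1, latent_frames, 4, 8, 8)  # (batch, frames, channels, height, width)
generator = torch.Generator().manual_seed(0)

delta = context_size - context_overlap
for start_idx in range(0, latent_frames - context_size, delta):
    place_idx = start_idx + context_size
    if place_idx >= latent_frames:
        break
    # copy at most `delta` shuffled frames from the window start into the gap after it
    span = min(delta, latent_frames - place_idx)
    idx = torch.arange(start_idx, start_idx + span)
    idx = idx[torch.randperm(span, generator=generator)]
    noise[:, place_idx:place_idx + span] = noise[:, idx]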

View File

@ -0,0 +1,614 @@
{
"last_node_id": 224,
"last_link_id": 333,
"nodes": [
{
"id": 216,
"type": "CogVideoTextEncode",
"pos": {
"0": 1320,
"1": 2224
},
"size": [
400,
200
],
"flags": {},
"order": 4,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 315
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
314
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"",
1,
true
]
},
{
"id": 217,
"type": "CLIPLoader",
"pos": {
"0": 937,
"1": 2245
},
"size": {
"0": 315,
"1": 82
},
"flags": {},
"order": 0,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "CLIP",
"type": "CLIP",
"links": [
315,
317
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CLIPLoader"
},
"widgets_values": [
"t5xxl_fp8_e4m3fn.safetensors",
"sd3"
]
},
{
"id": 206,
"type": "VHS_VideoCombine",
"pos": {
"0": 2648,
"1": 2268
},
"size": [
595.7279663085938,
652.9874093191964
],
"flags": {
"collapsed": false
},
"order": 8,
"mode": 0,
"inputs": [
{
"name": "images",
"type": "IMAGE",
"link": 333,
"label": "images"
},
{
"name": "audio",
"type": "AUDIO",
"link": null,
"label": "audio"
},
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null,
"label": "meta_batch"
},
{
"name": "vae",
"type": "VAE",
"link": null,
"label": "vae"
}
],
"outputs": [
{
"name": "Filenames",
"type": "VHS_FILENAMES",
"links": null,
"shape": 3,
"label": "Filenames"
}
],
"properties": {
"Node name for S&R": "VHS_VideoCombine",
"ttNbgOverride": {
"color": "#223",
"bgcolor": "#335",
"groupcolor": "#88A"
}
},
"widgets_values": {
"frame_rate": 24,
"loop_count": 0,
"filename_prefix": "1009/",
"format": "video/h264-mp4",
"pix_fmt": "yuv420p",
"crf": 19,
"save_metadata": false,
"pingpong": false,
"save_output": true,
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"filename": "1009_00373.mp4",
"subfolder": "",
"type": "output",
"format": "video/h264-mp4",
"frame_rate": 24
},
"muted": false
}
},
"color": "#223",
"bgcolor": "#335"
},
{
"id": 220,
"type": "CogVideoDecode",
"pos": {
"0": 2242.513671875,
"1": 2250.128662109375
},
"size": {
"0": 315,
"1": 198
},
"flags": {},
"order": 7,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 320
},
{
"name": "samples",
"type": "LATENT",
"link": 321
}
],
"outputs": [
{
"name": "images",
"type": "IMAGE",
"links": [
333
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoDecode"
},
"widgets_values": [
false,
240,
360,
0.2,
0.2,
true
]
},
{
"id": 218,
"type": "CogVideoTextEncode",
"pos": {
"0": 1318,
"1": 2492
},
"size": {
"0": 400,
"1": 200
},
"flags": {},
"order": 5,
"mode": 0,
"inputs": [
{
"name": "clip",
"type": "CLIP",
"link": 317
}
],
"outputs": [
{
"name": "conditioning",
"type": "CONDITIONING",
"links": [
318
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoTextEncode"
},
"widgets_values": [
"The video is not of a high quality, it has a low resolution. Strange motion trajectory, a poor composition and deformed video, low resolution, duplicate and ugly, strange body structure, long and strange neck, bad teeth, bad eyes, bad limbs, bad hands, blurry camera, shaking camera. Deformation, blurry, ugly, distortion. ",
1,
true
]
},
{
"id": 215,
"type": "DownloadAndLoadCogVideoModel",
"pos": {
"0": 1388,
"1": 1986
},
"size": {
"0": 315,
"1": 194
},
"flags": {},
"order": 1,
"mode": 0,
"inputs": [
{
"name": "pab_config",
"type": "PAB_CONFIG",
"link": null
},
{
"name": "block_edit",
"type": "TRANSFORMERBLOCKS",
"link": null
},
{
"name": "lora",
"type": "COGLORA",
"link": null
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
313
],
"shape": 3
}
],
"properties": {
"Node name for S&R": "DownloadAndLoadCogVideoModel"
},
"widgets_values": [
"kijai/CogVideoX-Fun-5b",
"bf16",
"disabled",
"disabled",
false
]
},
{
"id": 214,
"type": "CogVideoXFunVid2VidSampler",
"pos": {
"0": 1823,
"1": 2249
},
"size": {
"0": 380.4000244140625,
"1": 306
},
"flags": {},
"order": 6,
"mode": 0,
"inputs": [
{
"name": "pipeline",
"type": "COGVIDEOPIPE",
"link": 313
},
{
"name": "positive",
"type": "CONDITIONING",
"link": 314
},
{
"name": "negative",
"type": "CONDITIONING",
"link": 318
},
{
"name": "validation_video",
"type": "IMAGE",
"link": 332
},
{
"name": "context_options",
"type": "COGCONTEXT",
"link": 327
}
],
"outputs": [
{
"name": "cogvideo_pipe",
"type": "COGVIDEOPIPE",
"links": [
320
],
"slot_index": 0,
"shape": 3
},
{
"name": "samples",
"type": "LATENT",
"links": [
321
],
"slot_index": 1,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoXFunVid2VidSampler"
},
"widgets_values": [
49,
768,
15861381752719,
"fixed",
30,
7,
"DDIM",
0.5
]
},
{
"id": 222,
"type": "CogVideoContextOptions",
"pos": {
"0": 1379,
"1": 2771
},
"size": [
315,
154
],
"flags": {},
"order": 2,
"mode": 0,
"inputs": [],
"outputs": [
{
"name": "context_options",
"type": "COGCONTEXT",
"links": [
327
],
"slot_index": 0,
"shape": 3
}
],
"properties": {
"Node name for S&R": "CogVideoContextOptions"
},
"widgets_values": [
"uniform_standard",
48,
4,
4,
true
]
},
{
"id": 209,
"type": "VHS_LoadVideo",
"pos": {
"0": 286,
"1": 2110
},
"size": [
426.63287353515625,
508.3616420200893
],
"flags": {},
"order": 3,
"mode": 0,
"inputs": [
{
"name": "meta_batch",
"type": "VHS_BatchManager",
"link": null
},
{
"name": "vae",
"type": "VAE",
"link": null
}
],
"outputs": [
{
"name": "IMAGE",
"type": "IMAGE",
"links": [
332
],
"slot_index": 0,
"shape": 3
},
{
"name": "frame_count",
"type": "INT",
"links": null,
"shape": 3
},
{
"name": "audio",
"type": "AUDIO",
"links": null,
"shape": 3
},
{
"name": "video_info",
"type": "VHS_VIDEOINFO",
"links": null,
"shape": 3
}
],
"properties": {
"Node name for S&R": "VHS_LoadVideo"
},
"widgets_values": {
"video": "CogVideoX-Fun_00006.mp4",
"force_rate": 0,
"force_size": "Disabled",
"custom_width": 512,
"custom_height": 512,
"frame_load_cap": 0,
"skip_first_frames": 0,
"select_every_nth": 3,
"choose video to upload": "image",
"videopreview": {
"hidden": false,
"paused": false,
"params": {
"frame_load_cap": 0,
"skip_first_frames": 0,
"force_rate": 0,
"filename": "CogVideoX-Fun_00006.mp4",
"type": "input",
"format": "video/mp4",
"select_every_nth": 3
},
"muted": false
}
},
"color": "#223",
"bgcolor": "#335"
}
],
"links": [
[
313,
215,
0,
214,
0,
"COGVIDEOPIPE"
],
[
314,
216,
0,
214,
1,
"CONDITIONING"
],
[
315,
217,
0,
216,
0,
"CLIP"
],
[
317,
217,
0,
218,
0,
"CLIP"
],
[
318,
218,
0,
214,
2,
"CONDITIONING"
],
[
320,
214,
0,
220,
0,
"COGVIDEOPIPE"
],
[
321,
214,
1,
220,
1,
"LATENT"
],
[
327,
222,
0,
214,
4,
"COGCONTEXT"
],
[
332,
209,
0,
214,
3,
"IMAGE"
],
[
333,
220,
0,
206,
0,
"IMAGE"
]
],
"groups": [
{
"title": "cogvideoxfun v2v sampler",
"bounding": [
78,
1820,
3385,
1603
],
"color": "#3f789e",
"font_size": 24,
"flags": {}
}
],
"config": {},
"extra": {
"ds": {
"scale": 0.3797498335832498,
"offset": [
590.2813176508071,
-1550.4283526080162
]
}
},
"version": 0.4
}
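
For convenience, the workflow graph above can be inspected outside ComfyUI with plain Python. This is an illustrative sketch, not part of the commit; the filename is assumed:

import json

# Filename is an assumption; save the workflow JSON above under any name.
with open("cogvideoxfun_v2v_workflow.json") as f:
    graph = json.load(f)

# Each node: id and type; each link: [id, from_node, from_slot, to_node, to_slot, type].
for node in graph["nodes"]:
    print(f'node {node["id"]}: {node["type"]}')
for link_id, src, src_slot, dst, dst_slot, ltype in graph["links"]:
    print(f"link {link_id}: {src}[{src_slot}] -> {dst}[{dst_slot}] ({ltype})")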

View File

@@ -1538,6 +1538,9 @@ class CogVideoXFunVid2VidSampler:
                "denoise_strength": ("FLOAT", {"default": 0.70, "min": 0.05, "max": 1.00, "step": 0.01}),
                "validation_video": ("IMAGE",),
            },
+           "optional": {
+               "context_options": ("COGCONTEXT", ),
+           },
        }

    RETURN_TYPES = ("COGVIDEOPIPE", "LATENT",)
@@ -1545,8 +1548,7 @@ class CogVideoXFunVid2VidSampler:
    FUNCTION = "process"
    CATEGORY = "CogVideoWrapper"

-   def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler,
-               validation_video):
+   def process(self, pipeline, positive, negative, video_length, base_resolution, seed, steps, cfg, denoise_strength, scheduler, validation_video, context_options=None):
        device = mm.get_torch_device()
        offload_device = mm.unet_offload_device()
        pipe = pipeline["pipe"]
@@ -1562,15 +1564,20 @@ class CogVideoXFunVid2VidSampler:
        mm.soft_empty_cache()

        # Count most suitable height and width
-       aspect_ratio_sample_size = {key : [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
+       aspect_ratio_sample_size = {key: [x / 512 * base_resolution for x in ASPECT_RATIO_512[key]] for key in ASPECT_RATIO_512.keys()}
        validation_video = np.array(validation_video.cpu().numpy() * 255, np.uint8)
        original_width, original_height = Image.fromarray(validation_video[0]).size
        closest_size, closest_ratio = get_closest_ratio(original_height, original_width, ratios=aspect_ratio_sample_size)
        height, width = [int(x / 16) * 16 for x in closest_size]
        log.info(f"Closest bucket size: {width}x{height}")

+       # Handle context_options and adjust scheduler if needed
+       if context_options is not None and context_options["context_schedule"] == "temporal_tiling":
+           logging.info("Temporal tiling enabled, changing scheduler to CogVideoXDDIM")
+           scheduler = "CogVideoXDDIM"
+
        # Load Sampler
        scheduler_config = pipeline["scheduler_config"]
        if scheduler in scheduler_mapping:
            noise_scheduler = scheduler_mapping[scheduler].from_config(scheduler_config)
@ -1578,7 +1585,15 @@ class CogVideoXFunVid2VidSampler:
else:
raise ValueError(f"Unknown scheduler: {scheduler}")
generator = torch.Generator(device=torch.device("cpu")).manual_seed(seed)
if context_options is not None:
context_frames = context_options["context_frames"] // 4
context_stride = context_options["context_stride"] // 4
context_overlap = context_options["context_overlap"] // 4
else:
context_frames, context_stride, context_overlap = None, None, None
# Create generator on the correct device
generator = torch.Generator(device=device).manual_seed(seed)
autocastcondition = not pipeline["onediff"] or not dtype == torch.float32
autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext()
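
A note on the // 4 divisions added above: CogVideoX's VAE compresses the temporal axis by a factor of 4 (pipe.vae.config.temporal_compression_ratio), so context window sizes given in pixel-space frames must be mapped to latent frames before sampling. A quick check under that assumption:

# Assumed: CogVideoX's VAE temporal compression is 4 (pipe.vae.config.temporal_compression_ratio).
temporal_compression = 4
context_frames_px = 48                            # "context_frames" in the workflow above
print(context_frames_px // temporal_compression)  # -> 12 latent frames per context window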
@@ -1586,12 +1601,33 @@ class CogVideoXFunVid2VidSampler:
            video_length = int((video_length - 1) // pipe.vae.config.temporal_compression_ratio * pipe.vae.config.temporal_compression_ratio) + 1 if video_length != 1 else 1
            input_video, input_video_mask, clip_image = get_video_to_video_latent(validation_video, video_length=video_length, sample_size=(height, width))

-           # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
-           #     pipeline = merge_lora(pipeline, _lora_path, _lora_weight)
+           # Move tensors to the correct device
+           input_video = input_video.to(device)
+           input_video_mask = input_video_mask.to(device)
+           if clip_image is not None:
+               clip_image = clip_image.to(device)
+
+           # Move positive and negative prompts to the correct device
+           positive = positive.to(dtype).to(device)
+           negative = negative.to(dtype).to(device)
+
+           # Add print statements to debug devices
+           print(f"Device Information:")
+           print(f"  device: {device}")
+           print(f"  positive.device: {positive.device}")
+           print(f"  negative.device: {negative.device}")
+           print(f"  input_video.device: {input_video.device}")
+           print(f"  input_video_mask.device: {input_video_mask.device}")
+           if clip_image is not None:
+               print(f"  clip_image.device: {clip_image.device}")
+           print(f"  generator device: {generator.device}")
+           print(f"  pipe.device: {device}")
+           print(f"  pipe.vae.device: {next(pipe.vae.parameters()).device}")
+           print(f"  pipe.transformer.device: {next(pipe.transformer.parameters()).device}")

            common_params = {
-               "prompt_embeds": positive.to(dtype).to(device),
-               "negative_prompt_embeds": negative.to(dtype).to(device),
+               "prompt_embeds": positive,
+               "negative_prompt_embeds": negative,
                "num_frames": video_length,
                "height": height,
                "width": width,
@@ -1605,25 +1641,15 @@ class CogVideoXFunVid2VidSampler:
                **common_params,
                video=input_video,
                mask_video=input_video_mask,
-               strength=float(denoise_strength)
+               strength=float(denoise_strength),
+               context_schedule=context_options["context_schedule"] if context_options is not None else None,
+               context_frames=context_frames,
+               context_stride=context_stride,
+               context_overlap=context_overlap,
+               freenoise=context_options["freenoise"] if context_options is not None else None
            )

-           # for _lora_path, _lora_weight in zip(cogvideoxfun_model.get("loras", []), cogvideoxfun_model.get("strength_model", [])):
-           #     pipeline = unmerge_lora(pipeline, _lora_path, _lora_weight)
-
        return (pipeline, {"samples": latents})

-def add_noise_to_reference_video(image, ratio=None):
-   if ratio is None:
-       sigma = torch.normal(mean=-3.0, std=0.5, size=(image.shape[0],)).to(image.device)
-       sigma = torch.exp(sigma).to(image.dtype)
-   else:
-       sigma = torch.ones((image.shape[0],)).to(image.device, image.dtype) * ratio
-
-   image_noise = torch.randn_like(image) * sigma[:, None, None, None, None]
-   image_noise = torch.where(image==-1, torch.zeros_like(image), image_noise)
-   image = image + image_noise
-   return image
-
class CogVideoControlImageEncode:
    @classmethod
    def INPUT_TYPES(s):
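
For orientation, the context_options dict that process() now accepts carries the five keys read in the hunks above, produced by the CogVideoContextOptions node. A hedged sketch of its shape, with values mirroring the example workflow:

# Illustrative only; keys match what process() reads above, values mirror the example workflow.
context_options = {
    "context_schedule": "uniform_standard",  # "temporal_tiling" would force the CogVideoXDDIM scheduler
    "context_frames": 48,                    # window length, in pixel-space frames
    "context_stride": 4,
    "context_overlap": 4,
    "freenoise": True,
}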