cleanup, fix vid2vid

kijai 2024-08-07 02:11:37 +03:00
parent 97e89d596e
commit 2ae70dd82e
3 changed files with 700 additions and 703 deletions

File diff suppressed because it is too large

nodes.py

@@ -3,6 +3,7 @@ import torch
import folder_paths
import comfy.model_management as mm
from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
+from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from .pipeline_cogvideox import CogVideoXPipeline
import logging
@@ -52,8 +53,11 @@ class DownloadAndLoadCogVideoModel:
local_dir=base_path,
local_dir_use_symlinks=False,
)
+transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(dtype).to(offload_device)
+vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
+scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
-pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
+pipe = CogVideoXPipeline(vae, transformer, scheduler)
pipeline = {
"pipe": pipe,
@@ -239,7 +243,6 @@ class CogVideoSampler:
prompt_embeds=positive.to(dtype).to(device),
negative_prompt_embeds=negative.to(dtype).to(device),
generator=generator,
-output_type="latents",
device=device
)
pipe.transformer.to(offload_device)
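The output_type kwarg is dropped because the trimmed __call__ below no longer accepts it and now always returns the latents directly. A sketch of the resulting sampler-side call — names like steps and cfg are placeholders for the node's inputs, not necessarily the actual variable names:

# Hypothetical invocation matching the trimmed pipeline signature:
latents = pipe(
    height=height,
    width=width,
    num_frames=num_frames,
    num_inference_steps=steps,
    guidance_scale=cfg,
    prompt_embeds=positive.to(dtype).to(device),
    negative_prompt_embeds=negative.to(dtype).to(device),
    generator=generator,
    device=device,
)
pipe.transformer.to(offload_device)  # offload the transformer again once sampling is done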

pipeline_cogvideox.py

@@ -14,16 +14,14 @@
# limitations under the License.
import inspect
-from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Tuple, Union
import torch
-from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from diffusers.utils import BaseOutput, logging, replace_example_docstring
+from diffusers.utils import logging
from diffusers.utils.torch_utils import randn_tensor
from diffusers.video_processor import VideoProcessor
@@ -31,30 +29,6 @@ from comfy.utils import ProgressBar
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-EXAMPLE_DOC_STRING = """
-Examples:
-```python
->>> from diffusers import CogVideoXPipeline
->>> from diffusers.utils import export_to_video
->>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.bfloat16).to("cuda")
->>> prompt = (
-...     "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
-...     "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
-...     "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
-...     "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
-...     "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
-...     "atmosphere of this unique musical performance."
-... )
->>> video = pipe(
-...     "a polar bear dancing, high quality, realistic", guidance_scale=6, num_inference_steps=20
-... ).frames[0]
->>> export_to_video(video, "output.mp4", fps=8)
-```
-"""
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
def retrieve_timesteps(
scheduler,
@ -114,22 +88,6 @@ def retrieve_timesteps(
timesteps = scheduler.timesteps
return timesteps, num_inference_steps
-@dataclass
-class CogVideoXPipelineOutput(BaseOutput):
-r"""
-Output class for CogVideo pipelines.
-Args:
-frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-`(batch_size, num_frames, channels, height, width)`.
-"""
-frames: torch.Tensor
class CogVideoXPipeline(DiffusionPipeline):
r"""
Pipeline for text-to-video generation using CogVideoX.
@@ -156,12 +114,6 @@ class CogVideoXPipeline(DiffusionPipeline):
_optional_components = ["tokenizer", "text_encoder"]
model_cpu_offload_seq = "text_encoder->transformer->vae"
-_callback_tensor_inputs = [
-"latents",
-"prompt_embeds",
-"negative_prompt_embeds",
-]
def __init__(
self,
vae: AutoencoderKLCogVideoX,
@@ -199,9 +151,7 @@ class CogVideoXPipeline(DiffusionPipeline):
)
if latents is None:
latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-# scale the initial noise by the standard deviation required by the scheduler
-latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
else:
latents = latents.to(device)
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
@@ -219,7 +169,7 @@ class CogVideoXPipeline(DiffusionPipeline):
latents = latents[:, :frames_needed, :, :, :]
latents = self.scheduler.add_noise(latents, noise, latent_timestep)
-latents = latents * self.scheduler.init_noise_sigma
+latents = latents * self.scheduler.init_noise_sigma # scale the initial noise by the standard deviation required by the scheduler
return latents, timesteps
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
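These two hunks are the core of the vid2vid fix: input latents are no longer clobbered by a second unconditional randn_tensor() draw, and the init_noise_sigma scaling happens once, only after the input video latents have been noised to the strength-derived timestep. A standalone sketch of the resulting flow — get_timesteps() is not shown in this diff, so its body here is an assumption following the usual diffusers img2img pattern:

import torch
from diffusers.utils.torch_utils import randn_tensor

def prepare_vid2vid_latents(scheduler, latents, timesteps, num_inference_steps, denoise_strength, generator, device):
    # Keep only the tail of the schedule, as an img2img-style get_timesteps() would.
    init_timestep = min(int(num_inference_steps * denoise_strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    timesteps = timesteps[t_start * scheduler.order:]
    latent_timestep = timesteps[:1]

    # Noise the encoded input video to that timestep, then apply the single
    # init_noise_sigma scaling kept by the + line above.
    noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=latents.dtype)
    latents = scheduler.add_noise(latents.to(device), noise, latent_timestep)
    latents = latents * scheduler.init_noise_sigma
    return latents, timesteps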
@@ -245,20 +195,12 @@ class CogVideoXPipeline(DiffusionPipeline):
self,
height,
width,
-callback_on_step_end_tensor_inputs,
prompt_embeds=None,
negative_prompt_embeds=None,
):
if height % 8 != 0 or width % 8 != 0:
raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
-if callback_on_step_end_tensor_inputs is not None and not all(
-k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-):
-raise ValueError(
-f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-)
if prompt_embeds is not None and negative_prompt_embeds is not None:
if prompt_embeds.shape != negative_prompt_embeds.shape:
raise ValueError(
@@ -297,7 +239,6 @@ class CogVideoXPipeline(DiffusionPipeline):
return self._interrupt
@torch.no_grad()
-@replace_example_docstring(EXAMPLE_DOC_STRING)
def __call__(
self,
height: int = 480,
@@ -314,25 +255,12 @@ class CogVideoXPipeline(DiffusionPipeline):
latents: Optional[torch.FloatTensor] = None,
prompt_embeds: Optional[torch.FloatTensor] = None,
negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-output_type: str = "pil",
-return_dict: bool = True,
-callback_on_step_end: Optional[
-Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
-] = None,
-callback_on_step_end_tensor_inputs: List[str] = ["latents"],
device = torch.device("cuda"),
-) -> Union[CogVideoXPipelineOutput, Tuple]:
+):
"""
Function invoked when calling the pipeline for generation.
Args:
-prompt (`str` or `List[str]`, *optional*):
-The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
-instead.
-negative_prompt (`str` or `List[str]`, *optional*):
-The prompt or prompts not to guide the image generation. If not defined, one has to pass
-`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-less than `1`).
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
The height in pixels of the generated image. This is set to 1024 by default for the best results.
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
@@ -371,37 +299,12 @@ class CogVideoXPipeline(DiffusionPipeline):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
argument.
-output_type (`str`, *optional*, defaults to `"pil"`):
-The output format of the generate image. Choose between
-[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-return_dict (`bool`, *optional*, defaults to `True`):
-Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
-of a plain tuple.
-callback_on_step_end (`Callable`, *optional*):
-A function that calls at the end of each denoising steps during the inference. The function is called
-with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-`callback_on_step_end_tensor_inputs`.
-callback_on_step_end_tensor_inputs (`List`, *optional*):
-The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-`._callback_tensor_inputs` attribute of your pipeline class.
-Examples:
-Returns:
-[`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] or `tuple`:
-[`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
-`tuple`. When returning a tuple, the first element is a list with the generated images.
"""
assert (
num_frames <= 48 and num_frames % fps == 0 and fps == 8
), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
-if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
-callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
num_videos_per_prompt = 1
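As a worked check of the assert above: with fps fixed at 8, the accepted frame counts are exactly the multiples of 8 up to and including 48. A hypothetical validation mirroring that logic:

fps = 8
valid_frame_counts = [n for n in range(1, 49) if n % fps == 0]
print(valid_frame_counts)  # [8, 16, 24, 32, 40, 48]
num_frames = 24  # passes; 49 or 20 would trip the assert
assert num_frames <= 48 and num_frames % fps == 0 and fps == 8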
@@ -410,7 +313,6 @@ class CogVideoXPipeline(DiffusionPipeline):
self.check_inputs(
height,
width,
-callback_on_step_end_tensor_inputs,
prompt_embeds,
negative_prompt_embeds,
)
@@ -503,17 +405,6 @@ class CogVideoXPipeline(DiffusionPipeline):
)
latents = latents.to(prompt_embeds.dtype)
-# call the callback, if provided
-if callback_on_step_end is not None:
-callback_kwargs = {}
-for k in callback_on_step_end_tensor_inputs:
-callback_kwargs[k] = locals()[k]
-callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-latents = callback_outputs.pop("latents", latents)
-prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
comfy_pbar.update(1)
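With the diffusers callback plumbing removed, per-step reporting is handled only by the two bars above. A sketch of the gating they sit behind, assuming the usual diffusers bookkeeping where num_warmup_steps = len(timesteps) - num_inference_steps * scheduler.order (with DDIM, order == 1, so this fires once per denoising step):

for i, t in enumerate(timesteps):
    ...  # one denoising step
    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % scheduler.order == 0):
        progress_bar.update()  # diffusers' tqdm bar
        comfy_pbar.update(1)   # ComfyUI's ProgressBar, driving the UI progress display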