Mirror of https://git.datalinker.icu/kijai/ComfyUI-CogVideoXWrapper.git (synced 2026-05-10 14:24:35 +08:00)
Commit 2ae70dd82e: cleanup, fix vid2vid (parent 97e89d596e)
Changed files: nodes.py (7 changes), pipeline_cogvideox.py (diff initially suppressed because it is too large)

nodes.py
@@ -3,6 +3,7 @@ import torch
 import folder_paths
 import comfy.model_management as mm
 from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
+from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from .pipeline_cogvideox import CogVideoXPipeline
 
 import logging

@@ -52,8 +53,11 @@ class DownloadAndLoadCogVideoModel:
                 local_dir=base_path,
                 local_dir_use_symlinks=False,
             )
+        transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(dtype).to(offload_device)
+        vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
+        scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")
 
-        pipe = CogVideoXPipeline.from_pretrained(base_path, torch_dtype=dtype).to(offload_device)
+        pipe = CogVideoXPipeline(vae, transformer, scheduler)
 
         pipeline = {
             "pipe": pipe,
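In effect, the loader now assembles the pipeline from exactly the three components the wrapper uses, instead of letting `from_pretrained` instantiate everything in the repo (text encoder included). A minimal sketch of the new flow; the path and dtype here are illustrative, not the node's actual resolution logic:

    import torch
    from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
    from diffusers.schedulers import CogVideoXDDIMScheduler
    from .pipeline_cogvideox import CogVideoXPipeline

    base_path = "models/CogVideoX-2b"     # illustrative local checkout
    dtype = torch.bfloat16                # illustrative
    offload_device = torch.device("cpu")  # keep weights off the GPU until needed

    # Load only the transformer, VAE and scheduler; tokenizer/text_encoder are
    # optional components here because ComfyUI supplies prompt embeddings.
    transformer = CogVideoXTransformer3DModel.from_pretrained(base_path, subfolder="transformer").to(dtype).to(offload_device)
    vae = AutoencoderKLCogVideoX.from_pretrained(base_path, subfolder="vae").to(dtype).to(offload_device)
    scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler")

    pipe = CogVideoXPipeline(vae, transformer, scheduler)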
@@ -239,7 +243,6 @@ class CogVideoSampler:
             prompt_embeds=positive.to(dtype).to(device),
             negative_prompt_embeds=negative.to(dtype).to(device),
             generator=generator,
-            output_type="latents",
             device=device
         )
         pipe.transformer.to(offload_device)
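Dropping `output_type="latents"` here mirrors the pipeline-side cleanup below, where `output_type` and `return_dict` are deleted from `__call__` altogether. A condensed view of the sampler's call site after this hunk, with all names taken from the diff itself:

    out = pipe(
        prompt_embeds=positive.to(dtype).to(device),
        negative_prompt_embeds=negative.to(dtype).to(device),
        generator=generator,
        device=device,
    )
    # Explicit offload: move the transformer back off the GPU as soon as
    # sampling finishes so VRAM is freed for the VAE decode step.
    pipe.transformer.to(offload_device)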
pipeline_cogvideox.py

@@ -14,16 +14,14 @@
 # limitations under the License.
 
 import inspect
-from dataclasses import dataclass
 from typing import Callable, Dict, List, Optional, Tuple, Union
 
 import torch
 
-from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
 from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler
-from diffusers.utils import BaseOutput, logging, replace_example_docstring
+from diffusers.utils import logging
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor
 
@@ -31,30 +29,6 @@ from comfy.utils import ProgressBar
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```python
-        >>> from diffusers import CogVideoXPipeline
-        >>> from diffusers.utils import export_to_video
-
-        >>> pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-2b", torch_dtype=torch.bfloat16).to("cuda")
-        >>> prompt = (
-        ...     "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
-        ...     "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
-        ...     "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
-        ...     "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
-        ...     "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
-        ...     "atmosphere of this unique musical performance."
-        ... )
-        >>> video = pipe(
-        ...     "a polar bear dancing, high quality, realistic", guidance_scale=6, num_inference_steps=20
-        ... ).frames[0]
-        >>> export_to_video(video, "output.mp4", fps=8)
-        ```
-"""
-
-
 # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
 def retrieve_timesteps(
     scheduler,

@@ -114,22 +88,6 @@ def retrieve_timesteps(
     timesteps = scheduler.timesteps
     return timesteps, num_inference_steps
 
-
-@dataclass
-class CogVideoXPipelineOutput(BaseOutput):
-    r"""
-    Output class for CogVideo pipelines.
-
-    Args:
-        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-            `(batch_size, num_frames, channels, height, width)`.
-    """
-
-    frames: torch.Tensor
-
-
 class CogVideoXPipeline(DiffusionPipeline):
     r"""
     Pipeline for text-to-video generation using CogVideoX.
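Deleting `CogVideoXPipelineOutput` (and the `BaseOutput` import above) also changes the calling contract: there is no `.frames` attribute to unpack any more. The assumption, consistent with the sampler hunk in nodes.py, is that the trimmed `__call__` hands back the latent tensor directly:

    # Old diffusers-style contract:
    #     video = pipe(...).frames[0]
    # Assumed contract after this commit: a plain latent tensor, decoded
    # later by a separate node.
    latents = pipe(prompt_embeds=positive, negative_prompt_embeds=negative, device=device)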
@@ -156,12 +114,6 @@ class CogVideoXPipeline(DiffusionPipeline):
     _optional_components = ["tokenizer", "text_encoder"]
     model_cpu_offload_seq = "text_encoder->transformer->vae"
 
-    _callback_tensor_inputs = [
-        "latents",
-        "prompt_embeds",
-        "negative_prompt_embeds",
-    ]
-
     def __init__(
         self,
         vae: AutoencoderKLCogVideoX,

@@ -200,8 +152,6 @@ class CogVideoXPipeline(DiffusionPipeline):
 
         if latents is None:
             latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-            # scale the initial noise by the standard deviation required by the scheduler
-
         else:
             latents = latents.to(device)
             timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)

@@ -219,7 +169,7 @@ class CogVideoXPipeline(DiffusionPipeline):
                 latents = latents[:, :frames_needed, :, :, :]
 
             latents = self.scheduler.add_noise(latents, noise, latent_timestep)
-            latents = latents * self.scheduler.init_noise_sigma
+            latents = latents * self.scheduler.init_noise_sigma # scale the initial noise by the standard deviation required by the scheduler
         return latents, timesteps
 
     # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
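These two hunks are the heart of the vid2vid fix: when input latents are supplied, the schedule is truncated via `get_timesteps(num_inference_steps, denoise_strength, device)`, the latents are noised with `add_noise` at the first kept timestep, and only then rescaled by the scheduler's `init_noise_sigma`, with the explanatory comment now sitting on the line where the scaling actually happens. `get_timesteps` itself is outside this diff; a sketch of what such a helper conventionally looks like in diffusers img2img pipelines, offered as an assumption rather than this repo's code:

    def get_timesteps(self, num_inference_steps, strength, device):
        # Keep only the tail of the schedule: strength=1.0 denoises from pure
        # noise, while smaller values start later and preserve more input.
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
        return timesteps, num_inference_steps - t_start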
@@ -245,20 +195,12 @@ class CogVideoXPipeline(DiffusionPipeline):
         self,
         height,
         width,
-        callback_on_step_end_tensor_inputs,
         prompt_embeds=None,
         negative_prompt_embeds=None,
     ):
         if height % 8 != 0 or width % 8 != 0:
             raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
 
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-
         if prompt_embeds is not None and negative_prompt_embeds is not None:
             if prompt_embeds.shape != negative_prompt_embeds.shape:
                 raise ValueError(

@@ -297,7 +239,6 @@ class CogVideoXPipeline(DiffusionPipeline):
         return self._interrupt
 
     @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         height: int = 480,

@@ -314,25 +255,12 @@ class CogVideoXPipeline(DiffusionPipeline):
         latents: Optional[torch.FloatTensor] = None,
         prompt_embeds: Optional[torch.FloatTensor] = None,
         negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: str = "pil",
-        return_dict: bool = True,
-        callback_on_step_end: Optional[
-            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
-        ] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
         device = torch.device("cuda"),
-    ) -> Union[CogVideoXPipelineOutput, Tuple]:
+    ):
         """
         Function invoked when calling the pipeline for generation.
 
         Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
             height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                 The height in pixels of the generated image. This is set to 1024 by default for the best results.
             width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):

@@ -371,37 +299,12 @@ class CogVideoXPipeline(DiffusionPipeline):
                 Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                 weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                 argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
-                of a plain tuple.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] or `tuple`:
-            [`~pipelines.cogvideo.pipeline_cogvideox.CogVideoXPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple`. When returning a tuple, the first element is a list with the generated images.
         """
 
         assert (
             num_frames <= 48 and num_frames % fps == 0 and fps == 8
         ), f"The number of frames must be divisible by {fps=} and less than 48 frames (for now). Other values are not supported in CogVideoX."
 
-        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
-            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
-
         height = height or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         width = width or self.transformer.config.sample_size * self.vae_scale_factor_spatial
         num_videos_per_prompt = 1
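After this pruning, `__call__` carries only what the ComfyUI nodes actually drive: geometry, schedule and guidance settings, precomputed embeddings, optional input latents for vid2vid, and an explicit device. A hedged usage sketch; argument names follow the diff and diffusers' CogVideoX pipeline, while the values and embedding variables are illustrative placeholders:

    import torch

    generator = torch.Generator(device="cuda").manual_seed(0)  # illustrative seed

    out = pipe(
        height=480,
        width=720,
        num_frames=48,            # the assert above requires num_frames % fps == 0 and fps == 8
        num_inference_steps=25,   # illustrative
        guidance_scale=6.0,       # illustrative
        prompt_embeds=positive_embeds,           # placeholder: ComfyUI text-encode output
        negative_prompt_embeds=negative_embeds,  # placeholder
        generator=generator,
        device=torch.device("cuda"),
    )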
@@ -410,7 +313,6 @@ class CogVideoXPipeline(DiffusionPipeline):
         self.check_inputs(
             height,
             width,
-            callback_on_step_end_tensor_inputs,
             prompt_embeds,
             negative_prompt_embeds,
         )

@@ -503,17 +405,6 @@ class CogVideoXPipeline(DiffusionPipeline):
                 )
                 latents = latents.to(prompt_embeds.dtype)
 
-                # call the callback, if provided
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
                 if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                     progress_bar.update()
                     comfy_pbar.update(1)
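With the callback plumbing gone, per-step reporting falls entirely to the two progress bars that survive the hunk: diffusers' own tqdm wrapper and ComfyUI's `ProgressBar` (imported from `comfy.utils` in an earlier hunk header). A sketch of the loop's reporting skeleton, assuming diffusers' standard `self.progress_bar` helper:

    from comfy.utils import ProgressBar

    comfy_pbar = ProgressBar(num_inference_steps)  # drives the ComfyUI front-end bar
    with self.progress_bar(total=num_inference_steps) as progress_bar:
        for i, t in enumerate(timesteps):
            ...  # denoising step: transformer forward, guidance, scheduler.step
            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                progress_bar.update()   # diffusers/tqdm
                comfy_pbar.update(1)    # ComfyUI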