From cf01dc2b0bf7d3daddfc7e61bfae10747e281d14 Mon Sep 17 00:00:00 2001 From: Kijai <40791699+kijai@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:51:59 +0300 Subject: [PATCH] onediff support --- .../cogvideo_2b_vid2vid_test_example_02.json | 4 +-- examples/cogvideox_5b_example_01.json | 4 +-- nodes.py | 28 ++++++++++++++++--- readme.md | 11 ++++++++ 4 files changed, 39 insertions(+), 8 deletions(-) diff --git a/examples/cogvideo_2b_vid2vid_test_example_02.json b/examples/cogvideo_2b_vid2vid_test_example_02.json index e195575..f78505c 100644 --- a/examples/cogvideo_2b_vid2vid_test_example_02.json +++ b/examples/cogvideo_2b_vid2vid_test_example_02.json @@ -819,7 +819,7 @@ "frame_rate": 8, "loop_count": 0, "filename_prefix": "CogVideoX_vid2vid", - "format": "video/nvenc_h264-mp4", + "format": "video/h264-mp4", "pix_fmt": "yuv420p", "bitrate": 10, "megabit": true, @@ -833,7 +833,7 @@ "filename": "AnimateDiff_00001.mp4", "subfolder": "", "type": "temp", - "format": "video/nvenc_h264-mp4", + "format": "video/h264-mp4", "frame_rate": 8 } } diff --git a/examples/cogvideox_5b_example_01.json b/examples/cogvideox_5b_example_01.json index 50d0668..04fb30d 100644 --- a/examples/cogvideox_5b_example_01.json +++ b/examples/cogvideox_5b_example_01.json @@ -279,7 +279,7 @@ "frame_rate": 8, "loop_count": 0, "filename_prefix": "CogVideoX5B", - "format": "video/nvenc_h264-mp4", + "format": "video/h264-mp4", "pix_fmt": "yuv420p", "bitrate": 10, "megabit": true, @@ -293,7 +293,7 @@ "filename": "CogVideoX5B_00009.mp4", "subfolder": "", "type": "temp", - "format": "video/nvenc_h264-mp4", + "format": "video/h264-mp4", "frame_rate": 8 }, "muted": false diff --git a/nodes.py b/nodes.py index ae5d41a..33dbd42 100644 --- a/nodes.py +++ b/nodes.py @@ -6,11 +6,14 @@ from comfy.utils import ProgressBar from diffusers.schedulers import CogVideoXDDIMScheduler, CogVideoXDPMScheduler from diffusers.models import AutoencoderKLCogVideoX, CogVideoXTransformer3DModel from .pipeline_cogvideox import 
CogVideoXPipeline +from contextlib import nullcontext + import logging logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') log = logging.getLogger(__name__) + class DownloadAndLoadCogVideoModel: @classmethod def INPUT_TYPES(s): @@ -30,6 +33,7 @@ class DownloadAndLoadCogVideoModel: ), "fp8_transformer": ("BOOLEAN", {"default": False, "tooltip": "cast the transformer to torch.float8_e4m3fn"}), "torch_compile": ("BOOLEAN", {"default": False, "tooltip": "use torch.compile to speed up inference, Linux only"}), + "onediff": ("BOOLEAN", {"default": False, "tooltip": "use onediff/nexfort to speed up inference, requires onediff installed (Linux only)"}), } } @@ -38,7 +42,7 @@ class DownloadAndLoadCogVideoModel: FUNCTION = "loadmodel" CATEGORY = "CogVideoWrapper" - def loadmodel(self, model, precision, fp8_transformer, torch_compile): + def loadmodel(self, model, precision, fp8_transformer, torch_compile, onediff): device = mm.get_torch_device() offload_device = mm.unet_offload_device() mm.soft_empty_cache() @@ -72,13 +76,26 @@ class DownloadAndLoadCogVideoModel: if torch_compile: torch._dynamo.config.suppress_errors = True - pipe.transformer.to(device).to(memory_format=torch.channels_last) + pipe.transformer.to(memory_format=torch.channels_last) pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True) + if onediff: + from onediffx import compile_pipe, quantize_pipe + options = None + pipe = compile_pipe( + pipe, + backend="nexfort", + options=options, + ignores=["vae"], + fuse_qkv_projections=True, + ) + + pipeline = { "pipe": pipe, "dtype": dtype, - "base_path": base_path + "base_path": base_path, + "onediff": onediff } return (pipeline,) @@ -253,7 +270,10 @@ class CogVideoSampler: pipe.scheduler = CogVideoXDDIMScheduler.from_pretrained(base_path, subfolder="scheduler") elif scheduler == "DPM": pipe.scheduler = CogVideoXDPMScheduler.from_pretrained(base_path, subfolder="scheduler") - with 
torch.autocast(mm.get_autocast_device(device)): + + autocastcondition = not pipeline["onediff"] + autocast_context = torch.autocast(mm.get_autocast_device(device)) if autocastcondition else nullcontext() + with autocast_context: latents = pipeline["pipe"]( num_inference_steps=steps, height = height, diff --git a/readme.md b/readme.md index a601208..c0f975f 100644 --- a/readme.md +++ b/readme.md @@ -1,5 +1,16 @@ # WORK IN PROGRESS +## Update 2 + +Added **experimental** support for onediff; this reduced sampling time by ~30% for me, reaching 4.23 it/s on 4090 with 49 frames. +This requires using Linux, torch 2.4.0, and installing onediff and nexfort: + +`pip install --pre onediff onediffx` + +`pip install nexfort` + +The first run will take around 5 minutes for the compilation. + ## Update 5b model is now also supported for basic text2vid: https://huggingface.co/THUDM/CogVideoX-5b