From 046bb97fac9c185ebd9b2d3fac1a94afd2eea2e5 Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Mon, 3 Feb 2025 14:39:28 +0200
Subject: [PATCH] Support fast model
 https://huggingface.co/tencent/Hunyuan3D-2/tree/main/hunyuan3d-dit-v2-0-fast

---
 configs/dit_config.yaml                 |  1 +
 hy3dgen/shapegen/models/conditioner.py  |  2 +-
 hy3dgen/shapegen/models/hunyuan3ddit.py | 10 +++++
 hy3dgen/shapegen/pipelines.py           |  5 +++
 hy3dgen/texgen/hunyuanpaint/pipeline.py | 56 ++++++++++++++++++++++++-
 nodes.py                                | 11 +++--
 6 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/configs/dit_config.yaml b/configs/dit_config.yaml
index 4f685de..001e8a2 100755
--- a/configs/dit_config.yaml
+++ b/configs/dit_config.yaml
@@ -11,6 +11,7 @@ model:
     axes_dim: [ 64 ]
     theta: 10000
     qkv_bias: True
+    guidance_embed: False
 
 vae:
   target: .hy3dgen.shapegen.models.ShapeVAE
diff --git a/hy3dgen/shapegen/models/conditioner.py b/hy3dgen/shapegen/models/conditioner.py
index 3616fca..e7841ca 100755
--- a/hy3dgen/shapegen/models/conditioner.py
+++ b/hy3dgen/shapegen/models/conditioner.py
@@ -78,7 +78,7 @@ class ImageEncoder(nn.Module):
             mask = mask.to(image)
             image = image * mask
 
-        inputs = self.transform(image)
+        inputs = image
         outputs = self.model(inputs)
 
         last_hidden_state = outputs.last_hidden_state
diff --git a/hy3dgen/shapegen/models/hunyuan3ddit.py b/hy3dgen/shapegen/models/hunyuan3ddit.py
index d2fd277..a128659 100755
--- a/hy3dgen/shapegen/models/hunyuan3ddit.py
+++ b/hy3dgen/shapegen/models/hunyuan3ddit.py
@@ -306,6 +306,7 @@ class Hunyuan3DDiT(nn.Module):
         axes_dim: List[int] = [64],
         theta: int = 10_000,
         qkv_bias: bool = True,
+        guidance_embed: bool = False,
         time_factor: float = 1000,
         ckpt_path: Optional[str] = None,
         attention_mode: str = "sdpa",
@@ -325,6 +326,7 @@ class Hunyuan3DDiT(nn.Module):
         self.time_factor = time_factor
         self.out_channels = self.in_channels
         self.attention_mode = attention_mode
+        self.guidance_embed = guidance_embed
 
         if hidden_size % num_heads != 0:
             raise ValueError(
@@ -338,6 +340,9 @@ class Hunyuan3DDiT(nn.Module):
         self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
         self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
         self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else nn.Identity()
+        )
 
         self.double_blocks = nn.ModuleList(
             [
@@ -401,6 +406,11 @@ class Hunyuan3DDiT(nn.Module):
         cond = contexts['main']
         latent = self.latent_in(x)
         vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
+        if self.guidance_embed:
+            guidance = kwargs.get('guidance', None)
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.time_factor))
         cond = self.cond_in(cond)
 
         pe = None
diff --git a/hy3dgen/shapegen/pipelines.py b/hy3dgen/shapegen/pipelines.py
index dc706c7..8c3f856 100755
--- a/hy3dgen/shapegen/pipelines.py
+++ b/hy3dgen/shapegen/pipelines.py
@@ -183,8 +183,12 @@ class Hunyuan3DDiTPipeline:
 
 
         # load model
+        if "guidance_in.in_layer.bias" in ckpt['model']: #guidance_in.in_layer.bias
+            logger.info("Model has guidance_in, setting guidance_embed to True")
+            config['model']['params']['guidance_embed'] = True
         config['model']['params']['attention_mode'] = attention_mode
         config['vae']['params']['attention_mode'] = attention_mode
+
         with init_empty_weights():
             model = instantiate_from_config(config['model'])
             vae = instantiate_from_config(config['vae'])
@@ -603,6 +607,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
         if hasattr(self.model, 'guidance_embed') and \
                 self.model.guidance_embed is True:
             guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)
+            print("guidance: ", guidance)
 
         comfy_pbar = ProgressBar(num_inference_steps)
         for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
diff --git a/hy3dgen/texgen/hunyuanpaint/pipeline.py b/hy3dgen/texgen/hunyuanpaint/pipeline.py
index 16920e6..a03e282 100755
--- a/hy3dgen/texgen/hunyuanpaint/pipeline.py
+++ b/hy3dgen/texgen/hunyuanpaint/pipeline.py
@@ -233,7 +233,52 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
             return (image,)
 
         return ImagePipelineOutput(images=image)
 
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, timesteps, num_inference_steps, latents=None, denoise_strength=1.0):
+        from diffusers.utils.torch_utils import randn_tensor
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        if latents is None:
+            latents = noise
+        elif denoise_strength < 1.0:
+            latents = latents.to(noise)
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size)
+            latents = self.vae.config.scaling_factor * latents
+
+            latents = self.scheduler.add_noise(latents, noise, latent_timestep)
+
+
+            #latents = latents * (1 - latent_timestep / 1000) + latent_timestep / 1000 * noise
+        else:
+            latents = latents.to(device)
+            latents = self.vae.config.scaling_factor * latents
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents.to(torch.float16), timesteps
+
     def denoise(
         self,
         prompt: Union[str, List[str]] = None,
@@ -261,6 +306,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        denoise_strength: Optional[float] = None,
         **kwargs,
     ):
         r"""
@@ -439,10 +485,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler, num_inference_steps, device, timesteps, sigmas
         )
+        print("timesteps", timesteps)
         assert num_images_per_prompt == 1
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(
+
+        latents, timesteps = self.prepare_latents(
            batch_size * kwargs['num_in_batch'],  # num_images_per_prompt,
            num_channels_latents,
            height,
@@ -450,8 +498,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
            prompt_embeds.dtype,
            device,
            generator,
+           timesteps,
+           num_inference_steps,
            latents,
+           denoise_strength=denoise_strength,
         )
+        print("timesteps", timesteps)
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -462,7 +514,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
             if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
             else None
         )
-        
+
         # 6.2 Optionally get Guidance Scale Embedding
         timestep_cond = None
         if self.unet.config.time_cond_proj_dim is not None:
diff --git a/nodes.py b/nodes.py
index 8d9320e..cd2aae8 100644
--- a/nodes.py
+++ b/nodes.py
@@ -692,6 +692,8 @@ class Hy3DSampleMultiView:
             "optional": {
                 "camera_config": ("HY3DCAMERA",),
                 "scheduler": ("NOISESCHEDULER",),
+                "denoise_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
+                "samples": ("LATENT", ),
             }
         }
 
@@ -700,7 +702,8 @@ class Hy3DSampleMultiView:
     FUNCTION = "process"
     CATEGORY = "Hunyuan3DWrapper"
 
-    def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps, camera_config=None, scheduler=None):
+    def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps,
+                camera_config=None, scheduler=None, denoise_strength=1.0, samples=None):
         device = mm.get_torch_device()
         mm.soft_empty_cache()
         torch.manual_seed(seed)
@@ -720,7 +723,7 @@ class Hy3DSampleMultiView:
         camera_info = [(((azim // 30) + 9) % 12) // {-90: 3, -45: 2, -20: 1, 0: 1, 20: 1, 45: 2, 90: 3}[
             elev] + {-90: 36, -45: 30, -20: 0, 0: 12, 20: 24, 45: 30, 90: 40}[elev] for azim, elev in
                        zip(selected_camera_azims, selected_camera_elevs)]
-        print(camera_info)
+        #print(camera_info)
 
         normal_maps_np = (normal_maps * 255).to(torch.uint8).cpu().numpy()
         normal_maps_pil = [Image.fromarray(normal_map) for normal_map in normal_maps_np]
@@ -754,6 +757,7 @@ class Hy3DSampleMultiView:
             width=view_size,
             height=view_size,
             generator=generator,
+            latents=samples["samples"] if samples is not None else None,
             num_in_batch = num_view,
             camera_info_gen = [camera_info],
             camera_info_ref = [[0]],
@@ -762,7 +766,8 @@ class Hy3DSampleMultiView:
             num_inference_steps=steps,
             output_type="pt",
             callback_on_step_end=callback,
-            callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"]
+            callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"],
+            denoise_strength=denoise_strength
         ).images
 
         out_tensors = multiview_images.permute(0, 2, 3, 1).cpu().float()
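
Below is a minimal, self-contained sketch of the guidance-distillation path the hunyuan3ddit.py hunks add: the classifier-free guidance scale is embedded with the same sinusoidal scheme as the timestep, pushed through an extra MLP (the new guidance_in, whose "guidance_in.in_layer.bias" checkpoint key is what pipelines.py tests for), and added to the modulation vector. It assumes only PyTorch; embed_sinusoidal, ToyMLPEmbedder and the hidden size are illustrative placeholders, not the repo's actual modules.

import math
import torch
import torch.nn as nn

def embed_sinusoidal(x: torch.Tensor, dim: int = 256, factor: float = 1000.0) -> torch.Tensor:
    # Sinusoidal embedding of one scalar per batch element (Flux/Hunyuan-style timestep_embedding).
    half = dim // 2
    freqs = torch.exp(-math.log(10_000) * torch.arange(half, dtype=torch.float32, device=x.device) / half)
    args = factor * x.float()[:, None] * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

class ToyMLPEmbedder(nn.Module):
    # Two-layer MLP playing the role of the repo's MLPEmbedder (in_layer -> SiLU -> out_layer).
    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim)
        self.act = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.out_layer(self.act(self.in_layer(x)))

hidden_size = 1024                    # placeholder width
time_in = ToyMLPEmbedder(256, hidden_size)
guidance_in = ToyMLPEmbedder(256, hidden_size)

t = torch.full((2,), 0.5)             # normalized timestep, batch of 2
guidance = torch.full((2,), 5.0)      # CFG scale the distilled model expects as an input
vec = time_in(embed_sinusoidal(t))
vec = vec + guidance_in(embed_sinusoidal(guidance))   # mirrors the added lines in forward()
print(vec.shape)                      # torch.Size([2, 1024])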
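
The denoise_strength path added to HunyuanPaintPipeline (and exposed on the Hy3DSampleMultiView node together with the optional "samples" latent input) is a standard img2img-style partial denoise: when input latents are supplied and strength < 1.0, the sampling schedule is truncated to its last strength fraction and the latents are noised to the first remaining timestep instead of starting from pure noise. A standalone sketch of that logic follows, assuming diffusers with a DDPMScheduler purely for illustration; the latent shape, step count and seed are placeholders.

import torch
from diffusers import DDPMScheduler
from diffusers.utils.torch_utils import randn_tensor

scheduler = DDPMScheduler(num_train_timesteps=1000)
num_inference_steps, strength = 30, 0.6
scheduler.set_timesteps(num_inference_steps)

# Same truncation as the new get_timesteps(): drop the first (1 - strength) share of the steps.
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = scheduler.timesteps[t_start * scheduler.order:]

# Same idea as prepare_latents() with denoise_strength < 1.0: noise the incoming latents
# to the first kept timestep rather than sampling pure noise.
latents = torch.zeros(1, 4, 64, 64)   # stand-in for already VAE-scaled input latents
noise = randn_tensor(latents.shape, generator=torch.Generator().manual_seed(0))
latents = scheduler.add_noise(latents, noise, timesteps[:1])

print(f"{len(timesteps)} of {num_inference_steps} steps kept, starting at t={int(timesteps[0])}")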