kijai 2025-02-03 14:39:28 +02:00
parent 26a57e86b0
commit 046bb97fac
6 changed files with 79 additions and 6 deletions

View File

@@ -11,6 +11,7 @@ model:
axes_dim: [ 64 ]
theta: 10000
qkv_bias: True
guidance_embed: False
vae:
target: .hy3dgen.shapegen.models.ShapeVAE
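
The new guidance_embed: False key sits under the model's params and is forwarded straight into the DiT constructor by the config loader. A minimal sketch of that mapping, assuming the usual latent-diffusion-style instantiate_from_config helper (the wrapper's own helper may differ in details), with a stand-in target so the snippet runs:

import importlib

def instantiate_from_config(config: dict):
    # Resolve "package.module.Class" from the target string, then call it with the YAML params.
    module_name, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**config.get("params", {}))

model_cfg = {
    "target": "collections.OrderedDict",      # stand-in target so the sketch runs
    "params": {"guidance_embed": False},      # the flag added in this commit
}
obj = instantiate_from_config(model_cfg)      # OrderedDict([('guidance_embed', False)])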

View File

@@ -78,7 +78,7 @@ class ImageEncoder(nn.Module):
mask = mask.to(image)
image = image * mask
inputs = self.transform(image)
inputs = image
outputs = self.model(inputs)
last_hidden_state = outputs.last_hidden_state

View File

@@ -306,6 +306,7 @@ class Hunyuan3DDiT(nn.Module):
axes_dim: List[int] = [64],
theta: int = 10_000,
qkv_bias: bool = True,
guidance_embed: bool = False,
time_factor: float = 1000,
ckpt_path: Optional[str] = None,
attention_mode: str = "sdpa",
@@ -325,6 +326,7 @@
self.time_factor = time_factor
self.out_channels = self.in_channels
self.attention_mode = attention_mode
self.guidance_embed = guidance_embed
if hidden_size % num_heads != 0:
raise ValueError(
@@ -338,6 +340,9 @@ class Hunyuan3DDiT(nn.Module):
self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
self.guidance_in = (
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else nn.Identity()
)
self.double_blocks = nn.ModuleList(
[
@@ -401,6 +406,11 @@ class Hunyuan3DDiT(nn.Module):
cond = contexts['main']
latent = self.latent_in(x)
vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
if self.guidance_embed:
guidance = kwargs.get('guidance', None)
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.time_factor))
cond = self.cond_in(cond)
pe = None
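
For reference, a standalone sketch of what the added guidance path computes: the scalar guidance scale is run through the same sinusoidal embedding as the timestep, and the MLP output is added to the modulation vector vec. Helper names, the hidden size, and the batch values below are illustrative, not the wrapper's exact internals.

import math
import torch
import torch.nn as nn

def sinusoidal_embedding(t: torch.Tensor, dim: int = 256, factor: float = 1000.0) -> torch.Tensor:
    # DiT/flux-style embedding of one scalar per batch element into `dim` channels.
    half = dim // 2
    freqs = torch.exp(-math.log(10000.0) * torch.arange(half, dtype=torch.float32, device=t.device) / half)
    args = factor * t[:, None].float() * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

class MLPEmbedder(nn.Module):
    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def forward(self, x):
        return self.out_layer(self.silu(self.in_layer(x)))

hidden_size = 1024                       # illustrative; the real model uses its config value
time_in = MLPEmbedder(256, hidden_size)
guidance_in = MLPEmbedder(256, hidden_size)

t = torch.rand(2)                        # flow-matching timesteps in [0, 1]
guidance = torch.tensor([5.0, 5.0])      # per-sample guidance scale
vec = time_in(sinusoidal_embedding(t))
vec = vec + guidance_in(sinusoidal_embedding(guidance))   # extra conditioning term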

View File

@@ -183,8 +183,12 @@ class Hunyuan3DDiTPipeline:
# load model
if "guidance_in.in_layer.bias" in ckpt['model']: #guidance_in.in_layer.bias
logger.info("Model has guidance_in, setting guidance_embed to True")
config['model']['params']['guidance_embed'] = True
config['model']['params']['attention_mode'] = attention_mode
config['vae']['params']['attention_mode'] = attention_mode
with init_empty_weights():
model = instantiate_from_config(config['model'])
vae = instantiate_from_config(config['vae'])
@@ -603,6 +607,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
if hasattr(self.model, 'guidance_embed') and \
self.model.guidance_embed is True:
guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)
print("guidance: ", guidance)
comfy_pbar = ProgressBar(num_inference_steps)
for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
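
Taken together, the loader and sampler changes amount to: detect a distilled checkpoint by the presence of its guidance MLP weights, then hand the model a per-sample guidance tensor at every step. A rough sketch under that reading (the state-dict key mirrors the check above; everything else is illustrative):

import torch

def wants_guidance_embed(state_dict: dict) -> bool:
    # A guidance-distilled DiT ships weights for guidance_in; a vanilla one does not.
    return "guidance_in.in_layer.bias" in state_dict

fake_ckpt = {"guidance_in.in_layer.bias": torch.zeros(1024)}   # stand-in checkpoint
guidance_embed = wants_guidance_embed(fake_ckpt)               # -> True

batch_size, guidance_scale = 2, 5.0
guidance = torch.tensor([guidance_scale] * batch_size, dtype=torch.float16)
# model(latents, t, contexts, guidance=guidance)   # only consumed when guidance_embed is True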

View File

@@ -233,7 +233,52 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
return (image,)
return ImagePipelineOutput(images=image)
def get_timesteps(self, num_inference_steps, strength, device):
# get the original timestep using init_timestep
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
if hasattr(self.scheduler, "set_begin_index"):
self.scheduler.set_begin_index(t_start * self.scheduler.order)
return timesteps, num_inference_steps - t_start
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, timesteps, num_inference_steps, latents=None, denoise_strength=1.0):
from diffusers.utils.torch_utils import randn_tensor
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
if latents is None:
latents = noise
elif denoise_strength < 1.0:
latents = latents.to(noise)
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
latent_timestep = timesteps[:1].repeat(batch_size)
latents = self.vae.config.scaling_factor * latents
latents = self.scheduler.add_noise(latents, noise, latent_timestep)
#latents = latents * (1 - latent_timestep / 1000) + latent_timestep / 1000 * noise
else:
latents = latents.to(device)
latents = self.vae.config.scaling_factor * latents
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
return latents.to(torch.float16), timesteps
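
The two methods above implement the familiar img2img recipe: drop the first (1 - strength) fraction of the schedule, then noise the supplied latents up to the first timestep that remains. A minimal sketch against a plain diffusers DDPMScheduler (the pipeline's real scheduler, dtype, and latent shape will differ):

import torch
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(num_train_timesteps=1000)
num_inference_steps, strength = 30, 0.6
scheduler.set_timesteps(num_inference_steps)

# Same arithmetic as get_timesteps(): a later start means less added noise,
# so the result stays closer to the input latents.
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = scheduler.timesteps[t_start * scheduler.order:]

latents = torch.randn(1, 4, 64, 64)            # stand-in for encoded input views
noise = torch.randn_like(latents)
latents = scheduler.add_noise(latents, noise, timesteps[:1])   # noise up to the start level
print(len(timesteps), "of", num_inference_steps, "steps remain")   # 18 of 30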
def denoise(
self,
prompt: Union[str, List[str]] = None,
@@ -261,6 +306,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
denoise_strength: Optional[float] = None,
**kwargs,
):
r"""
@@ -439,10 +485,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
timesteps, num_inference_steps = retrieve_timesteps(
self.scheduler, num_inference_steps, device, timesteps, sigmas
)
print("timesteps", timesteps)
assert num_images_per_prompt == 1
# 5. Prepare latent variables
num_channels_latents = self.unet.config.in_channels
latents = self.prepare_latents(
latents, timesteps = self.prepare_latents(
batch_size * kwargs['num_in_batch'], # num_images_per_prompt,
num_channels_latents,
height,
@@ -450,8 +498,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
prompt_embeds.dtype,
device,
generator,
timesteps,
num_inference_steps,
latents,
denoise_strength=denoise_strength,
)
print("timesteps", timesteps)
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -462,7 +514,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
else None
)
# 6.2 Optionally get Guidance Scale Embedding
timestep_cond = None
if self.unet.config.time_cond_proj_dim is not None:

View File

@@ -692,6 +692,8 @@ class Hy3DSampleMultiView:
"optional": {
"camera_config": ("HY3DCAMERA",),
"scheduler": ("NOISESCHEDULER",),
"denoise_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"samples": ("LATENT", ),
}
}
@@ -700,7 +702,8 @@ class Hy3DSampleMultiView:
FUNCTION = "process"
CATEGORY = "Hunyuan3DWrapper"
def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps, camera_config=None, scheduler=None):
def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps,
camera_config=None, scheduler=None, denoise_strength=1.0, samples=None):
device = mm.get_torch_device()
mm.soft_empty_cache()
torch.manual_seed(seed)
@@ -720,7 +723,7 @@ class Hy3DSampleMultiView:
camera_info = [(((azim // 30) + 9) % 12) // {-90: 3, -45: 2, -20: 1, 0: 1, 20: 1, 45: 2, 90: 3}[
elev] + {-90: 36, -45: 30, -20: 0, 0: 12, 20: 24, 45: 30, 90: 40}[elev] for azim, elev in
zip(selected_camera_azims, selected_camera_elevs)]
print(camera_info)
#print(camera_info)
normal_maps_np = (normal_maps * 255).to(torch.uint8).cpu().numpy()
normal_maps_pil = [Image.fromarray(normal_map) for normal_map in normal_maps_np]
@@ -754,6 +757,7 @@ class Hy3DSampleMultiView:
width=view_size,
height=view_size,
generator=generator,
latents=samples["samples"] if samples is not None else None,
num_in_batch = num_view,
camera_info_gen = [camera_info],
camera_info_ref = [[0]],
@@ -762,7 +766,8 @@ class Hy3DSampleMultiView:
num_inference_steps=steps,
output_type="pt",
callback_on_step_end=callback,
callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"]
callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"],
denoise_strength=denoise_strength
).images
out_tensors = multiview_images.permute(0, 2, 3, 1).cpu().float()
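
In node terms the two new optional inputs turn this into a latent-to-latent pass: wire the LATENT output of a previous sampling run into samples and lower denoise_strength. A small sketch of how those inputs are unpacked into the pipeline call (the latent shape and variable names here are assumptions):

import torch

samples = {"samples": torch.randn(6, 4, 64, 64)}   # LATENT dict from a previous multiview pass
denoise_strength = 0.5                              # < 1.0 keeps structure from the old latents

pipeline_kwargs = dict(
    latents=samples["samples"] if samples is not None else None,
    denoise_strength=denoise_strength,
)
# pipeline(prompt, ..., num_inference_steps=steps, **pipeline_kwargs)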