From 046bb97fac9c185ebd9b2d3fac1a94afd2eea2e5 Mon Sep 17 00:00:00 2001
From: kijai <40791699+kijai@users.noreply.github.com>
Date: Mon, 3 Feb 2025 14:39:28 +0200
Subject: [PATCH] Support fast model
 https://huggingface.co/tencent/Hunyuan3D-2/tree/main/hunyuan3d-dit-v2-0-fast

---
 configs/dit_config.yaml                 |  1 +
 hy3dgen/shapegen/models/conditioner.py  |  2 +-
 hy3dgen/shapegen/models/hunyuan3ddit.py | 10 +++++
 hy3dgen/shapegen/pipelines.py           |  5 +++
 hy3dgen/texgen/hunyuanpaint/pipeline.py | 56 ++++++++++++++++++++++++-
 nodes.py                                | 11 +++--
 6 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/configs/dit_config.yaml b/configs/dit_config.yaml
index 4f685de..001e8a2 100755
--- a/configs/dit_config.yaml
+++ b/configs/dit_config.yaml
@@ -11,6 +11,7 @@ model:
     axes_dim: [ 64 ]
     theta: 10000
     qkv_bias: True
+    guidance_embed: False
 
 vae:
   target: .hy3dgen.shapegen.models.ShapeVAE
diff --git a/hy3dgen/shapegen/models/conditioner.py b/hy3dgen/shapegen/models/conditioner.py
index 3616fca..e7841ca 100755
--- a/hy3dgen/shapegen/models/conditioner.py
+++ b/hy3dgen/shapegen/models/conditioner.py
@@ -78,7 +78,7 @@ class ImageEncoder(nn.Module):
             mask = mask.to(image)
             image = image * mask
 
-        inputs = self.transform(image)
+        inputs = image
         outputs = self.model(inputs)
 
         last_hidden_state = outputs.last_hidden_state
diff --git a/hy3dgen/shapegen/models/hunyuan3ddit.py b/hy3dgen/shapegen/models/hunyuan3ddit.py
index d2fd277..a128659 100755
--- a/hy3dgen/shapegen/models/hunyuan3ddit.py
+++ b/hy3dgen/shapegen/models/hunyuan3ddit.py
@@ -306,6 +306,7 @@ class Hunyuan3DDiT(nn.Module):
         axes_dim: List[int] = [64],
         theta: int = 10_000,
         qkv_bias: bool = True,
+        guidance_embed: bool = False,
         time_factor: float = 1000,
         ckpt_path: Optional[str] = None,
         attention_mode: str = "sdpa",
@@ -325,6 +326,7 @@ class Hunyuan3DDiT(nn.Module):
         self.time_factor = time_factor
         self.out_channels = self.in_channels
         self.attention_mode = attention_mode
+        self.guidance_embed = guidance_embed
 
         if hidden_size % num_heads != 0:
             raise ValueError(
@@ -338,6 +340,9 @@ class Hunyuan3DDiT(nn.Module):
         self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
         self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
         self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else nn.Identity()
+        )
 
         self.double_blocks = nn.ModuleList(
             [
@@ -401,6 +406,11 @@ class Hunyuan3DDiT(nn.Module):
         cond = contexts['main']
         latent = self.latent_in(x)
         vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
+        if self.guidance_embed:
+            guidance = kwargs.get('guidance', None)
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.time_factor))
         cond = self.cond_in(cond)
 
         pe = None
diff --git a/hy3dgen/shapegen/pipelines.py b/hy3dgen/shapegen/pipelines.py
index dc706c7..8c3f856 100755
--- a/hy3dgen/shapegen/pipelines.py
+++ b/hy3dgen/shapegen/pipelines.py
@@ -183,8 +183,12 @@ class Hunyuan3DDiTPipeline:
 
 
         # load model
+        if "guidance_in.in_layer.bias" in ckpt['model']: #guidance_in.in_layer.bias
+            logger.info("Model has guidance_in, setting guidance_embed to True")
+            config['model']['params']['guidance_embed'] = True
         config['model']['params']['attention_mode'] = attention_mode
         config['vae']['params']['attention_mode'] = attention_mode
+
         with init_empty_weights():
             model = instantiate_from_config(config['model'])
             vae = instantiate_from_config(config['vae'])
@@ -603,6 +607,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
         if hasattr(self.model, 'guidance_embed') and \
                 self.model.guidance_embed is True:
             guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)
+            print("guidance: ", guidance)
 
         comfy_pbar = ProgressBar(num_inference_steps)
         for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
diff --git a/hy3dgen/texgen/hunyuanpaint/pipeline.py b/hy3dgen/texgen/hunyuanpaint/pipeline.py
index 16920e6..a03e282 100755
--- a/hy3dgen/texgen/hunyuanpaint/pipeline.py
+++ b/hy3dgen/texgen/hunyuanpaint/pipeline.py
@@ -233,7 +233,52 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
             return (image,)
 
         return ImagePipelineOutput(images=image)
 
+
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, timesteps, num_inference_steps, latents=None, denoise_strength=1.0):
+        from diffusers.utils.torch_utils import randn_tensor
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        if latents is None:
+            latents = noise
+        elif denoise_strength < 1.0:
+            latents = latents.to(noise)
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size)
+            latents = self.vae.config.scaling_factor * latents
+
+            latents = self.scheduler.add_noise(latents, noise, latent_timestep)
+
+
+            #latents = latents * (1 - latent_timestep / 1000) + latent_timestep / 1000 * noise
+        else:
+            latents = latents.to(device)
+            latents = self.vae.config.scaling_factor * latents
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents.to(torch.float16), timesteps
+
     def denoise(
         self,
         prompt: Union[str, List[str]] = None,
@@ -261,6 +306,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
             Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
         ] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        denoise_strength: Optional[float] = None,
         **kwargs,
     ):
         r"""
@@ -439,10 +485,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
         timesteps, num_inference_steps = retrieve_timesteps(
             self.scheduler, num_inference_steps, device, timesteps, sigmas
         )
+        print("timesteps", timesteps)
         assert num_images_per_prompt == 1
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(
+
+        latents, timesteps = self.prepare_latents(
            batch_size * kwargs['num_in_batch'],  # num_images_per_prompt,
            num_channels_latents,
            height,
@@ -450,8 +498,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
            prompt_embeds.dtype,
            device,
            generator,
+           timesteps,
+           num_inference_steps,
            latents,
+           denoise_strength=denoise_strength,
         )
+        print("timesteps", timesteps)
 
         # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
         extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -462,7 +514,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
             if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
             else None
         )
-        
+
         # 6.2 Optionally get Guidance Scale Embedding
         timestep_cond = None
         if self.unet.config.time_cond_proj_dim is not None:
diff --git a/nodes.py b/nodes.py
index 8d9320e..cd2aae8 100644
--- a/nodes.py
+++ b/nodes.py
@@ -692,6 +692,8 @@ class Hy3DSampleMultiView:
             "optional": {
                 "camera_config": ("HY3DCAMERA",),
                 "scheduler": ("NOISESCHEDULER",),
+                "denoise_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
+                "samples": ("LATENT", ),
             }
         }
 
@@ -700,7 +702,8 @@ class Hy3DSampleMultiView:
     FUNCTION = "process"
     CATEGORY = "Hunyuan3DWrapper"
 
-    def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps, camera_config=None, scheduler=None):
+    def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps,
+                camera_config=None, scheduler=None, denoise_strength=1.0, samples=None):
         device = mm.get_torch_device()
         mm.soft_empty_cache()
         torch.manual_seed(seed)
@@ -720,7 +723,7 @@ class Hy3DSampleMultiView:
         camera_info = [(((azim // 30) + 9) % 12) // {-90: 3, -45: 2, -20: 1, 0: 1, 20: 1, 45: 2, 90: 3}[
             elev] + {-90: 36, -45: 30, -20: 0, 0: 12, 20: 24, 45: 30, 90: 40}[elev] for azim, elev in
                        zip(selected_camera_azims, selected_camera_elevs)]
-        print(camera_info)
+        #print(camera_info)
 
         normal_maps_np = (normal_maps * 255).to(torch.uint8).cpu().numpy()
         normal_maps_pil = [Image.fromarray(normal_map) for normal_map in normal_maps_np]
@@ -754,6 +757,7 @@ class Hy3DSampleMultiView:
             width=view_size,
             height=view_size,
             generator=generator,
+            latents=samples["samples"] if samples is not None else None,
             num_in_batch = num_view,
             camera_info_gen = [camera_info],
             camera_info_ref = [[0]],
@@ -762,7 +766,8 @@ class Hy3DSampleMultiView:
             num_inference_steps=steps,
             output_type="pt",
             callback_on_step_end=callback,
-            callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"]
+            callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"],
+            denoise_strength=denoise_strength
         ).images
 
         out_tensors = multiview_images.permute(0, 2, 3, 1).cpu().float()
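
Below is a minimal, self-contained sketch of the guidance-distillation path the hunyuan3ddit.py hunks add: the classifier-free guidance scale is embedded with the same sinusoidal scheme as the timestep, pushed through an extra MLP (the new guidance_in, whose "guidance_in.in_layer.bias" checkpoint key is what pipelines.py tests for), and added to the modulation vector. It assumes only PyTorch; embed_sinusoidal, ToyMLPEmbedder and the hidden size are illustrative placeholders, not the repo's actual modules.

import math
import torch
import torch.nn as nn

def embed_sinusoidal(x: torch.Tensor, dim: int = 256, factor: float = 1000.0) -> torch.Tensor:
    # Sinusoidal embedding of one scalar per batch element (Flux/Hunyuan-style timestep_embedding).
    half = dim // 2
    freqs = torch.exp(-math.log(10_000) * torch.arange(half, dtype=torch.float32, device=x.device) / half)
    args = factor * x.float()[:, None] * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

class ToyMLPEmbedder(nn.Module):
    # Two-layer MLP playing the role of the repo's MLPEmbedder (in_layer -> SiLU -> out_layer).
    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim)
        self.act = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.out_layer(self.act(self.in_layer(x)))

hidden_size = 1024                    # placeholder width
time_in = ToyMLPEmbedder(256, hidden_size)
guidance_in = ToyMLPEmbedder(256, hidden_size)

t = torch.full((2,), 0.5)             # normalized timestep, batch of 2
guidance = torch.full((2,), 5.0)      # CFG scale the distilled model expects as an input
vec = time_in(embed_sinusoidal(t))
vec = vec + guidance_in(embed_sinusoidal(guidance))   # mirrors the added lines in forward()
print(vec.shape)                      # torch.Size([2, 1024])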
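
The denoise_strength path added to HunyuanPaintPipeline (and exposed on the Hy3DSampleMultiView node together with the optional "samples" latent input) is a standard img2img-style partial denoise: when input latents are supplied and strength < 1.0, the sampling schedule is truncated to its last strength fraction and the latents are noised to the first remaining timestep instead of starting from pure noise. A standalone sketch of that logic follows, assuming diffusers with a DDPMScheduler purely for illustration; the latent shape, step count and seed are placeholders.

import torch
from diffusers import DDPMScheduler
from diffusers.utils.torch_utils import randn_tensor

scheduler = DDPMScheduler(num_train_timesteps=1000)
num_inference_steps, strength = 30, 0.6
scheduler.set_timesteps(num_inference_steps)

# Same truncation as the new get_timesteps(): drop the first (1 - strength) share of the steps.
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = scheduler.timesteps[t_start * scheduler.order:]

# Same idea as prepare_latents() with denoise_strength < 1.0: noise the incoming latents
# to the first kept timestep rather than sampling pure noise.
latents = torch.zeros(1, 4, 64, 64)   # stand-in for already VAE-scaled input latents
noise = randn_tensor(latents.shape, generator=torch.Generator().manual_seed(0))
latents = scheduler.add_noise(latents, noise, timesteps[:1])

print(f"{len(timesteps)} of {num_inference_steps} steps kept, starting at t={int(timesteps[0])}")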