https://git.datalinker.icu/kijai/ComfyUI-Hunyuan3DWrapper.git
Support fast model
https://huggingface.co/tencent/Hunyuan3D-2/tree/main/hunyuan3d-dit-v2-0-fast
parent 26a57e86b0 · commit 046bb97fac
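Background: the fast checkpoint is a guidance-distilled variant of the shape DiT. As the changes below show, the classifier-free guidance scale is fed to the model as an embedded input (the new guidance_embed / guidance_in path) rather than applied through a separate unconditional forward pass, which is what makes the distilled model cheaper per sampling step.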
@@ -11,6 +11,7 @@ model:
    axes_dim: [ 64 ]
    theta: 10000
    qkv_bias: True
+    guidance_embed: False

vae:
  target: .hy3dgen.shapegen.models.ShapeVAE
@@ -78,7 +78,7 @@ class ImageEncoder(nn.Module):
        mask = mask.to(image)
        image = image * mask

-        inputs = self.transform(image)
+        inputs = image
        outputs = self.model(inputs)

        last_hidden_state = outputs.last_hidden_state
@@ -306,6 +306,7 @@ class Hunyuan3DDiT(nn.Module):
        axes_dim: List[int] = [64],
        theta: int = 10_000,
        qkv_bias: bool = True,
+        guidance_embed: bool = False,
        time_factor: float = 1000,
        ckpt_path: Optional[str] = None,
        attention_mode: str = "sdpa",
@@ -325,6 +326,7 @@ class Hunyuan3DDiT(nn.Module):
        self.time_factor = time_factor
        self.out_channels = self.in_channels
        self.attention_mode = attention_mode
+        self.guidance_embed = guidance_embed

        if hidden_size % num_heads != 0:
            raise ValueError(
@@ -338,6 +340,9 @@ class Hunyuan3DDiT(nn.Module):
        self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
        self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
        self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
+        self.guidance_in = (
+            MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else nn.Identity()
+        )

        self.double_blocks = nn.ModuleList(
            [
@@ -401,6 +406,11 @@ class Hunyuan3DDiT(nn.Module):
        cond = contexts['main']
        latent = self.latent_in(x)
        vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
+        if self.guidance_embed:
+            guidance = kwargs.get('guidance', None)
+            if guidance is None:
+                raise ValueError("Didn't get guidance strength for guidance distilled model.")
+            vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.time_factor))
        cond = self.cond_in(cond)
        pe = None
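For context, a self-contained sketch of the conditioning path the hunk above adds. The timestep_embedding and MLPEmbedder below are simplified stand-ins for the module's own helpers, and the sizes are illustrative rather than taken from the shipped config:

import math
import torch
import torch.nn as nn
import torch.nn.functional as F

def timestep_embedding(t, dim, time_factor=1000.0):
    # Sinusoidal embedding of a per-sample scalar -> (batch, dim), as in DiT-style models.
    t = t * time_factor
    half = dim // 2
    freqs = torch.exp(-math.log(10_000.0) * torch.arange(half, dtype=torch.float32) / half)
    args = t[:, None].float() * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

class MLPEmbedder(nn.Module):
    # Two-layer MLP embedder; the checkpoint key "guidance_in.in_layer.bias" used below
    # corresponds to the in_layer of a module shaped like this.
    def __init__(self, in_dim, hidden_dim):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim)
        self.out_layer = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        return self.out_layer(F.silu(self.in_layer(x)))

# Illustrative dimensions, not the real config values:
hidden, batch = 1024, 2
time_in = MLPEmbedder(256, hidden)
guidance_in = MLPEmbedder(256, hidden)

t = torch.full((batch,), 0.7)          # current diffusion / flow-matching time
guidance = torch.full((batch,), 5.5)   # CFG scale, now passed as a model input
vec = time_in(timestep_embedding(t, 256))
vec = vec + guidance_in(timestep_embedding(guidance, 256))

The distilled model then conditions every block on vec, so a single forward pass already reflects the requested guidance strength.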
@@ -183,8 +183,12 @@ class Hunyuan3DDiTPipeline:

        # load model
+        if "guidance_in.in_layer.bias" in ckpt['model']: #guidance_in.in_layer.bias
+            logger.info("Model has guidance_in, setting guidance_embed to True")
+            config['model']['params']['guidance_embed'] = True
        config['model']['params']['attention_mode'] = attention_mode
        config['vae']['params']['attention_mode'] = attention_mode

        with init_empty_weights():
            model = instantiate_from_config(config['model'])
            vae = instantiate_from_config(config['vae'])
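The key test above is how distilled checkpoints are recognized: only the fast model carries guidance_in weights, so guidance_embed is switched on automatically. A hypothetical standalone version of the same test, assuming the checkpoint is a safetensors file (the function name is made up for illustration):

from safetensors import safe_open

def is_guidance_distilled(ckpt_path: str) -> bool:
    # Guidance-distilled (fast) DiT checkpoints contain guidance_in.* tensors;
    # regular checkpoints do not, so guidance_embed stays False for them.
    with safe_open(ckpt_path, framework="pt", device="cpu") as f:
        return "guidance_in.in_layer.bias" in f.keys()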
@@ -603,6 +607,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
        if hasattr(self.model, 'guidance_embed') and \
                self.model.guidance_embed is True:
            guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)
+            print("guidance: ", guidance)

        comfy_pbar = ProgressBar(num_inference_steps)
        for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
@@ -234,6 +234,51 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):

        return ImagePipelineOutput(images=image)

+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, timesteps, num_inference_steps, latents=None, denoise_strength=1.0):
+        from diffusers.utils.torch_utils import randn_tensor
+        shape = (
+            batch_size,
+            num_channels_latents,
+            int(height) // self.vae_scale_factor,
+            int(width) // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
+        if latents is None:
+            latents = noise
+        elif denoise_strength < 1.0:
+            latents = latents.to(noise)
+            timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
+            latent_timestep = timesteps[:1].repeat(batch_size)
+            latents = self.vae.config.scaling_factor * latents
+            latents = self.scheduler.add_noise(latents, noise, latent_timestep)
+            #latents = latents * (1 - latent_timestep / 1000) + latent_timestep / 1000 * noise
+        else:
+            latents = latents.to(device)
+            latents = self.vae.config.scaling_factor * latents
+
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents.to(torch.float16), timesteps
+
    def denoise(
        self,
        prompt: Union[str, List[str]] = None,
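The new prepare_latents follows the usual diffusers img2img convention: denoise_strength decides how far into the noise schedule the supplied latents are injected and how many steps are left to run. A small worked example of the arithmetic in get_timesteps (plain Python, values illustrative):

def steps_for_strength(num_inference_steps: int, strength: float, scheduler_order: int = 1):
    # Mirrors get_timesteps above: index of the first timestep used and the remaining step count.
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return t_start * scheduler_order, num_inference_steps - t_start

# 30 scheduled steps at denoise_strength=0.6: skip the first 12 timesteps,
# add noise at timesteps[12], and run the remaining 18 steps.
print(steps_for_strength(30, 0.6))  # -> (12, 18)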
@@ -261,6 +306,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        denoise_strength: Optional[float] = None,
        **kwargs,
    ):
        r"""
@@ -439,10 +485,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, timesteps, sigmas
        )
+        print("timesteps", timesteps)
        assert num_images_per_prompt == 1
        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
-        latents = self.prepare_latents(
+        latents, timesteps = self.prepare_latents(
            batch_size * kwargs['num_in_batch'], # num_images_per_prompt,
            num_channels_latents,
            height,
@@ -450,8 +498,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
            prompt_embeds.dtype,
            device,
            generator,
+            timesteps,
+            num_inference_steps,
            latents,
+            denoise_strength=denoise_strength,
        )
+        print("timesteps", timesteps)

        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
nodes.py
@@ -692,6 +692,8 @@ class Hy3DSampleMultiView:
            "optional": {
                "camera_config": ("HY3DCAMERA",),
                "scheduler": ("NOISESCHEDULER",),
+                "denoise_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
+                "samples": ("LATENT", ),
            }
        }
|
||||
@ -700,7 +702,8 @@ class Hy3DSampleMultiView:
|
||||
FUNCTION = "process"
|
||||
CATEGORY = "Hunyuan3DWrapper"
|
||||
|
||||
def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps, camera_config=None, scheduler=None):
|
||||
def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps,
|
||||
camera_config=None, scheduler=None, denoise_strength=1.0, samples=None):
|
||||
device = mm.get_torch_device()
|
||||
mm.soft_empty_cache()
|
||||
torch.manual_seed(seed)
|
||||
@@ -720,7 +723,7 @@ class Hy3DSampleMultiView:
        camera_info = [(((azim // 30) + 9) % 12) // {-90: 3, -45: 2, -20: 1, 0: 1, 20: 1, 45: 2, 90: 3}[
            elev] + {-90: 36, -45: 30, -20: 0, 0: 12, 20: 24, 45: 30, 90: 40}[elev] for azim, elev in
            zip(selected_camera_azims, selected_camera_elevs)]
-        print(camera_info)
+        #print(camera_info)

        normal_maps_np = (normal_maps * 255).to(torch.uint8).cpu().numpy()
        normal_maps_pil = [Image.fromarray(normal_map) for normal_map in normal_maps_np]
@@ -754,6 +757,7 @@ class Hy3DSampleMultiView:
            width=view_size,
            height=view_size,
            generator=generator,
+            latents=samples["samples"] if samples is not None else None,
            num_in_batch = num_view,
            camera_info_gen = [camera_info],
            camera_info_ref = [[0]],
@@ -762,7 +766,8 @@ class Hy3DSampleMultiView:
            num_inference_steps=steps,
            output_type="pt",
            callback_on_step_end=callback,
-            callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"]
+            callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"],
+            denoise_strength=denoise_strength
        ).images

        out_tensors = multiview_images.permute(0, 2, 3, 1).cpu().float()
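Taken together, the new optional inputs let Hy3DSampleMultiView act as an img2img-style node: connect previously generated multiview latents to samples and set denoise_strength below 1.0 to re-noise them partway into the schedule and run only the remaining steps, while leaving denoise_strength at 1.0 keeps the original full-denoise behaviour.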