kijai 2025-02-03 14:39:28 +02:00
parent 26a57e86b0
commit 046bb97fac
6 changed files with 79 additions and 6 deletions

View File

@@ -11,6 +11,7 @@ model:
axes_dim: [ 64 ]
theta: 10000
qkv_bias: True
guidance_embed: False
vae:
target: .hy3dgen.shapegen.models.ShapeVAE
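
The new guidance_embed: False key sits under the model's params and is forwarded straight into the DiT constructor by the config loader. A minimal sketch of that mapping, assuming the usual latent-diffusion-style instantiate_from_config helper (the wrapper's own helper may differ in details), with a stand-in target so the snippet runs:

import importlib

def instantiate_from_config(config: dict):
    # Resolve "package.module.Class" from the target string, then call it with the YAML params.
    module_name, cls_name = config["target"].rsplit(".", 1)
    cls = getattr(importlib.import_module(module_name), cls_name)
    return cls(**config.get("params", {}))

model_cfg = {
    "target": "collections.OrderedDict",      # stand-in target so the sketch runs
    "params": {"guidance_embed": False},      # the flag added in this commit
}
obj = instantiate_from_config(model_cfg)      # OrderedDict([('guidance_embed', False)])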

View File

@@ -78,7 +78,7 @@ class ImageEncoder(nn.Module):
mask = mask.to(image)
image = image * mask
inputs = self.transform(image)
inputs = image
outputs = self.model(inputs)
last_hidden_state = outputs.last_hidden_state

View File

@@ -306,6 +306,7 @@ class Hunyuan3DDiT(nn.Module):
axes_dim: List[int] = [64],
theta: int = 10_000,
qkv_bias: bool = True,
guidance_embed: bool = False,
time_factor: float = 1000,
ckpt_path: Optional[str] = None,
attention_mode: str = "sdpa",
@@ -325,6 +326,7 @@
self.time_factor = time_factor
self.out_channels = self.in_channels
self.attention_mode = attention_mode
self.guidance_embed = guidance_embed
if hidden_size % num_heads != 0:
raise ValueError(
@@ -338,6 +340,9 @@ class Hunyuan3DDiT(nn.Module):
self.latent_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
self.cond_in = nn.Linear(context_in_dim, self.hidden_size)
self.guidance_in = (
MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if guidance_embed else nn.Identity()
)
self.double_blocks = nn.ModuleList(
[
@@ -401,6 +406,11 @@ class Hunyuan3DDiT(nn.Module):
cond = contexts['main']
latent = self.latent_in(x)
vec = self.time_in(timestep_embedding(t, 256, self.time_factor).to(dtype=latent.dtype))
if self.guidance_embed:
guidance = kwargs.get('guidance', None)
if guidance is None:
raise ValueError("Didn't get guidance strength for guidance distilled model.")
vec = vec + self.guidance_in(timestep_embedding(guidance, 256, self.time_factor))
cond = self.cond_in(cond)
pe = None
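
For reference, a standalone sketch of what the added guidance path computes: the scalar guidance scale is run through the same sinusoidal embedding as the timestep, and the MLP output is added to the modulation vector vec. Helper names, the hidden size, and the batch values below are illustrative, not the wrapper's exact internals.

import math
import torch
import torch.nn as nn

def sinusoidal_embedding(t: torch.Tensor, dim: int = 256, factor: float = 1000.0) -> torch.Tensor:
    # DiT/flux-style embedding of one scalar per batch element into `dim` channels.
    half = dim // 2
    freqs = torch.exp(-math.log(10000.0) * torch.arange(half, dtype=torch.float32, device=t.device) / half)
    args = factor * t[:, None].float() * freqs[None]
    return torch.cat([torch.cos(args), torch.sin(args)], dim=-1)

class MLPEmbedder(nn.Module):
    def __init__(self, in_dim: int, hidden_dim: int):
        super().__init__()
        self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
        self.silu = nn.SiLU()
        self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)

    def forward(self, x):
        return self.out_layer(self.silu(self.in_layer(x)))

hidden_size = 1024                       # illustrative; the real model uses its config value
time_in = MLPEmbedder(256, hidden_size)
guidance_in = MLPEmbedder(256, hidden_size)

t = torch.rand(2)                        # flow-matching timesteps in [0, 1]
guidance = torch.tensor([5.0, 5.0])      # per-sample guidance scale
vec = time_in(sinusoidal_embedding(t))
vec = vec + guidance_in(sinusoidal_embedding(guidance))   # extra conditioning term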

View File

@@ -183,8 +183,12 @@ class Hunyuan3DDiTPipeline:
# load model
if "guidance_in.in_layer.bias" in ckpt['model']: #guidance_in.in_layer.bias
logger.info("Model has guidance_in, setting guidance_embed to True")
config['model']['params']['guidance_embed'] = True
config['model']['params']['attention_mode'] = attention_mode
config['vae']['params']['attention_mode'] = attention_mode
with init_empty_weights():
model = instantiate_from_config(config['model'])
vae = instantiate_from_config(config['vae'])
@@ -603,6 +607,7 @@ class Hunyuan3DDiTFlowMatchingPipeline(Hunyuan3DDiTPipeline):
if hasattr(self.model, 'guidance_embed') and \
self.model.guidance_embed is True:
guidance = torch.tensor([guidance_scale] * batch_size, device=device, dtype=dtype)
print("guidance: ", guidance)
comfy_pbar = ProgressBar(num_inference_steps)
for i, t in enumerate(tqdm(timesteps, disable=not enable_pbar, desc="Diffusion Sampling:")):
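
Taken together, the loader and sampler changes amount to: detect a distilled checkpoint by the presence of its guidance MLP weights, then hand the model a per-sample guidance tensor at every step. A rough sketch under that reading (the state-dict key mirrors the check above; everything else is illustrative):

import torch

def wants_guidance_embed(state_dict: dict) -> bool:
    # A guidance-distilled DiT ships weights for guidance_in; a vanilla one does not.
    return "guidance_in.in_layer.bias" in state_dict

fake_ckpt = {"guidance_in.in_layer.bias": torch.zeros(1024)}   # stand-in checkpoint
guidance_embed = wants_guidance_embed(fake_ckpt)               # -> True

batch_size, guidance_scale = 2, 5.0
guidance = torch.tensor([guidance_scale] * batch_size, dtype=torch.float16)
# model(latents, t, contexts, guidance=guidance)   # only consumed when guidance_embed is True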

View File

@@ -233,7 +233,52 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
return (image,)
return ImagePipelineOutput(images=image)
def get_timesteps(self, num_inference_steps, strength, device):
# get the original timestep using init_timestep
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
if hasattr(self.scheduler, "set_begin_index"):
self.scheduler.set_begin_index(t_start * self.scheduler.order)
return timesteps, num_inference_steps - t_start
def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, timesteps, num_inference_steps, latents=None, denoise_strength=1.0):
from diffusers.utils.torch_utils import randn_tensor
shape = (
batch_size,
num_channels_latents,
int(height) // self.vae_scale_factor,
int(width) // self.vae_scale_factor,
)
if isinstance(generator, list) and len(generator) != batch_size:
raise ValueError(
f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
if latents is None:
latents = noise
elif denoise_strength < 1.0:
latents = latents.to(noise)
timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, denoise_strength, device)
latent_timestep = timesteps[:1].repeat(batch_size)
latents = self.vae.config.scaling_factor * latents
latents = self.scheduler.add_noise(latents, noise, latent_timestep)
#latents = latents * (1 - latent_timestep / 1000) + latent_timestep / 1000 * noise
else:
latents = latents.to(device)
latents = self.vae.config.scaling_factor * latents
# scale the initial noise by the standard deviation required by the scheduler
latents = latents * self.scheduler.init_noise_sigma
return latents.to(torch.float16), timesteps
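
The two methods above implement the familiar img2img recipe: drop the first (1 - strength) fraction of the schedule, then noise the supplied latents up to the first timestep that remains. A minimal sketch against a plain diffusers DDPMScheduler (the pipeline's real scheduler, dtype, and latent shape will differ):

import torch
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(num_train_timesteps=1000)
num_inference_steps, strength = 30, 0.6
scheduler.set_timesteps(num_inference_steps)

# Same arithmetic as get_timesteps(): a later start means less added noise,
# so the result stays closer to the input latents.
init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
t_start = max(num_inference_steps - init_timestep, 0)
timesteps = scheduler.timesteps[t_start * scheduler.order:]

latents = torch.randn(1, 4, 64, 64)            # stand-in for encoded input views
noise = torch.randn_like(latents)
latents = scheduler.add_noise(latents, noise, timesteps[:1])   # noise up to the start level
print(len(timesteps), "of", num_inference_steps, "steps remain")   # 18 of 30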
def denoise(
self,
prompt: Union[str, List[str]] = None,
@@ -261,6 +306,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
denoise_strength: Optional[float] = None,
**kwargs,
):
r"""
@@ -439,10 +485,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
timesteps, num_inference_steps = retrieve_timesteps(
self.scheduler, num_inference_steps, device, timesteps, sigmas
)
print("timesteps", timesteps)
assert num_images_per_prompt == 1
# 5. Prepare latent variables
num_channels_latents = self.unet.config.in_channels
latents = self.prepare_latents(
latents, timesteps = self.prepare_latents(
batch_size * kwargs['num_in_batch'], # num_images_per_prompt,
num_channels_latents,
height,
@@ -450,8 +498,12 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
prompt_embeds.dtype,
device,
generator,
timesteps,
num_inference_steps,
latents,
denoise_strength=denoise_strength,
)
print("timesteps", timesteps)
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
@@ -462,7 +514,7 @@ class HunyuanPaintPipeline(StableDiffusionPipeline):
if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
else None
)
# 6.2 Optionally get Guidance Scale Embedding
timestep_cond = None
if self.unet.config.time_cond_proj_dim is not None:

View File

@@ -692,6 +692,8 @@ class Hy3DSampleMultiView:
"optional": {
"camera_config": ("HY3DCAMERA",),
"scheduler": ("NOISESCHEDULER",),
"denoise_strength": ("FLOAT", {"default": 1.0, "min": 0.0, "max": 1.0, "step": 0.01}),
"samples": ("LATENT", ),
}
}
@@ -700,7 +702,8 @@ class Hy3DSampleMultiView:
FUNCTION = "process"
CATEGORY = "Hunyuan3DWrapper"
def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps, camera_config=None, scheduler=None):
def process(self, pipeline, ref_image, normal_maps, position_maps, view_size, seed, steps,
camera_config=None, scheduler=None, denoise_strength=1.0, samples=None):
device = mm.get_torch_device()
mm.soft_empty_cache()
torch.manual_seed(seed)
@@ -720,7 +723,7 @@ class Hy3DSampleMultiView:
camera_info = [(((azim // 30) + 9) % 12) // {-90: 3, -45: 2, -20: 1, 0: 1, 20: 1, 45: 2, 90: 3}[
elev] + {-90: 36, -45: 30, -20: 0, 0: 12, 20: 24, 45: 30, 90: 40}[elev] for azim, elev in
zip(selected_camera_azims, selected_camera_elevs)]
print(camera_info)
#print(camera_info)
normal_maps_np = (normal_maps * 255).to(torch.uint8).cpu().numpy()
normal_maps_pil = [Image.fromarray(normal_map) for normal_map in normal_maps_np]
@@ -754,6 +757,7 @@ class Hy3DSampleMultiView:
width=view_size,
height=view_size,
generator=generator,
latents=samples["samples"] if samples is not None else None,
num_in_batch = num_view,
camera_info_gen = [camera_info],
camera_info_ref = [[0]],
@@ -762,7 +766,8 @@ class Hy3DSampleMultiView:
num_inference_steps=steps,
output_type="pt",
callback_on_step_end=callback,
callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"]
callback_on_step_end_tensor_inputs=["latents", "prompt_embeds", "negative_prompt_embeds"],
denoise_strength=denoise_strength
).images
out_tensors = multiview_images.permute(0, 2, 3, 1).cpu().float()
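
In node terms the two new optional inputs turn this into a latent-to-latent pass: wire the LATENT output of a previous sampling run into samples and lower denoise_strength. A small sketch of how those inputs are unpacked into the pipeline call (the latent shape and variable names here are assumptions):

import torch

samples = {"samples": torch.randn(6, 4, 64, 64)}   # LATENT dict from a previous multiview pass
denoise_strength = 0.5                              # < 1.0 keeps structure from the old latents

pipeline_kwargs = dict(
    latents=samples["samples"] if samples is not None else None,
    denoise_strength=denoise_strength,
)
# pipeline(prompt, ..., num_inference_steps=steps, **pipeline_kwargs)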