diff --git a/pipeline_cogvideox.py b/pipeline_cogvideox.py
index c4ec8a1..d91e2b4 100644
--- a/pipeline_cogvideox.py
+++ b/pipeline_cogvideox.py
@@ -822,7 +822,8 @@ class CogVideoXPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
                     fun_inpaint_masked_video_latents = torch.cat([fun_masked_video_latents] * 3)
                 else:
                     fun_inpaint_masked_video_latents = fun_masked_video_latents
-
+
+                fun_inpaint_latents = torch.cat([fun_inpaint_mask, fun_inpaint_masked_video_latents], dim=2).to(latents.dtype)
                 latent_model_input = torch.cat([latent_model_input, fun_inpaint_latents], dim=2)
 
                 # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
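
For reference, a minimal shape sketch of what the added lines do, assuming CogVideoX's [batch, frames, channels, height, width] latent layout and placeholder channel counts (16 latent channels, a 1-channel inpaint mask); the real channel counts depend on the transformer's configured in_channels and are not taken from this patch:

    import torch

    # Illustrative shape check only; not part of the patch.
    # Assumed sizes: 16 latent channels, 1 mask channel, [B, F, C, H, W] layout.
    batch, frames, channels, height, width = 2, 13, 16, 60, 90

    latent_model_input = torch.randn(batch, frames, channels, height, width)
    fun_inpaint_mask = torch.randn(batch, frames, 1, height, width)  # hypothetical mask latents
    fun_inpaint_masked_video_latents = torch.randn(batch, frames, channels, height, width)

    # Mirror the new lines: fuse mask and masked-video latents along the channel axis,
    # then append them to the denoiser input as extra conditioning channels.
    fun_inpaint_latents = torch.cat([fun_inpaint_mask, fun_inpaint_masked_video_latents], dim=2)
    latent_model_input = torch.cat([latent_model_input, fun_inpaint_latents], dim=2)

    print(latent_model_input.shape)  # torch.Size([2, 13, 33, 60, 90]) with the assumed counts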