Spaces:
Paused
Paused
Update aduc_framework/managers/pipeline_wan_i2v (4).py
Browse files
aduc_framework/managers/pipeline_wan_i2v (4).py
CHANGED
|
@@ -432,7 +432,7 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
|
| 432 |
latents = latents.to(device=device, dtype=dtype)
|
| 433 |
print(f"latents{latents.shape}")
|
| 434 |
|
| 435 |
-
|
| 436 |
|
| 437 |
if self.config.expand_timesteps:
|
| 438 |
video_condition = image
|
|
@@ -651,6 +651,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
|
| 651 |
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
| 652 |
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
| 653 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 654 |
# 1. Check inputs. Raise error if not correct
|
| 655 |
self.check_inputs(
|
| 656 |
prompt,
|
|
@@ -719,6 +723,9 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
|
| 719 |
image_embeds = image_embeds.repeat(batch_size, 1, 1)
|
| 720 |
image_embeds = image_embeds.to(transformer_dtype)
|
| 721 |
|
|
|
|
|
|
|
|
|
|
| 722 |
# 4. Prepare timesteps
|
| 723 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 724 |
timesteps = self.scheduler.timesteps
|
|
@@ -744,12 +751,21 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
|
| 744 |
latents,
|
| 745 |
last_image,
|
| 746 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 747 |
if self.config.expand_timesteps:
|
| 748 |
# wan 2.2 5b i2v uses first_frame_mask to mask timesteps
|
| 749 |
latents, condition, first_frame_mask = latents_outputs
|
| 750 |
else:
|
| 751 |
latents, condition = latents_outputs
|
| 752 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 753 |
# 6. Denoising loop
|
| 754 |
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
| 755 |
self._num_timesteps = len(timesteps)
|
|
@@ -812,6 +828,8 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
|
| 812 |
# compute the previous noisy sample x_t -> x_t-1
|
| 813 |
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
|
| 814 |
|
|
|
|
|
|
|
| 815 |
if callback_on_step_end is not None:
|
| 816 |
callback_kwargs = {}
|
| 817 |
for k in callback_on_step_end_tensor_inputs:
|
|
@@ -831,6 +849,10 @@ class WanImageToVideoPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
|
| 831 |
|
| 832 |
self._current_timestep = None
|
| 833 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 834 |
if self.config.expand_timesteps:
|
| 835 |
latents = (1 - first_frame_mask) * condition + first_frame_mask * latents
|
| 836 |
|
|
|
|
| 432 |
latents = latents.to(device=device, dtype=dtype)
|
| 433 |
print(f"latents{latents.shape}")
|
| 434 |
|
| 435 |
+
pipeline_wan_i2v = image.unsqueeze(2) # [batch_size, channels, 1, height, width]
|
| 436 |
|
| 437 |
if self.config.expand_timesteps:
|
| 438 |
video_condition = image
|
|
|
|
| 651 |
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
|
| 652 |
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
|
| 653 |
|
| 654 |
+
|
| 655 |
+
|
| 656 |
+
print(f"latents00{latents.shape}")
|
| 657 |
+
|
| 658 |
# 1. Check inputs. Raise error if not correct
|
| 659 |
self.check_inputs(
|
| 660 |
prompt,
|
|
|
|
| 723 |
image_embeds = image_embeds.repeat(batch_size, 1, 1)
|
| 724 |
image_embeds = image_embeds.to(transformer_dtype)
|
| 725 |
|
| 726 |
+
|
| 727 |
+
print(f"image_embeds{image_embeds.shape}")
|
| 728 |
+
|
| 729 |
# 4. Prepare timesteps
|
| 730 |
self.scheduler.set_timesteps(num_inference_steps, device=device)
|
| 731 |
timesteps = self.scheduler.timesteps
|
|
|
|
| 751 |
latents,
|
| 752 |
last_image,
|
| 753 |
)
|
| 754 |
+
|
| 755 |
+
|
| 756 |
+
print(f"latents_outputs{latents_outputs.shape}")
|
| 757 |
+
|
| 758 |
if self.config.expand_timesteps:
|
| 759 |
# wan 2.2 5b i2v uses first_frame_mask to mask timesteps
|
| 760 |
latents, condition, first_frame_mask = latents_outputs
|
| 761 |
else:
|
| 762 |
latents, condition = latents_outputs
|
| 763 |
|
| 764 |
+
|
| 765 |
+
|
| 766 |
+
print(f"latentsxx{latents.shape}")
|
| 767 |
+
|
| 768 |
+
|
| 769 |
# 6. Denoising loop
|
| 770 |
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
|
| 771 |
self._num_timesteps = len(timesteps)
|
|
|
|
| 828 |
# compute the previous noisy sample x_t -> x_t-1
|
| 829 |
latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
|
| 830 |
|
| 831 |
+
print(f"latentsppp{latents.shape}")
|
| 832 |
+
|
| 833 |
if callback_on_step_end is not None:
|
| 834 |
callback_kwargs = {}
|
| 835 |
for k in callback_on_step_end_tensor_inputs:
|
|
|
|
| 849 |
|
| 850 |
self._current_timestep = None
|
| 851 |
|
| 852 |
+
|
| 853 |
+
print(f"latentsfim{latents.shape}")
|
| 854 |
+
|
| 855 |
+
|
| 856 |
if self.config.expand_timesteps:
|
| 857 |
latents = (1 - first_frame_mask) * condition + first_frame_mask * latents
|
| 858 |
|