dn6 (HF Staff) committed · Commit 64f40e5 · verified · 1 Parent(s): 5b1c701

Diffusers updates

.gitattributes CHANGED
@@ -33,9 +33,3 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- hf_assets/3.mp4 filter=lfs diff=lfs merge=lfs -text
37
- hf_assets/masonry.mp4 filter=lfs diff=lfs merge=lfs -text
38
- hf_assets/trimferarricrop2_2x_speed.mp4 filter=lfs diff=lfs merge=lfs -text
39
- hf_assets/v2v_me_crop_1final.mov filter=lfs diff=lfs merge=lfs -text
40
- hf_assets/vertical_grid_all_videos.mp4 filter=lfs diff=lfs merge=lfs -text
41
- hf_assets/vertical_grid_output_reordered.mp4 filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -12,7 +12,7 @@ library_name: diffusers
12
  ---
13
  Krea Realtime 14B is distilled from the [Wan 2.1 14B text-to-video model](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B) using Self-Forcing, a technique for converting regular video diffusion models into autoregressive models. It achieves a text-to-video inference speed of **11fps** using 4 inference steps on a single NVIDIA B200 GPU. For more details on our training methodology and sampling innovations, refer to our [technical blog post](https://www.krea.ai/blog/krea-realtime-14b).
14
 
15
- Inference code can be found [here](https://github.com/krea-ai/realtime-video).
16
 
17
 
18
  <video width="100%" controls>
@@ -118,60 +118,288 @@ export CUDA_VISIBLE_DEVICES=0 # pick the GPU you want to serve on
118
  export DO_COMPILE=true
119
 
120
  uvicorn release_server:app --host 0.0.0.0 --port 8000
121
- ```
122
 
123
- And use the web app at http://localhost:8000/ in your browser
124
  (for more advanced use-cases and custom pipeline check out our GitHub repository: https://github.com/krea-ai/realtime-video)
125
 
126
  # Use it with 🧨 diffusers
127
 
128
- Krea Realtime 14B can be used with the `diffusers` library utilizing the new Modular Diffusers structure (for now supporting text-to-video, video-to-video coming soon)
129
 
130
  ```bash
131
  # Install diffusers from main
132
  pip install git+https://github.com/huggingface/diffusers.git
133
- ```
134
 
135
  ```py
136
  import torch
137
- from collections import deque
138
  from diffusers.utils import export_to_video
139
- from diffusers import ModularPipelineBlocks
140
- from diffusers.modular_pipelines import PipelineState, WanModularPipeline
141
 
142
  repo_id = "krea/krea-realtime-video"
143
- blocks = ModularPipelineBlocks.from_pretrained(repo_id, trust_remote_code=True)
144
- pipe = WanModularPipeline(blocks, repo_id)
145
146
  pipe.load_components(
147
  trust_remote_code=True,
148
  device_map="cuda",
149
  torch_dtype={"default": torch.bfloat16, "vae": torch.float16},
150
  )
 
 
151
 
152
- num_frames_per_block = 3
153
  num_blocks = 9
 
154
 
155
  frames = []
 
 
156
  state = PipelineState()
157
- state.set("frame_cache_context", deque(maxlen=pipe.config.frame_cache_len))
158
 
159
- prompt = ["a cat sitting on a boat"]
160
 
161
  for block in pipe.transformer.blocks:
162
  block.self_attn.fuse_projections()
163
 
164
- for block_idx in range(num_blocks):
165
  state = pipe(
166
  state,
167
  prompt=prompt,
168
  num_inference_steps=6,
169
  num_blocks=num_blocks,
170
  num_frames_per_block=num_frames_per_block,
171
  block_idx=block_idx,
172
  generator=torch.Generator("cuda").manual_seed(42),
173
  )
174
  frames.extend(state.values["videos"][0])
175
 
176
- export_to_video(frames, "output.mp4", fps=16)
177
- ```
 
 
12
  ---
13
  Krea Realtime 14B is distilled from the [Wan 2.1 14B text-to-video model](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B) using Self-Forcing, a technique for converting regular video diffusion models into autoregressive models. It achieves a text-to-video inference speed of **11fps** using 4 inference steps on a single NVIDIA B200 GPU. For more details on our training methodology and sampling innovations, refer to our [technical blog post](https://www.krea.ai/blog/krea-realtime-14b).
14
 
15
+ Inference code can be found [here](https://github.com/krea-ai/realtime-video).
16
 
17
 
18
  <video width="100%" controls>
 
118
  export DO_COMPILE=true
119
 
120
  uvicorn release_server:app --host 0.0.0.0 --port 8000
121
+ ```
122
 
123
+ Then open the web app at http://localhost:8000/ in your browser.
124
  (For more advanced use cases and custom pipelines, check out our GitHub repository: https://github.com/krea-ai/realtime-video)
125
 
126
  # Use it with 🧨 diffusers
127
 
128
+ Krea Realtime 14B can be used with the `diffusers` library via the new Modular Diffusers structure.
129
 
130
  ```bash
131
  # Install diffusers from main
132
  pip install git+https://github.com/huggingface/diffusers.git
133
+ ```
134
+
135
+ <details>
136
+ <summary>Text to Video</summary>
137
 
138
  ```py
139
  import torch
140
+ from tqdm import tqdm
141
  from diffusers.utils import export_to_video
142
+ from diffusers import ModularPipeline
143
+ from diffusers.modular_pipelines import PipelineState
144
 
145
  repo_id = "krea/krea-realtime-video"
146
+ pipe = ModularPipeline.from_pretrained(repo_id, trust_remote_code=True)
147
+ pipe.load_components(
148
+ trust_remote_code=True,
149
+ device_map="cuda",
150
+ torch_dtype={"default": torch.bfloat16, "vae": torch.float16},
151
+ )
152
+ for block in pipe.transformer.blocks:
153
+ block.self_attn.fuse_projections()
154
+
155
+ num_blocks = 9
156
 
157
+ frames = []
158
+ state = PipelineState()
159
+ prompt = ["a cat sitting on a boat"]
160
+
161
+ generator = torch.Generator(device=pipe.device).manual_seed(42)
162
+ for block_idx in tqdm(range(num_blocks)):
163
+ state = pipe(
164
+ state,
165
+ prompt=prompt,
166
+ num_inference_steps=6,
167
+ num_blocks=num_blocks,
168
+ block_idx=block_idx,
169
+ generator=generator,
170
+ )
171
+ frames.extend(state.values["videos"][0])
172
+
173
+ export_to_video(frames, "output.mp4", fps=24)
174
+ ```
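+ Because each loop iteration returns the decoded frames for that block, you can hand frames off for display or encoding as soon as a block finishes instead of waiting for the whole clip. A minimal sketch of the loop body (`handle_frames` is a hypothetical callback standing in for your own player or encoder):
+
+ ```py
+ # sketch: replace the body of the block loop above
+ block_frames = state.values["videos"][0]
+ frames.extend(block_frames)
+ handle_frames(block_frames)  # hypothetical: push frames to a display or streaming encoder
+ ```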
175
+ </details>
176
+
177
+ <details>
178
+ <summary>Video to Video</summary>
179
+
180
+ ```py
181
+ import torch
182
+ from tqdm import tqdm
183
+ from diffusers.utils import load_video, export_to_video
184
+ from diffusers import ModularPipeline
185
+ from diffusers.modular_pipelines import PipelineState
186
+
187
+ repo_id = "krea/krea-realtime-video"
188
+ pipe = ModularPipeline.from_pretrained(repo_id, trust_remote_code=True)
189
  pipe.load_components(
190
  trust_remote_code=True,
191
  device_map="cuda",
192
  torch_dtype={"default": torch.bfloat16, "vae": torch.float16},
193
  )
194
+ for block in pipe.transformer.blocks:
195
+ block.self_attn.fuse_projections()
196
 
 
197
  num_blocks = 9
198
+ video = load_video("https://app-uploads.krea.ai/public/a8218957-1a80-43dc-81b2-da970b5f2221-video.mp4")
199
 
200
  frames = []
201
+ prompt = ["A car racing down a snowy mountain"]
202
+
203
  state = PipelineState()
204
+ generator = torch.Generator("cuda").manual_seed(42)
205
+ for block_idx in tqdm(range(num_blocks)):
206
+ state = pipe(
207
+ state,
208
+ video=video,
209
+ prompt=prompt,
210
+ num_inference_steps=6,
211
+ strength=0.3,
212
+ block_idx=block_idx,
213
+ generator=generator,
214
+ )
215
+ frames.extend(state.values["videos"][0])
216
 
217
+ export_to_video(frames, "output-v2v.mp4", fps=24)
218
+ ```
219
+ </details>
220
+
221
+ <details>
222
+ <summary>Streaming Video to Video</summary>
223
 
224
+ Using the `video_stream` input processes video frames as they arrive, while maintaining temporal consistency across chunks.
225
+
226
+ ```py
227
+ import torch
228
+ from collections import deque
229
+ from tqdm import tqdm
230
+ from diffusers.utils import load_video, export_to_video
231
+ from diffusers import ModularPipeline
232
+ from diffusers.modular_pipelines import PipelineState
233
+
234
+ repo_id = "krea/krea-realtime-video"
235
+ pipe = ModularPipeline.from_pretrained(repo_id, trust_remote_code=True)
236
+ pipe.load_components(
237
+ trust_remote_code=True,
238
+ device_map="cuda",
239
+ torch_dtype={"default": torch.bfloat16, "vae": torch.float16},
240
+ )
241
  for block in pipe.transformer.blocks:
242
  block.self_attn.fuse_projections()
243
 
244
+ n_samples = 9
245
+ frame_sample_len = 12
246
+ video = load_video(
247
+ "https://app-uploads.krea.ai/public/a8218957-1a80-43dc-81b2-da970b5f2221-video.mp4"
248
+ )
249
+
250
+ # Simulate streaming video input
251
+ frame_samples = [
252
+ video[sample_start : sample_start + frame_sample_len]
253
+ for sample_start in range(0, n_samples * frame_sample_len, frame_sample_len)
254
+ ]
255
+
256
+ frames = []
257
+ state = PipelineState()
258
+ prompt = ["A car racing down a snowy mountain road"]
259
+
260
+ block_idx = 0
261
+ generator = torch.Generator("cpu").manual_seed(42)
262
+ for frame_sample in tqdm(frame_samples):
263
+ state = pipe(
264
+ state,
265
+ video_stream=frame_sample,
266
+ prompt=prompt,
267
+ num_inference_steps=6,
268
+ strength=0.3,
269
+ block_idx=block_idx,
270
+ generator=generator,
271
+ )
272
+ frames.extend(state.values["videos"][0])
273
+
274
+ block_idx += 1
275
+
276
+ export_to_video(frames, "output-v2v-streaming.mp4", fps=24)
277
+ ```
278
+ </details>
279
+
280
+ <details>
281
+ <summary>Using LoRAs</summary>
282
+
283
+ ```py
284
+ import torch
285
+ from collections import deque
286
+ from tqdm import tqdm
287
+ from diffusers.utils import export_to_video
288
+ from diffusers import ModularPipeline
289
+ from diffusers.modular_pipelines import PipelineState
290
+
291
+ repo_id = "krea/krea-realtime-video"
292
+ pipe = ModularPipeline.from_pretrained(repo_id, trust_remote_code=True)
293
+ pipe.load_components(
294
+ trust_remote_code=True,
295
+ device_map="cuda",
296
+ torch_dtype={"default": torch.bfloat16, "vae": torch.float16},
297
+ )
298
+ pipe.transformer.load_lora_adapter(
299
+ "shauray/Origami_WanLora",
300
+ prefix="diffusion_model",
301
+ weight_name="origami_000000500.safetensors",
302
+ adapter_name="origami",
303
+ )
304
+ for block in pipe.transformer.blocks:
305
+ block.self_attn.fuse_projections()
306
+
307
+ num_blocks = 9
308
+
309
+ frames = []
310
+ state = PipelineState()
311
+ prompt = ["[origami] a cat sitting on a boat"]
312
+
313
+ generator = torch.Generator("cuda").manual_seed(42)
314
+ for block_idx in tqdm(range(num_blocks)):
315
  state = pipe(
316
  state,
317
  prompt=prompt,
318
  num_inference_steps=6,
319
  num_blocks=num_blocks,
320
+ block_idx=block_idx,
321
+ generator=generator,
322
+ )
323
+ frames.extend(state.values["videos"][0])
324
+
325
+ export_to_video(frames, "output.mp4", fps=24)
326
+ ```
327
+ </details>
328
+
329
+ <details>
330
+ <summary>Optimized Inference</summary>
331
+
332
+ To optimize inference speed and memory usage on Hopper-class GPUs (e.g. H100), we recommend using `torch.compile`, Flash Attention 3, and FP8 quantization with [torchao](https://github.com/pytorch/ao).
333
+
334
+ First, let's set up our dependencies by enabling Flash Attention 3 via Hub [kernels](https://huggingface.co/docs/kernels/en/index) and installing the `torchao` and `kernels` packages.
335
+
336
+ ```shell
337
+ export DIFFUSERS_ENABLE_HUB_KERNELS=true
338
+ pip install -U kernels torchao
339
+ ```
340
+
341
+ Then we will iterate over the blocks of the transformer and apply quantization and `torch.compile`.
342
+
343
+ ```py
344
+ import torch
345
+ from collections import deque
346
+ from tqdm import tqdm
347
+ from diffusers.utils import export_to_video
348
+ from diffusers import ModularPipeline
349
+ from diffusers.modular_pipelines import PipelineState
350
+ from torchao.quantization import Float8DynamicActivationFloat8WeightConfig, quantize_
351
+
352
+ repo_id = "krea/krea-realtime-video"
353
+ pipe = ModularPipeline.from_pretrained(repo_id, trust_remote_code=True)
354
+ pipe.load_components(
355
+ trust_remote_code=True,
356
+ device_map="cuda",
357
+ torch_dtype={"default": torch.bfloat16, "vae": torch.float16},
358
+ )
359
+
360
+ for block in pipe.transformer.blocks:
361
+ block.self_attn.fuse_projections()
362
+
363
+ # Quantize just the transformer blocks
364
+ for block in pipe.transformer.blocks:
365
+ quantize_(block, Float8DynamicActivationFloat8WeightConfig())
366
+
367
+ # Compile just the attention modules
368
+ for submod in pipe.transformer.modules():
369
+ if submod.__class__.__name__ in ["CausalWanAttentionBlock"]:
370
+ submod.compile(fullgraph=False)
371
+
372
+ num_blocks = 9
373
+
374
+ state = PipelineState()
375
+ prompt = ["a cat sitting on a boat"]
376
+
377
+ # Compile warmup
378
+ for block_idx in range(num_blocks):
379
+ state = pipe(
380
+ state,
381
+ prompt=prompt,
382
+ num_inference_steps=2,
383
+ num_blocks=num_blocks,
384
  block_idx=block_idx,
386
  generator=torch.Generator("cuda").manual_seed(42),
387
  )
388
+
389
+ # Reset state
390
+ state = PipelineState()
+ frames = []
391
+ generator = torch.Generator("cuda").manual_seed(42)
392
+ for block_idx in tqdm(range(num_blocks)):
393
+ state = pipe(
394
+ state,
395
+ prompt=prompt,
396
+ num_inference_steps=6,
397
+ num_blocks=num_blocks,
398
+ block_idx=block_idx,
399
+ generator=generator,
400
+ )
401
  frames.extend(state.values["videos"][0])
402
 
403
+ export_to_video(frames, "output.mp4", fps=24)
404
+ ```
405
+ </details>
before_denoise.py CHANGED
@@ -14,6 +14,7 @@
14
 
15
  import inspect
16
  from typing import List, Optional, Union, Dict
 
17
 
18
  import torch
19
 
@@ -25,6 +26,7 @@ from diffusers.modular_pipelines import (
25
  ModularPipeline,
26
  ModularPipelineBlocks,
27
  SequentialPipelineBlocks,
 
28
  PipelineState,
29
  )
30
  from diffusers.modular_pipelines.modular_pipeline_utils import (
@@ -221,7 +223,7 @@ def _initialize_crossattn_cache(
221
 
222
 
223
  class WanInputStep(ModularPipelineBlocks):
224
- model_name = "WanRT"
225
 
226
  @property
227
  def description(self) -> str:
@@ -237,7 +239,11 @@ class WanInputStep(ModularPipelineBlocks):
237
  @property
238
  def inputs(self) -> List[InputParam]:
239
  return [
240
- InputParam("num_videos_per_prompt", default=1),
241
  InputParam(
242
  "prompt_embeds",
243
  required=True,
@@ -331,8 +337,8 @@ class WanInputStep(ModularPipelineBlocks):
331
  return components, state
332
 
333
 
334
- class WanRTStreamingSetTimestepsStep(ModularPipelineBlocks):
335
- model_name = "WanRT"
336
 
337
  @property
338
  def expected_components(self) -> List[ComponentSpec]:
@@ -350,6 +356,7 @@ class WanRTStreamingSetTimestepsStep(ModularPipelineBlocks):
350
  InputParam("num_inference_steps", default=4),
351
  InputParam("timesteps"),
352
  InputParam("sigmas"),
 
353
  ]
354
 
355
  @property
@@ -391,7 +398,10 @@ class WanRTStreamingSetTimestepsStep(ModularPipelineBlocks):
391
  ]
392
  )
393
  denoising_steps = torch.linspace(
394
- 1000, 0, block_state.num_inference_steps, dtype=torch.float32
395
  ).to(torch.long)
396
 
397
  block_state.timesteps = zero_padded_timesteps[1000 - denoising_steps]
@@ -403,8 +413,8 @@ class WanRTStreamingSetTimestepsStep(ModularPipelineBlocks):
403
  return components, state
404
 
405
 
406
- class WanRTStreamingPrepareLatentsStep(ModularPipelineBlocks):
407
- model_name = "WanRT"
408
 
409
  @property
410
  def expected_components(self) -> List[ComponentSpec]:
@@ -423,15 +433,36 @@ class WanRTStreamingPrepareLatentsStep(ModularPipelineBlocks):
423
  @property
424
  def inputs(self) -> List[InputParam]:
425
  return [
426
- InputParam("height", type_hint=int),
427
- InputParam("width", type_hint=int),
428
- InputParam("num_blocks", type_hint=int),
429
- InputParam("num_frames_per_block", type_hint=int),
430
- InputParam("latents", type_hint=Optional[torch.Tensor]),
431
- InputParam("init_latents", type_hint=Optional[torch.Tensor]),
432
- InputParam("final_latents", type_hint=Optional[torch.Tensor]),
433
- InputParam("num_videos_per_prompt", type_hint=int, default=1),
434
- InputParam("generator"),
435
  InputParam(
436
  "dtype",
437
  type_hint=torch.dtype,
@@ -442,20 +473,11 @@ class WanRTStreamingPrepareLatentsStep(ModularPipelineBlocks):
442
  @property
443
  def intermediate_outputs(self) -> List[OutputParam]:
444
  return [
445
- OutputParam(
446
- "latents",
447
- type_hint=torch.Tensor,
448
- description="The initial latents to use for the denoising process",
449
- ),
450
  OutputParam(
451
  "init_latents",
452
  type_hint=torch.Tensor,
453
  description="The initial latents to use for the denoising process",
454
  ),
455
- OutputParam(
456
- "final_latents",
457
- type_hint=torch.Tensor,
458
- ),
459
  ]
460
 
461
  @staticmethod
@@ -476,8 +498,8 @@ class WanRTStreamingPrepareLatentsStep(ModularPipelineBlocks):
476
  components,
477
  batch_size: int,
478
  num_channels_latents: int = 16,
479
- height: int = 352,
480
- width: int = 640,
481
  num_blocks: int = 9,
482
  num_frames_per_block: int = 3,
483
  dtype: Optional[torch.dtype] = None,
@@ -536,56 +558,398 @@ class WanRTStreamingPrepareLatentsStep(ModularPipelineBlocks):
536
  block_state.generator,
537
  block_state.init_latents,
538
  )
539
- if block_state.final_latents is None:
540
- block_state.final_latents = torch.zeros_like(
541
- block_state.init_latents, device=components.transformer.device
542
- )
543
  self.set_block_state(state, block_state)
544
 
545
  return components, state
546
 
547
 
548
- class WanRTStreamingExtractBlockLatentsStep(ModularPipelineBlocks):
549
  """
550
- Extracts a single block of latents from the full video buffer for streaming generation.
551
 
552
- This block simply slices the final_latents buffer to get the current block's latents.
553
- The final_latents buffer should be created beforehand using WanRTStreamingPrepareAllLatents.
 
 
554
  """
555
 
556
- model_name = "WanRT"
557
 
558
  @property
559
  def expected_components(self) -> List[ComponentSpec]:
560
- return []
 
 
561
 
562
  @property
563
  def description(self) -> str:
564
  return (
565
- "Extracts a single block from the full latent buffer for streaming generation. "
566
- "Slices final_latents based on block_idx to get current block's latents."
567
  )
568
 
569
  @property
570
  def inputs(self) -> List[InputParam]:
571
  return [
572
  InputParam(
573
- "final_latents",
574
- required=True,
575
  type_hint=torch.Tensor,
576
- description="Full latent buffer [B, C, total_frames, H, W]",
577
  ),
578
  InputParam(
579
  "init_latents",
580
- required=True,
581
  type_hint=torch.Tensor,
582
- description="Full latent buffer [B, C, total_frames, H, W]",
583
  ),
584
  InputParam(
585
  "latents",
586
  type_hint=torch.Tensor,
587
- description="Full latent buffer [B, C, total_frames, H, W]",
588
  ),
589
  InputParam(
590
  "block_idx",
591
  required=True,
@@ -593,6 +957,12 @@ class WanRTStreamingExtractBlockLatentsStep(ModularPipelineBlocks):
593
  default=0,
594
  description="Current block index to process",
595
  ),
596
  InputParam(
597
  "num_frames_per_block",
598
  required=True,
@@ -623,7 +993,7 @@ class WanRTStreamingExtractBlockLatentsStep(ModularPipelineBlocks):
623
  ) -> PipelineState:
624
  block_state = self.get_block_state(state)
625
 
626
- num_frames_per_block = block_state.num_frames_per_block
627
  block_idx = block_state.block_idx
628
 
629
  # Calculate frame range for current block
@@ -642,7 +1012,7 @@ class WanRTStreamingExtractBlockLatentsStep(ModularPipelineBlocks):
642
  return components, state
643
 
644
 
645
- class WanRTStreamingSetupKVCache(ModularPipelineBlocks):
646
  """
647
  Initializes KV cache and cross-attention cache for streaming generation.
648
 
@@ -651,7 +1021,7 @@ class WanRTStreamingSetupKVCache(ModularPipelineBlocks):
651
  Should be called once at the start of streaming generation.
652
  """
653
 
654
- model_name = "WanRT"
655
 
656
  @property
657
  def expected_components(self) -> List[ComponentSpec]:
@@ -772,7 +1142,7 @@ class WanRTStreamingSetupKVCache(ModularPipelineBlocks):
772
  return components, state
773
 
774
 
775
- class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
776
  @property
777
  def inputs(self) -> List[InputParam]:
778
  return [
@@ -782,34 +1152,20 @@ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
782
  description="Current block latents [B, C, num_frames_per_block, H, W]",
783
  ),
784
  InputParam(
785
- "num_frames_per_block",
786
  type_hint=int,
787
- description="Number of frames per block",
788
  ),
789
  InputParam(
790
  "block_idx",
791
  type_hint=int,
792
  description="Current block index to process",
793
  ),
794
- InputParam(
795
- "block_mask",
796
- description="Block-wise causal attention mask",
797
- ),
798
  InputParam(
799
  "current_start_frame",
800
  type_hint=int,
801
  description="Starting frame index for current block",
802
  ),
803
- InputParam(
804
- "videos",
805
- type_hint=torch.Tensor,
806
- description="Video frames for context encoding",
807
- ),
808
- InputParam(
809
- "final_latents",
810
- type_hint=torch.Tensor,
811
- description="Full latent buffer [B, C, total_frames, H, W]",
812
- ),
813
  InputParam(
814
  "prompt_embeds",
815
  type_hint=torch.Tensor,
@@ -825,16 +1181,14 @@ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
825
  type_hint=torch.Tensor,
826
  description="Cross-attention cache",
827
  ),
828
- InputParam(
829
- "encoder_cache",
830
- description="Encoder feature cache",
831
- ),
832
  InputParam(
833
  "frame_cache_context",
834
  description="Cached context frames for reencoding",
835
  ),
836
  InputParam(
837
- "local_attn_size",
 
 
838
  ),
839
  ]
840
 
@@ -842,9 +1196,7 @@ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
842
  def expected_configs(self) -> List[ConfigSpec]:
843
  return [ConfigSpec("seq_length", 32760)]
844
 
845
- def prepare_latents(self, components, block_state):
846
- frames = block_state.frame_cache_context[0].half()
847
-
848
  components.vae._enc_feat_map = [None] * 55
849
  latents = retrieve_latents(components.vae.encode(frames), sample_mode="argmax")
850
  latents_mean = (
@@ -861,30 +1213,23 @@ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
861
 
862
  def get_context_frames(self, components, block_state):
863
  current_kv_cache_num_frames = components.config.kv_cache_num_frames
864
- context_frames = block_state.final_latents[
865
- :, :, : block_state.current_start_frame
866
- ]
867
-
868
- if (
869
  block_state.block_idx - 1
870
- ) * block_state.num_frames_per_block < current_kv_cache_num_frames:
871
- if current_kv_cache_num_frames == 1:
872
- context_frames = context_frames[:, :, :1]
873
- else:
874
- context_frames = torch.cat(
875
- (
876
- context_frames[:, :, :1],
877
- context_frames[:, :, 1:][
878
- :, :, -current_kv_cache_num_frames + 1 :
879
- ],
880
- ),
881
- dim=2,
882
- )
883
  else:
 
884
  context_frames = context_frames[:, :, 1:][
885
  :, :, -current_kv_cache_num_frames + 1 :
886
  ]
887
- first_frame_latent = self.prepare_latents(components, block_state)
 
 
888
  first_frame_latent = first_frame_latent.to(block_state.latents)
889
  context_frames = torch.cat((first_frame_latent, context_frames), dim=2)
890
 
@@ -895,20 +1240,15 @@ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
895
  if block_state.block_idx == 0:
896
  return components, state
897
 
898
- start_frame = min(
899
- block_state.current_start_frame, components.config.kv_cache_num_frames
900
- )
901
  context_frames = self.get_context_frames(components, block_state)
902
- block_state.block_mask = (
903
- components.transformer._prepare_blockwise_causal_attn_mask(
904
- components.transformer.device,
905
- num_frames=context_frames.shape[2],
906
- frame_seqlen=components.config.frame_seq_length,
907
- num_frame_per_block=block_state.num_frames_per_block,
908
- local_attn_size=-1,
909
- )
910
  )
911
- components.transformer.block_mask = block_state.block_mask
912
  context_timestep = torch.zeros(
913
  (context_frames.shape[0], context_frames.shape[2]),
914
  device=components.transformer.device,
@@ -921,7 +1261,7 @@ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
921
  kv_cache=block_state.kv_cache,
922
  seq_len=components.config.seq_length,
923
  crossattn_cache=block_state.crossattn_cache,
924
- current_start=start_frame * components.config.frame_seq_length,
925
  cache_start=None,
926
  )
927
  components.transformer.block_mask = None
@@ -929,13 +1269,13 @@ class WanRTStreamingRecomputeKVCache(ModularPipelineBlocks):
929
  return components, state
930
 
931
 
932
- class WanRTStreamingBeforeDenoiseStep(SequentialPipelineBlocks):
933
  block_classes = [
934
- WanRTStreamingSetTimestepsStep,
935
- WanRTStreamingPrepareLatentsStep,
936
- WanRTStreamingExtractBlockLatentsStep,
937
- WanRTStreamingSetupKVCache,
938
- WanRTStreamingRecomputeKVCache,
939
  ]
940
  block_names = [
941
  "set_timesteps",
@@ -953,4 +1293,69 @@ class WanRTStreamingBeforeDenoiseStep(SequentialPipelineBlocks):
953
  + " - `WanRTInputStep` is used to adjust the batch size of the model inputs\n"
954
  + " - `WanRTSetTimestepsStep` is used to set the timesteps\n"
955
  + " - `WanRTPrepareLatentsStep` is used to prepare the latents\n"
956
  )
14
 
15
  import inspect
16
  from typing import List, Optional, Union, Dict
17
+ from collections import deque
18
 
19
  import torch
20
 
 
26
  ModularPipeline,
27
  ModularPipelineBlocks,
28
  SequentialPipelineBlocks,
29
+ AutoPipelineBlocks,
30
  PipelineState,
31
  )
32
  from diffusers.modular_pipelines.modular_pipeline_utils import (
 
223
 
224
 
225
  class WanInputStep(ModularPipelineBlocks):
226
+ model_name = "wan"
227
 
228
  @property
229
  def description(self) -> str:
 
239
  @property
240
  def inputs(self) -> List[InputParam]:
241
  return [
242
+ InputParam(
243
+ "num_videos_per_prompt",
244
+ default=1,
245
+ description="Number of videos to generate per prompt",
246
+ ),
247
  InputParam(
248
  "prompt_embeds",
249
  required=True,
 
337
  return components, state
338
 
339
 
340
+ class WanRTSetTimestepsStep(ModularPipelineBlocks):
341
+ model_name = "wan"
342
 
343
  @property
344
  def expected_components(self) -> List[ComponentSpec]:
 
356
  InputParam("num_inference_steps", default=4),
357
  InputParam("timesteps"),
358
  InputParam("sigmas"),
359
+ InputParam("strength", default=1.0),
360
  ]
361
 
362
  @property
 
398
  ]
399
  )
400
  denoising_steps = torch.linspace(
401
+ block_state.strength * 1000,
402
+ 0,
403
+ block_state.num_inference_steps,
404
+ dtype=torch.float32,
405
  ).to(torch.long)
406
 
407
  block_state.timesteps = zero_padded_timesteps[1000 - denoising_steps]
 
413
  return components, state
414
 
415
 
416
+ class WanRTPrepareLatentsStep(ModularPipelineBlocks):
417
+ model_name = "wan"
418
 
419
  @property
420
  def expected_components(self) -> List[ComponentSpec]:
 
433
  @property
434
  def inputs(self) -> List[InputParam]:
435
  return [
436
+ InputParam(
437
+ "height",
438
+ type_hint=int,
439
+ description="Height of the video to generate in pixels",
440
+ ),
441
+ InputParam(
442
+ "width",
443
+ type_hint=int,
444
+ description="Width of the video to generate in pixels",
445
+ ),
446
+ InputParam(
447
+ "num_blocks",
448
+ type_hint=int,
449
+ description="Number of temporal blocks to generate",
450
+ ),
451
+ InputParam(
452
+ "init_latents",
453
+ type_hint=Optional[torch.Tensor],
454
+ description="Pre-initialized latents to use instead of random noise",
455
+ ),
456
+ InputParam(
457
+ "num_videos_per_prompt",
458
+ type_hint=int,
459
+ default=1,
460
+ description="Number of videos to generate per prompt",
461
+ ),
462
+ InputParam(
463
+ "generator",
464
+ description="Random number generator for reproducible generation",
465
+ ),
466
  InputParam(
467
  "dtype",
468
  type_hint=torch.dtype,
 
473
  @property
474
  def intermediate_outputs(self) -> List[OutputParam]:
475
  return [
476
  OutputParam(
477
  "init_latents",
478
  type_hint=torch.Tensor,
479
  description="The initial latents to use for the denoising process",
480
  ),
481
  ]
482
 
483
  @staticmethod
 
498
  components,
499
  batch_size: int,
500
  num_channels_latents: int = 16,
501
+ height: int = 480,
502
+ width: int = 832,
503
  num_blocks: int = 9,
504
  num_frames_per_block: int = 3,
505
  dtype: Optional[torch.dtype] = None,
 
558
  block_state.generator,
559
  block_state.init_latents,
560
  )
561
+ block_state.init_latents = block_state.init_latents.contiguous()
562
  self.set_block_state(state, block_state)
563
 
564
  return components, state
565
 
566
 
567
+ class WanRTPrepareVideoLatentStep(ModularPipelineBlocks):
568
  """
569
+ Prepares video latents from input PIL images for video-to-video generation.
570
 
571
+ This block:
572
+ 1. Processes input PIL images
573
+ 2. Encodes them to latent space using the VAE encoder
574
+ 3. Adds noise based on denoising strength for partial denoising
575
  """
576
 
577
+ model_name = "wan"
578
 
579
  @property
580
  def expected_components(self) -> List[ComponentSpec]:
581
+ return [
582
+ ComponentSpec("vae", AutoencoderKLWan),
583
+ ]
584
 
585
  @property
586
  def description(self) -> str:
587
  return (
588
+ "Prepares video latents from input PIL images by encoding to latent space "
589
+ "and optionally adding noise for video-to-video generation."
590
  )
591
 
592
  @property
593
  def inputs(self) -> List[InputParam]:
594
  return [
595
  InputParam(
596
+ "video",
597
+ type_hint=list,
598
+ description="List of PIL Images for input video",
599
+ ),
600
+ InputParam(
601
+ "height",
602
+ type_hint=int,
603
+ default=480,
604
+ description="Target height for video processing",
605
+ ),
606
+ InputParam(
607
+ "width",
608
+ type_hint=int,
609
+ default=832,
610
+ description="Target width for video processing",
611
+ ),
612
+ InputParam(
613
+ "strength",
614
+ type_hint=float,
615
+ default=1.0,
616
+ description="Denoising strength (0-1). Lower values preserve more of original video.",
617
+ ),
618
+ InputParam(
619
+ "generator",
620
+ description="Random generator for noise",
621
+ ),
622
+ InputParam(
623
+ "timesteps",
624
  type_hint=torch.Tensor,
625
+ description="All timesteps for noise scheduling",
626
+ ),
627
+ InputParam(
628
+ "num_blocks",
629
+ type_hint=int,
630
+ description="Number of blocks for generation",
631
  ),
632
  InputParam(
633
  "init_latents",
 
634
  type_hint=torch.Tensor,
635
+ ),
636
+ ]
637
+
638
+ @property
639
+ def intermediate_outputs(self) -> List[OutputParam]:
640
+ return [
641
+ OutputParam(
642
+ "init_latents",
643
+ type_hint=torch.Tensor,
644
+ description="Noised latents from input video ready for denoising",
645
+ ),
646
+ OutputParam(
647
+ "num_blocks",
648
+ type_hint=int,
649
+ description="Updated number of blocks based on video length",
650
+ ),
651
+ ]
652
+
653
+ def encode_frames(
654
+ self,
655
+ components,
656
+ video: Optional[torch.Tensor] = None,
657
+ timesteps: Optional[torch.Tensor] = None,
658
+ generator: Optional[torch.Generator] = None,
659
+ dtype: Optional[torch.dtype] = None,
660
+ device: Optional[torch.device] = None,
661
+ latents: Optional[torch.Tensor] = None,
662
+ ):
663
+ if latents is not None:
664
+ return latents.to(device, dtype)
665
+
666
+ if not hasattr(components.vae, "_enc_feat_map"):
667
+ components.vae.clear_cache()
668
+ else:
669
+ components.vae._enc_feat_map = [None] * 55
670
+
671
+ init_latents = [
672
+ retrieve_latents(
673
+ components.vae.encode(vid.unsqueeze(0).transpose(2, 1)),
674
+ sample_mode="argmax",
675
+ )
676
+ for vid in video
677
+ ]
678
+ init_latents = torch.cat(init_latents, dim=0).to(dtype)
679
+
680
+ latents_mean = (
681
+ torch.tensor(components.vae.config.latents_mean)
682
+ .view(1, components.vae.config.z_dim, 1, 1, 1)
683
+ .to(device, dtype)
684
+ )
685
+ latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
686
+ 1, components.vae.config.z_dim, 1, 1, 1
687
+ ).to(device, dtype)
688
+ init_latents = (init_latents - latents_mean) * latents_std
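+ # Blend the encoded video latents with noise: the first timestep tracks `strength`, so lower strength preserves more of the input video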
689
+ init_denoising_strength = timesteps[0] / 1000.0
690
+
691
+ # Add noise to latents
692
+ noise = randn_tensor(
693
+ init_latents.shape,
694
+ device=init_latents.device,
695
+ dtype=init_latents.dtype,
696
+ generator=generator,
697
+ )
698
+ init_latents = (
699
+ init_latents * (1.0 - init_denoising_strength)
700
+ + noise * init_denoising_strength
701
+ )
702
+ init_latents = init_latents.to(components.transformer.dtype).contiguous()
703
+
704
+ return init_latents
705
+
706
+ @torch.no_grad()
707
+ def __call__(
708
+ self, components: ModularPipeline, state: PipelineState
709
+ ) -> PipelineState:
710
+ block_state = self.get_block_state(state)
711
+
712
+ if block_state.init_latents is not None:
713
+ block_state.init_latents = block_state.init_latents.to(
714
+ components.transformer.dtype
715
+ )
716
+ self.set_block_state(state, block_state)
717
+ return components, state
718
+
719
+ video = (
720
+ components.video_processor.preprocess(
721
+ block_state.video, block_state.height, block_state.width
722
+ )
723
+ .unsqueeze(0)
724
+ .to(components.vae.device, components.vae.dtype)
725
+ )
726
+ block_state.init_latents = self.encode_frames(
727
+ components,
728
+ video,
729
+ block_state.timesteps,
730
+ block_state.generator,
731
+ components.vae.dtype,
732
+ components.vae.device,
733
+ block_state.init_latents,
734
+ )
735
+ block_state.init_latents = block_state.init_latents.to(
736
+ components.transformer.dtype
737
+ )
738
+
739
+ self.set_block_state(state, block_state)
740
+ return components, state
741
+
742
+
743
+ class WanRTStreamPrepareVideoLatentStep(ModularPipelineBlocks):
744
+ """
745
+ Prepares video latents from input PIL images for video-to-video generation.
746
+
747
+ This block:
748
+ 1. Processes input PIL images
749
+ 2. Encodes them to latent space using the VAE encoder
750
+ 3. Adds noise based on denoising strength for partial denoising
751
+ """
752
+
753
+ model_name = "wan"
754
+
755
+ @property
756
+ def expected_components(self) -> List[ComponentSpec]:
757
+ return [
758
+ ComponentSpec("vae", AutoencoderKLWan),
759
+ ]
760
+
761
+ @property
762
+ def description(self) -> str:
763
+ return (
764
+ "Prepares video latents from input PIL images by encoding to latent space "
765
+ "and optionally adding noise for video-to-video generation."
766
+ )
767
+
768
+ @property
769
+ def inputs(self) -> List[InputParam]:
770
+ return [
771
+ InputParam(
772
+ "video_stream",
773
+ type_hint=list,
774
+ description="List of PIL Images for input video",
775
+ ),
776
+ InputParam(
777
+ "height",
778
+ type_hint=int,
779
+ default=480,
780
+ description="Target height for video processing",
781
  ),
782
  InputParam(
783
+ "width",
784
+ type_hint=int,
785
+ default=832,
786
+ description="Target width for video processing",
787
+ ),
788
+ InputParam(
789
+ "generator",
790
+ type_hint=torch.Generator,
791
+ description="Random generator for noise",
792
+ ),
793
+ InputParam(
794
+ "timesteps",
795
+ type_hint=torch.Tensor,
796
+ description="All timesteps for noise scheduling",
797
+ ),
798
+ InputParam(
799
+ "block_idx",
800
+ type_hint=int,
801
+ description="Index of current block to denoise",
802
+ ),
803
+ InputParam(
804
+ "num_blocks",
805
+ type_hint=int,
806
+ description="Total number of blocks to denoise",
807
+ ),
808
+ InputParam(
809
+ "input_frames_cache",
810
+ default=deque(maxlen=24),
811
+ description="Cached input video frames for context encoding",
812
+ ),
813
+ ]
814
+
815
+ @property
816
+ def intermediate_outputs(self) -> List[OutputParam]:
817
+ return [
818
+ OutputParam(
819
  "latents",
820
  type_hint=torch.Tensor,
821
+ description="Noised latents from input video ready for denoising",
822
  ),
823
+ OutputParam(
824
+ "current_start_frame",
825
+ type_hint=int,
826
+ ),
827
+ ]
828
+
829
+ def encode_frames(
830
+ self,
831
+ components,
832
+ video: Optional[torch.Tensor] = None,
833
+ dtype: Optional[torch.dtype] = None,
834
+ device: Optional[torch.device] = None,
835
+ latents: Optional[torch.Tensor] = None,
836
+ ):
837
+ if latents is not None:
838
+ return latents.to(device, dtype)
839
+
840
+ if not hasattr(components.vae, "_enc_feat_map"):
841
+ components.vae.clear_cache()
842
+ else:
843
+ components.vae._enc_feat_map = [None] * 55
844
+
845
+ init_latents = [
846
+ retrieve_latents(
847
+ components.vae.encode(vid.unsqueeze(0).transpose(2, 1)),
848
+ sample_mode="argmax",
849
+ )
850
+ for vid in video
851
+ ]
852
+ init_latents = torch.cat(init_latents, dim=0).to(dtype)
853
+
854
+ latents_mean = (
855
+ torch.tensor(components.vae.config.latents_mean)
856
+ .view(1, components.vae.config.z_dim, 1, 1, 1)
857
+ .to(device, dtype)
858
+ )
859
+ latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
860
+ 1, components.vae.config.z_dim, 1, 1, 1
861
+ ).to(device, dtype)
862
+ init_latents = (init_latents - latents_mean) * latents_std
863
+
864
+ return init_latents
865
+
866
+ def resample_frames(self, frames, target_length):
867
+ """Resample a list to the target length using linear interpolation of indices"""
868
+ if len(frames) == target_length:
869
+ return frames
870
+
871
+ indices = (
872
+ torch.linspace(0, len(frames) - 1, target_length, device="cpu")
873
+ .round()
874
+ .long()
875
+ )
876
+ return [frames[i] for i in indices]
877
+
878
+ @torch.no_grad()
879
+ def __call__(
880
+ self, components: ModularPipeline, state: PipelineState
881
+ ) -> PipelineState:
882
+ block_state = self.get_block_state(state)
883
+
884
+ if block_state.video_stream is None:
885
+ raise ValueError(
886
+ "Streaming video-to-video requires an input video. Please provide a `video_stream` input to the pipeline."
887
+ )
888
+
889
+ block_state.input_frames_cache.extend(block_state.video_stream)
890
+ video = (
891
+ components.video_processor.preprocess(
892
+ list(block_state.input_frames_cache),
893
+ block_state.height,
894
+ block_state.width,
895
+ )
896
+ .unsqueeze(0)
897
+ .to(components.vae.device, components.vae.dtype)
898
+ )
899
+
900
+ block_state.current_start_frame = (
901
+ block_state.block_idx * components.config.num_frames_per_block
902
+ )
903
+ init_latents = self.encode_frames(
904
+ components,
905
+ video,
906
+ components.vae.dtype,
907
+ components.vae.device,
908
+ None,
909
+ )
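+ # Keep only the latent frames for the newest chunk; earlier context is carried by the KV cache and frame cache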
910
+ init_latents = init_latents[:, :, -components.config.num_frames_per_block :]
911
+
912
+ strength = block_state.timesteps[0] / 1000.0
913
+ noise = randn_tensor(
914
+ init_latents.shape,
915
+ device=components.transformer.device,
916
+ dtype=components.transformer.dtype,
917
+ generator=block_state.generator,
918
+ )
919
+
920
+ init_latents = init_latents * (1.0 - strength) + noise * strength
921
+ init_latents = init_latents.to(components.transformer.dtype).contiguous()
922
+
923
+ block_state.latents = init_latents
924
+
925
+ self.set_block_state(state, block_state)
926
+ return components, state
927
+
928
+
929
+ class WanRTExtractBlockLatentsStep(ModularPipelineBlocks):
930
+ """
931
+ Extracts a single block of latents from the full video buffer for streaming generation.
932
+
933
+ This block simply slices the final_latents buffer to get the current block's latents.
934
+ The final_latents buffer should be created beforehand using WanRTPrepareAllLatents.
935
+ """
936
+
937
+ model_name = "wan"
938
+
939
+ @property
940
+ def expected_components(self) -> List[ComponentSpec]:
941
+ return []
942
+
943
+ @property
944
+ def description(self) -> str:
945
+ return (
946
+ "Extracts a single block from the full latent buffer for streaming generation. "
947
+ "Slices final_latents based on block_idx to get current block's latents."
948
+ )
949
+
950
+ @property
951
+ def inputs(self) -> List[InputParam]:
952
+ return [
953
  InputParam(
954
  "block_idx",
955
  required=True,
 
957
  default=0,
958
  description="Current block index to process",
959
  ),
960
+ InputParam(
961
+ "init_latents",
962
+ required=True,
963
+ type_hint=torch.Tensor,
964
+ description="Full latent buffer [B, C, total_frames, H, W]",
965
+ ),
966
  InputParam(
967
  "num_frames_per_block",
968
  required=True,
 
993
  ) -> PipelineState:
994
  block_state = self.get_block_state(state)
995
 
996
+ num_frames_per_block = components.config.num_frames_per_block
997
  block_idx = block_state.block_idx
998
 
999
  # Calculate frame range for current block
 
1012
  return components, state
1013
 
1014
 
1015
+ class WanRTSetupKVCache(ModularPipelineBlocks):
1016
  """
1017
  Initializes KV cache and cross-attention cache for streaming generation.
1018
 
 
1021
  Should be called once at the start of streaming generation.
1022
  """
1023
 
1024
+ model_name = "wan"
1025
 
1026
  @property
1027
  def expected_components(self) -> List[ComponentSpec]:
 
1142
  return components, state
1143
 
1144
 
1145
+ class WanRTRecomputeKVCache(ModularPipelineBlocks):
1146
  @property
1147
  def inputs(self) -> List[InputParam]:
1148
  return [
 
1152
  description="Current block latents [B, C, num_frames_per_block, H, W]",
1153
  ),
1154
  InputParam(
1155
+ "num_blocks",
1156
  type_hint=int,
1157
+ description="Number of blocks to denoise",
1158
  ),
1159
  InputParam(
1160
  "block_idx",
1161
  type_hint=int,
1162
  description="Current block index to process",
1163
  ),
 
1164
  InputParam(
1165
  "current_start_frame",
1166
  type_hint=int,
1167
  description="Starting frame index for current block",
1168
  ),
1169
  InputParam(
1170
  "prompt_embeds",
1171
  type_hint=torch.Tensor,
 
1181
  type_hint=torch.Tensor,
1182
  description="Cross-attention cache",
1183
  ),
1184
  InputParam(
1185
  "frame_cache_context",
1186
  description="Cached context frames for reencoding",
1187
  ),
1188
  InputParam(
1189
+ "current_denoised_latents",
1190
+ type_hint=torch.Tensor,
1191
+ description="Current denoised latents",
1192
  ),
1193
  ]
1194
 
 
1196
  def expected_configs(self) -> List[ConfigSpec]:
1197
  return [ConfigSpec("seq_length", 32760)]
1198
 
1199
+ def prepare_latents(self, components, frames):
 
 
1200
  components.vae._enc_feat_map = [None] * 55
1201
  latents = retrieve_latents(components.vae.encode(frames), sample_mode="argmax")
1202
  latents_mean = (
 
1213
 
1214
  def get_context_frames(self, components, block_state):
1215
  current_kv_cache_num_frames = components.config.kv_cache_num_frames
1216
+ total_frames_generated = (
1217
  block_state.block_idx - 1
1218
+ ) * components.config.num_frames_per_block
1219
+
1220
+ if total_frames_generated < current_kv_cache_num_frames:
1221
+ context_frames = block_state.current_denoised_latents[
1222
+ :, :, :current_kv_cache_num_frames
1223
+ ]
1224
+
1225
  else:
1226
+ context_frames = block_state.current_denoised_latents
1227
  context_frames = context_frames[:, :, 1:][
1228
  :, :, -current_kv_cache_num_frames + 1 :
1229
  ]
1230
+ first_frame_latent = self.prepare_latents(
1231
+ components, frames=block_state.frame_cache_context[0].half()
1232
+ )
1233
  first_frame_latent = first_frame_latent.to(block_state.latents)
1234
  context_frames = torch.cat((first_frame_latent, context_frames), dim=2)
1235
 
 
1240
  if block_state.block_idx == 0:
1241
  return components, state
1242
 
1243
  context_frames = self.get_context_frames(components, block_state)
1244
+ block_mask = components.transformer._prepare_blockwise_causal_attn_mask(
1245
+ components.transformer.device,
1246
+ num_frames=context_frames.shape[2],
1247
+ frame_seqlen=components.config.frame_seq_length,
1248
+ num_frame_per_block=components.config.num_frames_per_block,
1249
+ local_attn_size=-1,
 
 
1250
  )
1251
+ components.transformer.block_mask = block_mask
1252
  context_timestep = torch.zeros(
1253
  (context_frames.shape[0], context_frames.shape[2]),
1254
  device=components.transformer.device,
 
1261
  kv_cache=block_state.kv_cache,
1262
  seq_len=components.config.seq_length,
1263
  crossattn_cache=block_state.crossattn_cache,
1264
+ current_start=0, # when updating the kv cache with block_mask the current_start is unused
1265
  cache_start=None,
1266
  )
1267
  components.transformer.block_mask = None
 
1269
  return components, state
1270
 
1271
 
1272
+ class WanRTBeforeDenoiseStep(SequentialPipelineBlocks):
1273
  block_classes = [
1274
+ WanRTSetTimestepsStep,
1275
+ WanRTPrepareLatentsStep,
1276
+ WanRTExtractBlockLatentsStep,
1277
+ WanRTSetupKVCache,
1278
+ WanRTRecomputeKVCache,
1279
  ]
1280
  block_names = [
1281
  "set_timesteps",
 
1293
  + " - `WanRTInputStep` is used to adjust the batch size of the model inputs\n"
1294
  + " - `WanRTSetTimestepsStep` is used to set the timesteps\n"
1295
  + " - `WanRTPrepareLatentsStep` is used to prepare the latents\n"
1296
+ + " - `WanRTPrepareVideoLatentStep` is used to prepare video latents from input video\n"
1297
+ )
1298
+
1299
+
1300
+ class WanRTVideoToVideoBeforeDenoiseStep(SequentialPipelineBlocks):
1301
+ block_classes = [
1302
+ WanRTSetTimestepsStep,
1303
+ WanRTPrepareVideoLatentStep,
1304
+ WanRTExtractBlockLatentsStep,
1305
+ WanRTSetupKVCache,
1306
+ WanRTRecomputeKVCache,
1307
+ ]
1308
+ block_names = [
1309
+ "set_timesteps",
1310
+ "prepare_video_latents",
1311
+ "extract_block_init_latents",
1312
+ "setup_kv_cache",
1313
+ "recompute_kv_cache",
1314
+ ]
1315
+
1316
+ @property
1317
+ def description(self):
1318
+ return (
1319
+ "Before denoise step that prepares the inputs for the denoise step.\n"
1320
+ + "This is a sequential pipeline block:\n"
1321
+ + " - `WanRTInputStep` is used to adjust the batch size of the model inputs\n"
1322
+ + " - `WanRTSetTimestepsStep` is used to set the timesteps\n"
1323
+ + " - `WanRTPrepareLatentsStep` is used to prepare the latents\n"
1324
+ + " - `WanRTPrepareVideoLatentStep` is used to prepare video latents from input video\n"
1325
  )
1326
+
1327
+
1328
+ class WanRTStreamVideoToVideoBeforeDenoiseStep(SequentialPipelineBlocks):
1329
+ block_classes = [
1330
+ WanRTSetTimestepsStep,
1331
+ WanRTStreamPrepareVideoLatentStep,
1332
+ WanRTSetupKVCache,
1333
+ WanRTRecomputeKVCache,
1334
+ ]
1335
+ block_names = [
1336
+ "set_timesteps",
1337
+ "prepare_video_latents",
1338
+ "setup_kv_cache",
1339
+ "recompute_kv_cache",
1340
+ ]
1341
+
1342
+ @property
1343
+ def description(self):
1344
+ return (
1345
+ "Before denoise step that prepares the inputs for the denoise step.\n"
1346
+ + "This is a sequential pipeline block:\n"
1347
+ + " - `WanRTInputStep` is used to adjust the batch size of the model inputs\n"
1348
+ + " - `WanRTSetTimestepsStep` is used to set the timesteps\n"
1349
+ + " - `WanRTPrepareLatentsStep` is used to prepare the latents\n"
1350
+ + " - `WanRTPrepareVideoLatentStep` is used to prepare video latents from input video\n"
1351
+ )
1352
+
1353
+
1354
+ class WanRTAutoBeforeDenoiseStep(AutoPipelineBlocks):
1355
+ block_classes = [
1356
+ WanRTVideoToVideoBeforeDenoiseStep,
1357
+ WanRTStreamVideoToVideoBeforeDenoiseStep,
1358
+ WanRTBeforeDenoiseStep,
1359
+ ]
1360
+ block_names = ["video-to-video", "stream-to-video", "text-to-video"]
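+ # AutoPipelineBlocks dispatch: a `video` input selects video-to-video, `video_stream` selects streaming video-to-video, and neither falls back to text-to-video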
1361
+ block_trigger_inputs = ["video", "video_stream", None]
decoders.py CHANGED
@@ -13,6 +13,7 @@
13
  # limitations under the License.
14
 
15
  from typing import Any, List, Tuple, Union
 
16
 
17
  import numpy as np
18
  import PIL
@@ -35,7 +36,7 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
 
36
 
37
  class WanRTDecodeStep(ModularPipelineBlocks):
38
- model_name = "WanRT"
39
  decoder_cache = []
40
 
41
  @property
@@ -62,7 +63,15 @@ class WanRTDecodeStep(ModularPipelineBlocks):
62
  @property
63
  def inputs(self) -> List[Tuple[str, Any]]:
64
  return [
65
- InputParam("output_type", default="pil"),
66
  InputParam(
67
  "latents",
68
  required=True,
@@ -71,15 +80,12 @@ class WanRTDecodeStep(ModularPipelineBlocks):
71
  ),
72
  InputParam(
73
  "frame_cache_context",
74
- description="The denoised latents from the denoising step",
75
- ),
76
- InputParam(
77
- "block_idx",
78
- description="The denoised latents from the denoising step",
79
  ),
80
  InputParam(
81
  "decoder_cache",
82
- description="The denoised latents from the denoising step",
83
  ),
84
  ]
85
 
@@ -100,6 +106,10 @@ class WanRTDecodeStep(ModularPipelineBlocks):
100
  block_state = self.get_block_state(state)
101
  vae_dtype = components.vae.dtype
102
 
 
  # Disable clearing cache
104
  if block_state.block_idx == 0:
105
  components.vae.clear_cache()
@@ -134,12 +144,10 @@ class WanRTDecodeStep(ModularPipelineBlocks):
134
 
135
  block_state.decoder_cache = components.vae._feat_map
136
  block_state.frame_cache_context.extend(videos.split(1, dim=2))
137
-
138
  videos = components.video_processor.postprocess_video(
139
  videos, output_type=block_state.output_type
140
  )
141
  block_state.videos = videos
142
-
143
  self.set_block_state(state, block_state)
144
 
145
  return components, state
 
13
  # limitations under the License.
14
 
15
  from typing import Any, List, Tuple, Union
16
+ from collections import deque
17
 
18
  import numpy as np
19
  import PIL
 
36
 
37
 
38
  class WanRTDecodeStep(ModularPipelineBlocks):
39
+ model_name = "wan"
40
  decoder_cache = []
41
 
42
  @property
 
63
  @property
64
  def inputs(self) -> List[Tuple[str, Any]]:
65
  return [
66
+ InputParam(
67
+ "output_type",
68
+ default="pil",
69
+ description="The output format for the generated videos (pil, latent, pt, or np)",
70
+ ),
71
+ InputParam(
72
+ "block_idx",
73
+ description="Index of the current block being decoded",
74
+ ),
75
  InputParam(
76
  "latents",
77
  required=True,
 
80
  ),
81
  InputParam(
82
  "frame_cache_context",
83
+ description="Deque object to store most recently decoded frames",
84
+ type_hint=deque
85
  ),
86
  InputParam(
87
  "decoder_cache",
88
+ description="Decoder feature cache",
89
  ),
90
  ]
91
 
 
106
  block_state = self.get_block_state(state)
107
  vae_dtype = components.vae.dtype
108
 
109
+ if block_state.frame_cache_context is None:
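+ # Each latent frame after the first decodes to 4 pixel frames (Wan VAE 4x temporal compression), so cache 1 + (k - 1) * 4 frames for k cached latent frames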
110
+ frame_cache_len = 1 + (components.config.kv_cache_num_frames - 1) * 4
111
+ block_state.frame_cache_context = deque(maxlen=frame_cache_len)
112
+
113
  # Disable clearing cache
114
  if block_state.block_idx == 0:
115
  components.vae.clear_cache()
 
144
 
145
  block_state.decoder_cache = components.vae._feat_map
146
  block_state.frame_cache_context.extend(videos.split(1, dim=2))
 
147
  videos = components.video_processor.postprocess_video(
148
  videos, output_type=block_state.output_type
149
  )
150
  block_state.videos = videos
 
151
  self.set_block_state(state, block_state)
152
 
153
  return components, state
denoise.py CHANGED
@@ -16,8 +16,6 @@ from typing import Any, List, Tuple
16
 
17
  import torch
18
 
19
- from diffusers.configuration_utils import FrozenDict
20
- from diffusers.guiders import ClassifierFreeGuidance
21
  from diffusers.models import AutoModel
22
  from diffusers.schedulers import UniPCMultistepScheduler
23
  from diffusers.utils import logging
@@ -39,8 +37,8 @@ from diffusers.modular_pipelines.modular_pipeline_utils import (
39
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
40
 
41
 
42
- class WanRTStreamingLoopDenoiser(ModularPipelineBlocks):
43
- model_name = "WanRTStreaming"
44
 
45
  @property
46
  def expected_components(self) -> List[ComponentSpec]:
@@ -51,14 +49,12 @@ class WanRTStreamingLoopDenoiser(ModularPipelineBlocks):
51
  return (
52
  "Step within the denoising loop that denoise the latents with guidance. "
53
  "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
54
- "object (e.g. `WanRTStreamingDenoiseLoopWrapper`)"
55
  )
56
 
57
  @property
58
  def inputs(self) -> List[Tuple[str, Any]]:
59
  return [
60
- InputParam("attention_kwargs"),
61
- InputParam("block_idx"),
62
  InputParam(
63
  "latents",
64
  required=True,
@@ -69,36 +65,25 @@ class WanRTStreamingLoopDenoiser(ModularPipelineBlocks):
69
  "prompt_embeds",
70
  required=True,
71
  type_hint=torch.Tensor,
 
72
  ),
73
  InputParam(
74
  "kv_cache",
75
  required=True,
76
  type_hint=torch.Tensor,
 
77
  ),
78
  InputParam(
79
  "crossattn_cache",
80
  required=True,
81
  type_hint=torch.Tensor,
 
82
  ),
83
  InputParam(
84
  "current_start_frame",
85
  required=True,
86
  type_hint=torch.Tensor,
87
- ),
88
- InputParam(
89
- "num_inference_steps",
90
- required=True,
91
- type_hint=int,
92
- default=4,
93
- description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
94
- ),
95
- InputParam(
96
- kwargs_type="guider_input_fields",
97
- description=(
98
- "All conditional model inputs that need to be prepared with guider. "
99
- "It should contain prompt_embeds/negative_prompt_embeds. "
100
- "Please add `kwargs_type=guider_input_fields` to their parameter spec (`OutputParam`) when they are created and added to the pipeline state"
101
- ),
102
  ),
103
  ]
104
 
@@ -116,20 +101,21 @@ class WanRTStreamingLoopDenoiser(ModularPipelineBlocks):
116
 
117
  block_state.noise_pred = components.transformer(
118
  x=block_state.latents,
119
- t=t.expand(block_state.latents.shape[0], block_state.num_frames_per_block),
 
 
120
  context=block_state.prompt_embeds,
121
  kv_cache=block_state.kv_cache,
122
  seq_len=components.config.seq_length,
123
  crossattn_cache=block_state.crossattn_cache,
124
  current_start=start_frame * components.config.frame_seq_length,
125
- cache_start=start_frame * components.config.frame_seq_length,
126
  )
127
-
128
  return components, block_state
129
 
130
 
131
- class WanRTStreamingLoopAfterDenoiser(ModularPipelineBlocks):
132
- model_name = "WanRTStreaming"
133
 
134
  @property
135
  def expected_components(self) -> List[ComponentSpec]:
@@ -142,18 +128,24 @@ class WanRTStreamingLoopAfterDenoiser(ModularPipelineBlocks):
142
  return (
143
  "step within the denoising loop that update the latents. "
144
  "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
145
- "object (e.g. `WanRTStreamingDenoiseLoopWrapper`)"
146
  )
147
 
148
  @property
149
  def inputs(self) -> List[Tuple[str, Any]]:
150
- return []
151
-
152
- @property
153
- def intermediate_inputs(self) -> List[str]:
154
  return [
155
- InputParam("generator"),
156
- InputParam("block_id"),
157
  ]
158
 
159
  @property
@@ -185,14 +177,13 @@ class WanRTStreamingLoopAfterDenoiser(ModularPipelineBlocks):
185
  block_state.latents.double()
186
  - sigma_t.double() * block_state.noise_pred.double()
187
  ).to(latents_dtype)
188
-
189
  block_state.latents = latents
190
 
191
  return components, block_state
192
 
193
 
194
- class WanRTStreamingDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
195
- model_name = "WanRTStreaming"
196
 
197
  @property
198
  def description(self) -> str:
@@ -201,7 +192,7 @@ class WanRTStreamingDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
201
  "Recomputes cache from context frames, denoises current block, and updates cache."
202
  )
203
 
204
- def add_noise(self, components, block_state, sample, noise, timestep, index):
205
  timesteps = block_state.all_timesteps
206
  sigmas = block_state.sigmas.to(timesteps.device)
207
 
@@ -232,38 +223,25 @@ class WanRTStreamingDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
232
  "all_timesteps",
233
  required=True,
234
  type_hint=torch.Tensor,
235
- description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
236
  ),
237
  InputParam(
238
  "sigmas",
239
  required=True,
240
  type_hint=torch.Tensor,
241
- description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
242
  ),
243
- InputParam("final_latents", type_hint=torch.Tensor),
244
  InputParam(
245
  "num_inference_steps",
246
  required=True,
247
  type_hint=int,
248
  description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
249
  ),
250
- InputParam(
251
- "num_frames_per_block",
252
- required=True,
253
- type_hint=int,
254
- default=3,
255
- ),
256
  InputParam(
257
  "current_start_frame",
258
  required=True,
259
  type_hint=int,
260
  ),
261
- InputParam(
262
- "block_idx",
263
- ),
264
- InputParam(
265
- "generator",
266
- ),
267
  ]
268
 
269
  @torch.no_grad()
@@ -279,7 +257,6 @@ class WanRTStreamingDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
279
 
280
  block_state.latents = (
281
  self.add_noise(
282
- components,
283
  block_state,
284
  block_state.latents.transpose(1, 2).squeeze(0),
285
  randn_tensor(
@@ -290,31 +267,23 @@ class WanRTStreamingDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
290
  ),
291
  t1.expand(
292
  block_state.latents.shape[0],
293
- block_state.num_frames_per_block,
294
  ),
295
- i,
296
  )
297
  .unsqueeze(0)
298
  .transpose(1, 2)
299
  )
300
 
301
- # Update the state
302
- block_state.final_latents[
303
- :,
304
- :,
305
- block_state.current_start_frame : block_state.current_start_frame
306
- + block_state.num_frames_per_block,
307
- ] = block_state.latents
308
-
309
  self.set_block_state(state, block_state)
310
 
311
  return components, state
312
 
313
 
314
- class WanRTStreamingDenoiseStep(WanRTStreamingDenoiseLoopWrapper):
315
  block_classes = [
316
- WanRTStreamingLoopDenoiser,
317
- WanRTStreamingLoopAfterDenoiser,
318
  ]
319
  block_names = ["denoiser", "after_denoiser"]
320
 
@@ -322,9 +291,9 @@ class WanRTStreamingDenoiseStep(WanRTStreamingDenoiseLoopWrapper):
322
  def description(self) -> str:
323
  return (
324
  "Denoise step that iteratively denoise the latents. \n"
325
- "Its loop logic is defined in `WanRTStreamingDenoiseLoopWrapper.__call__` method \n"
326
  "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
327
- " - `WanRTStreamingLoopDenoiser`\n"
328
- " - `WanRTStreamingLoopAfterDenoiser`\n"
329
  "This block supports both text2vid tasks."
330
  )
 
16
 
17
  import torch
18
 
 
 
19
  from diffusers.models import AutoModel
20
  from diffusers.schedulers import UniPCMultistepScheduler
21
  from diffusers.utils import logging
 
37
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
38
 
39
 
40
+ class WanRTLoopDenoiser(ModularPipelineBlocks):
41
+ model_name = "wan"
42
 
43
  @property
44
  def expected_components(self) -> List[ComponentSpec]:
 
49
  return (
50
  "Step within the denoising loop that denoise the latents with guidance. "
51
  "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
52
+ "object (e.g. `WanRTDenoiseLoopWrapper`)"
53
  )
54
 
55
  @property
56
  def inputs(self) -> List[Tuple[str, Any]]:
57
  return [
 
 
58
  InputParam(
59
  "latents",
60
  required=True,
 
65
  "prompt_embeds",
66
  required=True,
67
  type_hint=torch.Tensor,
68
+ description="Text embeddings to condition the denoising process",
69
  ),
70
  InputParam(
71
  "kv_cache",
72
  required=True,
73
  type_hint=torch.Tensor,
74
+ description="KV Cache of the transformer model",
75
  ),
76
  InputParam(
77
  "crossattn_cache",
78
  required=True,
79
  type_hint=torch.Tensor,
80
+ description="Cross Attention Cache of the transformer model",
81
  ),
82
  InputParam(
83
  "current_start_frame",
84
  required=True,
85
  type_hint=torch.Tensor,
86
+ description="Starting frame index for the current block in the streaming generation",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  ),
88
  ]
89
 
 
101
 
102
  block_state.noise_pred = components.transformer(
103
  x=block_state.latents,
104
+ t=t.expand(
105
+ block_state.latents.shape[0], components.config.num_frames_per_block
106
+ ),
107
  context=block_state.prompt_embeds,
108
  kv_cache=block_state.kv_cache,
109
  seq_len=components.config.seq_length,
110
  crossattn_cache=block_state.crossattn_cache,
111
  current_start=start_frame * components.config.frame_seq_length,
112
+ cache_start=None,
113
  )
 
114
  return components, block_state
115
 
116
 
117
+ class WanRTLoopAfterDenoiser(ModularPipelineBlocks):
118
+ model_name = "wan"
119
 
120
  @property
121
  def expected_components(self) -> List[ComponentSpec]:
 
128
  return (
129
  "step within the denoising loop that update the latents. "
130
  "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
131
+ "object (e.g. `WanRTDenoiseLoopWrapper`)"
132
  )
133
 
134
  @property
135
  def inputs(self) -> List[Tuple[str, Any]]:
 
 
 
 
136
  return [
137
+ InputParam(
138
+ "latents",
139
+ description="Current latents being denoised",
140
+ ),
141
+ InputParam(
142
+ "all_timesteps",
143
+ description="All timesteps for the denoising process",
144
+ ),
145
+ InputParam(
146
+ "sigmas",
147
+ description="Noise schedule sigmas for each timestep",
148
+ ),
149
  ]
150
 
151
  @property
 
177
  block_state.latents.double()
178
  - sigma_t.double() * block_state.noise_pred.double()
179
  ).to(latents_dtype)
 
180
  block_state.latents = latents
181
 
182
  return components, block_state
183
 
184
 
185
+ class WanRTDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
186
+ model_name = "wan"
187
 
188
  @property
189
  def description(self) -> str:
 
192
  "Recomputes cache from context frames, denoises current block, and updates cache."
193
  )
194
 
195
+ def add_noise(self, block_state, sample, noise, timestep):
196
  timesteps = block_state.all_timesteps
197
  sigmas = block_state.sigmas.to(timesteps.device)
198
 
 
223
  "all_timesteps",
224
  required=True,
225
  type_hint=torch.Tensor,
 
226
  ),
227
  InputParam(
228
  "sigmas",
229
  required=True,
230
  type_hint=torch.Tensor,
 
231
  ),
232
+ InputParam("current_denoised_latents", type_hint=torch.Tensor),
233
  InputParam(
234
  "num_inference_steps",
235
  required=True,
236
  type_hint=int,
237
  description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
238
  ),
 
 
 
 
 
 
239
  InputParam(
240
  "current_start_frame",
241
  required=True,
242
  type_hint=int,
243
  ),
244
+ InputParam("generator", type_hint=torch.Generator),
 
 
 
 
 
245
  ]
246
 
247
  @torch.no_grad()
 
257
 
258
  block_state.latents = (
259
  self.add_noise(
 
260
  block_state,
261
  block_state.latents.transpose(1, 2).squeeze(0),
262
  randn_tensor(
 
267
  ),
268
  t1.expand(
269
  block_state.latents.shape[0],
270
+ components.config.num_frames_per_block,
271
  ),
 
272
  )
273
  .unsqueeze(0)
274
  .transpose(1, 2)
275
  )
276
 
277
+ block_state.current_denoised_latents = block_state.latents
 
 
 
 
 
 
 
278
  self.set_block_state(state, block_state)
279
 
280
  return components, state
281
 
282
 
283
+ class WanRTDenoiseStep(WanRTDenoiseLoopWrapper):
284
  block_classes = [
285
+ WanRTLoopDenoiser,
286
+ WanRTLoopAfterDenoiser,
287
  ]
288
  block_names = ["denoiser", "after_denoiser"]
289
 
 
291
  def description(self) -> str:
292
  return (
293
  "Denoise step that iteratively denoise the latents. \n"
294
+ "Its loop logic is defined in `WanRTDenoiseLoopWrapper.__call__` method \n"
295
  "At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
296
+ " - `WanRTLoopDenoiser`\n"
297
+ " - `WanRTLoopAfterDenoiser`\n"
298
  "This block supports both text2vid tasks."
299
  )
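
With this change, `num_frames_per_block` is no longer a runtime input to the denoise loop; it is read from the pipeline config as `components.config.num_frames_per_block`. A minimal sketch of picking the value up on the caller side, assuming the loaded pipeline exposes the same config key:

```py
# Sketch only: read the streaming block size from the pipeline config instead of hard-coding it.
# Assumes `pipe` is the loaded WanModularPipeline and that the config key matches denoise.py above.
num_frames_per_block = pipe.config.num_frames_per_block
```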
encoders.py CHANGED
@@ -56,7 +56,7 @@ def prompt_clean(text):
  return text

- class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
  model_name = "WanRTStreaming"

  @property
@@ -83,8 +83,14 @@ class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
  @property
  def inputs(self) -> List[InputParam]:
  return [
- InputParam("prompt"),
- InputParam("negative_prompt"),
  InputParam(
  "prompt_embeds",
  type_hint=torch.Tensor,
@@ -95,7 +101,10 @@ class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
  type_hint=torch.Tensor,
  description="negative text embeddings used to guide the image generation",
  ),
- InputParam("attention_kwargs"),
  ]

  @property
@@ -205,7 +214,7 @@ class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
  batch_size = len(prompt) if prompt is not None else prompt_embeds.shape[0]

  if prompt_embeds is None:
- prompt_embeds = WanRTStreamingTextEncoderStep._get_t5_prompt_embeds(
  components, prompt, max_sequence_length, device
  )

@@ -229,10 +238,8 @@ class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
  " the batch size of `prompt`."
  )

- negative_prompt_embeds = (
- WanRTStreamingTextEncoderStep._get_t5_prompt_embeds(
- components, negative_prompt, max_sequence_length, device
- )
  )

  bs_embed, seq_len, _ = prompt_embeds.shape
@@ -266,7 +273,7 @@ class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
  (
  block_state.prompt_embeds,
  block_state.negative_prompt_embeds,
- ) = WanRTStreamingTextEncoderStep.encode_prompt(
  components,
  block_state.prompt,
  block_state.device,
@@ -276,6 +283,7 @@ class WanRTStreamingTextEncoderStep(ModularPipelineBlocks):
  prompt_embeds=block_state.prompt_embeds,
  negative_prompt_embeds=block_state.negative_prompt_embeds,
  )

  # Add outputs
  self.set_block_state(state, block_state)

  return text

+ class WanRTTextEncoderStep(ModularPipelineBlocks):
  model_name = "WanRTStreaming"

  @property

  @property
  def inputs(self) -> List[InputParam]:
  return [
+ InputParam(
+ "prompt",
+ description="The prompt or prompts to guide the video generation",
+ ),
+ InputParam(
+ "negative_prompt",
+ description="The prompt or prompts not to guide the video generation",
+ ),
  InputParam(
  "prompt_embeds",
  type_hint=torch.Tensor,

  type_hint=torch.Tensor,
  description="negative text embeddings used to guide the image generation",
  ),
+ InputParam(
+ "attention_kwargs",
+ description="Additional keyword arguments to pass to the attention mechanism",
+ ),
  ]

  @property

  batch_size = len(prompt) if prompt is not None else prompt_embeds.shape[0]

  if prompt_embeds is None:
+ prompt_embeds = WanRTTextEncoderStep._get_t5_prompt_embeds(
  components, prompt, max_sequence_length, device
  )

  " the batch size of `prompt`."
  )

+ negative_prompt_embeds = WanRTTextEncoderStep._get_t5_prompt_embeds(
+ components, negative_prompt, max_sequence_length, device
  )

  bs_embed, seq_len, _ = prompt_embeds.shape

  (
  block_state.prompt_embeds,
  block_state.negative_prompt_embeds,
+ ) = WanRTTextEncoderStep.encode_prompt(
  components,
  block_state.prompt,
  block_state.device,

  prompt_embeds=block_state.prompt_embeds,
  negative_prompt_embeds=block_state.negative_prompt_embeds,
  )
+ block_state.prompt_embeds = block_state.prompt_embeds.contiguous()

  # Add outputs
  self.set_block_state(state, block_state)
modular_blocks.py CHANGED
@@ -16,27 +16,24 @@ from diffusers.utils import logging
  from diffusers.modular_pipelines import SequentialPipelineBlocks
  from diffusers.modular_pipelines.modular_pipeline_utils import InsertableDict

- from .before_denoise import WanRTStreamingBeforeDenoiseStep
  from .decoders import WanRTDecodeStep
- from .encoders import WanRTStreamingTextEncoderStep
- from .denoise import WanRTStreamingDenoiseStep

  logger = logging.get_logger(__name__) # pylint: disable=invalid-name

- TEXT2VIDEO_BLOCKS = InsertableDict(
  [
- ("text_encoder", WanRTStreamingTextEncoderStep),
- ("before_denoise", WanRTStreamingBeforeDenoiseStep),
- ("denoise", WanRTStreamingDenoiseStep),
  ("decode", WanRTDecodeStep),
  ]
  )

- ALL_BLOCKS = {
- "text2video": TEXT2VIDEO_BLOCKS,
- }
-

- class WanStreamingRTBlocks(SequentialPipelineBlocks):
- block_classes = list(TEXT2VIDEO_BLOCKS.copy().values())
- block_names = list(TEXT2VIDEO_BLOCKS.copy().keys())

  from diffusers.modular_pipelines import SequentialPipelineBlocks
  from diffusers.modular_pipelines.modular_pipeline_utils import InsertableDict

+ from .before_denoise import WanRTAutoBeforeDenoiseStep
  from .decoders import WanRTDecodeStep
+ from .encoders import WanRTTextEncoderStep
+ from .denoise import WanRTDenoiseStep

  logger = logging.get_logger(__name__) # pylint: disable=invalid-name

+
+ AUTO_BLOCKS = InsertableDict(
  [
+ ("text_encoder", WanRTTextEncoderStep),
+ ("before_denoise", WanRTAutoBeforeDenoiseStep),
+ ("denoise", WanRTDenoiseStep),
  ("decode", WanRTDecodeStep),
  ]
  )

+ class WanRTBlocks(SequentialPipelineBlocks):
+ block_classes = list(AUTO_BLOCKS.copy().values())
+ block_names = list(AUTO_BLOCKS.copy().keys())
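
The block preset is now named `AUTO_BLOCKS` and the top-level class `WanRTBlocks`. A small sketch for checking the assembled order locally, assuming the repository files are importable and that the preset class can be instantiated directly like other modular presets:

```py
# Sketch: list the sub-blocks to confirm the text_encoder -> before_denoise -> denoise -> decode order.
from modular_blocks import WanRTBlocks  # local module from this repository

blocks = WanRTBlocks()
print(blocks.sub_blocks)  # InsertableDict built from AUTO_BLOCKS
```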
modular_config.json CHANGED
@@ -2,6 +2,6 @@
  "_class_name": "WanRTBlocks",
  "_diffusers_version": "0.36.0.dev0",
  "auto_map": {
- "ModularPipelineBlocks": "modular_blocks.WanStreamingRTBlocks"
  }
  }

  "_class_name": "WanRTBlocks",
  "_diffusers_version": "0.36.0.dev0",
  "auto_map": {
+ "ModularPipelineBlocks": "modular_blocks.WanRTBlocks"
  }
  }
modular_model_index.json CHANGED
@@ -1,6 +1,6 @@
  {
- "_blocks_class_name": "WanStreamingRTBlocks",
- "_class_name": "WanRTStreamingPipeline",
  "_diffusers_version": "0.36.0.dev0",
  "frame_seq_length": 1560,
  "kv_cache_num_frames": 3,
@@ -52,7 +52,7 @@
  null,
  null,
  {
- "repo": "diffusers-internal-dev/krt",
  "revision": null,
  "subfolder": "transformer",
  "type_hint": [

  {
+ "_blocks_class_name": "WanRTBlocks",
+ "_class_name": "WanModularPipeline",
  "_diffusers_version": "0.36.0.dev0",
  "frame_seq_length": 1560,
  "kv_cache_num_frames": 3,

  null,
  null,
  {
+ "repo": "krea/krea-realtime-video",
  "revision": null,
  "subfolder": "transformer",
  "type_hint": [
transformer/attention.py CHANGED
@@ -1,65 +1,40 @@
  # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
  import torch
- from typing import Optional
  import os
  import warnings
-
- # Global state for lazy initialization
- _SAGEATTN_AVAILABLE = None
- _FLASH_ATTN_3_AVAILABLE = None
- _FLASH_ATTN_2_AVAILABLE = None
- _sageattn_func = None
- _flash_attn_func = None
- _flash_attn_interface = None
- _flash_attn = None
-
-
- def _init_sageattention():
- """Lazy initialization for SageAttention."""
- global _SAGEATTN_AVAILABLE, _sageattn_func
-
- if _SAGEATTN_AVAILABLE is not None:
- return _SAGEATTN_AVAILABLE
-
- _SAGEATTN_AVAILABLE = False
- try:
- if os.getenv("DISABLE_SAGEATTENTION", "0") != "0":
- raise Exception("DISABLE_SAGEATTENTION is set")
-
- from sageattention import sageattn
-
- @torch.library.custom_op(
- "mylib::sageattn", mutates_args={"q", "k", "v"}, device_types="cuda"
  )
- def sageattn_func(
- q: torch.Tensor,
- k: torch.Tensor,
- v: torch.Tensor,
- attn_mask: Optional[torch.Tensor] = None,
- dropout_p: float = 0,
- is_causal: bool = False,
- ) -> torch.Tensor:
- return sageattn(
- q, k, v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal
- )

- @sageattn_func.register_fake
- def _sageattn_fake(q, k, v, attn_mask=None, dropout_p=0, is_causal=False):
- return torch.empty(*q.shape, device=q.device, dtype=q.dtype)

- print("SageAttention loaded successfully")
- _sageattn_func = sageattn_func
- _SAGEATTN_AVAILABLE = True

- except Exception as e:
- print(f"Warning: Could not load sageattention: {str(e)}")
- if isinstance(e, ModuleNotFoundError):
- print("sageattention package is not installed")
- elif isinstance(e, ImportError) and "DLL" in str(e):
- print("sageattention DLL loading error")
- _sageattn_func = None
-
- return _SAGEATTN_AVAILABLE


  def _is_hopper_gpu():
@@ -69,65 +44,41 @@ def _is_hopper_gpu():
  device_name = torch.cuda.get_device_name(0).lower()
  return "h100" in device_name or "hopper" in device_name

- def _init_flash_attention_3():
- """Lazy initialization for Flash Attention 3."""
- global _FLASH_ATTN_3_AVAILABLE, _flash_attn_func, _flash_attn_interface
-
- if _FLASH_ATTN_3_AVAILABLE is not None:
- return _FLASH_ATTN_3_AVAILABLE
-
- _FLASH_ATTN_3_AVAILABLE = False
- try:
- from flash_attn import flash_attn_func
- import flash_attn_interface

- # Always set the function reference if flash_attn is available
- _flash_attn_func = flash_attn_func
- _flash_attn_interface = flash_attn_interface
- # FA3 optimizations only available on Hopper GPUs
- _FLASH_ATTN_3_AVAILABLE = _is_hopper_gpu()
- except ModuleNotFoundError:
- _FLASH_ATTN_3_AVAILABLE = False
- _flash_attn_func = None
- _flash_attn_interface = None

- return _FLASH_ATTN_3_AVAILABLE

- def _init_flash_attention_2():
- """Lazy initialization for Flash Attention 2."""
- global _FLASH_ATTN_2_AVAILABLE, _flash_attn

- if _FLASH_ATTN_2_AVAILABLE is not None:
- return _FLASH_ATTN_2_AVAILABLE
-
- _FLASH_ATTN_2_AVAILABLE = False
- try:
- import flash_attn
-
- _flash_attn = flash_attn
- _FLASH_ATTN_2_AVAILABLE = True
- except ModuleNotFoundError:
- _FLASH_ATTN_2_AVAILABLE = False
-
- return _FLASH_ATTN_2_AVAILABLE

  __all__ = ["flash_attention", "attention"]

-
- # Compatibility getters for external code
- def sageattn_func():
- """Getter for sageattn_func - initializes if needed."""
- _init_sageattention()
- return _sageattn_func
-
-
- def SAGEATTN_AVAILABLE():
- """Getter for SAGEATTN_AVAILABLE - initializes if needed."""
- return _init_sageattention()
-
-
  def flash_attention(
  q,
  k,
@@ -156,14 +107,15 @@ def flash_attention(
  deterministic: bool. If True, slightly slower and uses more memory.
  dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
  """
- # Initialize flash attention modules
- flash_attn_3_available = _init_flash_attention_3()
- flash_attn_2_available = _init_flash_attention_2()
-
- # Early fallback for simple cases when advanced features aren't needed
- # Only use this path if flash_attn is available but we're not using FA3 features
- if not flash_attn_3_available and _flash_attn_func is not None and q_lens is None and k_lens is None:
- return _flash_attn_func(
  q,
  k,
  v,
@@ -205,15 +157,15 @@ def flash_attention(
  if q_scale is not None:
  q = q * q_scale

- if version is not None and version == 3 and not flash_attn_3_available:
  warnings.warn(
  "Flash attention 3 is not available, use flash attention 2 instead."
  )

  # apply attention
- if (version is None or version == 3) and flash_attn_3_available:
  # Note: dropout_p, window_size are not supported in FA3 now.
- x = _flash_attn_interface.flash_attn_varlen_func(
  q=q,
  k=k,
  v=v,
@@ -230,8 +182,8 @@ def flash_attention(
  deterministic=deterministic,
  ).unflatten(0, (b, lq))
  else:
- assert flash_attn_2_available
- x = _flash_attn.flash_attn_varlen_func(
  q=q,
  k=k,
  v=v,
@@ -270,12 +222,8 @@ def attention(
  fa_version=None,
  # og_dtype=torch.bfloat16,
  ):
- # Initialize attention modules
- sageattn_available = _init_sageattention()
- flash_attn_2_available = _init_flash_attention_2()
- flash_attn_3_available = _init_flash_attention_3()

- if sageattn_available:
  # print("Using sageattention")
  attn_mask = None

@@ -284,14 +232,14 @@ def attention(
  k = k.transpose(1, 2).to(dtype)
  v = v.transpose(1, 2).to(dtype)

- out = _sageattn_func(
  q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
  )

  out = out.transpose(1, 2).contiguous().to(og_dtype)
  return out

- elif flash_attn_2_available or flash_attn_3_available:
  return flash_attention(
  q=q,
  k=k,

  # Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
  import torch
  import os
  import warnings
+ from typing import Optional
+ from diffusers.utils import is_kernels_available
+
+ SAGEATTN_AVAILABLE = False
+ try:
+ if os.getenv("DISABLE_SAGEATTENTION", "0") != "0":
+ raise Exception("DISABLE_SAGEATTENTION is set")
+
+ from sageattention import sageattn
+
+ @torch.library.custom_op(
+ "mylib::sageattn", mutates_args={"q", "k", "v"}, device_types="cuda"
+ )
+ def sageattn_func(
+ q: torch.Tensor,
+ k: torch.Tensor,
+ v: torch.Tensor,
+ attn_mask: Optional[torch.Tensor] = None,
+ dropout_p: float = 0,
+ is_causal: bool = False,
+ ) -> torch.Tensor:
+ return sageattn(
+ q, k, v, attn_mask=attn_mask, dropout_p=dropout_p, is_causal=is_causal
  )

+ @sageattn_func.register_fake
+ def _sageattn_fake(q, k, v, attn_mask=None, dropout_p=0, is_causal=False):
+ return torch.empty(*q.shape, device=q.device, dtype=q.dtype)

+ SAGEATTN_AVAILABLE = True

+ except Exception as e:
+ sageattn_func = None


  def _is_hopper_gpu():
  device_name = torch.cuda.get_device_name(0).lower()
  return "h100" in device_name or "hopper" in device_name

+ FLASH_ATTN_3_AVAILABLE = False
+ try:
+ import flash_attn_interface
+ FLASH_ATTN_3_AVAILABLE = _is_hopper_gpu()
+ except ModuleNotFoundError:
+ FLASH_ATTN_3_AVAILABLE = False

+ FLASH_ATTN_3_HUB_AVAILABLE = False
+ try:
+ use_hub_kernels = os.getenv("DIFFUSERS_ENABLE_HUB_KERNELS", "false").upper() in ["1", "TRUE"]
+ if use_hub_kernels and not is_kernels_available():
+ raise EnvironmentError((
+ "Attempting to use Hub Kernels for Flash Attention 3,"
+ "but the `kernels` library was not found in your environment. "
+ "Please install via `pip install kernels`"
+ ))

+ from kernels import get_kernel
+ flash_attn_3_hub = get_kernel("kernels-community/flash-attn3", revision="fake-ops-return-probs")

+ FLASH_ATTN_3_HUB_AVAILABLE = _is_hopper_gpu()

+ except:
+ FLASH_ATTN_3_HUB_AVAILABLE = False

+ FLASH_ATTN_2_AVAILABLE = False
+ try:
+ import flash_attn

+ FLASH_ATTN_2_AVAILABLE = True
+ except ModuleNotFoundError:
+ FLASH_ATTN_2_AVAILABLE = False

  __all__ = ["flash_attention", "attention"]

  def flash_attention(
  q,
  k,

  deterministic: bool. If True, slightly slower and uses more memory.
  dtype: torch.dtype. Apply when dtype of q/k/v is not float16/bfloat16.
  """
+ if not FLASH_ATTN_3_AVAILABLE or not FLASH_ATTN_3_HUB_AVAILABLE:
+ return flash_attn.flash_attn_func(
+ q,
+ k,
+ v,
+ )
+
+ elif FLASH_ATTN_3_HUB_AVAILABLE:
+ return flash_attn_3_hub.flash_attn_func(
  q,
  k,
  v,

  if q_scale is not None:
  q = q * q_scale

+ if version is not None and version == 3 and not FLASH_ATTN_3_AVAILABLE:
  warnings.warn(
  "Flash attention 3 is not available, use flash attention 2 instead."
  )

  # apply attention
+ if (version is None or version == 3) and FLASH_ATTN_3_AVAILABLE:
  # Note: dropout_p, window_size are not supported in FA3 now.
+ x = flash_attn_interface.flash_attn_varlen_func(
  q=q,
  k=k,
  v=v,

  deterministic=deterministic,
  ).unflatten(0, (b, lq))
  else:
+ assert FLASH_ATTN_3_AVAILABLE
+ x = flash_attn.flash_attn_varlen_func(
  q=q,
  k=k,
  v=v,

  fa_version=None,
  # og_dtype=torch.bfloat16,
  ):

+ if SAGEATTN_AVAILABLE:
  # print("Using sageattention")
  attn_mask = None

  k = k.transpose(1, 2).to(dtype)
  v = v.transpose(1, 2).to(dtype)

+ out = sageattn_func(
  q, k, v, attn_mask=attn_mask, is_causal=causal, dropout_p=dropout_p
  )

  out = out.transpose(1, 2).contiguous().to(og_dtype)
  return out

+ elif FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
  return flash_attention(
  q=q,
  k=k,
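
The rewritten module resolves the attention backend at import time and adds an opt-in path for the Flash Attention 3 kernel fetched from the Hub via the `kernels` package. A minimal sketch of opting in, based on the environment variable checked above (setting it before the module is first imported is an assumption about ordering):

```py
# Sketch: opt into the Hub-hosted FA3 kernel (requires `pip install kernels` and a Hopper GPU).
import os

os.environ["DIFFUSERS_ENABLE_HUB_KERNELS"] = "1"  # read by transformer/attention.py at import time
```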
transformer/causal_model.py CHANGED
@@ -18,6 +18,7 @@ from torch.nn.attention.flex_attention import BlockMask
  from diffusers.configuration_utils import ConfigMixin, register_to_config
  from diffusers.models.modeling_utils import ModelMixin

  flex_attention = torch.compile(
  flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
@@ -642,7 +643,7 @@ class CausalHead(nn.Module):
  return x

- class CausalWanModel(ModelMixin, ConfigMixin):
  r"""
  Wan diffusion backbone supporting both text-to-video and image-to-video.
  """

  from diffusers.configuration_utils import ConfigMixin, register_to_config
  from diffusers.models.modeling_utils import ModelMixin
+ from diffusers.loaders import PeftAdapterMixin

  flex_attention = torch.compile(
  flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"

  return x

+ class CausalWanModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
  r"""
  Wan diffusion backbone supporting both text-to-video and image-to-video.
  """
transformer/model.py CHANGED
@@ -10,13 +10,11 @@ from einops import repeat
  from .attention import (
  flash_attention,
  sageattn_func,
- _SAGEATTN_AVAILABLE,
- _FLASH_ATTN_2_AVAILABLE,
- _FLASH_ATTN_3_AVAILABLE,
  )

- print("SAGEATTN_AVAILABLE:", _SAGEATTN_AVAILABLE)
-
  __all__ = ["WanModel"]

@@ -153,7 +151,7 @@ class WanSelfAttention(nn.Module):
  q, k, v = qkv_fn(x)

- if _SAGEATTN_AVAILABLE:
  # print("Using sageattention in crossattn")
  og_dtype = q.dtype
  q = q.transpose(1, 2).to(dtype)
@@ -209,7 +207,7 @@ class WanT2VCrossAttention(WanSelfAttention):
  v = self.v(context).view(b, -1, n, d)

  # compute attention
- if _SAGEATTN_AVAILABLE:
  # print("Using sageattention in crossattn")
  dtype = torch.bfloat16
  og_dtype = q.dtype
@@ -222,7 +220,7 @@ class WanT2VCrossAttention(WanSelfAttention):
  v=v,
  )
  x = x.transpose(1, 2).contiguous().to(og_dtype)
- elif _FLASH_ATTN_2_AVAILABLE or _FLASH_ATTN_3_AVAILABLE:
  x = flash_attention(q, k, v, k_lens=context_lens)
  else:
  dtype = torch.bfloat16

  from .attention import (
  flash_attention,
  sageattn_func,
+ SAGEATTN_AVAILABLE,
+ FLASH_ATTN_2_AVAILABLE,
+ FLASH_ATTN_3_AVAILABLE,
  )

  __all__ = ["WanModel"]

  q, k, v = qkv_fn(x)

+ if SAGEATTN_AVAILABLE:
  # print("Using sageattention in crossattn")
  og_dtype = q.dtype
  q = q.transpose(1, 2).to(dtype)

  v = self.v(context).view(b, -1, n, d)

  # compute attention
+ if SAGEATTN_AVAILABLE:
  # print("Using sageattention in crossattn")
  dtype = torch.bfloat16
  og_dtype = q.dtype

  v=v,
  )
  x = x.transpose(1, 2).contiguous().to(og_dtype)
+ elif FLASH_ATTN_2_AVAILABLE or FLASH_ATTN_3_AVAILABLE:
  x = flash_attention(q, k, v, k_lens=context_lens)
  else:
  dtype = torch.bfloat16