32 changes: 32 additions & 0 deletions docs/source/en/api/models/motif_video_transformer_3d.md
@@ -0,0 +1,32 @@
<!-- Copyright 2026 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->

# MotifVideoTransformer3DModel

A Diffusion Transformer model for 3D video-like data, introduced in Motif-Video by the Motif Technologies team.

The model uses a three-stage architecture with 12 dual-stream + 16 single-stream + 8 DDT decoder layers and rotary positional embeddings (RoPE) for video generation.

The model can be loaded with the following code snippet.

```python
import torch

from diffusers import MotifVideoTransformer3DModel

transformer = MotifVideoTransformer3DModel.from_pretrained(
    "MotifTechnologies/Motif-Video-2B", subfolder="transformer", torch_dtype=torch.bfloat16
)
```

## MotifVideoTransformer3DModel

[[autodoc]] MotifVideoTransformer3DModel

## Transformer2DModelOutput

[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
147 changes: 147 additions & 0 deletions docs/source/en/api/pipelines/motif_video.md
@@ -0,0 +1,147 @@
<!-- Copyright 2026 The HuggingFace Team. All rights reserved. -->

# Motif-Video

[Technical Report](https://arxiv.org/abs/2604.16503)

Motif-Video is a 2B-parameter diffusion transformer for text-to-video and image-to-video generation. It features a three-stage architecture (12 dual-stream, 16 single-stream, and 8 DDT decoder layers), shared cross-attention for stable text-video alignment over long video sequences, a T5Gemma2 text encoder, and rectified flow matching for velocity prediction.

<p align="center">
<img src="https://huggingface.co/MotifTechnologies/Motif-Video-2B/resolve/main/assets/architecture.png" width="90%" alt="Motif-Video architecture"/>
</p>

## Text-to-Video Generation

Use `MotifVideoPipeline` for text-to-video generation:

```python
import torch
from diffusers import AdaptiveProjectedGuidance, MotifVideoPipeline
from diffusers.utils import export_to_video

guider = AdaptiveProjectedGuidance(
guidance_scale=8.0,
adaptive_projected_guidance_rescale=12.0,
adaptive_projected_guidance_momentum=0.1,
use_original_formulation=True,
normalization_dims="spatial",
)

pipe = MotifVideoPipeline.from_pretrained(
"MotifTechnologies/Motif-Video-2B",
torch_dtype=torch.bfloat16,
guider=guider,
)
pipe.to("cuda")

prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

video = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=1280,
height=736,
num_frames=121,
num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```

## Image-to-Video Generation

Use `MotifVideoImage2VideoPipeline` for image-to-video generation:

```python
import torch
from diffusers import AdaptiveProjectedGuidance, MotifVideoImage2VideoPipeline
from diffusers.utils import export_to_video, load_image

guider = AdaptiveProjectedGuidance(
guidance_scale=8.0,
adaptive_projected_guidance_rescale=12.0,
adaptive_projected_guidance_momentum=0.1,
use_original_formulation=True,
normalization_dims="spatial",
)

pipe = MotifVideoImage2VideoPipeline.from_pretrained(
"MotifTechnologies/Motif-Video-2B",
torch_dtype=torch.bfloat16,
guider=guider,
)
pipe.to("cuda")

image = load_image("input_image.png")
prompt = "A cinematic scene with vivid colors."
negative_prompt = "worst quality, blurry, jittery, distorted"

video = pipe(
image=image,
prompt=prompt,
negative_prompt=negative_prompt,
width=1280,
height=736,
num_frames=121,
num_inference_steps=50,
).frames[0]
export_to_video(video, "i2v_output.mp4", fps=24)
```

### Memory-efficient Inference

For GPUs with less than 30GB VRAM (e.g., RTX 4090), use model CPU offloading:

```bash
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
```

```python
import torch
from diffusers import AdaptiveProjectedGuidance, MotifVideoPipeline
from diffusers.utils import export_to_video

guider = AdaptiveProjectedGuidance(
guidance_scale=8.0,
adaptive_projected_guidance_rescale=12.0,
adaptive_projected_guidance_momentum=0.1,
use_original_formulation=True,
normalization_dims="spatial",
)

pipe = MotifVideoPipeline.from_pretrained(
"MotifTechnologies/Motif-Video-2B",
torch_dtype=torch.bfloat16,
guider=guider,
)
pipe.enable_model_cpu_offload()

prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair."
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"

video = pipe(
prompt=prompt,
negative_prompt=negative_prompt,
width=1280,
height=736,
num_frames=121,
num_inference_steps=50,
).frames[0]
export_to_video(video, "output.mp4", fps=24)
```

## MotifVideoPipeline

[[autodoc]] MotifVideoPipeline
- all
- __call__

## MotifVideoImage2VideoPipeline

[[autodoc]] MotifVideoImage2VideoPipeline
- all
- __call__

## MotifVideoPipelineOutput

[[autodoc]] pipelines.motif_video.pipeline_output.MotifVideoPipelineOutput
8 changes: 8 additions & 0 deletions src/diffusers/__init__.py
@@ -263,6 +263,7 @@
"LuminaNextDiT2DModel",
"MochiTransformer3DModel",
"ModelMixin",
"MotifVideoTransformer3DModel",
"MotionAdapter",
"MultiAdapter",
"MultiControlNetModel",
@@ -625,6 +626,9 @@
"MarigoldIntrinsicsPipeline",
"MarigoldNormalsPipeline",
"MochiPipeline",
"MotifVideoImage2VideoPipeline",
"MotifVideoPipeline",
"MotifVideoPipelineOutput",
"MusicLDMPipeline",
"NucleusMoEImagePipeline",
"OmniGenPipeline",
@@ -1073,6 +1077,7 @@
LuminaNextDiT2DModel,
MochiTransformer3DModel,
ModelMixin,
MotifVideoTransformer3DModel,
MotionAdapter,
MultiAdapter,
MultiControlNetModel,
@@ -1410,6 +1415,9 @@
MarigoldIntrinsicsPipeline,
MarigoldNormalsPipeline,
MochiPipeline,
MotifVideoImage2VideoPipeline,
MotifVideoPipeline,
MotifVideoPipelineOutput,
MusicLDMPipeline,
NucleusMoEImagePipeline,
OmniGenPipeline,
27 changes: 26 additions & 1 deletion src/diffusers/guiders/adaptive_projected_guidance.py
@@ -48,6 +48,12 @@ class AdaptiveProjectedGuidance(BaseGuidance):
Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
we use the diffusers-native implementation that has been in the codebase for a long time. See
[~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
normalization_dims (`str` or `list[int]` or `None`, defaults to `None`):
Dimensions to normalize over for the guidance computation. Can be:
- `None` (default): Normalize over all non-batch dimensions (e.g., [C, H, W] for 4D, [C, T, H, W] for 5D)
- `"spatial"`: Spatial-only normalization - normalize over [C, H, W] per frame for 5D tensors, standard for
4D
- `list[int]`: Custom dimensions to normalize over (e.g., `[-1, -2, -4]` for [W, H, C])
start (`float`, defaults to `0.0`):
The fraction of the total number of denoising steps after which guidance starts.
stop (`float`, defaults to `1.0`):
@@ -65,6 +71,7 @@ def __init__(
eta: float = 1.0,
guidance_rescale: float = 0.0,
use_original_formulation: bool = False,
normalization_dims: str | list[int] | None = None,
start: float = 0.0,
stop: float = 1.0,
enabled: bool = True,
@@ -77,6 +84,7 @@
self.eta = eta
self.guidance_rescale = guidance_rescale
self.use_original_formulation = use_original_formulation
self.normalization_dims = normalization_dims
self.momentum_buffer = None

def prepare_inputs(self, data: dict[str, tuple[torch.Tensor, torch.Tensor]]) -> list["BlockState"]:
Expand Down Expand Up @@ -117,6 +125,7 @@ def forward(self, pred_cond: torch.Tensor, pred_uncond: torch.Tensor | None = No
self.eta,
self.adaptive_projected_guidance_rescale,
self.use_original_formulation,
self.normalization_dims,
)

if self.guidance_rescale > 0.0:
@@ -210,9 +219,25 @@ def normalized_guidance(
eta: float = 1.0,
norm_threshold: float = 0.0,
use_original_formulation: bool = False,
normalization_dims: str | list[int] | None = None,
):
diff = pred_cond - pred_uncond

# Determine normalization dimensions
if normalization_dims == "spatial":
# Spatial-only normalization: normalize over [C, H, W] per frame for 5D tensors
if len(diff.shape) == 5:
# [B, C, T, H, W] -> normalize over W(-1), H(-2), C(-4), skip T(-3)
dim = [-1, -2, -4]
else:
# [B, C, H, W] -> standard behavior
dim = [-i for i in range(1, len(diff.shape))]
elif normalization_dims is None:
# Default: normalize over all non-batch dimensions
dim = [-i for i in range(1, len(diff.shape))]
else:
# Custom dimensions provided by user
dim = normalization_dims

if momentum_buffer is not None:
momentum_buffer.update(diff)
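The dimension-selection logic in `normalized_guidance` can be sketched as a standalone, framework-free helper. `resolve_norm_dims` is a hypothetical name used only for illustration; the actual implementation inlines this branching as shown in the hunk above.

```python
def resolve_norm_dims(ndim, normalization_dims=None):
    """Pick the dimensions to normalize over, mirroring the guider logic.

    Hypothetical helper for illustration; the real code inlines this.
    """
    if normalization_dims == "spatial":
        if ndim == 5:
            # [B, C, T, H, W] -> W(-1), H(-2), C(-4); the frame axis T(-3) is kept
            return [-1, -2, -4]
        # 4D [B, C, H, W] -> standard behavior: all non-batch dimensions
        return [-i for i in range(1, ndim)]
    if normalization_dims is None:
        # default: all non-batch dimensions
        return [-i for i in range(1, ndim)]
    # custom dimensions provided by the user
    return list(normalization_dims)


# A 5D video latent is normalized per frame under "spatial";
# a 4D image latent falls back to all non-batch dimensions.
print(resolve_norm_dims(5, "spatial"))  # [-1, -2, -4]
print(resolve_norm_dims(4, "spatial"))  # [-1, -2, -3]
print(resolve_norm_dims(5, None))       # [-1, -2, -3, -4]
```

Keeping the frame axis out of the reduction means each frame of a video latent gets its own norm, which is what stabilizes the projected-guidance rescaling across long sequences.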
26 changes: 25 additions & 1 deletion src/diffusers/guiders/adaptive_projected_guidance_mix.py
@@ -48,6 +48,12 @@ class AdaptiveProjectedMixGuidance(BaseGuidance):
Whether to use the original formulation of classifier-free guidance as proposed in the paper. By default,
we use the diffusers-native implementation that has been in the codebase for a long time. See
[~guiders.classifier_free_guidance.ClassifierFreeGuidance] for more details.
normalization_dims (`str` or `list[int]` or `None`, defaults to `None`):
Dimensions to normalize over for the guidance computation. Can be:
- `None` (default): Normalize over all non-batch dimensions (e.g., [C, H, W] for 4D, [C, T, H, W] for 5D)
- `"spatial"`: Spatial-only normalization - normalize over [C, H, W] per frame for 5D tensors, standard for
4D
- `list[int]`: Custom dimensions to normalize over (e.g., `[-1, -2, -4]` for [W, H, C])
start (`float`, defaults to `0.0`):
The fraction of the total number of denoising steps after which the classifier-free guidance starts.
stop (`float`, defaults to `1.0`):
@@ -71,6 +77,7 @@ def __init__(
adaptive_projected_guidance_rescale: float = 10.0,
eta: float = 0.0,
use_original_formulation: bool = False,
normalization_dims: str | list[int] | None = None,
start: float = 0.0,
stop: float = 1.0,
adaptive_projected_guidance_start_step: int = 5,
@@ -86,6 +93,7 @@
self.eta = eta
self.adaptive_projected_guidance_start_step = adaptive_projected_guidance_start_step
self.use_original_formulation = use_original_formulation
self.normalization_dims = normalization_dims
self.momentum_buffer = None

def prepare_inputs(self, data: dict[str, tuple[torch.Tensor, torch.Tensor]]) -> list["BlockState"]:
@@ -138,6 +146,7 @@ def forward(self, pred_cond: torch.Tensor, pred_uncond: torch.Tensor | None = No
self.eta,
self.adaptive_projected_guidance_rescale,
self.use_original_formulation,
self.normalization_dims,
)

if self.guidance_rescale > 0.0:
@@ -269,14 +278,29 @@ def normalized_guidance(
eta: float = 1.0,
norm_threshold: float = 0.0,
use_original_formulation: bool = False,
normalization_dims: str | list[int] | None = None,
):
if momentum_buffer is not None:
update_momentum_buffer(pred_cond, pred_uncond, momentum_buffer)
diff = momentum_buffer.running_average
else:
diff = pred_cond - pred_uncond

# Determine normalization dimensions
if normalization_dims == "spatial":
# Spatial-only normalization: normalize over [C, H, W] per frame for 5D tensors
if len(diff.shape) == 5:
# [B, C, T, H, W] -> normalize over W(-1), H(-2), C(-4), skip T(-3)
dim = [-1, -2, -4]
else:
# [B, C, H, W] -> standard behavior
dim = [-i for i in range(1, len(diff.shape))]
elif normalization_dims is None:
# Default: normalize over all non-batch dimensions
dim = [-i for i in range(1, len(diff.shape))]
else:
# Custom dimensions provided by user
dim = normalization_dims

if norm_threshold > 0:
ones = torch.ones_like(diff)