Motif-Technologies
/

Motif-Video-2B

@@ -1,210 +0,0 @@
-#!/usr/bin/env python3
-"""Motif-Video 2B — Text-to-Video inference.
-GPU requirements: ~24GB VRAM for 720p (1280x736, 121 frames).
-Requires: torch, diffusers (with MotifVideoPipeline), transformers>=5.5.4,
-           accelerate, ftfy, einops, sentencepiece, regex
-Uses Adaptive Projected Guidance (APG) and DPMSolver++ scheduler by default.
-"""
-import argparse
-import torch
-from diffusers import (
-    AdaptiveProjectedGuidance,
-    DPMSolverMultistepScheduler,
-    MotifVideoPipeline,
-)
-from diffusers.utils import export_to_video
-# Default value for --negative-prompt: a comma-separated list of artifacts
-# (overlays, motion glitches, lighting/colour problems) to steer away from.
-_DEFAULT_NEGATIVE_PROMPT = ", ".join(
-    (
-        "text overlay",
-        "graphic overlay",
-        "watermark",
-        "logo",
-        "subtitles",
-        "timestamp",
-        "broadcast graphics",
-        "UI elements",
-        "random letters",
-        "frozen pose",
-        "rigid",
-        "static expression",
-        "jerky motion",
-        "mechanical motion",
-        "discontinuous motion",
-        "flat framing",
-        "depthless",
-        "dull lighting",
-        "monotone",
-        "crushed shadows",
-        "blown-out highlights",
-        "shifting background",
-        "fading background",
-        "poor continuity",
-        "identity drift",
-        "deformation",
-        "flickering",
-        "ghosting",
-        "smearing",
-        "duplication",
-        "mutated proportions",
-        "inconsistent clothing",
-        "flat colors",
-        "desaturated",
-        "tonally compressed",
-        "poor background separation",
-        "exposure shift",
-        "uneven brightness",
-        "color balance shift",
-    )
-)
-def parse_args():
-    """Define the command-line options and parse them from ``sys.argv``.
-    Returns:
-        argparse.Namespace with attributes model_path, prompt, negative_prompt,
-        output, num_frames, height, width, guidance_scale, num_inference_steps,
-        fps, seed, dtype and use_sage_attention.
-    """
-    default_prompt = "A category-five hurricane, viewed from inside the eye, reveals a circular stadium of cloud walls rising to fifty thousand feet with an eerie disk of blue sky directly overhead. Shot from a NOAA reconnaissance aircraft mounted camera, the perspective looks outward toward the eyewall — a near-vertical curtain of rotating cloud and lightning that is simultaneously terrifying and transcendent. The inner surface of the eyewall catches the setting sun, painting it in improbable shades of peach and rose. The camera slowly pans 360 degrees to complete one full revolution, capturing the entire coliseum of the storm. Below, the ocean surface is a white blur of foam and spray. The documentary-style cinematography strips away all artifice to present the storm as an entity of pure elemental power."
-    # (flag, add_argument keyword arguments), registered in this order so that
-    # the --help listing keeps its layout.
-    option_table = (
-        (
-            "--model-path",
-            {
-                "type": str,
-                "default": "Motif-Technologies/Motif-Video-2B",
-                "help": "HuggingFace model ID or local checkpoint path",
-            },
-        ),
-        ("--prompt", {"type": str, "default": default_prompt, "help": "Text prompt for video generation"}),
-        ("--negative-prompt", {"type": str, "default": _DEFAULT_NEGATIVE_PROMPT, "help": "Negative prompt"}),
-        ("--output", {"type": str, "default": "output.mp4", "help": "Output video file path"}),
-        (
-            "--num-frames",
-            {"type": int, "default": 121, "help": "Number of frames to generate (121 = ~5s at 24fps)"},
-        ),
-        ("--height", {"type": int, "default": 736, "help": "Video height in pixels"}),
-        ("--width", {"type": int, "default": 1280, "help": "Video width in pixels"}),
-        ("--guidance-scale", {"type": float, "default": 8.0, "help": "Classifier-free guidance scale"}),
-        ("--num-inference-steps", {"type": int, "default": 50, "help": "Number of denoising steps"}),
-        ("--fps", {"type": int, "default": 24, "help": "Output video frame rate"}),
-        ("--seed", {"type": int, "default": 42, "help": "Random seed for reproducibility"}),
-        (
-            "--dtype",
-            {
-                "type": str,
-                "default": "bfloat16",
-                "choices": ["float16", "bfloat16", "float32"],
-                "help": "Model dtype",
-            },
-        ),
-        (
-            "--use-sage-attention",
-            {
-                "action": "store_true",
-                "help": "Enable SageAttention for ~2x faster attention (requires: pip install sageattention>=2.1.1 from GitHub source)",
-            },
-        ),
-    )
-    cli = argparse.ArgumentParser(description="Motif-Video 2B Inference (T2V)")
-    for flag, spec in option_table:
-        cli.add_argument(flag, **spec)
-    return cli.parse_args()
-def _enable_sage_attention(transformer):
-    """Patch transformer attention to use SageAttention.
-    Replaces ``MotifVideoAttnProcessor2_0._compute_attention`` on the class
-    itself (so every processor instance is affected) and then converts
-    ``transformer`` to the ``channels_last_3d`` memory format.
-    Per the original author's note, only the self-attention path goes through
-    _compute_attention; cross-attention uses _handle_cross_attention_mode and
-    is unaffected by this patch (that code is not visible from this script).
-    Dispatch inside the patched method:
-    - inputs that are not float16/bfloat16: original implementation
-      (SageAttention kernels only accept half-precision inputs)
-    - mask=None: sage directly
-    - mask with uniform active length: slice active region -> sage -> pad back
-    - mask with non-uniform active length: original implementation
-    Args:
-        transformer: the pipeline's transformer module.
-    Raises:
-        ImportError: if the ``sageattention`` package is not installed.
-    """
-    try:
-        from sageattention import sageattn
-    except ImportError as err:
-        # Same exception type as before, but tell the user how to fix it.
-        raise ImportError(
-            "--use-sage-attention requires the 'sageattention' package "
-            "(pip install sageattention>=2.1.1 from GitHub source)"
-        ) from err
-    from diffusers.models.transformers.transformer_motif_video import MotifVideoAttnProcessor2_0
-    _orig_compute = MotifVideoAttnProcessor2_0._compute_attention
-    # sageattn rejects anything other than half-precision inputs, so e.g.
-    # --dtype float32 must keep using the original attention path.
-    _sage_dtypes = (torch.float16, torch.bfloat16)
-    _fallback_notified = False
-    def _sage(q, k, v):
-        # Non-causal attention on [batch, heads, seq, head_dim] ("HND") tensors.
-        return sageattn(
-            q.contiguous(), k.contiguous(), v.contiguous(),
-            tensor_layout="HND", is_causal=False,
-        )
-    def _to_output(out, dtype):
-        # [B, H, S, D] -> [B, S, H*D], cast back to the query dtype.
-        return out.transpose(1, 2).flatten(2, 3).to(dtype)
-    def _sage_compute(self, query, key, value, attention_mask):
-        nonlocal _fallback_notified
-        if query.dtype not in _sage_dtypes:
-            if not _fallback_notified:
-                print(f"[SageAttention] {query.dtype} is not supported; using the original attention path")
-                _fallback_notified = True
-            return _orig_compute(self, query, key, value, attention_mask)
-        if attention_mask is None:
-            return _to_output(_sage(query, key, value), query.dtype)
-        # Count active tokens per batch row. Assumes the mask has shape
-        # [B, 1, 1, S] with truthy/1 entries for active tokens -- TODO confirm
-        # against the processor that builds it.
-        padding_indices = attention_mask.sum(dim=-1).long().flatten()
-        common_padding_index = padding_indices[0]
-        if not (padding_indices == common_padding_index).all():
-            # Rows have different active lengths: let the original code apply the mask.
-            return _orig_compute(self, query, key, value, attention_mask)
-        active_len = common_padding_index.item()
-        if active_len == query.shape[2]:
-            # Nothing is masked out, so the mask can be ignored.
-            return _to_output(_sage(query, key, value), query.dtype)
-        # Slice to active region, run sage, pad back with zeros. Assumes the
-        # active tokens are the first `active_len` positions -- TODO confirm.
-        out_a = _sage(
-            query[:, :, :active_len, :],
-            key[:, :, :active_len, :],
-            value[:, :, :active_len, :],
-        )
-        out = query.new_zeros(query.shape)
-        out[:, :, :active_len, :] = out_a
-        return _to_output(out, query.dtype)
-    MotifVideoAttnProcessor2_0._compute_attention = _sage_compute
-    transformer.to(memory_format=torch.channels_last_3d)
-    print("[SageAttention] Enabled (patched _compute_attention + channels_last_3d)")
-def main():
-    """Run text-to-video generation end to end from the command line.
-    Parses the CLI options, loads ``MotifVideoPipeline`` with an
-    ``AdaptiveProjectedGuidance`` guider, swaps in a DPMSolver++ scheduler,
-    enables model CPU offload (and optionally SageAttention), generates the
-    frames and writes them to ``args.output``.
-    """
-    args = parse_args()
-    torch_dtype = {
-        "float16": torch.float16,
-        "bfloat16": torch.bfloat16,
-        "float32": torch.float32,
-    }[args.dtype]
-    print(f"[T2V] Loading model from: {args.model_path}")
-    pipe = MotifVideoPipeline.from_pretrained(
-        args.model_path,
-        torch_dtype=torch_dtype,
-        guider=AdaptiveProjectedGuidance(
-            guidance_scale=args.guidance_scale,
-            adaptive_projected_guidance_rescale=12.0,
-            adaptive_projected_guidance_momentum=0.1,
-            use_original_formulation=True,
-            normalization_dims="spatial",
-        ),
-    )
-    # DPMSolver++ replaces the pipeline's stock scheduler (chosen by the
-    # author for faster convergence and better quality). The subclass drops
-    # the sigmas/mu the pipeline passes in, so the scheduler builds its own
-    # flow-matching sigma schedule; sigmas only supply the step count when
-    # none was given.
-    class _FlowDPMSolver(DPMSolverMultistepScheduler):
-        def set_timesteps(self, num_inference_steps=None, device=None,
-                          sigmas=None, mu=None, timesteps=None):
-            step_count = num_inference_steps
-            if step_count is None and sigmas is not None:
-                step_count = len(sigmas)
-            super().set_timesteps(
-                num_inference_steps=step_count,
-                device=device,
-                timesteps=timesteps,
-            )
-    train_steps = pipe.scheduler.config.get("num_train_timesteps", 1000)
-    pipe.scheduler = _FlowDPMSolver(
-        num_train_timesteps=train_steps,
-        algorithm_type="dpmsolver++",
-        solver_order=2,
-        prediction_type="flow_prediction",
-        use_flow_sigmas=True,
-        flow_shift=15.0,
-    )
-    # Keep components on the CPU between uses to lower peak VRAM.
-    pipe.enable_model_cpu_offload()
-    if args.use_sage_attention:
-        _enable_sage_attention(pipe.transformer)
-    generator = torch.Generator(device="cuda")
-    generator.manual_seed(args.seed)
-    print(f"Generating video: {args.width}x{args.height}, {args.num_frames} frames, {args.num_inference_steps} steps")
-    call_kwargs = {
-        "prompt": args.prompt,
-        "negative_prompt": args.negative_prompt,
-        "height": args.height,
-        "width": args.width,
-        "num_frames": args.num_frames,
-        "num_inference_steps": args.num_inference_steps,
-        "frame_rate": args.fps,
-        "use_linear_quadratic_schedule": False,
-        "generator": generator,
-    }
-    result = pipe(**call_kwargs)
-    # Only the first entry of `frames` is exported.
-    export_to_video(result.frames[0], args.output, fps=args.fps)
-    print(f"Video saved to: {args.output}")
-# Script entry point: run inference only when executed directly, not on import.
-if __name__ == "__main__":
-    main()