Ani14 committed
Commit 95315db · verified
1 Parent(s): f3c7daa

Upload 6 files

Files changed (6)
  1. app.py +302 -0
  2. config.py +51 -0
  3. model_handler.py +209 -0
  4. planning.py +60 -0
  5. requirements.txt +19 -0
  6. utils.py +102 -0
app.py ADDED
@@ -0,0 +1,302 @@
"""
Gradio interface for WAN-VACE video generation
"""
import gradio as gr
import torch
import time
from typing import Optional

# Import the simple planner
from planning import plan_from_topic

from config import UI_CONFIG, DEFAULT_PARAMS, SERVER_CONFIG
from model_handler import model_handler
from utils import cleanup_temp_files

def load_model_interface(progress=gr.Progress()):
    """Interface function for loading the model"""
    def progress_callback(value, message):
        progress(value, desc=message)

    success, message = model_handler.load_model(progress_callback)

    if success:
        return (
            gr.update(visible=False),                # Hide load button
            gr.update(visible=True),                 # Show generation interface
            gr.update(value=message, visible=True),  # Show success message
            gr.update(visible=False)                 # Hide error message
        )
    else:
        return (
            gr.update(visible=True),                 # Keep load button visible
            gr.update(visible=False),                # Keep generation interface hidden
            gr.update(visible=False),                # Hide success message
            gr.update(value=message, visible=True)   # Show error message
        )

def generate_video_interface(
    prompt: str,
    negative_prompt: str,
    width: int,
    height: int,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float,
    seed: Optional[int],
    progress=gr.Progress()
):
    """Interface function for video generation"""

    def progress_callback(value, message):
        progress(value, desc=message)

    # Plan the prompt: treat the user input as a high-level concept and let the
    # agent craft a refined prompt and recommended negative prompt. If the user
    # supplies a negative prompt, it overrides the recommended negative prompt.
    plan = plan_from_topic(prompt)
    # Use the refined prompt from the plan
    effective_prompt = plan.prompt
    # If the user provided a negative prompt, use it; otherwise use the recommended one
    effective_negative = negative_prompt.strip() if negative_prompt and negative_prompt.strip() else plan.negative_prompt

    success, video_path, error_msg, gen_info = model_handler.generate_video(
        prompt=effective_prompt,
        negative_prompt=effective_negative,
        width=width,
        height=height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        seed=seed,
        progress_callback=progress_callback
    )

    if success:
        return (
            gr.update(value=video_path, visible=True),  # Video output
            gr.update(value=gen_info, visible=True),    # Generation info
            gr.update(visible=False)                    # Hide error message
        )
    else:
        return (
            gr.update(value=None, visible=False),       # Hide video output
            gr.update(visible=False),                   # Hide generation info
            gr.update(value=error_msg, visible=True)    # Show error message
        )

def create_interface():
    """Create the Gradio interface"""

    with gr.Blocks(
        title=UI_CONFIG["title"],
        theme=UI_CONFIG["theme"]
    ) as demo:

        # Header
        gr.Markdown(f"# {UI_CONFIG['title']}")
        gr.Markdown(UI_CONFIG["description"])

        # Model loading section
        with gr.Row():
            with gr.Column():
                load_btn = gr.Button(
                    "🚀 Load Video Generation Model",
                    variant="primary",
                    size="lg"
                )
                load_success_msg = gr.Markdown(visible=False)
                load_error_msg = gr.Markdown(visible=False)

        # Main generation interface (initially hidden)
        with gr.Column(visible=False) as generation_interface:

            # Input section
            with gr.Row():
                with gr.Column(scale=2):
                    with gr.Group():
                        gr.Markdown("### 📝 Concept & Prompts")
                        # The user supplies a high-level concept or topic. The agent will
                        # refine this into a detailed prompt automatically.
                        prompt_input = gr.Textbox(
                            label="Video Concept",
                            placeholder="Describe the concept you want to generate, e.g. 'a pig in a winter forest'...",
                            lines=3,
                            value="a pig moving quickly in a beautiful winter scenery nature trees sunset tracking camera"
                        )
                        # Optional negative prompt: overrides the agent's recommended negative prompt.
                        negative_prompt_input = gr.Textbox(
                            label="Negative Prompt (Optional)",
                            placeholder="Things you don't want in the video; leave empty to use the agent's recommendation...",
                            lines=2,
                            value=""
                        )

                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### ⚙️ Generation Parameters")

                        with gr.Row():
                            width_slider = gr.Slider(
                                label="Width",
                                minimum=64,
                                maximum=1920,
                                step=8,
                                value=DEFAULT_PARAMS["width"]
                            )
                            height_slider = gr.Slider(
                                label="Height",
                                minimum=64,
                                maximum=1080,
                                step=8,
                                value=DEFAULT_PARAMS["height"]
                            )

                        num_frames_slider = gr.Slider(
                            label="Number of Frames",
                            minimum=1,
                            maximum=200,
                            step=1,
                            value=DEFAULT_PARAMS["num_frames"]
                        )

                        inference_steps_slider = gr.Slider(
                            label="Inference Steps",
                            minimum=1,
                            maximum=100,
                            step=1,
                            value=DEFAULT_PARAMS["num_inference_steps"]
                        )

                        guidance_scale_slider = gr.Slider(
                            label="Guidance Scale",
                            minimum=0.0,
                            maximum=20.0,
                            step=0.1,
                            value=DEFAULT_PARAMS["guidance_scale"]
                        )

                        seed_input = gr.Number(
                            label="Seed (Optional)",
                            value=0,
                            precision=0
                        )

            # Generation button
            with gr.Row():
                generate_btn = gr.Button(
                    "🎬 Generate Video",
                    variant="primary",
                    size="lg"
                )

            # Output section
            with gr.Row():
                with gr.Column():
                    video_output = gr.Video(
                        label="Generated Video",
                        visible=False
                    )

                    generation_info = gr.Markdown(
                        label="Generation Information",
                        visible=False
                    )

                    generation_error = gr.Markdown(
                        visible=False
                    )

            # Additional controls
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
### 💡 Tips:
- Enter a short **concept** (e.g. “a busy city street at dawn”). The agent will expand it into a detailed prompt.
- Adjust the **guidance scale**: higher values make the video adhere more closely to the refined prompt.
- Increasing **inference steps** improves quality at the cost of generation time.
- Use the optional **Negative Prompt** field only if you want to override the agent's recommended terms.
- Keep width and height multiples of 8 for optimal performance.
""")

                with gr.Column():
                    if torch.cuda.is_available():
                        gpu_info = f"🎮 GPU: {torch.cuda.get_device_name()}"
                    else:
                        gpu_info = "💻 Running on CPU"

                    gr.Markdown(f"""
### 🖥️ System Information:
{gpu_info}

### 📊 Model Information:
- **Model:** WAN-VACE 1.3B (Q4_0 Quantized)
- **Text Encoder:** UMT5-XXL
- **Scheduler:** UniPC Multistep

### 🤖 Agent Details:
- **Planning:** The agent automatically crafts a detailed prompt and a recommended negative prompt based on your concept.
- **Override:** Supply your own negative prompt to override the recommendation if desired.
""")

        # Event handlers
        load_btn.click(
            fn=load_model_interface,
            outputs=[
                load_btn,
                generation_interface,
                load_success_msg,
                load_error_msg
            ]
        )

        generate_btn.click(
            fn=generate_video_interface,
            inputs=[
                prompt_input,
                negative_prompt_input,
                width_slider,
                height_slider,
                num_frames_slider,
                inference_steps_slider,
                guidance_scale_slider,
                seed_input
            ],
            outputs=[
                video_output,
                generation_info,
                generation_error
            ]
        )

    return demo

def main():
    """Main function to launch the application"""
    print(f"🚀 Starting {UI_CONFIG['title']}...")
    print(f"🔧 Server configuration: {SERVER_CONFIG['host']}:{SERVER_CONFIG['port']}")

    # Check GPU availability
    if torch.cuda.is_available():
        print(f"🎮 GPU detected: {torch.cuda.get_device_name()}")
        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
    else:
        print("💻 Running on CPU (GPU recommended for better performance)")

    # Create interface and enable the event queue to support multiple users.
    demo = create_interface()
    # Hugging Face Spaces expect `.queue()` to be called for handling request concurrency.
    # Limiting the concurrency to 1 helps prevent excessive memory usage on CPU-only hardware.
    # (Gradio 4.x uses `default_concurrency_limit`; the older `concurrency_count` argument was removed.)
    demo = demo.queue(default_concurrency_limit=1)

    # Launch the interface. (`show_tips` was dropped in Gradio 4.x, so it is not passed here.)
    demo.launch(
        server_name=SERVER_CONFIG["host"],
        server_port=SERVER_CONFIG["port"],
        share=SERVER_CONFIG["share"],
        show_error=True
    )

if __name__ == "__main__":
    main()
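
The event wiring above can be exercised without downloading any weights by stubbing the handler. The snippet below is a hypothetical local smoke test, not part of the commit: the stub, the fake file path, and the monkeypatching are assumptions used purely to illustrate how `generate_video_interface` routes the planned prompt into `model_handler` and maps a successful result onto the three output components.

# smoke_test_app.py - illustrative only, not part of the upload
from unittest import mock

import app  # the module defined in app.py above

def fake_generate_video(prompt, negative_prompt, **kwargs):
    # Pretend generation succeeded and echo back the planner-refined prompt.
    return True, "/tmp/fake.mp4", "", f"prompt used: {prompt}"

with mock.patch.object(app.model_handler, "generate_video", side_effect=fake_generate_video):
    video_upd, info_upd, err_upd = app.generate_video_interface(
        prompt="a pig in a winter forest",
        negative_prompt="",
        width=720, height=480,
        num_frames=57, num_inference_steps=24,
        guidance_scale=2.5, seed=0,
    )
    # video_upd should carry the fake path, info_upd the refined prompt, err_upd stays hidden.
    print(video_upd, info_upd, err_upd)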
config.py ADDED
@@ -0,0 +1,51 @@
"""
Configuration file for WAN-VACE video generation application
"""
import os

# Hugging Face token (must be set as environment variable)
HF_TOKEN = os.getenv("HF_TOKEN")

# Model paths and configurations
MODEL_CONFIG = {
    "transformer_path": "https://huggingface.co/calcuis/wan-gguf/blob/main/wan2.1-v5-vace-1.3b-q4_0.gguf",
    "text_encoder_path": "chatpig/umt5xxl-encoder-gguf",
    "text_encoder_file": "umt5xxl-encoder-q4_0.gguf",
    "vae_path": "callgg/wan-decoder",
    "pipeline_path": "callgg/wan-decoder"
}

# Default generation parameters
DEFAULT_PARAMS = {
    "width": 720,
    "height": 480,
    "num_frames": 57,
    "num_inference_steps": 24,
    "guidance_scale": 2.5,
    "conditioning_scale": 0.0,
    "fps": 16,
    "flow_shift": 3.0
}

# UI configuration
#
# The title and description here emphasise the agentic nature of the app:
# you provide a concept and the system plans the prompts for you. Feel free
# to adjust these strings to suit your needs or branding.
UI_CONFIG = {
    "title": "🎬 Agentic WAN-VACE Video Generation",
    "description": (
        "Generate high-quality videos from simple concepts. "
        "Provide a short description of what you want to see, and the agent "
        "will craft a refined prompt and negative prompt before generating a cinematic "
        "vertical video using the WAN-VACE model."
    ),
    "theme": "default"
}

# Server configuration
SERVER_CONFIG = {
    "host": "0.0.0.0",
    "port": 5000,
    "share": False
}
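
Note that the host and port are hard-coded. If the hosting platform dictates the port (Gradio itself honours `GRADIO_SERVER_PORT`, and many platforms export `PORT`), an optional variation is to let the environment override the block. The variable names below are assumptions about the target environment, not something the commit relies on:

# Optional variation of SERVER_CONFIG: read overrides from the environment.
SERVER_CONFIG = {
    "host": os.getenv("SERVER_HOST", "0.0.0.0"),
    # Fall back to 5000 when the platform does not provide a port.
    "port": int(os.getenv("GRADIO_SERVER_PORT", os.getenv("PORT", "5000"))),
    "share": os.getenv("GRADIO_SHARE", "false").lower() == "true",
}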
model_handler.py ADDED
@@ -0,0 +1,209 @@
"""
Model handler for WAN-VACE video generation
"""
import torch
import time
from typing import Optional, Tuple, Any
from transformers import UMT5EncoderModel
from diffusers import AutoencoderKLWan, WanVACEPipeline, WanVACETransformer3DModel, GGUFQuantizationConfig
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
from huggingface_hub import login
import gradio as gr

from config import MODEL_CONFIG, DEFAULT_PARAMS, HF_TOKEN
import os
from utils import create_temp_video_path, validate_generation_params, validate_prompt, format_generation_info

class WanVACEModelHandler:
    """Handler for WAN-VACE model loading and video generation"""

    def __init__(self):
        self.pipe = None
        self.is_loaded = False
        self.loading_progress = 0

    def login_hf(self) -> bool:
        """Login to Hugging Face"""
        try:
            login(token=HF_TOKEN)
            return True
        except Exception as e:
            print(f"Warning: Could not login to Hugging Face: {e}")
            return False

    def load_model(self, progress_callback=None) -> Tuple[bool, str]:
        """Load the WAN-VACE model components"""
        try:
            # Login to HF
            self.login_hf()

            if progress_callback:
                progress_callback(0.1, "Loading transformer model...")

            # Determine the desired dtype for CPU/GPU execution.
            # Hugging Face Spaces often run on CPU, where bfloat16 may not be supported.
            # Allow the dtype to be configured via the WAN_DTYPE environment variable.
            # Supported values: "bfloat16" (default) or "float32".
            dtype_str = os.getenv("WAN_DTYPE", "bfloat16").lower()
            # Use bfloat16 only if requested; otherwise fall back to float32.
            compute_dtype = torch.bfloat16 if dtype_str == "bfloat16" else torch.float32
            # Likewise for the torch dtype used when loading weights.
            torch_dtype = compute_dtype

            # Load transformer
            transformer = WanVACETransformer3DModel.from_single_file(
                MODEL_CONFIG["transformer_path"],
                quantization_config=GGUFQuantizationConfig(compute_dtype=compute_dtype),
                torch_dtype=torch_dtype,
            )

            if progress_callback:
                progress_callback(0.4, "Loading text encoder...")

            # Load text encoder
            text_encoder = UMT5EncoderModel.from_pretrained(
                MODEL_CONFIG["text_encoder_path"],
                gguf_file=MODEL_CONFIG["text_encoder_file"],
                torch_dtype=torch_dtype,
            )

            if progress_callback:
                progress_callback(0.7, "Loading VAE...")

            # Load VAE
            vae = AutoencoderKLWan.from_pretrained(
                MODEL_CONFIG["vae_path"],
                subfolder="vae",
                torch_dtype=torch.float32
            )

            if progress_callback:
                progress_callback(0.9, "Assembling pipeline...")

            # Create pipeline
            self.pipe = WanVACEPipeline.from_pretrained(
                MODEL_CONFIG["pipeline_path"],
                transformer=transformer,
                text_encoder=text_encoder,
                vae=vae,
                torch_dtype=torch_dtype
            )

            # Configure scheduler
            flow_shift = DEFAULT_PARAMS["flow_shift"]
            self.pipe.scheduler = UniPCMultistepScheduler.from_config(
                self.pipe.scheduler.config,
                flow_shift=flow_shift
            )

            # Enable optimizations
            self.pipe.enable_model_cpu_offload()
            self.pipe.vae.enable_tiling()

            self.is_loaded = True

            if progress_callback:
                progress_callback(1.0, "Model loaded successfully!")

            return True, "Model loaded successfully!"

        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            if progress_callback:
                progress_callback(0, error_msg)
            return False, error_msg

    def generate_video(
        self,
        prompt: str,
        negative_prompt: str = "",
        width: int = DEFAULT_PARAMS["width"],
        height: int = DEFAULT_PARAMS["height"],
        num_frames: int = DEFAULT_PARAMS["num_frames"],
        num_inference_steps: int = DEFAULT_PARAMS["num_inference_steps"],
        guidance_scale: float = DEFAULT_PARAMS["guidance_scale"],
        seed: Optional[int] = None,
        progress_callback=None
    ) -> Tuple[bool, str, str, str]:
        """
        Generate video from text prompt
        Returns: (success, video_path, error_message, generation_info)
        """

        if not self.is_loaded:
            return False, "", "Model not loaded. Please load the model first.", ""

        # Validate inputs
        prompt_valid, prompt_error = validate_prompt(prompt)
        if not prompt_valid:
            return False, "", prompt_error or "Invalid prompt", ""

        params_valid, params_error = validate_generation_params(
            width, height, num_frames, num_inference_steps, guidance_scale
        )
        if not params_valid:
            return False, "", params_error or "Invalid parameters", ""

        try:
            if progress_callback:
                progress_callback(0.1, "Preparing generation...")

            # Check if pipeline is loaded
            if self.pipe is None:
                return False, "", "Pipeline not initialized. Please load the model first.", ""

            # Set up generator with seed
            generator = torch.Generator()
            if seed is not None:
                generator.manual_seed(seed)
            else:
                generator.manual_seed(0)  # Default seed

            if progress_callback:
                progress_callback(0.2, "Starting video generation...")

            start_time = time.time()

            # Generate video
            output = self.pipe(
                prompt=prompt,
                negative_prompt=negative_prompt if negative_prompt else None,
                width=width,
                height=height,
                num_frames=num_frames,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                conditioning_scale=DEFAULT_PARAMS["conditioning_scale"],
                generator=generator,
            ).frames[0]

            if progress_callback:
                progress_callback(0.8, "Exporting video...")

            # Export to video file
            output_path = create_temp_video_path()
            export_to_video(output, output_path, fps=DEFAULT_PARAMS["fps"])

            generation_time = time.time() - start_time

            if progress_callback:
                progress_callback(1.0, "Video generation complete!")

            # Format generation info
            gen_info = format_generation_info(
                prompt, negative_prompt, width, height, num_frames,
                num_inference_steps, guidance_scale, generation_time
            )

            return True, output_path, "", gen_info

        except Exception as e:
            error_msg = f"Error during video generation: {str(e)}"
            if progress_callback:
                progress_callback(0, error_msg)
            return False, "", error_msg, ""

# Global model handler instance
model_handler = WanVACEModelHandler()
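
Because the handler is a module-level singleton, it can also be driven headlessly, for example as a quick sanity check before wiring it into the Gradio UI. The sketch below is illustrative only: the prompt and the reduced frame/step counts are arbitrary choices, and `load_model()` will download several gigabytes of checkpoints on first use.

# Hypothetical headless usage of the global handler (not part of the commit).
from model_handler import model_handler
from utils import cleanup_temp_files

ok, msg = model_handler.load_model(progress_callback=lambda value, message: print(f"{value:.0%} {message}"))
if ok:
    success, path, err, info = model_handler.generate_video(
        prompt="a red fox running through snow, cinematic, tracking shot",
        num_frames=25,          # fewer frames for a quick test
        num_inference_steps=8,  # low step count to keep CPU runtime manageable
        seed=42,
    )
    print(info if success else err)
    # cleanup_temp_files(path)  # remove the temporary file once it has been copied elsewhere
else:
    print(msg)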
planning.py ADDED
@@ -0,0 +1,60 @@
"""
Planning utilities for the agentic WAN‑VACE video generator.

This module defines a simple planner that takes a high‑level concept or topic and
returns a refined text prompt and a recommended negative prompt. The planner
adds cinematic and visual descriptors to the concept to encourage more
engaging video outputs and recommends a default negative prompt to avoid
common artifacts and low‑quality renderings.

The planner can be replaced or extended with more sophisticated logic or local
LLMs if desired.
"""

from dataclasses import dataclass
from typing import Tuple


@dataclass
class Plan:
    """A dataclass representing a planned prompt and negative prompt."""
    prompt: str
    negative_prompt: str


def plan_from_topic(topic: str) -> Plan:
    """
    Generate a refined prompt and a recommended negative prompt from a high‑level topic.

    The refined prompt enriches the user's concept with cinematic descriptors and
    details that tend to produce appealing vertical videos. The negative prompt
    includes terms that discourage common undesirable artifacts.

    Parameters
    ----------
    topic: str
        A short description of what the user wants in the video.

    Returns
    -------
    Plan
        An object containing a refined prompt and a negative prompt.
    """
    # Base descriptors to enrich the concept. These tokens help guide the model
    # towards vibrant, cinematic compositions. You can customise these tokens
    # depending on your aesthetic preferences.
    base_descriptors = (
        "cinematic, dynamic motion, rich details, warm lighting, volumetric lighting, "
        "bokeh, warm sun rim light, tracking shot, shallow depth of field, vertical 9:16"
    )
    # Compose the refined prompt
    refined_prompt = f"{topic}, {base_descriptors}"

    # Recommended negative prompt to avoid low‑quality outputs. Users can
    # override this by supplying their own negative prompt.
    recommended_negative = (
        "blurry, lowres, artifacts, distorted anatomy, dull colors, washed out, "
        "overexposed, underexposed, jitter, bad compression"
    )

    return Plan(prompt=refined_prompt, negative_prompt=recommended_negative)
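
For reference, the planner is a pure function, so its behaviour is easy to inspect. The snippet below (illustrative only, not part of the commit) shows the kind of prompt pair it returns for a short concept:

from planning import plan_from_topic

plan = plan_from_topic("a busy city street at dawn")
print(plan.prompt)
# a busy city street at dawn, cinematic, dynamic motion, rich details, warm lighting, ...
print(plan.negative_prompt)
# blurry, lowres, artifacts, distorted anatomy, dull colors, ...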
requirements.txt ADDED
@@ -0,0 +1,19 @@
# Requirements for running the WAN‑VACE Gradio application on Hugging Face Spaces

# Core inference libraries
torch==2.2.*              # PyTorch CPU build; GPU is not available in most Spaces
transformers>=4.42.0
diffusers>=0.34.0         # WanVACEPipeline/WanVACETransformer3DModel are not available in older releases
gguf>=0.10.0              # required by diffusers/transformers to read the GGUF checkpoints
accelerate>=0.31.0
safetensors>=0.4.0
huggingface_hub>=0.21.0

# Application and interface libraries
gradio>=4.0.0
opencv-python-headless>=4.8.0
numpy>=1.24.0
Pillow>=10.0.0

# The following line pins the Torch CPU wheel source for Linux systems.
# It is optional but recommended to avoid downloading GPU wheels on CPU-only hardware.
-f https://download.pytorch.org/whl/cpu
utils.py ADDED
@@ -0,0 +1,102 @@
"""
Utility functions for video processing and file handling
"""
import os
import tempfile
import uuid
from typing import Optional, Tuple
import torch
import numpy as np
from PIL import Image

def create_temp_video_path(extension: str = "mp4") -> str:
    """Create a temporary file path for video output"""
    temp_dir = tempfile.gettempdir()
    unique_id = str(uuid.uuid4())
    return os.path.join(temp_dir, f"video_{unique_id}.{extension}")

def validate_generation_params(
    width: int,
    height: int,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float
) -> Tuple[bool, Optional[str]]:
    """Validate video generation parameters"""

    # Check width and height
    if width < 64 or width > 1920:
        return False, "Width must be between 64 and 1920 pixels"
    if height < 64 or height > 1080:
        return False, "Height must be between 64 and 1080 pixels"

    # Check if dimensions are divisible by 8 (common requirement for video models)
    if width % 8 != 0:
        return False, "Width must be divisible by 8"
    if height % 8 != 0:
        return False, "Height must be divisible by 8"

    # Check frames
    if num_frames < 1 or num_frames > 200:
        return False, "Number of frames must be between 1 and 200"

    # Check inference steps
    if num_inference_steps < 1 or num_inference_steps > 100:
        return False, "Number of inference steps must be between 1 and 100"

    # Check guidance scale
    if guidance_scale < 0 or guidance_scale > 20:
        return False, "Guidance scale must be between 0 and 20"

    return True, None

def validate_prompt(prompt: str) -> Tuple[bool, Optional[str]]:
    """Validate the input prompt"""
    if not prompt or len(prompt.strip()) == 0:
        return False, "Prompt cannot be empty"

    if len(prompt) > 1000:
        return False, "Prompt must be less than 1000 characters"

    return True, None

def get_memory_usage() -> str:
    """Get current GPU memory usage if available"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3  # Convert to GB
        cached = torch.cuda.memory_reserved() / 1024**3
        return f"GPU Memory - Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB"
    else:
        return "GPU not available"

def cleanup_temp_files(file_path: str) -> None:
    """Clean up temporary files"""
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
    except Exception as e:
        print(f"Warning: Could not remove temporary file {file_path}: {e}")

def format_generation_info(
    prompt: str,
    negative_prompt: str,
    width: int,
    height: int,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float,
    generation_time: float
) -> str:
    """Format generation information for display"""
    info = f"""
**Generation Details:**
- **Prompt:** {prompt}
- **Negative Prompt:** {negative_prompt if negative_prompt else "None"}
- **Dimensions:** {width}x{height}
- **Frames:** {num_frames}
- **Inference Steps:** {num_inference_steps}
- **Guidance Scale:** {guidance_scale}
- **Generation Time:** {generation_time:.2f} seconds
- **Memory Usage:** {get_memory_usage()}
"""
    return info
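
The validators above return a `(bool, Optional[str])` pair rather than raising, which is what lets `model_handler.generate_video` surface the message directly in the UI. A brief illustration (not part of the commit):

from utils import validate_generation_params, validate_prompt, create_temp_video_path

print(validate_generation_params(720, 480, 57, 24, 2.5))   # (True, None)
print(validate_generation_params(721, 480, 57, 24, 2.5))   # (False, 'Width must be divisible by 8')
print(validate_prompt(""))                                  # (False, 'Prompt cannot be empty')
print(create_temp_video_path())                             # e.g. /tmp/video_<uuid>.mp4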