import os
import sys
import subprocess
import torch
import datetime
import numpy as np
from PIL import Image
import imageio
import spaces
# --- Part 1: Auto-Setup (Clone Repo & Download Weights) ---
REPO_URL = "https://github.com/Tencent-Hunyuan/HunyuanVideo-1.5.git"
REPO_DIR = "HunyuanVideo-1.5"
MODEL_DIR = "ckpts"
HF_REPO_ID = "tencent/HunyuanVideo-1.5"  # 1.5 weights repo; matches the transformer/<version> layout used below
# Configuration
TRANSFORMER_VERSION = "480p_i2v_distilled"
DTYPE = torch.bfloat16
# Set to False if you have >40GB VRAM and want everything on GPU constantly.
# Set to True (Default) to allow running on 16GB-24GB cards via CPU offloading.
ENABLE_OFFLOADING = True
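# Hedged examples of alternate configurations (assumptions, untested in this Space):
#   DTYPE = torch.float16      # for GPUs without bfloat16 support (pre-Ampere)
#   ENABLE_OFFLOADING = False  # keep everything resident on a single >40GB card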
def setup_environment():
    """Clones the repo and downloads weights if they don't exist."""
    print("=" * 50)
    print("Checking Environment & Dependencies...")

    # 1. Clone Repository
    if not os.path.exists(REPO_DIR):
        print(f"Cloning repository from {REPO_URL}...")
        subprocess.run(["git", "clone", REPO_URL], check=True)
    else:
        print(f"Repository {REPO_DIR} exists.")

    # 2. Add Repo to Python Path (so the `hyvideo` package is importable)
    repo_path = os.path.abspath(REPO_DIR)
    if repo_path not in sys.path:
        sys.path.insert(0, repo_path)

    # 3. Download Weights (only the components this app needs, to save disk and time)
    if not os.path.exists(MODEL_DIR) or not os.listdir(MODEL_DIR):
        print(f"Downloading weights from {HF_REPO_ID} to {MODEL_DIR}...")
        try:
            from huggingface_hub import snapshot_download
            allow_patterns = [
                f"transformer/{TRANSFORMER_VERSION}/*",
                "vae/*",
                "text_encoder/*",
                "vision_encoder/*",
                "scheduler/*",
                "tokenizer/*",
            ]
            snapshot_download(repo_id=HF_REPO_ID, local_dir=MODEL_DIR, allow_patterns=allow_patterns)
            print("Download complete.")
        except Exception as e:
            print(f"Error downloading weights: {e}")
            sys.exit(1)

    print("Environment Ready.")
    print("=" * 50)
# Run setup immediately
setup_environment()
# --- Part 2: Imports from Cloned Repo ---
# Set Env Vars for HyVideo:
# expandable_segments reduces CUDA allocator fragmentation on long runs;
# RANK/WORLD_SIZE satisfy the repo's single-process distributed setup.
if 'PYTORCH_CUDA_ALLOC_CONF' not in os.environ:
    os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
os.environ['RANK'] = '0'
os.environ['WORLD_SIZE'] = '1'
try:
    from hyvideo.pipelines.hunyuan_video_pipeline import HunyuanVideo_1_5_Pipeline
    from hyvideo.commons.parallel_states import initialize_parallel_state
    from hyvideo.commons.infer_state import initialize_infer_state
except ImportError as e:
    print(f"CRITICAL ERROR: Could not import hyvideo modules. {e}")
    sys.exit(1)
import gradio as gr
# --- Part 3: Model Initialization (Pre-Load) ---
# Initialize Distributed/Infer States
#parallel_dims = initialize_parallel_state(sp=1)
#if torch.cuda.is_available():
# torch.cuda.set_device(0)
class ArgsNamespace:
    """Minimal stand-in for the repo's CLI args, carrying the flags the infer state reads."""
    def __init__(self):
        self.use_sageattn = False
        self.sage_blocks_range = "0-53"
        self.enable_torch_compile = False

initialize_infer_state(ArgsNamespace())
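# Hedged example: the same namespace could opt into the repo's optional speedups
# on supported GPUs (flag names taken from the class above; behavior untested here):
#   _args = ArgsNamespace()
#   _args.use_sageattn = True            # assumes SageAttention is installed
#   _args.enable_torch_compile = True    # trades compile time for faster steps
#   initialize_infer_state(_args)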
# Global Pipeline Variable
pipe = None
def pre_load_model():
    """Loads the model into memory/GPU before UI launch."""
    global pipe
    print(f"⏳ Initializing Pipeline ({TRANSFORMER_VERSION})... this may take a moment...")
    try:
        pipe = HunyuanVideo_1_5_Pipeline.create_pipeline(
            pretrained_model_name_or_path=MODEL_DIR,
            transformer_version=TRANSFORMER_VERSION,
            enable_offloading=ENABLE_OFFLOADING,
            enable_group_offloading=ENABLE_OFFLOADING,
            transformer_dtype=DTYPE,
        )
        print("✅ Model loaded successfully!")
        if not ENABLE_OFFLOADING:
            print("   Model is fully resident on GPU.")
        else:
            print("   Model loaded with CPU Offloading enabled (optimizes VRAM usage).")
    except Exception as e:
        print(f"❌ Failed to load model: {e}")
        sys.exit(1)
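# pre_load_model() is invoked in __main__ below, before the Gradio server starts,
# so the first user request does not pay the multi-minute load cost.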
def save_video_tensor(video_tensor, path, fps=24):
    """Converts a float video tensor in [0, 1] to uint8 frames and writes an MP4."""
    if isinstance(video_tensor, list):
        video_tensor = video_tensor[0]
    if video_tensor.ndim == 5:  # (batch, C, F, H, W) -> drop batch dim
        video_tensor = video_tensor[0]
    vid = (video_tensor * 255).clamp(0, 255).to(torch.uint8)
    vid = vid.permute(1, 2, 3, 0).cpu().numpy()  # (C, F, H, W) -> (F, H, W, C)
    imageio.mimwrite(path, vid, fps=fps)
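# Example usage (shapes assumed from the pipeline's "pt" output): a tensor of
# shape (1, 3, 61, 480, 854) with values in [0, 1] becomes a 61-frame,
# 480x854 MP4 at the default 24 fps:
#   save_video_tensor(output.videos, "outputs/example.mp4")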
@spaces.GPU(duration=120)
def generate(input_image, prompt, length, steps, shift, seed, guidance):
    if pipe is None:
        raise gr.Error("Pipeline not initialized!")
    if input_image is None:
        raise gr.Error("Reference image required.")
    if isinstance(input_image, np.ndarray):
        input_image = Image.fromarray(input_image).convert("RGB")

    # Seed of -1 means "random": draw one so the run is still reproducible from the log.
    if seed == -1:
        seed = torch.randint(0, 1_000_000, (1,)).item()
    generator = torch.Generator(device="cpu").manual_seed(int(seed))

    print(f"Generating: {prompt} | Seed: {seed}")
    try:
        output = pipe(
            prompt=prompt,
            height=480, width=854, aspect_ratio="16:9",
            video_length=int(length),
            num_inference_steps=int(steps),
            guidance_scale=float(guidance),
            flow_shift=float(shift),
            reference_image=input_image,
            seed=int(seed),
            generator=generator,
            output_type="pt",
            enable_sr=False,
            return_dict=True,
        )
    except Exception as e:
        raise gr.Error(f"Inference Failed: {e}")

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    os.makedirs("outputs", exist_ok=True)
    output_path = f"outputs/gen_{timestamp}.mp4"
    save_video_tensor(output.videos, output_path)
    return output_path
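# Example call outside the UI (argument order matches the click handler below;
# values are illustrative):
#   generate(ref_img, "slow pan to the right", 61, 6, 5.0, -1, 1.0)
# returns a path such as "outputs/gen_20250101_120000.mp4".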
# --- Part 4: UI Definition & Launch ---
def create_ui():
    with gr.Blocks(title="HunyuanVideo 1.5 I2V") as demo:
        gr.Markdown(f"### 🎬 HunyuanVideo 1.5 I2V ({TRANSFORMER_VERSION})")
        gr.Markdown("Model is pre-loaded. Ready to generate.")
        with gr.Row():
            with gr.Column():
                img = gr.Image(label="Reference", type="pil", height=250)
                prompt = gr.Textbox(label="Prompt", placeholder="Describe motion...", lines=2)
                with gr.Row():
                    steps = gr.Slider(2, 20, value=6, step=1, label="Steps")
                    guidance = gr.Slider(1.0, 5.0, value=1.0, step=0.1, label="Guidance")
                with gr.Row():
                    shift = gr.Slider(1.0, 20.0, value=5.0, step=0.5, label="Shift")
                    # step=4 keeps frame counts on the 4k+1 grid (1, 5, ..., 129)
                    # that causal video VAEs typically expect.
                    length = gr.Slider(1, 129, value=61, step=4, label="Length")
                seed = gr.Number(value=-1, label="Seed", precision=0)
                btn = gr.Button("Generate", variant="primary")
            with gr.Column():
                out = gr.Video(label="Result", autoplay=True)
        btn.click(generate, inputs=[img, prompt, length, steps, shift, seed, guidance], outputs=[out])
    return demo
if __name__ == "__main__":
    # 1. Execute the pre-load BEFORE the UI launches
    pre_load_model()
    # 2. Launch UI
    ui = create_ui()
    ui.queue().launch(server_name="0.0.0.0", share=True)
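# Hedged local-run variant: outside HF Spaces you may prefer not to create a
# public link, e.g.:
#   ui.queue().launch(server_name="127.0.0.1", share=False)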