Spaces:
Paused
Paused
| import os, random, tempfile | |
| import gradio as gr | |
| import torch | |
| import numpy as np | |
| from PIL import Image | |
| from diffusers import LTXConditionPipeline | |
| from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition | |
| from diffusers.utils import export_to_video | |
# LTX-Video model loaded through Diffusers; the LTX_REPO environment variable
# overrides the default repository id.
MODEL_REPO = os.environ.get("LTX_REPO", "Lightricks/LTX-Video")

# Basic parameters
FPS = 24          # frames per second for duration conversion and video export
MAX_FRAMES = 161  # upper bound on the number of generated frames
MIN_DIM, MAX_DIM = 256, 1280  # allowed range for height/width, in pixels

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
| # Helpers de tipo | |
| def _to_int(x, d): | |
| if isinstance(x, (list, tuple)): | |
| x = x[0] if x else d | |
| try: | |
| return int(x) | |
| except Exception: | |
| return d | |
| def _to_float(x, d): | |
| if isinstance(x, (list, tuple)): | |
| x = x[0] if x else d | |
| try: | |
| return float(x) | |
| except Exception: | |
| return d | |
| def _to_bool(x, d=True): | |
| if isinstance(x, (list, tuple)): | |
| x = x[0] if x else d | |
| return bool(x) | |
def _frames_from_secs(secs):
    """Convert a duration in seconds to a frame count of the form 8k + 1.

    The duration is coerced with ``_to_float`` (default 2.0 s), multiplied by
    ``FPS``, snapped to the nearest 8k + 1 value and clamped to
    ``[9, MAX_FRAMES]``.
    """
    seconds = _to_float(secs, 2.0)
    raw_frames = max(9, int(round(seconds * FPS)))
    groups_of_eight = round((raw_frames - 1) / 8.0)
    aligned = groups_of_eight * 8 + 1
    capped = min(MAX_FRAMES, aligned)
    return int(max(9, capped))
| def _pad32(v): | |
| return ((v - 1) // 32 + 1) * 32 | |
def _dims_for_image(path, target=768):
    """Suggest output dimensions that follow an image's aspect ratio.

    The shorter side is set to ``target``; the longer side is scaled by the
    aspect ratio and rounded to a multiple of 32. Both values are then clamped
    to ``[MIN_DIM, MAX_DIM]``.

    Args:
        path: Path of the image file to inspect.
        target: Size in pixels given to the shorter side.

    Returns:
        A ``(height, width)`` tuple.
    """
    # Only the size is needed; the context manager closes the file handle
    # instead of leaving it open until garbage collection.
    with Image.open(path) as im:
        w, h = im.size
    if w >= h:
        new_h = target
        new_w = int(round((w / max(1, h)) * new_h / 32) * 32)
    else:
        new_w = target
        new_h = int(round((h / max(1, w)) * new_w / 32) * 32)
    new_h = max(MIN_DIM, min(new_h, MAX_DIM))
    new_w = max(MIN_DIM, min(new_w, MAX_DIM))
    return new_h, new_w
# Load the conditioning pipeline once, at import time, and move it to `device`.
print(f"Carregando {MODEL_REPO} (LTXConditionPipeline)...")
# bfloat16 is simple and stable; optional FP8 could be added later.
pipe = LTXConditionPipeline.from_pretrained(MODEL_REPO, torch_dtype=torch.bfloat16)
pipe.to(device)

# Turn off dynamic shifting on the scheduler so it does not require 'mu'.
# NOTE(review): this sets an attribute on the scheduler object itself; confirm
# the scheduler reads this attribute rather than a value stored in its config.
if hasattr(pipe, "scheduler"):
    if hasattr(pipe.scheduler, "use_dynamic_shifting"):
        pipe.scheduler.use_dynamic_shifting = False

# VAE tiling to reduce VRAM peaks.
if hasattr(pipe, "vae"):
    if hasattr(pipe.vae, "enable_tiling"):
        pipe.vae.enable_tiling()
def handle_dims(image_path, cur_h, cur_w):
    """Update the height/width sliders to fit a newly uploaded image.

    When no image is given, or the dimensions cannot be computed, the current
    slider values are returned unchanged (the error is printed, not raised).

    Returns:
        Two ``gr.update`` objects: one for the height, one for the width.
    """
    height, width = cur_h, cur_w
    if image_path:
        try:
            height, width = _dims_for_image(image_path, 768)
        except Exception as e:
            # Best effort: a bad image must not break the upload event.
            print(f"Erro ao ajustar dimensões: {e}")
    return gr.update(value=height), gr.update(value=width)
def generate_i2v(
    prompt,
    neg_prompt,
    image_path,
    height_ui,
    width_ui,
    duration_ui,
    seed_ui,
    randomize_seed,
    guidance_ui,
    denoise_ui,
    image_noise_ui,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate a video from an image and a prompt and save it as an MP4.

    Args:
        prompt: Text prompt forwarded to the pipeline.
        neg_prompt: Negative prompt forwarded to the pipeline.
        image_path: Path of the conditioning image, used as frame 0.
        height_ui: Requested height; coerced to int and padded to a multiple
            of 32 (default 512).
        width_ui: Requested width; coerced to int and padded to a multiple
            of 32 (default 704).
        duration_ui: Duration in seconds, converted by ``_frames_from_secs``.
        seed_ui: Seed used when ``randomize_seed`` is false (default 42).
        randomize_seed: When truthy, a random 32-bit seed replaces ``seed_ui``.
        guidance_ui: Accepted for UI compatibility; currently not forwarded.
        denoise_ui: Accepted for UI compatibility; currently not forwarded.
        image_noise_ui: Accepted for UI compatibility; currently not forwarded.
        progress: Gradio progress tracker.

    Returns:
        A ``(video_path, seed)`` tuple, where ``seed`` is the seed actually used.

    Raises:
        gr.Error: If no image was supplied.
    """
    if not image_path:
        raise gr.Error("Selecione uma imagem.")

    h_pad = _pad32(_to_int(height_ui, 512))
    w_pad = _pad32(_to_int(width_ui, 704))
    num_frames = _frames_from_secs(duration_ui)

    # NOTE: guidance_ui, denoise_ui and image_noise_ui are intentionally not
    # passed to the pipeline. The matching kwargs (guidance_scale,
    # guidance_rescale, decode_timestep, decode_noise_scale,
    # image_cond_noise_scale, denoise_strength) were disabled in the call
    # below, so the sliders have no effect until they are re-enabled there.

    seed = _to_int(seed_ui, 42)
    if _to_bool(randomize_seed, True):
        seed = random.randint(0, 2**32 - 1)

    # Condition: the image is used as the first frame. convert() returns an
    # independent in-memory copy, so the source file can be closed right away.
    with Image.open(image_path) as src:
        img = src.convert("RGB")
    cond = LTXVideoCondition(image=img, frame_index=0, strength=1.0)

    gen = torch.Generator(device=device).manual_seed(seed)

    progress(0.0, desc="Gerando vídeo...")
    out = pipe(
        conditions=[cond],
        prompt=prompt,
        negative_prompt=neg_prompt,
        width=w_pad,
        height=h_pad,
        num_frames=num_frames,
        num_inference_steps=30,  # simple and stable
        generator=gen,
        output_type="pil",
    )
    frames = out.frames[0]

    # Each request writes into its own fresh temporary directory.
    tmp = tempfile.mkdtemp()
    out_path = os.path.join(tmp, f"output_{random.randint(10000,99999)}.mp4")
    progress(0.8, desc="Salvando vídeo")
    export_to_video(frames, out_path, fps=FPS)
    return out_path, int(seed)
# Simple UI
# NOTE(review): the nesting below was reconstructed from the widget order
# because the source had lost its indentation; confirm the intended layout.
with gr.Blocks() as demo:
    gr.Markdown("LTX I2V (Diffusers) simples com denoise e dynamic shifting desligado")
    with gr.Row():
        # Left column: inputs and the generate button.
        with gr.Column():
            img = gr.Image(label="Imagem", type="filepath")
            prompt = gr.Textbox(
                label="Prompt",
                value="Subject moves gently; subtle camera push-in",
                lines=2,
            )
            neg = gr.Textbox(
                label="Negative",
                value="worst quality, jitter, blur, distortions",
                lines=2,
            )
            dur = gr.Slider(
                label="Duração (s)",
                minimum=0.5,
                maximum=8.0,
                step=0.1,
                value=2.0,
            )
            with gr.Row():
                h = gr.Slider(
                    label="Altura",
                    minimum=MIN_DIM,
                    maximum=MAX_DIM,
                    step=32,
                    value=512,
                )
                w = gr.Slider(
                    label="Largura",
                    minimum=MIN_DIM,
                    maximum=MAX_DIM,
                    step=32,
                    value=704,
                )
            with gr.Accordion("Avançado", open=False):
                seed = gr.Number(
                    label="Seed",
                    value=42,
                    precision=0,
                    minimum=0,
                    maximum=2**32 - 1,
                )
                rand = gr.Checkbox(label="Randomize seed", value=True)
                guidance = gr.Slider(
                    label="Guidance scale",
                    minimum=1.0,
                    maximum=10.0,
                    step=0.1,
                    value=5.0,
                )
                denoise = gr.Slider(
                    label="Denoise strength",
                    minimum=0.0,
                    maximum=1.0,
                    step=0.05,
                    value=0.4,
                )
                image_noise = gr.Slider(
                    label="Image cond noise",
                    minimum=0.0,
                    maximum=0.2,
                    step=0.005,
                    value=0.0,
                )
            btn = gr.Button("Gerar", variant="primary")
        # Right column: generated video.
        with gr.Column():
            vid = gr.Video(label="Vídeo")

    # Uploading an image refreshes the height/width sliders.
    img.upload(fn=handle_dims, inputs=[img, h, w], outputs=[h, w])
    # The button runs generation and writes the seed actually used back to the UI.
    btn.click(
        fn=generate_i2v,
        inputs=[prompt, neg, img, h, w, dur, seed, rand, guidance, denoise, image_noise],
        outputs=[vid, seed],
    )
if __name__ == "__main__":
    # For MCP integration: install gradio[mcp] and add mcp_server=True if needed.
    queued_app = demo.queue()
    queued_app.launch(debug=True, share=False)