# Aduc-sdr-2_5s / app.py
# euIaxs22's picture
# Update app.py
# aa91947 verified
# raw
# history blame
# 6.4 kB
import os, random, tempfile
import gradio as gr
import torch
import numpy as np
from PIL import Image
from diffusers import LTXConditionPipeline
from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
from diffusers.utils import export_to_video
# LTX-Video checkpoint loaded through Diffusers; override with the LTX_REPO env var.
MODEL_REPO = os.environ.get("LTX_REPO", "Lightricks/LTX-Video")

# Basic generation limits.
FPS = 24  # used both for the seconds -> frames conversion and for video export
MAX_FRAMES = 161
MIN_DIM = 256
MAX_DIM = 1280

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
# Type-coercion helpers
def _to_int(x, d):
    """Coerce *x* to ``int``, returning *d* when the conversion fails.

    A list or tuple is unwrapped to its first element first; an empty
    sequence is replaced by *d* before the conversion is attempted.
    """
    if isinstance(x, (list, tuple)):
        value = x[0] if x else d
    else:
        value = x
    try:
        converted = int(value)
    except Exception:
        # Any conversion failure falls back to the caller's default, as-is.
        return d
    return converted
def _to_float(x, d):
    """Coerce *x* to ``float``, returning *d* when the conversion fails.

    A list or tuple is unwrapped to its first element first; an empty
    sequence is replaced by *d* before the conversion is attempted.
    """
    if isinstance(x, (list, tuple)):
        value = x[0] if x else d
    else:
        value = x
    try:
        converted = float(value)
    except Exception:
        # Any conversion failure falls back to the caller's default, as-is.
        return d
    return converted
def _to_bool(x, d=True):
    """Return the truthiness of *x* as a ``bool``.

    A list or tuple is unwrapped to its first element; an empty sequence
    is replaced by *d*. The default is used only in that empty-sequence
    case: any other value, including ``None``, is evaluated directly.
    """
    if isinstance(x, (list, tuple)):
        value = x[0] if x else d
    else:
        value = x
    return True if value else False
# Frame counts aligned to the form (8k + 1)
def _frames_from_secs(secs):
    """Convert a duration in seconds to a frame count of the form 8*k + 1.

    The duration (default 2.0 s when it cannot be parsed) is multiplied by
    FPS, snapped to the nearest 8*k + 1 value, and limited to the range
    [9, MAX_FRAMES].
    """
    duration = _to_float(secs, 2.0)
    raw_frames = max(9, int(round(duration * FPS)))
    # Nearest number of 8-frame groups after the first frame.
    groups = round((raw_frames - 1) / 8.0)
    aligned = groups * 8 + 1
    capped = min(MAX_FRAMES, aligned)
    return int(max(9, capped))
def _pad32(v):
    """Round *v* up to the next multiple of 32 (exact multiples are unchanged)."""
    full_blocks, _ = divmod(v - 1, 32)
    return (full_blocks + 1) * 32
def _dims_for_image(path, target=768):
    """Suggest an output ``(height, width)`` for the image stored at *path*.

    The shorter side is set to *target*; the longer side is scaled by the
    image's aspect ratio and rounded to a multiple of 32. Both values are
    then clamped to [MIN_DIM, MAX_DIM], so extreme aspect ratios are not
    preserved exactly.

    Args:
        path: Location of the image file, passed to ``Image.open``.
        target: Size assigned to the shorter side before clamping.

    Returns:
        A ``(new_h, new_w)`` tuple.
    """
    # Only the size is needed, so close the file as soon as it has been read
    # instead of leaving the handle open until garbage collection.
    with Image.open(path) as im:
        w, h = im.size
    if w >= h:
        new_h = target
        new_w = int(round((w / max(1, h)) * new_h / 32) * 32)
    else:
        new_w = target
        new_h = int(round((h / max(1, w)) * new_w / 32) * 32)
    new_h = max(MIN_DIM, min(new_h, MAX_DIM))
    new_w = max(MIN_DIM, min(new_w, MAX_DIM))
    return new_h, new_w
print(f"Carregando {MODEL_REPO} (LTXConditionPipeline)...")
# bfloat16 is the simple, stable choice here; optional FP8 could be added later.
pipe = LTXConditionPipeline.from_pretrained(MODEL_REPO, torch_dtype=torch.bfloat16)
pipe.to(device)

# Turn off the scheduler's dynamic shifting so that a 'mu' value is not required.
# NOTE(review): this sets an attribute on the scheduler object itself; confirm the
# installed diffusers version reads the flag from there rather than from its config.
if hasattr(getattr(pipe, "scheduler", None), "use_dynamic_shifting"):
    pipe.scheduler.use_dynamic_shifting = False

# Enable VAE tiling to reduce peak VRAM usage.
if hasattr(getattr(pipe, "vae", None), "enable_tiling"):
    pipe.vae.enable_tiling()
def handle_dims(image_path, cur_h, cur_w):
    """Return updates for the height/width inputs after an image upload.

    When *image_path* is set, the dimensions suggested by
    ``_dims_for_image(image_path, 768)`` are used. If no image is given, or
    computing the dimensions fails, the current values are kept (failures
    are printed, not raised).

    Returns:
        A ``(height_update, width_update)`` pair of ``gr.update`` objects.
    """
    new_h, new_w = cur_h, cur_w
    if image_path:
        try:
            new_h, new_w = _dims_for_image(image_path, 768)
        except Exception as e:
            # Best effort only: leave the current values in place on failure.
            print(f"Erro ao ajustar dimensões: {e}")
    return gr.update(value=new_h), gr.update(value=new_w)
def generate_i2v(
    prompt,
    neg_prompt,
    image_path,
    height_ui,
    width_ui,
    duration_ui,
    seed_ui,
    randomize_seed,
    guidance_ui,
    denoise_ui,
    image_noise_ui,
    progress=gr.Progress(track_tqdm=True),
):
    """Generate a video conditioned on an input image and save it as MP4.

    Args:
        prompt: Forwarded to the pipeline as ``prompt``.
        neg_prompt: Forwarded to the pipeline as ``negative_prompt``.
        image_path: Path of the conditioning image; it is used as the first
            frame (``frame_index=0``, ``strength=1.0``).
        height_ui: Requested height (default 512), rounded up to a multiple
            of 32 before being passed to the pipeline.
        width_ui: Requested width (default 704), rounded up to a multiple
            of 32 before being passed to the pipeline.
        duration_ui: Duration in seconds, converted by ``_frames_from_secs``.
        seed_ui: Seed to use (default 42) unless *randomize_seed* is truthy.
        randomize_seed: When truthy, a random 32-bit seed replaces *seed_ui*.
        guidance_ui: Parsed (default 5.0) but currently NOT forwarded to the
            pipeline; the matching keyword argument is commented out below.
        denoise_ui: Parsed (default 0.4) but currently NOT forwarded.
        image_noise_ui: Parsed (default 0.0) but currently NOT forwarded.
        progress: Gradio progress tracker.

    Returns:
        ``(out_path, seed)``: path of the written MP4 file and the seed used.

    Raises:
        gr.Error: If *image_path* is empty.
    """
    if not image_path:
        raise gr.Error("Selecione uma imagem.")

    h = _to_int(height_ui, 512)
    w = _to_int(width_ui, 704)
    h_pad = _pad32(h)
    w_pad = _pad32(w)
    num_frames = _frames_from_secs(duration_ui)

    # Quality parameters. They are kept parsed so the disabled keyword
    # arguments in the pipeline call below can be re-enabled without changes.
    guidance_scale = _to_float(guidance_ui, 5.0)  # 4.0-6.0 works well
    denoise_strength = _to_float(denoise_ui, 0.4)  # 0.3-0.5 preserves the image well
    image_cond_noise_scale = _to_float(image_noise_ui, 0.0)  # 0.0 pins the look; 0.01-0.03 relaxes it

    seed = _to_int(seed_ui, 42)
    if _to_bool(randomize_seed, True):
        seed = random.randint(0, 2**32 - 1)

    # Condition: the image is used as the first frame. convert() returns an
    # independent in-memory copy, so the source file can be closed right away
    # instead of leaking the handle.
    with Image.open(image_path) as src:
        img = src.convert("RGB")
    cond = LTXVideoCondition(image=img, frame_index=0, strength=1.0)

    gen = torch.Generator(device=device).manual_seed(seed)

    progress(0.0, desc="Gerando vídeo...")
    out = pipe(
        conditions=[cond],
        prompt=prompt,
        negative_prompt=neg_prompt,
        width=w_pad,
        height=h_pad,
        num_frames=num_frames,
        num_inference_steps=30,  # simple and stable
        # Currently disabled:
        # guidance_scale=guidance_scale,
        # guidance_rescale=0.7,  # helps stabilise CFG
        # decode_timestep=0.05,  # safe values for >=0.9.1
        # decode_noise_scale=0.025,
        # image_cond_noise_scale=image_cond_noise_scale,
        # denoise_strength=denoise_strength,
        generator=gen,
        output_type="pil",
    )
    frames = out.frames[0]

    tmp = tempfile.mkdtemp()
    out_path = os.path.join(tmp, f"output_{random.randint(10000,99999)}.mp4")
    progress(0.8, desc="Salvando vídeo")
    export_to_video(frames, out_path, fps=FPS)
    return out_path, int(seed)
# Simple UI
# NOTE(review): the nesting below was reconstructed from a source whose
# indentation was lost; confirm the layout matches the intended one.
with gr.Blocks() as demo:
    gr.Markdown("LTX I2V (Diffusers) simples com denoise e dynamic shifting desligado")
    with gr.Row():
        # Left column: inputs and controls.
        with gr.Column():
            img = gr.Image(label="Imagem", type="filepath")
            prompt = gr.Textbox(
                label="Prompt",
                value="Subject moves gently; subtle camera push-in",
                lines=2,
            )
            neg = gr.Textbox(
                label="Negative",
                value="worst quality, jitter, blur, distortions",
                lines=2,
            )
            dur = gr.Slider(
                label="Duração (s)", minimum=0.5, maximum=8.0, step=0.1, value=2.0
            )
            with gr.Row():
                h = gr.Slider(
                    label="Altura", minimum=MIN_DIM, maximum=MAX_DIM, step=32, value=512
                )
                w = gr.Slider(
                    label="Largura", minimum=MIN_DIM, maximum=MAX_DIM, step=32, value=704
                )
            with gr.Accordion("Avançado", open=False):
                seed = gr.Number(
                    label="Seed", value=42, precision=0, minimum=0, maximum=2**32-1
                )
                rand = gr.Checkbox(label="Randomize seed", value=True)
                guidance = gr.Slider(
                    label="Guidance scale", minimum=1.0, maximum=10.0, step=0.1, value=5.0
                )
                denoise = gr.Slider(
                    label="Denoise strength", minimum=0.0, maximum=1.0, step=0.05, value=0.4
                )
                image_noise = gr.Slider(
                    label="Image cond noise", minimum=0.0, maximum=0.2, step=0.005, value=0.0
                )
            btn = gr.Button("Gerar", variant="primary")
        # Right column: generated video.
        with gr.Column():
            vid = gr.Video(label="Vídeo")

    # Suggest height/width whenever a new image is uploaded.
    img.upload(fn=handle_dims, inputs=[img, h, w], outputs=[h, w])
    # Run generation; the seed actually used is written back to the Seed field.
    btn.click(
        fn=generate_i2v,
        inputs=[prompt, neg, img, h, w, dur, seed, rand, guidance, denoise, image_noise],
        outputs=[vid, seed],
    )
if __name__ == "__main__":
    # For MCP integration: install gradio[mcp] and add mcp_server=True if needed.
    queued_app = demo.queue()
    queued_app.launch(debug=True, share=False)