import os
import time
import tempfile
import shlex
import subprocess

import spaces
import gradio as gr
import torch
import numpy as np
from PIL import Image

from diffusers import AutoencoderKLWan
from diffusers.utils import load_image
from diffusers.schedulers import UniPCMultistepScheduler
from transformers import CLIPVisionModel
from huggingface_hub import hf_hub_download

from chronoedit_diffusers.pipeline_chronoedit import ChronoEditPipeline
from chronoedit_diffusers.transformer_chronoedit import ChronoEditTransformer3DModel
from prompt_enhancer import load_model, enhance_prompt

# Install flash-attn at startup (prebuilt wheel only, skip the CUDA build).
subprocess.run(
    shlex.split("pip install flash-attn --no-build-isolation"),
    env=os.environ | {"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    check=True,
)

start = time.time()
model_id = "nvidia/ChronoEdit-14B-Diffusers"

image_encoder = CLIPVisionModel.from_pretrained(
    model_id, subfolder="image_encoder", torch_dtype=torch.float32
)
print("✓ Loaded image encoder")

vae = AutoencoderKLWan.from_pretrained(
    model_id, subfolder="vae", torch_dtype=torch.bfloat16
)
print("✓ Loaded VAE")

transformer = ChronoEditTransformer3DModel.from_pretrained(
    model_id, subfolder="transformer", torch_dtype=torch.bfloat16
)
print("✓ Loaded transformer")

pipe = ChronoEditPipeline.from_pretrained(
    model_id,
    image_encoder=image_encoder,
    transformer=transformer,
    vae=vae,
    torch_dtype=torch.bfloat16,
)
print("✓ Created pipeline")

# Load and fuse the 8-step distillation LoRA
lora_path = hf_hub_download(repo_id=model_id, filename="lora/chronoedit_distill_lora.safetensors")
if lora_path:
    print(f"Loading LoRA weights from {lora_path}...")
    pipe.load_lora_weights(lora_path)
    pipe.fuse_lora(lora_scale=1.0)
    print("✓ Fused LoRA with scale 1.0")

# Setup scheduler
pipe.scheduler = UniPCMultistepScheduler.from_config(
    pipe.scheduler.config, flow_shift=2.0
)
print("✓ Configured scheduler (flow_shift=2.0)")

pipe.to("cuda")
# pipe.enable_model_cpu_offload()

end = time.time()
print(f"Model loaded in {end - start:.2f}s.")

# Load the prompt enhancer (vision-language model used for CoT prompt rewriting)
start = time.time()
prompt_enhancer_model = "Qwen/Qwen3-VL-8B-Instruct"
prompt_model, processor = load_model(prompt_enhancer_model)
end = time.time()
print(f"Prompt enhancer loaded in {end - start:.2f}s.")


def calculate_dimensions(image, mod_value):
    """
    Calculate output dimensions for a ~720x1280 target area, preserving the
    input aspect ratio and rounding each side down to a multiple of mod_value.

    Args:
        image: PIL Image
        mod_value: Modulo value for dimension alignment

    Returns:
        Tuple of (width, height)
    """
    # Target output area (720p)
    target_area = 720 * 1280

    # Calculate dimensions maintaining aspect ratio
    aspect_ratio = image.height / image.width
    calculated_height = round(np.sqrt(target_area * aspect_ratio)) // mod_value * mod_value
    calculated_width = round(np.sqrt(target_area / aspect_ratio)) // mod_value * mod_value

    return calculated_width, calculated_height
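
# Worked example for calculate_dimensions (comment only, not executed), assuming
# mod_value = 16, i.e. a VAE spatial scale factor of 8 times a patch size of 2 for
# this pipeline: a 1024x768 input gives aspect_ratio = 768/1024 = 0.75 and a
# target area of 720 * 1280 = 921600, so
#   height = round(sqrt(921600 * 0.75)) // 16 * 16 = 831 // 16 * 16 = 816
#   width  = round(sqrt(921600 / 0.75)) // 16 * 16 = 1109 // 16 * 16 = 1104
# i.e. calculate_dimensions(img, 16) -> (1104, 816).
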
@spaces.GPU
def run_inference(
    image_path: str,
    prompt: str,
    enable_temporal_reasoning: bool = False,
    num_inference_steps: int = 8,
    guidance_scale: float = 1.0,
    shift: float = 2.0,
    num_temporal_reasoning_steps: int = 8,
):
    # Prompt rewriting: enhance the user prompt with CoT reasoning
    final_prompt = prompt
    prompt_model.to("cuda")
    start = time.time()
    cot_prompt = enhance_prompt(
        image_path,
        prompt,
        prompt_model,
        processor,
    )
    end = time.time()
    print(f"Prompt enhanced in {end - start:.2f}s")
    prompt_model.to("cpu")

    # Print enhanced CoT prompt
    print("\n" + "=" * 80)
    print("Enhanced CoT Prompt:")
    print("=" * 80)
    print(cot_prompt)
    print("=" * 80 + "\n")
    final_prompt = cot_prompt

    # Inference
    print(f"Loading input image: {image_path}")
    image = load_image(image_path)
    mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
    width, height = calculate_dimensions(image, mod_value)
    print(f"Output dimensions: {width}x{height}")
    image = image.resize((width, height))

    # Temporal reasoning uses a longer clip; plain editing uses a short one
    num_frames = 29 if enable_temporal_reasoning else 5

    start = time.time()
    output = pipe(
        image=image,
        prompt=final_prompt,
        height=height,
        width=width,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        enable_temporal_reasoning=enable_temporal_reasoning,
        num_temporal_reasoning_steps=num_temporal_reasoning_steps,
        # offload_model=True
    ).frames[0]
    end = time.time()
    print(f"Generated video in {end - start:.2f}s")

    # Save the last generated frame as the edited image
    image_tmp = tempfile.NamedTemporaryFile(suffix=".png", delete=False)
    output_path_image = image_tmp.name
    image_tmp.close()
    Image.fromarray((output[-1] * 255).clip(0, 255).astype("uint8")).save(output_path_image)

    log_text = (
        f"Final prompt: {final_prompt}\n"
        f"Guidance: {guidance_scale}, Shift: {shift}, Steps: {num_inference_steps}\n"
    )
    if enable_temporal_reasoning:
        log_text += (
            f"Temporal reasoning: {enable_temporal_reasoning}, "
            f"Steps: {num_temporal_reasoning_steps}\n"
        )
    print(log_text)

    return output_path_image
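
# Illustrative standalone call (comment only, not executed here); the image path
# and prompt below are placeholders taken from the bundled examples:
#   out_path = run_inference("examples/3.png", "Transform the image so that a small mouse bathes in the teacup")
#   print(f"Edited frame written to {out_path}")
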
def build_ui() -> gr.Blocks:
    with gr.Blocks(title="ChronoEdit", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🚀 ChronoEdit Demo

        This demo is built on ChronoEdit-14B with an 8-step distillation LoRA for fast edits. See our [GitHub repo](https://github.com/nv-tlabs/ChronoEdit) for the full model and temporal reasoning setup.

        [[Project Page]](https://research.nvidia.com/labs/toronto-ai/chronoedit/) | [[Code]](https://github.com/nv-tlabs/ChronoEdit) | [[Technical Report]](https://arxiv.org/abs/2510.04290) | [[Model]](https://huggingface.co/nvidia/ChronoEdit-14B-Diffusers)
        """)

        with gr.Row():
            image = gr.Image(type="filepath", label="Input Image")
            output_image = gr.Image(label="Generated Image")

        with gr.Row():
            with gr.Column(scale=1):
                prompt = gr.Textbox(label="Prompt", lines=4, value="")
                gr.Markdown("""
                _For temporal reasoning, set it up locally following the [GitHub repo](https://github.com/nv-tlabs/ChronoEdit)._
                """)
                run_btn = gr.Button("Start Generation", variant="primary")

                # with gr.Row():
                #     num_inference_steps = gr.Slider(minimum=4, maximum=75, step=1, value=50, label="Num Inference Steps")
                #     guidance_scale = gr.Slider(minimum=1.0, maximum=10.0, step=0.5, value=1.0, label="Guidance Scale")
                #     shift = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=5.0, label="Shift")
                #     num_temporal_reasoning_steps = gr.Slider(minimum=0, maximum=50, step=1, value=50, label="Number of temporal reasoning steps")
                # log_text = gr.Markdown("Logs will appear here.")

                def _on_run(image_path, prompt):
                    image_out_path = run_inference(
                        image_path=image_path,
                        prompt=prompt,
                    )
                    return image_out_path

                run_btn.click(
                    _on_run,
                    inputs=[image, prompt],
                    outputs=[output_image],
                )

            with gr.Column(scale=1):
                gr.Examples(
                    examples=[
                        [
                            "examples/1.png",
                            "The user wants to change the provided illustration of an elegant woman in a flowing red kimono into a high-end Japanese anime PVC scale figure, rendered photorealistically as a pre-painted collectible. Preserve her long black hair styled with golden hair ornaments and delicate floral accessories, her slightly tilted head and confident gaze, and the detailed red kimono with golden and floral embroidery tied with a wide gold obi. Cherry blossom petals drift around. Maintain the pose and camera view point unchanged. The scene should look like a premium finished PVC figure on display, with realistic textures, fine paint detailing, and a polished collectible presentation. Place the figure on a simple round base on a computer desk, with blurred keyboard and monitor glow in the background. Emphasize a strong 3D sense of volume and depth, realistic shadows and lighting, and painted PVC figure textures. Professional studio photography style, shallow depth of field, focus on the figure as a physical collectible. The lighting on the figure is uniform and highlighted, emphasizing every sculpted detail and painted accent.",
                        ],
                        [
                            "examples/2.png",
                            "The user wants to change the scene so that the girl in the traditional-style painting, wearing her ornate floral robe and headdress, is now playing a guitar. Her graceful appearance remains unchanged - smooth black hair tied neatly, soft facial features with a calm, focused expression - but her pose shifts: both hands are engaged with the guitar. One hand rests on the neck of the instrument, fingers pressing the strings with delicate precision, while the other hand strums near the sound hole. The guitar is positioned naturally across her lap, blending with the elegance of her posture. The traditional painting style is preserved, but the addition of the guitar introduces a modern contrast, giving the scene a harmonious fusion of classical refinement and contemporary music.",
                        ],
                        [
                            "examples/3.png",
                            "Transform the image so that inside the floral teacup of steaming tea, a small, cute mouse is sitting and taking a bath; the mouse should look relaxed and cheerful, with a tiny white bath towel draped over its head as if enjoying a spa moment, while the steam rises gently around it, blending seamlessly with the warm and cozy atmosphere.",
                        ],
                        [
                            "examples/4.png",
                            "Generate a Japanese black-and-white anime-style battle scene that strictly follows the input stick-figure structure, where the figure on the left represents a school girl in a school-style outfit (jacket, white shirt, skirt, sneakers), and the figure on the right represents the dark-haired girl in a flowing black robe with ornate accessories. The left character charges forward in a running stance, her teeth clenched and eyes sharp with determination, one arm swinging back for momentum while the other braces forward to strike. The right character delivers a dramatic horizontal flying kick, her body fully airborne and parallel to the ground, one leg stretched forward in a devastating strike while the other bends back for balance, her furious expression marked by blazing eyes and a battle cry. Between them, a massive punch boom impact erupts as fist and kick collide, exploding outward with jagged ink strokes, debris, and speed lines. Flowing robes, hair, and clothing ripple violently from the shockwave, while bold brush shading captures the overwhelming force and raw intensity of this classic manga battle moment.",
                        ],
                    ],
                    inputs=[image, prompt],
                    outputs=[output_image],
                    fn=_on_run,
                    cache_examples=False,
                )

    return demo

if __name__ == "__main__":
    demo = build_ui()
    demo.launch()
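
# Running this file directly assumes the ChronoEdit repo layout: the
# chronoedit_diffusers package, the prompt_enhancer module, and the images
# referenced under examples/ must be available alongside this script.
# (Optional, not part of the original config: chaining demo.queue() before
# demo.launch() would serialize concurrent GPU requests.)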