Ani14 committed
Commit 95315db · verified
1 Parent(s): f3c7daa

Upload 6 files

Files changed (6)
  1. app.py +302 -0
  2. config.py +51 -0
  3. model_handler.py +209 -0
  4. planning.py +60 -0
  5. requirements.txt +19 -0
  6. utils.py +102 -0
app.py ADDED
@@ -0,0 +1,302 @@
"""
Gradio interface for WAN-VACE video generation
"""
import gradio as gr
import torch
import time
from typing import Optional

# Import the simple planner
from planning import plan_from_topic

from config import UI_CONFIG, DEFAULT_PARAMS, SERVER_CONFIG
from model_handler import model_handler
from utils import cleanup_temp_files

def load_model_interface(progress=gr.Progress()):
    """Interface function for loading the model"""
    def progress_callback(value, message):
        progress(value, desc=message)

    success, message = model_handler.load_model(progress_callback)

    if success:
        return (
            gr.update(visible=False),                # Hide load button
            gr.update(visible=True),                 # Show generation interface
            gr.update(value=message, visible=True),  # Show success message
            gr.update(visible=False)                 # Hide error message
        )
    else:
        return (
            gr.update(visible=True),                 # Keep load button visible
            gr.update(visible=False),                # Keep generation interface hidden
            gr.update(visible=False),                # Hide success message
            gr.update(value=message, visible=True)   # Show error message
        )

def generate_video_interface(
    prompt: str,
    negative_prompt: str,
    width: int,
    height: int,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float,
    seed: Optional[int],
    progress=gr.Progress()
):
    """Interface function for video generation"""

    def progress_callback(value, message):
        progress(value, desc=message)

    # Plan the prompt: treat the user input as a high-level concept and let the
    # agent craft a refined prompt and recommended negative prompt. If the user
    # supplies a negative prompt, it overrides the recommended negative prompt.
    plan = plan_from_topic(prompt)
    # Use the refined prompt from the plan
    effective_prompt = plan.prompt
    # If the user provided a negative prompt, use it; otherwise use the recommended one
    effective_negative = negative_prompt.strip() if negative_prompt and negative_prompt.strip() else plan.negative_prompt

    success, video_path, error_msg, gen_info = model_handler.generate_video(
        prompt=effective_prompt,
        negative_prompt=effective_negative,
        width=width,
        height=height,
        num_frames=num_frames,
        num_inference_steps=num_inference_steps,
        guidance_scale=guidance_scale,
        seed=seed,
        progress_callback=progress_callback
    )

    if success:
        return (
            gr.update(value=video_path, visible=True),  # Video output
            gr.update(value=gen_info, visible=True),    # Generation info
            gr.update(visible=False)                    # Hide error message
        )
    else:
        return (
            gr.update(value=None, visible=False),       # Hide video output
            gr.update(visible=False),                   # Hide generation info
            gr.update(value=error_msg, visible=True)    # Show error message
        )

def create_interface():
    """Create the Gradio interface"""

    with gr.Blocks(
        title=UI_CONFIG["title"],
        theme=UI_CONFIG["theme"]
    ) as demo:

        # Header
        gr.Markdown(f"# {UI_CONFIG['title']}")
        gr.Markdown(UI_CONFIG["description"])

        # Model loading section
        with gr.Row():
            with gr.Column():
                load_btn = gr.Button(
                    "🚀 Load Video Generation Model",
                    variant="primary",
                    size="lg"
                )
                load_success_msg = gr.Markdown(visible=False)
                load_error_msg = gr.Markdown(visible=False)

        # Main generation interface (initially hidden)
        with gr.Column(visible=False) as generation_interface:

            # Input section
            with gr.Row():
                with gr.Column(scale=2):
                    with gr.Group():
                        gr.Markdown("### 📝 Concept & Prompts")
                        # The user supplies a high-level concept or topic. The agent will
                        # refine this into a detailed prompt automatically.
                        prompt_input = gr.Textbox(
                            label="Video Concept",
                            placeholder="Describe the concept you want to generate, e.g. 'a pig in a winter forest'...",
                            lines=3,
                            value="a pig moving quickly in a beautiful winter scenery nature trees sunset tracking camera"
                        )
                        # Optional negative prompt: overrides the agent's recommended negative prompt.
                        negative_prompt_input = gr.Textbox(
                            label="Negative Prompt (Optional)",
                            placeholder="Things you don't want in the video; leave empty to use the agent's recommendation...",
                            lines=2,
                            value=""
                        )

                with gr.Column(scale=1):
                    with gr.Group():
                        gr.Markdown("### ⚙️ Generation Parameters")

                        with gr.Row():
                            width_slider = gr.Slider(
                                label="Width",
                                minimum=64,
                                maximum=1920,
                                step=8,
                                value=DEFAULT_PARAMS["width"]
                            )
                            height_slider = gr.Slider(
                                label="Height",
                                minimum=64,
                                maximum=1080,
                                step=8,
                                value=DEFAULT_PARAMS["height"]
                            )

                        num_frames_slider = gr.Slider(
                            label="Number of Frames",
                            minimum=1,
                            maximum=200,
                            step=1,
                            value=DEFAULT_PARAMS["num_frames"]
                        )

                        inference_steps_slider = gr.Slider(
                            label="Inference Steps",
                            minimum=1,
                            maximum=100,
                            step=1,
                            value=DEFAULT_PARAMS["num_inference_steps"]
                        )

                        guidance_scale_slider = gr.Slider(
                            label="Guidance Scale",
                            minimum=0.0,
                            maximum=20.0,
                            step=0.1,
                            value=DEFAULT_PARAMS["guidance_scale"]
                        )

                        seed_input = gr.Number(
                            label="Seed (Optional)",
                            value=0,
                            precision=0
                        )

            # Generation button
            with gr.Row():
                generate_btn = gr.Button(
                    "🎬 Generate Video",
                    variant="primary",
                    size="lg"
                )

            # Output section
            with gr.Row():
                with gr.Column():
                    video_output = gr.Video(
                        label="Generated Video",
                        visible=False
                    )

                    generation_info = gr.Markdown(
                        label="Generation Information",
                        visible=False
                    )

                    generation_error = gr.Markdown(
                        visible=False
                    )

            # Additional controls
            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
### 💡 Tips:
- Enter a short **concept** (e.g. “a busy city street at dawn”). The agent will expand it into a detailed prompt.
- Adjust the **guidance scale**: higher values make the video adhere more closely to the refined prompt.
- Increasing **inference steps** improves quality at the cost of generation time.
- Use the optional **Negative Prompt** field only if you want to override the agent's recommended terms.
- Keep width and height multiples of 8 for optimal performance.
""")

                with gr.Column():
                    if torch.cuda.is_available():
                        gpu_info = f"🎮 GPU: {torch.cuda.get_device_name()}"
                    else:
                        gpu_info = "💻 Running on CPU"

                    gr.Markdown(f"""
### 🖥️ System Information:
{gpu_info}

### 📊 Model Information:
- **Model:** WAN-VACE 1.3B (Q4_0 Quantized)
- **Text Encoder:** UMT5-XXL
- **Scheduler:** UniPC Multistep

### 🤖 Agent Details:
- **Planning:** The agent automatically crafts a detailed prompt and a recommended negative prompt based on your concept.
- **Override:** Supply your own negative prompt to override the recommendation if desired.
""")

        # Event handlers
        load_btn.click(
            fn=load_model_interface,
            outputs=[
                load_btn,
                generation_interface,
                load_success_msg,
                load_error_msg
            ]
        )

        generate_btn.click(
            fn=generate_video_interface,
            inputs=[
                prompt_input,
                negative_prompt_input,
                width_slider,
                height_slider,
                num_frames_slider,
                inference_steps_slider,
                guidance_scale_slider,
                seed_input
            ],
            outputs=[
                video_output,
                generation_info,
                generation_error
            ]
        )

    return demo

def main():
    """Main function to launch the application"""
    print(f"🚀 Starting {UI_CONFIG['title']}...")
    print(f"🔧 Server configuration: {SERVER_CONFIG['host']}:{SERVER_CONFIG['port']}")

    # Check GPU availability
    if torch.cuda.is_available():
        print(f"🎮 GPU detected: {torch.cuda.get_device_name()}")
        print(f"💾 GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f}GB")
    else:
        print("💻 Running on CPU (GPU recommended for better performance)")

    # Create interface and enable the event queue to support multiple users.
    demo = create_interface()
    # Hugging Face Spaces expect `.queue()` to be called for handling request concurrency.
    # Limiting the concurrency to 1 helps prevent excessive memory usage on CPU-only hardware.
    # (Gradio 4.x uses `default_concurrency_limit`; the older `concurrency_count` argument was removed.)
    demo = demo.queue(default_concurrency_limit=1)

    # Launch the interface. (`show_tips` was dropped in Gradio 4.x, so it is not passed here.)
    demo.launch(
        server_name=SERVER_CONFIG["host"],
        server_port=SERVER_CONFIG["port"],
        share=SERVER_CONFIG["share"],
        show_error=True
    )

if __name__ == "__main__":
    main()
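
The event wiring above can be exercised without downloading any weights by stubbing the handler. The snippet below is a hypothetical local smoke test, not part of the commit: the stub, the fake file path, and the monkeypatching are assumptions used purely to illustrate how `generate_video_interface` routes the planned prompt into `model_handler` and maps a successful result onto the three output components.

# smoke_test_app.py - illustrative only, not part of the upload
from unittest import mock

import app  # the module defined in app.py above

def fake_generate_video(prompt, negative_prompt, **kwargs):
    # Pretend generation succeeded and echo back the planner-refined prompt.
    return True, "/tmp/fake.mp4", "", f"prompt used: {prompt}"

with mock.patch.object(app.model_handler, "generate_video", side_effect=fake_generate_video):
    video_upd, info_upd, err_upd = app.generate_video_interface(
        prompt="a pig in a winter forest",
        negative_prompt="",
        width=720, height=480,
        num_frames=57, num_inference_steps=24,
        guidance_scale=2.5, seed=0,
    )
    # video_upd should carry the fake path, info_upd the refined prompt, err_upd stays hidden.
    print(video_upd, info_upd, err_upd)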
config.py ADDED
@@ -0,0 +1,51 @@
"""
Configuration file for WAN-VACE video generation application
"""
import os

# Hugging Face token (must be set as environment variable)
HF_TOKEN = os.getenv("HF_TOKEN")

# Model paths and configurations
MODEL_CONFIG = {
    "transformer_path": "https://huggingface.co/calcuis/wan-gguf/blob/main/wan2.1-v5-vace-1.3b-q4_0.gguf",
    "text_encoder_path": "chatpig/umt5xxl-encoder-gguf",
    "text_encoder_file": "umt5xxl-encoder-q4_0.gguf",
    "vae_path": "callgg/wan-decoder",
    "pipeline_path": "callgg/wan-decoder"
}

# Default generation parameters
DEFAULT_PARAMS = {
    "width": 720,
    "height": 480,
    "num_frames": 57,
    "num_inference_steps": 24,
    "guidance_scale": 2.5,
    "conditioning_scale": 0.0,
    "fps": 16,
    "flow_shift": 3.0
}

# UI configuration
#
# The title and description here emphasise the agentic nature of the app:
# you provide a concept and the system plans the prompts for you. Feel free
# to adjust these strings to suit your needs or branding.
UI_CONFIG = {
    "title": "🎬 Agentic WAN-VACE Video Generation",
    "description": (
        "Generate high-quality videos from simple concepts. "
        "Provide a short description of what you want to see, and the agent "
        "will craft a refined prompt and negative prompt before generating a cinematic "
        "vertical video using the WAN-VACE model."
    ),
    "theme": "default"
}

# Server configuration
SERVER_CONFIG = {
    "host": "0.0.0.0",
    "port": 5000,
    "share": False
}
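
Note that the host and port are hard-coded. If the hosting platform dictates the port (Gradio itself honours `GRADIO_SERVER_PORT`, and many platforms export `PORT`), an optional variation is to let the environment override the block. The variable names below are assumptions about the target environment, not something the commit relies on:

# Optional variation of SERVER_CONFIG: read overrides from the environment.
SERVER_CONFIG = {
    "host": os.getenv("SERVER_HOST", "0.0.0.0"),
    # Fall back to 5000 when the platform does not provide a port.
    "port": int(os.getenv("GRADIO_SERVER_PORT", os.getenv("PORT", "5000"))),
    "share": os.getenv("GRADIO_SHARE", "false").lower() == "true",
}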
model_handler.py ADDED
@@ -0,0 +1,209 @@
"""
Model handler for WAN-VACE video generation
"""
import torch
import time
from typing import Optional, Tuple, Any
from transformers import UMT5EncoderModel
from diffusers import AutoencoderKLWan, WanVACEPipeline, WanVACETransformer3DModel, GGUFQuantizationConfig
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from diffusers.utils import export_to_video
from huggingface_hub import login
import gradio as gr

from config import MODEL_CONFIG, DEFAULT_PARAMS, HF_TOKEN
import os
from utils import create_temp_video_path, validate_generation_params, validate_prompt, format_generation_info

class WanVACEModelHandler:
    """Handler for WAN-VACE model loading and video generation"""

    def __init__(self):
        self.pipe = None
        self.is_loaded = False
        self.loading_progress = 0

    def login_hf(self) -> bool:
        """Login to Hugging Face"""
        try:
            login(token=HF_TOKEN)
            return True
        except Exception as e:
            print(f"Warning: Could not login to Hugging Face: {e}")
            return False

    def load_model(self, progress_callback=None) -> Tuple[bool, str]:
        """Load the WAN-VACE model components"""
        try:
            # Login to HF
            self.login_hf()

            if progress_callback:
                progress_callback(0.1, "Loading transformer model...")

            # Determine the desired dtype for CPU/GPU execution.
            # Hugging Face Spaces often run on CPU, where bfloat16 may not be supported.
            # Allow the dtype to be configured via the WAN_DTYPE environment variable.
            # Supported values: "bfloat16" (default) or "float32".
            dtype_str = os.getenv("WAN_DTYPE", "bfloat16").lower()
            # Use bfloat16 only if requested; otherwise fall back to float32.
            compute_dtype = torch.bfloat16 if dtype_str == "bfloat16" else torch.float32
            # Likewise for the torch dtype used when loading weights.
            torch_dtype = compute_dtype

            # Load transformer
            transformer = WanVACETransformer3DModel.from_single_file(
                MODEL_CONFIG["transformer_path"],
                quantization_config=GGUFQuantizationConfig(compute_dtype=compute_dtype),
                torch_dtype=torch_dtype,
            )

            if progress_callback:
                progress_callback(0.4, "Loading text encoder...")

            # Load text encoder
            text_encoder = UMT5EncoderModel.from_pretrained(
                MODEL_CONFIG["text_encoder_path"],
                gguf_file=MODEL_CONFIG["text_encoder_file"],
                torch_dtype=torch_dtype,
            )

            if progress_callback:
                progress_callback(0.7, "Loading VAE...")

            # Load VAE
            vae = AutoencoderKLWan.from_pretrained(
                MODEL_CONFIG["vae_path"],
                subfolder="vae",
                torch_dtype=torch.float32
            )

            if progress_callback:
                progress_callback(0.9, "Assembling pipeline...")

            # Create pipeline
            self.pipe = WanVACEPipeline.from_pretrained(
                MODEL_CONFIG["pipeline_path"],
                transformer=transformer,
                text_encoder=text_encoder,
                vae=vae,
                torch_dtype=torch_dtype
            )

            # Configure scheduler
            flow_shift = DEFAULT_PARAMS["flow_shift"]
            self.pipe.scheduler = UniPCMultistepScheduler.from_config(
                self.pipe.scheduler.config,
                flow_shift=flow_shift
            )

            # Enable optimizations
            self.pipe.enable_model_cpu_offload()
            self.pipe.vae.enable_tiling()

            self.is_loaded = True

            if progress_callback:
                progress_callback(1.0, "Model loaded successfully!")

            return True, "Model loaded successfully!"

        except Exception as e:
            error_msg = f"Error loading model: {str(e)}"
            if progress_callback:
                progress_callback(0, error_msg)
            return False, error_msg

    def generate_video(
        self,
        prompt: str,
        negative_prompt: str = "",
        width: int = DEFAULT_PARAMS["width"],
        height: int = DEFAULT_PARAMS["height"],
        num_frames: int = DEFAULT_PARAMS["num_frames"],
        num_inference_steps: int = DEFAULT_PARAMS["num_inference_steps"],
        guidance_scale: float = DEFAULT_PARAMS["guidance_scale"],
        seed: Optional[int] = None,
        progress_callback=None
    ) -> Tuple[bool, str, str, str]:
        """
        Generate video from text prompt
        Returns: (success, video_path, error_message, generation_info)
        """

        if not self.is_loaded:
            return False, "", "Model not loaded. Please load the model first.", ""

        # Validate inputs
        prompt_valid, prompt_error = validate_prompt(prompt)
        if not prompt_valid:
            return False, "", prompt_error or "Invalid prompt", ""

        params_valid, params_error = validate_generation_params(
            width, height, num_frames, num_inference_steps, guidance_scale
        )
        if not params_valid:
            return False, "", params_error or "Invalid parameters", ""

        try:
            if progress_callback:
                progress_callback(0.1, "Preparing generation...")

            # Check if pipeline is loaded
            if self.pipe is None:
                return False, "", "Pipeline not initialized. Please load the model first.", ""

            # Set up generator with seed
            generator = torch.Generator()
            if seed is not None:
                generator.manual_seed(seed)
            else:
                generator.manual_seed(0)  # Default seed

            if progress_callback:
                progress_callback(0.2, "Starting video generation...")

            start_time = time.time()

            # Generate video
            output = self.pipe(
                prompt=prompt,
                negative_prompt=negative_prompt if negative_prompt else None,
                width=width,
                height=height,
                num_frames=num_frames,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                conditioning_scale=DEFAULT_PARAMS["conditioning_scale"],
                generator=generator,
            ).frames[0]

            if progress_callback:
                progress_callback(0.8, "Exporting video...")

            # Export to video file
            output_path = create_temp_video_path()
            export_to_video(output, output_path, fps=DEFAULT_PARAMS["fps"])

            generation_time = time.time() - start_time

            if progress_callback:
                progress_callback(1.0, "Video generation complete!")

            # Format generation info
            gen_info = format_generation_info(
                prompt, negative_prompt, width, height, num_frames,
                num_inference_steps, guidance_scale, generation_time
            )

            return True, output_path, "", gen_info

        except Exception as e:
            error_msg = f"Error during video generation: {str(e)}"
            if progress_callback:
                progress_callback(0, error_msg)
            return False, "", error_msg, ""

# Global model handler instance
model_handler = WanVACEModelHandler()
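
Because the handler is a module-level singleton, it can also be driven headlessly, for example as a quick sanity check before wiring it into the Gradio UI. The sketch below is illustrative only: the prompt and the reduced frame/step counts are arbitrary choices, and `load_model()` will download several gigabytes of checkpoints on first use.

# Hypothetical headless usage of the global handler (not part of the commit).
from model_handler import model_handler
from utils import cleanup_temp_files

ok, msg = model_handler.load_model(progress_callback=lambda value, message: print(f"{value:.0%} {message}"))
if ok:
    success, path, err, info = model_handler.generate_video(
        prompt="a red fox running through snow, cinematic, tracking shot",
        num_frames=25,          # fewer frames for a quick test
        num_inference_steps=8,  # low step count to keep CPU runtime manageable
        seed=42,
    )
    print(info if success else err)
    # cleanup_temp_files(path)  # remove the temporary file once it has been copied elsewhere
else:
    print(msg)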
planning.py ADDED
@@ -0,0 +1,60 @@
"""
Planning utilities for the agentic WAN‑VACE video generator.

This module defines a simple planner that takes a high‑level concept or topic and
returns a refined text prompt and a recommended negative prompt. The planner
adds cinematic and visual descriptors to the concept to encourage more
engaging video outputs and recommends a default negative prompt to avoid
common artifacts and low‑quality renderings.

The planner can be replaced or extended with more sophisticated logic or local
LLMs if desired.
"""

from dataclasses import dataclass
from typing import Tuple


@dataclass
class Plan:
    """A dataclass representing a planned prompt and negative prompt."""
    prompt: str
    negative_prompt: str


def plan_from_topic(topic: str) -> Plan:
    """
    Generate a refined prompt and a recommended negative prompt from a high‑level topic.

    The refined prompt enriches the user's concept with cinematic descriptors and
    details that tend to produce appealing vertical videos. The negative prompt
    includes terms that discourage common undesirable artifacts.

    Parameters
    ----------
    topic: str
        A short description of what the user wants in the video.

    Returns
    -------
    Plan
        An object containing a refined prompt and a negative prompt.
    """
    # Base descriptors to enrich the concept. These tokens help guide the model
    # towards vibrant, cinematic compositions. You can customise these tokens
    # depending on your aesthetic preferences.
    base_descriptors = (
        "cinematic, dynamic motion, rich details, warm lighting, volumetric lighting, "
        "bokeh, warm sun rim light, tracking shot, shallow depth of field, vertical 9:16"
    )
    # Compose the refined prompt
    refined_prompt = f"{topic}, {base_descriptors}"

    # Recommended negative prompt to avoid low‑quality outputs. Users can
    # override this by supplying their own negative prompt.
    recommended_negative = (
        "blurry, lowres, artifacts, distorted anatomy, dull colors, washed out, "
        "overexposed, underexposed, jitter, bad compression"
    )

    return Plan(prompt=refined_prompt, negative_prompt=recommended_negative)
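
For reference, the planner is a pure function, so its behaviour is easy to inspect. The snippet below (illustrative only, not part of the commit) shows the kind of prompt pair it returns for a short concept:

from planning import plan_from_topic

plan = plan_from_topic("a busy city street at dawn")
print(plan.prompt)
# a busy city street at dawn, cinematic, dynamic motion, rich details, warm lighting, ...
print(plan.negative_prompt)
# blurry, lowres, artifacts, distorted anatomy, dull colors, ...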
requirements.txt ADDED
@@ -0,0 +1,19 @@
# Requirements for running the WAN‑VACE Gradio application on Hugging Face Spaces

# Core inference libraries
torch==2.2.*              # PyTorch CPU build; GPU is not available in most Spaces
transformers>=4.42.0
diffusers>=0.34.0         # WanVACEPipeline/WanVACETransformer3DModel are not available in older releases
gguf>=0.10.0              # required by diffusers/transformers to read the GGUF checkpoints
accelerate>=0.31.0
safetensors>=0.4.0
huggingface_hub>=0.21.0

# Application and interface libraries
gradio>=4.0.0
opencv-python-headless>=4.8.0
numpy>=1.24.0
Pillow>=10.0.0

# The following line pins the Torch CPU wheel source for Linux systems.
# It is optional but recommended to avoid downloading GPU wheels on CPU-only hardware.
-f https://download.pytorch.org/whl/cpu
utils.py ADDED
@@ -0,0 +1,102 @@
"""
Utility functions for video processing and file handling
"""
import os
import tempfile
import uuid
from typing import Optional, Tuple
import torch
import numpy as np
from PIL import Image

def create_temp_video_path(extension: str = "mp4") -> str:
    """Create a temporary file path for video output"""
    temp_dir = tempfile.gettempdir()
    unique_id = str(uuid.uuid4())
    return os.path.join(temp_dir, f"video_{unique_id}.{extension}")

def validate_generation_params(
    width: int,
    height: int,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float
) -> Tuple[bool, Optional[str]]:
    """Validate video generation parameters"""

    # Check width and height
    if width < 64 or width > 1920:
        return False, "Width must be between 64 and 1920 pixels"
    if height < 64 or height > 1080:
        return False, "Height must be between 64 and 1080 pixels"

    # Check if dimensions are divisible by 8 (common requirement for video models)
    if width % 8 != 0:
        return False, "Width must be divisible by 8"
    if height % 8 != 0:
        return False, "Height must be divisible by 8"

    # Check frames
    if num_frames < 1 or num_frames > 200:
        return False, "Number of frames must be between 1 and 200"

    # Check inference steps
    if num_inference_steps < 1 or num_inference_steps > 100:
        return False, "Number of inference steps must be between 1 and 100"

    # Check guidance scale
    if guidance_scale < 0 or guidance_scale > 20:
        return False, "Guidance scale must be between 0 and 20"

    return True, None

def validate_prompt(prompt: str) -> Tuple[bool, Optional[str]]:
    """Validate the input prompt"""
    if not prompt or len(prompt.strip()) == 0:
        return False, "Prompt cannot be empty"

    if len(prompt) > 1000:
        return False, "Prompt must be less than 1000 characters"

    return True, None

def get_memory_usage() -> str:
    """Get current GPU memory usage if available"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1024**3  # Convert to GB
        cached = torch.cuda.memory_reserved() / 1024**3
        return f"GPU Memory - Allocated: {allocated:.2f}GB, Cached: {cached:.2f}GB"
    else:
        return "GPU not available"

def cleanup_temp_files(file_path: str) -> None:
    """Clean up temporary files"""
    try:
        if os.path.exists(file_path):
            os.remove(file_path)
    except Exception as e:
        print(f"Warning: Could not remove temporary file {file_path}: {e}")

def format_generation_info(
    prompt: str,
    negative_prompt: str,
    width: int,
    height: int,
    num_frames: int,
    num_inference_steps: int,
    guidance_scale: float,
    generation_time: float
) -> str:
    """Format generation information for display"""
    info = f"""
**Generation Details:**
- **Prompt:** {prompt}
- **Negative Prompt:** {negative_prompt if negative_prompt else "None"}
- **Dimensions:** {width}x{height}
- **Frames:** {num_frames}
- **Inference Steps:** {num_inference_steps}
- **Guidance Scale:** {guidance_scale}
- **Generation Time:** {generation_time:.2f} seconds
- **Memory Usage:** {get_memory_usage()}
"""
    return info
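
The validators above return a `(bool, Optional[str])` pair rather than raising, which is what lets `model_handler.generate_video` surface the message directly in the UI. A brief illustration (not part of the commit):

from utils import validate_generation_params, validate_prompt, create_temp_video_path

print(validate_generation_params(720, 480, 57, 24, 2.5))   # (True, None)
print(validate_generation_params(721, 480, 57, 24, 2.5))   # (False, 'Width must be divisible by 8')
print(validate_prompt(""))                                  # (False, 'Prompt cannot be empty')
print(create_temp_video_path())                             # e.g. /tmp/video_<uuid>.mp4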