# ScriptAgent / app.py
# XD-MU's picture
# Update app.py
# 71bc2ba verified
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.models.qwen2_5_omni import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
import warnings
import os
import time
import re
import base64
import datetime
import uuid
import logging
from typing import List, Dict, Tuple, Optional
from PIL import Image
from huggingface_hub import snapshot_download
from swift.llm import PtEngine, RequestConfig, InferRequest
# --- Optional dependency check ---
# Every optional name is pre-bound to None so that later `is None` guards can
# never hit a NameError, and each library is imported on its own so that one
# missing package does not disable the features backed by the others.
cv2 = None
VideoFileClip = None
concatenate_videoclips = None
OpenAI = None
genai = None
types = None

_missing_deps = []

try:
    import cv2
except ImportError as e:
    _missing_deps.append(e)

try:
    from moviepy.editor import VideoFileClip, concatenate_videoclips
except ImportError as e:
    _missing_deps.append(e)

try:
    from openai import OpenAI
except ImportError as e:
    _missing_deps.append(e)

try:
    from google import genai
    from google.genai import types
except ImportError as e:
    # A partial import must not leave `genai` usable without `types`.
    genai = None
    types = None
    _missing_deps.append(e)

if _missing_deps:
    for _dep_error in _missing_deps:
        print(f"❌ 缺少必要库: {_dep_error}")
    print("请运行: pip install opencv-python moviepy openai google-genai")
# --- Environment setup ---
# These variables are exported before any model is loaded. Their consumers are
# not visible in this file (presumably the multimodal preprocessing stack —
# TODO confirm).
os.environ.update({
    "ENABLE_AUDIO_OUTPUT": "0",
    "VIDEO_TOTAL_PIXELS": "0",
    "IMAGE_FACTOR": "1",
    "MAX_PIXELS": "1024",  # lowered to the minimum
    "PYTHONWARNINGS": "ignore",
})
# Silence Python warnings inside this process as well.
warnings.filterwarnings("ignore")
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
LOGGER = logging.getLogger(__name__)
# ==========================================
# PART 0: 配置常量 & Demo 数据
# ==========================================
# 1. Per-model generation options: the selectable output sizes, the numeric
#    range settings for the per-shot duration, and the label shown for it
#    (presumably fed to a UI slider — the consumer is not visible here).
#    The two Sora variants differ only in the name embedded in their label.
MODEL_CONFIGS = {
    **{
        model_id: {
            "sizes": ["1792x1024", "1024x1792", "1280x720", "720x1280"],
            "seconds_range": {"minimum": 4, "maximum": 12, "step": 4, "value": 4},
            "seconds_label": f"单镜时长 ({display_name}: 4/8/12秒)",
        }
        for model_id, display_name in (("sora-2", "Sora"), ("sora-2-pro", "Sora Pro"))
    },
    "veo-3.1": {
        "sizes": ["1080p", "720p"],
        "seconds_range": {"minimum": 4, "maximum": 8, "step": 2, "value": 4},
        "seconds_label": "单镜时长 (Veo: 4/6/8秒)",
    },
}
# 2. Prompt fragments and visual styles.
# CONTINUITY_PROMPT is a fixed instruction about keeping style, characters and
# positions consistent; it is assembled from three sentences.
CONTINUITY_PROMPT = "".join([
    "保持统一的视觉风格与世界观,场景与光影保持稳定,角色服装、发型、体型与表情连贯,仅根据剧情调整动作;",
    "如果有参考图片,请严格保持人物形象与参考图一致,人物站位不得变化,镜头衔接需流畅自然。",
    "旁白不需要朗读或配音,仅作为剧情提示使用。要求视频生成的最后一帧要展示所有人物的正面形象和此时的站位。",
])
# Maps each selectable style name to its prompt text. Every prompt shares the
# same leading phrase, so only the style-specific remainder is listed.
STYLE_PROMPTS = {
    style_name: "整体画面要求:" + requirement
    for style_name, requirement in (
        ("Anime (二次元)", "高质量二次元动漫渲染风格,角色为手绘动漫人物,肤色与材质为动画质感,背景为虚构的动画场景;禁止出现写实/真人或真实摄影元素。"),
        ("Realistic (写实)", "高写实摄影风格,人物与环境光影细节丰富,材质与质感贴近真实世界,禁止出现卡通或夸张笔触,确保色彩与光线符合真实物理规律。"),
        ("Animated (动画/3D)", "动画/卡通风格,支持二维或三维渲染,人物线条与轮廓清晰,色彩饱和且富有层次,可适当夸张动作与表情。"),
        ("Painterly (艺术/绘画)", "艺术绘画风格,可呈现厚重笔触或水彩晕染质感,允许保留艺术性的纹理与笔法痕迹,整体色彩与构图需统一。"),
        ("Abstract (抽象/实验)", "抽象/实验风格,鼓励运用超现实、故障艺术或非传统构图手法,可打破写实规律,突出视觉冲击力与创意表现。"),
    )
}
# Style names in declaration order.
STYLE_KEYS = list(STYLE_PROMPTS)
# 3. Demo cases: each entry gives a demo video file name, a display title, and its script text.
DEMO_DATA = [
{
"file": "demo1.mp4",
"title": "案例 1",
"script": """
【Dialogue】:
1. [0 seconds - 9 seconds] (Camera Movement: Handheld Camera Effect + Shot Type: Long Shot - Medium Close-up - Long Shot)
Storyline: In a long shot, Su Luo paces anxiously back and forth in a forest clearing. The protagonist stands a short distance away, quietly watching her. The camera zooms in, switching to a handheld medium close-up, closely following Su Luo. She scratches her hair in frustration, stops, kicks a pebble at her feet, and mutters to herself, "I really shouldn't have agreed to Qin Fei's request. How can I find all three time capsules in such a short time?" She then stomps her foot in annoyance. The scene cuts back to a long shot, making the distance between the two obvious.
2. [9 seconds - 18 seconds] (Camera Movement: Shallow Depth of Field + Shot Type: Long Shot - Medium Shot - Long Shot)
Storyline: In a long shot, the protagonist takes a few steps closer to Su Luo. The camera switches to a medium shot over the protagonist's shoulder, focusing on Su Luo, with the background blurred. The protagonist asks with concern, "What is a time capsule?" Su Luo turns her head at the sound of her voice, her annoyance temporarily replaced by professionalism. She raises her hand to explain, "It's a time capsule controlled by cloud threads, a mechanism that counts down." The scene cuts back to a wide shot, where the protagonist is now standing in front of Su Luo.
3. [18 seconds - 26 seconds] (Camera movement: Tilt Shot + Shot type: Wide shot - Close-up - Wide shot) Plot: In the wide shot, Su Luo continues to explain to the protagonist. The camera cuts to a close-up of Su Luo, her eyes focused, her tone becoming scholarly: "Because it can only be opened at a designated time, it's called a time capsule." The camera tilts downwards, panning across her gestures simulating opening and closing the capsule, then returns to her serious face. The scene cuts back to a wide shot, Su Luo's explanation has ended.
4. [26 seconds - 36 seconds] (Camera Type: Arc Shot + Shot Type: Long Shot - Medium Close-up - Long Shot) Plot: In the long shot, Su Luo's body language becomes exaggerated. The camera switches to a medium close-up arc shot circling Su Luo. She puts one hand on her hip, recalling the scene, her tone shifting from explanation to a slightly smug complaint: "Ugh, that guy Qin Fei suddenly said he buried three time capsules containing precious treasures, thinking I definitely wouldn't find them." She raises her chin, mimicking her past self, proudly patting her chest: "As a connoisseur, I said on the spot, 'Hmph, what is there that I, Su Luo, can't find? I'll find them all for you in less than an hour.'" The scene cuts back to the long shot, Su Luo still maintaining her proud pose.
5. [36seconds - 44 seconds] (Camera Type: Shaky Cam + Shot Type: Long Shot - Medium Shot - Long Shot) Plot: In the long shot, the protagonist seems to say something, and Su Luo's posture instantly collapses. The camera shifts to a medium shot of the protagonist, who, with a slightly mocking smile, gently shakes their head: "Uh, judging by your eagerness, you haven't achieved your goal yet, have you?" The focus remains the same, but Su Luo can be seen in the background; her proud posture instantly crumbles, her shoulders slump like a punctured balloon. Slight camera shake hints at Su Luo's inner turmoil. The scene cuts back to a long shot, where Su Luo lowers her head.
6. [44 seconds - 53 seconds] (Camera movement: Panning Shot + Shot type: Long shot - Close-up - Long shot) Plot: In the long shot, Su Luo appears somewhat dejected. The camera cuts to a close-up of Su Luo, who awkwardly avoids the protagonist's gaze, fidgeting with the hem of her clothes as she whispers an explanation: "Who knew that guy would be so cunning this time? The boxes were so far apart! Even though he cheated, I don't want to lose face..." Before she can finish, she suddenly looks up, and the camera pans to capture her pleading gaze at the protagonist: "Can you help me?" The scene cuts back to a wide shot, their eyes meeting.
7. [53 seconds - 62 seconds] (Camera movement: Crash Zoom + Shot type: Wide shot - Medium shot - Wide shot) Plot: In the wide shot, the protagonist smiles and nods. The camera quickly zooms in on Su Luo, a rapid medium zoom close-up capturing the fleeting surprise and relief on her face. She immediately perked up, excitedly exclaiming, "Great!" Then, she pointed decisively in two different directions into the distance, "I'll leave those two to you," she said, then pounded her chest confidently, "I'll handle the last one myself." The scene cuts back to a distant view; Su Luo has regained her composure and is ready to set off.
【Character Profile】: Su Luo: Medium height/light and agile physique/clear facial features, with large eyes that can instantly switch emotions, sometimes focused and sharp, sometimes cunning and smug/wears a slightly messy high ponytail with vibrant ash brown hair/wears a short jacket and cotton shirt for ease of movement, a tool belt around her waist, and trousers tucked into sturdy hiking boots/extroverted temperament, with rapidly changing emotions, from pacing restlessly and proudly with hands on hips to pleading for help in embarrassment; her body language is rich, and she's an action-oriented person who can't hide her feelings.
Main Character: Above average height / Tall and composed posture / Soft facial features, gentle yet sharp eyes, often with a subtle smile / Short, neatly trimmed dark hair / Wears a dark long-sleeved shirt and durable travel trousers, well-fitting and easy to move in, with a simple and understated style / Reserved and calm demeanor, an excellent observer and listener, responding primarily with subtle gestures like nods and smiles, a stark contrast to Su Luo's liveliness.
【Scene Description】: A dappled sunlight-dappled clearing in the woods in the afternoon. The atmosphere gradually shifts from one person's anxiety and the other's calm observation to a relaxed, cooperative one. The scene is open, with pebbles and fallen leaves scattered on the ground, surrounded by tranquil woods.
【Positioning】:
1. Su Luo moves back and forth in the center of the clearing, while the main character stands at the edge, a considerable distance away.
2. The main character walks towards Su Luo, and eventually the two stand face-to-face, about two or three steps apart.
3. Maintaining this close proximity, Su Luo faces the main character and begins to explain. 4. The two remain in the same position, with Su Luo turning towards the protagonist in a slightly theatrical manner.
5. The two remain in the same position, with the protagonist facing Su Luo, while Su Luo lowers her head and turns to the side, avoiding the protagonist's gaze.
6. Su Luo changes her gaze from avoiding the protagonist to looking directly at her, and the two are face to face again, their eyes meeting.
7. The two remain in the same position, with Su Luo briefly facing the protagonist, then turning her body to the side and pointing into the distance.
"""
},
{
"file": "demo2.mp4",
"title": "案例 2",
"script": """【对话】:1. [0-8秒](运镜类型:Handheld Camera Effect+全景镜头、中景镜头)
故事情节:远景镜头,幽暗的遗迹室内,蒋风正俯身在一块散发着微光的古代石碑(指引图)前。主角从阴影中走出,站定在他身后几步远。镜头切换为手持拍摄的中景镜头,跟随主角的视线,画面有轻微晃动,聚焦在主角锐利的眼神上。主角双臂环抱,带着审视的口吻质问:"你在做什么?指引图是不能随便篡改的。"声音打破了室内的寂静。结尾回到远景,主角保持质问的姿态,蒋风的背影僵住。
2. [8-16秒](运镜类型:Arc Shot+全景镜头、中景镜头)
故事情节:远景镜头,蒋风缓缓转过身。镜头以一个平滑的弧度围绕蒋风移动,切换为中景镜头。他看到主角时明显一愣,双手下意识地抬起,掌心向前,做出一个无辜且防御的姿态,眼神慌乱地解释:"篡改指引图?不不不,你误会了。"他的表情诚恳又急切。结尾远景,两人对峙,气氛紧张。
3. [16-24秒](运镜类型:Shallow Depth of Field+全景镜头、中近景镜头)
故事情节:远景镜头,蒋风放下了手,姿态变得谦卑。镜头切换为中近景,焦点落在蒋风身上,他略带窘迫地笑了笑,背景中的主角身影变得模糊。他一边说一边用手比划着自己:"我才加入风物家没多久,哪有这个本事能篡改它。"结尾回到远景,蒋风仍在解释,主角静静地听着,没有打断。
4. [24-33秒](运镜类型:Tilt Shot+全景镜头、特写镜头)
故事情节:远景镜头,蒋风再次转向指引图。镜头给到蒋风的中近景,他伸出手指,小心翼翼地指向石碑上的一个发光符文,但并未触碰:"我只是想查看指引图上的身份印鉴。"镜头向下倾斜,给到他手指所指之处的符文一个特写,符文复杂而古老。主角的声音从画外传来,带着一丝疑惑:"身份印鉴?"结尾回到远景,主角微微探身,视线也落在了那个符文上。
5. [33-42秒](运镜类型:Panning Shot+全景镜头、近景镜头)
故事情节:远景镜头,两人都注视着指引图。镜头切换为近景,从蒋风的侧脸开始,他温和地解释着:"嗯,就是一种类似签名的东西。"镜头缓缓横移,扫过石碑上更多类似签名的印鉴,光芒流转。他的声音变得低沉而充满怀念:"在考古界,早期开荒的人员有权在指引图上留下自己的名字,我们称之为身份印鉴。"镜头移回,定格在他充满希冀的眼神上:"我想看看这些指引图上有没有我父亲的名字。"结尾回到远景,整个房间的氛围因这番话而悄然改变。
6. [42-51秒](运镜类型:Lens Flare+全景镜头、中景镜头)
故事情节:远景镜头,蒋风垂下目光。镜头切换为中景,他背对着石碑,仿佛陷入了久远的回忆,一道柔和的镜头光晕扫过画面,他眼神飘向远方,带着一丝不易察觉的落寞:"我父亲是主攻考古的风物家,但他常年在外勘察……我已经很久很久没见到他了。"结尾回到远景,主角的注意力已经完全从石碑转移到了蒋风身上。
7. [51-59秒](运镜类型:Deep Depth of Field+全景镜头、中近景镜头)
故事情节:远景镜头,蒋风转过头,重新看向主角。镜头切为中近景,景深拉远,我们能清晰看到前景中蒋风努力挤出一个微笑,眼神却流露着不确定,以及背景里主角严肃倾听的轮廓。蒋风说:"母亲说稷下不少的开荒考古是他完成的,我想看看是不是他真的来过。"结尾回到远景,蒋风的微笑显得有些无力。
8. [59-67秒](运镜类型:Shallow Depth of Field+全景镜头、特写镜头)
故事情节:远景镜头,室内一片沉寂。镜头推进到蒋风脸部的特写,极浅的景深模糊了周围的一切,只剩下他复杂的表情。他的笑容消失了,嘴唇微微颤抖:"虽然我相信母亲不会骗我,但……"他停顿了一下,低下头,用几不可闻的声音说出心底的委屈,"哪有人经常在外不回家的。"结尾远景,蒋风低着头,肩膀微微垮下。
9. [67-74秒](运镜类型:Shaky Cam+全景镜头、中近景镜头)
故事情节:远景镜头,主角打破了沉默。镜头切换为中近景,聚焦在主角身上,轻微的镜头晃动暗示着他内心的触动。他原本锐利的眼神已经完全柔和下来,取而代之的是理解与同情。他轻声问道:"那你找到答案了吗?"结尾远景,听到问话,蒋风缓缓抬起头。
【人物形象】:主角:身形挺拔,体态匀称有力,面部轮廓分明,眼神锐利如鹰。留着一头便于打理的深色短发,发丝间或夹杂风霜痕迹。身着深色调、材质耐磨的探险服,肩部和肘部有皮革补丁,腰间挂着若干实用工具包。气质沉稳老练,初期动作多为双臂环抱的审视姿态,后期眼神转为柔和,流露同情与无奈,是一位经验丰富、外冷内热的行动派。
蒋风:身高略低于主角,体态偏瘦,书生气较重,面部线条柔和,眼神清澈但时常流露慌乱与不确定。发型是略显蓬乱的黑色中短发,似乎无暇打理。穿着一身崭新的"风物家"制服,款式简洁但略显宽大,与身形不甚贴合。气质真诚而笨拙,常有抬手、低头、窘迫微笑等下意识动作,在提及父亲时,会从紧张转为充满希冀与感伤的脆弱,是一位涉世未深的年轻后辈。
【场景描述】:幽暗的古代遗迹室内,唯一的稳定光源来自一块散发着微光的石碑指引图,石壁上刻有古老符文。场景氛围从初始的紧张对峙,随着角色对话的深入,逐渐转变为充满感伤与理解的静谧与私密。
【站位】:1:主角站在蒋风身后几步远处,蒋风俯身于石碑前。
2:蒋风完全转过身,与主角正面相对,形成对峙。
3:两人保持面对面的站位,距离不变。
4:蒋风转身面向石碑,主角在其侧后方,视线投向石碑。
5:两人大致并排,共同注视着石碑。
6:蒋风背对石碑,面向空旷处;主角从侧面注视着蒋风。
7:蒋风转身,再次与主角面对面站立。
8:两人位置不变,蒋风低头,避开主角视线。
9:蒋风抬头,与主角视线交汇,维持原有距离。
"""
},
{
"file": "demo3.mp4",
"title": "案例 3",
"script": """
【Dialogue】: 1. [0-8 seconds] (Handheld Camera Effect + Wide Shot + Medium Close-up) A wide shot shows the protagonist and Meng Ya standing on a dusty open ground, while Yuan Xue and Zhao Dongxu lean against a tree not far away. The camera zooms in on Yuan Xue and the protagonist in a medium close-up. Yuan Xue is pale and ragged. She wipes the blood from her forehead with her sleeve, looking at the protagonist with fear and gratitude: "Thank you so much! Luckily you arrived in time! Otherwise, we would be a barbecue feast by now..." Upon hearing this, the protagonist's expression turns serious, his sharp eyes scanning the charred marks behind her. The camera pulls back to a wide shot, confirming the four people's positions.
2. [8-16 seconds] (Tilt Shot + Medium Shot) In a wide shot, the protagonist's gaze shifts from Yuan Xue, and he quickly walks to Zhao Dongxu's side. The camera shifts to a medium shot, starting with Zhao Dongxu's pained face. He grits his teeth, his face ashen, struggling to move his legs. The camera tilts downwards, finally focusing on his trembling right knee, which is supporting his body. He weakly whispers, "I can't feel my leg at all..." The camera pulls back to a long shot, showing the protagonist bending down to carefully examine Zhao Dongxu's injuries.
3. [16-24 seconds] (Panning Shot + Medium Shot) In the long shot, Meng Ya walks out from behind the protagonist. The camera cuts to a medium shot of Meng Ya, his arms crossed, brows furrowed, and he says impatiently to the protagonist, "What's wrong with you? What are you spacing out for?" He glances in the direction of Yuan Xue and Zhao Dongxu, then his gaze moves past the protagonist to the scorched earth, seemingly assessing the damage. The camera pulls back to a long shot, showing Meng Ya standing side-by-side with the protagonist, creating a sense of confrontation.
4. [24-33 seconds] (Shallow Depth of Field + Close-up) A long shot shows Meng Ya and the protagonist's positions. The camera quickly zooms in on Meng Ya, who becomes blurred in the background. He says with a hint of sarcasm, "If you're not strong enough, don't try to take on a top-tier commission! Come with us, back to the camp to get treatment from Bai Cao's family." Before he finishes speaking, the focus shifts to a close-up of the protagonist. He appears calm, but his gaze is unusually cool. He sniffs the air, confirming that Meng Ya's words have a hidden meaning. The camera pulls back to a long shot, and the protagonist slowly raises his head, looking directly at Meng Ya.
5. [33-41 seconds] (Arc Shot + Medium Close-up) A long shot creates a tense atmosphere. The camera begins a slow, arcing movement around the protagonist. Meng Ya's voice is like a ticking time bomb: "Understood. Let's split up then. I'll escort the wounded to the camp first, then come back to find you." In the moving shot, the protagonist's profile appears exceptionally resolute. He doesn't respond to Meng Ya, but instead warily scans his surroundings. Suddenly, his eyes sharpen, as if he's caught something unusual. The camera pulls back to a wide shot; the protagonist has turned completely in another direction.
6. [41-49 seconds] (Crash Zoom + Extreme Close-up) In the wide shot, the protagonist maintains a wary posture. He sniffs the air sharply, his pupils suddenly contracting. The camera instantly zooms in on his eyes, creating an extreme close-up, as if he's discerning the presence of danger: "And... I'm worried there might be other trapped students ahead. Perhaps we should go ahead and scout further." Plot: As soon as he finishes speaking, the camera quickly pulls back to a wide shot; we see him make up his mind and resolutely head towards the unknown danger ahead.
7. [49-57 seconds] (Shaky Cam + Medium-long Shot) In the long shot, Meng Ya is stunned. The camera switches to a handheld medium-long shot, the image shaking slightly with Meng Ya's reaction. He sighs helplessly, points towards the camp, as if to give instructions: "Understood. Let's split up then. I'll escort the wounded to the camp first, then come find you later." Although his tone is dissatisfied, his actions are decisive. The camera pulls back to the long shot, Meng Ya watches the protagonist turn around, then immediately turns and runs in the opposite direction.
8. [57-65 seconds] (Deep Depth of Field + Medium-long Shot) In the long shot, Meng Ya has already run dozens of meters. The camera is fixed on a mid-to-long shot with a large depth of field. In the foreground is a blurry silhouette of Meng Ya, who runs while turning back to shout, "Hey, don't go too far! Big Eye Owl will contact you later!" The background is very clear, with the protagonist continuing to walk forward without looking back, his expression unwavering. Plot: The two disappear rapidly into the distance in completely opposite directions. The camera eventually returns to the long shot, showing only...
a desolate land, no people left in the Frame.
【Character Appearance】: Yuan Xue: Approximately 165cm tall / Slender build, appearing extremely weak due to shock and injuries / Pale face, tattered clothes with burn marks, wiping blood from her forehead with her sleeve / Long, dark hair disheveled, mixed with dust and sweat / Eyes filled with lingering fear and gratitude, movements weak due to swaying body.
Main Character: Approximately 180cm tall / Well-proportioned and agile build, efficient movements / Well-defined facial features, serious and sharp eyes, with a wary expression when calmly observing / Short, dark hair, clean and neat / Wearing dark, durable tactical clothing, in good condition / Calm and decisive temperament, strong action ability, habitually using her sense of smell to perceive her environment.
Zhao Dongxu: Approximately 178cm tall / Slender build, appearing extremely weak after being injured / Pale face, facial features tense due to pain, obvious... Scarred/Short black hair, greasy with sweat and dust/Expression filled with pain, moving his lower limbs with difficulty, his movements trembling due to the injury/A resilient personality, but now submissive due to severe injuries, full of gratitude towards his comrades but unable to respond due to his physical helplessness.
Meng Ya: Approximately 182cm tall/Strong build, exuding power/Hard facial features, often frowning, his expression showing impatience with a hint of mockery/Short red hair, messy and with burn marks/Wearing a light combat vest and work pants, his arm muscles are clearly defined/Direct and fiery temperament, acting faster than thinking, possessing leadership but lacking patience with his comrades, once a decision is made, he will execute it immediately.
【Scene Description】: A desolate clearing after a disaster, the air filled with the dust and acrid smell of burning, the sun blazing, and traces of unextinguished flames still remaining on the ground, the overall atmosphere tense, oppressive, and uneasy.
【Positioning】: 1: The protagonist and Meng Ya stand side by side. 1. Yuan Xue and Zhao Dongxu lean against a tree in the distance, behind and to the side of the protagonist.
2. The protagonist walks from his original position towards Zhao Dongxu, bending down to check on him; the two are very close.
3. Meng Ya emerges from behind the protagonist, a few steps away, forming a confrontational stance with the two injured people.
4. Meng Ya stands side-by-side with the protagonist, facing each other, about an arm's length apart.
5. The protagonist turns around, his back to Meng Ya, scanning his surroundings; Meng Ya remains in place, observing him.
6. The protagonist remains still, turning his body completely forward, isolating himself from Meng Ya and the others behind him.
7. Meng Ya runs towards the distant camp; the protagonist remains in place, watching his retreating figure.
8. The protagonist moves forward, while Meng Ya runs in the opposite direction (away from the protagonist). The distance between them rapidly increases.
"""
},
{
"file": "demo4.mp4",
"title": "案例 4",
"script": """【对话】:1. [0-8秒] (Shallow Depth of Field+全景镜头、中近景镜头) 故事情节:远景展示公园小径上,白衫和主角并肩站立,一只胖乎乎的宠物"呱呱"趴在他们脚边的草地上气喘吁吁。镜头随即以浅景深推向白衫的中近景,他低头看着呱呱,脸上交织着无奈与宠溺,接着他抬眼望向主角,问道:"怎么样?呱呱有好好锻炼吗?" 结尾回到远景,主角正准备回答。
2. [8-17秒] (Handheld Camera Effect+全景镜头、中景镜头) 故事情节:远景中,三人位置不变。镜头切换为手持效果下的中景,画面随着白衫的动作有轻微晃动。他无奈地摊开手,叹了口气:"没办法,之前太惯着它了。" 他的视线落在懒洋洋翻了个身的呱呱身上,语气里满是无可奈何:"现在没有吃的半步也不挪。" 结尾远景,白衫轻轻摇头,主角在一旁安静地听着。
3. [17-26秒] (Arc Shot+全景镜头、中景镜头) 故事情节:远景展现整个场景。镜头开始围绕白衫进行弧线运动,他蹲下身,温柔地抚摸着呱呱的后背,语气放缓:"不过它今天起码完成了一点运动,晚上允许它多吃一点。" 镜头继续沿弧线转向一旁的主角,他看着这温情的一幕,微笑着点头附和:"今天运动的还不错。" 结尾远景,白衫蹲着,主角站着,形成一高一低的构图。
4. [26-34秒] (Shallow Depth of Field+全景镜头、近景镜头) 故事情节:远景中,白衫依旧蹲在呱呱身边,主角静立一旁。镜头切入白衫的近景,背景完全虚化。他抚摸呱呱的动作没停,但眼神中浮现出真切的忧虑,声音也低沉下来,充满了担忧:"我知道运动很辛苦,但呱呱真的太胖了,我很怕它胖得生病。" 结尾远景,能看到主角脸上的笑容也收敛了,神情变得严肃。
5. [34-42秒] (Tilt Shot+全景镜头、中景镜头) 故事情节:远景确认场景站位。镜头从中景开始,从地上心满意足地摇着尾巴的呱呱缓缓向上抬升,最终定格在刚站起身的白衫脸上。他脸上的忧虑一扫而空,转为一种故作爽朗的兴奋,对着呱呱大声宣布:"表现真不错。今晚它可以多吃一点!" 结尾远景,白衫高兴地拍了拍手。
6. [42-51秒] (Shaky Cam+全景镜头、中近景镜头) 故事情节:远景中,主角看着兴奋的白衫。镜头切换为对准主角的中近景,轻微的摇晃反映出他内心的无语。他看着白衫,嘴角微微抽动,眼神里是哭笑不得的怀疑,几乎是对自己低语:"这么吃还能瘦么……" 随即他像是突然想起了正事,表情一正,视线重新聚焦在白衫身上,问道:"对了,你知道刘叶的情况怎么样了吗?" 结尾远景,主角向前迈了半步,成功转移了话题。
7. [51-58秒] (Deep Depth of Field+全景镜头、中景镜头) 故事情节:远景中,两人相对而立。镜头切到白衫的中景,听到"刘叶"的名字,他先是愣了一下,随即眼神投向远方,景深变大,背景中的公园路径和行人都变得清晰。他皱眉思索片刻,然后略带歉意地摇了摇头,收回目光:"刘叶?看着是没长高,具体我就不知道了。" 结尾远景,白衫看着主角,摊了摊手表示不知情。
8. [58-67秒] (Panning Shot+全景镜头、中远景镜头) 故事情节:远景中,三人保持着最后的站位。镜头给到白衫的中远景,他侧过身,抬手指向西边的方向,语气变得热心:"同学要是想知道,不如去找他问问吧。" 镜头随着他的手臂平滑地向西边摇摄,画面中出现一条通往远处开阔广场的小径。"你往西走,他就在那边的广场。" 结尾远景,镜头停下,主角顺着白衫所指的方向望去,若有所思。
【人物形象】:白衫:身高约180cm,体态匀称修长,略带少年感。面部线条柔和,眉眼清秀,笑起来时眼角有细微纹路。发型是自然的黑色短发,刘海稍长,显得随性。身穿一件干净的白色棉麻衬衫和浅色休闲裤,脚踩白色运动鞋。气质温和亲切,与人交谈时真诚,对待宠物时眼神宠溺,会用摊手、挠头等小动作表达无奈,是个内心细腻的暖男。
主角:身高与白衫相仿,身形挺拔,站姿稳重。面部轮廓分明,眼神锐利但内敛,表情变化细微,善于观察。发型为深色利落短发,显得干练。穿着深色系的休闲夹克,内搭纯色T恤,下身是工装裤,整体风格偏向实用和低调。气质沉稳,话不多,习惯通过嘴角抽动、眼神聚焦等微表情传递内心活动,行动果断且有目的性。
呱呱:一只体型极度肥胖的宠物,身躯圆滚滚,四肢短小,趴在地上像个肉球。拥有一双憨态可掬的大眼睛,表情总是懒洋洋的。毛发短而顺滑,脖子上戴着一个简单的项圈。动作迟缓,极度懒散,没有食物的诱惑便不愿动弹,对主人的抚摸会表现出心满意足的样子,是一只被宠坏了的"吃货"。
【场景描述】:午后阳光明媚的公园草坪,氛围从轻松宠溺的日常,转为对宠物健康的真切担忧,最终变为热心指路的平实交流。主要视觉元素是茵茵绿草、蜿蜒的小径,以及趴在地上一动不动的胖宠物。
【站位】:1. 白衫与主角并肩站立,呱呱在他们脚边的草地上。
2. 三人位置不变,白衫面向主角和呱呱。
3. 白衫蹲在呱呱旁边,主角站在他身侧,形成高低位。
4. 白衫维持蹲姿,主角站在一旁注视。
5. 白衫从呱呱身边站起,转身面对呱呱。
6. 主角面向白衫,两人相对而立,主角向前半步拉近距离。
7. 两人保持相对站立,白衫短暂望向远方后,目光回到主角身上。
8. 白衫侧身指向西边,主角随其指向望向同一方向。"""
}
]
# ==========================================
# PART 1: 剧本生成模型 (ScriptAgent)
# ==========================================
from swift.llm import PtEngine, RequestConfig, InferRequest
from swift.plugin import InferStats
import torch
# Module-level state for the script model.
# `engine` holds the loaded inference engine and stays None until
# load_llm_model() has succeeded.
engine = None
MODEL_NAME = "XD-MU/ScriptAgent"
LOCAL_MODEL_PATH = "./downloaded_models/ScriptAgent"
# Create the local model directory up front (no-op when it already exists).
os.makedirs(LOCAL_MODEL_PATH, exist_ok=True)
def load_llm_model():
    """Load the ScriptAgent model into the module-level ``engine`` (once).

    When ``config.json`` is missing from LOCAL_MODEL_PATH the model snapshot is
    downloaded there first. A ms-swift ``PtEngine`` is then built on CPU with
    4-bit ``bnb`` quantization settings. Any failure is printed together with
    its traceback and leaves ``engine`` unchanged (``None``); nothing is raised.
    """
    global engine
    if engine is not None:
        # Already loaded by an earlier call.
        return

    import traceback

    try:
        config_file = os.path.join(LOCAL_MODEL_PATH, "config.json")
        if os.path.exists(config_file):
            print(f"✅ 模型已存在: {LOCAL_MODEL_PATH}")
        else:
            print(f"正在从 HuggingFace 下载模型到 {LOCAL_MODEL_PATH}...")
            snapshot_download(
                repo_id=MODEL_NAME,
                local_dir=LOCAL_MODEL_PATH,
                local_dir_use_symlinks=False,
                resume_download=True,
            )
            print(f"✅ 模型已下载到: {LOCAL_MODEL_PATH}")

        print("正在加载文本模式(禁用多模态)...")
        # NOTE(review): the message above says multimodal is disabled, but no
        # option below obviously does that — confirm against ms-swift.
        engine_options = {
            "model_id_or_path": LOCAL_MODEL_PATH,
            "torch_dtype": torch.bfloat16,  # bfloat16 chosen to save memory
            "max_batch_size": 1,
            "device_map": "cpu",
            "quant_method": "bnb",
            "quantization_bit": 4,
            "model_kwargs": {
                "low_cpu_mem_usage": True,
                "max_memory": {"cpu": "10GB"},
                "offload_folder": "./offload",  # spill to disk when memory runs out
            },
        }
        engine = PtEngine(**engine_options)
        print("✅ 文本模式加载完成")
    except Exception as load_error:
        print(f"❌ 模型加载失败: {load_error}")
        traceback.print_exc()
def chat_with_scriptagent(user_input: str):
    """Generate a script for *user_input* with the ScriptAgent model.

    Args:
        user_input: The user's prompt. ``None`` and whitespace-only values are
            treated as empty input.

    Returns:
        The generated text, or a user-facing message string when the input is
        empty, the model could not be loaded, the model returned nothing, or
        inference raised. This function does not raise for those cases.
    """
    global engine

    # Validate before touching the model: blank input must not trigger the
    # expensive model load, and None must not crash on .strip().
    user_input = (user_input or "").strip()
    if not user_input:
        return "请输入内容"

    if engine is None:
        load_llm_model()
    if engine is None:
        return "❌ 模型加载失败,请检查后台日志。"

    try:
        print("🤖 正在使用 ms-swift InferEngine 推理剧本...")

        # 1. Build the single-turn chat request.
        messages = [{'role': 'user', 'content': user_input}]
        infer_request = InferRequest(messages=messages)

        # 2. Sampling parameters (non-streaming).
        request_config = RequestConfig(
            max_tokens=4096,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
            stream=False,
        )

        # 3. Run inference and collect metrics.
        metric = InferStats()
        resp_list = engine.infer([infer_request], request_config, metrics=[metric])

        # 4. Extract the text of the first choice.
        response = resp_list[0].choices[0].message.content

        # 5. Report metrics; `or ''` keeps a None completion from raising here
        #    so the empty-result message below can be returned instead.
        print(f"✅ 生成完成 | 指标: {metric.compute()}")
        print(f"✅ 生成结果长度: {len(response or '')} 字符")
        return response if response else "⚠️ 生成为空,请重试"
    except Exception as e:
        print(f"❌ 生成出错: {e}")
        import traceback
        traceback.print_exc()
        return f"生成失败: {str(e)}"
# ==========================================
# PART 2: 视频生成 API 封装
# ==========================================
class OpenAISoraAPI:
    """Wrapper around the OpenAI video (Sora) endpoints."""

    def __init__(self, api_key: str):
        """Create the OpenAI client.

        Raises:
            RuntimeError: if the ``openai`` package could not be imported.
        """
        if OpenAI is None:
            raise RuntimeError("未安装 openai 库,请运行: pip install openai")
        self.client = OpenAI(api_key=api_key)

    def generate_video(
        self,
        prompt: str,
        output_path: str,
        model: str,
        size: str,
        seconds: int,
        ref_img_path: str = None
    ) -> Optional[str]:
        """Generate one video and save it to *output_path*.

        Args:
            prompt: Text prompt for the shot.
            output_path: Where the resulting video file is written.
            model: Model identifier passed through to the API.
            size: Output size string passed through to the API.
            seconds: Clip duration; sent to the API as an integer string.
            ref_img_path: Optional reference image; ignored when the path is
                empty or does not exist.

        Returns:
            ``None`` on success, otherwise an error message string. Exceptions
            are caught, logged, and reported through the return value.
        """
        import contextlib
        import traceback

        try:
            LOGGER.info(f"🎬 Sora API 调用: {model} | {size} | {seconds}秒")

            kwargs = {
                "model": model,
                "prompt": prompt,
                "size": size,
                # int() first so a float such as 4.0 is not sent as "4.0".
                "seconds": str(int(seconds)),
            }

            # The reference file must still be open while the request is sent,
            # so the create call happens inside the ExitStack scope.
            with contextlib.ExitStack() as stack:
                if ref_img_path and os.path.exists(ref_img_path):
                    kwargs["input_reference"] = stack.enter_context(
                        open(ref_img_path, "rb")
                    )
                video_job = self.client.videos.create(**kwargs)

            # Poll until the job leaves the queued/processing states.
            while video_job.status in ["queued", "processing"]:
                LOGGER.info(f"⏳ 视频生成中... 进度: {video_job.progress}%")
                time.sleep(10)
                video_job = self.client.videos.retrieve(video_job.id)

            if video_job.status != "completed":
                error_msg = f"视频生成失败,状态: {video_job.status}"
                job_error = getattr(video_job, "error", None)
                if job_error:
                    error_msg = f"{error_msg} ({job_error})"
                LOGGER.error(error_msg)
                return error_msg

            # Prefer a direct URL when the response carries one (kept for
            # backward compatibility); otherwise use the SDK download endpoint.
            video_url = getattr(video_job, "url", None)
            if video_url:
                import requests
                download = requests.get(video_url, timeout=300)
                # Do not write an HTTP error page to disk as if it were a video.
                download.raise_for_status()
                with open(output_path, "wb") as f:
                    f.write(download.content)
            else:
                content = self.client.videos.download_content(
                    video_job.id, variant="video"
                )
                content.write_to_file(output_path)

            LOGGER.info(f"✅ 视频已保存: {output_path}")
            return None
        except Exception as e:
            error_msg = f"Sora API 错误: {str(e)}"
            LOGGER.error(error_msg)
            traceback.print_exc()
            return error_msg
class GoogleVeoAPI:
    """Wrapper around the Google Veo 3.1 video generation API."""

    def __init__(self, api_key: str):
        """Create the google-genai client.

        Raises:
            RuntimeError: if the ``google-genai`` package could not be imported.
        """
        if genai is None:
            raise RuntimeError("未安装 google-genai 库,请运行: pip install google-genai")
        self.client = genai.Client(api_key=api_key)

    def generate_video(
        self,
        prompt: str,
        output_path: str,
        size: str,
        seconds: int,
        ref_img_path: str = None
    ) -> Optional[str]:
        """Generate one video and save it to *output_path*.

        Args:
            prompt: Text prompt for the shot.
            output_path: Where the resulting video file is written.
            size: ``"1080p"`` or ``"720p"``; any other value falls back to
                ``"720p"``.
            seconds: Clip duration, passed to the API as an integer.
            ref_img_path: Optional reference image; ignored when the path is
                empty or does not exist.

        Returns:
            ``None`` on success, otherwise an error message string. Exceptions
            are caught, logged, and reported through the return value.
        """
        import traceback

        try:
            LOGGER.info(f"🎬 Veo API 调用: {size} | {seconds}秒")

            config_kwargs = {}

            if ref_img_path and os.path.exists(ref_img_path):
                # Load the file into the SDK's own image type; this also avoids
                # leaving a PIL file handle open.
                reference = types.VideoGenerationReferenceImage(
                    image=types.Image.from_file(location=ref_img_path),
                    reference_type="asset",
                )
                config_kwargs["reference_images"] = [reference]

            # Unknown sizes fall back to 720p.
            resolution = size if size in ("1080p", "720p") else "720p"

            # NOTE(review): the Veo docs appear to restrict the duration when
            # reference images or 1080p are used — confirm the allowed values.
            operation = self.client.models.generate_videos(
                model="veo-3.1-generate-preview",
                prompt=prompt,
                config=types.GenerateVideosConfig(
                    duration_seconds=int(seconds),
                    resolution=resolution,
                    aspect_ratio="16:9",
                    **config_kwargs
                ),
            )

            # Poll the long-running operation until it reports completion.
            while not operation.done:
                LOGGER.info("⏳ 视频生成中...")
                time.sleep(10)
                operation = self.client.operations.get(operation)

            # A finished operation may carry an error instead of a response.
            operation_error = getattr(operation, "error", None)
            if operation_error:
                error_msg = f"Veo 视频生成失败: {operation_error}"
                LOGGER.error(error_msg)
                return error_msg

            generated = getattr(operation.response, "generated_videos", None)
            if not generated:
                error_msg = "Veo 未返回任何视频"
                LOGGER.error(error_msg)
                return error_msg

            # Fetch the video bytes, then write them to the requested path.
            video = generated[0]
            self.client.files.download(file=video.video)
            video.video.save(output_path)

            LOGGER.info(f"✅ 视频已保存: {output_path}")
            return None
        except Exception as e:
            error_msg = f"Veo API 错误: {str(e)}"
            LOGGER.error(error_msg)
            traceback.print_exc()
            return error_msg
# ==========================================
# PART 3: 视频处理工具函数
# ==========================================
# Matches a shot-number marker such as "3. ". The look-arounds keep it from
# firing inside decimals or version strings ("2.5", "veo-3.1").
_SHOT_MARKER = re.compile(r"(?<![\d.])(\d+)\.(?!\d)\s*")


def parse_script_nodes(script_text: str) -> List[str]:
    """Split a script into a list of shot texts.

    The text is cut at every "<number>." marker; each piece between two
    markers (or between the last marker and the end) becomes one entry, and
    anything before the first marker is discarded. When no marker is found,
    every non-blank line becomes an entry.

    NOTE(review): numbered lists in trailing sections (for example a
    positioning list that also uses "1." markers) are split into entries too —
    confirm whether callers expect that.

    Args:
        script_text: Raw script text; ``None`` or empty yields ``[]``.

    Returns:
        The stripped, non-empty shot texts in document order.
    """
    if not script_text:
        return []

    cleaned = script_text.replace("\r\n", "\n").strip()
    markers = list(_SHOT_MARKER.finditer(cleaned))
    if not markers:
        return [line.strip() for line in cleaned.split("\n") if line.strip()]

    # Each shot runs from the end of its marker to the start of the next one.
    boundaries = [marker.start() for marker in markers[1:]] + [len(cleaned)]
    nodes = []
    for marker, end in zip(markers, boundaries):
        content = cleaned[marker.end():end].strip()
        if content:
            nodes.append(content)
    return nodes
def _wait_for_stable_file(path, max_wait=30, check_interval=1.0, required_stable=3):
    """Poll *path* until its non-zero size is unchanged for *required_stable* checks.

    Gives up silently after ``max_wait / check_interval`` polls.
    """
    stable_count = 0
    last_size = 0
    for _ in range(int(max_wait / check_interval)):
        try:
            current_size = os.path.getsize(path)
        except OSError:
            time.sleep(check_interval)
            continue
        if current_size == 0:
            time.sleep(check_interval)
            continue
        if current_size == last_size:
            stable_count += 1
            if stable_count >= required_stable:
                LOGGER.info(f"✅ 文件稳定: {current_size / 1024 / 1024:.2f} MB")
                return
        else:
            stable_count = 0
        last_size = current_size
        time.sleep(check_interval)


def extract_last_frame(video_path: str, output_path: str) -> Optional[str]:
    """Save a frame from the end of a video as an image.

    Waits for the video file size to settle, then tries a few frames near the
    end (last, second to last, fifth from last, 95% position) and keeps the
    first one that decodes and is not almost black.

    Args:
        video_path: Video file to read.
        output_path: Image path to write; its directory is created if needed.

    Returns:
        *output_path* on success, or ``None`` when OpenCV is unavailable, the
        video is missing or unreadable, no usable frame is found, or saving
        fails. Errors are logged, not raised.
    """
    if cv2 is None:
        LOGGER.warning("OpenCV 不可用")
        return None
    if not os.path.exists(video_path):
        LOGGER.error(f"视频文件不存在: {video_path}")
        return None

    # Step 1: wait until the file is no longer growing.
    LOGGER.info("⏳ 等待文件写入完成...")
    _wait_for_stable_file(video_path)
    # Extra pause intended to let the file system settle.
    time.sleep(2.0)

    # Step 2: open the video; the finally below releases it on every path.
    capture = cv2.VideoCapture(video_path)
    try:
        if not capture.isOpened():
            LOGGER.error("OpenCV 无法打开视频")
            return None

        total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT) or 0)
        fps = capture.get(cv2.CAP_PROP_FPS) or 30
        if total_frames <= 0:
            LOGGER.error("视频帧数为 0")
            return None
        LOGGER.info(f"📹 视频信息: {total_frames} 帧, {fps:.2f} FPS")

        # Step 3: candidate frames, clamped into range. dict.fromkeys drops
        # duplicates (short clips clamp to the same index) but keeps the order.
        raw_candidates = (
            total_frames - 1,
            total_frames - 2,
            total_frames - 5,
            int(total_frames * 0.95),
        )
        candidates = list(dict.fromkeys(
            max(0, min(index, total_frames - 1)) for index in raw_candidates
        ))

        frame = None
        for candidate_idx in candidates:
            capture.set(cv2.CAP_PROP_POS_FRAMES, candidate_idx)
            success, temp_frame = capture.read()
            if not success or temp_frame is None or temp_frame.size == 0:
                continue
            # Reject near-black frames by mean grayscale brightness.
            gray = cv2.cvtColor(temp_frame, cv2.COLOR_BGR2GRAY)
            brightness = gray.mean()
            if brightness > 5:
                frame = temp_frame
                LOGGER.info(f"✅ 提取第 {candidate_idx}/{total_frames} 帧(亮度: {brightness:.1f})")
                break

        # Step 4: write the image.
        if frame is None:
            LOGGER.error("所有候选帧均无效")
            return None

        # A bare file name has an empty dirname, which os.makedirs rejects.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        if not cv2.imwrite(output_path, frame):
            LOGGER.error("保存图片失败")
            return None

        file_size = os.path.getsize(output_path)
        LOGGER.info(f"💾 参考帧已保存: {os.path.basename(output_path)} ({file_size / 1024:.1f} KB)")
        return output_path
    except Exception as e:
        LOGGER.error(f"提取帧时出错: {e}")
        return None
    finally:
        capture.release()
def stitch_videos(video_paths: List[str], output_path: str):
    """Concatenate several video files into one output file.

    Paths that do not exist are skipped with a warning.

    Args:
        video_paths: Segment files, in playback order.
        output_path: Where the combined video is written (H.264 video, AAC
            audio).

    Raises:
        ValueError: if *video_paths* is empty or none of the paths exist.
        RuntimeError: if moviepy could not be imported.
    """
    if not video_paths:
        raise ValueError("未提供可拼接的视频文件。")
    if VideoFileClip is None or concatenate_videoclips is None:
        raise RuntimeError("未找到 moviepy,请安装依赖。")

    clips = []
    final_clip = None
    try:
        for path in video_paths:
            if not os.path.exists(path):
                LOGGER.warning(f"跳过不存在的视频片段: {path}")
                continue
            clips.append(VideoFileClip(path))

        if not clips:
            raise ValueError("没有有效的视频片段")

        final_clip = concatenate_videoclips(clips, method="compose")
        final_clip.write_videofile(
            output_path,
            codec="libx264",
            audio_codec="aac",
            verbose=False,
            logger=None,
            remove_temp=True
        )
    finally:
        # Close the combined clip as well as its sources so their resources
        # are released.
        if final_clip is not None:
            final_clip.close()
        for clip in clips:
            clip.close()
# ==========================================
# PART 4: 视频生成流水线
# ==========================================
def run_video_generation_pipeline(
    script_text: str,
    api_key: str,
    model_name: str,
    style_choice: str,
    size: str,
    seconds: int
):
    """Generate one video per script shot, then stitch them into a movie.

    This is a generator so the caller can show progress while the shots are
    produced one after another. For every shot except the last, the final
    frame of the produced segment is extracted and passed as the reference
    image for the next shot.

    Args:
        script_text: The storyboard script; split into shots by
            ``parse_script_nodes``.
        api_key: API key handed to the selected backend client.
        model_name: Model identifier; names starting with ``"sora"`` use
            ``OpenAISoraAPI``, names starting with ``"veo"`` use
            ``GoogleVeoAPI``.
        style_choice: Key into ``STYLE_PROMPTS``; unknown keys add no style
            text.
        size: Resolution string forwarded to the backend.
        seconds: Per-shot duration forwarded to the backend.

    Yields:
        ``(segment_paths, final_video_path, status_message)`` tuples.
        ``final_video_path`` is ``None`` until stitching has succeeded.
    """
    log = logging.getLogger(__name__)

    # --- Validate inputs (whitespace-only values count as missing) ---
    if not script_text or not script_text.strip():
        yield [], None, "❌ 请输入剧本!"
        return
    api_key = (api_key or "").strip()
    if not api_key or api_key == "Your API Key":
        yield [], None, "❌ 请输入有效的 API Key!"
        return

    # --- Parse the script into shots ---
    nodes = parse_script_nodes(script_text)
    if not nodes:
        # Without this guard the run would report "0 shots" and then fail
        # later inside stitching with an unrelated message.
        yield [], None, "❌ 未能从剧本中解析出任何分镜,请检查剧本格式。"
        return

    # --- Initialise the API client ---
    is_sora = model_name.startswith("sora")
    try:
        if is_sora:
            api_client = OpenAISoraAPI(api_key)
        elif model_name.startswith("veo"):
            api_client = GoogleVeoAPI(api_key)
        else:
            yield [], None, f"❌ 不支持的模型: {model_name}"
            return
    except Exception as e:
        yield [], None, f"❌ API 初始化失败: {str(e)}"
        return

    # --- Prepare a per-run output directory ---
    # The random suffix keeps two runs started within the same second from
    # writing their segments into the same directory.
    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    run_id = f"{timestamp}_{uuid.uuid4().hex[:8]}"
    output_dir = os.path.join("output_videos", run_id)
    os.makedirs(output_dir, exist_ok=True)

    generated_videos = []
    last_frame_path = None
    style_prompt = STYLE_PROMPTS.get(style_choice, "")
    total = len(nodes)
    yield [], None, f"🚀 开始任务,共 {total} 个分镜。模型: {model_name}"

    # --- Generate the shots one by one ---
    for idx, node_text in enumerate(nodes, start=1):
        video_filename = os.path.join(output_dir, f"segment_{idx:02d}.mp4")
        full_prompt = f"{CONTINUITY_PROMPT}\n{style_prompt}\n镜头编号:{idx}/{total}。\n镜头脚本:{node_text}"
        yield generated_videos, None, f"🎥 生成中: 分镜 {idx}/{total}..."

        request = {
            "prompt": full_prompt,
            "output_path": video_filename,
            "size": size,
            "seconds": seconds,
            "ref_img_path": last_frame_path,
        }
        if is_sora:
            # Only the Sora client is given the model name.
            request["model"] = model_name

        try:
            err = api_client.generate_video(**request)
        except Exception as e:
            # Report backend crashes through the status channel, like the
            # init and stitch phases do, instead of killing the generator.
            log.exception("分镜 %d 生成时发生异常", idx)
            err = str(e) or type(e).__name__

        if err:
            yield generated_videos, None, f"❌ 分镜 {idx} 失败: {err}"
            return
        generated_videos.append(video_filename)

        # The last frame of this shot becomes the reference for the next one.
        if idx < total:
            frame_path = os.path.join(output_dir, f"ref_{idx:02d}.png")
            last_frame_path = extract_last_frame(video_filename, frame_path)
        yield generated_videos, None, f"✅ 分镜 {idx} 完成"

    # --- Stitch the segments ---
    yield generated_videos, None, "🎬 正在拼接..."
    final_video_path = os.path.join(output_dir, "final_movie.mp4")
    try:
        stitch_videos(generated_videos, final_video_path)
        yield generated_videos, final_video_path, "🎉 任务完成!"
    except Exception as e:
        yield generated_videos, None, f"❌ 拼接失败: {str(e)}"
# ==========================================
# PART 5: Gradio 界面
# ==========================================
def update_model_params(model_name):
    """Build the resolution dropdown and duration slider for a model.

    Looks ``model_name`` up in ``MODEL_CONFIGS``; an unknown name falls back
    to the ``"sora-2"`` entry. Returns a ``(Dropdown, Slider)`` pair.
    """
    fallback = MODEL_CONFIGS["sora-2"]
    config = MODEL_CONFIGS.get(model_name, fallback)

    # Resolution choices, pre-selecting the first one.
    sizes = config["sizes"]
    size_dropdown = gr.Dropdown(
        choices=sizes,
        value=sizes[0],
        label=f"分辨率 ({model_name})"
    )

    # Per-shot duration bounds for this model.
    duration = config["seconds_range"]
    seconds_slider = gr.Slider(
        minimum=duration["minimum"],
        maximum=duration["maximum"],
        step=duration["step"],
        value=duration["value"],
        label=config["seconds_label"]
    )
    return (size_dropdown, seconds_slider)
def get_demo_path(filename):
    """Return ``filename`` if it exists on disk, otherwise ``None``."""
    if os.path.exists(filename):
        return filename
    return None
# Build the Gradio UI: a script-writing tab, a video-generation tab, a demo
# showcase, and the event wiring between them.
# NOTE(review): the nesting of layout containers below was reconstructed from
# statement order — confirm against the rendered page.
with gr.Blocks(title="AI 剧本视频工厂") as demo:
    gr.Markdown("# 🎬 ScriptAgent & Sora/Veo 视频生成工坊 ")
    with gr.Tabs():
        # --- TAB 1: script writing ---
        with gr.Tab("📝 第一步:剧本创作"):
            with gr.Row():
                with gr.Column():
                    llm_input = gr.Textbox(
                        label="剧情输入",
                        placeholder="主角:你在做什么?...",
                        lines=6
                    )
                    llm_btn = gr.Button("生成/续写剧本", variant="primary")
                with gr.Column():
                    llm_output = gr.Textbox(
                        label="生成的剧本",
                        lines=10,
                        interactive=True
                    )
                    # Status line for the copy action; hidden until
                    # copy_to_video makes it visible.
                    copy_status = gr.Textbox(
                        label="",
                        value="",
                        visible=False,
                        elem_classes="copy-status"
                    )
                    to_video_btn = gr.Button("⬇️ 复制到视频生成", variant="secondary")
            # Two sample dialogues that can be loaded into llm_input.
            gr.Examples(
                [[
                    "艾蕾娜:……星辰的余烬?你竟敢在此地点燃旧日之光,流亡者。 \n"
                    "凯兰:光从不属于谁,骑士。它只记得……曾照过怎样的真相。倒是你,影子的囚徒,是来求我终结你的诅咒?还是……来替你的新王收割我的喉咙? \n"
                    "艾蕾娜:终结?我的诅咒早已生根。每杀一人,他的灵魂便多一道裂痕……而你要的真相,不过是让裂痕更早崩裂。 \n"
                    "凯兰:那就让它崩裂。你以为影子吞噬的是敌人?不……它啃的是你自己的存在。当你彻底沦为影魔,王国会迎来真正的永夜——而那时,连诅咒都会因你的消失而……笑出声。 \n"
                    "艾蕾娜:若我消失……她会替我活下去。而王国……会记得我曾是守护者。但若你以星为誓,告诉我……当年你预言的毁灭,可有……一线逆转? \n"
                    "凯兰:逆转的代价……是被遗忘。你愿用存在换王国的黎明?那么——以星辰与影子的名义,契约成立。 \n"
                    "艾蕾娜:……从此,弑王者的名字将被抹去。而星辰将记住……一个骑士用消失,为王国换来了……第零次日出。\n"
                ],
                [
                    "林照:最近读完《人类简史》,突然意识到我们所谓的“现代文明”不过是场集体幻觉,真有点虚无。\n"
                    "陈放:幻觉才值钱啊,货币、国家、公司,哪个不是大家愿意信才存在?读书不是为了拆穿,而是为了看清游戏规则。 \n"
                    "许知:你俩别绕了,我昨晚刚把《斯通纳》看完,合上书就一个问题:如果注定平庸,还读个什么劲? \n"
                    "林照:斯通纳的平庸恰恰反击了功利叙事,他的失败里藏着尊严,像黑暗中的一点磷火。 \n"
                    "陈放:说穿了,读书就是给自己建一座防空洞,外面狂轰滥炸,洞里还能点一盏小灯。 \n"
                    "林照:那灯最好自带电池,别指望谁给你发电,明天我打算读《倦怠社会》,继续给灯添点燃料。\n"
                ]
                ],
                inputs=llm_input
            )
            # Local-deployment code showcase (collapsed by default).
            gr.Markdown("---")
            with gr.Accordion("💻 本地部署完整代码(点击展开查看)", open=False):
                gr.Markdown("""
### 📦 完整部署步骤
以下代码可在本地完整运行,获得最佳性能和输出质量:
""")
                # Shown read-only in the gr.Code viewer below; never executed here.
                deployment_code = '''import os
from huggingface_hub import snapshot_download
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
model_name = "XD-MU/ScriptAgent"
local_path = "./models/ScriptAgent"
# 下载整个仓库的所有文件
print("下载模型所有文件...")
snapshot_download(
repo_id=model_name,
local_dir=local_path,
local_dir_use_symlinks=False,
resume_download=True
)
print(f"模型已完整下载到: {local_path}")
# 使用 SWIFT 加载
from swift.llm import PtEngine, RequestConfig, InferRequest
engine = PtEngine(local_path, max_batch_size=1)
request_config = RequestConfig(max_tokens=8192, temperature=0.7)
infer_request = InferRequest(messages=[
{"role": "user", "content": "你的对话上下文(Your Dialogue)"}
])
response = engine.infer([infer_request], request_config)[0]
print(response.choices[0].message.content)'''
                gr.Code(
                    value=deployment_code,
                    language="python",
                    label="deploy_scriptagent.py",
                    lines=35,
                    interactive=False
                )
                gr.Markdown("""
### 📌 环境要求
```bash
# 安装依赖
pip install ms-swift transformers torch huggingface_hub
# GPU 推荐配置
- CUDA 11.8+
- 显存: 16GB+ (推荐 24GB)
- 内存: 32GB+
```
### 本地部署优势
- ✅ 完整精度模型,无量化损失
- ✅ 更快的推理速度
- ✅ 无网络限制,支持离线运行
- ✅ 可自定义参数(temperature, max_tokens等)
""")
        # --- TAB 2: video generation ---
        with gr.Tab("🎥 第二步:视频生成"):
            # Banner telling the user that the copy button in tab 1 fills the
            # script box below.
            gr.Markdown("""
<div style="background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
padding: 15px;
border-radius: 10px;
color: white;
text-align: center;
margin-bottom: 20px;">
💡 <b>提示</b>:在「第一步:剧本创作」中点击「⬇️ 复制到视频生成」后,剧本会自动填充到下方「分镜脚本」输入框
</div>
""")
            with gr.Row():
                # Left column: configuration
                with gr.Column(scale=1):
                    with gr.Accordion("⚙️ API 设置", open=True):
                        # The default value is the same placeholder string
                        # that run_video_generation_pipeline rejects as invalid.
                        api_key_input = gr.Textbox(
                            label="API Key",
                            type="password",
                            value="Your API Key",
                            info="根据选择的模型输入 OpenAI 或 Google API Key"
                        )
                    gr.Markdown("### 🎨 风格与模型配置")
                    style_radio = gr.Radio(
                        choices=STYLE_KEYS,
                        value=STYLE_KEYS[0],
                        label="画风"
                    )
                    model_sel = gr.Dropdown(
                        choices=["sora-2", "sora-2-pro", "veo-3.1"],
                        value="sora-2",
                        label="选择模型",
                        info="Sora 使用 OpenAI Key,Veo 使用 Google Key"
                    )
                    with gr.Row():
                        # Initial choices come from the "sora-2" config; both
                        # widgets are replaced by update_model_params when
                        # the model selection changes.
                        size_sel = gr.Dropdown(
                            choices=MODEL_CONFIGS["sora-2"]["sizes"],
                            value=MODEL_CONFIGS["sora-2"]["sizes"][0],
                            label="分辨率"
                        )
                        sec_slider = gr.Slider(
                            minimum=4,
                            maximum=12,
                            step=4,
                            value=4,
                            label="单镜时长"
                        )
                    video_script_input = gr.TextArea(
                        label="分镜脚本",
                        lines=8,
                        placeholder="1. [0-8秒] ...",
                        elem_classes="script-input"  # class targeted by the CSS injected in demo.load
                    )
                    gen_btn = gr.Button("🚀 开始生成", variant="primary")
                    status_log = gr.Textbox(label="日志", interactive=False)
                # Right column: results
                with gr.Column(scale=2):
                    gr.Markdown("### 🎞️ 分镜预览")
                    gallery = gr.Gallery(
                        label="分镜序列",
                        columns=3,
                        height="auto"
                    )
                    gr.Markdown("### 🎬 最终成片")
                    final_video = gr.Video(label="成片输出")
            # Demo showcase
            gr.Markdown("---")
            gr.Markdown("### 🌟 精选成片案例 (Demo Showcase)")
            # Lay out at most four DEMO_DATA entries as two rows of two;
            # indices past the end of DEMO_DATA are skipped.
            for i in range(0, 4, 2):
                with gr.Row():
                    for j in range(2):
                        idx = i + j
                        if idx < len(DEMO_DATA):
                            item = DEMO_DATA[idx]
                            with gr.Column():
                                with gr.Group():
                                    # get_demo_path yields None when the file is missing.
                                    gr.Video(value=get_demo_path(item["file"]), label=item["title"], interactive=False)
                                    with gr.Accordion(f"📄 查看剧本: {item['title']}", open=False):
                                        gr.Textbox(
                                            value=item["script"],
                                            show_label=False,
                                            lines=6,
                                            max_lines=6,
                                            interactive=False
                                        )
    # Page-bottom warning
    gr.HTML('<p style="color: red; font-weight: bold; text-align: center; margin-top: 20px; font-size: 16px;">⚠️ 注意:仅供简单测试,由于成本问题在线平台内存只有18G,我们量化了模型,性能效果并不能保证,如果需要最准确的输出请自行部署即可</p>')
    # On page load, inject a <style> element defining a highlight animation
    # for `.script-input.highlight`.
    # NOTE(review): nothing visible in this block adds the `highlight` class —
    # confirm whether it is toggled elsewhere.
    demo.load(
        None,
        None,
        None,
        js="""
function() {
const style = document.createElement('style');
style.textContent = `
@keyframes highlight {
0%, 100% { background-color: transparent; }
50% { background-color: rgba(102, 126, 234, 0.2); }
}
.script-input.highlight {
animation: highlight 1.5s ease-in-out 3;
border: 2px solid #667eea !important;
}
`;
document.head.appendChild(style);
}
"""
    )
    # --- Event wiring ---
    # Generate/continue the script from the input box into the output box.
    llm_btn.click(chat_with_scriptagent, llm_input, llm_output)

    # Copy button handler: forwards the text and shows a status message.
    def copy_to_video(script_text):
        """Forward the script to the video tab and report the outcome.

        Returns the unchanged ``script_text`` (for the storyboard box) plus
        an update that makes the status textbox visible, with a warning when
        the script is empty or whitespace-only and a success message otherwise.
        """
        if not script_text or not script_text.strip():
            return script_text, gr.update(value="⚠️ 剧本为空,无法复制", visible=True)
        return script_text, gr.update(value="✅ 已复制到「第二步:视频生成」→「分镜脚本」,请切换标签页查看", visible=True)

    to_video_btn.click(
        fn=copy_to_video,
        inputs=llm_output,
        outputs=[video_script_input, copy_status]
    )
    # Swap resolution choices and duration bounds when the model changes.
    model_sel.change(
        fn=update_model_params,
        inputs=model_sel,
        outputs=[size_sel, sec_slider]
    )
    # Run the pipeline; the input order matches the parameter order of
    # run_video_generation_pipeline, and each yielded tuple maps onto
    # (gallery, final_video, status_log).
    gen_btn.click(
        fn=run_video_generation_pipeline,
        inputs=[
            video_script_input,
            api_key_input,
            model_sel,
            style_radio,
            size_sel,
            sec_slider
        ],
        outputs=[gallery, final_video, status_log]
    )
if __name__ == "__main__":
    # Turn on the request queue, then serve on all interfaces at port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.queue()
    demo.launch(**launch_options)