Spaces:
Running
Running
File size: 4,811 Bytes
14be0b9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 |
import gradio as gr
import datetime
import base64
import numpy as np
import dashscope
import os
# Dashscope API key, read at import time: os.environ[...] raises KeyError
# immediately if the key is unset, so a misconfigured deployment fails fast
# at startup instead of on the first synthesis request.
API_KEY = os.environ['API_KEY']
# Dropdown label -> dashscope voice name. Labels are "English name / Chinese name".
VOICE_OPTIONS = {
    "Cherry / 芊悦": "Cherry",
    "Ethan / 晨煦": "Ethan",
    "Jennifer / 詹妮弗": "Jennifer",
    "Ryan / 甜茶": "Ryan",
    "Katerina / 卡捷琳娜": "Katerina",
    "Nofish / 不吃鱼": "Nofish",
    "Elias / 墨讲师": "Elias",
    "Li / 南京-老李": "Li",
    "Marcus / 陕西-秦川": "Marcus",
    "Roy / 闽南-阿杰": "Roy",
    "Peter / 天津-李彼得": "Peter",
    "Eric / 四川-程川": "Eric",
    "Rocky / 粤语-阿强": "Rocky",
    "Kiki / 粤语-阿清": "Kiki",
    "Sunny / 四川-晴儿": "Sunny",
    "Jada / 上海-阿珍": "Jada",
    "Dylan / 北京-晓东": "Dylan",
}
DEFAULT_VOICE = 'Cherry / 芊悦'

# Dropdown label -> API ``language_type`` value.
LANGUAGE_MAP = {
    "Auto / 自动": "Auto",
    "English / 英文": "English",
    "Chinese / 中文": "Chinese",
    "German / 德语": "German",
    "Italian / 意大利语": "Italian",
    "Portuguese / 葡萄牙语": "Portuguese",
    "Spanish / 西班牙语": "Spanish",
    "Japanese / 日语": "Japanese",
    "Korean / 韩语": "Korean",
    "French / 法语": "French",
    "Russian / 俄语": "Russian",
}

# Dropdown choices, derived from LANGUAGE_MAP so the list and the map can
# never drift apart (dict insertion order preserves the intended ordering).
LANGUAGE_OPTIONS = list(LANGUAGE_MAP)
def tts_interface(text, voice_display, language_display):
    """Synthesize speech for *text* with the Qwen3-TTS streaming API.

    Args:
        text: Text to synthesize.
        voice_display: Display label from ``VOICE_OPTIONS`` (e.g. "Cherry / 芊悦").
        language_display: Display label from ``LANGUAGE_MAP`` (e.g. "Auto / 自动").

    Returns:
        ``(sample_rate, audio)`` where ``audio`` is a mono float32 numpy array
        scaled to [-1, 1], or ``None`` when the stream yielded no audio —
        the format Gradio's ``Audio`` output component accepts.
    """
    # Map the bilingual display labels to the API parameter values.
    voice_name = VOICE_OPTIONS[voice_display]
    language = LANGUAGE_MAP[language_display]
    print(f"text: {text}, {voice_name}, {language} time: {datetime.datetime.now()}\n")

    responses = dashscope.MultiModalConversation.call(
        api_key=API_KEY,
        model="qwen3-tts-flash",
        text=text,
        voice=voice_name,
        stream=True,
        language_type=language
    )

    audio_frames = []
    for chunk in responses:
        # Some stream events (status/terminal chunks) carry no audio payload;
        # catch only the attribute failure instead of a bare ``except:`` so
        # real errors (network, keyboard interrupt) are not silently swallowed.
        try:
            audio_string = chunk.output.audio.data
        except AttributeError:
            print(chunk)  # surface the unexpected chunk shape for debugging
            continue
        if not audio_string:
            continue  # skip empty payloads instead of decoding "" to an empty frame
        # Base64-encoded PCM16 -> float32 samples in [-1, 1].
        wav_bytes = base64.b64decode(audio_string)
        audio_frames.append(
            np.frombuffer(wav_bytes, dtype=np.int16).astype(np.float32) / 32768.0
        )

    full_audio = np.concatenate(audio_frames) if audio_frames else None
    sample_rate = 24000  # assumes qwen3-tts-flash streams 24 kHz PCM — TODO confirm
    return (sample_rate, full_audio)
# UI layout: left column holds the inputs, right column the generated audio.
# NOTE(review): original indentation was lost in extraction; Examples and the
# click wiring are placed at Blocks level (the conventional layout) — confirm
# against the deployed app.
with gr.Blocks(theme=gr.themes.Soft(font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]), css=".gradio-container {max-width: none !important;}") as demo:
    gr.Markdown("# 🎤 Qwen3-TTS Demo")
    with gr.Row():
        with gr.Column():
            # Input text — English label first
            text_input = gr.Textbox(
                label="Input Text / 输入文本",
                placeholder="Enter text to synthesis here... / 在此输入要合成为语音的文本...",
                lines=4,
                max_lines=8
            )
            # Voice selection — English label first
            voice_select = gr.Dropdown(
                label="Select Voice / 选择发音人",
                choices=list(VOICE_OPTIONS.keys()),
                value=DEFAULT_VOICE
            )
            # Text-language selection — English label first
            language_select = gr.Dropdown(
                label="Select Text Language / 选择文本语言",
                choices=LANGUAGE_OPTIONS,
                value="Auto / 自动"
            )
            # Generate button — English label first
            generate_btn = gr.Button("Generate Speech / 生成语音", variant="primary")
        with gr.Column():
            # Audio output (read-only) — English label first
            audio_output = gr.Audio(label="Generated Speech / 生成的语音", interactive=False)
    # Clickable example rows that populate all three inputs at once.
    examples = gr.Examples(
        examples=[
            ["你好,我是通义千问,很高兴认识你。", "Cherry / 芊悦", "Chinese / 中文"],
            ["你好,我是通义千问,很高兴认识你。", "Dylan / 北京-晓东", "Chinese / 中文"],
            ["Hello, this is a text-to-speech demo", "Jennifer / 詹妮弗", "English / 英文"],
            ["こんにちは、これはデモです", "Cherry / 芊悦", "Japanese / 日语"],
        ],
        inputs=[text_input, voice_select, language_select],
        label="Examples / 示例文本"
    )
    # Wire the button to the synthesis callback; result goes to the audio player.
    generate_btn.click(
        fn=tts_interface,
        inputs=[text_input, voice_select, language_select],
        outputs=audio_output
    )
if __name__ == "__main__":
    # Start the Gradio server (blocking). A stray "|" pasted onto this line
    # in the captured source was a scraping artifact and is removed here.
    demo.launch()