File size: 4,811 Bytes
14be0b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
import gradio as gr
import datetime
import base64
import numpy as np
import dashscope
import os

# Credential passed as ``api_key`` to the DashScope call in tts_interface.
# Indexing os.environ (rather than using .get) raises KeyError at import time
# when the API_KEY environment variable is not set.
API_KEY = os.environ['API_KEY']

# Dropdown label -> voice identifier passed as ``voice`` to the TTS API.
# Every label has the form "<voice id> / <Chinese display name>"; the pairs
# are listed in the order the dropdown presents them.
VOICE_OPTIONS = dict(
    (
        # Voices whose label carries no region/dialect prefix.
        ("Cherry / 芊悦", "Cherry"),
        ("Ethan / 晨煦", "Ethan"),
        ("Jennifer / 詹妮弗", "Jennifer"),
        ("Ryan / 甜茶", "Ryan"),
        ("Katerina / 卡捷琳娜", "Katerina"),
        ("Nofish / 不吃鱼", "Nofish"),
        ("Elias / 墨讲师", "Elias"),
        # Voices whose label is prefixed with a region or dialect name.
        ("Li / 南京-老李", "Li"),
        ("Marcus / 陕西-秦川", "Marcus"),
        ("Roy / 闽南-阿杰", "Roy"),
        ("Peter / 天津-李彼得", "Peter"),
        ("Eric / 四川-程川", "Eric"),
        ("Rocky / 粤语-阿强", "Rocky"),
        ("Kiki / 粤语-阿清", "Kiki"),
        ("Sunny / 四川-晴儿", "Sunny"),
        ("Jada / 上海-阿珍", "Jada"),
        ("Dylan / 北京-晓东", "Dylan"),
    )
)

# Label preselected in the voice dropdown; it must be a key of VOICE_OPTIONS.
DEFAULT_VOICE = "Cherry / 芊悦"

# Dropdown label -> value passed as ``language_type`` to the TTS API.
# This mapping is the single source of truth for the supported languages:
# add or remove a language here only.
LANGUAGE_MAP = {
    "Auto / 自动": "Auto",
    "English / 英文": "English",
    "Chinese / 中文": "Chinese",
    "German / 德语": "German",
    "Italian / 意大利语": "Italian",
    "Portuguese / 葡萄牙语": "Portuguese",
    "Spanish / 西班牙语": "Spanish",
    "Japanese / 日语": "Japanese",
    "Korean / 韩语": "Korean",
    "French / 法语": "French",
    "Russian / 俄语": "Russian",
}

# Labels offered in the language dropdown, in display order. Derived from
# LANGUAGE_MAP (dicts preserve insertion order) so that every selectable label
# is guaranteed to have a mapping and the two can never drift apart.
LANGUAGE_OPTIONS = list(LANGUAGE_MAP)

def _pcm16_chunks_to_float32(pcm_chunks):
    """Join raw 16-bit PCM byte chunks into one float32 array scaled by 1/32768.

    Returns ``None`` when the chunks do not contain a single complete sample.
    """
    pcm = b"".join(pcm_chunks)
    # An int16 sample is two bytes. Taking the view over the joined stream
    # (instead of per chunk) tolerates a sample split across two chunks, and
    # dropping a dangling trailing byte avoids np.frombuffer's ValueError.
    usable = len(pcm) - (len(pcm) % 2)
    if usable == 0:
        return None
    # NOTE(review): np.int16 is native-endian, as in the previous per-chunk
    # conversion — confirm the service emits little-endian PCM.
    samples = np.frombuffer(pcm[:usable], dtype=np.int16)
    return samples.astype(np.float32) / 32768.0


def tts_interface(text, voice_display, language_display):
    """Synthesize ``text`` with the streaming TTS API for the Gradio UI.

    Args:
        text: Text to synthesize.
        voice_display: Voice dropdown label; must be a key of VOICE_OPTIONS.
        language_display: Language dropdown label; must be a key of
            LANGUAGE_MAP.

    Returns:
        ``(sample_rate, samples)`` where ``samples`` is a float32 NumPy array,
        or ``None`` when the text is blank or the service returned no audio.

    Raises:
        KeyError: If either label is not a known dropdown option.
    """
    voice_name = VOICE_OPTIONS[voice_display]

    # Map the displayed language label to the API parameter.
    language = LANGUAGE_MAP[language_display]

    print(f"text: {text}, {voice_name}, {language} time: {datetime.datetime.now()}\n")

    # Nothing to synthesize: skip the remote call entirely.
    if not text or not text.strip():
        return None

    responses = dashscope.MultiModalConversation.call(
        api_key=API_KEY,
        model="qwen3-tts-flash",
        text=text,
        voice=voice_name,
        stream=True,
        language_type=language
    )

    pcm_chunks = []
    for chunk in responses:
        try:
            audio_string = chunk.output.audio.data
        except (AttributeError, KeyError, TypeError):
            # The chunk carries no audio payload (e.g. an error response);
            # log it for diagnosis and keep consuming the stream.
            print(chunk)
            continue
        if not audio_string:
            # Empty or missing payload: nothing to decode for this chunk.
            continue
        pcm_chunks.append(base64.b64decode(audio_string))

    full_audio = _pcm16_chunks_to_float32(pcm_chunks)
    if full_audio is None:
        # A (rate, None) tuple cannot be rendered by the audio output;
        # returning None leaves the output empty instead.
        return None

    sample_rate = 24000
    return (sample_rate, full_audio)

# Page-wide look: the Soft theme with a Source Sans Pro font stack, plus a CSS
# rule that removes the container's max-width.
_FONT_STACK = [gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
_PAGE_CSS = ".gradio-container {max-width: none !important;}"

# One row per clickable example: (text, voice label, language label).
_EXAMPLE_ROWS = [
    ["你好,我是通义千问,很高兴认识你。", "Cherry / 芊悦", "Chinese / 中文"],
    ["你好,我是通义千问,很高兴认识你。", "Dylan / 北京-晓东", "Chinese / 中文"],
    ["Hello, this is a text-to-speech demo", "Jennifer / 詹妮弗", "English / 英文"],
    ["こんにちは、これはデモです", "Cherry / 芊悦", "Japanese / 日语"],
]

with gr.Blocks(theme=gr.themes.Soft(font=_FONT_STACK), css=_PAGE_CSS) as demo:
    gr.Markdown("# 🎤 Qwen3-TTS Demo")

    with gr.Row():
        # Left column: all inputs. Labels put English first, then Chinese.
        with gr.Column():
            # Text to synthesize.
            text_input = gr.Textbox(
                lines=4,
                max_lines=8,
                label="Input Text / 输入文本",
                placeholder="Enter text to synthesis here... / 在此输入要合成为语音的文本...",
            )

            # Voice picker.
            voice_select = gr.Dropdown(
                choices=list(VOICE_OPTIONS),
                value=DEFAULT_VOICE,
                label="Select Voice / 选择发音人",
            )

            # Language picker for the input text.
            language_select = gr.Dropdown(
                choices=LANGUAGE_OPTIONS,
                value="Auto / 自动",
                label="Select Text Language / 选择文本语言",
            )

            # Button that triggers synthesis.
            generate_btn = gr.Button("Generate Speech / 生成语音", variant="primary")

        # Right column: read-only player for the synthesized audio.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech / 生成的语音", interactive=False)

    # Example rows that fill the three inputs.
    examples = gr.Examples(
        examples=_EXAMPLE_ROWS,
        inputs=[text_input, voice_select, language_select],
        label="Examples / 示例文本",
    )

    # Run synthesis on click and send the result to the audio player.
    generate_btn.click(
        fn=tts_interface,
        inputs=[text_input, voice_select, language_select],
        outputs=audio_output,
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()