Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -56,15 +56,16 @@ description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
|
|
| 56 |
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
| 57 |
|
| 58 |
# Voice options - example speakers
|
| 59 |
-
VOICES = [
|
| 60 |
-
"Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"
|
| 61 |
-
]
|
| 62 |
|
| 63 |
-
#
|
| 64 |
CSS = '''
|
| 65 |
-
body { background-color: #
|
| 66 |
-
.gradio-container { max-width:
|
| 67 |
-
.
|
|
|
|
|
|
|
|
|
|
| 68 |
'''
|
| 69 |
|
| 70 |
# Helpers
|
|
@@ -74,10 +75,8 @@ def numpy_to_mp3(audio_array, sampling_rate):
|
|
| 74 |
audio_array = (audio_array / max_val) * 32767
|
| 75 |
audio_array = audio_array.astype(np.int16)
|
| 76 |
segment = AudioSegment(
|
| 77 |
-
audio_array.tobytes(),
|
| 78 |
-
|
| 79 |
-
sample_width=audio_array.dtype.itemsize,
|
| 80 |
-
channels=1
|
| 81 |
)
|
| 82 |
mp3_io = io.BytesIO()
|
| 83 |
segment.export(mp3_io, format="mp3", bitrate="320k")
|
|
@@ -90,8 +89,7 @@ def transcribe_and_translate(audio_path, source_language, target_language):
|
|
| 90 |
inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
|
| 91 |
tgt = LANGUAGE_NAME_TO_CODE[target_language]
|
| 92 |
gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
|
| 93 |
-
|
| 94 |
-
return text
|
| 95 |
|
| 96 |
# TTS generation
|
| 97 |
def generate_tts(text, voice, finetuned=False):
|
|
@@ -117,40 +115,34 @@ def generate_tts(text, voice, finetuned=False):
|
|
| 117 |
combined = np.concatenate(all_audio)
|
| 118 |
return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
|
| 119 |
|
| 120 |
-
#
|
| 121 |
def pipeline(audio_path, source_language, target_language, voice, finetuned):
|
| 122 |
text = transcribe_and_translate(audio_path, source_language, target_language)
|
| 123 |
audio_bytes = generate_tts(text, voice, finetuned)
|
| 124 |
return text, audio_bytes
|
| 125 |
|
| 126 |
# Gradio UI
|
| 127 |
-
|
| 128 |
def build_ui():
|
| 129 |
with gr.Blocks(css=CSS) as demo:
|
| 130 |
-
gr.Markdown("
|
| 131 |
-
# Usage Steps
|
| 132 |
-
with gr.Column():
|
| 133 |
-
gr.HTML("<div class='step-box'><strong>Step 1:</strong> Upload or record your audio clip.</div>")
|
| 134 |
-
gr.HTML("<div class='step-box'><strong>Step 2:</strong> Select the source and target languages.</div>")
|
| 135 |
-
gr.HTML("<div class='step-box'><strong>Step 3:</strong> Choose a voice persona.</div>")
|
| 136 |
-
gr.HTML("<div class='step-box'><strong>Step 4:</strong> (Optional) Toggle fine-tuned TTS for more natural speech.</div>")
|
| 137 |
-
gr.HTML("<div class='step-box'><strong>Step 5:</strong> Click <em>Run</em> and view your text & audio results on the right.</div>")
|
| 138 |
with gr.Row():
|
| 139 |
-
with gr.Column(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
audio_in = gr.Audio(label="Input Audio", type="filepath")
|
| 141 |
src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
|
| 142 |
tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
|
| 143 |
voice = gr.Dropdown(VOICES, label="Voice Persona", value=VOICES[0])
|
| 144 |
finetune = gr.Checkbox(label="Use Fine-tuned TTS", value=False)
|
| 145 |
run_btn = gr.Button("Run", variant="primary")
|
| 146 |
-
with gr.Column(scale=1):
|
| 147 |
text_out = gr.Textbox(label="Translated Text")
|
| 148 |
audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
|
| 149 |
-
run_btn.click(
|
| 150 |
-
fn=pipeline,
|
| 151 |
-
inputs=[audio_in, src, tgt, voice, finetune],
|
| 152 |
-
outputs=[text_out, audio_out]
|
| 153 |
-
)
|
| 154 |
return demo
|
| 155 |
|
| 156 |
if __name__ == "__main__":
|
|
|
|
| 56 |
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
| 57 |
|
| 58 |
# Voice options - example speakers
|
| 59 |
+
VOICES = ["Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"]
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
# Dark theme CSS
|
| 62 |
CSS = '''
|
| 63 |
+
body { background-color: #1e1e2f; color: #ececec; }
|
| 64 |
+
.gradio-container { max-width: 1000px; margin: auto; padding: 20px; }
|
| 65 |
+
.left-panel { width: 40%; background: #252538; padding: 20px; border-radius: 8px; }
|
| 66 |
+
.right-panel { width: 58%; }
|
| 67 |
+
.gradio-row { display: flex; gap: 2%; }
|
| 68 |
+
.gradio-row .column { display: inline-block; vertical-align: top; }
|
| 69 |
'''
|
| 70 |
|
| 71 |
# Helpers
|
|
|
|
| 75 |
audio_array = (audio_array / max_val) * 32767
|
| 76 |
audio_array = audio_array.astype(np.int16)
|
| 77 |
segment = AudioSegment(
|
| 78 |
+
audio_array.tobytes(), frame_rate=sampling_rate,
|
| 79 |
+
sample_width=audio_array.dtype.itemsize, channels=1
|
|
|
|
|
|
|
| 80 |
)
|
| 81 |
mp3_io = io.BytesIO()
|
| 82 |
segment.export(mp3_io, format="mp3", bitrate="320k")
|
|
|
|
| 89 |
inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
|
| 90 |
tgt = LANGUAGE_NAME_TO_CODE[target_language]
|
| 91 |
gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
|
| 92 |
+
return tt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
|
|
|
| 93 |
|
| 94 |
# TTS generation
|
| 95 |
def generate_tts(text, voice, finetuned=False):
|
|
|
|
| 115 |
combined = np.concatenate(all_audio)
|
| 116 |
return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
|
| 117 |
|
| 118 |
+
# Pipeline
|
| 119 |
def pipeline(audio_path, source_language, target_language, voice, finetuned):
|
| 120 |
text = transcribe_and_translate(audio_path, source_language, target_language)
|
| 121 |
audio_bytes = generate_tts(text, voice, finetuned)
|
| 122 |
return text, audio_bytes
|
| 123 |
|
| 124 |
# Gradio UI
|
|
|
|
| 125 |
def build_ui():
|
| 126 |
with gr.Blocks(css=CSS) as demo:
|
| 127 |
+
gr.Markdown("# IndicSeamless + Parler-TTS Demo")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
with gr.Row():
|
| 129 |
+
with gr.Column(elem_classes="left-panel column"):
|
| 130 |
+
gr.Markdown("**How to Use:**")
|
| 131 |
+
gr.Markdown("1. Upload or record your audio clip.")
|
| 132 |
+
gr.Markdown("2. Select source & target languages.")
|
| 133 |
+
gr.Markdown("3. Choose a voice persona.")
|
| 134 |
+
gr.Markdown("4. (Optional) Toggle fine-tuned TTS.")
|
| 135 |
+
gr.Markdown("5. Click **Run** to see text & hear speech.")
|
| 136 |
+
with gr.Column(elem_classes="right-panel column"):
|
| 137 |
audio_in = gr.Audio(label="Input Audio", type="filepath")
|
| 138 |
src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
|
| 139 |
tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
|
| 140 |
voice = gr.Dropdown(VOICES, label="Voice Persona", value=VOICES[0])
|
| 141 |
finetune = gr.Checkbox(label="Use Fine-tuned TTS", value=False)
|
| 142 |
run_btn = gr.Button("Run", variant="primary")
|
|
|
|
| 143 |
text_out = gr.Textbox(label="Translated Text")
|
| 144 |
audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
|
| 145 |
+
run_btn.click(fn=pipeline, inputs=[audio_in, src, tgt, voice, finetune], outputs=[text_out, audio_out])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
return demo
|
| 147 |
|
| 148 |
if __name__ == "__main__":
|