Spaces: Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -51,16 +51,9 @@ tts_tokenizer = AutoTokenizer.from_pretrained(repo_id)
|
|
| 51 |
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
|
| 52 |
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
| 53 |
|
| 54 |
-
VOICES = [
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
body { background-color: #1e1e2f; color: #ececec; }
|
| 58 |
-
.gradio-container { max-width: 1000px; margin: auto; padding: 20px; }
|
| 59 |
-
.left-panel { width: 40%; background: #252538; padding: 20px; border-radius: 8px; }
|
| 60 |
-
.right-panel { width: 58%; }
|
| 61 |
-
.gradio-row { display: flex; gap: 2%; }
|
| 62 |
-
.gradio-row .column { display: inline-block; vertical-align: top; }
|
| 63 |
-
'''
|
| 64 |
|
| 65 |
def numpy_to_mp3(audio_array, sampling_rate):
|
| 66 |
if np.issubdtype(audio_array.dtype, np.floating):
|
|
@@ -68,8 +61,10 @@ def numpy_to_mp3(audio_array, sampling_rate):
|
|
| 68 |
audio_array = (audio_array / max_val) * 32767
|
| 69 |
audio_array = audio_array.astype(np.int16)
|
| 70 |
segment = AudioSegment(
|
| 71 |
-
audio_array.tobytes(),
|
| 72 |
-
|
|
|
|
|
|
|
| 73 |
)
|
| 74 |
mp3_io = io.BytesIO()
|
| 75 |
segment.export(mp3_io, format="mp3", bitrate="320k")
|
|
@@ -81,7 +76,8 @@ def transcribe_and_translate(audio_path, source_language, target_language):
|
|
| 81 |
inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
|
| 82 |
tgt = LANGUAGE_NAME_TO_CODE[target_language]
|
| 83 |
gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
|
| 84 |
-
|
|
|
|
| 85 |
|
| 86 |
def generate_tts(text, voice, finetuned=False):
|
| 87 |
description = f"{voice} speaks in a neutral tone with clear audio."
|
|
@@ -112,28 +108,34 @@ def pipeline(audio_path, source_language, target_language, voice, finetuned):
|
|
| 112 |
return text, audio_bytes
|
| 113 |
|
| 114 |
def build_ui():
|
| 115 |
-
with gr.Blocks(
|
| 116 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
with gr.Row():
|
| 118 |
-
with gr.Column(
|
| 119 |
-
gr.Markdown("**How to Use:**")
|
| 120 |
-
gr.Markdown("1. Upload or record your audio clip.")
|
| 121 |
-
gr.Markdown("2. Select source & target languages.")
|
| 122 |
-
gr.Markdown("3. Choose a voice persona.")
|
| 123 |
-
gr.Markdown("4. (Optional) Toggle fine-tuned TTS.")
|
| 124 |
-
gr.Markdown("5. Click **Run** to see text & hear speech.")
|
| 125 |
-
with gr.Column(elem_classes="right-panel column"):
|
| 126 |
audio_in = gr.Audio(label="Input Audio", type="filepath")
|
| 127 |
src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
|
| 128 |
tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
|
| 129 |
-
voice = gr.Dropdown(VOICES, label="Voice
|
| 130 |
-
finetune = gr.Checkbox(label="Use
|
| 131 |
-
run_btn = gr.Button("Run"
|
|
|
|
| 132 |
text_out = gr.Textbox(label="Translated Text")
|
| 133 |
audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
|
| 134 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
return demo
|
| 136 |
-
|
| 137 |
if __name__ == "__main__":
|
| 138 |
ui = build_ui()
|
| 139 |
ui.launch(share=True)
|
|
|
|
| 51 |
description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
|
| 52 |
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
|
| 53 |
|
| 54 |
+
VOICES = [
|
| 55 |
+
"Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"
|
| 56 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
def numpy_to_mp3(audio_array, sampling_rate):
|
| 59 |
if np.issubdtype(audio_array.dtype, np.floating):
|
|
|
|
| 61 |
audio_array = (audio_array / max_val) * 32767
|
| 62 |
audio_array = audio_array.astype(np.int16)
|
| 63 |
segment = AudioSegment(
|
| 64 |
+
audio_array.tobytes(),
|
| 65 |
+
frame_rate=sampling_rate,
|
| 66 |
+
sample_width=audio_array.dtype.itemsize,
|
| 67 |
+
channels=1
|
| 68 |
)
|
| 69 |
mp3_io = io.BytesIO()
|
| 70 |
segment.export(mp3_io, format="mp3", bitrate="320k")
|
|
|
|
| 76 |
inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
|
| 77 |
tgt = LANGUAGE_NAME_TO_CODE[target_language]
|
| 78 |
gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
|
| 79 |
+
text = tt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
|
| 80 |
+
return text
|
| 81 |
|
| 82 |
def generate_tts(text, voice, finetuned=False):
|
| 83 |
description = f"{voice} speaks in a neutral tone with clear audio."
|
|
|
|
| 108 |
return text, audio_bytes
|
| 109 |
|
| 110 |
def build_ui():
|
| 111 |
+
with gr.Blocks() as demo:
|
| 112 |
+
gr.Markdown("🎙AUDIO TRANSLATOR🎙")
|
| 113 |
+
gr.Markdown(" ")
|
| 114 |
+
gr.Markdown("How to Use:")
|
| 115 |
+
gr.Markdown("1. Upload or record your audio clip.")
|
| 116 |
+
gr.Markdown("2. Select source & target languages.")
|
| 117 |
+
gr.Markdown("3. Choose a voice persona.")
|
| 118 |
+
gr.Markdown("4. (Optional) Toggle fine-tuned TTS (for better speech).")
|
| 119 |
+
gr.Markdown("5. Click \"Run\" for translated text & speech.")
|
| 120 |
with gr.Row():
|
| 121 |
+
with gr.Column():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
audio_in = gr.Audio(label="Input Audio", type="filepath")
|
| 123 |
src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
|
| 124 |
tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
|
| 125 |
+
voice = gr.Dropdown(VOICES, label="Voice", value=VOICES[0])
|
| 126 |
+
finetune = gr.Checkbox(label="Use Finetuned TTS", value=False)
|
| 127 |
+
run_btn = gr.Button("Run")
|
| 128 |
+
with gr.Column():
|
| 129 |
text_out = gr.Textbox(label="Translated Text")
|
| 130 |
audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
|
| 131 |
+
|
| 132 |
+
run_btn.click(
|
| 133 |
+
fn=pipeline,
|
| 134 |
+
inputs=[audio_in, src, tgt, voice, finetune],
|
| 135 |
+
outputs=[text_out, audio_out]
|
| 136 |
+
)
|
| 137 |
return demo
|
| 138 |
+
|
| 139 |
if __name__ == "__main__":
|
| 140 |
ui = build_ui()
|
| 141 |
ui.launch(share=True)
|