rohanmiriyala committed on
Commit
bb3697e
·
verified ·
1 Parent(s): 22824ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +22 -30
app.py CHANGED
@@ -56,15 +56,16 @@ description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
56
  tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
57
 
58
  # Voice options - example speakers
59
- VOICES = [
60
- "Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"
61
- ]
62
 
63
- # Custom CSS for visual styling
64
  CSS = '''
65
- body { background-color: #f9fafb; }
66
- .gradio-container { max-width: 900px; margin: auto; padding: 20px; }
67
- .step-box { background: #ffffff; border-radius: 12px; padding: 16px; margin-bottom: 12px; box-shadow: 0 2px 8px rgba(0,0,0,0.1); }
 
 
 
68
  '''
69
 
70
  # Helpers
@@ -74,10 +75,8 @@ def numpy_to_mp3(audio_array, sampling_rate):
74
  audio_array = (audio_array / max_val) * 32767
75
  audio_array = audio_array.astype(np.int16)
76
  segment = AudioSegment(
77
- audio_array.tobytes(),
78
- frame_rate=sampling_rate,
79
- sample_width=audio_array.dtype.itemsize,
80
- channels=1
81
  )
82
  mp3_io = io.BytesIO()
83
  segment.export(mp3_io, format="mp3", bitrate="320k")
@@ -90,8 +89,7 @@ def transcribe_and_translate(audio_path, source_language, target_language):
90
  inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
91
  tgt = LANGUAGE_NAME_TO_CODE[target_language]
92
  gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
93
- text = tt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
94
- return text
95
 
96
  # TTS generation
97
  def generate_tts(text, voice, finetuned=False):
@@ -117,40 +115,34 @@ def generate_tts(text, voice, finetuned=False):
117
  combined = np.concatenate(all_audio)
118
  return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
119
 
120
- # Combined pipeline to reduce duplicate STT calls
121
  def pipeline(audio_path, source_language, target_language, voice, finetuned):
122
  text = transcribe_and_translate(audio_path, source_language, target_language)
123
  audio_bytes = generate_tts(text, voice, finetuned)
124
  return text, audio_bytes
125
 
126
  # Gradio UI
127
-
128
  def build_ui():
129
  with gr.Blocks(css=CSS) as demo:
130
- gr.Markdown("🎙️ AUDIO TRANSLATION 🎙️")
131
- # Usage Steps
132
- with gr.Column():
133
- gr.HTML("<div class='step-box'><strong>Step 1:</strong> Upload or record your audio clip.</div>")
134
- gr.HTML("<div class='step-box'><strong>Step 2:</strong> Select the source and target languages.</div>")
135
- gr.HTML("<div class='step-box'><strong>Step 3:</strong> Choose a voice persona.</div>")
136
- gr.HTML("<div class='step-box'><strong>Step 4:</strong> (Optional) Toggle fine-tuned TTS for more natural speech.</div>")
137
- gr.HTML("<div class='step-box'><strong>Step 5:</strong> Click <em>Run</em> and view your text & audio results on the right.</div>")
138
  with gr.Row():
139
- with gr.Column(scale=1):
 
 
 
 
 
 
 
140
  audio_in = gr.Audio(label="Input Audio", type="filepath")
141
  src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
142
  tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
143
  voice = gr.Dropdown(VOICES, label="Voice Persona", value=VOICES[0])
144
  finetune = gr.Checkbox(label="Use Fine-tuned TTS", value=False)
145
  run_btn = gr.Button("Run", variant="primary")
146
- with gr.Column(scale=1):
147
  text_out = gr.Textbox(label="Translated Text")
148
  audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
149
- run_btn.click(
150
- fn=pipeline,
151
- inputs=[audio_in, src, tgt, voice, finetune],
152
- outputs=[text_out, audio_out]
153
- )
154
  return demo
155
 
156
  if __name__ == "__main__":
 
56
# Feature extractor paired with the TTS model checkpoint (repo_id is
# defined earlier in the file — presumably a Parler-TTS repo; confirm).
tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)

# Voice options - example speakers. These persona names are passed to the
# TTS generation step (see generate_tts) and shown in the UI dropdown.
VOICES = ["Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"]

# Dark theme CSS, injected into the Gradio Blocks app via gr.Blocks(css=CSS).
# The .left-panel / .right-panel / .column class names are attached to the
# layout columns through elem_classes in build_ui, so these selectors and
# that code must stay in sync.
CSS = '''
body { background-color: #1e1e2f; color: #ececec; }
.gradio-container { max-width: 1000px; margin: auto; padding: 20px; }
.left-panel { width: 40%; background: #252538; padding: 20px; border-radius: 8px; }
.right-panel { width: 58%; }
.gradio-row { display: flex; gap: 2%; }
.gradio-row .column { display: inline-block; vertical-align: top; }
'''
70
 
71
  # Helpers
 
75
  audio_array = (audio_array / max_val) * 32767
76
  audio_array = audio_array.astype(np.int16)
77
  segment = AudioSegment(
78
+ audio_array.tobytes(), frame_rate=sampling_rate,
79
+ sample_width=audio_array.dtype.itemsize, channels=1
 
 
80
  )
81
  mp3_io = io.BytesIO()
82
  segment.export(mp3_io, format="mp3", bitrate="320k")
 
89
  inputs = feature_extractor(wav, sampling_rate=SAMPLE_RATE, return_tensors="pt").to(DEVICE, DTYPE)
90
  tgt = LANGUAGE_NAME_TO_CODE[target_language]
91
  gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
92
+ return tt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
93
 
94
  # TTS generation
95
  def generate_tts(text, voice, finetuned=False):
 
115
  combined = np.concatenate(all_audio)
116
  return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
117
 
118
# Pipeline
def pipeline(audio_path, source_language, target_language, voice, finetuned):
    """Run the full speech-to-speech pipeline.

    Transcribe/translate the input audio into the target language, then
    synthesize the resulting text with the selected voice persona.
    Returns a (translated_text, mp3_audio_bytes) pair for the UI outputs.
    """
    translated = transcribe_and_translate(audio_path, source_language, target_language)
    return translated, generate_tts(translated, voice, finetuned)
123
 
124
# Gradio UI
def build_ui():
    """Construct and return the Gradio Blocks demo.

    Layout: a header, then a row with a static instructions column on the
    left and an interactive column on the right holding the inputs (audio,
    source/target language, voice persona, fine-tune toggle) and the
    outputs (translated text, synthesized MP3). The Run button wires
    `pipeline` to those inputs and outputs.

    Returns:
        gr.Blocks: the assembled demo, ready for `.launch()`.
    """
    # NOTE: component declaration order defines on-screen order — do not
    # reorder these statements without intending a layout change.
    with gr.Blocks(css=CSS) as demo:
        gr.Markdown("# IndicSeamless + Parler-TTS Demo")
        with gr.Row():
            # Left column: static usage instructions; styled by the
            # .left-panel / .column rules in the module-level CSS.
            with gr.Column(elem_classes="left-panel column"):
                gr.Markdown("**How to Use:**")
                gr.Markdown("1. Upload or record your audio clip.")
                gr.Markdown("2. Select source & target languages.")
                gr.Markdown("3. Choose a voice persona.")
                gr.Markdown("4. (Optional) Toggle fine-tuned TTS.")
                gr.Markdown("5. Click **Run** to see text & hear speech.")
            # Right column: interactive controls plus result widgets.
            with gr.Column(elem_classes="right-panel column"):
                audio_in = gr.Audio(label="Input Audio", type="filepath")
                src = gr.Dropdown(ASR_TARGET_LANGUAGE_NAMES, label="Source Language", value="English")
                tgt = gr.Dropdown(S2TT_TARGET_LANGUAGE_NAMES, label="Target Language", value="English")
                voice = gr.Dropdown(VOICES, label="Voice Persona", value=VOICES[0])
                finetune = gr.Checkbox(label="Use Fine-tuned TTS", value=False)
                run_btn = gr.Button("Run", variant="primary")
                text_out = gr.Textbox(label="Translated Text")
                audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
        # Event wiring: one click runs STT+translation then TTS (pipeline).
        run_btn.click(fn=pipeline, inputs=[audio_in, src, tgt, voice, finetune], outputs=[text_out, audio_out])
    return demo
147
 
148
  if __name__ == "__main__":