rohanmiriyala committed on
Commit
467303b
·
verified ·
1 Parent(s): bb3697e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +1 -12
app.py CHANGED
@@ -16,15 +16,12 @@ import nltk
16
  from parler_tts import ParlerTTSForConditionalGeneration
17
  from lang_list import LANGUAGE_NAME_TO_CODE, ASR_TARGET_LANGUAGE_NAMES, S2TT_TARGET_LANGUAGE_NAMES
18
 
19
- # Download punkt for sentence splitting
20
  nltk.download('punkt_tab')
21
 
22
- # Device and dtype
23
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
24
  DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
25
  SAMPLE_RATE = 16000
26
 
27
- # Load speech-to-text model
28
  stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
29
  "ai4bharat/indic-seamless",
30
  torch_dtype=DTYPE
@@ -36,7 +33,6 @@ tt_tokenizer = SeamlessM4TTokenizer.from_pretrained(
36
  "ai4bharat/indic-seamless"
37
  )
38
 
39
- # Load TTS models
40
  repo_id = "ai4bharat/indic-parler-tts-pretrained"
41
  finetuned_repo_id = "ai4bharat/indic-parler-tts"
42
 
@@ -55,10 +51,8 @@ tts_tokenizer = AutoTokenizer.from_pretrained(repo_id)
55
  description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
56
  tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
57
 
58
- # Voice options - example speakers
59
  VOICES = ["Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"]
60
 
61
- # Dark theme CSS
62
  CSS = '''
63
  body { background-color: #1e1e2f; color: #ececec; }
64
  .gradio-container { max-width: 1000px; margin: auto; padding: 20px; }
@@ -68,7 +62,6 @@ body { background-color: #1e1e2f; color: #ececec; }
68
  .gradio-row .column { display: inline-block; vertical-align: top; }
69
  '''
70
 
71
- # Helpers
72
  def numpy_to_mp3(audio_array, sampling_rate):
73
  if np.issubdtype(audio_array.dtype, np.floating):
74
  max_val = np.max(np.abs(audio_array))
@@ -82,7 +75,6 @@ def numpy_to_mp3(audio_array, sampling_rate):
82
  segment.export(mp3_io, format="mp3", bitrate="320k")
83
  return mp3_io.getvalue()
84
 
85
- # STT / Translation
86
  def transcribe_and_translate(audio_path, source_language, target_language):
87
  wav, orig_sr = torchaudio.load(audio_path)
88
  wav = torchaudio.functional.resample(wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
@@ -91,7 +83,6 @@ def transcribe_and_translate(audio_path, source_language, target_language):
91
  gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
92
  return tt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
93
 
94
- # TTS generation
95
  def generate_tts(text, voice, finetuned=False):
96
  description = f"{voice} speaks in a neutral tone with clear audio."
97
  sentences = nltk.sent_tokenize(text)
@@ -115,13 +106,11 @@ def generate_tts(text, voice, finetuned=False):
115
  combined = np.concatenate(all_audio)
116
  return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
117
 
118
- # Pipeline
119
  def pipeline(audio_path, source_language, target_language, voice, finetuned):
120
  text = transcribe_and_translate(audio_path, source_language, target_language)
121
  audio_bytes = generate_tts(text, voice, finetuned)
122
  return text, audio_bytes
123
 
124
- # Gradio UI
125
  def build_ui():
126
  with gr.Blocks(css=CSS) as demo:
127
  gr.Markdown("# IndicSeamless + Parler-TTS Demo")
@@ -144,7 +133,7 @@ def build_ui():
144
  audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
145
  run_btn.click(fn=pipeline, inputs=[audio_in, src, tgt, voice, finetune], outputs=[text_out, audio_out])
146
  return demo
147
-
148
  if __name__ == "__main__":
149
  ui = build_ui()
150
  ui.launch(share=True)
 
16
  from parler_tts import ParlerTTSForConditionalGeneration
17
  from lang_list import LANGUAGE_NAME_TO_CODE, ASR_TARGET_LANGUAGE_NAMES, S2TT_TARGET_LANGUAGE_NAMES
18
 
 
19
  nltk.download('punkt_tab')
20
 
 
21
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
22
  DTYPE = torch.bfloat16 if DEVICE != "cpu" else torch.float32
23
  SAMPLE_RATE = 16000
24
 
 
25
  stt_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(
26
  "ai4bharat/indic-seamless",
27
  torch_dtype=DTYPE
 
33
  "ai4bharat/indic-seamless"
34
  )
35
 
 
36
  repo_id = "ai4bharat/indic-parler-tts-pretrained"
37
  finetuned_repo_id = "ai4bharat/indic-parler-tts"
38
 
 
51
  description_tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
52
  tts_feature_extractor = AutoFeatureExtractor.from_pretrained(repo_id)
53
 
 
54
  VOICES = ["Sunita", "Suresh", "Aditi", "Prakash", "Rohit", "Anjali", "Jaya"]
55
 
 
56
  CSS = '''
57
  body { background-color: #1e1e2f; color: #ececec; }
58
  .gradio-container { max-width: 1000px; margin: auto; padding: 20px; }
 
62
  .gradio-row .column { display: inline-block; vertical-align: top; }
63
  '''
64
 
 
65
  def numpy_to_mp3(audio_array, sampling_rate):
66
  if np.issubdtype(audio_array.dtype, np.floating):
67
  max_val = np.max(np.abs(audio_array))
 
75
  segment.export(mp3_io, format="mp3", bitrate="320k")
76
  return mp3_io.getvalue()
77
 
 
78
  def transcribe_and_translate(audio_path, source_language, target_language):
79
  wav, orig_sr = torchaudio.load(audio_path)
80
  wav = torchaudio.functional.resample(wav, orig_freq=orig_sr, new_freq=SAMPLE_RATE)
 
83
  gen = stt_model.generate(**inputs, tgt_lang=tgt)[0]
84
  return tt_tokenizer.decode(gen, skip_special_tokens=True, clean_up_tokenization_spaces=True)
85
 
 
86
  def generate_tts(text, voice, finetuned=False):
87
  description = f"{voice} speaks in a neutral tone with clear audio."
88
  sentences = nltk.sent_tokenize(text)
 
106
  combined = np.concatenate(all_audio)
107
  return numpy_to_mp3(combined, tts_feature_extractor.sampling_rate)
108
 
 
109
  def pipeline(audio_path, source_language, target_language, voice, finetuned):
110
  text = transcribe_and_translate(audio_path, source_language, target_language)
111
  audio_bytes = generate_tts(text, voice, finetuned)
112
  return text, audio_bytes
113
 
 
114
  def build_ui():
115
  with gr.Blocks(css=CSS) as demo:
116
  gr.Markdown("# IndicSeamless + Parler-TTS Demo")
 
133
  audio_out = gr.Audio(label="Synthesized Speech", format="mp3")
134
  run_btn.click(fn=pipeline, inputs=[audio_in, src, tgt, voice, finetune], outputs=[text_out, audio_out])
135
  return demo
136
+
137
  if __name__ == "__main__":
138
  ui = build_ui()
139
  ui.launch(share=True)