Spaces:
Sleeping
Sleeping
| # Swahili Text‑to‑Speech Gradio App – MP3 Output (mobile‑friendly) | |
| # ----------------------------------------------------------------- | |
| # ‑ Generates clear Kiswahili speech and serves it **as an MP3 file** so that | |
| # iOS/Android browsers play it reliably. Uses a fine‑tuned VITS checkpoint. | |
| # | |
| #‑ Dependencies (add to requirements.txt): | |
| # torch, transformers, gradio, scipy, pydub | |
| import os | |
| import tempfile | |
| import torch | |
| import numpy as np | |
| import gradio as gr | |
| import scipy.io.wavfile as wavfile | |
| from pydub import AudioSegment | |
| from transformers import VitsModel, AutoTokenizer | |
| MODEL_NAME = "FarmerlineML/swahili-tts-2025" # tokenizer | |
| MODEL_CHECKPOINT = "FarmerlineML/Swahili-tts-2025_part4" # acoustic model | |
| device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # ---------- Load model ------------------------------------------------------ | |
| model = VitsModel.from_pretrained(MODEL_CHECKPOINT).to(device) | |
| model.eval() | |
| tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) | |
| # Apply clear‑speech inference parameters once (no UI toggle) | |
| model.noise_scale = 0.7 | |
| model.noise_scale_duration = 0.667 | |
| model.speaking_rate = 0.75 # must be >0 to avoid ZeroDivisionError | |
| # ---------- Helper ---------------------------------------------------------- | |
| def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str: | |
| """Convert int16 numpy waveform to an MP3 temp file, return its path.""" | |
| # Ensure int16 for pydub | |
| if wave_np.dtype != np.int16: | |
| # waveform from VITS is float32 in range [-1, 1]; scale and cast | |
| wave_np = (wave_np * 32767).astype(np.int16) | |
| with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf: | |
| wavfile.write(tf.name, sr, wave_np) | |
| wav_path = tf.name | |
| # Convert to mp3 via pydub (requires ffmpeg ‑ available in Spaces base img) | |
| mp3_path = wav_path.replace(".wav", ".mp3") | |
| AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3", bitrate="64k") | |
| os.remove(wav_path) # cleanup temp WAV | |
| return mp3_path | |
| # ---------- TTS endpoint ---------------------------------------------------- | |
| def tts_generate(text: str): | |
| if not text: | |
| return None | |
| inputs = tokenizer(text, return_tensors="pt").to(device) | |
| with torch.no_grad(): | |
| wave = model(**inputs).waveform[0].cpu().numpy() | |
| return _wav_to_mp3(wave, model.config.sampling_rate) | |
| # ---------- UI -------------------------------------------------------------- | |
| examples = [ | |
| ["zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea"], | |
| ["Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee."], | |
| ["hivyo imekuwa msingi wa teknolojia yote ya umeme hasa nyaya za kila aina"], | |
| ["kumekuwa na majadiliano mengi juu ya usahihi wa ripoti hizi za madeni"], | |
| ["na kusaga ulipoanzia baada ya kumaliza masomo ndugu ruge mutahaba ndipo sasa mwishoni mwa"], | |
| ["Soko la Kariakoo huwa na watu wengi siku za Jumamosi."], | |
| ["Tafadhali hakikisha umefunga mlango kabla ya kuondoka."], | |
| ["Watoto walicheza mpira uwanjani hadi jua lilipotua."], | |
| ] | |
| demo = gr.Interface( | |
| fn=tts_generate, | |
| inputs=gr.Textbox(lines=3, placeholder="Enter Swahili text here", label="Enter Swahili text here"), | |
| outputs=gr.Audio(type="filepath", label="Audio", autoplay=True), | |
| title="Swahili Text‑to‑Speech", | |
| description=( | |
| "Enter Swahili text and click **Submit** to play the audio" | |
| ), | |
| examples=examples, | |
| cache_examples=True, | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch() | |