# Swahili Text-to-Speech Gradio App - MP3 Output (mobile-friendly)
# -----------------------------------------------------------------
# Generates clear Kiswahili speech and serves it **as an MP3 file** so that
# iOS/Android browsers play it reliably. Uses a fine-tuned VITS checkpoint.
#
# Dependencies (add to requirements.txt):
#   torch, transformers, gradio, scipy, pydub

import os
import tempfile

import torch
import numpy as np
import gradio as gr
import scipy.io.wavfile as wavfile
from pydub import AudioSegment
from transformers import VitsModel, AutoTokenizer

MODEL_NAME = "FarmerlineML/swahili-tts-2025"              # tokenizer
MODEL_CHECKPOINT = "FarmerlineML/Swahili-tts-2025_part4"  # acoustic model

device = "cuda" if torch.cuda.is_available() else "cpu"

# ---------- Load model ------------------------------------------------------
model = VitsModel.from_pretrained(MODEL_CHECKPOINT).to(device)
model.eval()
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Apply clear-speech inference parameters once (no UI toggle)
model.noise_scale = 0.7
model.noise_scale_duration = 0.667
model.speaking_rate = 0.75  # must be > 0 to avoid a ZeroDivisionError


# ---------- Helper ----------------------------------------------------------
def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str:
    """Convert a numpy waveform to a temporary MP3 file and return its path."""
    # pydub expects int16 samples
    if wave_np.dtype != np.int16:
        # VITS outputs float32 in [-1, 1]; scale and cast
        wave_np = (wave_np * 32767).astype(np.int16)
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
        wavfile.write(tf.name, sr, wave_np)
        wav_path = tf.name
    # Convert to MP3 via pydub (requires ffmpeg, available in the Spaces base image)
    mp3_path = wav_path.replace(".wav", ".mp3")
    AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3", bitrate="64k")
    os.remove(wav_path)  # clean up the temporary WAV
    return mp3_path


# ---------- TTS endpoint ----------------------------------------------------
def tts_generate(text: str):
    if not text:
        return None
    inputs = tokenizer(text, return_tensors="pt").to(device)
    with torch.no_grad():
        wave = model(**inputs).waveform[0].cpu().numpy()
    return _wav_to_mp3(wave, model.config.sampling_rate)


# ---------- UI --------------------------------------------------------------
examples = [
    ["zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea"],
    ["Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee."],
    ["hivyo imekuwa msingi wa teknolojia yote ya umeme hasa nyaya za kila aina"],
    ["kumekuwa na majadiliano mengi juu ya usahihi wa ripoti hizi za madeni"],
    ["na kusaga ulipoanzia baada ya kumaliza masomo ndugu ruge mutahaba ndipo sasa mwishoni mwa"],
    ["Soko la Kariakoo huwa na watu wengi siku za Jumamosi."],
    ["Tafadhali hakikisha umefunga mlango kabla ya kuondoka."],
    ["Watoto walicheza mpira uwanjani hadi jua lilipotua."],
]

demo = gr.Interface(
    fn=tts_generate,
    inputs=gr.Textbox(lines=3, placeholder="Enter Swahili text here", label="Swahili text"),
    outputs=gr.Audio(type="filepath", label="Audio", autoplay=True),
    title="Swahili Text-to-Speech",
    description="Enter Swahili text and click **Submit** to play the audio.",
    examples=examples,
    cache_examples=True,
)

if __name__ == "__main__":
    demo.launch()
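

# ---------- Programmatic usage (optional) ------------------------------------
# A minimal sketch of how a client could call the running app with the
# `gradio_client` package (an extra dependency, not listed above). The URL and
# endpoint name here are assumptions: demo.launch() prints the actual local
# URL, and gr.Interface exposes its function at "/predict" by default.
#
#   from gradio_client import Client
#
#   client = Client("http://127.0.0.1:7860/")   # hypothetical local URL
#   mp3_path = client.predict(
#       "Habari ya asubuhi!",                   # Swahili input text
#       api_name="/predict",
#   )
#   print(mp3_path)                             # local path to the generated MP3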