Spaces:

FarmerlineML
/

swahili-tts-2025

Sleeping

App Files Files Community

swahili-tts-2025 / app.py

FarmerlineML

Update app.py

ad17ccb verified 4 months ago

raw

history blame contribute delete

3.55 kB

	# Swahili Text‑to‑Speech Gradio App – MP3 Output (mobile‑friendly)
	# -----------------------------------------------------------------
	# - Generates clear Kiswahili speech and serves it as an MP3 file so that
	#   iOS/Android browsers play it reliably. Uses a fine-tuned VITS checkpoint.
	#
	# - Dependencies (add to requirements.txt):
	#   torch, transformers, gradio, numpy, scipy, pydub
	#   (pydub additionally needs the ffmpeg binary for the MP3 export)

	import os
	import tempfile
	import torch
	import numpy as np
	import gradio as gr
	import scipy.io.wavfile as wavfile
	from pydub import AudioSegment
	from transformers import VitsModel, AutoTokenizer

	# The tokenizer and the acoustic model are loaded from two different
	# model identifiers.
	MODEL_NAME = "FarmerlineML/swahili-tts-2025"  # tokenizer
	MODEL_CHECKPOINT = "FarmerlineML/Swahili-tts-2025_part4"  # acoustic model

	# Use the GPU when torch reports one, otherwise fall back to the CPU.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"

	# ---------- Load model ------------------------------------------------------
	# Move the weights to the chosen device and switch to eval mode in one go;
	# both calls return the module itself.
	model = VitsModel.from_pretrained(MODEL_CHECKPOINT).to(device).eval()

	tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

	# Clear-speech inference parameters, set once at import time (there is no UI
	# control for them).
	model.speaking_rate = 0.75  # must stay > 0 to avoid ZeroDivisionError
	model.noise_scale = 0.7
	model.noise_scale_duration = 0.667

	# ---------- Helper ----------------------------------------------------------

	def _wav_to_mp3(wave_np: np.ndarray, sr: int) -> str:
	    """Encode a waveform as a 64 kbit/s MP3 temp file and return its path.

	    Args:
	        wave_np: Audio samples. int16 data is used as is; any other dtype is
	            treated as float audio in [-1, 1], clipped to that range and
	            scaled to int16.
	        sr: Sampling rate in Hz, written into the intermediate WAV file.

	    Returns:
	        Path of the generated MP3 file. The file is not deleted here.

	    Raises:
	        Whatever ``scipy.io.wavfile.write`` or pydub's export raise; the
	        intermediate WAV file is removed in that case as well.
	    """
	    if wave_np.dtype != np.int16:
	        # Clip before scaling: samples even slightly outside [-1, 1] would
	        # overflow the int16 range in the cast and corrupt the audio.
	        wave_np = (np.clip(wave_np, -1.0, 1.0) * 32767).astype(np.int16)

	    # Only reserve a unique name here; the handle is closed before the file
	    # is written so it is never open twice at the same time.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tf:
	        wav_path = tf.name

	    # Swap just the extension. str.replace would also rewrite a ".wav" that
	    # happens to occur earlier in the path.
	    mp3_path = os.path.splitext(wav_path)[0] + ".mp3"

	    try:
	        wavfile.write(wav_path, sr, wave_np)
	        # Convert to mp3 via pydub (requires ffmpeg).
	        AudioSegment.from_wav(wav_path).export(mp3_path, format="mp3", bitrate="64k")
	    finally:
	        # Remove the temp WAV even when writing or encoding fails.
	        os.remove(wav_path)
	    return mp3_path


	# ---------- TTS endpoint ----------------------------------------------------

	def tts_generate(text: str):
	    """Synthesize speech for ``text`` and return the path of the MP3 file.

	    Args:
	        text: Text to speak, as entered in the UI.

	    Returns:
	        Path of the generated MP3 file, or ``None`` when ``text`` is
	        ``None``, empty or consists only of whitespace.
	    """
	    # Whitespace-only input has nothing to speak; treat it like empty input
	    # instead of running the model on it.
	    if not text or not text.strip():
	        return None

	    inputs = tokenizer(text, return_tensors="pt").to(device)
	    with torch.no_grad():  # inference only, no gradients needed
	        # First item of the batch, moved to the CPU as a numpy array.
	        wave = model(**inputs).waveform[0].cpu().numpy()

	    return _wav_to_mp3(wave, model.config.sampling_rate)

	# ---------- UI --------------------------------------------------------------

	# Sample sentences offered in the UI; each becomes one single-column
	# example row.
	_EXAMPLE_SENTENCES = (
	    "zao kusaidia kuondoa umaskini na kujenga kampeni za mwamko wa virusi vya ukimwi amezitembelea",
	    "Kidole hiki ni tofauti na vidole vingine kwa sababu mwelekeo wake ni wa pekee.",
	    "hivyo imekuwa msingi wa teknolojia yote ya umeme hasa nyaya za kila aina",
	    "kumekuwa na majadiliano mengi juu ya usahihi wa ripoti hizi za madeni",
	    "na kusaga ulipoanzia baada ya kumaliza masomo ndugu ruge mutahaba ndipo sasa mwishoni mwa",
	    "Soko la Kariakoo huwa na watu wengi siku za Jumamosi.",
	    "Tafadhali hakikisha umefunga mlango kabla ya kuondoka.",
	    "Watoto walicheza mpira uwanjani hadi jua lilipotua.",
	)
	examples = [[sentence] for sentence in _EXAMPLE_SENTENCES]

	# Input: a three-line text box. Output: an audio player fed with a file path
	# (the MP3 path returned by tts_generate), set to autoplay.
	text_input = gr.Textbox(
	    lines=3,
	    placeholder="Enter Swahili text here",
	    label="Enter Swahili text here",
	)
	audio_output = gr.Audio(type="filepath", label="Audio", autoplay=True)

	demo = gr.Interface(
	    fn=tts_generate,
	    inputs=text_input,
	    outputs=audio_output,
	    title="Swahili Text‑to‑Speech",
	    description="Enter Swahili text and click Submit to play the audio",
	    examples=examples,
	    cache_examples=True,
	)

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    demo.launch()