# Bangla-TTS-Pro / app.py
# Uploaded by mdhosainp414 -- commit "Update app.py" (5e57a2e, verified)
import gradio as gr
import torch
import os
import soundfile as sf
import numpy as np
from transformers import (
AutoProcessor, AutoModel, # <--- CHANGED TO AutoModel
MmsCtcTokenizer, MmsForCtc,
VitsForConditionalGeneration, VitsTokenizer # For Bangla ViT-TTS
)
from datasets import Audio, Dataset
from huggingface_hub import HfFolder, login
import torchaudio
import librosa
import subprocess # For banglatransliterator and other CLI tools if needed
import re # For text normalization
from dataclasses import dataclass
from typing import Any, Dict, List, Union
# --- Configuration and Global Variables ---
# Compute device: use the GPU whenever CUDA reports itself as available.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Optional Hugging Face token (needed for private models or pushing).
# Best supplied as a Space Secret named HF_TOKEN.
HF_TOKEN = os.environ.get("HF_TOKEN")
if not HF_TOKEN:
    print("HF_TOKEN not found. Some functionalities (like pushing models) might be limited.")
else:
    try:
        login(token=HF_TOKEN)
    except Exception as exc:
        print(f"Failed to log into Hugging Face Hub. Please check your token or internet connection: {exc}")
    else:
        print("Logged into Hugging Face Hub.")

# --- Model Caching and Dynamic Loading ---
# Models are loaded lazily to save memory; this dict maps a model name to the
# objects that were loaded for it (see load_model / unload_model).
loaded_models = {}

# Reference speaker clip used by XTTS-v2 when nothing else is supplied.
# The file is expected in the root directory of the Space.
DEFAULT_XTTS_REFERENCE_SPEAKER = "reference_speaker.wav"
# Back-ends that are not integrated yet: name -> (log message, placeholder value).
_PLACEHOLDER_MODELS = {
    "OpenVoice-v2": (
        "OpenVoice v2 requires specific setup. Using a placeholder for embedding extraction.",
        "placeholder_openvoice_v2",
    ),
    "VoiceBlender": (
        "VoiceBlender requires specific setup. Using a placeholder.",
        "placeholder_voiceblender",
    ),
    "SNAC-Vocoder": (
        "SNAC Vocoder requires specific setup. Using a placeholder.",
        "placeholder_snac_vocoder",
    ),
    "Orpheus-TTS": (
        "Orpheus TTS requires specific setup. Using a placeholder for emotion tagging.",
        "placeholder_orpheus_tts",
    ),
}


def _to_half_precision(model, model_name):
    """Return *model* converted to float16 on CUDA; unchanged on CPU or on failure."""
    if DEVICE != "cuda":
        return model
    try:
        model = model.half().to(DEVICE)
        print(f"Applied half-precision (float16) to {model_name}")
    except Exception as e:
        # Best effort only: keep the full-precision model if the cast fails.
        print(f"Could not apply half-precision to {model_name}: {e}")
    return model


def load_model(model_name):
    """Load a model (and its processor/tokenizer) by name, caching the result.

    Args:
        model_name: One of "XTTS-v2", "StyleTTS2", "MMS-TTS", "OpenVoice-v2",
            "VoiceBlender", "Bangla-ViT-TTS", "Parselmouth", "SNAC-Vocoder"
            or "Orpheus-TTS".

    Returns:
        The cache entry for the model: a dict holding some of the keys
        "model", "processor", "tokenizer" or "library". Returns None when the
        name is unknown or loading fails (the error is printed, not raised),
        and nothing is cached in that case.
    """
    if model_name in loaded_models:
        print(f"Model {model_name} already loaded.")
        return loaded_models[model_name]

    print(f"Loading model: {model_name}...")
    try:
        if model_name == "XTTS-v2":
            # Coqui XTTS-v2; requires the `TTS` library and downloads the
            # weights on first use.
            from TTS.api import TTS

            entry = {"model": TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)}
        elif model_name == "StyleTTS2":
            processor = AutoProcessor.from_pretrained("yl4579/StyleTTS2")
            model = AutoModel.from_pretrained("yl4579/StyleTTS2").to(DEVICE)
            entry = {"processor": processor, "model": _to_half_precision(model, model_name)}
        elif model_name == "MMS-TTS":
            # MMS-TTS checkpoints are VITS models, so they are loaded with the
            # VITS classes from Transformers.
            # NOTE(review): the file-level import of MmsCtcTokenizer/MmsForCtc
            # does not look like a real Transformers export -- confirm and drop it.
            # English checkpoint used for demonstration; a Bangla checkpoint
            # (presumably facebook/mms-tts-ben -- verify on the Hub) would be
            # the natural replacement.
            from transformers import VitsModel, VitsTokenizer

            tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
            model = VitsModel.from_pretrained("facebook/mms-tts-eng").to(DEVICE)
            entry = {"tokenizer": tokenizer, "model": _to_half_precision(model, model_name)}
        elif model_name == "Bangla-ViT-TTS":
            processor = AutoProcessor.from_pretrained("bangla-speech-processing/bangla_tts_female")
            model = AutoModel.from_pretrained("bangla-speech-processing/bangla_tts_female").to(DEVICE)
            entry = {"processor": processor, "model": _to_half_precision(model, model_name)}
        elif model_name == "Parselmouth":
            # Parselmouth is a Praat wrapper, not a model: only record whether
            # the library can be imported.
            try:
                import parselmouth  # noqa: F401

                print("Parselmouth library found.")
                entry = {"library": True}
            except ImportError:
                print("Parselmouth not installed. Voice FX will be disabled.")
                entry = {"library": False}
        elif model_name in _PLACEHOLDER_MODELS:
            message, placeholder = _PLACEHOLDER_MODELS[model_name]
            print(message)
            entry = {"model": placeholder}
        else:
            raise ValueError(f"Unknown model: {model_name}")
    except Exception as e:
        print(f"Error loading {model_name}: {e}")
        return None

    loaded_models[model_name] = entry
    return entry
def unload_model(model_name):
    """Remove a cached model entry; a name that is not cached is ignored.

    When running on CUDA the PyTorch GPU cache is emptied afterwards.
    """
    if model_name not in loaded_models:
        return
    print(f"Unloading model: {model_name}...")
    del loaded_models[model_name]
    on_gpu = DEVICE == "cuda"
    if on_gpu:
        # Hand the freed GPU memory back to the driver.
        torch.cuda.empty_cache()
    print(f"Model {model_name} unloaded.")
# Very small Banglish -> Bangla vocabulary used by the placeholder transliterator.
_BANGLISH_WORDS = {
    "ami": "আমি",
    "tumi": "তুমি",
    "kemon": "কেমন",
    "valo": "ভালো",
    "ki": "কি",
    "kore": "করে",
    "na": "না",
    "hoy": "হয়",
}
# Match a vocabulary word only when it is not glued to other Latin letters, so
# English words that merely contain "na", "ki" or "ami" are left untouched.
_BANGLISH_PATTERN = re.compile(
    r"(?<![A-Za-z])(?:" + "|".join(_BANGLISH_WORDS) + r")(?![A-Za-z])"
)


def banglish_to_bangla(text):
    """Transliterate a handful of common Banglish words to Bangla script.

    This is a placeholder for a real transliterator (for example the
    `banglatransliterator` library, or a CLI tool run via `subprocess`).
    Only whole words from a tiny built-in vocabulary are replaced; matching
    is case-sensitive and all other text is returned unchanged.

    Args:
        text: Input string, possibly mixing Bangla, English and Banglish.

    Returns:
        The text with known Banglish words replaced by their Bangla spelling.
    """
    print(f"Applying Banglish to Bangla transliteration (placeholder): {text}")
    return _BANGLISH_PATTERN.sub(lambda found: _BANGLISH_WORDS[found.group(0)], text)
def normalize_text(text):
    """Spell out a few stand-alone numbers in Bangla (placeholder normaliser).

    Only the values 0-9, 10 and 100 are converted; every other number is left
    as written. Full normalisation (dates, abbreviations, arbitrary numbers)
    would need a library such as `num2words` with Bengali support or a
    rule-based system like the one Piper TTS builds on espeak-ng.
    """
    print(f"Applying text normalization (placeholder): {text}")

    # Lookup table from integer value to its Bangla word.
    words_by_value = dict(
        enumerate(["শূন্য", "এক", "দুই", "তিন", "চার", "পাঁচ", "ছয়", "সাত", "আট", "নয়"])
    )
    words_by_value[10] = "দশ"
    words_by_value[100] = "একশ"

    def spell_out(found):
        digits = found.group(0)
        try:
            # Unknown values fall back to the digits exactly as matched.
            return words_by_value.get(int(digits), digits)
        except ValueError:
            return digits

    return re.sub(r'\b\d+\b', spell_out, text)
def get_emotion_embedding(emotion, intensity):
    """Stand-in for an Orpheus TTS emotion embedding.

    Logs the request and returns None. A real implementation would load an
    emotion model and return a tensor that the TTS models can consume.
    """
    notice = f"Generating emotion embedding for {emotion} with intensity {intensity} (placeholder)."
    print(notice)
    # No embedding is produced yet.
    return None
def apply_voice_blending(audio_path, speaker_embedding_1, speaker_embedding_2, blend_ratio):
    """Stand-in for VoiceBlender.

    A real implementation would mix the two speaker embeddings to synthesise
    a new voice. For now the embeddings are ignored and the input audio path
    is handed back unchanged.
    """
    notice = f"Applying voice blending (placeholder) with blend ratio {blend_ratio}."
    print(notice)
    return audio_path
def apply_prosody_refinement(audio_path):
    """Stand-in for Bangla-Speech-Emotion syllable-stress adjustment.

    Logs a message and returns the input path untouched; a real version would
    analyse audio and text and modify the prosody.
    """
    notice = "Applying prosody refinement (placeholder)."
    print(notice)
    return audio_path
def upscale_audio(audio_path):
    """Resample an audio file to 24 kHz (placeholder for the SNAC vocoder).

    Args:
        audio_path: Path of the audio file to upscale.

    Returns:
        The path of a new "<name>_24khz<ext>" file when resampling happened,
        otherwise the original path (already 24 kHz, or an error occurred;
        errors are printed, not raised).
    """
    print("Upscaling audio to 24kHz (placeholder).")
    target_sr = 24000
    try:
        waveform, sr = torchaudio.load(audio_path)
        if sr == target_sr:
            return audio_path
        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)
        # Derive the output name from the real extension: a plain
        # str.replace(".wav", ...) is a no-op for other/upper-case extensions
        # (the input would be overwritten) and also rewrites ".wav" inside
        # directory names.
        stem, ext = os.path.splitext(audio_path)
        output_path = f"{stem}_24khz{ext or '.wav'}"
        torchaudio.save(output_path, resampler(waveform), target_sr)
        return output_path
    except Exception as e:
        print(f"Error during audio upscaling (placeholder): {e}")
        return audio_path
def extract_speaker_embedding(audio_path):
    """Stand-in for OpenVoice v2 speaker-embedding extraction.

    Logs the request and returns None. A real integration would load the
    OpenVoice v2 model and return a torch.Tensor embedding for the clip.
    """
    notice = f"Extracting speaker embedding from {audio_path} (placeholder)."
    print(notice)
    # No embedding is produced yet.
    return None
def _add_echo(samples, sample_rate, amount):
    """Mix a 50 ms delayed, attenuated copy into *samples* and peak-normalise to 0.9."""
    delay_samples = int(0.05 * sample_rate)
    decay = 0.5 * amount
    echo = np.zeros_like(samples)
    if 0 < delay_samples < samples.shape[-1]:
        echo[..., delay_samples:] = samples[..., :-delay_samples] * decay
    mixed = samples + echo
    peak = np.max(np.abs(mixed)) if mixed.size else 0.0
    # Silent input has a zero peak; skip normalisation instead of producing NaNs.
    return mixed / peak * 0.9 if peak > 0 else mixed


def apply_parselmouth_fx(audio_path, pitch_shift, speed_change, reverb_amount):
    """Apply pitch shift, speed change and a simple echo to an audio file.

    Args:
        audio_path: Path of the input audio file.
        pitch_shift: Pitch shift in semitones; 0 disables the step.
        speed_change: Playback-rate factor; 1.0 disables the step.
        reverb_amount: Echo strength; values <= 0 disable the step.

    Returns:
        Path of the processed "<name>_fx_out.wav" file, or the original
        `audio_path` when Parselmouth is unavailable or processing fails
        (errors are printed, not raised).
    """
    print(f"Applying Parselmouth FX: Pitch={pitch_shift}, Speed={speed_change}, Reverb={reverb_amount}")
    model_info = load_model("Parselmouth")
    if not model_info or not model_info.get("library"):
        print("Parselmouth not available. Skipping FX.")
        return audio_path
    try:
        import parselmouth
        from parselmouth.praat import call

        sound = parselmouth.Sound(audio_path)

        # Pitch shifting through a Praat Manipulation object.
        if pitch_shift != 0:
            manipulation = call(sound, "To Manipulation", 0.01, 75, 600)
            pitch_tier = call(manipulation, "Extract pitch tier")
            # "Shift frequencies" takes a time range before the shift amount.
            call(pitch_tier, "Shift frequencies", sound.xmin, sound.xmax, pitch_shift, "semitones")
            # Replacing the tier needs both objects selected and returns
            # nothing; the audio comes from an explicit resynthesis.
            call([pitch_tier, manipulation], "Replace pitch tier")
            sound = call(manipulation, "Get resynthesis (overlap-add)")

        # Speed control: time-stretch each channel with librosa so the pitch
        # is kept. The samples are taken from the in-memory Sound (a Sound has
        # no file path to reload from), which also avoids temp files.
        if speed_change != 1.0:
            sample_rate = sound.sampling_frequency
            stretched = np.stack(
                [librosa.effects.time_stretch(channel, rate=speed_change) for channel in sound.values]
            )
            sound = parselmouth.Sound(stretched, sampling_frequency=sample_rate)

        # "Reverb": Praat has no direct reverb, so this is a basic single echo.
        if reverb_amount > 0:
            print(f"Reverb application (conceptual, not direct Praat): {reverb_amount}")
            sample_rate = sound.sampling_frequency
            sound = parselmouth.Sound(
                _add_echo(sound.values, sample_rate, reverb_amount),
                sampling_frequency=sample_rate,
            )

        stem, _ = os.path.splitext(audio_path)
        output_audio_path = f"{stem}_fx_out.wav"
        sound.save(output_audio_path, "WAV")
        return output_audio_path
    except Exception as e:
        print(f"Error applying Parselmouth FX: {e}")
        return audio_path
def _synthesize_bangla_vit_tts(processed_text, output_path):
    """Synthesise *processed_text* with the Bangla-ViT-TTS model into *output_path*."""
    model_info = load_model("Bangla-ViT-TTS")
    if not model_info:
        raise RuntimeError("Bangla-ViT-TTS model not loaded.")
    processor = model_info["processor"]
    model = model_info["model"]
    inputs = processor(text=processed_text, return_tensors="pt").to(DEVICE)
    # This simple API takes no emotion embedding, so none is passed.
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings=None).cpu().numpy()
    sf.write(output_path, speech, model.config.sampling_rate)


def text_to_speech_workflow(text, voice_preset, emotion, intensity):
    """Run the full text-to-speech pipeline for the Gradio TTS tab.

    Steps: Banglish transliteration and text normalisation, (placeholder)
    emotion embedding, synthesis with the back-end chosen by `voice_preset`,
    then (placeholder) prosody refinement and 24 kHz upscaling.

    Args:
        text: Input text (Bangla, English or Banglish).
        voice_preset: "Nabanita", "Pradeep", "Hybrid" or "Custom".
        emotion: Emotion label; currently only logged.
        intensity: Emotion intensity; currently only logged.

    Returns:
        Path of the generated audio file, or None when the text is empty or
        any step fails (errors are printed, not raised).
    """
    print(f"TTS Request: Text='{text}', Voice='{voice_preset}', Emotion='{emotion}', Intensity={intensity}")
    if not text or not text.strip():
        # Nothing to say: do not load any model for empty input.
        print("No text provided; nothing to synthesize.")
        return None

    # Step 1: Language handling
    processed_text = normalize_text(banglish_to_bangla(text))
    # Step 2: Emotion tagging (placeholder; no back-end consumes the result yet)
    get_emotion_embedding(emotion, intensity)
    audio_output_path = "output_tts.wav"
    try:
        if voice_preset in ("Nabanita", "Pradeep"):
            # Both presets use the base Bangla-ViT-TTS voice. It is a female
            # voice, so "Pradeep" (male) would ideally need a male model or
            # voice conversion.
            _synthesize_bangla_vit_tts(processed_text, audio_output_path)
        elif voice_preset == "Hybrid":
            # Meant to blend Nabanita and Pradeep via OpenVoice v2 embeddings
            # and VoiceBlender; until those are integrated the base voice is used.
            _synthesize_bangla_vit_tts(processed_text, audio_output_path)
            print("Hybrid voice blending is a placeholder. Output is from Bangla-ViT-TTS.")
        elif voice_preset == "Custom":
            # Demonstration only: XTTS-v2 cloning from a reference clip. A real
            # custom voice would load the user's fine-tuned model from the Hub.
            model_info = load_model("XTTS-v2")
            if not model_info:
                raise RuntimeError("XTTS-v2 model not loaded.")
            xtts_model = model_info["model"]
            reference_audio_path = DEFAULT_XTTS_REFERENCE_SPEAKER
            if not os.path.exists(reference_audio_path):
                # Create a dummy reference (3 seconds of noise) for testing.
                dummy_audio = np.random.rand(16000 * 3).astype(np.float32)
                sf.write(reference_audio_path, dummy_audio, 16000)
                print(f"Created dummy reference audio at {reference_audio_path}")
            # NOTE(review): "bn" may not be among the languages XTTS-v2
            # accepts -- confirm against the Coqui TTS language list.
            xtts_model.tts_to_file(
                text=processed_text,
                file_path=audio_output_path,
                speaker_wav=reference_audio_path,
                language="bn",
            )
            print(f"Custom voice generated using XTTS-v2 with reference: {reference_audio_path}")
        else:
            raise ValueError("Invalid voice preset selected.")

        # Step 5: Prosody refinement & upscaling (both placeholders)
        audio_output_path = apply_prosody_refinement(audio_output_path)
        audio_output_path = upscale_audio(audio_output_path)
        return audio_output_path
    except Exception as e:
        print(f"Error in TTS workflow: {e}")
        return None
def voice_changer_workflow(source_audio_tuple, target_audio_tuple, pitch_shift, speed_change, reverb_amount):
    """Run the voice-changer pipeline for the Gradio "Voice Changer" tab.

    Voice conversion to the target speaker (OpenVoice v2) is not integrated
    yet, so only the pitch/speed/reverb effects are applied to the source.

    Args:
        source_audio_tuple: `(sample_rate, samples)` for the source audio, or
            None when nothing was uploaded.
        target_audio_tuple: Optional `(sample_rate, samples)` for the target
            speaker; saved to disk but otherwise unused for now.
        pitch_shift: Pitch shift in semitones.
        speed_change: Playback-rate factor.
        reverb_amount: Echo strength.

    Returns:
        `(audio_path, status_message)`; `audio_path` is None when no source
        was given or an error occurred.
    """
    if source_audio_tuple is None:
        return None, "Please upload a source audio file."

    source_audio_path = "source_audio_temp.wav"
    target_audio_path = None
    try:
        # Saving the uploads happens inside the try block so malformed audio
        # is reported through the status box instead of raising.
        source_sr, source_audio_np = source_audio_tuple
        sf.write(source_audio_path, source_audio_np, source_sr)
        if target_audio_tuple is not None:
            target_sr, target_audio_np = target_audio_tuple
            target_audio_path = "target_audio_temp.wav"
            sf.write(target_audio_path, target_audio_np, target_sr)
        print(f"Voice Changer Request: Source='{source_audio_path}', Target='{target_audio_path}', Pitch={pitch_shift}, Speed={speed_change}, Reverb={reverb_amount}")

        # With OpenVoice v2 integrated, the target clip's speaker embedding
        # would be extracted here and used to convert the source audio; until
        # then the effects are applied to the unconverted source.
        converted_audio_path = source_audio_path
        if target_audio_path:
            print("OpenVoice v2 voice conversion is a placeholder. Applying FX only.")

        final_audio_path = apply_parselmouth_fx(converted_audio_path, pitch_shift, speed_change, reverb_amount)
        return final_audio_path, "Voice conversion successful!"
    except Exception as e:
        print(f"Error in Voice Changer workflow: {e}")
        return None, f"Error: {e}"
def custom_voice_training_placeholder(audio_file, voice_name):
    """Simulate a custom-voice training request (no training is performed).

    Real training is expected to run offline (e.g. in a Colab notebook) on a
    dedicated GPU; this function only validates the inputs and returns an
    explanatory message.

    Args:
        audio_file: The uploaded training audio, either a file-like object
            exposing `.name` or a plain path string; None when nothing was
            uploaded.
        voice_name: Name chosen for the custom voice.

    Returns:
        `(status_message, None)`; the second item is always None because no
        audio is generated.
    """
    if audio_file is None:
        return "Please upload an audio file for training.", None
    if not voice_name or not voice_name.strip():
        return "Please provide a name for your custom voice.", None

    # The upload may arrive as a temp-file object or as a path string
    # depending on how the File component is configured; accept both.
    file_label = getattr(audio_file, "name", audio_file)
    print(f"Received request to train custom voice '{voice_name}' with audio: {file_label}")
    # A real implementation would save the upload, queue an asynchronous
    # training job, expose its status, and push the trained model to the Hub.
    message = (
        f"Custom voice training for '{voice_name}' initiated with {file_label}. "
        "This process typically takes a long time (minutes to hours) on a dedicated GPU "
        "and is best done in a separate environment like a Google Colab notebook. "
        "Once trained, your model would be available on Hugging Face Hub and could be "
        "loaded under the 'Custom' voice preset in the Text-to-Speech tab."
    )
    return message, None
# --- Gradio Interface ---
# Three tabs: text-to-speech, voice changer / FX, and a simulated training form.
with gr.Blocks(title="Ultra Pro Max Bangla TTS System") as demo:
    gr.Markdown(
        """
# 🌟 Ultra Pro Max Detailed Plan: Free Bangla TTS System with Voice Cloning & Conversion 🌟
Welcome to the advanced Bangla Text-to-Speech and Voice Changer system!
"""
    )

    with gr.Tab("Text-to-Speech"):
        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text (Bangla, English, or Banglish)",
                placeholder="আপনার টেক্সট এখানে লিখুন অথবা Write your text here...",
                lines=5
            )
        with gr.Row():
            voice_preset_dropdown = gr.Dropdown(
                choices=["Nabanita", "Pradeep", "Hybrid", "Custom"],
                label="Voice Preset",
                value="Nabanita"
            )
            emotion_dropdown = gr.Dropdown(
                choices=["Neutral", "Happy", "Sad", "Angry", "Excited", "Whisper", "Sarcastic", "Fear", "Disgust", "Surprise", "Calm", "Confused", "Bored", "Shouting", "Questioning", "Annoyed", "Tired", "Hopeful"],
                label="Emotion",
                value="Neutral"
            )
            intensity_slider = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.1, value=0.5, label="Emotion Intensity"
            )
        tts_button = gr.Button("Generate Speech")
        tts_output = gr.Audio(label="Generated Speech", type="filepath")
        tts_button.click(
            fn=text_to_speech_workflow,
            inputs=[text_input, voice_preset_dropdown, emotion_dropdown, intensity_slider],
            outputs=tts_output
        )

    with gr.Tab("Voice Changer"):
        gr.Markdown("### Convert the voice of an audio file or apply effects.")
        with gr.Row():
            source_audio_input = gr.Audio(label="Upload Source Audio", type="numpy")
            target_audio_input = gr.Audio(label="Upload Target Audio for Voice Conversion (Optional, for cloning)", type="numpy")
        with gr.Row():
            pitch_slider = gr.Slider(minimum=-12, maximum=12, step=1, value=0, label="Pitch Shift (semitones)")
            speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Speed (0.5x - 2x)")
            reverb_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.0, label="Reverb Amount (0-1)")
        voice_changer_button = gr.Button("Apply Voice Change / FX")
        voice_changer_output = gr.Audio(label="Processed Audio", type="filepath")
        voice_changer_message = gr.Textbox(label="Status", interactive=False)
        voice_changer_button.click(
            fn=voice_changer_workflow,
            inputs=[source_audio_input, target_audio_input, pitch_slider, speed_slider, reverb_slider],
            outputs=[voice_changer_output, voice_changer_message]
        )

    with gr.Tab("Custom Voice Training"):
        gr.Markdown("### Train a custom voice using your own audio data.")
        gr.Markdown(
            """
**Note:** This process is computationally intensive and is best performed offline
using a dedicated GPU environment like a Google Colab notebook.
The previous step provided a Colab notebook for fine-tuning Bangla ViT-TTS.
Once trained, you can use your custom voice by selecting 'Custom' in the Text-to-Speech tab.
"""
        )
        with gr.Row():
            # `type` is left at the component default: the explicit value
            # "file" is only valid on older Gradio releases, while the default
            # is accepted everywhere.
            # NOTE(review): on newer Gradio the handler then receives a path
            # string rather than a file object -- confirm it copes with both.
            training_audio_input = gr.File(label="Upload Audio for Training (5 min - 1 hr WAV/FLAC)")
            voice_name_input = gr.Textbox(label="Name for Custom Voice", placeholder="e.g., MyBanglaVoice")
        train_button = gr.Button("Simulate Training (See Note Above)")
        training_status_output = gr.Textbox(label="Training Status", interactive=False)
        # Hidden: the training handler never returns audio.
        training_audio_output = gr.Audio(label="Output (N/A for training)", type="filepath", visible=False)
        train_button.click(
            fn=custom_voice_training_placeholder,
            inputs=[training_audio_input, voice_name_input],
            outputs=[training_status_output, training_audio_output]
        )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()