Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| import os | |
| import soundfile as sf | |
| import numpy as np | |
| from transformers import ( | |
| AutoProcessor, AutoModel, # <--- CHANGED TO AutoModel | |
| MmsCtcTokenizer, MmsForCtc, | |
| VitsForConditionalGeneration, VitsTokenizer # For Bangla ViT-TTS | |
| ) | |
| from datasets import Audio, Dataset | |
| from huggingface_hub import HfFolder, login | |
| import torchaudio | |
| import librosa | |
| import subprocess # For banglatransliterator and other CLI tools if needed | |
| import re # For text normalization | |
| from dataclasses import dataclass | |
| from typing import Any, Dict, List, Union | |
# --- Configuration and Global Variables ---

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {DEVICE}")

# Hugging Face token (optional, for private models or pushing).
# It is recommended to set this as a Space Secret named HF_TOKEN.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
if HF_TOKEN:
    try:
        login(token=HF_TOKEN)
        print("Logged into Hugging Face Hub.")
    except Exception as e:
        # Best-effort: a failed login must not stop the app from starting.
        print(f"Failed to log into Hugging Face Hub. Please check your token or internet connection: {e}")
else:
    print("HF_TOKEN not found. Some functionalities (like pushing models) might be limited.")

# --- Model Caching and Dynamic Loading ---

# To save memory, models are loaded on demand; this dict maps a model name to
# the entry that load_model() built for it.
loaded_models = {}

# Path of the default reference speaker clip used for XTTS-v2.
# This file should be uploaded to the Hugging Face Space root directory.
DEFAULT_XTTS_REFERENCE_SPEAKER = "reference_speaker.wav"
# Models without a real integration yet: name -> (cached marker, log message).
_PLACEHOLDER_MODELS = {
    "OpenVoice-v2": (
        "placeholder_openvoice_v2",
        "OpenVoice v2 requires specific setup. Using a placeholder for embedding extraction.",
    ),
    "VoiceBlender": (
        "placeholder_voiceblender",
        "VoiceBlender requires specific setup. Using a placeholder.",
    ),
    "SNAC-Vocoder": (
        "placeholder_snac_vocoder",
        "SNAC Vocoder requires specific setup. Using a placeholder.",
    ),
    "Orpheus-TTS": (
        "placeholder_orpheus_tts",
        "Orpheus TTS requires specific setup. Using a placeholder for emotion tagging.",
    ),
}


def _apply_half_precision(model, model_name):
    """Cast ``model`` to float16 when DEVICE is "cuda"; otherwise return it as is.

    The cast is best-effort: if it raises, the error is printed and the
    original model is returned unchanged.
    """
    if DEVICE != "cuda":
        return model
    try:
        model = model.half().to(DEVICE)
        print(f"Applied half-precision (float16) to {model_name}")
    except Exception as e:
        print(f"Could not apply half-precision to {model_name}: {e}")
    return model


def load_model(model_name):
    """Load a model by name and cache the result in ``loaded_models``.

    Args:
        model_name: One of "XTTS-v2", "StyleTTS2", "MMS-TTS", "OpenVoice-v2",
            "VoiceBlender", "Bangla-ViT-TTS", "Parselmouth", "SNAC-Vocoder"
            or "Orpheus-TTS".

    Returns:
        The cached entry, a dict whose keys depend on the model
        ("model", "processor", "tokenizer" or "library"), or ``None`` when
        loading failed or the name is unknown. Errors are printed, not raised.
    """
    if model_name in loaded_models:
        print(f"Model {model_name} already loaded.")
        return loaded_models[model_name]

    print(f"Loading model: {model_name}...")
    try:
        if model_name == "XTTS-v2":
            # Coqui XTTS-v2; needs the `TTS` package and downloads the
            # checkpoint if it is not cached. It has no separate processor.
            from TTS.api import TTS

            model = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(DEVICE)
            loaded_models[model_name] = {"model": model}
        elif model_name == "StyleTTS2":
            processor = AutoProcessor.from_pretrained("yl4579/StyleTTS2")
            model = AutoModel.from_pretrained("yl4579/StyleTTS2").to(DEVICE)
            model = _apply_half_precision(model, model_name)
            loaded_models[model_name] = {"processor": processor, "model": model}
        elif model_name == "MMS-TTS":
            # MMS-TTS checkpoints are VITS models, so they are loaded with
            # VitsModel / VitsTokenizer. English checkpoint for demonstration;
            # a Bangla checkpoint would be swapped in here.
            from transformers import VitsModel, VitsTokenizer

            tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
            model = VitsModel.from_pretrained("facebook/mms-tts-eng").to(DEVICE)
            model = _apply_half_precision(model, model_name)
            loaded_models[model_name] = {"tokenizer": tokenizer, "model": model}
        elif model_name == "Bangla-ViT-TTS":
            processor = AutoProcessor.from_pretrained("bangla-speech-processing/bangla_tts_female")
            model = AutoModel.from_pretrained("bangla-speech-processing/bangla_tts_female").to(DEVICE)
            model = _apply_half_precision(model, model_name)
            loaded_models[model_name] = {"processor": processor, "model": model}
        elif model_name == "Parselmouth":
            # Parselmouth is a Python wrapper for Praat, not a model; only
            # record whether the library can be imported.
            try:
                import parselmouth  # noqa: F401

                print("Parselmouth library found.")
                loaded_models[model_name] = {"library": True}
            except ImportError:
                print("Parselmouth not installed. Voice FX will be disabled.")
                loaded_models[model_name] = {"library": False}
        elif model_name in _PLACEHOLDER_MODELS:
            marker, notice = _PLACEHOLDER_MODELS[model_name]
            print(notice)
            loaded_models[model_name] = {"model": marker}
        else:
            raise ValueError(f"Unknown model: {model_name}")
        return loaded_models[model_name]
    except Exception as e:
        # Callers treat None as "model unavailable".
        print(f"Error loading {model_name}: {e}")
        return None
def unload_model(model_name):
    """Drop ``model_name`` from ``loaded_models`` and release cached GPU memory.

    Does nothing when the model is not currently loaded.
    """
    if model_name not in loaded_models:
        return

    print(f"Unloading model: {model_name}...")
    del loaded_models[model_name]
    if DEVICE == "cuda":
        # empty_cache() only returns blocks that are no longer referenced, so
        # collect unreachable objects first.
        import gc

        gc.collect()
        torch.cuda.empty_cache()
    print(f"Model {model_name} unloaded.")
# Very small Banglish -> Bangla word list used by the placeholder transliterator.
_BANGLISH_WORDS = {
    "ami": "আমি",
    "tumi": "তুমি",
    "kemon": "কেমন",
    "valo": "ভালো",
    "ki": "কি",
    "kore": "করে",
    "na": "না",
    "hoy": "হয়",
}

# Whole-word matcher, so that e.g. the "na" in "banana" is left alone.
_BANGLISH_PATTERN = re.compile(
    r"\b(" + "|".join(re.escape(word) for word in _BANGLISH_WORDS) + r")\b"
)


def banglish_to_bangla(text):
    """Placeholder Banglish-to-Bangla transliteration.

    Replaces a handful of common Banglish words (see ``_BANGLISH_WORDS``) with
    their Bangla spelling. Only whole words are replaced and matching is
    case-sensitive. A real implementation would delegate to a transliteration
    library such as `banglatransliterator`.

    Args:
        text: Input text.

    Returns:
        The text with the known words replaced; everything else is unchanged.
    """
    print(f"Applying Banglish to Bangla transliteration (placeholder): {text}")
    return _BANGLISH_PATTERN.sub(lambda m: _BANGLISH_WORDS[m.group(0)], text)
# Number -> Bangla word table for the placeholder normalizer (0-9, 10 and 100).
_BANGLA_NUMBER_WORDS = {
    0: "শূন্য",
    1: "এক",
    2: "দুই",
    3: "তিন",
    4: "চার",
    5: "পাঁচ",
    6: "ছয়",
    7: "সাত",
    8: "আট",
    9: "নয়",
    10: "দশ",
    100: "একশ",
}


def normalize_text(text):
    """Placeholder text normalization: spell out a few standalone numbers.

    Every standalone run of digits whose value is in ``_BANGLA_NUMBER_WORDS``
    is replaced by its Bangla word; all other numbers are left as written.
    Dates and abbreviations are not handled.

    Args:
        text: Input text.

    Returns:
        The normalized text.
    """
    print(f"Applying text normalization (placeholder): {text}")

    def num_to_bangla_words(match):
        num_str = match.group(0)
        try:
            return _BANGLA_NUMBER_WORDS.get(int(num_str), num_str)
        except ValueError:
            # Keep the original digits if they cannot be parsed.
            return num_str

    return re.sub(r'\b\d+\b', num_to_bangla_words, text)
def get_emotion_embedding(emotion, intensity):
    """Placeholder for an Orpheus TTS emotion embedding.

    Only logs the request. A real implementation would return an embedding
    that a TTS model can consume.

    Args:
        emotion: Emotion label selected by the user.
        intensity: Emotion intensity selected by the user.

    Returns:
        Always ``None``.
    """
    print(f"Generating emotion embedding for {emotion} with intensity {intensity} (placeholder).")
    return None
def apply_voice_blending(audio_path, speaker_embedding_1, speaker_embedding_2, blend_ratio):
    """Placeholder for VoiceBlender.

    Only logs the blend ratio; the two embeddings are not used and no audio
    is modified.

    Args:
        audio_path: Path of the audio to blend.
        speaker_embedding_1: First speaker embedding (unused).
        speaker_embedding_2: Second speaker embedding (unused).
        blend_ratio: Requested blend ratio (only logged).

    Returns:
        ``audio_path`` unchanged.
    """
    print(f"Applying voice blending (placeholder) with blend ratio {blend_ratio}.")
    return audio_path
def apply_prosody_refinement(audio_path):
    """Placeholder for syllable-stress (prosody) adjustment.

    Only logs that the step ran; the audio is not analysed or modified.

    Args:
        audio_path: Path of the audio to refine.

    Returns:
        ``audio_path`` unchanged.
    """
    print("Applying prosody refinement (placeholder).")
    return audio_path
def upscale_audio(audio_path):
    """Resample an audio file to 24 kHz (stand-in for the SNAC Vocoder).

    Args:
        audio_path: Path of the audio file to resample.

    Returns:
        Path of a new ``<name>_24khz<ext>`` file next to the input when
        resampling was needed; otherwise ``audio_path`` itself. On any error
        the error is printed and ``audio_path`` is returned.
    """
    print("Upscaling audio to 24kHz (placeholder).")
    try:
        waveform, sr = torchaudio.load(audio_path)
        if sr == 24000:
            return audio_path

        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=24000)
        waveform = resampler(waveform)
        # Build the output name from the real extension so the input file is
        # never overwritten and directory names are never rewritten.
        root, ext = os.path.splitext(audio_path)
        output_path = f"{root}_24khz{ext}"
        torchaudio.save(output_path, waveform, 24000)
        return output_path
    except Exception as e:
        print(f"Error during audio upscaling (placeholder): {e}")
        return audio_path
def extract_speaker_embedding(audio_path):
    """Placeholder for OpenVoice v2 speaker-embedding extraction.

    Only logs the request; no model is loaded and the file is not read.

    Args:
        audio_path: Path of the reference audio.

    Returns:
        Always ``None``.
    """
    print(f"Extracting speaker embedding from {audio_path} (placeholder).")
    return None
def apply_parselmouth_fx(audio_path, pitch_shift, speed_change, reverb_amount):
    """Apply pitch shift, speed change and a simple echo to an audio file.

    Effects are applied in that order, each on the result of the previous one.

    Args:
        audio_path: Path of the input audio file.
        pitch_shift: Pitch shift in semitones; 0 disables the step.
        speed_change: Time-stretch rate; 1.0 disables the step.
        reverb_amount: Echo strength; values <= 0 disable the step.

    Returns:
        Path of the processed ``<name>_fx_out.wav`` file, or ``audio_path``
        when Parselmouth is unavailable or any step fails.
    """
    print(f"Applying Parselmouth FX: Pitch={pitch_shift}, Speed={speed_change}, Reverb={reverb_amount}")
    model_info = load_model("Parselmouth")
    if not model_info or not model_info.get("library"):
        print("Parselmouth not available. Skipping FX.")
        return audio_path

    try:
        import parselmouth
        from parselmouth.praat import call

        sound = parselmouth.Sound(audio_path)

        # Pitch shifting: scale the pitch tier, put it back into the
        # manipulation object and resynthesize the sound from it.
        if pitch_shift != 0:
            manipulation = call(sound, "To Manipulation", 0.01, 75, 600)
            pitch_tier = call(manipulation, "Extract pitch tier")
            factor = 2.0 ** (pitch_shift / 12.0)  # semitones -> frequency ratio
            call(pitch_tier, "Multiply frequencies", sound.xmin, sound.xmax, factor)
            call([pitch_tier, manipulation], "Replace pitch tier")
            sound = call(manipulation, "Get resynthesis (overlap-add)")

        # Speed control: time-stretch the current samples with librosa so the
        # pitch set above is preserved.
        if speed_change != 1.0:
            sr = sound.sampling_frequency
            stretched = librosa.effects.time_stretch(sound.values, rate=speed_change)
            sound = parselmouth.Sound(
                np.asarray(stretched, dtype=np.float64), sampling_frequency=sr
            )

        # "Reverb": Praat has no direct reverb, so this adds one delayed,
        # attenuated copy of the signal (a basic echo, not true reverb).
        if reverb_amount > 0:
            print(f"Reverb application (conceptual, not direct Praat): {reverb_amount}")
            samples = sound.values
            sr = sound.sampling_frequency
            delay_samples = int(0.05 * sr)  # 50ms delay
            decay = 0.5 * reverb_amount
            echo = np.zeros_like(samples)
            if 0 < delay_samples < samples.shape[-1]:
                echo[..., delay_samples:] = samples[..., :-delay_samples] * decay
            mixed = samples + echo
            peak = np.max(np.abs(mixed))
            if peak > 0:  # silent input: skip normalization instead of dividing by zero
                mixed = mixed / peak * 0.9
            sound = parselmouth.Sound(mixed, sampling_frequency=sr)

        # Always a .wav name derived from the real extension, so the input
        # file is never overwritten.
        root, _ext = os.path.splitext(audio_path)
        output_audio_path = f"{root}_fx_out.wav"
        sound.save(output_audio_path, "WAV")
        return output_audio_path
    except Exception as e:
        print(f"Error applying Parselmouth FX: {e}")
        return audio_path
def _synthesize_with_bangla_vit_tts(processed_text, output_path):
    """Synthesize ``processed_text`` with Bangla-ViT-TTS and write it to ``output_path``."""
    model_info = load_model("Bangla-ViT-TTS")
    if not model_info:
        raise RuntimeError("Bangla-ViT-TTS model not loaded.")
    processor = model_info["processor"]
    model = model_info["model"]
    inputs = processor(text=processed_text, return_tensors="pt").to(DEVICE)
    # No emotion or speaker embedding is passed to the model here.
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings=None).cpu().numpy()
    sf.write(output_path, speech, model.config.sampling_rate)


def _synthesize_with_xtts(processed_text, output_path):
    """Synthesize ``processed_text`` with XTTS-v2 using the default reference speaker."""
    model_info = load_model("XTTS-v2")
    if not model_info:
        raise RuntimeError("XTTS-v2 model not loaded.")
    xtts_model = model_info["model"]

    reference_audio_path = DEFAULT_XTTS_REFERENCE_SPEAKER
    if not os.path.exists(reference_audio_path):
        # Create a dummy reference (3 seconds of noise) so the call can be tested.
        dummy_audio = np.random.rand(16000 * 3).astype(np.float32)
        sf.write(reference_audio_path, dummy_audio, 16000)
        print(f"Created dummy reference audio at {reference_audio_path}")

    # NOTE(review): confirm that "bn" is a language code XTTS-v2 accepts.
    xtts_model.tts_to_file(
        text=processed_text,
        file_path=output_path,
        speaker_wav=reference_audio_path,
        language="bn",
    )
    print(f"Custom voice generated using XTTS-v2 with reference: {reference_audio_path}")


def text_to_speech_workflow(text, voice_preset, emotion, intensity):
    """Run the full text-to-speech pipeline for one request.

    Steps: transliterate and normalize the text, synthesize with the model
    chosen by ``voice_preset``, then run prosody refinement and upscaling.

    Args:
        text: Input text (Bangla, English or Banglish).
        voice_preset: "Nabanita", "Pradeep", "Hybrid" or "Custom".
        emotion: Emotion label (currently only passed to the placeholder).
        intensity: Emotion intensity (currently only passed to the placeholder).

    Returns:
        Path of the generated audio file, or ``None`` when the text is blank
        or any step fails (the error is printed).
    """
    print(f"TTS Request: Text='{text}', Voice='{voice_preset}', Emotion='{emotion}', Intensity={intensity}")
    if not text or not text.strip():
        print("Error in TTS workflow: no text provided.")
        return None

    # Step 1: language handling.
    processed_text = banglish_to_bangla(text)
    processed_text = normalize_text(processed_text)

    # Step 2: emotion tagging (placeholder; no model consumes the result yet).
    get_emotion_embedding(emotion, intensity)

    audio_output_path = "output_tts.wav"
    try:
        if voice_preset in ("Nabanita", "Pradeep"):
            # Both presets use the same base Bangla-ViT-TTS voice for now.
            _synthesize_with_bangla_vit_tts(processed_text, audio_output_path)
        elif voice_preset == "Hybrid":
            # Blending is not implemented; output is plain Bangla-ViT-TTS.
            _synthesize_with_bangla_vit_tts(processed_text, audio_output_path)
            print("Hybrid voice blending is a placeholder. Output is from Bangla-ViT-TTS.")
        elif voice_preset == "Custom":
            # Stand-in for a user-trained model: XTTS-v2 with a reference clip.
            _synthesize_with_xtts(processed_text, audio_output_path)
        else:
            raise ValueError("Invalid voice preset selected.")

        # Step 5: prosody refinement and upscaling (both placeholders).
        audio_output_path = apply_prosody_refinement(audio_output_path)
        audio_output_path = upscale_audio(audio_output_path)
        return audio_output_path
    except Exception as e:
        print(f"Error in TTS workflow: {e}")
        return None
def voice_changer_workflow(source_audio_tuple, target_audio_tuple, pitch_shift, speed_change, reverb_amount):
    """Run the voice-changer pipeline for one request.

    Voice conversion towards the target speaker is not implemented yet; the
    source audio is only passed through the Parselmouth effects.

    Args:
        source_audio_tuple: ``(sample_rate, samples)`` for the source audio,
            or ``None`` when nothing was uploaded.
        target_audio_tuple: Optional ``(sample_rate, samples)`` for the target
            speaker.
        pitch_shift: Pitch shift in semitones.
        speed_change: Speed factor.
        reverb_amount: Reverb amount.

    Returns:
        ``(audio_path, status_message)``; ``audio_path`` is ``None`` when no
        source was given or an error occurred.
    """
    if source_audio_tuple is None:
        return None, "Please upload a source audio file."

    try:
        # Save the uploads to temporary WAV files. This is inside the try so
        # that a write failure is reported through the status message.
        source_sr, source_audio_np = source_audio_tuple
        source_audio_path = "source_audio_temp.wav"
        sf.write(source_audio_path, source_audio_np, source_sr)

        target_audio_path = None
        if target_audio_tuple is not None:
            target_sr, target_audio_np = target_audio_tuple
            target_audio_path = "target_audio_temp.wav"
            sf.write(target_audio_path, target_audio_np, target_sr)

        print(f"Voice Changer Request: Source='{source_audio_path}', Target='{target_audio_path}', Pitch={pitch_shift}, Speed={speed_change}, Reverb={reverb_amount}")

        converted_audio_path = source_audio_path
        if target_audio_path:
            # This is where OpenVoice v2 would convert the source towards the
            # target speaker; for now the target file is not used.
            print("OpenVoice v2 voice conversion is a placeholder. Applying FX only.")

        final_audio_path = apply_parselmouth_fx(converted_audio_path, pitch_shift, speed_change, reverb_amount)
        return final_audio_path, "Voice conversion successful!"
    except Exception as e:
        print(f"Error in Voice Changer workflow: {e}")
        return None, f"Error: {e}"
def custom_voice_training_placeholder(audio_file, voice_name):
    """Simulate a custom-voice training request.

    No training is started; the function validates the inputs and returns an
    explanatory message.

    Args:
        audio_file: Uploaded file, either an object with a ``name`` attribute
            or a plain path string; ``None`` when nothing was uploaded.
        voice_name: Name chosen for the custom voice.

    Returns:
        ``(message, None)``. The second element is always ``None`` because no
        audio is produced.
    """
    if audio_file is None:
        return "Please upload an audio file for training.", None
    if not voice_name or not voice_name.strip():
        return "Please provide a name for your custom voice.", None

    # Accept both a file object and a plain path string.
    audio_name = getattr(audio_file, "name", audio_file)
    print(f"Received request to train custom voice '{voice_name}' with audio: {audio_name}")

    message = (
        f"Custom voice training for '{voice_name}' initiated with {audio_name}. "
        "This process typically takes a long time (minutes to hours) on a dedicated GPU "
        "and is best done in a separate environment like a Google Colab notebook. "
        "Once trained, your model would be available on Hugging Face Hub and could be "
        "loaded under the 'Custom' voice preset in the Text-to-Speech tab."
    )
    return message, None
# --- Gradio Interface ---
# Three tabs: text-to-speech, voice changer, and a simulated custom-voice
# training form. Each button is wired to the matching workflow function.
with gr.Blocks(title="Ultra Pro Max Bangla TTS System") as demo:
    gr.Markdown(
        """
        # 🌟 Ultra Pro Max Detailed Plan: Free Bangla TTS System with Voice Cloning & Conversion 🌟
        Welcome to the advanced Bangla Text-to-Speech and Voice Changer system!
        """
    )

    with gr.Tab("Text-to-Speech"):
        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text (Bangla, English, or Banglish)",
                placeholder="আপনার টেক্সট এখানে লিখুন অথবা Write your text here...",
                lines=5
            )
        with gr.Row():
            voice_preset_dropdown = gr.Dropdown(
                choices=["Nabanita", "Pradeep", "Hybrid", "Custom"],
                label="Voice Preset",
                value="Nabanita"
            )
            emotion_dropdown = gr.Dropdown(
                choices=["Neutral", "Happy", "Sad", "Angry", "Excited", "Whisper", "Sarcastic", "Fear", "Disgust", "Surprise", "Calm", "Confused", "Bored", "Shouting", "Questioning", "Annoyed", "Tired", "Hopeful"],
                label="Emotion",
                value="Neutral"
            )
            intensity_slider = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.1, value=0.5, label="Emotion Intensity"
            )
        tts_button = gr.Button("Generate Speech")
        tts_output = gr.Audio(label="Generated Speech", type="filepath")
        tts_button.click(
            fn=text_to_speech_workflow,
            inputs=[text_input, voice_preset_dropdown, emotion_dropdown, intensity_slider],
            outputs=tts_output
        )

    with gr.Tab("Voice Changer"):
        gr.Markdown("### Convert the voice of an audio file or apply effects.")
        with gr.Row():
            source_audio_input = gr.Audio(label="Upload Source Audio", type="numpy")
            target_audio_input = gr.Audio(label="Upload Target Audio for Voice Conversion (Optional, for cloning)", type="numpy")
        with gr.Row():
            pitch_slider = gr.Slider(minimum=-12, maximum=12, step=1, value=0, label="Pitch Shift (semitones)")
            speed_slider = gr.Slider(minimum=0.5, maximum=2.0, step=0.1, value=1.0, label="Speed (0.5x - 2x)")
            reverb_slider = gr.Slider(minimum=0.0, maximum=1.0, step=0.1, value=0.0, label="Reverb Amount (0-1)")
        voice_changer_button = gr.Button("Apply Voice Change / FX")
        voice_changer_output = gr.Audio(label="Processed Audio", type="filepath")
        voice_changer_message = gr.Textbox(label="Status", interactive=False)
        voice_changer_button.click(
            fn=voice_changer_workflow,
            inputs=[source_audio_input, target_audio_input, pitch_slider, speed_slider, reverb_slider],
            outputs=[voice_changer_output, voice_changer_message]
        )

    with gr.Tab("Custom Voice Training"):
        gr.Markdown("### Train a custom voice using your own audio data.")
        gr.Markdown(
            """
            **Note:** This process is computationally intensive and is best performed offline
            using a dedicated GPU environment like a Google Colab notebook.
            The previous step provided a Colab notebook for fine-tuning Bangla ViT-TTS.
            Once trained, you can use your custom voice by selecting 'Custom' in the Text-to-Speech tab.
            """
        )
        with gr.Row():
            # NOTE(review): confirm that type="file" is accepted by the
            # installed Gradio version.
            training_audio_input = gr.File(label="Upload Audio for Training (5 min - 1 hr WAV/FLAC)", type="file")
            voice_name_input = gr.Textbox(label="Name for Custom Voice", placeholder="e.g., MyBanglaVoice")
        train_button = gr.Button("Simulate Training (See Note Above)")
        training_status_output = gr.Textbox(label="Training Status", interactive=False)
        # Hidden: the training handler always returns None for this output.
        training_audio_output = gr.Audio(label="Output (N/A for training)", type="filepath", visible=False)
        train_button.click(
            fn=custom_voice_training_placeholder,
            inputs=[training_audio_input, voice_name_input],
            outputs=[training_status_output, training_audio_output]
        )

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()