import gradio as gr
import spaces
import torch
from huggingface_hub import snapshot_download
from transformers import AutoTokenizer
from src.mimo_audio.modeling_mimo_audio import MiMoAudioArguments, MiMoAudioForCausalLM
from peft import PeftModel
from src.mimo_audio.mimo_audio import MimoAudio
import tempfile
import os

# Download base models from Hugging Face
print("Downloading MiMo-Audio base models from Hugging Face...")
base_model_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-7B-Instruct")
tokenizer_path = snapshot_download(repo_id="XiaomiMiMo/MiMo-Audio-Tokenizer")
print(f"Base models downloaded to: {base_model_path}")

# Download LoRA weights (HF_TOKEN is passed in case the repo requires authentication)
print("Downloading EmoAct-MiMo LoRA weights...")
hf_token = os.environ.get("HF_TOKEN")
lora_path = snapshot_download(repo_id="mrfakename/EmoAct-MiMo", token=hf_token)
print(f"LoRA weights downloaded to: {lora_path}")

# Load the tokenizer and look up the special-token ids that MiMoAudioArguments expects
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(base_model_path)
sosp_idx = tokenizer.convert_tokens_to_ids("<|sosp|>")
eosp_idx = tokenizer.convert_tokens_to_ids("<|eosp|>")
empty_idx = tokenizer.convert_tokens_to_ids("<|empty|>")
sostm_idx = tokenizer.convert_tokens_to_ids("<|sostm|>")
eostm_idx = tokenizer.convert_tokens_to_ids("<|eostm|>")
eot_idx = tokenizer.convert_tokens_to_ids("<|eot|>")

# Create model args
model_args = MiMoAudioArguments(
    model_name_or_path=base_model_path,
    sosp_idx=sosp_idx,
    eosp_idx=eosp_idx,
    empty_idx=empty_idx,
    sostm_idx=sostm_idx,
    eostm_idx=eostm_idx,
    eot_idx=eot_idx,
)

# Load base model
print("Loading base MiMo-Audio model...")
base_model = MiMoAudioForCausalLM.from_pretrained(
    base_model_path,
    args=model_args,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
print("Base model loaded")

# Load the LoRA adapter and merge it into the base weights
print("Loading LoRA adapter...")
model_with_lora = PeftModel.from_pretrained(base_model, lora_path)
print("Merging LoRA weights...")
merged_model = model_with_lora.merge_and_unload()
print("LoRA weights merged!")

# Save the merged model to a temporary directory so MimoAudio can load it from disk
print("Saving merged model...")
merged_model_path = "/tmp/merged_mimo_audio"
os.makedirs(merged_model_path, exist_ok=True)
merged_model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)
print(f"Merged model saved to {merged_model_path}")

# Initialize the MimoAudio inference wrapper with the merged model
print("Initializing MimoAudio wrapper...")
model = MimoAudio(
    model_path=merged_model_path,
    mimo_audio_tokenizer_path=tokenizer_path,
)
print("Model ready!")


@spaces.GPU
def generate_speech(emotion, text):
    """Generate emotional speech from text using EmoAct-MiMo."""
    if not emotion or not emotion.strip():
        return None, "Please enter an emotion description."
    if not text or not text.strip():
        return None, "Please enter text to convert to speech."

    print("Generating:", text)
    print("With emotion:", emotion)

    try:
        # Create a temporary file for the output audio
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            output_path = tmp_file.name

        # Generate TTS, passing the emotion description as the instruction
        model.tts_sft(
            text=text.strip(),
            output_path=output_path,
            instruct=emotion.strip(),
        )

        return output_path, "✅ Speech generated successfully!"
    except Exception as e:
        return None, f"❌ Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="EmoAct-MiMo TTS") as demo:
    gr.Markdown("""
    # 🎭 EmoAct-MiMo: Emotion-Controllable Text-to-Speech

    Generate intensely emotional speech using the [EmoAct-MiMo model](https://huggingface.co/mrfakename/EmoAct-MiMo).

    This is still a very early experiment from very early in the training run; I need to change a few settings and retrain, but the model turned out quite nicely! It may hallucinate, so try a few times to get good results. Voice cloning is not supported yet.
    """)

    with gr.Row():
        with gr.Column():
            emotion_input = gr.Textbox(
                label="Emotion",
                placeholder="e.g., 'intense anger, rage, fury, hatred, and annoyance, speaking without any accent'",
                lines=3,
            )
            text_input = gr.Textbox(
                label="Text",
                placeholder="Enter the text to speak with emotion...",
                lines=5,
            )
            generate_btn = gr.Button("Generate Emotional Speech", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
            )

    # Intense emotion examples
    gr.Examples(
        examples=[
            [
                "intense anger, rage, fury, hatred, and annoyance, speaking without any accent",
                "You know what? I'm done. I'm done with your excuses. (sharp exhale) Every single time, it's the same, and I actually believed you'd change. (voice cracks slightly) God, I'm such an idiot for trusting you again.",
            ],
            [
                "overwhelming grief, deep sorrow, heartbreak, and devastating sadness, speaking without any accent",
                "I can't... I can't believe they're gone. (trembling voice) It doesn't feel real. I keep expecting them to walk through that door, and... (chokes up) ...and they never will. How am I supposed to go on without them?",
            ],
            [
                "extreme fear, terror, panic, dread, and anxiety, speaking without any accent",
                "(breathing heavily) Did you hear that? Something's out there. (whispers urgently) We need to hide, NOW. Oh god, oh god, it's getting closer. I don't want to die. Please, please let us make it out of here alive.",
            ],
            [
                "intense joy, euphoria, excitement, elation, and overwhelming happiness, speaking without any accent",
                "YES! YES! I DID IT! (laughs breathlessly) I can't believe it actually worked! This is... this is everything I've ever dreamed of! I'm so happy I could cry!",
            ],
            [
                "crushing despair, hopelessness, depression, and deep emotional pain, speaking without any accent",
                "(quietly, numbly) What's the point anymore? I've tried everything. Nothing changes. Nothing ever gets better. I'm so tired of pretending I'm okay when I'm falling apart inside.",
            ],
            [
                "bitter jealousy, envy, resentment, and seething frustration, speaking without any accent",
                "Of course they chose you. They always choose you. (bitter laugh) Must be nice, having everything handed to you while the rest of us break our backs. You don't even appreciate what you have.",
            ],
        ],
        inputs=[emotion_input, text_input],
    )

    # Wire the button to the generation function
    generate_btn.click(
        fn=generate_speech,
        inputs=[emotion_input, text_input],
        outputs=[audio_output, status_output],
    )

if __name__ == "__main__":
    demo.launch()
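# A minimal sketch of calling the pipeline directly, bypassing the UI (the
# emotion/text values below are hypothetical, not part of the Space). Outside
# Hugging Face ZeroGPU the @spaces.GPU decorator is effectively a passthrough,
# so the same call should also work on a local GPU machine:
#
#   audio_path, status = generate_speech(
#       "calm, warm narration, speaking without any accent",
#       "Hello! This is a quick smoke test of EmoAct-MiMo.",
#   )
#   print(status, audio_path)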