import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
import librosa
import numpy as np
import os
import tempfile
from datetime import datetime

# Global variables for the model and processor (loaded once, reused across requests)
processor = None
model = None


def load_model():
    """Load the Voxtral model and processor."""
    global processor, model

    if processor is not None and model is not None:
        return processor, model

    try:
        model_name = "mistralai/Voxtral-Small-24B-2507"
        print("Loading Voxtral model... This may take a few minutes.")

        processor = AutoProcessor.from_pretrained(model_name)
        # Note: depending on your transformers version, Voxtral may require a
        # dedicated model class rather than the generic AutoModelForSpeechSeq2Seq
        # mapping; verify against the model card for your installed release.
        model = AutoModelForSpeechSeq2Seq.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )

        print("Model loaded successfully!")
        return processor, model
    except Exception as e:
        print(f"Error loading model: {str(e)}")
        return None, None


def transcribe_audio(audio_file):
    """Process an audio file and return the transcription."""
    if audio_file is None:
        return "Please upload an audio file.", "", ""

    try:
        # Load the model if it is not already loaded
        global processor, model
        if processor is None or model is None:
            processor, model = load_model()
            if processor is None or model is None:
                return "Error: Model failed to load. Please try again.", "", ""

        # Load the audio, resampled to the 16 kHz mono input the processor expects
        if isinstance(audio_file, str):
            # File path (the default for gr.Audio with type="filepath")
            audio, sample_rate = librosa.load(audio_file, sr=16000)
        else:
            # Uploaded file object
            audio, sample_rate = librosa.load(audio_file.name, sr=16000)

        # Calculate duration
        duration = len(audio) / sample_rate

        # Run the model
        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

        # Move inputs to the same device as the model
        if torch.cuda.is_available():
            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v
                      for k, v in inputs.items()}

        with torch.no_grad():
            predicted_ids = model.generate(**inputs, max_new_tokens=512)

        transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

        # Summarize the processed file
        word_count = len(transcription.split())
        file_info = (
            f"Duration: {duration:.2f} seconds | Words: {word_count} | "
            f"Processed: {datetime.now().strftime('%H:%M:%S')}"
        )

        # Return the transcription twice: once for display, once for the download handler
        return transcription, file_info, transcription

    except Exception as e:
        error_msg = f"Error processing audio: {str(e)}"
        print(error_msg)
        return error_msg, "", ""
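

# Long recordings can exhaust GPU memory or exceed the generation budget in a
# single pass. A minimal chunking sketch for piecewise transcription: split the
# waveform into fixed windows and call transcribe-style processing per window.
# The 30-second window is an arbitrary choice, not a documented Voxtral limit.
def chunk_audio(audio, sample_rate=16000, chunk_seconds=30):
    """Yield fixed-size chunks of a mono waveform for piecewise transcription."""
    step = chunk_seconds * sample_rate
    for start in range(0, len(audio), step):
        yield audio[start:start + step]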


def clear_inputs():
    """Clear all inputs and outputs."""
    return None, "", "", ""


# Custom CSS for better styling
css = """
.gradio-container {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}
.main-header {
    text-align: center;
    color: #2d5aa0;
    margin-bottom: 20px;
}
.info-box {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    color: white;
    padding: 20px;
    border-radius: 10px;
    margin: 10px 0;
}
.result-box {
    background-color: #f8f9fa;
    border: 1px solid #e9ecef;
    border-radius: 8px;
    padding: 15px;
    margin: 10px 0;
}
"""


# Create the Gradio interface
def create_interface():
    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
        # Header
        gr.Markdown(
            """
            # 🎤 Voxtral-Small-24B Speech Recognition

            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
            """,
            elem_classes=["main-header"],
        )

        # Model info
        with gr.Accordion("ℹ️ About this model", open=False):
            gr.Markdown(
                """
                **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.

                - **Model**: mistralai/Voxtral-Small-24B-2507
                - **Type**: Speech-to-text transcription
                - **Developer**: Mistral AI
                - **Use Case**: Audio transcription and speech recognition
                - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG

                💡 **Tip**: For best results, use clear audio files with minimal background noise.
                """
            )

        with gr.Row():
            with gr.Column(scale=1):
                # Audio input
                audio_input = gr.Audio(
                    label="📁 Upload Audio File",
                    type="filepath",
                    sources=["upload", "microphone"],
                )

                # Control buttons
                with gr.Row():
                    transcribe_btn = gr.Button(
                        "🚀 Transcribe Audio",
                        variant="primary",
                        size="lg",
                    )
                    clear_btn = gr.Button(
                        "🗑️ Clear",
                        variant="secondary",
                    )

            with gr.Column(scale=1):
                # Results
                transcription_output = gr.Textbox(
                    label="📝 Transcription Result",
                    lines=8,
                    max_lines=15,
                    placeholder="Transcribed text will appear here...",
                    show_copy_button=True,
                )

                # File info
                info_output = gr.Textbox(
                    label="📊 Audio Information",
                    lines=1,
                    placeholder="Audio details will appear here...",
                )

                # Download option
                download_file = gr.File(
                    label="💾 Download Transcription",
                    visible=False,
                )

        # Hidden textbox holding the transcription text for the download handler
        hidden_text = gr.Textbox(visible=False)

        # Event handlers
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=[audio_input],
            outputs=[transcription_output, info_output, hidden_text],
            show_progress="full",
        )

        # Update the download file when transcription is complete
        def update_download(text_content):
            if text_content and text_content.strip():
                # Write the transcription to a temporary file for download
                temp_file = tempfile.NamedTemporaryFile(
                    mode="w",
                    delete=False,
                    suffix=".txt",
                    prefix="transcription_",
                )
                temp_file.write(text_content)
                temp_file.close()
                return gr.File(value=temp_file.name, visible=True)
            else:
                return gr.File(visible=False)

        hidden_text.change(
            fn=update_download,
            inputs=[hidden_text],
            outputs=[download_file],
        )

        clear_btn.click(
            fn=clear_inputs,
            outputs=[audio_input, transcription_output, info_output, hidden_text],
        )

        # Footer
        gr.Markdown(
            """
            ---
            ### 🛠️ Usage Instructions
            1. **Upload**: Click the audio input area to upload a file, or use your microphone
            2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
            3. **Results**: View your transcription in the text area on the right
            4. **Download**: Use the download button to save your transcription as a text file

            **Supported formats**: WAV, MP3, FLAC, M4A, OGG
            """
        )

    return demo


# Initialize and launch the app
if __name__ == "__main__":
    # Pre-load the model when the app starts
    print("Initializing Voxtral model...")
    load_model()

    # Create and launch the interface
    demo = create_interface()
    demo.launch(
        share=True,
        show_error=True,
        server_name="0.0.0.0",
        server_port=7860,
    )
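
# --- Appendix: dedicated Voxtral API (a hedged sketch) ----------------------
# The AutoModelForSpeechSeq2Seq path above follows the generic Whisper-style
# API. Recent transformers releases also ship a dedicated Voxtral class and a
# transcription-request helper on the processor. The outline below mirrors the
# pattern from the model card; verify the exact class and method names against
# your installed transformers version before swapping it in:
#
#     from transformers import VoxtralForConditionalGeneration, AutoProcessor
#
#     repo_id = "mistralai/Voxtral-Small-24B-2507"
#     processor = AutoProcessor.from_pretrained(repo_id)
#     model = VoxtralForConditionalGeneration.from_pretrained(
#         repo_id, torch_dtype=torch.bfloat16, device_map="auto"
#     )
#     inputs = processor.apply_transcrition_request(  # sic: spelling as released
#         language="en", audio="sample.wav", model_id=repo_id
#     )
#     outputs = model.generate(**inputs.to(model.device), max_new_tokens=512)
#     text = processor.batch_decode(
#         outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True
#     )[0]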