Spaces:

ravinder024
/

testspace-for-voice-to-text

Sleeping

App Files Files Community

ravinder024 committed on Sep 23

Commit

db214be

verified ·

1 Parent(s): b030e34

v1 app.py added

Browse files

Files changed (1) hide show

app.py +263 -0

app.py ADDED Viewed

	@@ -0,0 +1,263 @@

+import gradio as gr
+import torch
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+import librosa
+import numpy as np
+import os
+import tempfile
+from datetime import datetime
+# Module-level cache for the processor and model: both start as None and are filled in by load_model()
+processor = None
+model = None
+def load_model():
+    """Load the Voxtral model and processor"""
+    global processor, model
+    if processor is not None and model is not None:
+        return processor, model
+    try:
+        model_name = "mistralai/Voxtral-Small-24B-2507"
+        print("Loading Voxtral model... This may take a few minutes.")
+        processor = AutoProcessor.from_pretrained(model_name)
+        model = AutoModelForSpeechSeq2Seq.from_pretrained(
+            model_name,
+            torch_dtype=torch.float16,
+            device_map="auto",
+            low_cpu_mem_usage=True,
+            trust_remote_code=True
+        )
+        print("Model loaded successfully!")
+        return processor, model
+    except Exception as e:
+        print(f"Error loading model: {str(e)}")
+        return None, None
+def transcribe_audio(audio_file):
+    """Process audio file and return transcription"""
+    if audio_file is None:
+        return "Please upload an audio file.", "", ""
+    try:
+        # Load model if not already loaded
+        global processor, model
+        if processor is None or model is None:
+            processor, model = load_model()
+        if processor is None or model is None:
+            return "Error: Model failed to load. Please try again.", "", ""
+        # Load audio file
+        if isinstance(audio_file, str):
+            # If it's a file path
+            audio, sample_rate = librosa.load(audio_file, sr=16000)
+        else:
+            # If it's uploaded file data
+            audio, sample_rate = librosa.load(audio_file.name, sr=16000)
+        # Calculate duration
+        duration = len(audio) / sample_rate
+        # Process with the model
+        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+        # Move inputs to the same device as model
+        if torch.cuda.is_available():
+            inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+        with torch.no_grad():
+            predicted_ids = model.generate(**inputs, max_length=512)
+            transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        # Generate file info
+        word_count = len(transcription.split())
+        file_info = f"Duration: {duration:.2f} seconds | Words: {word_count} | Processed: {datetime.now().strftime('%H:%M:%S')}"
+        return transcription, file_info, transcription  # Return transcription twice for download
+    except Exception as e:
+        error_msg = f"Error processing audio: {str(e)}"
+        print(error_msg)
+        return error_msg, "", ""
+def clear_inputs():
+    """Clear all inputs and outputs"""
+    return None, "", "", ""
+# Stylesheet handed to gr.Blocks(css=...) in create_interface(); of these classes only "main-header" is referenced there (via elem_classes)
+css = """
+.gradio-container {
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+}
+.main-header {
+    text-align: center;
+    color: #2d5aa0;
+    margin-bottom: 20px;
+}
+.info-box {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    color: white;
+    padding: 20px;
+    border-radius: 10px;
+    margin: 10px 0;
+}
+.result-box {
+    background-color: #f8f9fa;
+    border: 1px solid #e9ecef;
+    border-radius: 8px;
+    padding: 15px;
+    margin: 10px 0;
+}
+"""
+# Create the Gradio interface
+def create_interface():
+    with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
+        # Header
+        gr.Markdown(
+            """
+            # 🎤 Voxtral-Small-24B Speech Recognition
+            Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
+            """,
+            elem_classes=["main-header"]
+        )
+        # Model info
+        with gr.Accordion("ℹ️ About this model", open=False):
+            gr.Markdown(
+                """
+                **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.
+                - **Model**: mistralai/Voxtral-Small-24B-2507
+                - **Type**: Speech-to-Text Transformation
+                - **Developer**: Mistral AI
+                - **Use Case**: Audio transcription and speech recognition
+                - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG
+                💡 **Tip**: For best results, use clear audio files with minimal background noise.
+                """
+            )
+        with gr.Row():
+            with gr.Column(scale=1):
+                # Audio input
+                audio_input = gr.Audio(
+                    label="📁 Upload Audio File",
+                    type="filepath",
+                    sources=["upload", "microphone"]
+                )
+                # Control buttons
+                with gr.Row():
+                    transcribe_btn = gr.Button(
+                        "🚀 Transcribe Audio",
+                        variant="primary",
+                        size="lg"
+                    )
+                    clear_btn = gr.Button(
+                        "🗑️ Clear",
+                        variant="secondary"
+                    )
+            with gr.Column(scale=1):
+                # Results
+                transcription_output = gr.Textbox(
+                    label="📝 Transcription Result",
+                    lines=8,
+                    max_lines=15,
+                    placeholder="Transcribed text will appear here...",
+                    show_copy_button=True
+                )
+                # File info
+                info_output = gr.Textbox(
+                    label="📊 Audio Information",
+                    lines=1,
+                    placeholder="Audio details will appear here..."
+                )
+                # Download option
+                download_file = gr.File(
+                    label="💾 Download Transcription",
+                    visible=False
+                )
+        # Hidden textbox for file content (for download)
+        hidden_text = gr.Textbox(visible=False)
+        # Event handlers
+        transcribe_btn.click(
+            fn=transcribe_audio,
+            inputs=[audio_input],
+            outputs=[transcription_output, info_output, hidden_text],
+            show_progress=True
+        )
+        # Update download file when transcription is complete
+        def update_download(text_content):
+            if text_content and text_content.strip():
+                # Create a temporary file with the transcription
+                temp_file = tempfile.NamedTemporaryFile(
+                    mode='w',
+                    delete=False,
+                    suffix='.txt',
+                    prefix='transcription_'
+                )
+                temp_file.write(text_content)
+                temp_file.close()
+                return gr.File(value=temp_file.name, visible=True)
+            else:
+                return gr.File(visible=False)
+        hidden_text.change(
+            fn=update_download,
+            inputs=[hidden_text],
+            outputs=[download_file]
+        )
+        clear_btn.click(
+            fn=clear_inputs,
+            outputs=[audio_input, transcription_output, info_output, hidden_text]
+        )
+        # Footer
+        gr.Markdown(
+            """
+            ---
+            ### 🛠️ Usage Instructions:
+            1. **Upload**: Click on the audio input area to upload a file or use your microphone
+            2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
+            3. **Results**: View your transcription in the text area on the right
+            4. **Download**: Use the download button to save your transcription as a text file
+            **Supported formats**: WAV, MP3, FLAC, M4A, OGG
+            """
+        )
+    return demo
+# Initialize and launch the app
+if __name__ == "__main__":
+    # Pre-load the model when the app starts
+    print("Initializing Voxtral model...")
+    load_model()
+    # Create and launch the interface
+    demo = create_interface()
+    demo.launch(
+        share=True,
+        show_error=True,
+        server_name="0.0.0.0",
+        server_port=7860
+    )