ravinder024 committed
Commit db214be · verified · 1 Parent(s): b030e34

v1 app.py added

Files changed (1)
  app.py  +263 -0
app.py ADDED
@@ -0,0 +1,263 @@
+ import gradio as gr
+ import torch
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+ import librosa
+ import numpy as np
+ import os
+ import tempfile
+ from datetime import datetime
+
+ # Global variables for model and processor
+ processor = None
+ model = None
+
+ def load_model():
+     """Load the Voxtral model and processor"""
+     global processor, model
+
+     if processor is not None and model is not None:
+         return processor, model
+
+     try:
+         model_name = "mistralai/Voxtral-Small-24B-2507"
+
+         print("Loading Voxtral model... This may take a few minutes.")
+         processor = AutoProcessor.from_pretrained(model_name)
+         # NOTE: depending on the installed transformers version, Voxtral may not load
+         # through the generic Auto classes used here; the model card documents a
+         # dedicated Voxtral model class and processor workflow if this call fails.
+         model = AutoModelForSpeechSeq2Seq.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16,
+             device_map="auto",
+             low_cpu_mem_usage=True,
+             trust_remote_code=True
+         )
+
+         print("Model loaded successfully!")
+         return processor, model
+
+     except Exception as e:
+         print(f"Error loading model: {str(e)}")
+         return None, None
+
+ def transcribe_audio(audio_file):
+     """Process audio file and return transcription"""
+     if audio_file is None:
+         return "Please upload an audio file.", "", ""
+
+     try:
+         # Load model if not already loaded
+         global processor, model
+         if processor is None or model is None:
+             processor, model = load_model()
+
+         if processor is None or model is None:
+             return "Error: Model failed to load. Please try again.", "", ""
+
+         # Load audio file
+         if isinstance(audio_file, str):
+             # If it's a file path
+             audio, sample_rate = librosa.load(audio_file, sr=16000)
+         else:
+             # If it's uploaded file data
+             audio, sample_rate = librosa.load(audio_file.name, sr=16000)
+
+         # Calculate duration
+         duration = len(audio) / sample_rate
+
+         # Process with the model
+         inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+
+         # Move inputs to the same device as model
+         if torch.cuda.is_available():
+             inputs = {k: v.cuda() if isinstance(v, torch.Tensor) else v for k, v in inputs.items()}
+
+         with torch.no_grad():
+             predicted_ids = model.generate(**inputs, max_length=512)
+             transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+
+         # Generate file info
+         word_count = len(transcription.split())
+         file_info = f"Duration: {duration:.2f} seconds | Words: {word_count} | Processed: {datetime.now().strftime('%H:%M:%S')}"
+
+         return transcription, file_info, transcription  # Return transcription twice for download
+
+     except Exception as e:
+         error_msg = f"Error processing audio: {str(e)}"
+         print(error_msg)
+         return error_msg, "", ""
+
+ def clear_inputs():
+     """Clear all inputs and outputs"""
+     return None, "", "", ""
+
+ # Custom CSS for better styling
+ css = """
+ .gradio-container {
+     font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+ }
+
+ .main-header {
+     text-align: center;
+     color: #2d5aa0;
+     margin-bottom: 20px;
+ }
+
+ .info-box {
+     background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+     color: white;
+     padding: 20px;
+     border-radius: 10px;
+     margin: 10px 0;
+ }
+
+ .result-box {
+     background-color: #f8f9fa;
+     border: 1px solid #e9ecef;
+     border-radius: 8px;
+     padding: 15px;
+     margin: 10px 0;
+ }
+ """
+
+ # Create the Gradio interface
+ def create_interface():
+     with gr.Blocks(css=css, title="Voxtral-Small-24B Speech Recognition") as demo:
+
+         # Header
+         gr.Markdown(
+             """
+             # 🎤 Voxtral-Small-24B Speech Recognition
+
+             Upload an audio file to transcribe it using Mistral AI's Voxtral-Small-24B-2507 model.
+             """,
+             elem_classes=["main-header"]
+         )
+
+         # Model info
+         with gr.Accordion("ℹ️ About this model", open=False):
+             gr.Markdown(
+                 """
+                 **Voxtral-Small-24B-2507** is a speech-to-text model developed by Mistral AI.
+
+                 - **Model**: mistralai/Voxtral-Small-24B-2507
+                 - **Type**: Speech-to-text transcription
+                 - **Developer**: Mistral AI
+                 - **Use Case**: Audio transcription and speech recognition
+                 - **Supported Formats**: WAV, MP3, FLAC, M4A, OGG
+
+                 💡 **Tip**: For best results, use clear audio files with minimal background noise.
+                 """
+             )
+
+         with gr.Row():
+             with gr.Column(scale=1):
+                 # Audio input
+                 audio_input = gr.Audio(
+                     label="📁 Upload Audio File",
+                     type="filepath",
+                     sources=["upload", "microphone"]
+                 )
+
+                 # Control buttons
+                 with gr.Row():
+                     transcribe_btn = gr.Button(
+                         "🚀 Transcribe Audio",
+                         variant="primary",
+                         size="lg"
+                     )
+                     clear_btn = gr.Button(
+                         "🗑️ Clear",
+                         variant="secondary"
+                     )
+
+             with gr.Column(scale=1):
+                 # Results
+                 transcription_output = gr.Textbox(
+                     label="📝 Transcription Result",
+                     lines=8,
+                     max_lines=15,
+                     placeholder="Transcribed text will appear here...",
+                     show_copy_button=True
+                 )
+
+                 # File info
+                 info_output = gr.Textbox(
+                     label="📊 Audio Information",
+                     lines=1,
+                     placeholder="Audio details will appear here..."
+                 )
+
+                 # Download option
+                 download_file = gr.File(
+                     label="💾 Download Transcription",
+                     visible=False
+                 )
+
+         # Hidden textbox for file content (for download)
+         hidden_text = gr.Textbox(visible=False)
+
+         # Event handlers
+         transcribe_btn.click(
+             fn=transcribe_audio,
+             inputs=[audio_input],
+             outputs=[transcription_output, info_output, hidden_text],
+             show_progress=True
+         )
+
+         # Update download file when transcription is complete
+         def update_download(text_content):
+             if text_content and text_content.strip():
+                 # Create a temporary file with the transcription
+                 temp_file = tempfile.NamedTemporaryFile(
+                     mode='w',
+                     delete=False,
+                     suffix='.txt',
+                     prefix='transcription_'
+                 )
+                 temp_file.write(text_content)
+                 temp_file.close()
+                 return gr.File(value=temp_file.name, visible=True)
+             else:
+                 return gr.File(visible=False)
+
+         hidden_text.change(
+             fn=update_download,
+             inputs=[hidden_text],
+             outputs=[download_file]
+         )
+
+         clear_btn.click(
+             fn=clear_inputs,
+             outputs=[audio_input, transcription_output, info_output, hidden_text]
+         )
+
+         # Footer
+         gr.Markdown(
+             """
+             ---
+
+             ### 🛠️ Usage Instructions:
+             1. **Upload**: Click on the audio input area to upload a file or use your microphone
+             2. **Transcribe**: Click the "Transcribe Audio" button to process your audio
+             3. **Results**: View your transcription in the text area on the right
+             4. **Download**: Use the download button to save your transcription as a text file
+
+             **Supported formats**: WAV, MP3, FLAC, M4A, OGG
+             """
+         )
+
+     return demo
+
+ # Initialize and launch the app
+ if __name__ == "__main__":
+     # Pre-load the model when the app starts
+     print("Initializing Voxtral model...")
+     load_model()
+
+     # Create and launch the interface
+     demo = create_interface()
+     demo.launch(
+         share=True,
+         show_error=True,
+         server_name="0.0.0.0",
+         server_port=7860
+     )
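
A quick way to exercise the committed functions outside the Gradio UI is a small driver script like the sketch below. This is only a sketch under assumptions that are not part of this commit: app.py is importable from the working directory, the packages it imports (gradio, torch, transformers, librosa, numpy) are installed, enough GPU memory is available for the 24B model, and "smoke_test.py" / "sample.wav" are placeholder names for a script and audio clip you supply yourself.

    # smoke_test.py -- minimal local check of load_model() / transcribe_audio() from app.py
    from app import load_model, transcribe_audio

    processor, model = load_model()
    if processor is None or model is None:
        raise SystemExit("Model failed to load; check GPU memory and the transformers version.")

    # "sample.wav" is a placeholder path; substitute any short audio file.
    text, info, _ = transcribe_audio("sample.wav")
    print(info)
    print(text)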