Mahmoud Elsamadony
committed on
Commit
·
988a3de
1
Parent(s):
e1c6b8d
Update
Browse files- API_USAGE.md +238 -0
- README.md +69 -2
- api_client.py +128 -0
- api_requirements.txt +1 -0
- app.py +30 -20
API_USAGE.md
ADDED
|
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Using VTT with Diarization Space via API
|
| 2 |
+
|
| 3 |
+
This guide shows you how to use your Hugging Face Space via API.
|
| 4 |
+
|
| 5 |
+
## Option 1: Using Python (Gradio Client)
|
| 6 |
+
|
| 7 |
+
### Installation
|
| 8 |
+
|
| 9 |
+
```bash
|
| 10 |
+
pip install gradio_client
|
| 11 |
+
```
|
| 12 |
+
|
| 13 |
+
### Quick Start
|
| 14 |
+
|
| 15 |
+
```python
|
| 16 |
+
from gradio_client import Client
|
| 17 |
+
|
| 18 |
+
# Initialize client
|
| 19 |
+
client = Client("MahmoudElsamadony/vtt-with-diariazation")
|
| 20 |
+
|
| 21 |
+
# Transcribe audio
|
| 22 |
+
result = client.predict(
|
| 23 |
+
audio_path="path/to/your/audio.mp3",
|
| 24 |
+
language="ar", # or "en", "fr", etc., or "" for auto-detect
|
| 25 |
+
enable_diarization=False,
|
| 26 |
+
beam_size=5,
|
| 27 |
+
best_of=5,
|
| 28 |
+
api_name="/predict"
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
transcript, details = result
|
| 32 |
+
print(f"Transcript: {transcript}")
|
| 33 |
+
print(f"Language: {details['language']}")
|
| 34 |
+
print(f"Duration: {details['duration']} seconds")
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
### With Speaker Diarization
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
# Enable diarization to identify different speakers
|
| 41 |
+
result = client.predict(
|
| 42 |
+
audio_path="path/to/your/audio.mp3",
|
| 43 |
+
language="ar",
|
| 44 |
+
enable_diarization=True, # Enable speaker diarization
|
| 45 |
+
beam_size=5,
|
| 46 |
+
best_of=5,
|
| 47 |
+
api_name="/predict"
|
| 48 |
+
)
|
| 49 |
+
|
| 50 |
+
transcript, details = result
|
| 51 |
+
|
| 52 |
+
# Access speaker information
|
| 53 |
+
for segment in details['segments']:
|
| 54 |
+
speaker = segment.get('speaker', 'Unknown')
|
| 55 |
+
text = segment['text']
|
| 56 |
+
start = segment['start']
|
| 57 |
+
print(f"[{start:.2f}s] {speaker}: {text}")
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
### Full Example Script
|
| 61 |
+
|
| 62 |
+
See `api_client.py` for a complete example with multiple use cases.
|
| 63 |
+
|
| 64 |
+
```bash
|
| 65 |
+
python api_client.py
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
## Option 2: Using JavaScript/TypeScript
|
| 69 |
+
|
| 70 |
+
### Installation
|
| 71 |
+
|
| 72 |
+
```bash
|
| 73 |
+
npm install @gradio/client
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
### Usage
|
| 77 |
+
|
| 78 |
+
```javascript
|
| 79 |
+
import { client } from "@gradio/client";
|
| 80 |
+
|
| 81 |
+
const app = await client("MahmoudElsamadony/vtt-with-diariazation");
|
| 82 |
+
|
| 83 |
+
const result = await app.predict("/predict", [
|
| 84 |
+
"path/to/audio.mp3", // audio_path
|
| 85 |
+
"ar", // language
|
| 86 |
+
false, // enable_diarization
|
| 87 |
+
5, // beam_size
|
| 88 |
+
5 // best_of
|
| 89 |
+
]);
|
| 90 |
+
|
| 91 |
+
const [transcript, details] = result.data;
|
| 92 |
+
console.log("Transcript:", transcript);
|
| 93 |
+
console.log("Language:", details.language);
|
| 94 |
+
console.log("Duration:", details.duration);
|
| 95 |
+
```
|
| 96 |
+
|
| 97 |
+
## Option 3: Using cURL (REST API)
|
| 98 |
+
|
| 99 |
+
First, get your Space's API endpoint:
|
| 100 |
+
|
| 101 |
+
```bash
|
| 102 |
+
curl https://mahmoudelsamadony-vtt-with-diariazation.hf.space/info
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
Then make a prediction (you'll need to upload the file first):
|
| 106 |
+
|
| 107 |
+
```bash
|
| 108 |
+
# This is more complex with cURL as you need to handle file uploads
|
| 109 |
+
# It's recommended to use the Python or JavaScript clients instead
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
## Parameters
|
| 113 |
+
|
| 114 |
+
| Parameter | Type | Default | Description |
|
| 115 |
+
|-----------|------|---------|-------------|
|
| 116 |
+
| `audio_path` | string | required | Path to audio file (mp3, wav, m4a, etc.) |
|
| 117 |
+
| `language` | string | "ar" | Language code ("ar", "en", "fr", "de", "es", "ru", "zh") or "" for auto-detect |
|
| 118 |
+
| `enable_diarization` | boolean | false | Enable speaker diarization (identifies different speakers) |
|
| 119 |
+
| `beam_size` | integer | 5 | Beam size for Whisper (1-10, higher = more accurate but slower) |
|
| 120 |
+
| `best_of` | integer | 5 | Best of parameter for Whisper (1-10) |
|
| 121 |
+
|
| 122 |
+
## Response Format
|
| 123 |
+
|
| 124 |
+
The API returns a tuple `(transcript, details)`:
|
| 125 |
+
|
| 126 |
+
### transcript (string)
|
| 127 |
+
The complete transcribed text.
|
| 128 |
+
|
| 129 |
+
### details (object)
|
| 130 |
+
```json
|
| 131 |
+
{
|
| 132 |
+
"text": "Complete transcript text",
|
| 133 |
+
"language": "ar",
|
| 134 |
+
"language_probability": 0.98,
|
| 135 |
+
"duration": 123.45,
|
| 136 |
+
"segments": [
|
| 137 |
+
{
|
| 138 |
+
"start": 0.0,
|
| 139 |
+
"end": 5.2,
|
| 140 |
+
"text": "Segment text",
|
| 141 |
+
"speaker": "SPEAKER_00", // Only if diarization is enabled
|
| 142 |
+
"words": [
|
| 143 |
+
{
|
| 144 |
+
"start": 0.0,
|
| 145 |
+
"end": 0.5,
|
| 146 |
+
"word": "word",
|
| 147 |
+
"probability": 0.95
|
| 148 |
+
}
|
| 149 |
+
]
|
| 150 |
+
}
|
| 151 |
+
],
|
| 152 |
+
"speakers": [ // Only if diarization is enabled
|
| 153 |
+
{
|
| 154 |
+
"start": 0.0,
|
| 155 |
+
"end": 10.5,
|
| 156 |
+
"speaker": "SPEAKER_00"
|
| 157 |
+
}
|
| 158 |
+
]
|
| 159 |
+
}
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
## Error Handling
|
| 163 |
+
|
| 164 |
+
```python
|
| 165 |
+
from gradio_client import Client
|
| 166 |
+
|
| 167 |
+
try:
|
| 168 |
+
client = Client("MahmoudElsamadony/vtt-with-diariazation")
|
| 169 |
+
result = client.predict(
|
| 170 |
+
audio_path="audio.mp3",
|
| 171 |
+
language="ar",
|
| 172 |
+
enable_diarization=False,
|
| 173 |
+
beam_size=5,
|
| 174 |
+
best_of=5,
|
| 175 |
+
api_name="/predict"
|
| 176 |
+
)
|
| 177 |
+
transcript, details = result
|
| 178 |
+
print(transcript)
|
| 179 |
+
except Exception as e:
|
| 180 |
+
print(f"Error: {e}")
|
| 181 |
+
```
|
| 182 |
+
|
| 183 |
+
## Tips
|
| 184 |
+
|
| 185 |
+
1. **First run takes longer** - The space needs to download models (~1.2GB total with the `small` Whisper model plus the diarization pipeline; larger Whisper models download more)
|
| 186 |
+
2. **Diarization requires HF token** - Make sure you've set `HF_TOKEN` in your Space secrets
|
| 187 |
+
3. **Use appropriate beam_size** - Higher values (8-10) are more accurate but slower
|
| 188 |
+
4. **Language auto-detection** - Pass empty string `""` for language to auto-detect
|
| 189 |
+
5. **Rate limits** - Hugging Face Spaces have rate limits for free usage
|
| 190 |
+
|
| 191 |
+
## Local Testing
|
| 192 |
+
|
| 193 |
+
To test the API locally before deploying:
|
| 194 |
+
|
| 195 |
+
```bash
|
| 196 |
+
# In your space directory
|
| 197 |
+
python app.py
|
| 198 |
+
```
|
| 199 |
+
|
| 200 |
+
Then access via:
|
| 201 |
+
```python
|
| 202 |
+
client = Client("http://127.0.0.1:7860")
|
| 203 |
+
```
|
| 204 |
+
|
| 205 |
+
## Advanced: Async Usage
|
| 206 |
+
|
| 207 |
+
```python
|
| 208 |
+
from gradio_client import Client
|
| 209 |
+
|
| 210 |
+
async def transcribe_async():
|
| 211 |
+
client = Client("MahmoudElsamadony/vtt-with-diariazation")
|
| 212 |
+
|
| 213 |
+
# Submit job
|
| 214 |
+
job = client.submit(
|
| 215 |
+
audio_path="audio.mp3",
|
| 216 |
+
language="ar",
|
| 217 |
+
enable_diarization=False,
|
| 218 |
+
beam_size=5,
|
| 219 |
+
best_of=5,
|
| 220 |
+
api_name="/predict"
|
| 221 |
+
)
|
| 222 |
+
|
| 223 |
+
# Do other work while waiting...
|
| 224 |
+
|
| 225 |
+
# Get result when ready (job.result() blocks until the prediction finishes)
|
| 226 |
+
result = job.result()
|
| 227 |
+
return result
|
| 228 |
+
|
| 229 |
+
# Use with asyncio
|
| 230 |
+
import asyncio
|
| 231 |
+
result = asyncio.run(transcribe_async())
|
| 232 |
+
```
|
| 233 |
+
|
| 234 |
+
## Support
|
| 235 |
+
|
| 236 |
+
For issues with the API, check:
|
| 237 |
+
- Space logs: https://huggingface.co/spaces/MahmoudElsamadony/vtt-with-diariazation/logs
|
| 238 |
+
- Gradio Client docs: https://www.gradio.app/guides/getting-started-with-the-python-client
|
README.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
|
@@ -10,4 +10,71 @@ pinned: false
|
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: VTT with Diarization
|
| 3 |
+
emoji: 🎙️
|
| 4 |
colorFrom: purple
|
| 5 |
colorTo: indigo
|
| 6 |
sdk: gradio
|
|
|
|
| 10 |
license: mit
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# Voice-to-Text with Speaker Diarization
|
| 14 |
+
|
| 15 |
+
Powered by **faster-whisper** and **pyannote.audio** running locally on this Space.
|
| 16 |
+
|
| 17 |
+
## Features
|
| 18 |
+
|
| 19 |
+
- 🎯 **High-quality transcription** using faster-whisper (2-4x faster than OpenAI Whisper)
|
| 20 |
+
- 👥 **Speaker diarization** with pyannote.audio 3.1
|
| 21 |
+
- 🌍 **Multi-language support** (Arabic, English, French, German, Spanish, Russian, Chinese, etc.)
|
| 22 |
+
- ⚙️ **Configurable parameters** (beam size, best_of, model size)
|
| 23 |
+
- 🔧 **Optimized for Arabic customer service calls** with specialized prompts
|
| 24 |
+
|
| 25 |
+
## Usage
|
| 26 |
+
|
| 27 |
+
1. Upload an audio file (mp3, wav, m4a, flac, etc.)
|
| 28 |
+
2. Select language (or leave blank for auto-detect)
|
| 29 |
+
3. Enable speaker diarization if needed (requires HF_TOKEN)
|
| 30 |
+
4. Adjust quality parameters if desired
|
| 31 |
+
5. Click "Transcribe"
|
| 32 |
+
|
| 33 |
+
## Configuration
|
| 34 |
+
|
| 35 |
+
Set these in Space Settings → Variables:
|
| 36 |
+
|
| 37 |
+
- `WHISPER_MODEL_SIZE`: Model size (`tiny`, `base`, `small`, `medium`, `large-v3`) - default: `large-v3`
|
| 38 |
+
- `WHISPER_DEVICE`: Device (`cpu` or `cuda`) - default: `cpu`
|
| 39 |
+
- `WHISPER_COMPUTE_TYPE`: Compute type (`int8`, `int8_float32`, `int16`, `float32`) - default: `int8_float32`
|
| 40 |
+
- `DEFAULT_LANGUAGE`: Default language code - default: `ar` (Arabic)
|
| 41 |
+
- `WHISPER_BEAM_SIZE`: Beam search size (1-10) - default: `5`
|
| 42 |
+
- `WHISPER_BEST_OF`: Best of candidates (1-10) - default: `5`
|
| 43 |
+
|
| 44 |
+
### Secrets (required for diarization):
|
| 45 |
+
|
| 46 |
+
- `HF_TOKEN`: Your Hugging Face token with access to `pyannote/speaker-diarization-3.1`
|
| 47 |
+
|
| 48 |
+
## Model Information
|
| 49 |
+
|
| 50 |
+
### Whisper Models
|
| 51 |
+
|
| 52 |
+
| Model | Size | RAM | Quality | Speed |
|
| 53 |
+
|-------|------|-----|---------|-------|
|
| 54 |
+
| tiny | 75MB | 1GB | ⭐⭐ | Very Fast |
|
| 55 |
+
| base | 150MB | 1GB | ⭐⭐⭐ | Fast |
|
| 56 |
+
| small | 500MB | 2GB | ⭐⭐⭐⭐ | Moderate |
|
| 57 |
+
| medium | 1.5GB | 5GB | ⭐⭐⭐⭐⭐ | Slow |
|
| 58 |
+
| large-v3 | 3GB | 10GB | ⭐⭐⭐⭐⭐⭐ | Very Slow |
|
| 59 |
+
|
| 60 |
+
### First Run
|
| 61 |
+
|
| 62 |
+
- First transcription will download the selected Whisper model automatically
|
| 63 |
+
- Diarization downloads ~700MB on first use (cached afterward)
|
| 64 |
+
- Models are stored in the Space's persistent storage
|
| 65 |
+
|
| 66 |
+
## Technical Details
|
| 67 |
+
|
| 68 |
+
- Uses the same model loading approach as the Django backend
|
| 69 |
+
- faster-whisper automatically downloads models from Hugging Face
|
| 70 |
+
- Diarization pipeline is downloaded locally to avoid repeated API calls
|
| 71 |
+
- All processing happens on this Space (no external inference APIs)
|
| 72 |
+
|
| 73 |
+
## Credits
|
| 74 |
+
|
| 75 |
+
- [faster-whisper](https://github.com/guillaumekln/faster-whisper) by Guillaume Klein
|
| 76 |
+
- [pyannote.audio](https://github.com/pyannote/pyannote-audio) by Hervé Bredin
|
| 77 |
+
- Original Django backend by IZI Techs
|
| 78 |
+
|
| 79 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
| 80 |
+
|
api_client.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
API Client for VTT with Diarization Hugging Face Space
|
| 3 |
+
Usage example for calling the space via Gradio Client API
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from gradio_client import Client
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
# Your Hugging Face Space URL
|
| 10 |
+
SPACE_URL = "MahmoudElsamadony/vtt-with-diariazation"
|
| 11 |
+
|
| 12 |
+
def transcribe_audio(
|
| 13 |
+
audio_file_path: str,
|
| 14 |
+
language: str = "ar",
|
| 15 |
+
enable_diarization: bool = False,
|
| 16 |
+
beam_size: int = 5,
|
| 17 |
+
best_of: int = 5,
|
| 18 |
+
):
|
| 19 |
+
"""
|
| 20 |
+
Transcribe audio file using the Hugging Face Space API
|
| 21 |
+
|
| 22 |
+
Args:
|
| 23 |
+
audio_file_path: Path to the audio file (mp3, wav, m4a, etc.)
|
| 24 |
+
language: Language code ("ar", "en", "fr", etc.) or "" for auto-detect
|
| 25 |
+
enable_diarization: Whether to enable speaker diarization
|
| 26 |
+
beam_size: Beam size for Whisper (1-10)
|
| 27 |
+
best_of: Best of parameter for Whisper (1-10)
|
| 28 |
+
|
| 29 |
+
Returns:
|
| 30 |
+
tuple: (transcript_text, detailed_json)
|
| 31 |
+
"""
|
| 32 |
+
# Initialize the client
|
| 33 |
+
client = Client(SPACE_URL)
|
| 34 |
+
|
| 35 |
+
# Call the transcribe function
|
| 36 |
+
result = client.predict(
|
| 37 |
+
audio_path=audio_file_path,
|
| 38 |
+
language=language,
|
| 39 |
+
enable_diarization=enable_diarization,
|
| 40 |
+
beam_size=beam_size,
|
| 41 |
+
best_of=best_of,
|
| 42 |
+
api_name="/predict"
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
return result
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def main():
|
| 49 |
+
"""Example usage of the API client"""
|
| 50 |
+
|
| 51 |
+
# Example 1: Basic transcription (Arabic, no diarization)
|
| 52 |
+
print("=" * 60)
|
| 53 |
+
print("Example 1: Basic Arabic transcription")
|
| 54 |
+
print("=" * 60)
|
| 55 |
+
|
| 56 |
+
# Replace with your actual audio file path
|
| 57 |
+
audio_file = "path/to/your/audio.mp3"
|
| 58 |
+
|
| 59 |
+
if os.path.exists(audio_file):
|
| 60 |
+
transcript, details = transcribe_audio(
|
| 61 |
+
audio_file_path=audio_file,
|
| 62 |
+
language="ar",
|
| 63 |
+
enable_diarization=False,
|
| 64 |
+
)
|
| 65 |
+
|
| 66 |
+
print(f"\nTranscript:\n{transcript}\n")
|
| 67 |
+
print(f"Language: {details.get('language')}")
|
| 68 |
+
print(f"Duration: {details.get('duration')} seconds")
|
| 69 |
+
print(f"Number of segments: {len(details.get('segments', []))}")
|
| 70 |
+
else:
|
| 71 |
+
print(f"Audio file not found: {audio_file}")
|
| 72 |
+
|
| 73 |
+
print("\n" + "=" * 60)
|
| 74 |
+
print("Example 2: Transcription with speaker diarization")
|
| 75 |
+
print("=" * 60)
|
| 76 |
+
|
| 77 |
+
# Example 2: Transcription with diarization
|
| 78 |
+
if os.path.exists(audio_file):
|
| 79 |
+
transcript, details = transcribe_audio(
|
| 80 |
+
audio_file_path=audio_file,
|
| 81 |
+
language="ar",
|
| 82 |
+
enable_diarization=True,
|
| 83 |
+
beam_size=5,
|
| 84 |
+
best_of=5,
|
| 85 |
+
)
|
| 86 |
+
|
| 87 |
+
print(f"\nTranscript:\n{transcript}\n")
|
| 88 |
+
|
| 89 |
+
# Print speaker turns
|
| 90 |
+
if "speakers" in details:
|
| 91 |
+
print("\nSpeaker turns:")
|
| 92 |
+
for turn in details["speakers"][:5]: # Show first 5 turns
|
| 93 |
+
print(f" {turn['speaker']}: {turn['start']:.2f}s - {turn['end']:.2f}s")
|
| 94 |
+
|
| 95 |
+
# Print segments with speakers
|
| 96 |
+
print("\nSegments with speakers:")
|
| 97 |
+
for segment in details.get("segments", [])[:3]: # Show first 3 segments
|
| 98 |
+
speaker = segment.get("speaker", "Unknown")
|
| 99 |
+
text = segment.get("text", "")
|
| 100 |
+
start = segment.get("start", 0)
|
| 101 |
+
print(f" [{start:.2f}s] {speaker}: {text}")
|
| 102 |
+
else:
|
| 103 |
+
print(f"Audio file not found: {audio_file}")
|
| 104 |
+
|
| 105 |
+
print("\n" + "=" * 60)
|
| 106 |
+
print("Example 3: Auto-detect language")
|
| 107 |
+
print("=" * 60)
|
| 108 |
+
|
| 109 |
+
# Example 3: Auto-detect language
|
| 110 |
+
if os.path.exists(audio_file):
|
| 111 |
+
transcript, details = transcribe_audio(
|
| 112 |
+
audio_file_path=audio_file,
|
| 113 |
+
language="", # Empty string for auto-detect
|
| 114 |
+
enable_diarization=False,
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
print(f"\nDetected language: {details.get('language')}")
|
| 118 |
+
print(f"Language probability: {details.get('language_probability'):.2%}")
|
| 119 |
+
print(f"\nTranscript:\n{transcript}")
|
| 120 |
+
else:
|
| 121 |
+
print(f"Audio file not found: {audio_file}")
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
if __name__ == "__main__":
|
| 125 |
+
# Install gradio_client first:
|
| 126 |
+
# pip install gradio_client
|
| 127 |
+
|
| 128 |
+
main()
|
api_requirements.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gradio_client>=0.7.0
|
app.py
CHANGED
|
@@ -14,16 +14,13 @@ load_dotenv()
|
|
| 14 |
# ---------------------------------------------------------------------------
|
| 15 |
# Configuration via environment variables (override inside HF Space settings)
|
| 16 |
# ---------------------------------------------------------------------------
|
|
|
|
|
|
|
| 17 |
WHISPER_MODEL_SIZE = os.environ.get("WHISPER_MODEL_SIZE", "large-v3")
|
| 18 |
-
WHISPER_REPO_ID = os.environ.get(
|
| 19 |
-
"WHISPER_REPO_ID", f"guillaumekln/faster-whisper-{WHISPER_MODEL_SIZE}"
|
| 20 |
-
)
|
| 21 |
-
WHISPER_LOCAL_DIR = os.environ.get(
|
| 22 |
-
"WHISPER_LOCAL_DIR", f"models/faster-whisper-{WHISPER_MODEL_SIZE}"
|
| 23 |
-
)
|
| 24 |
WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
|
| 25 |
WHISPER_COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8_float32")
|
| 26 |
|
|
|
|
| 27 |
DIARIZATION_REPO_ID = os.environ.get(
|
| 28 |
"DIARIZATION_REPO_ID", "pyannote/speaker-diarization-3.1"
|
| 29 |
)
|
|
@@ -67,15 +64,15 @@ def _ensure_snapshot(repo_id: str, local_dir: str, allow_patterns: Optional[List
|
|
| 67 |
|
| 68 |
|
| 69 |
def _load_whisper_model() -> WhisperModel:
|
|
|
|
| 70 |
global _whisper_model
|
| 71 |
if _whisper_model is None:
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
)
|
| 77 |
_whisper_model = WhisperModel(
|
| 78 |
-
|
| 79 |
device=WHISPER_DEVICE,
|
| 80 |
compute_type=WHISPER_COMPUTE_TYPE,
|
| 81 |
)
|
|
@@ -83,14 +80,21 @@ def _load_whisper_model() -> WhisperModel:
|
|
| 83 |
|
| 84 |
|
| 85 |
def _load_diarization_pipeline() -> Optional[Pipeline]:
|
|
|
|
| 86 |
global _diarization_pipeline
|
| 87 |
if _diarization_pipeline is None:
|
| 88 |
if HF_TOKEN is None:
|
| 89 |
raise gr.Error(
|
| 90 |
"HF_TOKEN secret is missing. Add it in Space settings to enable diarization."
|
| 91 |
)
|
|
|
|
|
|
|
|
|
|
| 92 |
local_dir = _ensure_snapshot(DIARIZATION_REPO_ID, DIARIZATION_LOCAL_DIR)
|
| 93 |
-
_diarization_pipeline = Pipeline.from_pretrained(
|
|
|
|
|
|
|
|
|
|
| 94 |
_diarization_pipeline.to(torch.device("cpu"))
|
| 95 |
return _diarization_pipeline
|
| 96 |
|
|
@@ -110,15 +114,19 @@ def transcribe(
|
|
| 110 |
|
| 111 |
model = _load_whisper_model()
|
| 112 |
|
|
|
|
| 113 |
segments, info = model.transcribe(
|
| 114 |
audio_path,
|
| 115 |
language=language if language else None,
|
| 116 |
beam_size=beam_size,
|
| 117 |
best_of=best_of,
|
| 118 |
-
temperature=
|
| 119 |
vad_filter=True,
|
| 120 |
-
vad_parameters=dict(
|
| 121 |
-
|
|
|
|
|
|
|
|
|
|
| 122 |
initial_prompt=initial_prompt,
|
| 123 |
compression_ratio_threshold=2.4,
|
| 124 |
log_prob_threshold=-1.0,
|
|
@@ -242,11 +250,13 @@ def build_interface() -> gr.Blocks:
|
|
| 242 |
)
|
| 243 |
|
| 244 |
gr.Markdown(
|
| 245 |
-
"""
|
| 246 |
## Tips
|
| 247 |
-
-
|
| 248 |
-
-
|
| 249 |
-
-
|
|
|
|
|
|
|
| 250 |
"""
|
| 251 |
)
|
| 252 |
|
|
|
|
| 14 |
# ---------------------------------------------------------------------------
|
| 15 |
# Configuration via environment variables (override inside HF Space settings)
|
| 16 |
# ---------------------------------------------------------------------------
|
| 17 |
+
# Whisper model: use same model names as Django app (tiny, base, small, medium, large-v3)
|
| 18 |
+
# faster-whisper will download these automatically from Hugging Face on first run
|
| 19 |
WHISPER_MODEL_SIZE = os.environ.get("WHISPER_MODEL_SIZE", "large-v3")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
WHISPER_DEVICE = os.environ.get("WHISPER_DEVICE", "cpu")
|
| 21 |
WHISPER_COMPUTE_TYPE = os.environ.get("WHISPER_COMPUTE_TYPE", "int8_float32")
|
| 22 |
|
| 23 |
+
# Diarization: download locally to avoid repeated API calls
|
| 24 |
DIARIZATION_REPO_ID = os.environ.get(
|
| 25 |
"DIARIZATION_REPO_ID", "pyannote/speaker-diarization-3.1"
|
| 26 |
)
|
|
|
|
| 64 |
|
| 65 |
|
| 66 |
def _load_whisper_model() -> WhisperModel:
|
| 67 |
+
"""Load Faster Whisper model lazily (singleton) - same approach as Django app"""
|
| 68 |
global _whisper_model
|
| 69 |
if _whisper_model is None:
|
| 70 |
+
print(f"Loading Faster Whisper model: {WHISPER_MODEL_SIZE} on {WHISPER_DEVICE} with compute_type={WHISPER_COMPUTE_TYPE}")
|
| 71 |
+
|
| 72 |
+
# Load model by name - faster-whisper downloads automatically from HuggingFace
|
| 73 |
+
# This is the same approach used in the Django app
|
|
|
|
| 74 |
_whisper_model = WhisperModel(
|
| 75 |
+
WHISPER_MODEL_SIZE, # Model name: tiny, base, small, medium, large-v3
|
| 76 |
device=WHISPER_DEVICE,
|
| 77 |
compute_type=WHISPER_COMPUTE_TYPE,
|
| 78 |
)
|
|
|
|
| 80 |
|
| 81 |
|
| 82 |
def _load_diarization_pipeline() -> Optional[Pipeline]:
|
| 83 |
+
"""Load speaker diarization pipeline lazily (singleton)"""
|
| 84 |
global _diarization_pipeline
|
| 85 |
if _diarization_pipeline is None:
|
| 86 |
if HF_TOKEN is None:
|
| 87 |
raise gr.Error(
|
| 88 |
"HF_TOKEN secret is missing. Add it in Space settings to enable diarization."
|
| 89 |
)
|
| 90 |
+
|
| 91 |
+
print("Loading diarization pipeline...")
|
| 92 |
+
# Download the pipeline locally to avoid repeated API calls
|
| 93 |
local_dir = _ensure_snapshot(DIARIZATION_REPO_ID, DIARIZATION_LOCAL_DIR)
|
| 94 |
+
_diarization_pipeline = Pipeline.from_pretrained(
|
| 95 |
+
local_dir,
|
| 96 |
+
use_auth_token=HF_TOKEN # Note: newer versions use 'token' instead
|
| 97 |
+
)
|
| 98 |
_diarization_pipeline.to(torch.device("cpu"))
|
| 99 |
return _diarization_pipeline
|
| 100 |
|
|
|
|
| 114 |
|
| 115 |
model = _load_whisper_model()
|
| 116 |
|
| 117 |
+
# Transcription parameters matching Django app configuration
|
| 118 |
segments, info = model.transcribe(
|
| 119 |
audio_path,
|
| 120 |
language=language if language else None,
|
| 121 |
beam_size=beam_size,
|
| 122 |
best_of=best_of,
|
| 123 |
+
temperature=[0.0, 0.2, 0.4, 0.6], # Matching Django app
|
| 124 |
vad_filter=True,
|
| 125 |
+
vad_parameters=dict(
|
| 126 |
+
min_silence_duration_ms=300, # Split sooner on short pauses
|
| 127 |
+
speech_pad_ms=120
|
| 128 |
+
),
|
| 129 |
+
condition_on_previous_text=False, # KEY: stop cross-segment repetition
|
| 130 |
initial_prompt=initial_prompt,
|
| 131 |
compression_ratio_threshold=2.4,
|
| 132 |
log_prob_threshold=-1.0,
|
|
|
|
| 250 |
)
|
| 251 |
|
| 252 |
gr.Markdown(
|
| 253 |
+
f"""
|
| 254 |
## Tips
|
| 255 |
+
- **Current model**: `{WHISPER_MODEL_SIZE}` (first run downloads model automatically)
|
| 256 |
+
- Diarization downloads ~700MB on first use (cached afterward)
|
| 257 |
+
- Store your Hugging Face token in Space Secrets as **HF_TOKEN** (required for diarization)
|
| 258 |
+
- Change `WHISPER_MODEL_SIZE` in Space Variables to `medium` or `large-v3` for higher accuracy
|
| 259 |
+
- Optimized for Arabic customer service calls with specialized initial prompt
|
| 260 |
"""
|
| 261 |
)
|
| 262 |
|