|
|
""" |
|
|
API Client for VTT with Diarization Hugging Face Space |
|
|
Usage example for calling the space via Gradio Client API |
|
|
""" |
|
|
|
|
|
from gradio_client import Client |
|
|
import os |
|
|
|
|
|
|
|
|
SPACE_URL = "MahmoudElsamadony/vtt-with-diariazation" |
|
|
|
|
|
def transcribe_audio(
    audio_file_path: str,
    language: str = "ar",
    enable_diarization: bool = False,
    beam_size: int = 5,
    best_of: int = 5,
):
    """Transcribe an audio file using the Hugging Face Space API.

    Args:
        audio_file_path: Path to the audio file (mp3, wav, m4a, etc.).
        language: Language code ("ar", "en", "fr", etc.) or "" for auto-detect.
        enable_diarization: Whether to enable speaker diarization.
        beam_size: Beam size for Whisper (1-10).
        best_of: Best-of parameter for Whisper (1-10).

    Returns:
        tuple: (transcript_text, detailed_json) as returned by the Space's
        ``/predict`` endpoint.

    Raises:
        FileNotFoundError: If ``audio_file_path`` does not exist locally —
            fail fast with a clear error instead of letting the remote
            upload fail obscurely.
    """
    if not os.path.exists(audio_file_path):
        raise FileNotFoundError(f"Audio file not found: {audio_file_path}")

    # Connecting may take a while if the Space is cold-starting.
    client = Client(SPACE_URL)

    result = client.predict(
        audio_path=audio_file_path,
        language=language,
        enable_diarization=enable_diarization,
        beam_size=beam_size,
        best_of=best_of,
        api_name="/predict",
    )

    return result
|
|
|
|
|
|
|
|
def _example_basic(audio_file: str) -> None:
    """Example 1: plain Arabic transcription without diarization."""
    transcript, details = transcribe_audio(
        audio_file_path=audio_file,
        language="ar",
        enable_diarization=False,
    )

    print(f"\nTranscript:\n{transcript}\n")
    print(f"Language: {details.get('language')}")
    print(f"Duration: {details.get('duration')} seconds")
    print(f"Number of segments: {len(details.get('segments', []))}")


def _example_diarization(audio_file: str) -> None:
    """Example 2: Arabic transcription with speaker diarization."""
    transcript, details = transcribe_audio(
        audio_file_path=audio_file,
        language="ar",
        enable_diarization=True,
        beam_size=5,
        best_of=5,
    )

    print(f"\nTranscript:\n{transcript}\n")

    if "speakers" in details:
        # Show only the first few speaker turns / segments to keep output short.
        print("\nSpeaker turns:")
        for turn in details["speakers"][:5]:
            print(f"  {turn['speaker']}: {turn['start']:.2f}s - {turn['end']:.2f}s")

        print("\nSegments with speakers:")
        for segment in details.get("segments", [])[:3]:
            speaker = segment.get("speaker", "Unknown")
            text = segment.get("text", "")
            start = segment.get("start", 0)
            print(f"  [{start:.2f}s] {speaker}: {text}")


def _example_autodetect(audio_file: str) -> None:
    """Example 3: transcription with automatic language detection."""
    transcript, details = transcribe_audio(
        audio_file_path=audio_file,
        language="",  # empty string requests auto-detect
        enable_diarization=False,
    )

    print(f"\nDetected language: {details.get('language')}")
    # Guard: formatting None with :.2% would raise TypeError when the
    # endpoint omits 'language_probability'.
    probability = details.get("language_probability")
    if probability is not None:
        print(f"Language probability: {probability:.2%}")
    print(f"\nTranscript:\n{transcript}")


def main():
    """Run the three usage examples against the Space."""
    audio_file = "path/to/your/audio.mp3"  # replace with a real local path

    print("=" * 60)
    print("Example 1: Basic Arabic transcription")
    print("=" * 60)
    if os.path.exists(audio_file):
        _example_basic(audio_file)
    else:
        print(f"Audio file not found: {audio_file}")

    print("\n" + "=" * 60)
    print("Example 2: Transcription with speaker diarization")
    print("=" * 60)
    if os.path.exists(audio_file):
        _example_diarization(audio_file)
    else:
        print(f"Audio file not found: {audio_file}")

    print("\n" + "=" * 60)
    print("Example 3: Auto-detect language")
    print("=" * 60)
    if os.path.exists(audio_file):
        _example_autodetect(audio_file)
    else:
        print(f"Audio file not found: {audio_file}")
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
|
|
|
|
|
|
|
|
|
main() |