# Source: vtt-with-diariazation / api_client.py
# Author: Mahmoud Elsamadony
# Commit: ce8875c — "Update GPU Usage"
"""
API Client for VTT with Diarization Hugging Face Space
Usage example for calling the space via Gradio Client API
"""
from gradio_client import Client
import os
# Your Hugging Face Space URL
SPACE_URL = "MahmoudElsamadony/vtt-with-diariazation"
def transcribe_audio(
    audio_file_path: str,
    language: str = "ar",
    enable_diarization: bool = False,
    beam_size: int = 5,
    best_of: int = 5,
):
    """
    Send an audio file to the Hugging Face Space and return its transcription.

    Args:
        audio_file_path: Path to the audio file (mp3, wav, m4a, etc.)
        language: Language code ("ar", "en", "fr", etc.) or "" for auto-detect
        enable_diarization: Whether to enable speaker diarization
        beam_size: Beam size for Whisper (1-10)
        best_of: Best of parameter for Whisper (1-10)

    Returns:
        tuple: (transcript_text, detailed_json)
    """
    # NOTE: a fresh client is created per call; reuse a single Client
    # instance yourself if you invoke this in a tight loop.
    space_client = Client(SPACE_URL)
    return space_client.predict(
        audio_path=audio_file_path,
        language=language,
        enable_diarization=enable_diarization,
        beam_size=beam_size,
        best_of=best_of,
        api_name="/predict",
    )
def main():
"""Example usage of the API client"""
# Example 1: Basic transcription (Arabic, no diarization)
print("=" * 60)
print("Example 1: Basic Arabic transcription")
print("=" * 60)
# Replace with your actual audio file path
audio_file = "path/to/your/audio.mp3"
if os.path.exists(audio_file):
transcript, details = transcribe_audio(
audio_file_path=audio_file,
language="ar",
enable_diarization=False,
)
print(f"\nTranscript:\n{transcript}\n")
print(f"Language: {details.get('language')}")
print(f"Duration: {details.get('duration')} seconds")
print(f"Number of segments: {len(details.get('segments', []))}")
else:
print(f"Audio file not found: {audio_file}")
print("\n" + "=" * 60)
print("Example 2: Transcription with speaker diarization")
print("=" * 60)
# Example 2: Transcription with diarization
if os.path.exists(audio_file):
transcript, details = transcribe_audio(
audio_file_path=audio_file,
language="ar",
enable_diarization=True,
beam_size=5,
best_of=5,
)
print(f"\nTranscript:\n{transcript}\n")
# Print speaker turns
if "speakers" in details:
print("\nSpeaker turns:")
for turn in details["speakers"][:5]: # Show first 5 turns
print(f" {turn['speaker']}: {turn['start']:.2f}s - {turn['end']:.2f}s")
# Print segments with speakers
print("\nSegments with speakers:")
for segment in details.get("segments", [])[:3]: # Show first 3 segments
speaker = segment.get("speaker", "Unknown")
text = segment.get("text", "")
start = segment.get("start", 0)
print(f" [{start:.2f}s] {speaker}: {text}")
else:
print(f"Audio file not found: {audio_file}")
print("\n" + "=" * 60)
print("Example 3: Auto-detect language")
print("=" * 60)
# Example 3: Auto-detect language
if os.path.exists(audio_file):
transcript, details = transcribe_audio(
audio_file_path=audio_file,
language="", # Empty string for auto-detect
enable_diarization=False,
)
print(f"\nDetected language: {details.get('language')}")
print(f"Language probability: {details.get('language_probability'):.2%}")
print(f"\nTranscript:\n{transcript}")
else:
print(f"Audio file not found: {audio_file}")
if __name__ == "__main__":
# Install gradio_client first:
# pip install gradio_client
main()