import gradio as gr
import torch
from transformers import AutoModelForSeq2SeqLM, BitsAndBytesConfig, AutoTokenizer
from IndicTransToolkit import IndicProcessor
import speech_recognition as sr
from pydub import AudioSegment
import os
from sentence_transformers import SentenceTransformer, util  # Multilingual similarity
# Constants
BATCH_SIZE = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
quantization = None
MAX_AUDIO_DURATION = 600  # 10 minutes in seconds
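# Note: setting `quantization` to "4-bit" or "8-bit" makes the loader below go
# through bitsandbytes instead of full precision; that path typically assumes a
# CUDA GPU and the bitsandbytes package being installed.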
# ---- IndicTrans2 Model Initialization ----
def initialize_model_and_tokenizer(ckpt_dir, quantization):
    if quantization == "4-bit":
        qconfig = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
    elif quantization == "8-bit":
        qconfig = BitsAndBytesConfig(
            load_in_8bit=True,
            bnb_8bit_use_double_quant=True,
            bnb_8bit_compute_dtype=torch.bfloat16,
        )
    else:
        qconfig = None
    tokenizer = AutoTokenizer.from_pretrained(ckpt_dir, trust_remote_code=True)
    model = AutoModelForSeq2SeqLM.from_pretrained(
        ckpt_dir,
        trust_remote_code=True,
        low_cpu_mem_usage=True,
        quantization_config=qconfig,
    )
    # bitsandbytes-quantized models manage device placement and dtype themselves,
    # so only move/half the model when it was loaded in full precision.
    if qconfig is None:
        model = model.to(DEVICE)
        if DEVICE == "cuda":
            model.half()
    model.eval()
    return tokenizer, model
def batch_translate(input_sentences, src_lang, tgt_lang, model, tokenizer, ip):
    translations = []
    for i in range(0, len(input_sentences), BATCH_SIZE):
        batch = input_sentences[i : i + BATCH_SIZE]
        batch = ip.preprocess_batch(batch, src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer(
            batch,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )
        with tokenizer.as_target_tokenizer():
            generated_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
        translations += ip.postprocess_batch(generated_tokens, lang=tgt_lang)
        del inputs
        torch.cuda.empty_cache()
    return translations
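# A (hypothetical) direct call, using the FLORES-style language codes that
# IndicTrans2 expects, would look like:
#   batch_translate(["നമസ്കാരം"], "mal_Mlym", "eng_Latn",
#                   indic_en_model, indic_en_tokenizer, ip)
# and return a single-element list holding the English translation.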
# Initialize IndicTrans2 (Indic -> English)
indic_en_ckpt_dir = "ai4bharat/indictrans2-indic-en-1B"
indic_en_tokenizer, indic_en_model = initialize_model_and_tokenizer(indic_en_ckpt_dir, quantization)

# The reverse direction needs its own checkpoint: the indic-en model cannot
# translate eng_Latn -> mal_Mlym, so also load the en-indic model for the
# back-translation used in the similarity check.
en_indic_ckpt_dir = "ai4bharat/indictrans2-en-indic-1B"
en_indic_tokenizer, en_indic_model = initialize_model_and_tokenizer(en_indic_ckpt_dir, quantization)

ip = IndicProcessor(inference=True)

# Load LaBSE for multilingual similarity
similarity_model = SentenceTransformer("sentence-transformers/LaBSE")
# ---- Audio Processing Function ----
def convert_audio_to_wav(file_path):
    """Convert audio to WAV format for compatibility with SpeechRecognition."""
    audio = AudioSegment.from_file(file_path)
    # os.path.splitext swaps only the extension; a plain str.replace on the
    # extension could also rewrite it wherever it appears elsewhere in the path.
    wav_path = os.path.splitext(file_path)[0] + ".wav"
    audio.export(wav_path, format="wav")
    return wav_path
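# Note: pydub decodes non-WAV formats by shelling out to ffmpeg, so ffmpeg must
# be available on the system PATH (on Hugging Face Spaces, e.g. via packages.txt).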
def transcribe_audio_in_chunks(audio_path, chunk_duration=30):
    """Transcribe long audio files in chunks of `chunk_duration` seconds."""
    recognizer = sr.Recognizer()
    audio = AudioSegment.from_wav(audio_path)
    # Limit audio duration to MAX_AUDIO_DURATION (pydub lengths are in milliseconds)
    if len(audio) > MAX_AUDIO_DURATION * 1000:
        audio = audio[:MAX_AUDIO_DURATION * 1000]
    full_text = []
    for i in range(0, len(audio), chunk_duration * 1000):
        chunk = audio[i : i + chunk_duration * 1000]
        chunk_path = "temp_chunk.wav"  # each chunk overwrites the previous temp file
        chunk.export(chunk_path, format="wav")
        with sr.AudioFile(chunk_path) as source:
            audio_data = recognizer.record(source)
            try:
                text = recognizer.recognize_google(audio_data, language="ml-IN")
                full_text.append(text)
            except sr.UnknownValueError:
                full_text.append("[Unrecognized Audio]")
            except sr.RequestError as e:
                full_text.append(f"[Speech Error: {e}]")
    # Clean up the temporary chunk file once transcription is done
    if os.path.exists("temp_chunk.wav"):
        os.remove("temp_chunk.wav")
    return " ".join(full_text)
# Multilingual Semantic Similarity Function (Auto-Reference)
def compute_similarity(malayalam_text, english_translation):
    """Compares the original Malayalam transcription with back-translated Malayalam text for similarity."""
    if not malayalam_text.strip():
        print("⚠️ Malayalam transcription is empty!")
        return "N/A"
    if not english_translation.strip():
        print("⚠️ English translation is empty!")
        return "N/A"
    try:
        # Translate English back to Malayalam (en-indic model) for comparison
        back_translated = batch_translate(
            [english_translation], "eng_Latn", "mal_Mlym",
            en_indic_model, en_indic_tokenizer, ip,
        )[0]
        # Encode the Malayalam transcription and the back-translated Malayalam
        embeddings = similarity_model.encode([malayalam_text, back_translated])
        # Compute cosine similarity
        similarity_score = util.cos_sim(embeddings[0], embeddings[1]).item()
        return round(similarity_score * 100, 2)  # Convert to percentage
    except Exception as e:
        print(f"Error in similarity computation: {e}")
        return "N/A"
# ---- Gradio Function ----
def transcribe_and_translate(audio):
    # Convert to WAV if necessary
    if not audio.endswith(".wav"):
        audio = convert_audio_to_wav(audio)
    # Transcribe audio in chunks
    malayalam_text = transcribe_audio_in_chunks(audio)
    # Translate Malayalam -> English
    ml_sents = [malayalam_text]
    src_lang, tgt_lang = "mal_Mlym", "eng_Latn"
    translations = batch_translate(ml_sents, src_lang, tgt_lang, indic_en_model, indic_en_tokenizer, ip)
    # Compute multilingual semantic similarity (Malayalam -> English -> Malayalam)
    similarity_score = compute_similarity(malayalam_text, translations[0])
    return malayalam_text, translations[0], f"{similarity_score}%"  # Similarity as a percentage
# ---- Gradio Interface ----
iface = gr.Interface(
    fn=transcribe_and_translate,
    inputs=[
        gr.Audio(sources=["microphone", "upload"], type="filepath"),  # Audio input only
    ],
    outputs=[
        gr.Textbox(label="Malayalam Transcription"),
        gr.Textbox(label="English Translation"),
        gr.Textbox(label="Semantic Similarity (%)"),  # Computed automatically
    ],
    title="Malayalam Speech Recognition & Translation",
    description="Speak in Malayalam → Transcribe using Speech Recognition → Translate to English & Measure Accuracy.",
    allow_flagging="never",
)

iface.launch(debug=True, share=True)
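# Rough dependency sketch inferred from the imports above (package names are an
# assumption, not taken from the Space's actual requirements.txt):
#   pip install gradio torch transformers sentence-transformers \
#       SpeechRecognition pydub IndicTransToolkit
# plus bitsandbytes if quantization is enabled, and ffmpeg as a system package.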