mastefan committed on
Commit
7a76102
·
verified ·
1 Parent(s): 16c3076

Update src/app/conversation_core.py

Browse files
Files changed (1) hide show
  1. src/app/conversation_core.py +21 -25
src/app/conversation_core.py CHANGED
@@ -348,37 +348,33 @@ class ConversationManager:
348
 
349
 
350
  ###########################################################
351
- # AUDIO TRANSCRIPTION
352
  ###########################################################
353
- def transcribe(self, audio_segment, spoken_lang: str = None):
354
- """
355
- Transcribe a pydub AudioSegment using faster-whisper.
356
- Returns (text, detected_language, confidence).
357
- """
358
- from faster_whisper import WhisperModel
 
 
 
 
 
359
  import numpy as np
360
 
361
- # lazy-load global model
362
- global _whisper_model
363
- if _whisper_model is None:
364
- _whisper_model = WhisperModel(
365
- "small",
366
- device="cpu",
367
- compute_type="int8"
368
- )
369
 
370
- # convert AudioSegment float32 numpy array
371
- audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
372
 
373
- # run transcription
374
- segments, info = _whisper_model.transcribe(
375
- audio,
376
- beam_size=5,
377
- language=spoken_lang
378
- )
379
 
380
- full_text = " ".join([s.text.strip() for s in segments])
381
- return full_text.strip(), info.language, info.language_probability
 
382
 
383
 
384
 
 
348
 
349
 
350
###########################################################
# AUDIO TRANSCRIPTION — Transformers Whisper
###########################################################

from transformers import pipeline

# Eagerly build the Whisper ASR pipeline once, so every transcribe()
# call reuses the same loaded model instead of re-loading per call.
# NOTE(review): the diff hunk header says this sits inside
# `class ConversationManager:` — confirm whether `whisper_pipe` is a
# module-level global or a class attribute; the flattened diff hides
# the indentation.
# NOTE(review): device is hard-coded to CPU — presumably for HF Spaces
# compatibility; verify no GPU path is expected.
whisper_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device="cpu"
)
362
+
363
def transcribe(self, audio_segment, spoken_lang=None):
    """Transcribe a pydub AudioSegment with the module-level Whisper pipeline.

    Parameters
    ----------
    audio_segment : pydub.AudioSegment
        Audio to transcribe; any frame rate, sample width, or channel count.
    spoken_lang : str | None
        Optional language hint forwarded to Whisper; None lets the model
        auto-detect.

    Returns
    -------
    tuple[str, str, float]
        ``(text, language, confidence)``. Confidence is a fixed 1.0
        placeholder — the transformers ASR pipeline does not expose a
        language probability.
    """
    import numpy as np

    samples = np.array(audio_segment.get_array_of_samples()).astype("float32")

    # Downmix interleaved multi-channel audio to mono: get_array_of_samples()
    # interleaves channels, which Whisper would otherwise hear as doubled-speed
    # garbage.
    channels = getattr(audio_segment, "channels", 1)
    if channels > 1:
        samples = samples.reshape(-1, channels).mean(axis=1)

    # Scale fixed-point PCM to [-1, 1] by the format's full-scale value
    # (e.g. 32768 for 16-bit). Unlike peak normalization this cannot divide
    # by zero on silent input and does not amplify quiet recordings.
    full_scale = float(1 << (8 * audio_segment.sample_width - 1))
    samples = samples / full_scale

    # Pass the true sampling rate so the pipeline resamples to the 16 kHz
    # Whisper expects; a bare array would be assumed to already be 16 kHz.
    inputs = {"raw": samples, "sampling_rate": audio_segment.frame_rate}

    # Forward the caller's language hint instead of silently ignoring it.
    generate_kwargs = {"language": spoken_lang} if spoken_lang else {}

    result = whisper_pipe(inputs, generate_kwargs=generate_kwargs)
    text = result.get("text", "").strip()

    # No language-probability output is available from this pipeline; keep
    # the (text, lang, confidence) return shape with a 1.0 placeholder.
    return text, spoken_lang or "unknown", 1.0
377
+
378
 
379
 
380