Update src/app/conversation_core.py
src/app/conversation_core.py  (+58 -38)
```diff
@@ -13,7 +13,6 @@ from .config import get_user_dir
 
 import torch
 from gtts import gTTS
-import whisper  # openai-whisper library
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
@@ -106,21 +105,43 @@ _LANG_HINTS = {
 }
 
 
+##########################################
+# SPEECH RECOGNITION — faster-whisper
+##########################################
+
+from faster_whisper import WhisperModel
+
+_whisper_model = None
+
 def load_whisper():
+    global _whisper_model
+    if _whisper_model is None:
+        _whisper_model = WhisperModel("small", device="cpu", compute_type="int8")
+    return _whisper_model
+
+
+def transcribe_audio(audio_segment, spoken_lang=None):
     """
-
+    Accepts a pydub AudioSegment (mono, 16k).
+    Returns transcript, detected_language, confidence.
     """
     global _whisper_model
     if _whisper_model is None:
-
-        _whisper_model = whisper.load_model("small", device="cpu")
-    return _whisper_model
+        load_whisper()
 
-
-
-
-
-
+    import numpy as np
+
+    audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
+
+    segments, info = _whisper_model.transcribe(
+        audio,
+        beam_size=5,
+        language=spoken_lang,
+    )
+
+    full_text = " ".join([s.text.strip() for s in segments])
+
+    return full_text.strip(), info.language, info.language_probability
 
 
 ###############################################################
@@ -329,37 +350,36 @@ class ConversationManager:
     ###########################################################
     # AUDIO TRANSCRIPTION
     ###########################################################
-    def transcribe(self, audio_segment, spoken_lang: str = None):
+    def transcribe(self, audio_segment, spoken_lang: str = None):
         """
-
-        Returns (text, lang, confidence).
+        Transcribe a pydub AudioSegment using faster-whisper.
+        Returns (text, detected_language, confidence).
         """
-        import os
-        import tempfile
-
-
-
-
-
-
-
-
-
-
+        from faster_whisper import WhisperModel
+        import numpy as np
+
+        # lazy-load global model
+        global _whisper_model
+        if _whisper_model is None:
+            _whisper_model = WhisperModel(
+                "small",
+                device="cpu",
+                compute_type="int8"
+            )
+
+        # convert AudioSegment → float32 numpy array
+        audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
+
+        # run transcription
+        segments, info = _whisper_model.transcribe(
+            audio,
+            beam_size=5,
+            language=spoken_lang
+        )
+
+        full_text = " ".join([s.text.strip() for s in segments])
+        return full_text.strip(), info.language, info.language_probability
 
-        try:
-            # OpenAI whisper: language hint is optional but helps
-            result = model.transcribe(tmp_path, language=lang_hint)
-            text = (result.get("text") or "").strip()
-
-            # We don't have a simple probability; return 1.0 as dummy
-            return text, lang_hint, 1.0
-
-        finally:
-            try:
-                os.remove(tmp_path)
-            except OSError:
-                pass
 
 
     ###########################################################
```
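For context, a minimal usage sketch of the new `transcribe_audio` path. It is illustrative only: the `clip.wav` input, the `app.conversation_core` import path, and the explicit mono/16 kHz/16-bit conversion are assumptions, not part of this commit. The conversion matters because the `/ 32768.0` normalization in the diff maps signed 16-bit PCM samples (-32768..32767) into roughly [-1.0, 1.0), the float32 range faster-whisper consumes.

```python
# Minimal usage sketch (assumptions: pydub and faster-whisper installed,
# "clip.wav" is a hypothetical input file, and src/ is on sys.path so the
# module imports as app.conversation_core).
from pydub import AudioSegment

from app.conversation_core import transcribe_audio

# The docstring expects mono 16 kHz audio, and the /32768.0 normalization
# assumes 16-bit samples, so force all three explicitly before calling.
segment = (
    AudioSegment.from_file("clip.wav")
    .set_channels(1)
    .set_frame_rate(16000)
    .set_sample_width(2)  # 2 bytes per sample = 16-bit PCM
)

text, language, probability = transcribe_audio(segment, spoken_lang=None)
print(f"[{language} p={probability:.2f}] {text}")
```

With `spoken_lang=None`, faster-whisper auto-detects the language and reports it through `info.language` and `info.language_probability`, which is what lets this change drop the old openai-whisper path's dummy `1.0` confidence and its temp-file round trip.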