mastefan committed
Commit a2b6370 · verified · 1 parent: 55d033f

Update src/app/conversation_core.py

Files changed (1)
  1. src/app/conversation_core.py +58 -38
src/app/conversation_core.py CHANGED
@@ -13,7 +13,6 @@ from .config import get_user_dir
 
 import torch
 from gtts import gTTS
-import whisper # openai-whisper library
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
@@ -106,21 +105,43 @@ _LANG_HINTS = {
 }
 
 
+##########################################
+# SPEECH RECOGNITION — faster-whisper
+##########################################
+
+from faster_whisper import WhisperModel
+
+_whisper_model = None
+
 def load_whisper():
+    global _whisper_model
+    if _whisper_model is None:
+        _whisper_model = WhisperModel("small", device="cpu", compute_type="int8")
+    return _whisper_model
+
+
+def transcribe_audio(audio_segment, spoken_lang=None):
     """
-    Lazily load OpenAI's Whisper model (small) on CPU.
+    Accepts a pydub AudioSegment (mono, 16k).
+    Returns transcript, detected_language, confidence.
     """
     global _whisper_model
     if _whisper_model is None:
-        # you can pick "tiny", "base", "small" etc.
-        _whisper_model = whisper.load_model("small", device="cpu")
-    return _whisper_model
+        load_whisper()
 
-    device = "cuda" if torch.cuda.is_available() else "cpu"
-    compute_type = "float16" if device == "cuda" else "int8"
-    print(f"[conversation_core] Loading Whisper {WHISPER_MODEL_SIZE} on {device} ({compute_type})")
-    _WHISPER = WhisperModel("small", device="cpu", compute_type="int8")
-    return _WHISPER
+    import numpy as np
+
+    audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
+
+    segments, info = _whisper_model.transcribe(
+        audio,
+        beam_size=5,
+        language=spoken_lang,
+    )
+
+    full_text = " ".join([s.text.strip() for s in segments])
+
+    return full_text.strip(), info.language, info.language_probability
 
 
 ###############################################################
@@ -329,37 +350,36 @@ class ConversationManager:
     ###########################################################
     # AUDIO TRANSCRIPTION
    ###########################################################
-    def transcribe(self, audio_segment, spoken_lang: str = "english"):
+    def transcribe(self, audio_segment, spoken_lang: str = None):
         """
-        OpenAI Whisper transcription with optional language hint.
-        Returns (text, detected_lang_or_hint, dummy_confidence).
+        Transcribe a pydub AudioSegment using faster-whisper.
+        Returns (text, detected_language, confidence).
         """
-        import tempfile
-        import os
-
-        model = load_whisper()
-
-        # Map spoken_lang ("english", "japanese", etc.) to ISO code
-        lang_hint = _LANG_HINTS.get(spoken_lang.lower(), "en")
-
-        # Export the pydub AudioSegment to a temporary WAV file
-        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
-            audio_segment.export(tmp.name, format="wav")
-            tmp_path = tmp.name
+        from faster_whisper import WhisperModel
+        import numpy as np
+
+        # lazy-load global model
+        global _whisper_model
+        if _whisper_model is None:
+            _whisper_model = WhisperModel(
+                "small",
+                device="cpu",
+                compute_type="int8"
+            )
+
+        # convert AudioSegment → float32 numpy array
+        audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
+
+        # run transcription
+        segments, info = _whisper_model.transcribe(
+            audio,
+            beam_size=5,
+            language=spoken_lang
+        )
+
+        full_text = " ".join([s.text.strip() for s in segments])
+        return full_text.strip(), info.language, info.language_probability
 
-        try:
-            # OpenAI whisper: language hint is optional but helps
-            result = model.transcribe(tmp_path, language=lang_hint)
-            text = (result.get("text") or "").strip()
-
-            # We don't have a simple probability; return 1.0 as dummy
-            return text, lang_hint, 1.0
-
-        finally:
-            try:
-                os.remove(tmp_path)
-            except OSError:
-                pass
 
 
     ###########################################################
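
For context, a minimal usage sketch of the new faster-whisper path, assuming pydub is installed and that the module imports as src.app.conversation_core; the import path and "sample.wav" are placeholders, not part of the commit:

    # Hypothetical usage (not part of this commit): feed a mono 16 kHz
    # pydub AudioSegment to the new module-level helper.
    from pydub import AudioSegment

    from src.app.conversation_core import transcribe_audio  # import path assumed

    clip = AudioSegment.from_wav("sample.wav")           # placeholder recording
    clip = clip.set_channels(1).set_frame_rate(16000)    # mono, 16 kHz as the docstring expects

    text, language, probability = transcribe_audio(clip)
    print(f"[{language} {probability:.2f}] {text}")

With spoken_lang left as None, faster-whisper auto-detects the language, and the confidence returned is its language_probability for that detection.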