Update src/app/conversation_core.py
src/app/conversation_core.py  (+58 -38)
```diff
@@ -13,7 +13,6 @@ from .config import get_user_dir
 
 import torch
 from gtts import gTTS
-import whisper  # openai-whisper library
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
@@ -106,21 +105,43 @@ _LANG_HINTS = {
 }
 
 
+##########################################
+# SPEECH RECOGNITION — faster-whisper
+##########################################
+
+from faster_whisper import WhisperModel
+
+_whisper_model = None
+
 def load_whisper():
+    global _whisper_model
+    if _whisper_model is None:
+        _whisper_model = WhisperModel("small", device="cpu", compute_type="int8")
+    return _whisper_model
+
+
+def transcribe_audio(audio_segment, spoken_lang=None):
     """
-
+    Accepts a pydub AudioSegment (mono, 16k).
+    Returns transcript, detected_language, confidence.
     """
     global _whisper_model
     if _whisper_model is None:
-
-        _whisper_model = whisper.load_model("small", device="cpu")
-    return _whisper_model
+        load_whisper()
 
-
-
-
-
-
+    import numpy as np
+
+    audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
+
+    segments, info = _whisper_model.transcribe(
+        audio,
+        beam_size=5,
+        language=spoken_lang,
+    )
+
+    full_text = " ".join([s.text.strip() for s in segments])
+
+    return full_text.strip(), info.language, info.language_probability
 
 
 ###############################################################
@@ -329,37 +350,36 @@ class ConversationManager:
     ###########################################################
     # AUDIO TRANSCRIPTION
     ###########################################################
-    def transcribe(self, audio_segment, spoken_lang: str = None):
+    def transcribe(self, audio_segment, spoken_lang: str = None):
         """
-
-        Returns (text, lang, confidence).
+        Transcribe a pydub AudioSegment using faster-whisper.
+        Returns (text, detected_language, confidence).
         """
-        import os
-        import tempfile
-
-
-
-
-
-
-
-
-
-
+        from faster_whisper import WhisperModel
+        import numpy as np
+
+        # lazy-load global model
+        global _whisper_model
+        if _whisper_model is None:
+            _whisper_model = WhisperModel(
+                "small",
+                device="cpu",
+                compute_type="int8"
+            )
+
+        # convert AudioSegment → float32 numpy array
+        audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
+
+        # run transcription
+        segments, info = _whisper_model.transcribe(
+            audio,
+            beam_size=5,
+            language=spoken_lang
+        )
+
+        full_text = " ".join([s.text.strip() for s in segments])
+        return full_text.strip(), info.language, info.language_probability
 
-        try:
-            # OpenAI whisper: language hint is optional but helps
-            result = model.transcribe(tmp_path, language=lang_hint)
-            text = (result.get("text") or "").strip()
-
-            # We don't have a simple probability; return 1.0 as dummy
-            return text, lang_hint, 1.0
-
-        finally:
-            try:
-                os.remove(tmp_path)
-            except OSError:
-                pass
 
 
     ###########################################################
```
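For context, a minimal usage sketch of the new `transcribe_audio` path. It is illustrative only: the `clip.wav` input, the `app.conversation_core` import path, and the explicit mono/16 kHz/16-bit conversion are assumptions, not part of this commit. The conversion matters because the `/ 32768.0` normalization in the diff maps signed 16-bit PCM samples (-32768..32767) into roughly [-1.0, 1.0), the float32 range faster-whisper consumes.

```python
# Minimal usage sketch (assumptions: pydub and faster-whisper installed,
# "clip.wav" is a hypothetical input file, and src/ is on sys.path so the
# module imports as app.conversation_core).
from pydub import AudioSegment

from app.conversation_core import transcribe_audio

# The docstring expects mono 16 kHz audio, and the /32768.0 normalization
# assumes 16-bit samples, so force all three explicitly before calling.
segment = (
    AudioSegment.from_file("clip.wav")
    .set_channels(1)
    .set_frame_rate(16000)
    .set_sample_width(2)  # 2 bytes per sample = 16-bit PCM
)

text, language, probability = transcribe_audio(segment, spoken_lang=None)
print(f"[{language} p={probability:.2f}] {text}")
```

With `spoken_lang=None`, faster-whisper auto-detects the language and reports it through `info.language` and `info.language_probability`, which is what lets this change drop the old openai-whisper path's dummy `1.0` confidence and its temp-file round trip.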