mastefan committed on
Commit
7a76102
·
verified ·
1 Parent(s): 16c3076

Update src/app/conversation_core.py

Browse files
Files changed (1) hide show
  1. src/app/conversation_core.py +21 -25
src/app/conversation_core.py CHANGED
@@ -348,37 +348,33 @@ class ConversationManager:
348
 
349
 
350
  ###########################################################
351
- # AUDIO TRANSCRIPTION
352
  ###########################################################
353
- def transcribe(self, audio_segment, spoken_lang: str = None):
354
- """
355
- Transcribe a pydub AudioSegment using faster-whisper.
356
- Returns (text, detected_language, confidence).
357
- """
358
- from faster_whisper import WhisperModel
 
 
 
 
 
359
  import numpy as np
360
 
361
- # lazy-load global model
362
- global _whisper_model
363
- if _whisper_model is None:
364
- _whisper_model = WhisperModel(
365
- "small",
366
- device="cpu",
367
- compute_type="int8"
368
- )
369
 
370
- # convert AudioSegment float32 numpy array
371
- audio = np.array(audio_segment.get_array_of_samples()).astype("float32") / 32768.0
372
 
373
- # run transcription
374
- segments, info = _whisper_model.transcribe(
375
- audio,
376
- beam_size=5,
377
- language=spoken_lang
378
- )
379
 
380
- full_text = " ".join([s.text.strip() for s in segments])
381
- return full_text.strip(), info.language, info.language_probability
 
382
 
383
 
384
 
 
348
 
349
 
350
###########################################################
# AUDIO TRANSCRIPTION — Transformers Whisper
###########################################################

from transformers import pipeline

# Eagerly build the Whisper ASR pipeline once, so every transcribe()
# call reuses the same loaded model instead of re-loading per call.
# NOTE(review): the diff hunk header says this sits inside
# `class ConversationManager:` — confirm whether `whisper_pipe` is a
# module-level global or a class attribute; the flattened diff hides
# the indentation.
# NOTE(review): device is hard-coded to CPU — presumably for HF Spaces
# compatibility; verify no GPU path is expected.
whisper_pipe = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-small",
    device="cpu"
)
362
+
363
def transcribe(self, audio_segment, spoken_lang=None):
    """Transcribe a pydub AudioSegment with the module-level Whisper pipeline.

    Parameters
    ----------
    audio_segment : pydub.AudioSegment
        Audio to transcribe; any frame rate, sample width, or channel count.
    spoken_lang : str | None
        Optional language hint forwarded to Whisper; None lets the model
        auto-detect.

    Returns
    -------
    tuple[str, str, float]
        ``(text, language, confidence)``. Confidence is a fixed 1.0
        placeholder — the transformers ASR pipeline does not expose a
        language probability.
    """
    import numpy as np

    samples = np.array(audio_segment.get_array_of_samples()).astype("float32")

    # Downmix interleaved multi-channel audio to mono: get_array_of_samples()
    # interleaves channels, which Whisper would otherwise hear as doubled-speed
    # garbage.
    channels = getattr(audio_segment, "channels", 1)
    if channels > 1:
        samples = samples.reshape(-1, channels).mean(axis=1)

    # Scale fixed-point PCM to [-1, 1] by the format's full-scale value
    # (e.g. 32768 for 16-bit). Unlike peak normalization this cannot divide
    # by zero on silent input and does not amplify quiet recordings.
    full_scale = float(1 << (8 * audio_segment.sample_width - 1))
    samples = samples / full_scale

    # Pass the true sampling rate so the pipeline resamples to the 16 kHz
    # Whisper expects; a bare array would be assumed to already be 16 kHz.
    inputs = {"raw": samples, "sampling_rate": audio_segment.frame_rate}

    # Forward the caller's language hint instead of silently ignoring it.
    generate_kwargs = {"language": spoken_lang} if spoken_lang else {}

    result = whisper_pipe(inputs, generate_kwargs=generate_kwargs)
    text = result.get("text", "").strip()

    # No language-probability output is available from this pipeline; keep
    # the (text, lang, confidence) return shape with a 1.0 placeholder.
    return text, spoken_lang or "unknown", 1.0
377
+
378
 
379
 
380