daihui.zhang committed
Commit 6696134
1 Parent(s): d30439a

fix vad bug

Files changed:
- config.py +1 -1
- transcribe/pipelines/pipe_vad.py +12 -1
- transcribe/whisper_llm_serve.py +9 -11
config.py CHANGED
@@ -2,7 +2,7 @@ import pathlib
 import re
 import logging
 
-DEBUG =
+DEBUG = True
 TEST = False
 logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 
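This hunk only flips the flag; nothing in the diff shows how DEBUG is consumed. As a hedged illustration (the basicConfig wiring below is an assumption about typical usage, not code from this repository), such a flag is commonly used to raise the root logging level so that debug-level calls, like the ones added in pipe_vad.py below, actually emit output:

import logging
import config

# Hypothetical wiring, assuming config.DEBUG gates the root log level.
# The pywhispercpp logger stays at WARNING because config.py pins it.
logging.basicConfig(level=logging.DEBUG if config.DEBUG else logging.INFO)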
transcribe/pipelines/pipe_vad.py CHANGED
@@ -5,6 +5,8 @@ import numpy as np
 from silero_vad import get_speech_timestamps
 import torch
 from typing import List
+import logging
+
 # import noisereduce as nr
 
 def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
@@ -95,6 +97,8 @@ class VadPipe(BasePipe):
     def _process_speech_chunk(self, source_audio:np.ndarray):
         speech_dict = self.vac(source_audio, return_seconds=False)
         if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
             start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
             if start_frame:
                 relative_start_frame = max(0, (start_frame - self._offset))
@@ -107,27 +111,34 @@ class VadPipe(BasePipe):
         self.vac.reset_states()
 
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
-        speech_data = self.
+        speech_data = self._process_speech_chunk(source_audio)
         self._offset += len(source_audio)
         if speech_data:  # a speech change point appeared in the audio
             rel_start_frame, rel_end_frame = speech_data
             if rel_start_frame and not rel_end_frame:
                 self._status = "START"  # speech started
                 target_audio = source_audio[rel_start_frame:]
+                logging.debug("🔊 Speech started, capturing audio from frame: {}".format(rel_start_frame))
             elif not rel_start_frame and rel_end_frame:
                 self._status = "END"  # speech ended
                 target_audio = source_audio[:rel_end_frame]
+                logging.debug("🔚 Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
             elif rel_start_frame and rel_end_frame:
                 self._status = 'END'
                 target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug("🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
             else:
                 self._status = 'END'
                 target_audio = np.array([],dtype=np.float32)
+                logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
             if self._status == 'START':
                 target_audio = source_audio
+                logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
             else:  # end
                 target_audio = np.array([],dtype=np.float32)
+                self._status = 'END'
+                logging.debug("❌ No speech detected, setting status to END")
 
 
         in_data.audio = target_audio.tobytes()
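The substantive fix is the pair of None initializations in _process_speech_chunk: relative_start_frame and relative_end_frame were previously assigned only inside their respective branches, so a Silero VAD result containing just an "end" key (or just a "start" key) could leave one of them unbound when the method returned. The other behavioral change is in the no-speech branch of the run loop, which now explicitly resets the status to 'END'. A minimal reproduction of the unbound-variable failure mode, using a simplified stand-in for the method (illustrative only, not the repository code):

# Simplified stand-in for _process_speech_chunk, showing why the
# None defaults matter; `process` and `offset` are illustrative names.
def process(speech_dict, offset=0):
    relative_start_frame = None   # without these defaults, returning below
    relative_end_frame = None     # raises UnboundLocalError for one-sided dicts
    start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
    if start_frame:
        relative_start_frame = max(0, start_frame - offset)
    if end_frame:
        relative_end_frame = max(0, end_frame - offset)
    return relative_start_frame, relative_end_frame

print(process({"end": 8000}))    # (None, 8000) instead of a crash
print(process({"start": 1600}))  # (1600, None)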
transcribe/whisper_llm_serve.py CHANGED
@@ -10,7 +10,7 @@ import numpy as np
 import config
 # import wordninja
 from api_model import TransResult, Message, DebugResult
-from .server import ServeClientBase
+# from .server import ServeClientBase
 from .utils import log_block, save_to_wave, TestDataWriter
 from .translatepipes import TranslatePipes
 from .strategy import (
@@ -20,18 +20,21 @@ import csv
 logger = getLogger("TranscriptionService")
 
 
-class WhisperTranscriptionService(ServeClientBase):
+class WhisperTranscriptionService:
     """
     Whisper speech-transcription service class: handles audio-stream transcription and translation
     """
 
+    SERVER_READY = "SERVER_READY"
+    DISCONNECT = "DISCONNECT"
+
     def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
-
+
         self.source_language = language  # source language
         self.target_language = dst_lang  # target translation language
-
+        self.client_uid = client_uid
         # transcription-result stability management
-
+        self.websocket = websocket
         self._translate_pipe = pipe
 
         # audio processing
@@ -40,7 +43,6 @@ class WhisperTranscriptionService(ServeClientBase):
         self.lock = threading.Lock()
         self._frame_queue = queue.Queue()
 
-
         # text separator, set according to the language
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
@@ -54,7 +56,6 @@ class WhisperTranscriptionService(ServeClientBase):
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
 
-        #
         self._vad_processed_offset = 0
 
         # for test
@@ -100,7 +101,7 @@ class WhisperTranscriptionService(ServeClientBase):
         self.text_separator = self._get_text_separator(source_lang)
         self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)
 
-    def
+    def add_frames(self, frame_np: np.ndarray) -> None:
         """Add an audio frame to the processing queue"""
         self._frame_queue.put(frame_np)
 
@@ -226,9 +227,6 @@ class WhisperTranscriptionService(ServeClientBase):
         """Main transcription processing loop"""
         c = 0
         while not self._translate_thread_stop.is_set():
-            if self.exit:
-                logger.info("Exiting transcription thread")
-                break
 
             # wait for audio data
             if self.frames_np is None:
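Two things change in this file: WhisperTranscriptionService no longer inherits from ServeClientBase (the import is commented out), so the constants and attributes it previously took from the base class (SERVER_READY, DISCONNECT, self.websocket, self.client_uid) are now defined on the class itself; and the transcription loop drops its self.exit check, leaving the _translate_thread_stop event as the only termination signal. A minimal sketch of that single-stop-event pattern (the Worker class, the _loop body, and the sleep interval are illustrative placeholders, not repository code):

import threading
import time

class Worker:
    def __init__(self):
        self._translate_thread_stop = threading.Event()
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def _loop(self):
        # Single exit condition, as in the updated transcription loop.
        while not self._translate_thread_stop.is_set():
            time.sleep(0.1)  # stand-in for waiting on queued audio frames

    def stop(self):
        self._translate_thread_stop.set()
        self._thread.join()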