daihui.zhang committed
Commit 6696134 · 1 Parent(s): d30439a

fix vad bug
config.py CHANGED
@@ -2,7 +2,7 @@ import pathlib
 import re
 import logging
 
-DEBUG = False
+DEBUG = True
 TEST = False
 logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 
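The flip to DEBUG = True matters for the VAD change below: the new logging.debug calls only become visible if the application raises the log level when config.DEBUG is set. A minimal sketch of that wiring, assuming startup code reads config.DEBUG — this glue is not part of the commit:

import logging

import config

# Assumption: the app configures the root logger from config.DEBUG at startup.
logging.basicConfig(
    level=logging.DEBUG if config.DEBUG else logging.INFO,
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
# pywhispercpp stays at WARNING either way; config.py pins it explicitly.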
transcribe/pipelines/pipe_vad.py CHANGED
@@ -5,6 +5,8 @@ import numpy as np
 from silero_vad import get_speech_timestamps
 import torch
 from typing import List
+import logging
+
 # import noisereduce as nr
 
 def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
@@ -95,6 +97,8 @@ class VadPipe(BasePipe):
     def _process_speech_chunk(self, source_audio: np.ndarray):
         speech_dict = self.vac(source_audio, return_seconds=False)
         if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
             start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
             if start_frame:
                 relative_start_frame = max(0, (start_frame - self._offset))
@@ -107,27 +111,34 @@ class VadPipe(BasePipe):
             self.vac.reset_states()
 
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
-        speech_data = self._process_iter_chunk(source_audio)
+        speech_data = self._process_speech_chunk(source_audio)
         self._offset += len(source_audio)
         if speech_data:  # a speech change point appeared
             rel_start_frame, rel_end_frame = speech_data
             if rel_start_frame and not rel_end_frame:
                 self._status = "START"  # speech started
                 target_audio = source_audio[rel_start_frame:]
+                logging.debug("🔊 Speech started, capturing audio from frame: {}".format(rel_start_frame))
             elif not rel_start_frame and rel_end_frame:
                 self._status = "END"  # speech ended
                 target_audio = source_audio[:rel_end_frame]
+                logging.debug("🔚 Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
             elif rel_start_frame and rel_end_frame:
                 self._status = 'END'
                 target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug("🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
             else:
                 self._status = 'END'
                 target_audio = np.array([], dtype=np.float32)
+                logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
             if self._status == 'START':
                 target_audio = source_audio
+                logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
             else:  # end
                 target_audio = np.array([], dtype=np.float32)
+                self._status = 'END'
+                logging.debug("❌ No speech detected, setting status to END")
 
 
         in_data.audio = target_audio.tobytes()
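Two things in these hunks constitute the actual "vad bug" fix. First, the call site invoked self._process_iter_chunk, a name that does not exist on VadPipe, so processing raised AttributeError; it now calls _process_speech_chunk. Second, relative_start_frame and relative_end_frame were only assigned inside their respective branches, so a speech_dict carrying only an "end" marker (typical mid-stream) left relative_start_frame unbound. A stripped-down sketch of that failure mode, with the class machinery removed — the return of the frame pair is inferred from the call site, not shown in the hunk:

def process_chunk(speech_dict, offset):
    if speech_dict:
        # Without these two defaults, an "end"-only dict leaves
        # relative_start_frame unbound and the return raises UnboundLocalError.
        relative_start_frame = None
        relative_end_frame = None
        start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
        if start_frame:
            relative_start_frame = max(0, start_frame - offset)
        if end_frame:
            relative_end_frame = max(0, end_frame - offset)
        return relative_start_frame, relative_end_frame

print(process_chunk({"end": 3200}, offset=0))  # (None, 3200) after the fix

Note that the branch tests at the call site use truthiness (if rel_start_frame), so a segment that legitimately starts at relative frame 0 is routed down the not rel_start_frame paths; this commit does not touch that edge case.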
transcribe/whisper_llm_serve.py CHANGED
@@ -10,7 +10,7 @@ import numpy as np
 import config
 # import wordninja
 from api_model import TransResult, Message, DebugResult
-from .server import ServeClientBase
+# from .server import ServeClientBase
 from .utils import log_block, save_to_wave, TestDataWriter
 from .translatepipes import TranslatePipes
 from .strategy import (
@@ -20,18 +20,21 @@ import csv
 logger = getLogger("TranscriptionService")
 
 
-class WhisperTranscriptionService(ServeClientBase):
+class WhisperTranscriptionService:
     """
     Whisper speech transcription service; handles audio-stream transcription and translation
     """
 
+    SERVER_READY = "SERVER_READY"
+    DISCONNECT = "DISCONNECT"
+
     def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
-        super().__init__(client_uid, websocket)
+
         self.source_language = language  # source language
         self.target_language = dst_lang  # target translation language
-
+        self.client_uid = client_uid
         # transcription-result stability management
-
+        self.websocket = websocket
         self._translate_pipe = pipe
 
         # audio processing
@@ -40,7 +43,6 @@ class WhisperTranscriptionService(ServeClientBase):
         self.lock = threading.Lock()
         self._frame_queue = queue.Queue()
 
-
         # text separator, set according to language
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
@@ -54,7 +56,6 @@ class WhisperTranscriptionService(ServeClientBase):
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
 
-        #
         self._vad_processed_offset = 0
 
         # for test
@@ -100,7 +101,7 @@ class WhisperTranscriptionService(ServeClientBase):
         self.text_separator = self._get_text_separator(source_lang)
         self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)
 
-    def add_audio_frames(self, frame_np: np.ndarray) -> None:
+    def add_frames(self, frame_np: np.ndarray) -> None:
         """Add an audio frame to the processing queue"""
         self._frame_queue.put(frame_np)
 
@@ -226,9 +227,6 @@ class WhisperTranscriptionService(ServeClientBase):
         """Main transcription processing loop"""
         c = 0
         while not self._translate_thread_stop.is_set():
-            if self.exit:
-                logger.info("Exiting transcription thread")
-                break
 
             # wait for audio data
             if self.frames_np is None:
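With ServeClientBase commented out, the class absorbs the state it used to inherit: client_uid, websocket, and the SERVER_READY / DISCONNECT constants. The transcription loop also drops the self.exit check (a base-class attribute that no longer exists), leaving _translate_thread_stop as the sole shutdown signal, and callers must follow the rename from add_audio_frames to add_frames. A hypothetical caller sketch under those assumptions — the no-arg TranslatePipes() constructor and raw float32 PCM per websocket message are guesses, not shown in this diff:

import numpy as np

from transcribe.translatepipes import TranslatePipes
from transcribe.whisper_llm_serve import WhisperTranscriptionService

async def handle_client(websocket):
    service = WhisperTranscriptionService(
        websocket,
        TranslatePipes(),        # assumed no-arg constructor
        language="zh",
        dst_lang="en",
        client_uid="client-001",
    )
    # The readiness constant now lives on the class itself.
    await websocket.send(service.SERVER_READY)
    async for chunk in websocket:  # assumed: raw float32 PCM per message
        service.add_frames(np.frombuffer(chunk, dtype=np.float32))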