daihui.zhang committed on
Commit
7e7b241
·
1 Parent(s): 38260dc

update VAD config parameters

Browse files
transcribe/pipelines/pipe_vad.py CHANGED
@@ -17,12 +17,12 @@ class VadPipe(BasePipe):
17
  def init(cls):
18
  if cls.model is None:
19
  cls.model = SileroVADProcessor(
20
- activate_threshold=0.5,
21
- fusion_threshold=0.3,
22
- min_speech_duration=0.25,
23
- max_speech_duration=20,
24
- min_silence_duration=250,
25
- sample_rate=cls.sample_rate
26
  )
27
  cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate,)
28
  cls.vac.reset_states()
 
17
  def init(cls):
18
  if cls.model is None:
19
  cls.model = SileroVADProcessor(
20
+ activate_threshold=0.4, # 降低以捕获更多音频
21
+ fusion_threshold=0.45, # 提高以更好地融合语音片段
22
+ min_speech_duration=0.2, # 略微降低以捕获短音节
23
+ max_speech_duration=20, # 保持不变
24
+ min_silence_duration=300, # 增加到300毫秒,允许说话间的自然停顿
25
+ sample_rate=cls.sample_rate # 采样率,音频信号的采样频率
26
  )
27
  cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate,)
28
  cls.vac.reset_states()
transcribe/whisper_llm_serve.py CHANGED
@@ -146,7 +146,7 @@ class PyWhiperCppServe(ServeClientBase):
146
  return None, left_watch_string, right_watch_string, is_end_sentence
147
 
148
  def speech_to_text(self):
149
- # c = 0
150
  while not self._translate_thread_stop.is_set():
151
  if self.exit:
152
  logger.info("Exiting speech to text thread")
@@ -154,13 +154,14 @@ class PyWhiperCppServe(ServeClientBase):
154
 
155
  if self.frames_np is None :
156
  time.sleep(0.02) # wait for any audio to arrive
 
157
  continue
158
-
159
  audio_buffer = self.get_audio_chunk_for_processing()
160
- audio_duration = (len(audio_buffer) / self.sample_rate)
161
- if audio_duration<0.5 :
162
  time.sleep(0.02) # wait for any audio to arrive
163
  continue
 
164
  # c+= 1
165
  # name = f"dev-{c}.wav"
166
  # save_to_wave(name, audio_buffer)
@@ -173,8 +174,8 @@ class PyWhiperCppServe(ServeClientBase):
173
  # break
174
  # except Exception as e:
175
  # logger.error(f"{e}")
176
- if (time_delay := (1 - audio_duration)) > 0:
177
- time.sleep(time_delay)
178
 
179
  def handle_transcription_output(self, segments, audio_buffer):
180
  texts = self.text_sep.join(i.text for i in segments)
@@ -242,15 +243,18 @@ class PyWhiperCppServe(ServeClientBase):
242
 
243
  def get_audio_chunk_for_processing(self):
244
  self.vad_merge()
245
- # silence_audio = np.zeros((self.sample_rate+100,), dtype=np.float32)
246
  frames = self.frames_np.copy()
247
- #todo 如果补空白音频就会导致 幻听
248
- # if 0< len(frames) < self.sample_rate:
249
- # silence_audio[-len(frames):] = frames
250
- # return silence_audio.copy()
251
- if len(frames) > self.sample_rate:
252
- return frames.copy()
253
- return np.zeros((0,), dtype=np.float32)
 
 
 
254
 
255
  def stop(self):
256
  self._translate_thread_stop.set()
 
146
  return None, left_watch_string, right_watch_string, is_end_sentence
147
 
148
  def speech_to_text(self):
149
+ c = 0
150
  while not self._translate_thread_stop.is_set():
151
  if self.exit:
152
  logger.info("Exiting speech to text thread")
 
154
 
155
  if self.frames_np is None :
156
  time.sleep(0.02) # wait for any audio to arrive
157
+ logger.info("waiting for client data...")
158
  continue
159
+
160
  audio_buffer = self.get_audio_chunk_for_processing()
161
+ if audio_buffer is None:
 
162
  time.sleep(0.02) # wait for any audio to arrive
163
  continue
164
+
165
  # c+= 1
166
  # name = f"dev-{c}.wav"
167
  # save_to_wave(name, audio_buffer)
 
174
  # break
175
  # except Exception as e:
176
  # logger.error(f"{e}")
177
+ # if (time_delay := (1 - audio_duration)) > 0:
178
+ # time.sleep(time_delay)
179
 
180
  def handle_transcription_output(self, segments, audio_buffer):
181
  texts = self.text_sep.join(i.text for i in segments)
 
243
 
244
  def get_audio_chunk_for_processing(self):
245
  self.vad_merge()
246
+ silence_audio = np.zeros((self.sample_rate+1000,), dtype=np.float32)
247
  frames = self.frames_np.copy()
248
+ # 添加对非常短音频的处理
249
+ if len(frames) <= 100:
250
+ # 对于极短的音频段(<=100帧),直接返回空音频
251
+ self.update_audio_buffer(len(frames))
252
+ return None
253
+ elif len(frames) < self.sample_rate:
254
+ silence_audio[-len(frames):] = frames
255
+ return silence_audio.copy()
256
+ return frames.copy()
257
+
258
 
259
  def stop(self):
260
  self._translate_thread_stop.set()