daihui.zhang committed on
Commit
38260dc
·
1 Parent(s): 966f24d

fix error in get audio chunk

Browse files
transcribe/pipelines/pipe_vad.py CHANGED
@@ -41,6 +41,7 @@ class VadPipe(BasePipe):
41
  def process(self, in_data: MetaItem) -> MetaItem:
42
  source_audio = in_data.source_audio
43
  source_audio = np.frombuffer(source_audio, dtype=np.float32)
 
44
  send_audio = b""
45
  speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
46
 
 
41
  def process(self, in_data: MetaItem) -> MetaItem:
42
  source_audio = in_data.source_audio
43
  source_audio = np.frombuffer(source_audio, dtype=np.float32)
44
+
45
  send_audio = b""
46
  speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
47
 
transcribe/whisper_llm_serve.py CHANGED
@@ -152,13 +152,14 @@ class PyWhiperCppServe(ServeClientBase):
152
  logger.info("Exiting speech to text thread")
153
  break
154
 
155
- if self.frames_np is None:
156
  time.sleep(0.02) # wait for any audio to arrive
157
  continue
158
 
159
  audio_buffer = self.get_audio_chunk_for_processing()
160
- if audio_buffer.shape[0] < self.sample_rate * 2:
161
- time.sleep(0.02)
 
162
  continue
163
  # c+= 1
164
  # name = f"dev-{c}.wav"
@@ -172,6 +173,8 @@ class PyWhiperCppServe(ServeClientBase):
172
  # break
173
  # except Exception as e:
174
  # logger.error(f"{e}")
 
 
175
 
176
  def handle_transcription_output(self, segments, audio_buffer):
177
  texts = self.text_sep.join(i.text for i in segments)
@@ -239,8 +242,15 @@ class PyWhiperCppServe(ServeClientBase):
239
 
240
  def get_audio_chunk_for_processing(self):
241
  self.vad_merge()
242
-
243
- return self.frames_np.copy()
 
 
 
 
 
 
 
244
 
245
  def stop(self):
246
  self._translate_thread_stop.set()
 
152
  logger.info("Exiting speech to text thread")
153
  break
154
 
155
+ if self.frames_np is None :
156
  time.sleep(0.02) # wait for any audio to arrive
157
  continue
158
 
159
  audio_buffer = self.get_audio_chunk_for_processing()
160
+ audio_duration = (len(audio_buffer) / self.sample_rate)
161
+ if audio_duration<0.5 :
162
+ time.sleep(0.02) # wait for any audio to arrive
163
  continue
164
  # c+= 1
165
  # name = f"dev-{c}.wav"
 
173
  # break
174
  # except Exception as e:
175
  # logger.error(f"{e}")
176
+ if (time_delay := (1 - audio_duration)) > 0:
177
+ time.sleep(time_delay)
178
 
179
  def handle_transcription_output(self, segments, audio_buffer):
180
  texts = self.text_sep.join(i.text for i in segments)
 
242
 
243
  def get_audio_chunk_for_processing(self):
244
  self.vad_merge()
245
+ # silence_audio = np.zeros((self.sample_rate+100,), dtype=np.float32)
246
+ frames = self.frames_np.copy()
247
+ #todo 如果补空白音频就会导致 幻听
248
+ # if 0< len(frames) < self.sample_rate:
249
+ # silence_audio[-len(frames):] = frames
250
+ # return silence_audio.copy()
251
+ if len(frames) > self.sample_rate:
252
+ return frames.copy()
253
+ return np.zeros((0,), dtype=np.float32)
254
 
255
  def stop(self):
256
  self._translate_thread_stop.set()