david
committed on
Commit
·
d8ef700
1
Parent(s):
37262f1
add custom vad silence
Browse files
transcribe/pipelines/pipe_vad.py
CHANGED
|
@@ -24,6 +24,34 @@ def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000)
|
|
| 24 |
|
| 25 |
return torch.cat(chunks)
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
class VadPipe(BasePipe):
|
| 28 |
model = None
|
| 29 |
sample_rate = 16000
|
|
@@ -63,7 +91,7 @@ class VadPipe(BasePipe):
|
|
| 63 |
speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
|
| 64 |
|
| 65 |
if speech_timestamps:
|
| 66 |
-
send_audio =
|
| 67 |
send_audio = send_audio.numpy()
|
| 68 |
in_data.audio = send_audio
|
| 69 |
# send_audio = self.reduce_noise(send_audio).tobytes()
|
|
|
|
| 24 |
|
| 25 |
return torch.cat(chunks)
|
| 26 |
|
| 27 |
+
def collect_chunks_improved(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
    """Concatenate detected speech chunks from *wav*, separating them with short silences.

    Unlike plain concatenation, this keeps a hint of the original pauses:
    segments closer than 20 ms are merged into one, and between segments
    whose gap exceeds 100 ms a silence of half the real gap (capped at
    300 ms) is re-inserted.

    Args:
        tss: Speech timestamps as dicts with integer ``start``/``end`` sample
            indices (the format produced by silero-vad's
            ``get_speech_timestamps``). The list and its dicts are NOT
            modified by this call.
        wav: 1-D audio tensor that the sample indices refer to.
        sample_rate: Samples per second, used to convert the millisecond
            thresholds into sample counts.

    Returns:
        A 1-D tensor containing the speech chunks and inserted silences;
        an empty tensor when *tss* is empty.
    """
    if not tss:
        # torch.cat([]) raises RuntimeError; make "no speech" explicit.
        return wav.new_zeros(0)

    silent_samples = int(0.3 * sample_rate)       # max inserted silence: 300 ms
    min_gap_samples = int(0.1 * sample_rate)      # insert silence only for gaps > 100 ms
    merge_gap_samples = int(0.02 * sample_rate)   # gaps < 20 ms count as continuous speech

    # Smooth the timestamps: merge segments separated by less than 20 ms.
    # Copy each dict so the caller's timestamp list is never mutated
    # (the original implementation appended the shared dicts and then
    # rewrote their 'end' fields in place).
    smoothed_tss: List[dict] = []
    for ts in tss:
        if smoothed_tss and ts['start'] - smoothed_tss[-1]['end'] < merge_gap_samples:
            smoothed_tss[-1]['end'] = ts['end']  # extend the previous segment
        else:
            smoothed_tss.append(dict(ts))

    chunks = []
    last = len(smoothed_tss) - 1
    for i, ts in enumerate(smoothed_tss):
        # Current speech segment.
        chunks.append(wav[ts['start']:ts['end']])
        # Between segments, re-insert a shortened silence (half the real
        # gap, capped at 300 ms) when the gap is longer than the threshold.
        if i < last:
            gap = smoothed_tss[i + 1]['start'] - ts['end']
            if gap > min_gap_samples:
                # new_zeros matches wav's dtype/device so torch.cat cannot
                # fail on non-float32 input (plain torch.zeros would).
                chunks.append(wav.new_zeros(min(gap // 2, silent_samples)))

    return torch.cat(chunks)
|
| 54 |
+
|
| 55 |
class VadPipe(BasePipe):
|
| 56 |
model = None
|
| 57 |
sample_rate = 16000
|
|
|
|
| 91 |
speech_timestamps = get_speech_timestamps(torch.Tensor(source_audio), self.model.silero_vad, sampling_rate=16000)
|
| 92 |
|
| 93 |
if speech_timestamps:
|
| 94 |
+
send_audio = collect_chunks_improved(speech_timestamps, torch.Tensor(source_audio))
|
| 95 |
send_audio = send_audio.numpy()
|
| 96 |
in_data.audio = send_audio
|
| 97 |
# send_audio = self.reduce_noise(send_audio).tobytes()
|