from .base import MetaItem, BasePipe
from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator
import numpy as np
from silero_vad import get_speech_timestamps, collect_chunks
import torch
import noisereduce as nr


class VadPipe(BasePipe):
    """Pipeline stage that runs Silero voice-activity detection on audio.

    The VAD processor is created once by :meth:`init` and stored on the class,
    so every instance shares the same model.
    """

    # Shared SileroVADProcessor; stays None until init() has run.
    model = None
    # Sample rate passed to the VAD processor, the VAD calls and noise
    # reduction.
    sample_rate = 16000
    # Not referenced anywhere in this class.
    # NOTE(review): presumably the Silero frame size in samples -- confirm.
    window_size_samples = 512

    @classmethod
    def init(cls) -> None:
        """Create the shared VAD processor and VAD iterator if not yet built.

        Does nothing when ``cls.model`` is already set, so repeated calls are
        cheap. Also sets ``cls.vac`` to a ``FixedVADIterator`` wrapping the
        processor's ``silero_vad`` model and resets that iterator's state.
        """
        if cls.model is not None:
            return

        # NOTE(review): min_speech_duration (0.25) and min_silence_duration
        # (250) look like they use different units -- confirm against
        # SileroVADProcessor.
        cls.model = SileroVADProcessor(
            activate_threshold=0.5,
            fusion_threshold=0.3,
            min_speech_duration=0.25,
            max_speech_duration=20,
            min_silence_duration=250,
            sample_rate=cls.sample_rate,
        )
        cls.vac = FixedVADIterator(
            cls.model.silero_vad,
            sampling_rate=cls.sample_rate,
        )
        cls.vac.reset_states()

    def get_previous_buffer(self):
        """Return the last entry of ``self.previous_buffer`` when it holds two.

        ``previous_buffer`` is not assigned anywhere in this class; it must be
        provided elsewhere (e.g. by the base class or the caller).

        Returns:
            ``self.previous_buffer[-1]`` if the buffer has exactly two
            entries, otherwise an empty ``float32`` array.
        """
        if len(self.previous_buffer) == 2:
            return self.previous_buffer[-1]
        return np.array([], dtype=np.float32)

    def reduce_noise(self, data: np.ndarray) -> np.ndarray:
        """Run ``noisereduce.reduce_noise`` on *data* at the class sample rate."""
        return nr.reduce_noise(y=data, sr=self.sample_rate)

    def process(self, in_data: MetaItem) -> MetaItem:
        """Run VAD over ``in_data.source_audio`` and clear that field.

        ``in_data.source_audio`` is read as a raw buffer of ``float32``
        samples. :meth:`init` must have been called first, because
        ``self.model`` is dereferenced without a ``None`` check.

        Args:
            in_data: Item whose ``source_audio`` bytes are analysed.

        Returns:
            The same ``in_data`` object with ``source_audio`` set to ``b""``.
        """
        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        send_audio = b""

        # Use the class-level sample rate rather than a second hard-coded
        # 16000 so the VAD call stays in step with init() and reduce_noise().
        speech_timestamps = get_speech_timestamps(
            source_audio,
            self.model.silero_vad,
            sampling_rate=self.sample_rate,
        )

        if speech_timestamps:
            # Keep only the samples inside the detected speech regions.
            send_audio = collect_chunks(
                speech_timestamps, torch.Tensor(source_audio)
            )
            send_audio = send_audio.numpy()
            # Noise reduction is currently disabled:
            # send_audio = self.reduce_noise(send_audio).tobytes()

        # NOTE(review): send_audio is computed above but never stored on
        # in_data or returned, so the speech-only audio is discarded here.
        # Confirm which MetaItem field it is meant to populate.
        in_data.source_audio = b""
        return in_data