from .base import MetaItem, BasePipe
from ..helpers.vadprocessor import FixedVADIterator

import numpy as np
import logging

# import noisereduce as nr


class VadPipe(BasePipe):
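    """Streaming voice-activity-detection pipe.

    Feeds fixed-size audio chunks through a shared Silero ``FixedVADIterator``
    and forwards only the speech portions downstream. ``_status`` is a small
    two-state machine: 'START' while inside a speech segment, 'END' in silence.
    """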
    vac = None           # shared FixedVADIterator, created lazily by init()
    sample_rate = 16000  # expected sample rate (Hz) of incoming audio

    def __init__(self, in_queue=None, out_queue=None) -> None:
        super().__init__(in_queue, out_queue)
        self._offset = 0  # absolute frame offset of audio processed so far
        self._status = 'END'

    def reset(self):
        self._offset = 0
        self._status = 'END'

    @classmethod
    def init(cls):
        # Lazily build one shared VAD iterator for all VadPipe instances.
        if cls.vac is None:
            cls.vac = FixedVADIterator(
                threshold=0.3,
                sampling_rate=cls.sample_rate,
                min_silence_duration_ms=100,
                # speech_pad_ms=30,
                max_speech_duration_s=15,
            )
            cls.vac.reset_states()

    # def reduce_noise(self, data):
    #     return nr.reduce_noise(y=data, sr=self.sample_rate)

    def _process_speech_chunk(self, source_audio: np.ndarray):
        """Run the VAD on one chunk and translate the absolute start/end
        frame indices it reports into chunk-relative indices.

        Returns (relative_start_frame, relative_end_frame) when the VAD
        detected a boundary in this chunk, otherwise None.
        """
        speech_dict = self.vac(source_audio, return_seconds=False)
        if speech_dict:
            relative_start_frame = None
            relative_end_frame = None
            start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
            # Compare against None explicitly: frame index 0 is falsy but valid.
            if start_frame is not None:
                relative_start_frame = start_frame - self._offset
            if end_frame is not None:
                relative_end_frame = end_frame - self._offset
            return relative_start_frame, relative_end_frame

    def process(self, in_data: MetaItem) -> MetaItem:
        """Slice the incoming chunk down to its speech content and tag it
        with the current speech status ('START' or 'END')."""
        if self._offset == 0:
            self.vac.reset_states()
        source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
        speech_data = self._process_speech_chunk(source_audio)

        if speech_data:  # the VAD reported a speech boundary in this chunk
            rel_start_frame, rel_end_frame = speech_data
            if rel_start_frame is not None and rel_end_frame is None:
                # Speech started mid-chunk and is still ongoing.
                self._status = 'START'
                target_audio = source_audio[rel_start_frame:]
                logging.debug("🫸 Speech start frame: {}".format(rel_start_frame))
            elif rel_start_frame is None and rel_end_frame is not None:
                # Speech that began in an earlier chunk ended here.
                self._status = 'END'
                target_audio = source_audio[:rel_end_frame]
                logging.debug("🫷 Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
            elif rel_start_frame is not None and rel_end_frame is not None:
                # A complete speech segment fell inside this single chunk.
                self._status = 'END'
                target_audio = source_audio[rel_start_frame:rel_end_frame]
                logging.debug("🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
            else:
                # Boundary reported but with no usable frames; treat as silence.
                self._status = 'END'
                target_audio = np.array([], dtype=np.float32)
        else:
            if self._status == 'START':
                # No new boundary while inside a segment: keep passing audio through.
                target_audio = source_audio
            else:
                # Still in silence; emit an empty chunk.
                target_audio = np.array([], dtype=np.float32)

        self._offset += len(source_audio)

        in_data.audio = target_audio.tobytes()
        in_data.source_audio = b''
        in_data.speech_status = self._status
        return in_data
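

if __name__ == "__main__":
    # Minimal smoke-test sketch, not part of the pipeline. It assumes that
    # MetaItem accepts a ``source_audio`` bytes argument and that the shared
    # FixedVADIterator accepts 512-sample float32 chunks at 16 kHz; adjust to
    # the actual constructor signatures in this repo if they differ.
    VadPipe.init()
    pipe = VadPipe()
    chunk = np.zeros(512, dtype=np.float32)  # 32 ms of silence at 16 kHz
    item = pipe.process(MetaItem(source_audio=chunk.tobytes()))
    print(item.speech_status, len(item.audio))  # silence should yield: END 0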