daihui.zhang committed · fca9809
Parent(s): 9150655

add vad pipeline
transcribe/helpers/translator.py
CHANGED
```diff
@@ -1,6 +1,6 @@
 from logging import getLogger
 from llama_cpp import Llama
-import
+from functools import lru_cache
 
 logger = getLogger(__name__)
 
@@ -19,7 +19,7 @@ class QwenTranslator:
             {"role": "user", "content": prompt},
         ]
 
-
+    @lru_cache(maxsize=10)
    def translate(self, prompt, src_lang, dst_lang) -> str:
        message = self.to_message(prompt, src_lang, dst_lang)
        output = self.llm.create_chat_completion(messages=message, temperature=0)
```
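Worth noting: `functools.lru_cache` keys on the full argument tuple, `self` included, so the decorated method caches per instance, all arguments must be hashable, and the cache keeps the instance alive. A minimal sketch of the semantics the `translate` change relies on (the `Demo` class is illustrative, not repo code):

```python
# Illustrative only: lru_cache on a method uses (self, prompt, src_lang,
# dst_lang) as the cache key, so repeated identical calls skip the body.
from functools import lru_cache

class Demo:
    @lru_cache(maxsize=10)
    def translate(self, prompt, src_lang, dst_lang) -> str:
        print("computing")                 # runs only on a cache miss
        return f"[{src_lang}->{dst_lang}] {prompt}"

d = Demo()
d.translate("hello", "en", "zh")           # miss: prints "computing"
d.translate("hello", "en", "zh")           # hit: served from the cache
```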
transcribe/helpers/vadprocessor.py
CHANGED
```diff
@@ -4,10 +4,35 @@ import numpy as np
 import onnxruntime
 from datetime import timedelta
 from pydub import AudioSegment
-from silero_vad import load_silero_vad, get_speech_timestamps
+from silero_vad import load_silero_vad, get_speech_timestamps, VADIterator
 import os
 import logging
 
+class FixedVADIterator(VADIterator):
+    '''Fixes VADIterator by allowing it to process audio of any length,
+    not only exactly 512 samples at a time. If the audio processed in one
+    call is long and multiple voiced segments are detected, __call__ returns
+    the start of the first segment and the end (or middle, meaning no end yet)
+    of the last segment.
+    '''
+
+    def reset_states(self):
+        super().reset_states()
+        self.buffer = np.array([], dtype=np.float32)
+
+    def __call__(self, x, return_seconds=False):
+        self.buffer = np.append(self.buffer, x)
+        ret = None
+        while len(self.buffer) >= 512:
+            r = super().__call__(self.buffer[:512], return_seconds=return_seconds)
+            self.buffer = self.buffer[512:]
+            if ret is None:
+                ret = r
+            elif r is not None:
+                if 'end' in r:
+                    ret['end'] = r['end']  # keep the later end
+                if 'start' in r and 'end' in ret:  # there is an earlier start;
+                    # remove the end, merging this segment with the previous one
+                    del ret['end']
+        return ret if ret != {} else None
 
 class SileroVADProcessor:
     """
```
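A hypothetical usage sketch of `FixedVADIterator`, assuming the silero_vad package's `load_silero_vad` loader; the uneven chunking is deliberate, to exercise the internal buffer:

```python
# Sketch: feed float32 chunks of arbitrary length; FixedVADIterator buffers
# them and runs the underlying Silero model on exact 512-sample windows
# (the model's required frame size at 16 kHz). Illustrative, not repo code.
import numpy as np
from silero_vad import load_silero_vad
from transcribe.helpers.vadprocessor import FixedVADIterator

vad = FixedVADIterator(load_silero_vad(), sampling_rate=16000)
audio = np.zeros(16000, dtype=np.float32)   # 1 s stand-in signal
for chunk in np.array_split(audio, 7):      # chunk sizes not multiples of 512
    event = vad(chunk, return_seconds=True)
    if event:                               # e.g. {'start': 0.5} or {'end': 0.9}
        print(event)
vad.reset_states()                          # also clears the sample buffer
```

Any trailing samples shorter than 512 simply stay buffered until the next call, so no audio is dropped between chunks.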
transcribe/pipelines/base.py
CHANGED
```diff
@@ -2,6 +2,10 @@
 from dataclasses import dataclass, field
 from multiprocessing import Process, Queue
 from multiprocessing import Event
+from logging import getLogger
+
+logger = getLogger(__name__)
+
 
 @dataclass
 class Segment:
@@ -53,7 +57,9 @@ class BasePipe(Process):
         raise NotImplementedError
 
     def run(self):
+        logger.info(f"start initializing {self.__class__.__name__}")
         self.init()
+        logger.info(f"finished initializing {self.__class__.__name__}")
         self.set_ready()
         while True:
             item = self.input_queue.get()
```
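One caveat: `BasePipe` is a `multiprocessing.Process`, so whether these new `logger.info` lines actually appear depends on logging being configured in the child process (handlers are inherited under the fork start method but not under spawn). A minimal sketch, assuming each pipe should log to stderr:

```python
# Assumed helper, not repo code: call at the top of run() (or init()) so the
# child process has a handler even under the "spawn" start method.
import logging

def configure_pipe_logging(level=logging.INFO):
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(processName)s %(name)s %(levelname)s: %(message)s",
    )
```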
transcribe/pipelines/pipe_vad.py
CHANGED
```diff
@@ -1,15 +1,18 @@
 
 from .base import MetaItem, BasePipe
-from ..helpers.vadprocessor import SileroVADProcessor
+from ..helpers.vadprocessor import SileroVADProcessor, FixedVADIterator
 import numpy as np
-from silero_vad import
+from silero_vad import get_speech_timestamps, collect_chunks
 import torch
+import noisereduce as nr
+
 
 class VadPipe(BasePipe):
     model = None
-    sample_rate=16000
+    sample_rate = 16000
     window_size_samples = 512
 
+
     @classmethod
     def init(cls):
         if cls.model is None:
@@ -21,50 +24,28 @@ class VadPipe(BasePipe):
             min_silence_duration=250,
             sample_rate=cls.sample_rate
         )
+        cls.vac = FixedVADIterator(cls.model.silero_vad, sampling_rate=cls.sample_rate)
+        cls.vac.reset_states()
+
+    def get_previous_buffer(self):
+        if len(self.previous_buffer) == 2:
+            return self.previous_buffer[-1]
+        return np.array([], dtype=np.float32)
+
+    def reduce_noise(self, data):
+        return nr.reduce_noise(y=data, sr=self.sample_rate)
 
     def process(self, in_data: MetaItem) -> MetaItem:
         source_audio = in_data.source_audio
         source_audio = np.frombuffer(source_audio, dtype=np.float32)
-        speech_segments = []
-        is_speech_active = False
-        for i in range(0, len(source_audio), self.window_size_samples):
-            window = source_audio[i:i+self.window_size_samples]
-            if len(window) < self.window_size_samples:
-                padded_window = np.zeros(self.window_size_samples, dtype=np.float32)
-                padded_window[:len(window)] = window
-                window = padded_window
-            speech_dict = self.vad_iterator(window, return_seconds=False)
-            if not speech_dict:
-                continue
-            # compute the current offset
-            if speech_dict and 'start' in speech_dict and not is_speech_active:
-                is_speech_active = True
-                # current_segment_start = speech_dict['start'] + i
-            if is_speech_active:
-                speech_segments.append(window)
-            # # if the end of speech is detected
-            # if speech_dict and 'end' in speech_dict and is_speech_active:
-            #     # adjust the speech end time by the window offset
-            #     current_segment_end = min(speech_dict['end'] + i, current_segment_end)
-            #     is_speech_active = False
-            #     speech_audio = source_audio[current_segment_start: current_segment_end]
-            #     speech_segments.append(speech_audio)
-        self.vad_iterator.reset_states()
-        combied_audio = np.concatenate(speech_segments, axis=0).tobytes() if len(speech_segments) else b""
-        in_data.audio = combied_audio
+        send_audio = b""
+        speech_timestamps = get_speech_timestamps(source_audio, self.model.silero_vad, sampling_rate=16000)
+        if speech_timestamps:
+            send_audio = collect_chunks(speech_timestamps, torch.Tensor(source_audio))
+            send_audio = send_audio.numpy()
+            # send_audio = self.reduce_noise(send_audio).tobytes()
         in_data.source_audio = b""
         return in_data
```
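For orientation, the new `process()` body uses silero_vad's offline API: `get_speech_timestamps` returns sample-indexed spans and `collect_chunks` concatenates just the voiced audio. A standalone sketch, with the package's `load_silero_vad` standing in for the pipeline's `cls.model.silero_vad` and a random buffer as a placeholder signal:

```python
# Standalone sketch of the new VAD path; illustrative values, not repo code.
import numpy as np
import torch
from silero_vad import load_silero_vad, get_speech_timestamps, collect_chunks

model = load_silero_vad()
audio = (np.random.randn(32000) * 0.01).astype(np.float32)   # ~2 s stand-in

ts = get_speech_timestamps(audio, model, sampling_rate=16000)
# ts is a list of {'start': <sample>, 'end': <sample>} dicts
if ts:
    speech = collect_chunks(ts, torch.Tensor(audio))  # voiced chunks, concatenated
    payload = speech.numpy().tobytes()
else:
    payload = b""                                     # no speech detected
```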
transcribe/pipelines/pipe_whisper.py
CHANGED
```diff
@@ -7,9 +7,6 @@ from ..helpers.whisper import WhisperCPP
 class WhisperPipe(BasePipe):
     whisper = None
 
-    def __init__(self, in_queue=None, out_queue=None) -> None:
-        super().__init__(in_queue, out_queue)
-
 
     @classmethod
     def init(cls):
```
transcribe/strategy.py
CHANGED
```diff
@@ -98,7 +98,7 @@ def segement_merge(segments):
 
     for seg in segments:
         temp_seq.append(seg)
-        if any([mk in seg.text for mk in config.SENTENCE_END_MARKERS]):
+        if any([mk in seg.text for mk in config.SENTENCE_END_MARKERS + config.PAUSE_END_MARKERS]):
            sequences.append(temp_seq.copy())
            temp_seq = []
    if temp_seq:
@@ -114,18 +114,18 @@ def segments_split(segments, audio_buffer: np.ndarray, sample_rate=16000):
 
     if (len(audio_buffer) / sample_rate) < 12:
         # under 12 s, use short-pause markers such as commas as the split criterion
-        markers = config.PAUSE_END_MARKERS
+        markers = config.PAUSE_END_MARKERS + config.SENTENCE_END_MARKERS
     is_end = False
 
     for idx, seg in enumerate(segments):
         left_watch_sequences.append(seg)
         if seg.text in markers:
             seg_index = int(seg.t1 / 100 * sample_rate)
-            rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
+            # rest_buffer_duration = (len(audio_buffer) - seg_index) / sample_rate
             # is_end = any(i in seg.text for i in config.SENTENCE_END_MARKERS)
             right_watch_sequences = segments[min(idx+1, len(segments)):]
-            if rest_buffer_duration >= 1.5:
-                left_watch_idx = seg_index
+            # if rest_buffer_duration >= 1.5:
+            left_watch_idx = seg_index
             break
     return left_watch_idx, left_watch_sequences, right_watch_sequences, is_end
```
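The marker test is a plain substring scan, so widening the marker set directly changes where groups are flushed. A toy sketch of how `segement_merge` now groups on both sentence-end and pause markers (the `Segment` class and marker lists here are stand-ins for the repo's config, not its actual values):

```python
# Toy sketch: a running group is flushed whenever a segment's text contains
# any sentence-end *or* pause marker. Marker lists are illustrative.
from dataclasses import dataclass

SENTENCE_END_MARKERS = ["。", ".", "?", "!"]
PAUSE_END_MARKERS = ["、", ","]

@dataclass
class Segment:
    text: str

def segement_merge(segments):
    sequences, temp_seq = [], []
    for seg in segments:
        temp_seq.append(seg)
        if any(mk in seg.text for mk in SENTENCE_END_MARKERS + PAUSE_END_MARKERS):
            sequences.append(temp_seq.copy())
            temp_seq = []
    if temp_seq:
        sequences.append(temp_seq)
    return sequences

segs = [Segment("你好"), Segment(","), Segment("世界"), Segment("。")]
print([len(s) for s in segement_merge(segs)])  # [2, 2]: flushed at each marker
```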
transcribe/whisper_llm_serve.py
CHANGED
```diff
@@ -40,7 +40,6 @@ class PyWhiperCppServe(ServeClientBase):
         self.lock = threading.Lock()
         self.frames_np = None
         self._frame_queue = queue.Queue()
-        self._previous_frame_queue = collections.deque(maxlen=2)
         self.sample_rate = 16000
 
         self.send_ready_state()
@@ -69,24 +68,19 @@ class PyWhiperCppServe(ServeClientBase):
     def add_frames(self, frame_np):
         self._frame_queue.put(frame_np)
 
-    def
-
-
-
+    def vad_merge(self):
+        with self.lock:
+            frame = self.frames_np.copy()
+            item = translate_pipes.voice_detect(frame.tobytes())
+            if item.audio != b'':
+                frame_np = np.frombuffer(item.audio, dtype=np.float32)
+                self.frames_np = frame_np.copy()
+
 
     def get_frame_from_queue(self,):
         while True:
             try:
                 frame_np = self._frame_queue.get(timeout=0.1)
-                # frame_np = item.source_audio
-                # self._previous_frame_queue.appendleft(frame_np.copy())
-                # prev_frame_np = self.get_prev_frame()
-                # if prev_frame_np is not None:
-                #     frame_np = np.concatenate([prev_frame_np[int(-0.05 * self.sample_rate):], frame_np], axis=0)
-                # item = translate_pipes.voice_detect(frame_np.tobytes())
-                # if item.audio == b"":
-                #     continue
-                # frame_np = np.frombuffer(item.audio, dtype=np.float32)
                 with self.lock:
                     if self.frames_np is None:
                         self.frames_np = frame_np.copy()
@@ -96,7 +90,6 @@ class PyWhiperCppServe(ServeClientBase):
                 pass
 
 
-
     def update_audio_buffer(self, last_offset):
         with self.lock:
             self.frames_np = self.frames_np[last_offset:]
@@ -244,6 +237,7 @@ class PyWhiperCppServe(ServeClientBase):
     def get_audio_chunk_for_processing(self):
         if self.frames_np.shape[0] >= self.sample_rate * 1:
             return self.frames_np.copy()
+        self.vad_merge()
         # compute how many samples of padding are needed
         padding_length = self.sample_rate * 1 - len(self.frames_np)
         # create silence padding (zeros)
```
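The step after `vad_merge()` is the zero-padding path the comments describe: buffers shorter than one second are padded with silence up to `sample_rate` samples before transcription. A sketch of that arithmetic, mirroring the commented steps in `get_audio_chunk_for_processing` (standalone values, not repo code):

```python
# Sketch of pad-to-one-second: if less than sample_rate samples are buffered,
# append zeros (silence) until exactly one second of audio is available.
import numpy as np

sample_rate = 16000
frames_np = np.random.randn(9000).astype(np.float32)   # < 1 s of audio

padding_length = sample_rate * 1 - len(frames_np)      # samples of silence needed
padded = np.concatenate(
    [frames_np, np.zeros(padding_length, dtype=np.float32)]
)
assert padded.shape[0] == sample_rate                  # exactly 1 s
```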