daihui.zhang committed
Commit 6696134
1 Parent(s): d30439a

fix vad bug

Files changed:
- config.py +1 -1
- transcribe/pipelines/pipe_vad.py +12 -1
- transcribe/whisper_llm_serve.py +9 -11
config.py CHANGED
@@ -2,7 +2,7 @@ import pathlib
 import re
 import logging
 
-DEBUG =
+DEBUG = True
 TEST = False
 logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
 
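This hunk only flips the flag; nothing in the diff shows how DEBUG is consumed. As a hedged illustration (the basicConfig wiring below is an assumption about typical usage, not code from this repository), such a flag is commonly used to raise the root logging level so that debug-level calls, like the ones added in pipe_vad.py below, actually emit output:

import logging
import config

# Hypothetical wiring, assuming config.DEBUG gates the root log level.
# The pywhispercpp logger stays at WARNING because config.py pins it.
logging.basicConfig(level=logging.DEBUG if config.DEBUG else logging.INFO)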
transcribe/pipelines/pipe_vad.py CHANGED
@@ -5,6 +5,8 @@ import numpy as np
 from silero_vad import get_speech_timestamps
 import torch
 from typing import List
+import logging
+
 # import noisereduce as nr
 
 def collect_chunks(tss: List[dict], wav: torch.Tensor, sample_rate: int = 16000):
@@ -95,6 +97,8 @@ class VadPipe(BasePipe):
     def _process_speech_chunk(self, source_audio:np.ndarray):
         speech_dict = self.vac(source_audio, return_seconds=False)
         if speech_dict:
+            relative_start_frame = None
+            relative_end_frame = None
             start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
             if start_frame:
                 relative_start_frame = max(0, (start_frame - self._offset))
@@ -107,27 +111,34 @@ class VadPipe(BasePipe):
         self.vac.reset_states()
 
         source_audio = np.frombuffer(in_data.source_audio, dtype=np.float32)
-        speech_data = self.
+        speech_data = self._process_speech_chunk(source_audio)
         self._offset += len(source_audio)
         if speech_data:  # a speech change point appeared in the audio
             rel_start_frame, rel_end_frame = speech_data
             if rel_start_frame and not rel_end_frame:
                 self._status = "START"  # speech started
                 target_audio = source_audio[rel_start_frame:]
+                logging.debug("🔊 Speech started, capturing audio from frame: {}".format(rel_start_frame))
             elif not rel_start_frame and rel_end_frame:
                 self._status = "END"  # speech ended
                 target_audio = source_audio[:rel_end_frame]
+                logging.debug("🔚 Speech ended, capturing audio up to frame: {}".format(rel_end_frame))
             elif rel_start_frame and rel_end_frame:
                 self._status = 'END'
                 target_audio = source_audio[rel_start_frame:rel_end_frame]
+                logging.debug("🔄 Speech segment captured from frame {} to frame {}".format(rel_start_frame, rel_end_frame))
             else:
                 self._status = 'END'
                 target_audio = np.array([],dtype=np.float32)
+                logging.debug("❌ No valid speech segment detected, setting status to END")
         else:
             if self._status == 'START':
                 target_audio = source_audio
+                logging.debug("🔊 Continuing to capture audio as speech is still ongoing")
             else:  # end
                 target_audio = np.array([],dtype=np.float32)
+                self._status = 'END'
+                logging.debug("❌ No speech detected, setting status to END")
 
 
         in_data.audio = target_audio.tobytes()
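The substantive fix is the pair of None initializations in _process_speech_chunk: relative_start_frame and relative_end_frame were previously assigned only inside their respective branches, so a Silero VAD result containing just an "end" key (or just a "start" key) could leave one of them unbound when the method returned. The other behavioral change is in the no-speech branch of the run loop, which now explicitly resets the status to 'END'. A minimal reproduction of the unbound-variable failure mode, using a simplified stand-in for the method (illustrative only, not the repository code):

# Simplified stand-in for _process_speech_chunk, showing why the
# None defaults matter; `process` and `offset` are illustrative names.
def process(speech_dict, offset=0):
    relative_start_frame = None   # without these defaults, returning below
    relative_end_frame = None     # raises UnboundLocalError for one-sided dicts
    start_frame, end_frame = speech_dict.get("start"), speech_dict.get("end")
    if start_frame:
        relative_start_frame = max(0, start_frame - offset)
    if end_frame:
        relative_end_frame = max(0, end_frame - offset)
    return relative_start_frame, relative_end_frame

print(process({"end": 8000}))    # (None, 8000) instead of a crash
print(process({"start": 1600}))  # (1600, None)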
transcribe/whisper_llm_serve.py CHANGED
@@ -10,7 +10,7 @@ import numpy as np
 import config
 # import wordninja
 from api_model import TransResult, Message, DebugResult
-from .server import ServeClientBase
+# from .server import ServeClientBase
 from .utils import log_block, save_to_wave, TestDataWriter
 from .translatepipes import TranslatePipes
 from .strategy import (
@@ -20,18 +20,21 @@ import csv
 logger = getLogger("TranscriptionService")
 
 
-class WhisperTranscriptionService(ServeClientBase):
+class WhisperTranscriptionService:
     """
     Whisper speech-transcription service class: handles audio-stream transcription and translation
     """
 
+    SERVER_READY = "SERVER_READY"
+    DISCONNECT = "DISCONNECT"
+
     def __init__(self, websocket, pipe: TranslatePipes, language=None, dst_lang=None, client_uid=None):
-
+
         self.source_language = language  # source language
         self.target_language = dst_lang  # target translation language
-
+        self.client_uid = client_uid
         # transcription-result stability management
-
+        self.websocket = websocket
         self._translate_pipe = pipe
 
         # audio processing
@@ -40,7 +43,6 @@ class WhisperTranscriptionService(ServeClientBase):
         self.lock = threading.Lock()
         self._frame_queue = queue.Queue()
 
-
         # text separator, set according to the language
         self.text_separator = self._get_text_separator(language)
         self.loop = asyncio.get_event_loop()
@@ -54,7 +56,6 @@ class WhisperTranscriptionService(ServeClientBase):
         self.translate_thread = self._start_thread(self._transcription_processing_loop)
         self.frame_processing_thread = self._start_thread(self._frame_processing_loop)
 
-        #
         self._vad_processed_offset = 0
 
         # for test
@@ -100,7 +101,7 @@ class WhisperTranscriptionService(ServeClientBase):
         self.text_separator = self._get_text_separator(source_lang)
         self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)
 
-    def
+    def add_frames(self, frame_np: np.ndarray) -> None:
         """Add an audio frame to the processing queue"""
         self._frame_queue.put(frame_np)
 
@@ -226,9 +227,6 @@ class WhisperTranscriptionService(ServeClientBase):
         """Main transcription processing loop"""
         c = 0
         while not self._translate_thread_stop.is_set():
-            if self.exit:
-                logger.info("Exiting transcription thread")
-                break
 
             # wait for audio data
             if self.frames_np is None:
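Two things change in this file: WhisperTranscriptionService no longer inherits from ServeClientBase (the import is commented out), so the constants and attributes it previously took from the base class (SERVER_READY, DISCONNECT, self.websocket, self.client_uid) are now defined on the class itself; and the transcription loop drops its self.exit check, leaving the _translate_thread_stop event as the only termination signal. A minimal sketch of that single-stop-event pattern (the Worker class, the _loop body, and the sleep interval are illustrative placeholders, not repository code):

import threading
import time

class Worker:
    def __init__(self):
        self._translate_thread_stop = threading.Event()
        self._thread = threading.Thread(target=self._loop, daemon=True)
        self._thread.start()

    def _loop(self):
        # Single exit condition, as in the updated transcription loop.
        while not self._translate_thread_stop.is_set():
            time.sleep(0.1)  # stand-in for waiting on queued audio frames

    def stop(self):
        self._translate_thread_stop.set()
        self._thread.join()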