Commit 730ea7e · daihui.zhang
Parent(s): bf10488

fix timestamp error

Files changed:
- config.py +15 -5
- main.py +2 -2
- tests/test_whisper_cpp.py +30 -0
- transcribe/helpers/whisper.py +1 -0
- transcribe/strategy.py +2 -1
- transcribe/utils.py +13 -2
- transcribe/whisper_llm_serve.py +9 -6
config.py
CHANGED
@@ -3,15 +3,23 @@ import re
 import logging
 
 DEBUG = True
+TEST = True
+logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
+
+
 logging.basicConfig(
     level=logging.DEBUG if DEBUG else logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
+    filename='translator.log',
     datefmt="%H:%M:%S"
 )
 
-
-
-
+# Add terminal log
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.DEBUG if DEBUG else logging.INFO)
+console_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+console_handler.setFormatter(console_formatter)
+logging.getLogger().addHandler(console_handler)
 
 
 BASE_DIR = pathlib.Path(__file__).parent

@@ -40,8 +48,8 @@ MAX_LENTH_ZH = 4
 WHISPER_PROMPT_EN = ""# "The following is an English sentence."
 MAX_LENGTH_EN= 1
 
-WHISPER_MODEL = 'medium-q5_0'
-
+# WHISPER_MODEL = 'medium-q5_0'
+WHISPER_MODEL = 'large-v3-turbo-q5_0'
 
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()

@@ -71,3 +79,5 @@ LLM_SYS_PROMPT_ZH = """
 LLM_SYS_PROMPT_EN = """
 你是一个英中文翻译专家,将用户输入的英文翻译成中文,用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合英文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将英文翻译成具有信达雅标准的中文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。
 """
+
+
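Note on the logging changes: `logging.getLogger("pywhispercpp").setLevel(logging.WARNING)` only raises the threshold for that library's logger, while `filename='translator.log'` points the root handler at the log file and the extra `StreamHandler` keeps output visible in the terminal. A minimal standalone sketch of the silencing effect (illustrative only, not repo code):

    import logging

    logging.basicConfig(level=logging.DEBUG, filename='translator.log')
    logging.getLogger("pywhispercpp").setLevel(logging.WARNING)

    logging.getLogger("pywhispercpp").debug("suppressed: below WARNING for this logger")
    logging.getLogger(__name__).debug("kept: the application logger still emits DEBUG")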
main.py
CHANGED
@@ -10,7 +10,7 @@ from multiprocessing import Process, freeze_support
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import RedirectResponse
 import os
-
+from transcribe.utils import pcm_bytes_to_np_array
 logger = getLogger(__name__)
 
 

@@ -27,7 +27,7 @@ async def get_audio_from_websocket(websocket)->np.array:
     frame_data = await websocket.receive_bytes()
     if frame_data == b"END_OF_AUDIO":
         return False
-    return
+    return pcm_bytes_to_np_array(frame_data)
 
 
 @asynccontextmanager
tests/test_whisper_cpp.py
ADDED
@@ -0,0 +1,30 @@
+from pywhispercpp.model import Model
+import config
+import soundfile
+from pywhispercpp.utils import to_timestamp
+
+mel, _, = soundfile.read("/Users/david/Samples/Audio/en/sample-10.wav")
+# mel, _, = soundfile.read(f"{config.ASSERT_DIR}/jfk.flac")
+
+models_dir = config.MODEL_DIR.as_posix()
+model = Model(
+    model=config.WHISPER_MODEL,
+    models_dir=models_dir,
+    n_threads=4,
+    print_realtime=False,
+    print_progress=False,
+    print_timestamps=False,
+    translate=False,
+    temperature=0.,
+    no_context=True
+)
+print(mel.shape, mel.dtype)  # (160000,) float64
+segments = model.transcribe(mel[:, 0],
+                            # initial_prompt="",# 'The following is an English sentence.', # "以下是简体中文句子。"
+                            language='en',
+                            # initial_prompt="以下是简体中文句子。",
+                            # language='zh',
+                            token_timestamps=True,
+                            max_len=1,)
+for segment in segments:
+    print(to_timestamp(segment.t0), to_timestamp(segment.t1), segment.text)
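Note on the timestamps: whisper.cpp reports `segment.t0`/`segment.t1` in 10 ms ticks, and `pywhispercpp.utils.to_timestamp` is what formats them in the test above. A self-contained sketch of the same conversion, assuming the 10 ms unit (hypothetical helper, not part of the repo):

    def ticks_to_timestamp(ticks: int) -> str:
        """Format a whisper.cpp time value (10 ms ticks) as HH:MM:SS.mmm."""
        ms = ticks * 10
        hours, ms = divmod(ms, 3_600_000)
        minutes, ms = divmod(ms, 60_000)
        seconds, ms = divmod(ms, 1_000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"

    print(ticks_to_timestamp(150))  # 00:00:01.500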
transcribe/helpers/whisper.py
CHANGED
@@ -48,6 +48,7 @@ class WhisperCPP:
             initial_prompt=prompt,
             language=language,
             token_timestamps=True,
+            # split_on_word=True,
             max_len=max_len
         )
         return output
transcribe/strategy.py
CHANGED
@@ -309,7 +309,8 @@ class TranscriptStabilityAnalyzer:
         logger.debug(f"Current separator: {self._separator}")
 
     def merge_chunks(self, chunks: List[TranscriptChunk])->str:
-
+        output = list(r.join() for r in chunks if r)
+        return output
 
 
 
transcribe/utils.py
CHANGED
@@ -93,8 +93,19 @@ def resample(file: str, sr: int = 16000):
 
 
 def save_to_wave(filename, data:np.ndarray, sample_rate=16000):
-    write(filename, sample_rate, data)
-
+    write(filename, sample_rate, data.astype(np.int16))
+
+
+def pcm_bytes_to_np_array(pcm_bytes: bytes, dtype=np.float32, channels=1):
+    # 1. Convert to a numpy int16 array (each sample is 2 bytes).
+    audio_np = np.frombuffer(pcm_bytes, dtype=np.int16)
+    audio_np = audio_np.astype(dtype=dtype)
+    if dtype == np.float32:
+        audio_np /= 32768.0
+    # 2. For multi-channel audio (e.g. 2-channel stereo), reshape accordingly.
+    if channels > 1:
+        audio_np = audio_np.reshape(-1, channels)
+    return audio_np
 
 class TestDataWriter:
     def __init__(self, file_path='test_data.csv'):
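Note on the new helper: `pcm_bytes_to_np_array` assumes little-endian signed 16-bit PCM input, so the default float32 output lands in roughly [-1.0, 1.0). A quick usage sketch with synthetic samples (illustrative values, not capture data):

    import numpy as np
    from transcribe.utils import pcm_bytes_to_np_array

    # Two int16 samples: full-scale negative and half-scale positive.
    pcm = np.array([-32768, 16384], dtype=np.int16).tobytes()

    audio = pcm_bytes_to_np_array(pcm)                # float32, scaled by 1/32768
    print(audio.dtype, audio)                         # float32 [-1.   0.5]

    raw = pcm_bytes_to_np_array(pcm, dtype=np.int16)  # keep the raw int16 samples
    print(raw)                                        # [-32768  16384]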
transcribe/whisper_llm_serve.py
CHANGED
@@ -124,6 +124,7 @@ class WhisperTranscriptionService(ServeClientBase):
         frame = self.frames_np.copy()
         processed_audio = self._translate_pipe.voice_detect(frame.tobytes())
         self.frames_np = np.frombuffer(processed_audio.audio, dtype=np.float32).copy()
+        return self.frames_np.copy()
         # if len(frame) > self.sample_rate:
         #     save_to_wave(f"{self._c}-org.wav", frame)
         #     save_to_wave(f"{self._c}-vad.wav", self.frames_np)

@@ -136,20 +137,21 @@ class WhisperTranscriptionService(ServeClientBase):
         # before = self.frames_np.copy()
         self.frames_np = self.frames_np[offset:]
         # after = self.frames_np.copy()
-        # save_to_wave(f"./tests/{self._c}_before_cut.wav", before)
+        # save_to_wave(f"./tests/{self._c}_before_cut_{offset}.wav", before)
+        # save_to_wave(f"./tests/{self._c}_cut.wav", before[:offset])
         # save_to_wave(f"./tests/{self._c}_after_cut.wav", after)
 
 
     def _get_audio_for_processing(self) -> Optional[np.ndarray]:
         """Prepare an audio chunk for processing."""
         # Apply VAD processing
-        self._apply_voice_activity_detection()
-
+        frame_np = self._apply_voice_activity_detection()
+        # frame_np = self.frames_np.copy()
         # No audio frames
-        if self.frames_np is None:
+        if frame_np is None:
             return None
 
-        frames = self.frames_np.copy()
+        frames = frame_np.copy()
 
         # Handling when the audio is too short
         if len(frames) <= 10:

@@ -297,7 +299,8 @@ class WhisperTranscriptionService(ServeClientBase):
         try:
             message = Message(result=result, request_id=self.client_uid).model_dump_json(by_alias=True)
             coro = self.websocket.send_text(message)
-            asyncio.run_coroutine_threadsafe(coro, self.loop)
+            future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+            future.add_done_callback(lambda fut: fut.exception() and self.stop())
         except RuntimeError:
             self.stop()
         except Exception as e:
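Note on the send path: `asyncio.run_coroutine_threadsafe` returns a `concurrent.futures.Future`, so the added `add_done_callback` fires once the coroutine has finished on the loop, and `fut.exception() and self.stop()` stops the client only when `send_text` actually raised. A minimal sketch of the same pattern, with a hypothetical `send()` coroutine standing in for the websocket call:

    import asyncio
    import threading

    async def send(msg: str) -> None:
        # Stand-in for websocket.send_text(); raises to exercise the error path.
        raise RuntimeError(f"connection lost while sending {msg!r}")

    def on_done(fut):
        # Invoked once the coroutine has finished (normally on the loop thread).
        if fut.exception() is not None:
            print("send failed:", fut.exception())  # the service would call self.stop() here

    loop = asyncio.new_event_loop()
    threading.Thread(target=loop.run_forever, daemon=True).start()

    # Submit from a worker thread, as the transcription thread does.
    future = asyncio.run_coroutine_threadsafe(send("hello"), loop)
    future.add_done_callback(on_done)

    print(future.exception(timeout=1))  # block until done; returns the RuntimeError
    loop.call_soon_threadsafe(loop.stop)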