Commit 730ea7e · daihui.zhang
Parent(s): bf10488

fix timestamp error

Files changed:
- config.py +15 -5
- main.py +2 -2
- tests/test_whisper_cpp.py +30 -0
- transcribe/helpers/whisper.py +1 -0
- transcribe/strategy.py +2 -1
- transcribe/utils.py +13 -2
- transcribe/whisper_llm_serve.py +9 -6
config.py
CHANGED
@@ -3,15 +3,23 @@ import re
 import logging
 
 DEBUG = True
+TEST = True
+logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
+
+
 logging.basicConfig(
     level=logging.DEBUG if DEBUG else logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
+    filename='translator.log',
     datefmt="%H:%M:%S"
 )
 
-
-
-
+# Add terminal log
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.DEBUG if DEBUG else logging.INFO)
+console_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+console_handler.setFormatter(console_formatter)
+logging.getLogger().addHandler(console_handler)
 
 
 BASE_DIR = pathlib.Path(__file__).parent

@@ -40,8 +48,8 @@ MAX_LENTH_ZH = 4
 WHISPER_PROMPT_EN = ""# "The following is an English sentence."
 MAX_LENGTH_EN= 1
 
-WHISPER_MODEL = 'medium-q5_0'
-
+# WHISPER_MODEL = 'medium-q5_0'
+WHISPER_MODEL = 'large-v3-turbo-q5_0'
 
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()

@@ -71,3 +79,5 @@ LLM_SYS_PROMPT_ZH = """
 LLM_SYS_PROMPT_EN = """
 你是一个英中文翻译专家,将用户输入的英文翻译成中文,用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合英文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将英文翻译成具有信达雅标准的中文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。
 """
+
+
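Note on the logging changes: `logging.getLogger("pywhispercpp").setLevel(logging.WARNING)` only raises the threshold for that library's logger, while `filename='translator.log'` points the root handler at the log file and the extra `StreamHandler` keeps output visible in the terminal. A minimal standalone sketch of the silencing effect (illustrative only, not repo code):

    import logging

    logging.basicConfig(level=logging.DEBUG, filename='translator.log')
    logging.getLogger("pywhispercpp").setLevel(logging.WARNING)

    logging.getLogger("pywhispercpp").debug("suppressed: below WARNING for this logger")
    logging.getLogger(__name__).debug("kept: the application logger still emits DEBUG")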
main.py
CHANGED
@@ -10,7 +10,7 @@ from multiprocessing import Process, freeze_support
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import RedirectResponse
 import os
-
+from transcribe.utils import pcm_bytes_to_np_array
 logger = getLogger(__name__)
 
 

@@ -27,7 +27,7 @@ async def get_audio_from_websocket(websocket)->np.array:
     frame_data = await websocket.receive_bytes()
     if frame_data == b"END_OF_AUDIO":
         return False
-    return
+    return pcm_bytes_to_np_array(frame_data)
 
 
 @asynccontextmanager
tests/test_whisper_cpp.py
ADDED
@@ -0,0 +1,30 @@
+from pywhispercpp.model import Model
+import config
+import soundfile
+from pywhispercpp.utils import to_timestamp
+
+mel, _, = soundfile.read("/Users/david/Samples/Audio/en/sample-10.wav")
+# mel, _, = soundfile.read(f"{config.ASSERT_DIR}/jfk.flac")
+
+models_dir = config.MODEL_DIR.as_posix()
+model = Model(
+    model=config.WHISPER_MODEL,
+    models_dir=models_dir,
+    n_threads=4,
+    print_realtime=False,
+    print_progress=False,
+    print_timestamps=False,
+    translate=False,
+    temperature=0.,
+    no_context=True
+)
+print(mel.shape, mel.dtype)  # (160000,) float64
+segments = model.transcribe(mel[:, 0],
+                            # initial_prompt="",# 'The following is an English sentence.', # "以下是简体中文句子。"
+                            language='en',
+                            # initial_prompt="以下是简体中文句子。",
+                            # language='zh',
+                            token_timestamps=True,
+                            max_len=1,)
+for segment in segments:
+    print(to_timestamp(segment.t0), to_timestamp(segment.t1), segment.text)
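Note on the timestamps: whisper.cpp reports `segment.t0`/`segment.t1` in 10 ms ticks, and `pywhispercpp.utils.to_timestamp` is what formats them in the test above. A self-contained sketch of the same conversion, assuming the 10 ms unit (hypothetical helper, not part of the repo):

    def ticks_to_timestamp(ticks: int) -> str:
        """Format a whisper.cpp time value (10 ms ticks) as HH:MM:SS.mmm."""
        ms = ticks * 10
        hours, ms = divmod(ms, 3_600_000)
        minutes, ms = divmod(ms, 60_000)
        seconds, ms = divmod(ms, 1_000)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}.{ms:03d}"

    print(ticks_to_timestamp(150))  # 00:00:01.500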
transcribe/helpers/whisper.py
CHANGED
@@ -48,6 +48,7 @@ class WhisperCPP:
             initial_prompt=prompt,
             language=language,
             token_timestamps=True,
+            # split_on_word=True,
             max_len=max_len
         )
         return output
transcribe/strategy.py
CHANGED
@@ -309,7 +309,8 @@ class TranscriptStabilityAnalyzer:
         logger.debug(f"Current separator: {self._separator}")
 
     def merge_chunks(self, chunks: List[TranscriptChunk])->str:
-
+        output = list(r.join() for r in chunks if r)
+        return output
 
 
 
transcribe/utils.py
CHANGED
@@ -93,8 +93,19 @@ def resample(file: str, sr: int = 16000):
 
 
 def save_to_wave(filename, data:np.ndarray, sample_rate=16000):
-    write(filename, sample_rate, data)
-
+    write(filename, sample_rate, data.astype(np.int16))
+
+
+def pcm_bytes_to_np_array(pcm_bytes: bytes, dtype=np.float32, channels=1):
+    # 1. Convert to a numpy int16 array (each sample is 2 bytes).
+    audio_np = np.frombuffer(pcm_bytes, dtype=np.int16)
+    audio_np = audio_np.astype(dtype=dtype)
+    if dtype == np.float32:
+        audio_np /= 32768.0
+    # 2. For multi-channel audio (e.g. 2-channel stereo), reshape accordingly.
+    if channels > 1:
+        audio_np = audio_np.reshape(-1, channels)
+    return audio_np
 
 class TestDataWriter:
     def __init__(self, file_path='test_data.csv'):
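Note on the new helper: `pcm_bytes_to_np_array` assumes little-endian signed 16-bit PCM input, so the default float32 output lands in roughly [-1.0, 1.0). A quick usage sketch with synthetic samples (illustrative values, not capture data):

    import numpy as np
    from transcribe.utils import pcm_bytes_to_np_array

    # Two int16 samples: full-scale negative and half-scale positive.
    pcm = np.array([-32768, 16384], dtype=np.int16).tobytes()

    audio = pcm_bytes_to_np_array(pcm)                # float32, scaled by 1/32768
    print(audio.dtype, audio)                         # float32 [-1.   0.5]

    raw = pcm_bytes_to_np_array(pcm, dtype=np.int16)  # keep the raw int16 samples
    print(raw)                                        # [-32768  16384]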
transcribe/whisper_llm_serve.py
CHANGED
@@ -124,6 +124,7 @@ class WhisperTranscriptionService(ServeClientBase):
         frame = self.frames_np.copy()
         processed_audio = self._translate_pipe.voice_detect(frame.tobytes())
         self.frames_np = np.frombuffer(processed_audio.audio, dtype=np.float32).copy()
+        return self.frames_np.copy()
         # if len(frame) > self.sample_rate:
         #     save_to_wave(f"{self._c}-org.wav", frame)
         #     save_to_wave(f"{self._c}-vad.wav", self.frames_np)

@@ -136,20 +137,21 @@ class WhisperTranscriptionService(ServeClientBase):
         # before = self.frames_np.copy()
         self.frames_np = self.frames_np[offset:]
         # after = self.frames_np.copy()
-        # save_to_wave(f"./tests/{self._c}_before_cut.wav", before)
+        # save_to_wave(f"./tests/{self._c}_before_cut_{offset}.wav", before)
+        # save_to_wave(f"./tests/{self._c}_cut.wav", before[:offset])
         # save_to_wave(f"./tests/{self._c}_after_cut.wav", after)
 
 
     def _get_audio_for_processing(self) -> Optional[np.ndarray]:
         """Prepare an audio chunk for processing."""
         # Apply VAD processing
-        self._apply_voice_activity_detection()
-
+        frame_np = self._apply_voice_activity_detection()
+        # frame_np = self.frames_np.copy()
         # No audio frames
-        if self.frames_np is None:
+        if frame_np is None:
             return None
 
-        frames = self.frames_np.copy()
+        frames = frame_np.copy()
 
         # Handling when the audio is too short
         if len(frames) <= 10:

@@ -297,7 +299,8 @@ class WhisperTranscriptionService(ServeClientBase):
         try:
             message = Message(result=result, request_id=self.client_uid).model_dump_json(by_alias=True)
             coro = self.websocket.send_text(message)
-            asyncio.run_coroutine_threadsafe(coro, self.loop)
+            future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+            future.add_done_callback(lambda fut: fut.exception() and self.stop())
         except RuntimeError:
             self.stop()
         except Exception as e:
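Note on the send path: `asyncio.run_coroutine_threadsafe` returns a `concurrent.futures.Future`, so the added `add_done_callback` fires once the coroutine has finished on the loop, and `fut.exception() and self.stop()` stops the client only when `send_text` actually raised. A minimal sketch of the same pattern, with a hypothetical `send()` coroutine standing in for the websocket call:

    import asyncio
    import threading

    async def send(msg: str) -> None:
        # Stand-in for websocket.send_text(); raises to exercise the error path.
        raise RuntimeError(f"connection lost while sending {msg!r}")

    def on_done(fut):
        # Invoked once the coroutine has finished (normally on the loop thread).
        if fut.exception() is not None:
            print("send failed:", fut.exception())  # the service would call self.stop() here

    loop = asyncio.new_event_loop()
    threading.Thread(target=loop.run_forever, daemon=True).start()

    # Submit from a worker thread, as the transcription thread does.
    future = asyncio.run_coroutine_threadsafe(send("hello"), loop)
    future.add_done_callback(on_done)

    print(future.exception(timeout=1))  # block until done; returns the RuntimeError
    loop.call_soon_threadsafe(loop.stop)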