Commit 730ea7e by daihui.zhang
Parent(s): bf10488

fix timestamp error
config.py CHANGED
@@ -3,15 +3,23 @@ import re
 import logging
 
 DEBUG = True
+TEST = True
+logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
+
+
 logging.basicConfig(
     level=logging.DEBUG if DEBUG else logging.INFO,
     format="%(asctime)s - %(levelname)s - %(message)s",
+    filename='translator.log',
     datefmt="%H:%M:%S"
 )
 
-TEST = True
-
-logging.getLogger("pywhispercpp").setLevel(logging.WARNING)
+# Add terminal log
+console_handler = logging.StreamHandler()
+console_handler.setLevel(logging.DEBUG if DEBUG else logging.INFO)
+console_formatter = logging.Formatter("%(asctime)s - %(levelname)s - %(message)s")
+console_handler.setFormatter(console_formatter)
+logging.getLogger().addHandler(console_handler)
 
 
 BASE_DIR = pathlib.Path(__file__).parent
@@ -40,8 +48,8 @@ MAX_LENTH_ZH = 4
 WHISPER_PROMPT_EN = ""# "The following is an English sentence."
 MAX_LENGTH_EN= 1
 
-WHISPER_MODEL = 'medium-q5_0'
-# WHISPER_MODEL = 'large-v3-turbo-q5_0'
+# WHISPER_MODEL = 'medium-q5_0'
+WHISPER_MODEL = 'large-v3-turbo-q5_0'
 
 # LLM
 LLM_MODEL_PATH = (MODEL_DIR / "qwen2.5-1.5b-instruct-q5_0.gguf").as_posix()
@@ -71,3 +79,5 @@ LLM_SYS_PROMPT_ZH = """
 LLM_SYS_PROMPT_EN = """
 你是一个英中文翻译专家,将用户输入的英文翻译成中文,用户可以向助手发送需要翻译的内容,助手会回答相应的翻译结果,并确保符合英文语言习惯,你可以调整语气和风格,并考虑到某些词语的文化内涵和地区差异。同时作为翻译家,需将英文翻译成具有信达雅标准的中文。"信" 即忠实于原文的内容与意图;"达" 意味着译文应通顺易懂,表达清晰;"雅" 则追求译文的文化审美和语言的优美。目标是创作出既忠于原作精神,又符合目标语言文化和读者审美的翻译。
 """
+
+
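A note on the logging change: when filename is given, logging.basicConfig attaches only a FileHandler to the root logger, so console output stops unless a StreamHandler is added by hand, which is what the new lines do. A minimal standalone sketch of the resulting file-plus-console pattern, reusing the commit's format strings:

    import logging

    DEBUG = True

    # With filename set, basicConfig installs a FileHandler only;
    # nothing reaches the terminal by default.
    logging.basicConfig(
        level=logging.DEBUG if DEBUG else logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        filename="translator.log",
        datefmt="%H:%M:%S",
    )

    # Mirror the same records to the terminal with a second handler.
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG if DEBUG else logging.INFO)
    console.setFormatter(logging.Formatter("%(asctime)s - %(levelname)s - %(message)s"))
    logging.getLogger().addHandler(console)

    logging.info("written to translator.log and echoed to the terminal")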
main.py CHANGED
@@ -10,7 +10,7 @@ from multiprocessing import Process, freeze_support
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import RedirectResponse
 import os
-
+from transcribe.utils import pcm_bytes_to_np_array
 logger = getLogger(__name__)
 
 
@@ -27,7 +27,7 @@ async def get_audio_from_websocket(websocket)->np.array:
     frame_data = await websocket.receive_bytes()
     if frame_data == b"END_OF_AUDIO":
         return False
-    return np.frombuffer(frame_data, dtype=np.int16).astype(np.float32) / 32768.0
+    return pcm_bytes_to_np_array(frame_data)
 
 
 @asynccontextmanager
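The refactor is behavior-preserving: with its defaults (dtype=np.float32, channels=1), pcm_bytes_to_np_array computes exactly the expression it replaces. A quick sanity check with a made-up int16 payload:

    import numpy as np
    from transcribe.utils import pcm_bytes_to_np_array

    # Hypothetical PCM frame, as a client would send over the WebSocket.
    frame_data = np.array([0, 16384, -32768, 32767], dtype=np.int16).tobytes()

    old = np.frombuffer(frame_data, dtype=np.int16).astype(np.float32) / 32768.0
    new = pcm_bytes_to_np_array(frame_data)
    assert np.array_equal(old, new)  # identical float32 samples in [-1, 1)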
tests/test_whisper_cpp.py ADDED
@@ -0,0 +1,30 @@
+from pywhispercpp.model import Model
+import config
+import soundfile
+from pywhispercpp.utils import to_timestamp
+
+mel, _, = soundfile.read("/Users/david/Samples/Audio/en/sample-10.wav")
+# mel, _, = soundfile.read(f"{config.ASSERT_DIR}/jfk.flac")
+
+models_dir = config.MODEL_DIR.as_posix()
+model = Model(
+    model=config.WHISPER_MODEL,
+    models_dir=models_dir,
+    n_threads=4,
+    print_realtime=False,
+    print_progress=False,
+    print_timestamps=False,
+    translate=False,
+    temperature=0.,
+    no_context=True
+)
+print(mel.shape, mel.dtype)  # (160000,) float64
+segments = model.transcribe(mel[:, 0],
+    # initial_prompt="",  # 'The following is an English sentence.', # "以下是简体中文句子。"
+    language='en',
+    # initial_prompt="以下是简体中文句子。",
+    # language='zh',
+    token_timestamps=True,
+    max_len=1,)
+for segment in segments:
+    print(to_timestamp(segment.t0), to_timestamp(segment.t1), segment.text)
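Two details of this test worth flagging. The slice mel[:, 0] takes the first channel, so the hard-coded sample file is assumed to be multi-channel; on a mono file (shape (160000,), as the inline comment suggests) it would raise an IndexError. And the printed times come from the whisper.cpp segment fields t0/t1, which count 10 ms ticks that pywhispercpp.utils.to_timestamp formats for display; a one-line sketch of that unit, assuming the whisper.cpp convention:

    def ticks_to_seconds(t: int) -> float:
        # whisper.cpp segment timestamps count 10 ms ticks.
        return t / 100.0

    assert ticks_to_seconds(150) == 1.5  # a segment starting 1.5 s in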
transcribe/helpers/whisper.py CHANGED
@@ -48,6 +48,7 @@ class WhisperCPP:
             initial_prompt=prompt,
             language=language,
             token_timestamps=True,
+            # split_on_word=True,
             max_len=max_len
         )
         return output
transcribe/strategy.py CHANGED
@@ -309,7 +309,8 @@ class TranscriptStabilityAnalyzer:
         logger.debug(f"Current separator: {self._separator}")
 
     def merge_chunks(self, chunks: List[TranscriptChunk])->str:
-        return list(r.join() for r in chunks)
+        output = list(r.join() for r in chunks if r)
+        return output
 
 
 
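The substantive change is the if r guard, which skips falsy chunks (None or empty) before join() is called on them; note the annotation still says -> str although a list is returned. In plain-Python terms, the guard does this:

    chunks = ["kept", None, "", "also kept"]
    filtered = list(r for r in chunks if r)  # falsy entries dropped
    assert filtered == ["kept", "also kept"]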
transcribe/utils.py CHANGED
@@ -93,8 +93,19 @@ def resample(file: str, sr: int = 16000):
 
 
 def save_to_wave(filename, data:np.ndarray, sample_rate=16000):
-    write(filename, sample_rate, data)
-
+    write(filename, sample_rate, data.astype(np.int16))
+
+
+def pcm_bytes_to_np_array(pcm_bytes: bytes, dtype=np.float32, channels=1):
+    # 1. Convert to a numpy int16 array (each sample is 2 bytes)
+    audio_np = np.frombuffer(pcm_bytes, dtype=np.int16)
+    audio_np = audio_np.astype(dtype=dtype)
+    if dtype == np.float32:
+        audio_np /= 32768.0
+    # 2. For multi-channel audio, e.g. 2-channel (stereo), reshape accordingly
+    if channels > 1:
+        audio_np = audio_np.reshape(-1, channels)
+    return audio_np
 
 class TestDataWriter:
     def __init__(self, file_path='test_data.csv'):
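A short usage sketch of the new helper, assuming interleaved 16-bit PCM input (the byte layout tobytes() produces for an int16 array):

    import numpy as np
    from transcribe.utils import pcm_bytes_to_np_array

    # Interleaved stereo int16 PCM: L0, R0, L1, R1
    pcm = np.array([100, -100, 200, -200], dtype=np.int16).tobytes()

    mono = pcm_bytes_to_np_array(pcm)                 # float32, scaled to [-1, 1)
    stereo = pcm_bytes_to_np_array(pcm, channels=2)   # reshaped to (2, 2)
    raw = pcm_bytes_to_np_array(pcm, dtype=np.int16)  # unscaled int16 copy

    assert mono.dtype == np.float32 and stereo.shape == (2, 2)
    assert raw.tolist() == [100, -100, 200, -200]

The companion fix in save_to_wave matters because, assuming write here is scipy.io.wavfile.write (its argument order matches), the output sample format is inferred from the array dtype, so casting to int16 keeps the files 16-bit PCM even when a float array is passed in.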
transcribe/whisper_llm_serve.py CHANGED
@@ -124,6 +124,7 @@ class WhisperTranscriptionService(ServeClientBase):
         frame = self.frames_np.copy()
         processed_audio = self._translate_pipe.voice_detect(frame.tobytes())
         self.frames_np = np.frombuffer(processed_audio.audio, dtype=np.float32).copy()
+        return self.frames_np.copy()
         # if len(frame) > self.sample_rate:
         # save_to_wave(f"{self._c}-org.wav", frame)
         # save_to_wave(f"{self._c}-vad.wav", self.frames_np)
@@ -136,20 +137,21 @@ class WhisperTranscriptionService(ServeClientBase):
         # before = self.frames_np.copy()
         self.frames_np = self.frames_np[offset:]
         # after = self.frames_np.copy()
-        # save_to_wave(f"./tests/{self._c}_before_cut.wav", before)
+        # save_to_wave(f"./tests/{self._c}_before_cut_{offset}.wav", before)
+        # save_to_wave(f"./tests/{self._c}_cut.wav", before[:offset])
         # save_to_wave(f"./tests/{self._c}_after_cut.wav", after)
 
 
     def _get_audio_for_processing(self) -> Optional[np.ndarray]:
         """Prepare the audio chunk for processing"""
         # Apply VAD processing
-        self._apply_voice_activity_detection()
-
+        frame_np = self._apply_voice_activity_detection()
+        # frame_np = self.frames_np.copy()
         # No audio frames
-        if self.frames_np is None:
+        if frame_np is None:
             return None
 
-        frames = self.frames_np.copy()
+        frames = frame_np.copy()
 
         # Handling for overly short audio
         if len(frames) <= 10:
@@ -297,7 +299,8 @@ class WhisperTranscriptionService(ServeClientBase):
         try:
             message = Message(result=result, request_id=self.client_uid).model_dump_json(by_alias=True)
             coro = self.websocket.send_text(message)
-            asyncio.run_coroutine_threadsafe(coro, self.loop)
+            future = asyncio.run_coroutine_threadsafe(coro, self.loop)
+            future.add_done_callback(lambda fut: fut.exception() and self.stop())
         except RuntimeError:
             self.stop()
         except Exception as e:
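One more robustness change in the last hunk: asyncio.run_coroutine_threadsafe returns a concurrent.futures.Future, and the old code dropped it, so an exception raised inside send_text (for example, a WebSocket that closed mid-session) was silently swallowed. In the new code, fut.exception() returns the raised exception or None, and the short-circuiting "and" calls stop() only on failure. A minimal standalone sketch of the pattern, with a hypothetical stop() standing in for self.stop():

    import asyncio
    import threading

    def stop():
        print("stopping client")  # hypothetical cleanup hook

    async def send_text(message: str):
        raise ConnectionError("socket closed")  # simulate a dead WebSocket

    def worker(loop: asyncio.AbstractEventLoop):
        # Schedule the coroutine onto the running loop from a plain thread.
        future = asyncio.run_coroutine_threadsafe(send_text("hi"), loop)
        # The callback runs once the coroutine finishes, success or failure.
        future.add_done_callback(lambda fut: fut.exception() and stop())

    async def main():
        loop = asyncio.get_running_loop()
        threading.Thread(target=worker, args=(loop,)).start()
        await asyncio.sleep(0.1)  # give the callback time to run

    asyncio.run(main())  # prints "stopping client"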