daihui.zhang committed · Commit abe0fe2 · 1 Parent(s): d15b373

add llm

Browse files:
- run_client.py +1 -0
- transcribe/client.py +6 -4
- transcribe/transcription.py +12 -0
- transcribe/translator.py +2 -3
- transcribe/whisper_llm_serve.py +22 -13
run_client.py CHANGED

@@ -4,6 +4,7 @@ client = TranscriptionClient(
     "localhost",
     9090,
     lang="zh",
+    dst_lang="en",
     save_output_recording=False,  # Only used for microphone input, False by Default
     output_recording_filename="./output_recording.wav",  # Only used for microphone input
     max_clients=4,
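For reference, a minimal sketch of the resulting client call (it assumes `TranscriptionClient` is imported from `transcribe.client` as in `run_client.py`, and that calling the instance starts streaming; values are illustrative):

from transcribe.client import TranscriptionClient

# Stream Chinese speech and ask the server to translate it to English.
client = TranscriptionClient(
    "localhost",
    9090,
    lang="zh",        # source language for Whisper
    dst_lang="en",    # target language for the LLM translator (new in this commit)
    save_output_recording=False,
    output_recording_filename="./output_recording.wav",
    max_clients=4,
)
client()  # connects to ws://localhost:9090?from=zh&to=en and begins streaming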
transcribe/client.py CHANGED

@@ -29,6 +29,7 @@ class Client:
         log_transcription=True,
         max_clients=4,
         max_connection_time=600,
+        dst_lang='zh',
     ):
         """
         Initializes a Client instance for audio recording and streaming to a server.

@@ -56,12 +57,12 @@ class Client:
         self.log_transcription = log_transcription
         self.max_clients = max_clients
         self.max_connection_time = max_connection_time
-
+        self.dst_lang = dst_lang

         self.audio_bytes = None

         if host is not None and port is not None:
-            socket_url = f"ws://{host}:{port}"
+            socket_url = f"ws://{host}:{port}?from={self.language}&to={self.dst_lang}"
             self.client_socket = websocket.WebSocketApp(
                 socket_url,
                 on_open=lambda ws: self.on_open(ws),

@@ -657,10 +658,11 @@ class TranscriptionClient(TranscriptionTeeClient):
         max_clients=4,
         max_connection_time=600,
         mute_audio_playback=False,
+        dst_lang='en',
     ):
         self.client = Client(
             host, port, lang, log_transcription=log_transcription, max_clients=max_clients,
-            max_connection_time=max_connection_time
+            max_connection_time=max_connection_time, dst_lang=dst_lang
         )

         if save_output_recording and not output_recording_filename.endswith(".wav"):

@@ -671,5 +673,5 @@ class TranscriptionClient(TranscriptionTeeClient):
             [self.client],
             save_output_recording=save_output_recording,
             output_recording_filename=output_recording_filename,
-            mute_audio_playback=mute_audio_playback
+            mute_audio_playback=mute_audio_playback,
         )
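The language pair now travels in the WebSocket query string; a quick sketch of how the URL is assembled (assuming `self.language` holds the `lang` argument, as the f-string above implies):

host, port = "localhost", 9090
language, dst_lang = "zh", "en"   # lang / dst_lang as passed to Client

socket_url = f"ws://{host}:{port}?from={language}&to={dst_lang}"
assert socket_url == "ws://localhost:9090?from=zh&to=en"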
transcribe/transcription.py CHANGED

@@ -10,6 +10,7 @@ import numpy as np
 from .server import ServeClientBase
 from .whisper_llm_serve import PyWhiperCppServe
 from .vad import VoiceActivityDetector
+from urllib.parse import urlparse, parse_qsl
 from websockets.exceptions import ConnectionClosed
 from websockets.sync.server import serve

@@ -226,6 +227,11 @@ class TranscriptionServer:

         client.add_frames(frame_np)
         return True
+
+    def set_lang(self, websocket, src_lang, dst_lang):
+        client = self.client_manager.get_client(websocket)
+        if isinstance(client, PyWhiperCppServe):
+            client.set_lang(src_lang, dst_lang)

     def recv_audio(self,
                    websocket,

@@ -234,6 +240,12 @@ class TranscriptionServer:
         self.backend = backend
         if not self.handle_new_connection(websocket):
             return
+        query_parameters_dict = dict(parse_qsl(urlparse(websocket.request.path).query))
+        from_lang, to_lang = query_parameters_dict.get('from'), query_parameters_dict.get('to')
+
+        if from_lang and to_lang:
+            self.set_lang(websocket, from_lang, to_lang)
+            logging.info(f"Source lang: {from_lang} -> Dst lang: {to_lang}")

         try:
             while not self.client_manager.is_client_timeout(websocket):
transcribe/translator.py CHANGED

@@ -28,13 +28,12 @@ class QwenTranslator:
         message = self.to_message(prompt, src_lang, dst_lang)
         start_time = time.monotonic()
         output = self.llm.create_chat_completion(messages=message, temperature=0.9)
-        logger.info(f"LLM
+        logger.info(f"LLM inference time: {time.monotonic() - start_time:.2f}s.")
         return output['choices'][0]['message']['content']

-    def __call__(self, prompt
+    def __call__(self, prompt, *args, **kwargs):
         return self.llm(
             prompt,
             *args,
-            max_tokens=max_tokens,
             **kwargs
         )

(The two removed lines above are truncated in the diff view; their full original text is not recoverable from this page.)
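`create_chat_completion` and the `output['choices'][0]['message']['content']` access match the llama-cpp-python chat API; a hedged sketch of the surrounding flow (the body of `to_message` is hypothetical, since this diff does not show it):

import time

def to_message(prompt, src_lang, dst_lang):
    # Hypothetical shape; the real to_message is defined elsewhere in QwenTranslator.
    return [
        {"role": "system", "content": f"Translate the user text from {src_lang} to {dst_lang}."},
        {"role": "user", "content": prompt},
    ]

message = to_message("你好", "zh", "en")
start_time = time.monotonic()
# output = llm.create_chat_completion(messages=message, temperature=0.9)
# text = output['choices'][0]['message']['content']
print(f"LLM inference time: {time.monotonic() - start_time:.2f}s.")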
transcribe/whisper_llm_serve.py CHANGED

@@ -2,6 +2,7 @@

 import soundfile
 from concurrent.futures import ProcessPoolExecutor as Pool
+import multiprocessing as mp
 import numpy as np
 from logging import getLogger
 from difflib import SequenceMatcher

@@ -10,7 +11,7 @@ import config
 import time
 import json
 import threading
-
+from functools import partial
 from .server import ServeClientBase
 from .translator import QwenTranslator
 from pywhispercpp.model import Model

@@ -109,7 +110,7 @@ class PywhisperInference:
     llm_model = None

     @classmethod
-    def initializer(cls, warmup=True):
+    def initializer(cls, event: mp.Event, warmup=True):
         models_dir = config.MODEL_DIR.as_posix()
         cls.whisper_model = Model(
             model=config.WHISPER_MODEL,

@@ -123,6 +124,7 @@ class PywhisperInference:

         # init llamacpp
         cls.llm_model = QwenTranslator(config.LLM_MODEL_PATH, config.LLM_SYS_PROMPT)
+        event.set()

     @classmethod
     def warmup(cls, warmup_steps=1):

@@ -170,20 +172,30 @@ class PyWhiperCppServe(ServeClientBase):
         self.frames_np = None
         self.sample_rate = 16000

+        self._ready_state = mp.Event()
         self._pool = Pool(
-            max_workers=1, initializer=PywhisperInference.initializer)
+            max_workers=1, initializer=partial(PywhisperInference.initializer, event=self._ready_state))

         logger.info('Create a process to process audio.')
         self.trans_thread = threading.Thread(target=self.speech_to_text)
         self.trans_thread.daemon = True
         self.trans_thread.start()
+        self.send_ready_state()

+    def send_ready_state(self):
+        # Block until the worker process has finished loading both models.
+        while not self._ready_state.is_set():
+            time.sleep(0.1)
+
         self.websocket.send(json.dumps({
             "uid": self.client_uid,
             "message": self.SERVER_READY,
             "backend": "pywhispercpp"
         }))

+    def set_lang(self, src_lang, dst_lang):
+        self.language = src_lang
+        self.dst_lang = dst_lang
+
     def add_frames(self, frame_np):
         with self.lock:
             if self.frames_np is None:

@@ -206,7 +218,6 @@ class PyWhiperCppServe(ServeClientBase):
         transcribe_fut = self._pool.submit(
             PywhisperInference.inference, audio_buffer.tobytes(), self.language)
         segments = transcribe_fut.result()
-
         return segments

     def translate_text(self, text):

@@ -215,8 +226,6 @@ class PyWhiperCppServe(ServeClientBase):
         translate_fut = self._pool.submit(
             PywhisperInference.translate, text, self.language, self.dst_lang)
         return translate_fut.result()
-
-

     def _segments_split(self, segments, audio_buffer: np.ndarray):
         """Split the sequence at the first punctuation mark from the left into an observed segment and the remainder."""

@@ -278,15 +287,14 @@ class PyWhiperCppServe(ServeClientBase):
             # logger.info(f"[pywhispercpp:] Processing audio with duration: {len(audio_buffer)}")
             # segments = self.transcribe_audio(audio_buffer)
             try:
-                logger.info(f"
+                logger.info(f"Processing audio with duration: {len(audio_buffer) / self.sample_rate:.2f}s")
                 segments = self.transcribe_audio(audio_buffer)
+                for item in self.handle_transcription_output(segments, audio_buffer):
+                    print(item)
             except KeyboardInterrupt:
                 break
             except Exception as e:
-                logger.error(f"
-            else:
-                for item in self.handle_transcription_output(segments, audio_buffer):
-                    print(item)
+                logger.error(f"{e}")

@@ -306,7 +314,8 @@ class PyWhiperCppServe(ServeClientBase):
                 message = self._segment_manager.segment
                 seg_id = self._segment_manager.get_seg_id() - 1
                 yield (seg_id, message, self.translate_text(message))
-
+                if self._segment_manager.string.strip():
+                    yield (seg_id + 1, self._segment_manager.string, self.translate_text(self._segment_manager.string))

             else:
                 seg_id = self._segment_manager.get_seg_id()

@@ -326,7 +335,7 @@ class PyWhiperCppServe(ServeClientBase):
                 json.dumps(content)
             )
         except Exception as e:
-            logger.error(f"
+            logger.error(f"Sending data to client: {e}")

     def get_audio_chunk_for_processing(self):
         if self.frames_np.shape[0] >= self.sample_rate * 1:
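The ready-state handshake added above reduces to this standalone pattern: the parent creates an `mp.Event`, hands it to the single worker via `functools.partial`, and polls `is_set()` before announcing `SERVER_READY`. A minimal sketch (model loading is faked with a sleep, and the no-op submit forces the lazily started worker to spawn):

import multiprocessing as mp
import time
from concurrent.futures import ProcessPoolExecutor
from functools import partial

def initializer(event):
    time.sleep(1.0)   # stand-in for loading the Whisper and LLM models
    event.set()       # tell the parent the worker is ready

def _noop():
    pass

if __name__ == "__main__":
    ready = mp.Event()
    pool = ProcessPoolExecutor(max_workers=1,
                               initializer=partial(initializer, event=ready))
    pool.submit(_noop)            # worker processes start lazily, on first submit
    while not ready.is_set():     # poll, as send_ready_state does
        time.sleep(0.1)
    print("SERVER_READY")         # safe to notify the client now
    pool.shutdown()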