daihui.zhang committed
Commit · 418e2a0 · 1 Parent(s): 601009a

fix segments missing error

Browse files
- run_client.py                    +0   -17
- run_server.py                    +0   -31
- transcribe/vad.py                +0   -164
- transcribe/whisper_llm_serve.py  +4   -4
- transcribe/whispercpp_serve.py   +0   -383
run_client.py DELETED
@@ -1,17 +0,0 @@
-
-from transcribe.client import TranscriptionClient
-
-client = TranscriptionClient(
-    "localhost",
-    9090,
-    lang="zh",
-    dst_lang="en",
-    save_output_recording=False,  # Only used for microphone input, False by Default
-    output_recording_filename="./output_recording.wav",  # Only used for microphone input
-    max_clients=4,
-    max_connection_time=600,
-    mute_audio_playback=False,  # Only used for file input, False by Default
-)
-
-if __name__ == '__main__':
-    client()
run_server.py DELETED
@@ -1,31 +0,0 @@
-import argparse
-import os
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--port', '-p',
-                        type=int,
-                        default=9090,
-                        help="Websocket port to run the server on.")
-    parser.add_argument('--backend', '-b',
-                        type=str,
-                        default='pywhispercpp',
-                        help='Backends from ["pywhispercpp"]')
-
-    parser.add_argument('--omp_num_threads', '-omp',
-                        type=int,
-                        default=1,
-                        help="Number of threads to use for OpenMP")
-
-    args = parser.parse_args()
-
-    if "OMP_NUM_THREADS" not in os.environ:
-        os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads)
-
-    from transcribe.transcription import TranscriptionServer
-    server = TranscriptionServer()
-    server.run(
-        "0.0.0.0",
-        port=args.port,
-        backend=args.backend,
-    )
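
Note that the deleted launcher sets OMP_NUM_THREADS before importing the server module, since OpenMP-backed native libraries read the variable when they are first loaded. A minimal sketch of the equivalent programmatic launch, assuming transcribe.transcription still exposes TranscriptionServer with this run() signature:

import os

# Set the thread limit before any OpenMP-backed library is imported;
# the variable is read at native-library load time.
os.environ.setdefault("OMP_NUM_THREADS", "1")

from transcribe.transcription import TranscriptionServer  # assumed import path

server = TranscriptionServer()
server.run("0.0.0.0", port=9090, backend="pywhispercpp")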
transcribe/vad.py DELETED
@@ -1,164 +0,0 @@
-import os
-import subprocess
-import warnings
-
-import numpy as np
-import onnxruntime
-import torch
-import logging
-from config import VAD_MODEL_PATH
-
-class VoiceActivityDetection():
-
-    def __init__(self, force_onnx_cpu=True):
-        # path = self.download()
-        path = VAD_MODEL_PATH
-        if not os.path.exists(path):
-            raise FileNotFoundError(f"Model file not found at {path}. Please download the model.")
-
-        opts = onnxruntime.SessionOptions()
-        opts.log_severity_level = 3
-
-        opts.inter_op_num_threads = 1
-        opts.intra_op_num_threads = 1
-
-        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
-            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
-        else:
-            self.session = onnxruntime.InferenceSession(path, providers=['CUDAExecutionProvider'], sess_options=opts)
-
-        self.reset_states()
-        if '16k' in path:
-            warnings.warn('This model support only 16000 sampling rate!')
-            self.sample_rates = [16000]
-        else:
-            self.sample_rates = [8000, 16000]
-
-    def _validate_input(self, x, sr: int):
-        if x.dim() == 1:
-            x = x.unsqueeze(0)
-        if x.dim() > 2:
-            raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")
-
-        if sr != 16000 and (sr % 16000 == 0):
-            step = sr // 16000
-            x = x[:, ::step]
-            sr = 16000
-
-        if sr not in self.sample_rates:
-            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or multiply of 16000)")
-        if sr / x.shape[1] > 31.25:
-            raise ValueError("Input audio chunk is too short")
-
-        return x, sr
-
-    def reset_states(self, batch_size=1):
-        self._state = torch.zeros((2, batch_size, 128)).float()
-        self._context = torch.zeros(0)
-        self._last_sr = 0
-        self._last_batch_size = 0
-
-    def __call__(self, x, sr: int):
-
-        x, sr = self._validate_input(x, sr)
-        num_samples = 512 if sr == 16000 else 256
-
-        if x.shape[-1] != num_samples:
-            raise ValueError(
-                f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")
-
-        batch_size = x.shape[0]
-        context_size = 64 if sr == 16000 else 32
-
-        if not self._last_batch_size:
-            self.reset_states(batch_size)
-        if (self._last_sr) and (self._last_sr != sr):
-            self.reset_states(batch_size)
-        if (self._last_batch_size) and (self._last_batch_size != batch_size):
-            self.reset_states(batch_size)
-
-        if not len(self._context):
-            self._context = torch.zeros(batch_size, context_size)
-
-        x = torch.cat([self._context, x], dim=1)
-        if sr in [8000, 16000]:
-            ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
-            ort_outs = self.session.run(None, ort_inputs)
-            out, state = ort_outs
-            self._state = torch.from_numpy(state)
-        else:
-            raise ValueError()
-
-        self._context = x[..., -context_size:]
-        self._last_sr = sr
-        self._last_batch_size = batch_size
-
-        out = torch.from_numpy(out)
-        return out
-
-    def audio_forward(self, x, sr: int):
-        outs = []
-        x, sr = self._validate_input(x, sr)
-        self.reset_states()
-        num_samples = 512 if sr == 16000 else 256
-
-        if x.shape[1] % num_samples:
-            pad_num = num_samples - (x.shape[1] % num_samples)
-            x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)
-
-        for i in range(0, x.shape[1], num_samples):
-            wavs_batch = x[:, i:i + num_samples]
-            out_chunk = self.__call__(wavs_batch, sr)
-            outs.append(out_chunk)
-
-        stacked = torch.cat(outs, dim=1)
-        return stacked.cpu()
-
-    @staticmethod
-    def download(model_url="https://github.com/snakers4/silero-vad/raw/v5.0/files/silero_vad.onnx"):
-        target_dir = os.path.expanduser("~/.cache/silero-vad/")
-
-        # Ensure the target directory exists
-        os.makedirs(target_dir, exist_ok=True)
-
-        # Define the target file path
-        model_filename = os.path.join(target_dir, "silero_vad.onnx")
-
-        # Check if the model file already exists
-        if not os.path.exists(model_filename):
-            # If it doesn't exist, download the model using wget
-            try:
-                # subprocess.run(["wget", "-O", model_filename, model_url], check=True)
-                subprocess.run(["curl", "-sL", "-o", model_filename, model_url], check=True)
-            except subprocess.CalledProcessError:
-                print("Failed to download the model using wget.")
-        return model_filename
-
-
-class VoiceActivityDetector:
-    def __init__(self, threshold=0.5, frame_rate=16000):
-        """
-        Initializes the VoiceActivityDetector with a voice activity detection model and a threshold.
-
-        Args:
-            threshold (float, optional): The probability threshold for detecting voice activity. Defaults to 0.5.
-        """
-        self.model = VoiceActivityDetection()
-        self.threshold = threshold
-        self.frame_rate = frame_rate
-
-    def __call__(self, audio_frame):
-        """
-        Determines if the given audio frame contains speech by comparing the detected speech probability against
-        the threshold.

-        Args:
-            audio_frame (np.ndarray): The audio frame to be analyzed for voice activity. It is expected to be a
-                NumPy array of audio samples.
-
-        Returns:
-            bool: True if the speech probability exceeds the threshold, indicating the presence of voice activity;
-                False otherwise.
-        """
-        speech_probs = self.model.audio_forward(torch.from_numpy(audio_frame.copy()), self.frame_rate)[0]
-        return torch.any(speech_probs > self.threshold).item()
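
For reference, the deleted detector consumes fixed-size windows (512 samples at 16 kHz, 256 at 8 kHz) and carries ONNX state between calls; the VoiceActivityDetector wrapper pads and chunks longer frames through audio_forward. A usage sketch of the API as it stood before this commit, assuming the Silero VAD model is present at VAD_MODEL_PATH and the input is 16 kHz float32 audio:

import numpy as np

# Exercises the now-deleted transcribe/vad.py API (pre-commit state);
# requires the silero_vad.onnx model on disk.
from transcribe.vad import VoiceActivityDetector

detector = VoiceActivityDetector(threshold=0.5, frame_rate=16000)

frame = np.zeros(16000, dtype=np.float32)  # one second of silence
print(detector(frame))  # expected: False (no speech probability above 0.5)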
transcribe/whisper_llm_serve.py CHANGED
@@ -99,7 +99,7 @@ class WhisperTranscriptionService:
         """设置源语言和目标语言"""
         self.source_language = source_lang
         self.target_language = target_lang
-
+        self.text_separator = self._get_text_separator(source_lang)
         # self._transcrible_analysis = TranscriptStabilityAnalyzer(self.source_language, self.text_separator)

     def add_frames(self, frame_np: np.ndarray) -> None:
@@ -197,13 +197,13 @@ class WhisperTranscriptionService:
             # logger.error(f"Error processing audio: {e}")

     def _process_transcription_results_2(self, segments: List[TranscriptToken],):
-
+        seg_text = self.text_separator.join(seg.text for seg in segments)
         item = TransResult(
             seg_id=self.row_number,
-            context=
+            context=seg_text,
             from_=self.source_language,
             to=self.target_language,
-            tran_content=self._translate_text_large(
+            tran_content=self._translate_text_large(seg_text),
             partial=False
         )
         self.row_number += 1
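
This hunk is the substance of the fix: the old _process_transcription_results_2 left context= empty and never closed the _translate_text_large( call, so the transcribed segments were dropped from the result. The new code first joins the segment texts with a per-language separator. A minimal sketch of that joining step, with a hypothetical separator rule (empty string for Chinese, space otherwise) standing in for _get_text_separator:

from dataclasses import dataclass
from typing import List

@dataclass
class TranscriptToken:
    # Stand-in for the real transcribe type; only .text is needed here.
    text: str

def get_text_separator(source_lang: str) -> str:
    # Hypothetical rule: CJK text concatenates directly,
    # space-delimited languages join with " ".
    return "" if source_lang == "zh" else " "

segments: List[TranscriptToken] = [TranscriptToken("你好"), TranscriptToken("世界")]
seg_text = get_text_separator("zh").join(seg.text for seg in segments)
print(seg_text)  # 你好世界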
transcribe/whispercpp_serve.py DELETED
@@ -1,383 +0,0 @@
-
-import json
-import logging
-import pathlib
-import threading
-import time
-import config
-import librosa
-import numpy as np
-import soundfile
-from pywhispercpp.model import Model
-
-logging.basicConfig(level=logging.INFO)
-
-class ServeClientBase(object):
-    RATE = 16000
-    SERVER_READY = "SERVER_READY"
-    DISCONNECT = "DISCONNECT"
-
-    def __init__(self, client_uid, websocket):
-        self.client_uid = client_uid
-        self.websocket = websocket
-        self.frames = b""
-        self.timestamp_offset = 0.0
-        self.frames_np = None
-        self.frames_offset = 0.0
-        self.text = []
-        self.current_out = ''
-        self.prev_out = ''
-        self.t_start = None
-        self.exit = False
-        self.same_output_count = 0
-        self.show_prev_out_thresh = 5  # if pause(no output from whisper) show previous output for 5 seconds
-        self.add_pause_thresh = 3  # add a blank to segment list as a pause(no speech) for 3 seconds
-        self.transcript = []
-        self.send_last_n_segments = 10
-
-        # text formatting
-        self.pick_previous_segments = 2
-
-        # threading
-        self.lock = threading.Lock()
-
-    def speech_to_text(self):
-        raise NotImplementedError
-
-    def transcribe_audio(self):
-        raise NotImplementedError
-
-    def handle_transcription_output(self):
-        raise NotImplementedError
-
-    def add_frames(self, frame_np):
-        """
-        Add audio frames to the ongoing audio stream buffer.
-
-        This method is responsible for maintaining the audio stream buffer, allowing the continuous addition
-        of audio frames as they are received. It also ensures that the buffer does not exceed a specified size
-        to prevent excessive memory usage.
-
-        If the buffer size exceeds a threshold (45 seconds of audio data), it discards the oldest 30 seconds
-        of audio data to maintain a reasonable buffer size. If the buffer is empty, it initializes it with the provided
-        audio frame. The audio stream buffer is used for real-time processing of audio data for transcription.
-
-        Args:
-            frame_np (numpy.ndarray): The audio frame data as a NumPy array.
-
-        """
-        self.lock.acquire()
-        if self.frames_np is not None and self.frames_np.shape[0] > 45 * self.RATE:
-            self.frames_offset += 30.0
-            self.frames_np = self.frames_np[int(30 * self.RATE):]
-            # check timestamp offset(should be >= self.frame_offset)
-            # this basically means that there is no speech as timestamp offset hasnt updated
-            # and is less than frame_offset
-            if self.timestamp_offset < self.frames_offset:
-                self.timestamp_offset = self.frames_offset
-        if self.frames_np is None:
-            self.frames_np = frame_np.copy()
-        else:
-            self.frames_np = np.concatenate((self.frames_np, frame_np), axis=0)
-        self.lock.release()
-
-    def clip_audio_if_no_valid_segment(self):
-        """
-        Update the timestamp offset based on audio buffer status.
-        Clip audio if the current chunk exceeds 30 seconds, this basically implies that
-        no valid segment for the last 30 seconds from whisper
-        """
-        with self.lock:
-            if self.frames_np[int((self.timestamp_offset - self.frames_offset) * self.RATE):].shape[0] > 25 * self.RATE:
-                duration = self.frames_np.shape[0] / self.RATE
-                self.timestamp_offset = self.frames_offset + duration - 5
-
-    def get_audio_chunk_for_processing(self):
-        """
-        Retrieves the next chunk of audio data for processing based on the current offsets.
-
-        Calculates which part of the audio data should be processed next, based on
-        the difference between the current timestamp offset and the frame's offset, scaled by
-        the audio sample rate (RATE). It then returns this chunk of audio data along with its
-        duration in seconds.
-
-        Returns:
-            tuple: A tuple containing:
-                - input_bytes (np.ndarray): The next chunk of audio data to be processed.
-                - duration (float): The duration of the audio chunk in seconds.
-        """
-        with self.lock:
-            samples_take = max(0, (self.timestamp_offset - self.frames_offset) * self.RATE)
-            input_bytes = self.frames_np[int(samples_take):].copy()
-        duration = input_bytes.shape[0] / self.RATE
-        return input_bytes, duration
-
-    def prepare_segments(self, last_segment=None):
-        """
-        Prepares the segments of transcribed text to be sent to the client.
-
-        This method compiles the recent segments of transcribed text, ensuring that only the
-        specified number of the most recent segments are included. It also appends the most
-        recent segment of text if provided (which is considered incomplete because of the possibility
-        of the last word being truncated in the audio chunk).
-
-        Args:
-            last_segment (str, optional): The most recent segment of transcribed text to be added
-                to the list of segments. Defaults to None.
-
-        Returns:
-            list: A list of transcribed text segments to be sent to the client.
-        """
-        segments = []
-        if len(self.transcript) >= self.send_last_n_segments:
-            segments = self.transcript[-self.send_last_n_segments:].copy()
-        else:
-            segments = self.transcript.copy()
-        if last_segment is not None:
-            segments = segments + [last_segment]
-        logging.info(f"{segments}")
-        return segments
-
-    def get_audio_chunk_duration(self, input_bytes):
-        """
-        Calculates the duration of the provided audio chunk.
-
-        Args:
-            input_bytes (numpy.ndarray): The audio chunk for which to calculate the duration.
-
-        Returns:
-            float: The duration of the audio chunk in seconds.
-        """
-        return input_bytes.shape[0] / self.RATE
-
-    def send_transcription_to_client(self, segments):
-        """
-        Sends the specified transcription segments to the client over the websocket connection.
-
-        This method formats the transcription segments into a JSON object and attempts to send
-        this object to the client. If an error occurs during the send operation, it logs the error.
-
-        Returns:
-            segments (list): A list of transcription segments to be sent to the client.
-        """
-        try:
-            self.websocket.send(
-                json.dumps({
-                    "uid": self.client_uid,
-                    "segments": segments,
-                })
-            )
-        except Exception as e:
-            logging.error(f"[ERROR]: Sending data to client: {e}")
-
-    def disconnect(self):
-        """
-        Notify the client of disconnection and send a disconnect message.
-
-        This method sends a disconnect message to the client via the WebSocket connection to notify them
-        that the transcription service is disconnecting gracefully.
-
-        """
-        self.websocket.send(json.dumps({
-            "uid": self.client_uid,
-            "message": self.DISCONNECT
-        }))
-
-    def cleanup(self):
-        """
-        Perform cleanup tasks before exiting the transcription service.
-
-        This method performs necessary cleanup tasks, including stopping the transcription thread, marking
-        the exit flag to indicate the transcription thread should exit gracefully, and destroying resources
-        associated with the transcription process.
-
-        """
-        logging.info("Cleaning up.")
-        self.exit = True
-
-
-class ServeClientWhisperCPP(ServeClientBase):
-    SINGLE_MODEL = None
-    SINGLE_MODEL_LOCK = threading.Lock()
-
-    def __init__(self, websocket, language=None, client_uid=None,
-                 single_model=False):
-        """
-        Initialize a ServeClient instance.
-        The Whisper model is initialized based on the client's language and device availability.
-        The transcription thread is started upon initialization. A "SERVER_READY" message is sent
-        to the client to indicate that the server is ready.
-
-        Args:
-            websocket (WebSocket): The WebSocket connection for the client.
-            language (str, optional): The language for transcription. Defaults to None.
-            client_uid (str, optional): A unique identifier for the client. Defaults to None.
-            single_model (bool, optional): Whether to instantiate a new model for each client connection. Defaults to False.
-
-        """
-        super().__init__(client_uid, websocket)
-        self.language = language
-        self.eos = False
-
-        if single_model:
-            if ServeClientWhisperCPP.SINGLE_MODEL is None:
-                self.create_model()
-                ServeClientWhisperCPP.SINGLE_MODEL = self.transcriber
-            else:
-                self.transcriber = ServeClientWhisperCPP.SINGLE_MODEL
-        else:
-            self.create_model()
-
-        # threading
-        logging.info('Create a thread to process audio.')
-        self.trans_thread = threading.Thread(target=self.speech_to_text)
-        self.trans_thread.start()
-
-        self.websocket.send(json.dumps({
-            "uid": self.client_uid,
-            "message": self.SERVER_READY,
-            "backend": "pywhispercpp"
-        }))
-
-    def create_model(self, warmup=True):
-        """
-        Instantiates a new model, sets it as the transcriber and does warmup if desired.
-        """
-
-        self.transcriber = Model(model=config.WHISPER_MODEL, models_dir=config.MODEL_DIR)
-        if warmup:
-            self.warmup()
-
-    def warmup(self, warmup_steps=1):
-        """
-        Warmup TensorRT since first few inferences are slow.
-
-        Args:
-            warmup_steps (int): Number of steps to warm up the model for.
-        """
-        logging.info("[INFO:] Warming up whisper.cpp engine..")
-        mel, _, = soundfile.read("assets/jfk.flac")
-        for i in range(warmup_steps):
-            self.transcriber.transcribe(mel, print_progress=False)
-
-    def set_eos(self, eos):
-        """
-        Sets the End of Speech (EOS) flag.
-
-        Args:
-            eos (bool): The value to set for the EOS flag.
-        """
-        self.lock.acquire()
-        self.eos = eos
-        self.lock.release()
-
-    def handle_transcription_output(self, last_segment, duration):
-        """
-        Handle the transcription output, updating the transcript and sending data to the client.
-
-        Args:
-            last_segment (str): The last segment from the whisper output which is considered to be incomplete because
-                of the possibility of word being truncated.
-            duration (float): Duration of the transcribed audio chunk.
-        """
-        segments = self.prepare_segments({"text": last_segment})
-        self.send_transcription_to_client(segments)
-        if self.eos:
-            self.update_timestamp_offset(last_segment, duration)
-
-    def transcribe_audio(self, input_bytes):
-        """
-        Transcribe the audio chunk and send the results to the client.
-
-        Args:
-            input_bytes (np.array): The audio chunk to transcribe.
-        """
-        if ServeClientWhisperCPP.SINGLE_MODEL:
-            ServeClientWhisperCPP.SINGLE_MODEL_LOCK.acquire()
-        logging.info(f"[pywhispercpp:] Processing audio with duration: {input_bytes.shape[0] / self.RATE}")
-        mel = input_bytes
-        duration = librosa.get_duration(y=input_bytes, sr=self.RATE)
-
-        if self.language == "zh":
-            prompt = '以下是简体中文普通话的句子。'
-        else:
-            prompt = ''
-
-        segments = self.transcriber.transcribe(
-            mel,
-            language=self.language,
-            initial_prompt=prompt,
-            token_timestamps=True,
-            # max_len=max_len,
-            print_progress=False
-        )
-        text = []
-        for segment in segments:
-            content = segment.text
-            text.append(content)
-        last_segment = ' '.join(text)
-
-        logging.info(f"[pywhispercpp:] Last segment: {last_segment}")
-
-        if ServeClientWhisperCPP.SINGLE_MODEL:
-            ServeClientWhisperCPP.SINGLE_MODEL_LOCK.release()
-        if last_segment:
-            self.handle_transcription_output(last_segment, duration)
-
-    def update_timestamp_offset(self, last_segment, duration):
-        """
-        Update timestamp offset and transcript.
-
-        Args:
-            last_segment (str): Last transcribed audio from the whisper model.
-            duration (float): Duration of the last audio chunk.
-        """
-        if not len(self.transcript):
-            self.transcript.append({"text": last_segment + " "})
-        elif self.transcript[-1]["text"].strip() != last_segment:
-            self.transcript.append({"text": last_segment + " "})
-
-        logging.info(f'Transcript list context: {self.transcript}')
-
-        with self.lock:
-            self.timestamp_offset += duration
-
-    def speech_to_text(self):
-        """
-        Process an audio stream in an infinite loop, continuously transcribing the speech.
-
-        This method continuously receives audio frames, performs real-time transcription, and sends
-        transcribed segments to the client via a WebSocket connection.
-
-        If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
-        It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
-        are sent to the client in real-time, and a history of segments is maintained to provide context. Pauses in speech
-        (no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
-        there is no speech for a specified duration to indicate a pause.
-
-        Raises:
-            Exception: If there is an issue with audio processing or WebSocket communication.
-
-        """
-        while True:
-            if self.exit:
-                logging.info("Exiting speech to text thread")
-                break
-
-            if self.frames_np is None:
-                time.sleep(0.02)  # wait for any audio to arrive
-                continue
-
-            self.clip_audio_if_no_valid_segment()
-
-            input_bytes, duration = self.get_audio_chunk_for_processing()
-            if duration < 1:
-                continue
-
-            try:
-                input_sample = input_bytes.copy()
-                logging.info(f"[pywhispercpp:] Processing audio with duration: {duration}")
-                self.transcribe_audio(input_sample)
-
-            except Exception as e:
-                logging.error(f"[ERROR]: {e}")
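
The deleted ServeClientBase also documents its buffering policy: once more than 45 seconds of audio is queued, add_frames drops the oldest 30 seconds, advances frames_offset, and clamps timestamp_offset so it never points before the trimmed data. A standalone sketch of that trimming arithmetic:

import numpy as np

RATE = 16000  # samples per second, as in ServeClientBase.RATE

def trim_buffer(frames_np, frames_offset, timestamp_offset):
    # Mirrors the deleted add_frames() policy: keep at most ~45 s queued,
    # dropping the oldest 30 s and advancing the offsets to match.
    if frames_np is not None and frames_np.shape[0] > 45 * RATE:
        frames_offset += 30.0
        frames_np = frames_np[int(30 * RATE):]
        if timestamp_offset < frames_offset:
            timestamp_offset = frames_offset
    return frames_np, frames_offset, timestamp_offset

buf = np.zeros(50 * RATE, dtype=np.float32)  # 50 s queued
buf, f_off, t_off = trim_buffer(buf, 0.0, 0.0)
print(buf.shape[0] / RATE, f_off, t_off)  # 20.0 30.0 30.0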