Fix error in get_seg_id

Files changed (3) hide show

transcribe/strategy.py +55 -37
transcribe/utils.py +1 -0
transcribe/whisper_llm_serve.py +1 -2

transcribe/strategy.py CHANGED Viewed

@@ -8,6 +8,7 @@ from typing import List, Tuple, Optional, Deque, Any, Iterator,Literal
 from config import SENTENCE_END_MARKERS, ALL_MARKERS,SENTENCE_END_PATTERN,REGEX_MARKERS, PAUSEE_END_PATTERN,SAMPLE_RATE
 import numpy as np
 from enum import Enum
 logger = logging.getLogger("TranscriptionStrategy")
@@ -83,12 +84,12 @@ class TranscriptChunk:
     def get_split_first_rest(self,  mode: SplitMode):
         chunks = self.split_by(mode)
         fisrt_chunk = chunks[0] if chunks else self
-        rest_chunks = chunks[1:] if chunks else []
         return fisrt_chunk, rest_chunks
     def puncation_numbers(self) -> int:
         """计算片段中标点符号的数量"""
-        return sum(1 for seg in self.items if REGEX_MARKERS.search(seg.text))
     def length(self) -> int:
         """返回片段列表的长度"""
@@ -102,15 +103,15 @@ class TranscriptChunk:
         """比较当前片段与另一个片段的相似度"""
         if not chunk:
             return 0
-        return self._calculate_similarity(self.join(), chunk.join())
     def has_punctuation(self) -> bool:
         return any(seg.is_punctuation() for seg in self.items)
     def get_buffer_index(self) -> int:
-        logger.debug("==== Current cut item ====")
-        logger.debug(f"{self.items[-1]}")
-        logger.debug("==========================")
         return self.items[-1].buffer_index()
     def is_end_sentence(self) ->bool:
@@ -134,7 +135,9 @@ class TranscriptHistory:
     def lastest_chunk(self):
         """获取最后一个片段"""
         return self.history[-1]
 class TranscriptBuffer:
     """
@@ -147,10 +150,10 @@ class TranscriptBuffer:
     """
     def __init__(self):
-        self._segments: List[str] = []     # 确认的完整段落
         self._sentences: List[str] = []    # 当前段落中的短句
         self._buffer: str = ""             # 当前缓冲中的文本
-        self._current_seg_id: int =0
     def get_seg_id(self) -> int:
         return self._current_seg_id
@@ -176,26 +179,31 @@ class TranscriptBuffer:
             end_of_sentence: 是否为句子结尾（如检测到句号）
         """
         if self._sentences:
-            self._segments.append("".join(self._sentences))
-            self._current_seg_id += 1
             self._sentences.clear()
     def update_and_commit(self, stable_string: str, remaining_string:str, is_end_sentence=False):
-        self.update_pending_text(stable_string)
         if is_end_sentence:
-            self.commit_paragraph(end_of_sentence=True)
             self.update_pending_text(remaining_string)
-            # if len() >=20
-            return True
         else:
             self.commit_line()
             self.update_pending_text(remaining_string)
-            return False
     @property
-    def paragraph(self) -> str:
         """当前短句组合"""
         return "".join(self._sentences)
@@ -211,7 +219,7 @@ class TranscriptBuffer:
     @property
     def current_not_commit_text(self) -> str:
-        return self.paragraph + self.pending_text
@@ -230,15 +238,14 @@ class TranscriptStabilityAnalyzer:
         prev = self._transcript_history.previous_chunk()
         self._transcript_buffer.update_pending_text(current.join())
-        if not prev:
             yield TranscriptResult(
                 context=self._transcript_buffer.current_not_commit_text,
                 seg_id=self._transcript_buffer.get_seg_id()
             )
             return
         if buffer_duration <= 12:
             yield from self._handle_short_buffer(current, prev)
         else:
@@ -248,16 +255,24 @@ class TranscriptStabilityAnalyzer:
     def _handle_short_buffer(self, curr: TranscriptChunk, prev: TranscriptChunk) -> Iterator[TranscriptResult]:
         curr_first, curr_rest = curr.get_split_first_rest(SplitMode.PUNCTUATION)
         prev_first, _ = prev.get_split_first_rest(SplitMode.PUNCTUATION)
-        core = curr_first.compare(prev_first)
-        has_punctuation = curr_first.has_punctuation()
-        logger.debug(f"Compare with rev score:{core},is end :{curr_first.is_end_sentence()}, has_punctuation: {has_punctuation}, current_first: {curr_first.join()},")
-        if core >= 0.8:
-            yield from self._yield_commit_results(curr_first, curr_rest, curr_first.is_end_sentence())
-        else:
-            yield TranscriptResult(
-                seg_id=self._transcript_buffer.get_seg_id(),
-                context=self._transcript_buffer.current_not_commit_text
-            )
     def _handle_long_buffer(self, curr: TranscriptChunk) -> Iterator[TranscriptResult]:
@@ -279,10 +294,8 @@ class TranscriptStabilityAnalyzer:
     def _yield_commit_results(self, stable_chunk, remaining_chunks, is_end_sentence: bool) -> Iterator[TranscriptResult]:
         stable_str = stable_chunk.join() if hasattr(stable_chunk, "join") else self.merge_chunks(stable_chunk)
         remaining_str = self.merge_chunks(remaining_chunks)
         frame_cut_index = stable_chunk[-1].get_buffer_index() if isinstance(stable_chunk, list) else stable_chunk.get_buffer_index()
-        logger.debug(f"Current cut index: {frame_cut_index}, Stable string: {stable_str}, Remaining_str:{remaining_str}")
         prev_seg_id = self._transcript_buffer.get_seg_id()
         commit_paragraph = self._transcript_buffer.update_and_commit(stable_str, remaining_str, is_end_sentence)
         logger.debug(f"current buffer: {self._transcript_buffer.__dict__}")
@@ -295,10 +308,15 @@ class TranscriptStabilityAnalyzer:
                 context=self._transcript_buffer.latest_paragraph,
                 is_end_sentence=True
             )
-        # 如果还有挂起的文本
-        if (current_not_commit_text := self._transcript_buffer.current_not_commit_text.strip()):
             yield TranscriptResult(
                 seg_id=self._transcript_buffer.get_seg_id(),
                 cut_index=frame_cut_index,
-                context=current_not_commit_text
             )

 from config import SENTENCE_END_MARKERS, ALL_MARKERS,SENTENCE_END_PATTERN,REGEX_MARKERS, PAUSEE_END_PATTERN,SAMPLE_RATE
 import numpy as np
 from enum import Enum
+from itertools import chain
 logger = logging.getLogger("TranscriptionStrategy")
     def get_split_first_rest(self,  mode: SplitMode):
         """Split this chunk with ``split_by(mode)`` and return ``(first_piece, remaining_pieces)``.
         When the split result is empty, the chunk itself is returned as the first piece and the remainder is ``None``.
         """
         chunks = self.split_by(mode)
         fisrt_chunk = chunks[0] if chunks else self
         # NOTE(review): the empty-split remainder is None, not an empty list; confirm every consumer of the second element accepts None.
+        rest_chunks = chunks[1:] if chunks else None
         return fisrt_chunk, rest_chunks
     def puncation_numbers(self) -> int:
         """Count the items in this chunk that report themselves as punctuation."""
+        return sum(1 for seg in self.items if seg.is_punctuation())
     def length(self) -> int:
         """返回片段列表的长度"""
         """比较当前片段与另一个片段的相似度"""
         if not chunk:
             return 0
+        score =  self._calculate_similarity(self.join(), chunk.join())
+        logger.debug(f"Compare: {self.join()} vs {chunk.join()} : {score}")
+        return score
     def has_punctuation(self) -> bool:
         """Return True if at least one item of this chunk reports itself as punctuation."""
         for item in self.items:
             if item.is_punctuation():
                 return True
         return False
     def get_buffer_index(self) -> int:
         """Return the buffer index reported by the last item of this chunk."""
         tail_item = self.items[-1]
         return tail_item.buffer_index()
     def is_end_sentence(self) ->bool:
     def lastest_chunk(self):
         """Return the last entry stored in the history."""
         entries = self.history
         return entries[-1]
+    def clear(self):
+        """Remove every entry from the history."""
+        self.history.clear()
 class TranscriptBuffer:
     """
     """
     def __init__(self):
         """Start with no committed paragraphs, no sentences, an empty text buffer and segment id 0."""
+        self._segments: Deque[str] = collections.deque(maxlen=2)     # confirmed complete paragraphs; bounded, at most 2 are retained
         self._sentences: List[str] = []    # short sentences of the current paragraph
         self._buffer: str = ""             # text currently in the buffer
+        self._current_seg_id: int = 0      # value returned by get_seg_id()
     def get_seg_id(self) -> int:
         """Return the current value of the segment id counter."""
         seg_id = self._current_seg_id
         return seg_id
             end_of_sentence: 是否为句子结尾（如检测到句号）
         """
         if self._sentences:
+            self._segments.appendleft("".join(self._sentences))
             self._sentences.clear()
     def update_and_commit(self, stable_string: str, remaining_string:str, is_end_sentence=False):
         """Pass ``stable_string`` through ``update_pending_text`` + ``commit_line``, then store ``remaining_string`` as pending text.
         Returns True only when a paragraph was committed and the segment id was incremented, which requires
         ``is_end_sentence`` and at least 20 characters of uncommitted text; returns False in every other case.
         """
+        logger.debug(f"{self.__dict__}")
         if is_end_sentence:
+            self.update_pending_text(stable_string)
+            self.commit_line()
             # Length is taken before remaining_string is queued, so the remaining text does not count towards the threshold.
+            current_text_len = len(self.current_not_commit_text)
             self.update_pending_text(remaining_string)
             # Below the threshold no paragraph is committed and the segment id stays unchanged.
+            if current_text_len >=20:
+                self.commit_paragraph()
+                self._current_seg_id += 1
+                return True
         else:
+            self.update_pending_text(stable_string)
             self.commit_line()
             self.update_pending_text(remaining_string)
+        return False
     @property
+    def un_commit_paragraph(self) -> str:
         """Concatenation of the short sentences currently held in ``_sentences``."""
         return "".join(self._sentences)
     @property
     def current_not_commit_text(self) -> str:
         """Joined uncommitted sentences followed by the pending text."""
+        return self.un_commit_paragraph + self.pending_text
         prev = self._transcript_history.previous_chunk()
         self._transcript_buffer.update_pending_text(current.join())
+        if not prev: # 如果没有历史记录 那么就说明是新的语句 直接输出就行
             yield TranscriptResult(
                 context=self._transcript_buffer.current_not_commit_text,
                 seg_id=self._transcript_buffer.get_seg_id()
             )
             return
+        # yield from self._handle_short_buffer(current, prev)
         if buffer_duration <= 12:
             yield from self._handle_short_buffer(current, prev)
         else:
     def _handle_short_buffer(self, curr: TranscriptChunk, prev: TranscriptChunk) -> Iterator[TranscriptResult]:
         """Yield results by comparing the first punctuation-split piece of ``curr`` with that of ``prev``.
         A comparison score of at least 0.8 delegates to ``_yield_commit_results`` for the first piece;
         otherwise a single result carrying the current uncommitted text is yielded.
         """
         curr_first, curr_rest = curr.get_split_first_rest(SplitMode.PUNCTUATION)
         prev_first, _ = prev.get_split_first_rest(SplitMode.PUNCTUATION)
+        # logger.debug("==== Current cut item ====")
+        # logger.debug(f"{curr.join()} ")
+        # logger.debug(f"{prev.join()}")
+        # logger.debug("==========================")
         # The comparison is skipped when either first piece is falsy; control then falls through to the plain result below.
+        if curr_first and prev_first:
+            core = curr_first.compare(prev_first)
+            # has_punctuation = curr_first.has_punctuation()
+            if core >= 0.8:
+                yield from self._yield_commit_results(curr_first, curr_rest, curr_first.is_end_sentence())
+                return
+        yield TranscriptResult(
+            seg_id=self._transcript_buffer.get_seg_id(),
+            context=self._transcript_buffer.current_not_commit_text
+        )
     def _handle_long_buffer(self, curr: TranscriptChunk) -> Iterator[TranscriptResult]:
     def _yield_commit_results(self, stable_chunk, remaining_chunks, is_end_sentence: bool) -> Iterator[TranscriptResult]:
         stable_str = stable_chunk.join() if hasattr(stable_chunk, "join") else self.merge_chunks(stable_chunk)
         remaining_str = self.merge_chunks(remaining_chunks)
         frame_cut_index = stable_chunk[-1].get_buffer_index() if isinstance(stable_chunk, list) else stable_chunk.get_buffer_index()
         prev_seg_id = self._transcript_buffer.get_seg_id()
         commit_paragraph = self._transcript_buffer.update_and_commit(stable_str, remaining_str, is_end_sentence)
         logger.debug(f"current buffer: {self._transcript_buffer.__dict__}")
                 context=self._transcript_buffer.latest_paragraph,
                 is_end_sentence=True
             )
+            yield TranscriptResult(
+                seg_id=self._transcript_buffer.get_seg_id(),
+                # cut_index=frame_cut_index,
+                context=self._transcript_buffer.pending_text,
+            )
+        else:
             yield TranscriptResult(
                 seg_id=self._transcript_buffer.get_seg_id(),
                 cut_index=frame_cut_index,
+                context=self._transcript_buffer.current_not_commit_text,
             )

transcribe/utils.py CHANGED Viewed

@@ -8,6 +8,7 @@ from scipy.io.wavfile import write
 import av
 def log_block(key: str, value, unit=''):
     """格式化输出日志内容"""
     key_fmt = f"[  {key.ljust(25)}]"  # 左对齐填充
     val_fmt = f"{value} {unit}".strip()

 import av
 def log_block(key: str, value, unit=''):
+    return
     """格式化输出日志内容"""
     key_fmt = f"[  {key.ljust(25)}]"  # 左对齐填充
     val_fmt = f"{value} {unit}".strip()

transcribe/whisper_llm_serve.py CHANGED Viewed

@@ -193,7 +193,6 @@ class WhisperTranscriptionService(ServeClientBase):
             # 处理转录结果并发送到客户端
             for result in self._process_transcription_results(segments, audio_buffer):
-                print(result)
                 self._send_result_to_client(result)
             # except Exception as e:
@@ -217,7 +216,7 @@ class WhisperTranscriptionService(ServeClientBase):
                 self._update_audio_buffer(cut_index)
             translated_context = self._translate_text(ana_result.context)
-            log_block("Translated context:", translated_context)
             yield TransResult(
                 seg_id=ana_result.seg_id,
                 context=ana_result.context,

             # 处理转录结果并发送到客户端
             for result in self._process_transcription_results(segments, audio_buffer):
                 self._send_result_to_client(result)
             # except Exception as e:
                 self._update_audio_buffer(cut_index)
             translated_context = self._translate_text(ana_result.context)
             yield TransResult(
                 seg_id=ana_result.seg_id,
                 context=ana_result.context,