daihui.zhang committed on
Commit
5c84c3c
·
1 Parent(s): 0a036e5

Add new logic for checking text length

Browse files
Files changed (1) hide show
  1. transcribe/strategy.py +20 -10
transcribe/strategy.py CHANGED
@@ -160,7 +160,7 @@ class TranscriptBuffer:
160
 
161
  def __init__(self, source_lang:str, separator:str):
162
  self._segments: List[str] = collections.deque(maxlen=2) # 确认的完整段落
163
- self._sentences: List[str] = [] # 当前段落中的短句
164
  self._buffer: str = "" # 当前缓冲中的文本
165
  self._current_seg_id: int = 0
166
  self.source_language = source_lang
@@ -189,17 +189,27 @@ class TranscriptBuffer:
189
  Args:
190
  end_of_sentence: 是否为句子结尾(如检测到句号)
191
  """
192
- if self._sentences:
193
- self._segments.append("".join(self._sentences))
194
- self._sentences.clear()
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  def rebuild(self, text):
197
  output = self.split_and_join(
198
  text.replace(
199
  self._separator, ""))
200
- if output != text:
201
- logger.debug(f"Source string: {text.replace(self._separator, '')}")
202
- logger.debug(f"Output string: {output}")
203
  return output
204
 
205
  @staticmethod
@@ -241,7 +251,7 @@ class TranscriptBuffer:
241
  current_text_len = len(self.current_not_commit_text.split(self._separator)) if self._separator else len(self.current_not_commit_text)
242
  # current_text_len = len(self.current_not_commit_text.split(self._separator))
243
  self.update_pending_text(remaining_string)
244
- if current_text_len >=20:
245
  self.commit_paragraph()
246
  self._current_seg_id += 1
247
  return True
@@ -255,7 +265,7 @@ class TranscriptBuffer:
255
  @property
256
  def un_commit_paragraph(self) -> str:
257
  """当前短句组合"""
258
- return "".join(self._sentences)
259
 
260
  @property
261
  def pending_text(self) -> str:
@@ -299,7 +309,7 @@ class TranscriptStabilityAnalyzer:
299
  return
300
 
301
  # yield from self._handle_short_buffer(current, prev)
302
- if buffer_duration < 4:
303
  yield from self._handle_short_buffer(current, prev)
304
  else:
305
  yield from self._handle_long_buffer(current)
 
160
 
161
  def __init__(self, source_lang:str, separator:str):
162
  self._segments: List[str] = collections.deque(maxlen=2) # 确认的完整段落
163
+ self._sentences: List[str] = collections.deque() # 当前段落中的短句
164
  self._buffer: str = "" # 当前缓冲中的文本
165
  self._current_seg_id: int = 0
166
  self.source_language = source_lang
 
189
  Args:
190
  end_of_sentence: 是否为句子结尾(如检测到句号)
191
  """
192
+
193
+ count = 0
194
+ current_sentences = []
195
+ while len(self._sentences) and count <=20:
196
+ item = self._sentences.popleft()
197
+ current_sentences.append(item)
198
+ count += len(item.split(self._separator))
199
+ if current_sentences:
200
+ self._segments.append("".join(current_sentences))
201
+ logger.debug(f"=== count to paragraph ===")
202
+ logger.debug(f"push: {current_sentences}")
203
+ logger.debug(f"rest: {self._sentences}")
204
+ # if self._sentences:
205
+ # self._segments.append("".join(self._sentences))
206
+ # self._sentences.clear()
207
 
208
  def rebuild(self, text):
209
  output = self.split_and_join(
210
  text.replace(
211
  self._separator, ""))
212
+
 
 
213
  return output
214
 
215
  @staticmethod
 
251
  current_text_len = len(self.current_not_commit_text.split(self._separator)) if self._separator else len(self.current_not_commit_text)
252
  # current_text_len = len(self.current_not_commit_text.split(self._separator))
253
  self.update_pending_text(remaining_string)
254
+ if current_text_len >= 20:
255
  self.commit_paragraph()
256
  self._current_seg_id += 1
257
  return True
 
265
  @property
266
  def un_commit_paragraph(self) -> str:
267
  """当前短句组合"""
268
+ return "".join([i for i in self._sentences])
269
 
270
  @property
271
  def pending_text(self) -> str:
 
309
  return
310
 
311
  # yield from self._handle_short_buffer(current, prev)
312
+ if buffer_duration <= 4:
313
  yield from self._handle_short_buffer(current, prev)
314
  else:
315
  yield from self._handle_long_buffer(current)