daihui.zhang
committed on
Commit
·
5c84c3c
1
Parent(s):
0a036e5
Add new logic for checking text length
Browse files - transcribe/strategy.py +20 -10
transcribe/strategy.py
CHANGED
|
@@ -160,7 +160,7 @@ class TranscriptBuffer:
|
|
| 160 |
|
| 161 |
def __init__(self, source_lang:str, separator:str):
|
| 162 |
self._segments: List[str] = collections.deque(maxlen=2) # 确认的完整段落
|
| 163 |
-
self._sentences: List[str] =
|
| 164 |
self._buffer: str = "" # 当前缓冲中的文本
|
| 165 |
self._current_seg_id: int = 0
|
| 166 |
self.source_language = source_lang
|
|
@@ -189,17 +189,27 @@ class TranscriptBuffer:
|
|
| 189 |
Args:
|
| 190 |
end_of_sentence: 是否为句子结尾(如检测到句号)
|
| 191 |
"""
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
def rebuild(self, text):
|
| 197 |
output = self.split_and_join(
|
| 198 |
text.replace(
|
| 199 |
self._separator, ""))
|
| 200 |
-
|
| 201 |
-
logger.debug(f"Source string: {text.replace(self._separator, '')}")
|
| 202 |
-
logger.debug(f"Output string: {output}")
|
| 203 |
return output
|
| 204 |
|
| 205 |
@staticmethod
|
|
@@ -241,7 +251,7 @@ class TranscriptBuffer:
|
|
| 241 |
current_text_len = len(self.current_not_commit_text.split(self._separator)) if self._separator else len(self.current_not_commit_text)
|
| 242 |
# current_text_len = len(self.current_not_commit_text.split(self._separator))
|
| 243 |
self.update_pending_text(remaining_string)
|
| 244 |
-
if current_text_len >=20:
|
| 245 |
self.commit_paragraph()
|
| 246 |
self._current_seg_id += 1
|
| 247 |
return True
|
|
@@ -255,7 +265,7 @@ class TranscriptBuffer:
|
|
| 255 |
@property
|
| 256 |
def un_commit_paragraph(self) -> str:
|
| 257 |
"""当前短句组合"""
|
| 258 |
-
return "".join(self._sentences)
|
| 259 |
|
| 260 |
@property
|
| 261 |
def pending_text(self) -> str:
|
|
@@ -299,7 +309,7 @@ class TranscriptStabilityAnalyzer:
|
|
| 299 |
return
|
| 300 |
|
| 301 |
# yield from self._handle_short_buffer(current, prev)
|
| 302 |
-
if buffer_duration
|
| 303 |
yield from self._handle_short_buffer(current, prev)
|
| 304 |
else:
|
| 305 |
yield from self._handle_long_buffer(current)
|
|
|
|
| 160 |
|
| 161 |
def __init__(self, source_lang:str, separator:str):
|
| 162 |
self._segments: List[str] = collections.deque(maxlen=2) # 确认的完整段落
|
| 163 |
+
self._sentences: List[str] = collections.deque() # 当前段落中的短句
|
| 164 |
self._buffer: str = "" # 当前缓冲中的文本
|
| 165 |
self._current_seg_id: int = 0
|
| 166 |
self.source_language = source_lang
|
|
|
|
| 189 |
Args:
|
| 190 |
end_of_sentence: 是否为句子结尾(如检测到句号)
|
| 191 |
"""
|
| 192 |
+
|
| 193 |
+
count = 0
|
| 194 |
+
current_sentences = []
|
| 195 |
+
while len(self._sentences) and count <=20:
|
| 196 |
+
item = self._sentences.popleft()
|
| 197 |
+
current_sentences.append(item)
|
| 198 |
+
count += len(item.split(self._separator))
|
| 199 |
+
if current_sentences:
|
| 200 |
+
self._segments.append("".join(current_sentences))
|
| 201 |
+
logger.debug(f"=== count to paragraph ===")
|
| 202 |
+
logger.debug(f"push: {current_sentences}")
|
| 203 |
+
logger.debug(f"rest: {self._sentences}")
|
| 204 |
+
# if self._sentences:
|
| 205 |
+
# self._segments.append("".join(self._sentences))
|
| 206 |
+
# self._sentences.clear()
|
| 207 |
|
| 208 |
def rebuild(self, text):
|
| 209 |
output = self.split_and_join(
|
| 210 |
text.replace(
|
| 211 |
self._separator, ""))
|
| 212 |
+
|
|
|
|
|
|
|
| 213 |
return output
|
| 214 |
|
| 215 |
@staticmethod
|
|
|
|
| 251 |
current_text_len = len(self.current_not_commit_text.split(self._separator)) if self._separator else len(self.current_not_commit_text)
|
| 252 |
# current_text_len = len(self.current_not_commit_text.split(self._separator))
|
| 253 |
self.update_pending_text(remaining_string)
|
| 254 |
+
if current_text_len >= 20:
|
| 255 |
self.commit_paragraph()
|
| 256 |
self._current_seg_id += 1
|
| 257 |
return True
|
|
|
|
| 265 |
@property
|
| 266 |
def un_commit_paragraph(self) -> str:
|
| 267 |
"""当前短句组合"""
|
| 268 |
+
return "".join([i for i in self._sentences])
|
| 269 |
|
| 270 |
@property
|
| 271 |
def pending_text(self) -> str:
|
|
|
|
| 309 |
return
|
| 310 |
|
| 311 |
# yield from self._handle_short_buffer(current, prev)
|
| 312 |
+
if buffer_duration <= 4:
|
| 313 |
yield from self._handle_short_buffer(current, prev)
|
| 314 |
else:
|
| 315 |
yield from self._handle_long_buffer(current)
|