daihui.zhang
committed on
Commit
·
0a036e5
1
Parent(s):
2c5a26c
drop only-punctuation audio chunks in analysis
Browse files- transcribe/strategy.py +9 -1
transcribe/strategy.py
CHANGED
|
@@ -76,10 +76,15 @@ class TranscriptChunk:
|
|
| 76 |
|
| 77 |
# 每个切分点向后移一个索引,表示“分隔符归前段”
|
| 78 |
cut_points = [0] + sorted(i + 1 for i in indexes) + [len(self.items)]
|
| 79 |
-
|
| 80 |
TranscriptChunk(items=self.items[start:end], separator=self.separator)
|
| 81 |
for start, end in zip(cut_points, cut_points[1:])
|
| 82 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
|
| 84 |
|
| 85 |
def get_split_first_rest(self, mode: SplitMode):
|
|
@@ -109,6 +114,9 @@ class TranscriptChunk:
|
|
| 109 |
logger.debug(f"Compare: {self.join()} vs {chunk.join()} : {score}")
|
| 110 |
return score
|
| 111 |
|
|
|
|
|
|
|
|
|
|
| 112 |
def has_punctuation(self) -> bool:
|
| 113 |
return any(seg.is_punctuation() for seg in self.items)
|
| 114 |
|
|
|
|
| 76 |
|
| 77 |
# 每个切分点向后移一个索引,表示“分隔符归前段”
|
| 78 |
cut_points = [0] + sorted(i + 1 for i in indexes) + [len(self.items)]
|
| 79 |
+
chunks = [
|
| 80 |
TranscriptChunk(items=self.items[start:end], separator=self.separator)
|
| 81 |
for start, end in zip(cut_points, cut_points[1:])
|
| 82 |
]
|
| 83 |
+
return [
|
| 84 |
+
ck
|
| 85 |
+
for ck in chunks
|
| 86 |
+
if not ck.only_punctuation()
|
| 87 |
+
]
|
| 88 |
|
| 89 |
|
| 90 |
def get_split_first_rest(self, mode: SplitMode):
|
|
|
|
| 114 |
logger.debug(f"Compare: {self.join()} vs {chunk.join()} : {score}")
|
| 115 |
return score
|
| 116 |
|
| 117 |
+
def only_punctuation(self)->bool:
|
| 118 |
+
return all(seg.is_punctuation() for seg in self.items)
|
| 119 |
+
|
| 120 |
def has_punctuation(self) -> bool:
|
| 121 |
return any(seg.is_punctuation() for seg in self.items)
|
| 122 |
|