filter if len(TTS) < 10 samples
Browse files- api.py +7 -0
- assets/audiobook_TTS.docx +2 -2
- msinference.py +9 -6
api.py
CHANGED
|
@@ -164,6 +164,13 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
| 164 |
if precomputed_style_vector is not None:
|
| 165 |
x = []
|
| 166 |
for _sentence in text:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 167 |
x.append(msinference.inference(_sentence,
|
| 168 |
precomputed_style_vector)
|
| 169 |
)
|
|
|
|
| 164 |
if precomputed_style_vector is not None:
|
| 165 |
x = []
|
| 166 |
for _sentence in text:
|
| 167 |
+
|
| 168 |
+
# StyleTTS2 - pronunciation Fx
|
| 169 |
+
|
| 170 |
+
_sentence = _sentence.lower() # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
|
| 171 |
+
if 'vctk_low#p326' in voice:
|
| 172 |
+
# fix sounding of sleepy AAABS TRAACT
|
| 173 |
+
_sentence = _sentence.replace('abstract', 'ahbstract') # 'ahstract'
|
| 174 |
x.append(msinference.inference(_sentence,
|
| 175 |
precomputed_style_vector)
|
| 176 |
)
|
assets/audiobook_TTS.docx
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7499070a3e0b743e102cf6181b22544ba4febd6fc57f12757187b9c85554502f
|
| 3 |
+
size 205578
|
msinference.py
CHANGED
|
@@ -49,7 +49,7 @@ textclenaer = TextCleaner()
|
|
| 49 |
|
| 50 |
|
| 51 |
to_mel = torchaudio.transforms.MelSpectrogram(
|
| 52 |
-
n_mels=80, n_fft=2048, win_length=1200, hop_length=
|
| 53 |
mean, std = -4, 4
|
| 54 |
|
| 55 |
# START UTIL
|
|
@@ -162,7 +162,6 @@ def inference(text,
|
|
| 162 |
use_gruut=False):
|
| 163 |
# Ignore .,; AT end of sentence; or just [-50:]
|
| 164 |
|
| 165 |
-
|
| 166 |
text = text.strip()
|
| 167 |
|
| 168 |
ps = global_phonemizer.phonemize([text])
|
|
@@ -240,10 +239,14 @@ def inference(text,
|
|
| 240 |
x = model.decoder(asr,
|
| 241 |
F0_pred, N_pred, ref.squeeze().unsqueeze(0))
|
| 242 |
|
| 243 |
-
x = x.
|
| 244 |
-
|
| 245 |
-
x /= np.abs(x).max() + 1e-7
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
return x
|
| 248 |
|
| 249 |
|
|
@@ -434,7 +437,7 @@ def foreign(text=None, # list of text
|
|
| 434 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
| 435 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
| 436 |
|
| 437 |
-
_t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
|
| 438 |
|
| 439 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
| 440 |
|
|
|
|
| 49 |
|
| 50 |
|
| 51 |
to_mel = torchaudio.transforms.MelSpectrogram(
|
| 52 |
+
n_mels=80, n_fft=2048, win_length=1200, hop_length=328)
|
| 53 |
mean, std = -4, 4
|
| 54 |
|
| 55 |
# START UTIL
|
|
|
|
| 162 |
use_gruut=False):
|
| 163 |
# Ignore .,; AT end of sentence; or just [-50:]
|
| 164 |
|
|
|
|
| 165 |
text = text.strip()
|
| 166 |
|
| 167 |
ps = global_phonemizer.phonemize([text])
|
|
|
|
| 239 |
x = model.decoder(asr,
|
| 240 |
F0_pred, N_pred, ref.squeeze().unsqueeze(0))
|
| 241 |
|
| 242 |
+
x = x.cpu().numpy()[0, 0, :-400] # weird pulse at the end of sentences
|
|
|
|
|
|
|
| 243 |
|
| 244 |
+
print(x.shape,' A')
|
| 245 |
+
if x.shape[0] > 10:
|
| 246 |
+
x /= np.abs(x).max() + 1e-7
|
| 247 |
+
else:
|
| 248 |
+
print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
|
| 249 |
+
x = np.zeros(0)
|
| 250 |
return x
|
| 251 |
|
| 252 |
|
|
|
|
| 437 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
| 438 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
| 439 |
|
| 440 |
+
_t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u') # Parse STTS2 pronunciation on tts_mult()
|
| 441 |
|
| 442 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
| 443 |
|