dkounadis
/

artificial-styletts2

@@ -164,6 +164,13 @@ def tts_multi_sentence(precomputed_style_vector=None,
     if precomputed_style_vector is not None:
         x = []
         for _sentence in text:
             x.append(msinference.inference(_sentence,
                         precomputed_style_vector)
                      )

     if precomputed_style_vector is not None:
         x = []
         for _sentence in text:
+            # StyleTTS2 - pronunciation fix
+            _sentence = _sentence.lower()  # .replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
+            if 'vctk_low#p326' in voice:
+                # respell 'abstract' for this voice: it presumably comes out slurred ("AAABS TRAACT") otherwise
+                _sentence = _sentence.replace('abstract', 'ahbstract')  # 'ahstract'
             x.append(msinference.inference(_sentence,
                         precomputed_style_vector)
                      )

assets/audiobook_TTS.docx CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:bbdcb8fe14e0713954e3fa49dc53eaca041c2ac6cfa6de098e892f5a7da38c27
-size 221307

 version https://git-lfs.github.com/spec/v1
+oid sha256:7499070a3e0b743e102cf6181b22544ba4febd6fc57f12757187b9c85554502f
+size 205578

msinference.py CHANGED Viewed

@@ -49,7 +49,7 @@ textclenaer = TextCleaner()
 to_mel = torchaudio.transforms.MelSpectrogram(
-    n_mels=80, n_fft=2048, win_length=1200, hop_length=300)
 mean, std = -4, 4
 # START UTIL
@@ -162,7 +162,6 @@ def inference(text,
               use_gruut=False):
     # Ignore .,; AT end of sentence; or just [-50:]
     text = text.strip()
     ps = global_phonemizer.phonemize([text])
@@ -240,10 +239,14 @@ def inference(text,
         x = model.decoder(asr,
                                 F0_pred, N_pred, ref.squeeze().unsqueeze(0))
-    x = x.squeeze().cpu().numpy()[..., :-2504] # weird pulse at the end of sentences
-    x /= np.abs(x).max() + 1e-7
     return x
@@ -434,7 +437,7 @@ def foreign(text=None,   # list of text
             uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
             _t = text_mapper.uromanize(_t, uroman_pl)
-        _t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
         _t = text_mapper.filter_oov(_t, lang=lang)

 to_mel = torchaudio.transforms.MelSpectrogram(
+    n_mels=80, n_fft=2048, win_length=1200, hop_length=328)
 mean, std = -4, 4
 # START UTIL
               use_gruut=False):
     # Ignore .,; AT end of sentence; or just [-50:]
     text = text.strip()
     ps = global_phonemizer.phonemize([text])
         x = model.decoder(asr,
                                 F0_pred, N_pred, ref.squeeze().unsqueeze(0))
+    x = x.cpu().numpy()[0, 0, :-400] # weird pulse at the end of sentences
+    print(x.shape,' A')
+    if x.shape[0] > 10:
+        x /= np.abs(x).max() + 1e-7
+    else:
+        print('\n\n\n\n\nEMPTY TTS\n\n\n\n\n\nn', x.shape)
+        x = np.zeros(0)
     return x
             uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
             _t = text_mapper.uromanize(_t, uroman_pl)
+        _t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')  # Parse STTS2 pronounciation on tts_mult()
         _t = text_mapper.filter_oov(_t, lang=lang)