visuals for 3mimic & 1human - draft
Browse files- mimic3_make_harvard_sentences.py +127 -231
mimic3_make_harvard_sentences.py
CHANGED
|
@@ -1,10 +1,3 @@
|
|
| 1 |
-
# 1. Synthesize Harvard Sentences via Mimic-3 - 1 voice
|
| 2 |
-
# 1. Synthesize via StyleTTS2 --> use same or sweetdreams
|
| 3 |
-
# 2. Run audinterface on this 767
|
| 4 |
-
# 3. .mimic3_pkl .styletts2_pkl -> different durations
|
| 5 |
-
|
| 6 |
-
# It may crash with an "onnx protobuf incomplete file" error because shutil.copyfile() is not truly blocking
|
| 7 |
-
# You have to rerun the script - it will copy all voices from hf:mimic3-voices to ~/.local/mimic3
|
| 8 |
import shutil
|
| 9 |
import csv
|
| 10 |
import io
|
|
@@ -12,6 +5,7 @@ import os
|
|
| 12 |
import typing
|
| 13 |
import wave
|
| 14 |
import sys
|
|
|
|
| 15 |
from mimic3_tts.__main__ import (CommandLineInterfaceState,
|
| 16 |
get_args,
|
| 17 |
initialize_args,
|
|
@@ -21,7 +15,7 @@ from mimic3_tts.__main__ import (CommandLineInterfaceState,
|
|
| 21 |
shutdown_tts,
|
| 22 |
OutputNaming,
|
| 23 |
process_line)
|
| 24 |
-
|
| 25 |
import time
|
| 26 |
import json
|
| 27 |
import pandas as pd
|
|
@@ -39,31 +33,44 @@ import audiofile
|
|
| 39 |
|
| 40 |
|
| 41 |
# ================================================ LIST OF VOICES
|
| 42 |
-
ROOT_DIR = '/data/dkounadis/mimic3-voices/'
|
| 43 |
-
foreign_voices = []
|
| 44 |
-
english_voices = []
|
| 45 |
-
for lang in os.listdir(ROOT_DIR + 'voices'):
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 67 |
# ================================================== INTERFACE MODELS
|
| 68 |
LABELS = [
|
| 69 |
'arousal', 'dominance', 'valence',
|
|
@@ -156,8 +163,8 @@ interface = audinterface.Feature(
|
|
| 156 |
process_func=process_function,
|
| 157 |
# process_func_args={'outputs': 'logits_scene'},
|
| 158 |
process_func_applies_sliding_window=False,
|
| 159 |
-
win_dur=
|
| 160 |
-
hop_dur=
|
| 161 |
sampling_rate=16000,
|
| 162 |
resample=True,
|
| 163 |
verbose=True,
|
|
@@ -168,38 +175,6 @@ interface = audinterface.Feature(
|
|
| 168 |
|
| 169 |
|
| 170 |
|
| 171 |
-
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
# Filter insufficient durations - prompt
|
| 188 |
-
foreign_voices = [i for i in foreign_voices if i not in ['bn/multi_low#02194',
|
| 189 |
-
'uk_UK/m-ailabs_low#obruchov',
|
| 190 |
-
'uk_UK/m-ailabs_low#shepel',
|
| 191 |
-
'uk_UK/m-ailabs_low#loboda',
|
| 192 |
-
'uk_UK/m-ailabs_low#miskun',
|
| 193 |
-
'uk_UK/m-ailabs_low#sumska',
|
| 194 |
-
'uk_UK/m-ailabs_low#pysariev',
|
| 195 |
-
]]
|
| 196 |
-
|
| 197 |
-
# print(english_voices, '\n_________________________\n', foreign_voices)
|
| 198 |
-
# ----------------------
|
| 199 |
-
# print(foreign_voices.keys(), len(foreign_voices))
|
| 200 |
-
# raise SystemExit
|
| 201 |
-
|
| 202 |
-
|
| 203 |
def process_lines(state: CommandLineInterfaceState, wav_path=None):
|
| 204 |
'''MIMIC3 INTERNAL CALL that yields the sigh sound'''
|
| 205 |
|
|
@@ -264,114 +239,36 @@ def process_lines(state: CommandLineInterfaceState, wav_path=None):
|
|
| 264 |
# https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
|
| 265 |
|
| 266 |
# STYLES Already Made - HF
|
| 267 |
-
|
| 268 |
-
|
| 269 |
|
| 270 |
-
Path(english_dir).mkdir(parents=True, exist_ok=True)
|
| 271 |
-
Path(foreign_dir).mkdir(parents=True, exist_ok=True)
|
| 272 |
|
| 273 |
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
# # state.stdout = True
|
| 280 |
-
# # state.tts = True
|
| 281 |
-
# process_lines(state, wav_path='tmp1.wav')
|
| 282 |
-
# shutdown_tts(state)
|
| 283 |
-
# x, fs = audiofile.read('tmp1.wav')
|
| 284 |
-
# total_audio_mimic3.append(x)
|
| 285 |
-
# print(fs, text, 'mimic3')
|
| 286 |
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
#
|
| 295 |
-
#
|
| 296 |
-
|
| 297 |
-
#
|
| 298 |
-
#
|
| 299 |
-
|
| 300 |
-
#
|
| 301 |
-
#
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
|
| 333 |
-
|
| 334 |
-
|
| 335 |
-
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
|
| 343 |
-
# load all harvard and for every voice -> load-its-style -> synth-mimic3 -> synth-stylett2 -> run-both-pkl
|
| 344 |
-
# FOREIGN
|
| 345 |
-
for folder, list_voices in [
|
| 346 |
-
['foreign', foreign_voices],
|
| 347 |
-
['english', english_voices],
|
| 348 |
-
]:
|
| 349 |
-
print(folder, list_voices[:4], '\n\nEEEEEEEEEEEEEEEEEEEEEEEEEEEEEE')
|
| 350 |
-
for _id, _voice in enumerate(list_voices[:4]):
|
| 351 |
-
_str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
|
| 352 |
-
_dir = folder + '_pkl/'
|
| 353 |
-
if 'cmu-arctic' in _str:
|
| 354 |
-
_str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
|
| 355 |
-
|
| 356 |
-
print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
|
| 357 |
-
|
| 358 |
-
if (
|
| 359 |
-
not os.path.isfile(_dir + 'mimic3__' + _str + '.wav') or
|
| 360 |
-
not os.path.isfile(_dir + 'styletts2__' + _str + '.wav')
|
| 361 |
-
):
|
| 362 |
-
|
| 363 |
-
# Mimic3 GitHub quota exceeded:
|
| 364 |
-
# https://github.com/MycroftAI/mimic3-voices
|
| 365 |
-
# Above repo can exceed download quota of LFS
|
| 366 |
-
# Copy mimic-voices from local copies
|
| 367 |
-
# clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
|
| 368 |
-
# copy to ~/
|
| 369 |
-
#
|
| 370 |
-
#
|
| 371 |
home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
|
| 372 |
Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
|
| 373 |
-
|
| 374 |
-
|
| 375 |
speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
|
| 376 |
|
| 377 |
|
|
@@ -386,53 +283,28 @@ for folder, list_voices in [
|
|
| 386 |
f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
|
| 387 |
home_voice_dir + 'generator.onnx')
|
| 388 |
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
# pre made
|
| 392 |
-
prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'
|
| 393 |
-
|
| 394 |
-
|
| 395 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 396 |
|
|
|
|
| 397 |
|
|
|
|
|
|
|
| 398 |
|
| 399 |
|
|
|
|
| 400 |
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
# ACTUAL TTS
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
with open('harvard.json', 'r') as f:
|
| 409 |
-
harvard_individual_sentences = json.load(f)['sentences']
|
| 410 |
-
total_audio_mimic3 = []
|
| 411 |
-
total_audio_stts2 = []
|
| 412 |
-
ix = 0
|
| 413 |
-
for list_of_10 in harvard_individual_sentences[:1]: # 77
|
| 414 |
-
text = ' '.join(list_of_10['sentences'])
|
| 415 |
-
# harvard.append(long_sentence.replace('.', ' '))
|
| 416 |
-
# for text in list_of_10['sentences']:
|
| 417 |
-
style_vec = msinference.compute_style(prompt_path)
|
| 418 |
-
print(ix, text)
|
| 419 |
-
ix += 1
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
x = msinference.inference(text,
|
| 423 |
-
style_vec,
|
| 424 |
-
alpha=0.3,
|
| 425 |
-
beta=0.7,
|
| 426 |
-
diffusion_steps=7,
|
| 427 |
-
embedding_scale=1)
|
| 428 |
-
|
| 429 |
-
total_audio_stts2.append(x)
|
| 430 |
-
|
| 431 |
-
# also synthesize mimic with the same sentence and voice
|
| 432 |
-
|
| 433 |
-
# MIMIC-3 = = = = = = = = = = = = = = BEGIN
|
| 434 |
-
|
| 435 |
-
rate = 1 # high speed sounds nice if used as speaker-reference audio for StyleTTS2
|
| 436 |
_ssml = (
|
| 437 |
'<speak>'
|
| 438 |
'<prosody volume=\'64\'>'
|
|
@@ -472,51 +344,75 @@ for folder, list_voices in [
|
|
| 472 |
process_lines(state, wav_path='tmp1.wav')
|
| 473 |
shutdown_tts(state)
|
| 474 |
x, fs = audiofile.read('tmp1.wav')
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 479 |
|
| 480 |
|
| 481 |
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
|
| 485 |
-
total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
|
| 486 |
-
audiofile.write(_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 24000)
|
| 487 |
|
| 488 |
-
total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
|
| 489 |
-
audiofile.write(_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 22050)
|
| 490 |
|
| 491 |
-
print('Saving:', _dir + 'mimic3__' + _str + '.wav')
|
| 492 |
-
else:
|
| 493 |
-
print('Skip:', _dir + 'styletts2__' + _str + '.wav')
|
| 494 |
|
| 495 |
-
|
| 496 |
-
# AUD I N T E R F A C E
|
| 497 |
-
# file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
|
| 498 |
|
| 499 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 500 |
|
| 501 |
-
for engine in ['mimic3', 'styletts2']:
|
| 502 |
-
harvard_of_voice = f'{_dir}{engine}__{_str}'
|
| 503 |
-
if not os.path.exists(harvard_of_voice + '.pkl'):
|
| 504 |
-
df = interface.process_file(harvard_of_voice + '.wav')
|
| 505 |
-
df.to_pickle(harvard_of_voice + '.pkl')
|
| 506 |
-
else:
|
| 507 |
-
# df = pd.read_pickle(harvard_of_voice + '.pkl')
|
| 508 |
-
print(harvard_of_voice + '.pkl', 'FOUND')
|
| 509 |
|
| 510 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 511 |
|
| 512 |
|
| 513 |
|
| 514 |
-
|
|
|
|
| 515 |
|
| 516 |
|
| 517 |
|
| 518 |
|
| 519 |
-
|
| 520 |
|
| 521 |
# ===============================================================================
|
| 522 |
# V I S U A L S
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import shutil
|
| 2 |
import csv
|
| 3 |
import io
|
|
|
|
| 5 |
import typing
|
| 6 |
import wave
|
| 7 |
import sys
|
| 8 |
+
import audresample
|
| 9 |
from mimic3_tts.__main__ import (CommandLineInterfaceState,
|
| 10 |
get_args,
|
| 11 |
initialize_args,
|
|
|
|
| 15 |
shutdown_tts,
|
| 16 |
OutputNaming,
|
| 17 |
process_line)
|
| 18 |
+
import msinference
|
| 19 |
import time
|
| 20 |
import json
|
| 21 |
import pandas as pd
|
|
|
|
| 33 |
|
| 34 |
|
| 35 |
# ================================================ LIST OF VOICES
|
| 36 |
+
# ROOT_DIR = '/data/dkounadis/mimic3-voices/'
|
| 37 |
+
# foreign_voices = []
|
| 38 |
+
# english_voices = []
|
| 39 |
+
# for lang in os.listdir(ROOT_DIR + 'voices'):
|
| 40 |
|
| 41 |
+
# for voice in os.listdir(ROOT_DIR + 'voices/' + lang):
|
| 42 |
+
# if 'en_' in lang:
|
| 43 |
+
|
| 44 |
+
# try:
|
| 45 |
+
# with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
|
| 46 |
+
# for spk in f:
|
| 47 |
+
# english_voices.append(lang + '/' + voice + '#' + spk.rstrip())
|
| 48 |
+
# # voice_spk_string = lang + '/' + voice + '#' + spk.rstrip() for spk in f
|
| 49 |
+
# except FileNotFoundError:
|
| 50 |
+
# english_voices.append(lang + '/' + voice)
|
| 51 |
+
|
| 52 |
+
# else:
|
| 53 |
|
| 54 |
+
# try:
|
| 55 |
+
# with open(ROOT_DIR + 'voices/' + lang + '/' + voice + '/speakers.txt', 'r') as f:
|
| 56 |
+
# for spk in f:
|
| 57 |
+
# foreign_voices.append(lang + '/' + voice + '#' + spk.rstrip())
|
| 58 |
|
| 59 |
+
# except FileNotFoundError:
|
| 60 |
+
# foreign_voices.append(lang + '/' + voice)
|
| 61 |
+
# #
|
| 62 |
+
# [print(i) for i in foreign_voices]
|
| 63 |
+
# print('\n_______________________________\n')
|
| 64 |
+
# [print(i) for i in english_voices]
|
| 65 |
+
# ====================================================== END PRINT LIST OF VOICES
|
| 66 |
+
list_voices = [
|
| 67 |
+
'en_US/m-ailabs_low#mary_ann',
|
| 68 |
+
'en_UK/apope_low',
|
| 69 |
+
'de_DE/thorsten-emotion_low#neutral', # is the 4x really interesting we can just write it in Section
|
| 70 |
+
'human'
|
| 71 |
+
] # special - for human we load specific style file - no Mimic3 is run
|
| 72 |
+
|
| 73 |
+
|
| 74 |
# ================================================== INTERFACE MODELS
|
| 75 |
LABELS = [
|
| 76 |
'arousal', 'dominance', 'valence',
|
|
|
|
| 163 |
process_func=process_function,
|
| 164 |
# process_func_args={'outputs': 'logits_scene'},
|
| 165 |
process_func_applies_sliding_window=False,
|
| 166 |
+
win_dur=7.0,
|
| 167 |
+
hop_dur=4.0,
|
| 168 |
sampling_rate=16000,
|
| 169 |
resample=True,
|
| 170 |
verbose=True,
|
|
|
|
| 175 |
|
| 176 |
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
def process_lines(state: CommandLineInterfaceState, wav_path=None):
|
| 179 |
'''MIMIC3 INTERNAL CALL that yields the sigh sound'''
|
| 180 |
|
|
|
|
| 239 |
# https://huggingface.co/dkounadis/artificial-styletts2/tree/main/mimic3_foreign
|
| 240 |
|
| 241 |
# STYLES Already Made - HF
|
| 242 |
+
out_dir = 'out_dir/'
|
| 243 |
+
Path(out_dir).mkdir(parents=True, exist_ok=True)
|
| 244 |
|
|
|
|
|
|
|
| 245 |
|
| 246 |
|
| 247 |
+
for _id, _voice in enumerate(list_voices):
|
| 248 |
+
_str = _voice.replace('/', '_').replace('#', '_').replace('_low', '')
|
| 249 |
+
|
| 250 |
+
if 'cmu-arctic' in _str:
|
| 251 |
+
_str = _str.replace('cmu-arctic', 'cmu_arctic') #+ '.wav'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
|
| 253 |
+
print('\n\n\n\nExecuting', _voice,'\n\n\n\n\n')
|
| 254 |
+
|
| 255 |
+
if (
|
| 256 |
+
not os.path.isfile(out_dir + 'mimic3__' + _str + '.wav') or
|
| 257 |
+
not os.path.isfile(out_dir + 'styletts2__' + _str + '.wav')
|
| 258 |
+
):
|
| 259 |
+
|
| 260 |
+
# Mimic3 GitHub quota exceeded:
|
| 261 |
+
# https://github.com/MycroftAI/mimic3-voices
|
| 262 |
+
# Above repo can exceed download quota of LFS
|
| 263 |
+
# Copy mimic-voices from local copies
|
| 264 |
+
# clone https://huggingface.co/mukowaty/mimic3-voices/tree/main/voices
|
| 265 |
+
# copy to ~/
|
| 266 |
+
#
|
| 267 |
+
#
|
| 268 |
+
if 'human' not in _voice:
|
| 269 |
+
# assure mimic-3 generator .onnx exists
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 270 |
home_voice_dir = f'/home/audeering.local/dkounadis/.local/share/mycroft/mimic3/voices/{_voice.split("#")[0]}/'
|
| 271 |
Path(home_voice_dir).mkdir(parents=True, exist_ok=True)
|
|
|
|
|
|
|
| 272 |
speaker_free_voice_name = _voice.split("#")[0] if '#' in _voice else _voice
|
| 273 |
|
| 274 |
|
|
|
|
| 283 |
f'/data/dkounadis/mimic3-voices/voices/{speaker_free_voice_name}/generator.onnx',
|
| 284 |
home_voice_dir + 'generator.onnx')
|
| 285 |
|
| 286 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
|
| 288 |
+
|
| 289 |
+
# prompt_path = f'mimic3_{folder}_4x/' + _str + '.wav'
|
| 290 |
+
with open('harvard.json', 'r') as f:
|
| 291 |
+
harvard_individual_sentences = json.load(f)['sentences']
|
| 292 |
+
total_audio_mimic3 = []
|
| 293 |
+
total_audio_stts2 = []
|
| 294 |
+
ix = 0
|
| 295 |
+
for list_of_10 in harvard_individual_sentences[:1]: # 77
|
| 296 |
|
| 297 |
+
text = ' '.join(list_of_10['sentences'])
|
| 298 |
|
| 299 |
+
print(ix, text)
|
| 300 |
+
ix += 1
|
| 301 |
|
| 302 |
|
| 303 |
+
# Synthesize with Mimic-3, then use the result as the prompt for StyleTTS2
|
| 304 |
|
| 305 |
+
# MIMIC-3 if _voice is not HUMAN
|
| 306 |
+
if 'human' not in _voice:
|
| 307 |
+
rate = 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
_ssml = (
|
| 309 |
'<speak>'
|
| 310 |
'<prosody volume=\'64\'>'
|
|
|
|
| 344 |
process_lines(state, wav_path='tmp1.wav')
|
| 345 |
shutdown_tts(state)
|
| 346 |
x, fs = audiofile.read('tmp1.wav')
|
| 347 |
+
print(x.shape)
|
| 348 |
+
else:
|
| 349 |
+
# MSP['valence.train.votes'].get().sort_values('7').index[-1]
|
| 350 |
+
human_style = '/cache/audb/msppodcast/2.4.0/fe182b91/Audios/MSP-PODCAST_0235_0053.wav'
|
| 351 |
+
x, fs = audiofile.read(human_style)
|
| 352 |
+
print(x.shape,' human') # crop human to almost mimic-3 duration
|
| 353 |
+
total_audio_mimic3.append(x)
|
| 354 |
+
print(fs, text, 'mimic3')
|
| 355 |
+
|
| 356 |
+
# MIMIC3 = = = = = = = = = = = = = = END
|
| 357 |
|
| 358 |
|
| 359 |
|
| 360 |
+
|
| 361 |
+
# Style prompt: the human recording for the 'human' voice (no Mimic-3 run, so
# 'tmp1.wav' would be stale or missing), otherwise the fresh Mimic-3 output.
style_vec = msinference.compute_style(human_style if 'human' in _voice else 'tmp1.wav')
|
| 362 |
+
|
| 363 |
|
| 364 |
|
| 365 |
+
x = msinference.inference(text,
|
| 366 |
+
style_vec,
|
| 367 |
+
alpha=0.3,
|
| 368 |
+
beta=0.7,
|
| 369 |
+
diffusion_steps=7,
|
| 370 |
+
embedding_scale=1)
|
| 371 |
+
|
| 372 |
+
total_audio_stts2.append(x)
|
| 373 |
|
|
|
|
|
|
|
| 374 |
|
|
|
|
|
|
|
| 375 |
|
|
|
|
|
|
|
|
|
|
| 376 |
|
|
|
|
|
|
|
|
|
|
| 377 |
|
| 378 |
|
| 379 |
+
total_audio_stts2 = np.concatenate(total_audio_stts2) # -- concat 77x lists
|
| 380 |
+
total_audio_stts2 = audresample.resample(total_audio_stts2, original_rate=24000, target_rate=16000)[0] # for audinterface
|
| 381 |
+
audiofile.write(out_dir + 'styletts2__' + _str + '.wav', total_audio_stts2, 16000)
|
| 382 |
+
|
| 383 |
+
total_audio_mimic3 = np.concatenate(total_audio_mimic3) # -- concat 77x lists
|
| 384 |
+
# Resample from the rate actually returned by audiofile.read (`fs`): Mimic-3
# and the human recording are not 24 kHz, so a hard-coded 24000 would
# distort the duration/pitch of the 16 kHz file given to audinterface.
total_audio_mimic3 = audresample.resample(total_audio_mimic3, original_rate=fs, target_rate=16000)[0]
|
| 385 |
+
audiofile.write(out_dir + 'mimic3__' + _str + '.wav', total_audio_mimic3, 16000)
|
| 386 |
+
|
| 387 |
+
print('Saving:', out_dir + 'mimic3__' + _str + '.wav')
|
| 388 |
+
else:
|
| 389 |
+
print('Skip:', out_dir + 'styletts2__' + _str + '.wav')
|
| 390 |
+
|
| 391 |
+
|
| 392 |
+
# AUD I N T E R F A C E
|
| 393 |
+
# file_interface = f'timeseries_{long_audio.replace("/", "")}.pkl'
|
| 394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 395 |
|
| 396 |
|
| 397 |
+
for engine in ['mimic3', 'styletts2']:
|
| 398 |
+
harvard_of_voice = f'{out_dir}{engine}__{_str}'
|
| 399 |
+
if not os.path.exists(harvard_of_voice + '.pkl'):
|
| 400 |
+
df = interface.process_file(harvard_of_voice + '.wav')
|
| 401 |
+
df.to_pickle(harvard_of_voice + '.pkl')
|
| 402 |
+
else:
|
| 403 |
+
# df = pd.read_pickle(harvard_of_voice + '.pkl')
|
| 404 |
+
print(harvard_of_voice + '.pkl', 'FOUND')
|
| 405 |
+
|
| 406 |
|
| 407 |
|
| 408 |
|
| 409 |
+
|
| 410 |
+
|
| 411 |
|
| 412 |
|
| 413 |
|
| 414 |
|
| 415 |
+
print('\nVisuals\n')
|
| 416 |
|
| 417 |
# ===============================================================================
|
| 418 |
# V I S U A L S
|