Rename argument `scene` to `soundscape`
Browse files- api.py +14 -14
- msinference.py +19 -11
- tts.py +16 -10
api.py
CHANGED
|
@@ -18,7 +18,7 @@ from flask_cors import CORS
|
|
| 18 |
from moviepy.editor import *
|
| 19 |
from audiocraft.builders import AudioGen
|
| 20 |
CACHE_DIR = 'flask_cache/'
|
| 21 |
-
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same
|
| 22 |
|
| 23 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
| 24 |
|
|
@@ -82,14 +82,14 @@ def _shift(x):
|
|
| 82 |
# fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
|
| 83 |
return x #* fade_in # silence this
|
| 84 |
|
| 85 |
-
def overlay(x,
|
| 86 |
|
| 87 |
-
if
|
| 88 |
|
| 89 |
# SOUNDS
|
| 90 |
-
print(f'AudioGen {NUM_SOUND_GENERATIONS} x {
|
| 91 |
background = sound_generator.generate(
|
| 92 |
-
[
|
| 93 |
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
| 94 |
|
| 95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
|
@@ -113,7 +113,7 @@ def overlay(x, scene=None):
|
|
| 113 |
# background = _shift(background)
|
| 114 |
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
| 115 |
f'{np.abs(background.max())=}\n{x.shape=}')
|
| 116 |
-
x = .
|
| 117 |
else:
|
| 118 |
print('sound_background = None')
|
| 119 |
return x
|
|
@@ -121,7 +121,7 @@ def overlay(x, scene=None):
|
|
| 121 |
def tts_multi_sentence(precomputed_style_vector=None,
|
| 122 |
text=None,
|
| 123 |
voice=None,
|
| 124 |
-
|
| 125 |
speed=None):
|
| 126 |
'''create 24kHZ np.array with tts
|
| 127 |
|
|
@@ -129,7 +129,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
| 129 |
to perform affective TTS.
|
| 130 |
text : string
|
| 131 |
voice : string or None (falls to styleTTS)
|
| 132 |
-
|
| 133 |
'''
|
| 134 |
|
| 135 |
|
|
@@ -161,7 +161,7 @@ def tts_multi_sentence(precomputed_style_vector=None,
|
|
| 161 |
|
| 162 |
x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
|
| 163 |
|
| 164 |
-
return overlay(x,
|
| 165 |
|
| 166 |
|
| 167 |
|
|
@@ -201,7 +201,7 @@ def serve_wav():
|
|
| 201 |
affective = r.get('affective')[0],
|
| 202 |
voice = r.get('voice')[0],
|
| 203 |
speed = float(r.get('speed')[0]), # For Non-English MMS TTS
|
| 204 |
-
|
| 205 |
)
|
| 206 |
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
| 207 |
|
|
@@ -399,7 +399,7 @@ def serve_wav():
|
|
| 399 |
pieces.append(tts_multi_sentence(text=[_text_],
|
| 400 |
precomputed_style_vector=precomputed_style_vector,
|
| 401 |
voice=args.voice,
|
| 402 |
-
|
| 403 |
speed=args.speed)
|
| 404 |
)
|
| 405 |
total = np.concatenate(pieces, 0)
|
|
@@ -420,7 +420,7 @@ def serve_wav():
|
|
| 420 |
x = tts_multi_sentence(text=text,
|
| 421 |
precomputed_style_vector=precomputed_style_vector,
|
| 422 |
voice=args.voice,
|
| 423 |
-
|
| 424 |
speed=args.speed)
|
| 425 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
| 426 |
|
|
@@ -439,7 +439,7 @@ def serve_wav():
|
|
| 439 |
x = tts_multi_sentence(text=text,
|
| 440 |
precomputed_style_vector=precomputed_style_vector,
|
| 441 |
voice=args.voice,
|
| 442 |
-
|
| 443 |
speed=args.speed
|
| 444 |
)
|
| 445 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
|
@@ -468,7 +468,7 @@ def serve_wav():
|
|
| 468 |
x = tts_multi_sentence(text=text,
|
| 469 |
precomputed_style_vector=precomputed_style_vector,
|
| 470 |
voice=args.voice,
|
| 471 |
-
|
| 472 |
speed=args.speed)
|
| 473 |
OUT_FILE = 'tmp.wav'
|
| 474 |
soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
|
|
|
|
| 18 |
from moviepy.editor import *
|
| 19 |
from audiocraft.builders import AudioGen
|
| 20 |
CACHE_DIR = 'flask_cache/'
|
| 21 |
+
NUM_SOUND_GENERATIONS = 1 # batch size to generate same text (same soundscape for long video)
|
| 22 |
|
| 23 |
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
|
| 24 |
|
|
|
|
| 82 |
# fade_in = 1 - .5 * np.tanh(-4*(np.linspace(-10, 10, n) - 9.4)) + .5 * np.tanh(4*(np.linspace(-10, 10, n) + 9.4))
|
| 83 |
return x #* fade_in # silence this
|
| 84 |
|
| 85 |
+
def overlay(x, soundscape=None):
|
| 86 |
|
| 87 |
+
if soundscape is not None:
|
| 88 |
|
| 89 |
# SOUNDS
|
| 90 |
+
print(f'AudioGen {NUM_SOUND_GENERATIONS} x {soundscape}')
|
| 91 |
background = sound_generator.generate(
|
| 92 |
+
[soundscape] * NUM_SOUND_GENERATIONS
|
| 93 |
).reshape(-1).detach().cpu().numpy() # bs, 11400
|
| 94 |
|
| 95 |
# upsample 16 kHz AudioGen to 24kHZ StyleTTS
|
|
|
|
| 113 |
# background = _shift(background)
|
| 114 |
print(f'\n====SOUND BACKGROUND SHAPE\n{background.shape=}',
|
| 115 |
f'{np.abs(background.max())=}\n{x.shape=}')
|
| 116 |
+
x = .6 * x + .4 * background[:len(x)]
|
| 117 |
else:
|
| 118 |
print('sound_background = None')
|
| 119 |
return x
|
|
|
|
| 121 |
def tts_multi_sentence(precomputed_style_vector=None,
|
| 122 |
text=None,
|
| 123 |
voice=None,
|
| 124 |
+
soundscape=None,
|
| 125 |
speed=None):
|
| 126 |
'''create 24kHZ np.array with tts
|
| 127 |
|
|
|
|
| 129 |
to perform affective TTS.
|
| 130 |
text : string
|
| 131 |
voice : string or None (falls to styleTTS)
|
| 132 |
+
soundscape : 'A castle in far away lands' -> if passed will generate background sound soundscape
|
| 133 |
'''
|
| 134 |
|
| 135 |
|
|
|
|
| 161 |
|
| 162 |
x /= np.abs(x).max() + 1e-7 # amplify speech to full [-1,1]
|
| 163 |
|
| 164 |
+
return overlay(x, soundscape=soundscape)
|
| 165 |
|
| 166 |
|
| 167 |
|
|
|
|
| 201 |
affective = r.get('affective')[0],
|
| 202 |
voice = r.get('voice')[0],
|
| 203 |
speed = float(r.get('speed')[0]), # For Non-English MMS TTS
|
| 204 |
+
soundscape=r.get('soundscape')[0] if r.get('soundscape') is not None else None,
|
| 205 |
)
|
| 206 |
# print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
|
| 207 |
|
|
|
|
| 399 |
pieces.append(tts_multi_sentence(text=[_text_],
|
| 400 |
precomputed_style_vector=precomputed_style_vector,
|
| 401 |
voice=args.voice,
|
| 402 |
+
soundscape=args.soundscape,
|
| 403 |
speed=args.speed)
|
| 404 |
)
|
| 405 |
total = np.concatenate(pieces, 0)
|
|
|
|
| 420 |
x = tts_multi_sentence(text=text,
|
| 421 |
precomputed_style_vector=precomputed_style_vector,
|
| 422 |
voice=args.voice,
|
| 423 |
+
soundscape=args.soundscape,
|
| 424 |
speed=args.speed)
|
| 425 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
| 426 |
|
|
|
|
| 439 |
x = tts_multi_sentence(text=text,
|
| 440 |
precomputed_style_vector=precomputed_style_vector,
|
| 441 |
voice=args.voice,
|
| 442 |
+
soundscape=args.soundscape,
|
| 443 |
speed=args.speed
|
| 444 |
)
|
| 445 |
soundfile.write(AUDIO_TRACK, x, 24000)
|
|
|
|
| 468 |
x = tts_multi_sentence(text=text,
|
| 469 |
precomputed_style_vector=precomputed_style_vector,
|
| 470 |
voice=args.voice,
|
| 471 |
+
soundscape=args.soundscape,
|
| 472 |
speed=args.speed)
|
| 473 |
OUT_FILE = 'tmp.wav'
|
| 474 |
soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
|
msinference.py
CHANGED
|
@@ -373,13 +373,16 @@ class TextForeign(object):
|
|
| 373 |
def foreign(text=None, # list of text
|
| 374 |
lang='romanian',
|
| 375 |
speed=None):
|
|
|
|
|
|
|
|
|
|
| 376 |
# https://huggingface.co/spaces/mms-meta/MMS
|
| 377 |
|
| 378 |
-
if 'hun' in lang
|
| 379 |
|
| 380 |
lang_code = 'hun'
|
| 381 |
|
| 382 |
-
elif 'ser' in lang
|
| 383 |
|
| 384 |
if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
|
| 385 |
|
|
@@ -389,14 +392,22 @@ def foreign(text=None, # list of text
|
|
| 389 |
|
| 390 |
lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
|
| 391 |
|
| 392 |
-
elif 'rom' in lang
|
| 393 |
|
| 394 |
lang_code = 'ron'
|
| 395 |
speed = 1.24 if speed is None else speed
|
| 396 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 397 |
else:
|
|
|
|
| 398 |
lang_code = lang.split()[0].strip()
|
| 399 |
-
|
|
|
|
|
|
|
| 400 |
print(f'\n\nLANG {lang_code=}\n_____________________\n')
|
| 401 |
vocab_file = hf_hub_download(
|
| 402 |
repo_id="facebook/mms-tts",
|
|
@@ -444,8 +455,10 @@ def foreign(text=None, # list of text
|
|
| 444 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
| 445 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
| 446 |
|
| 447 |
-
_t = _t.lower().replace("ţ", "ț").replace('ț','ts')
|
|
|
|
| 448 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
|
|
|
| 449 |
# print(f'{speed=}\n\n\n\n_______________________________ {_t}')
|
| 450 |
stn_tst = text_mapper.get_text(_t, hps)
|
| 451 |
with torch.no_grad():
|
|
@@ -464,16 +477,11 @@ def foreign(text=None, # list of text
|
|
| 464 |
|
| 465 |
x /= np.abs(x).max() + 1e-7
|
| 466 |
|
| 467 |
-
#
|
| 468 |
-
# x = hyp #, text
|
| 469 |
-
print(x.shape, x.min(), x.max(), hps.data.sampling_rate) # (hps.data.sampling_rate,
|
| 470 |
|
| 471 |
x = audresample.resample(signal=x.astype(np.float32),
|
| 472 |
original_rate=16000,
|
| 473 |
target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
return x
|
| 478 |
|
| 479 |
|
|
|
|
| 373 |
def foreign(text=None, # list of text
|
| 374 |
lang='romanian',
|
| 375 |
speed=None):
|
| 376 |
+
|
| 377 |
+
lang = lang.lower() # https://huggingface.co/dkounadis/artificial-styletts2/blob/main/Utils/all_langs.csv
|
| 378 |
+
|
| 379 |
# https://huggingface.co/spaces/mms-meta/MMS
|
| 380 |
|
| 381 |
+
if 'hun' in lang:
|
| 382 |
|
| 383 |
lang_code = 'hun'
|
| 384 |
|
| 385 |
+
elif 'ser' in lang:
|
| 386 |
|
| 387 |
if has_cyrillic(text[0]): # check 0-th sentence if is cyrillic
|
| 388 |
|
|
|
|
| 392 |
|
| 393 |
lang_code = 'rmc-script_latin' # romani carpathian (has also Vlax)
|
| 394 |
|
| 395 |
+
elif 'rom' in lang:
|
| 396 |
|
| 397 |
lang_code = 'ron'
|
| 398 |
speed = 1.24 if speed is None else speed
|
| 399 |
|
| 400 |
+
elif 'ger' in lang:
|
| 401 |
+
|
| 402 |
+
lang_code = 'deu'
|
| 403 |
+
speed = 1.14 if speed is None else speed
|
| 404 |
+
|
| 405 |
else:
|
| 406 |
+
|
| 407 |
lang_code = lang.split()[0].strip()
|
| 408 |
+
|
| 409 |
+
# Load VITS
|
| 410 |
+
|
| 411 |
print(f'\n\nLANG {lang_code=}\n_____________________\n')
|
| 412 |
vocab_file = hf_hub_download(
|
| 413 |
repo_id="facebook/mms-tts",
|
|
|
|
| 455 |
uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
|
| 456 |
_t = text_mapper.uromanize(_t, uroman_pl)
|
| 457 |
|
| 458 |
+
_t = _t.lower().replace("ţ", "ț").replace('ț','ts').replace('î', 'u')
|
| 459 |
+
|
| 460 |
_t = text_mapper.filter_oov(_t, lang=lang)
|
| 461 |
+
|
| 462 |
# print(f'{speed=}\n\n\n\n_______________________________ {_t}')
|
| 463 |
stn_tst = text_mapper.get_text(_t, hps)
|
| 464 |
with torch.no_grad():
|
|
|
|
| 477 |
|
| 478 |
x /= np.abs(x).max() + 1e-7
|
| 479 |
|
| 480 |
+
# print(x.shape, x.min(), x.max(), hps.data.sampling_rate)
|
|
|
|
|
|
|
| 481 |
|
| 482 |
x = audresample.resample(signal=x.astype(np.float32),
|
| 483 |
original_rate=16000,
|
| 484 |
target_rate=24000)[0, :] # reshapes (64,) -> (1,64)
|
|
|
|
|
|
|
|
|
|
| 485 |
return x
|
| 486 |
|
| 487 |
|
tts.py
CHANGED
|
@@ -42,6 +42,14 @@ def command_line_args():
|
|
| 42 |
default='sample.txt',
|
| 43 |
type=str,
|
| 44 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
parser.add_argument(
|
| 46 |
'--native',
|
| 47 |
help="""
|
|
@@ -83,21 +91,24 @@ def command_line_args():
|
|
| 83 |
|
| 84 |
def send_to_server(args):
|
| 85 |
url = "http://192.168.88.209:5000"
|
|
|
|
|
|
|
| 86 |
|
| 87 |
payload = {
|
| 88 |
'affective': args.affective,
|
| 89 |
'voice': args.voice,
|
|
|
|
| 90 |
'native': args.native,
|
| 91 |
'text': args.text,
|
| 92 |
'image': args.image,
|
| 93 |
'video': args.video,
|
| 94 |
'speed': args.speed,
|
|
|
|
| 95 |
# 'out_file': args.out_file # let serve save as temp
|
| 96 |
}
|
| 97 |
|
| 98 |
-
#
|
| 99 |
-
|
| 100 |
-
# In files= sent actual files if provided
|
| 101 |
text_file = open(args.text, 'rb')
|
| 102 |
|
| 103 |
image_file, video_file, native_file = None, None, None
|
|
@@ -107,7 +118,6 @@ def send_to_server(args):
|
|
| 107 |
image_file = open(args.image, 'rb')
|
| 108 |
except FileNotFoundError:
|
| 109 |
pass
|
| 110 |
-
|
| 111 |
|
| 112 |
if args.video is not None:
|
| 113 |
print('\nLOADING vid\n')
|
|
@@ -122,14 +132,10 @@ def send_to_server(args):
|
|
| 122 |
native_file = open(args.native, 'rb')
|
| 123 |
except FileNotFoundError:
|
| 124 |
pass
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
# --------------------- send this extra
|
| 129 |
|
| 130 |
-
#
|
| 131 |
|
| 132 |
-
response = requests.post(url, data=payload,
|
| 133 |
files=[(args.text, text_file),
|
| 134 |
(args.image, image_file),
|
| 135 |
(args.video, video_file),
|
|
|
|
| 42 |
default='sample.txt',
|
| 43 |
type=str,
|
| 44 |
)
|
| 45 |
+
parser.add_argument(
|
| 46 |
+
'--soundscape',
|
| 47 |
+
help='soundscape - MUST BE IN BRACKETS: \"forest\"',
|
| 48 |
+
default='wind fjord',
|
| 49 |
+
nargs='?',
|
| 50 |
+
type=str,
|
| 51 |
+
const=None,
|
| 52 |
+
)
|
| 53 |
parser.add_argument(
|
| 54 |
'--native',
|
| 55 |
help="""
|
|
|
|
| 91 |
|
| 92 |
def send_to_server(args):
|
| 93 |
url = "http://192.168.88.209:5000"
|
| 94 |
+
|
| 95 |
+
# Args
|
| 96 |
|
| 97 |
payload = {
|
| 98 |
'affective': args.affective,
|
| 99 |
'voice': args.voice,
|
| 100 |
+
'soundscape': args.soundscape,
|
| 101 |
'native': args.native,
|
| 102 |
'text': args.text,
|
| 103 |
'image': args.image,
|
| 104 |
'video': args.video,
|
| 105 |
'speed': args.speed,
|
| 106 |
+
|
| 107 |
# 'out_file': args.out_file # let serve save as temp
|
| 108 |
}
|
| 109 |
|
| 110 |
+
# Send Files
|
| 111 |
+
|
|
|
|
| 112 |
text_file = open(args.text, 'rb')
|
| 113 |
|
| 114 |
image_file, video_file, native_file = None, None, None
|
|
|
|
| 118 |
image_file = open(args.image, 'rb')
|
| 119 |
except FileNotFoundError:
|
| 120 |
pass
|
|
|
|
| 121 |
|
| 122 |
if args.video is not None:
|
| 123 |
print('\nLOADING vid\n')
|
|
|
|
| 132 |
native_file = open(args.native, 'rb')
|
| 133 |
except FileNotFoundError:
|
| 134 |
pass
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
|
| 136 |
+
#
|
| 137 |
|
| 138 |
+
response = requests.post(url, data=payload, # contains str
|
| 139 |
files=[(args.text, text_file),
|
| 140 |
(args.image, image_file),
|
| 141 |
(args.video, video_file),
|