slow OK 4s live

Files changed:
- README.md (+15 -0)
- live_api.py (+43 -87)
- live_demo.py (+12 -111)
README.md
CHANGED

@@ -124,3 +124,18 @@ From an image and text create a video:
 
 python tts.py --text sample.txt --image assets/image_from_T31.jpg
 ```
+
+
+# Live Demo - Paplay
+
+Server (Flask)
+
+```bash
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python live_api.py
+```
+
+Client (Ubuntu)
+
+```bash
+python live_demo.py  # will ask for text input & play the soundscape
+```
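
For a quick smoke test of the endpoint without the client script, the request can also be issued directly. This is a hedged sketch, not repo code: the `text`/`scene` field names and the raw-WAV response are taken from `live_demo.py` below, and the URL is that script's hard-coded default.

```python
# Hypothetical one-shot client; mirrors the payload live_demo.py sends.
import requests

resp = requests.post("http://192.168.88.209:5000",
                     data={"text": "rain on a tin roof",
                           "scene": "rain on a tin roof"})
resp.raise_for_status()
with open("_gen_.wav", "wb") as f:
    f.write(resp.content)  # server replies with raw WAV bytes (written at 24 kHz)
# play it with e.g.: subprocess.run(["paplay", "_gen_.wav"])
```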

live_api.py
CHANGED

@@ -4,7 +4,7 @@ import numpy as np
 import soundfile
 import audresample
 import text_utils
-
+
 import re
 import srt
 import subprocess

@@ -17,89 +17,67 @@ from flask_cors import CORS
 from audiocraft.audiogen import AudioGen, audio_write
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=
+sound_generator.set_generation_params(duration=4)
+
+
+# ====STYLE VECTOR====
+
+# AFFECTIVE = True
+# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann
+
+# _dir = '/' if AFFECTIVE else '_v2/'
+# precomputed_style_vector = msinference.compute_style(
+#     'assets/wavs/style_vector' + _dir + VOICE.replace(
+#         '/', '_').replace(
+#         '#', '_').replace(
+#         'cmu-arctic', 'cmu_arctic').replace(
+#         '_low', '') + '.wav')
+# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
+
+# ==== STYLE VECTOR
 
 CACHE_DIR = 'flask_cache/'
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
 
-def _shift(x):
-
-    x = np.roll(x, i)
-    # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
-    # x = x * fade_in
-    return x
-
-def overlay(x, sound_background=None):
-    if sound_background is not None:
-        sound_background = sound_background.detach().cpu().numpy()[0, :]
-        len_speech = len(x)
-        if len_speech > len(sound_background):
-            n_repeat = len_speech // len(sound_background) + 1
-            replica = [sound_background] * n_repeat
-            replica = [_shift(_) for _ in replica]
-            sound_background = np.concatenate(replica)
-
-        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
-        x = .74 * x + .26 * sound_background[:len_speech]
-    return x
-
-def tts_multi_sentence(precomputed_style_vector=None,
-                       text=None,
-                       voice=None,
-                       scene=None):
-    '''create 24 kHz np.array with tts
-
-    precomputed_style_vector : required if en_US or en_UK in voice,
-                               to perform affective TTS.
-    text  : string
-    voice : string or None (falls back to StyleTTS)
-    scene : 'A castle in far away lands' -> if passed, generates a background sound scene
-    '''
-    # Generate sound scene - upsample to 24 kHz
+
+def tts_multi_sentence(scene=None):
     if scene is not None:
 
         sound_background = sound_generator.generate([scene])[0]
         sound_background = audio_write(None,
                                        sound_background.cpu(),
-                                       24000,  #
+                                       24000,  # same as the StyleTTS sample rate
                                        strategy="loudness",
-                                       loudness_compressor=True)
+                                       loudness_compressor=True).detach().cpu().numpy()[0, :]
     else:
         sound_background = None
 
-    # StyleTTS2
-    if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
-
-
-        text_utils.store_ssml(text=text, voice=voice)  # Text has to be a list of single sentences
-        ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
-        ps.wait()
-        x, fs = soundfile.read('_tmp.wav')
-        x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
-
-    return overlay(x, sound_background)
+    # # StyleTTS2
+    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
+    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
+    #     x = []
+    #     for _sentence in text:
+    #         x.append(msinference.inference(_sentence,
+    #                                        precomputed_style_vector,
+    #                                        alpha=0.3,
+    #                                        beta=0.7,
+    #                                        diffusion_steps=7,
+    #                                        embedding_scale=1))
+    #     x = np.concatenate(x)
+
+    # return overlay(x, sound_background)
+
+    return sound_background
 
-# voices = {}
-# import phonemizer
-# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
 
 app = Flask(__name__)
 cors = CORS(app)

@@ -120,40 +98,18 @@ def serve_wav():
 
     args = SimpleNamespace(
         text=None if r.get('text') is None else r.get('text'),  # string, not file?
-        voice=r.get('voice')[0],
-        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
-        affective=r.get('affective')[0],
         scene=r.get('scene')[0]
     )
     # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
 
-    print(args, 'ENTER Script')
-    do_video_dub = False
-
-    # ====STYLE VECTOR====
-
-    precomputed_style_vector = None
-    # NOTE: style vector may be None
 
-    if precomputed_style_vector is None:
-        if 'en_US' in args.voice or 'en_UK' in args.voice:
-            _dir = '/' if args.affective else '_v2/'
-            precomputed_style_vector = msinference.compute_style(
-                'assets/wavs/style_vector' + _dir + args.voice.replace(
-                    '/', '_').replace(
-                    '#', '_').replace(
-                    'cmu-arctic', 'cmu_arctic').replace(
-                    '_low', '') + '.wav')
-    print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
 
-    x = tts_multi_sentence(
-
-        voice=args.voice,
-        scene=args.scene)
+    x = tts_multi_sentence(args.scene)
+    # print('\n\n\n\n Obtained TTS output shape', x.shape)
     OUT_FILE = 'tmp.wav'
     soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
 
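
After this commit the live path is AudioGen-only: the StyleTTS2/mimic3 branches are commented out and `tts_multi_sentence` returns just the generated scene. Below is a minimal standalone sketch of that remaining path; it reuses the repo's own import line (whether that module re-exports `audio_write` is an assumption carried over from live_api.py, upstream audiocraft ships it in `audiocraft.data.audio`).

```python
# Hedged sketch of the server's sound-scene path, not the repo's exact code.
from audiocraft.audiogen import AudioGen, audio_write  # import mirrors live_api.py

sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=4)  # 4-second clips: the "4s" of the commit title

scene = 'calm background sounds of a castle'
sound_background = sound_generator.generate([scene])[0]  # tensor of shape (channels, samples)
audio_write('scene', sound_background.cpu(), 24000,      # live_api.py labels its output as 24 kHz
            strategy="loudness", loudness_compressor=True)
```

The deleted `overlay`/`_shift` helpers implemented the speech/background mix that this commit drops: the background was tiled from randomly rolled copies until it covered the speech, then mixed at 74% speech / 26% background. A sketch reconstructed from the removed lines (the input arrays are hypothetical 1-D numpy signals):

```python
import numpy as np

def _shift(x):
    # roll by a random offset so tiled copies do not repeat in lockstep
    return np.roll(x, np.random.randint(0, len(x)))

def overlay(speech, background):
    # tile shifted copies of the background until it covers the speech, then mix
    if len(speech) > len(background):
        n_repeat = len(speech) // len(background) + 1
        background = np.concatenate([_shift(background) for _ in range(n_repeat)])
    return .74 * speech + .26 * background[:len(speech)]
```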
live_demo.py
CHANGED

@@ -1,29 +1,9 @@
-import numpy as np
 import argparse
 import os
 import requests
 import subprocess
 
 
-# SSH AGENT
-# eval $(ssh-agent -s)
-# ssh-add ~/.ssh/id_ed25519_github2024
-#
-# git remote set-url origin git@github.com:audeering/shift
-
-
-# https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
-# import multiprocessing
-# from playsound import playsound
-
-# p = multiprocessing.Process(target=playsound, args=("file.mp3",))
-# p.start()
-# input("press ENTER to stop playback")
-# p.terminate()
-# from playsound import playsound
-# playsound('/path/to/a/sound/file/you/want/to/play.mp3')
 
 
 def command_line_args():

@@ -44,98 +24,20 @@ def command_line_args():
     parser.add_argument(
         '--text',
         help="Text to be synthesized.",
-        default='
+        default='How is hoowl',
-    parser.add_argument(
-        '--native',
-        help="""
-        --native: (without argument) a flag to do voice cloning using the speech from --video,
-        --native my_voice.wav: voice cloning from user-provided audio""",
-        # nargs='?',
-        # const=None,
-        # default=False  # default has to be None
-    )
-    parser.add_argument(
-        '--voice',
-        help="TTS voice - available voices: https://audeering.github.io/shift/",
-        default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
-        type=str,
-    )
-    parser.add_argument(
-        '--image',
-        help="If provided, is set as background for the output video, see --text",
-        type=str,
-    )
-    parser.add_argument(
-        '--video',
-        help="Video file for video translation. Voice cloned from the video",
         type=str,
     )
-    parser.add_argument(
-        '--out_file',
-        help="Output file name.",
-        type=str,
-        default='b6'
-    )
-    parser.add_argument(
-        '--scene',
-        help='Sound scene description.',
-        type=str,
-        default='calm background sounds of a castle'
-    )
     return parser
 
 def send_to_server(args):
     url = "http://192.168.88.209:5000"
 
     payload = {
-        'affective': args.affective,
-        'voice': args.voice,
-        'native': args.native,
         'text': args.text,
-        '
-        'video': args.video,
-        'scene': args.scene,
-        # 'out_file': args.out_file  # let the server save as temp
+        'scene': args.scene
     }
 
-    # In files= send actual files if provided
-    text_file = open(args.text, 'rb')
-
-    image_file, video_file, native_file = None, None, None
-    if args.image is not None:
-        print('\nLOADING IMAGE\n')
-        try:
-            image_file = open(args.image, 'rb')
-        except FileNotFoundError:
-            pass
-
-    if args.video is not None:
-        print('\nLOADING vid\n')
-        try:
-            video_file = open(args.video, 'rb')
-        except FileNotFoundError:
-            pass
-
-    if args.native is not None:
-        print('\nLOADING natv\n')
-        try:
-            native_file = open(args.native, 'rb')
-        except FileNotFoundError:
-            pass
-
-    # --------------------- send this extra
-    print('Sending...\n')
-
-    response = requests.post(url, data=payload,
-                             files=[(args.image, image_file)])  # Nones do not arrive to the server's dict
+    response = requests.post(url, data=payload)  # Nones do not arrive to the server's dict
 
     # Check the response from the server
     if response.status_code == 200:

@@ -152,22 +54,21 @@ def send_to_server(args):
 def cli():  # args.out_file is not sent to the server - the server writes tmp - copied by the client
     parser = command_line_args()
     args = parser.parse_args()
+    os.system('cls' if os.name == 'nt' else 'clear')
     while True:
-        args.text = input("
+        args.text = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
+        # _text, _scene = args.text.split('|')
+        # args.text = _text
+        args.scene = args.text  # _scene
         response = send_to_server(args)
-        out_file =
-
+        out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
+
         with open(out_file, 'wb') as f:
             f.write(response.content)
-        print('Response at client\n----------------------------', response.headers)
-
+        # print('Response at client\n----------------------------', response.headers)
+
         subprocess.run(["paplay", out_file])
 
 
 if __name__ == '__main__':
     cli()
-
-# assume also video and text; for video we have to write some classes for video for audiocraft
-# then call tts.py on this video with nonempty labels - thus calls audiocraft
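
The loop reuses the single typed line for both `text` and `scene`; the commented lines in `cli()` hint at a `text|scene` split protocol. A hedged sketch of how that commented variant would parse the input (variable names are illustrative):

```python
# Sketch of the commented-out "text|scene" input protocol from cli().
raw = input("Describe Any Sound: ")
if '|' in raw:
    text, scene = raw.split('|', 1)  # e.g. "Hello there|wind in the trees"
else:
    text = scene = raw               # current behaviour: one string fills both fields
```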