slow OK 4s live

Files changed:
- README.md (+15 -0)
- live_api.py (+43 -87)
- live_demo.py (+12 -111)
README.md
CHANGED

@@ -124,3 +124,18 @@ From an image and text create a video:
 
 python tts.py --text sample.txt --image assets/image_from_T31.jpg
 ```
+
+
+# Live Demo - Paplay
+
+Server (Flask)
+
+```bash
+CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=4 python live_api.py
+```
+
+Client (Ubuntu)
+
+```bash
+python live_demo.py  # will ask for text input & play the soundscape
+```
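
For a quick smoke test of the endpoint without the client script, the request can also be issued directly. This is a hedged sketch, not repo code: the `text`/`scene` field names and the raw-WAV response are taken from `live_demo.py` below, and the URL is that script's hard-coded default.

```python
# Hypothetical one-shot client; mirrors the payload live_demo.py sends.
import requests

resp = requests.post("http://192.168.88.209:5000",
                     data={"text": "rain on a tin roof",
                           "scene": "rain on a tin roof"})
resp.raise_for_status()
with open("_gen_.wav", "wb") as f:
    f.write(resp.content)  # server replies with raw WAV bytes (written at 24 kHz)
# play it with e.g.: subprocess.run(["paplay", "_gen_.wav"])
```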

live_api.py
CHANGED

@@ -4,7 +4,7 @@ import numpy as np
 import soundfile
 import audresample
 import text_utils
-
+
 import re
 import srt
 import subprocess

@@ -17,89 +17,67 @@ from flask_cors import CORS
 from audiocraft.audiogen import AudioGen, audio_write
 
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
-sound_generator.set_generation_params(duration=
+sound_generator.set_generation_params(duration=4)
+
+
+# ====STYLE VECTOR====
+
+# AFFECTIVE = True
+# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann
+
+# _dir = '/' if AFFECTIVE else '_v2/'
+# precomputed_style_vector = msinference.compute_style(
+#     'assets/wavs/style_vector' + _dir + VOICE.replace(
+#         '/', '_').replace(
+#         '#', '_').replace(
+#         'cmu-arctic', 'cmu_arctic').replace(
+#         '_low', '') + '.wav')
+# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
+
+# ==== STYLE VECTOR
 
 CACHE_DIR = 'flask_cache/'
 Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 
 
-def _shift(x):
-
-    x = np.roll(x, i)
-    # fade_in = .5 + .5 * np.tanh(4*(np.linspace(-10, 10, x.shape[0]) + 9.4))
-    # x = x * fade_in
-    return x
-
-def overlay(x, sound_background=None):
-    if sound_background is not None:
-        sound_background = sound_background.detach().cpu().numpy()[0, :]
-        len_speech = len(x)
-        if len_speech > len(sound_background):
-            n_repeat = len_speech // len(sound_background) + 1
-            replica = [sound_background] * n_repeat
-            replica = [_shift(_) for _ in replica]
-            sound_background = np.concatenate(replica)
-
-        print(f'\nSOUND BACKGROUND SHAPE\n{sound_background.shape=}\n{x.shape=}\n- - - -')
-        x = .74 * x + .26 * sound_background[:len_speech]
-    return x
-
-def tts_multi_sentence(precomputed_style_vector=None,
-                       text=None,
-                       voice=None,
-                       scene=None):
-    '''create 24 kHz np.array with tts
-
-    precomputed_style_vector : required if en_US or en_UK in voice,
-                               to perform affective TTS.
-    text  : string
-    voice : string or None (falls back to StyleTTS)
-    scene : 'A castle in far away lands' -> if passed, generates a background sound scene
-    '''
-    # Generate sound scene - upsample to 24 kHz
+
+def tts_multi_sentence(scene=None):
     if scene is not None:
 
         sound_background = sound_generator.generate([scene])[0]
         sound_background = audio_write(None,
                                        sound_background.cpu(),
-                                       24000,  #
+                                       24000,  # same as the StyleTTS sample rate
                                        strategy="loudness",
-                                       loudness_compressor=True)
+                                       loudness_compressor=True).detach().cpu().numpy()[0, :]
     else:
         sound_background = None
 
-    # StyleTTS2
-    if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
-
-
-        text_utils.store_ssml(text=text, voice=voice)  # Text has to be a list of single sentences
-        ps = subprocess.Popen(f'cat _tmp_ssml.txt | mimic3 --ssml > _tmp.wav', shell=True)
-        ps.wait()
-        x, fs = soundfile.read('_tmp.wav')
-        x = audresample.resample(x.astype(np.float32), 24000, fs)[0, :]  # reshapes (64,) -> (1,64)
-
-    return overlay(x, sound_background)
+    # # StyleTTS2
+    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
+    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
+    #     x = []
+    #     for _sentence in text:
+    #         x.append(msinference.inference(_sentence,
+    #                                        precomputed_style_vector,
+    #                                        alpha=0.3,
+    #                                        beta=0.7,
+    #                                        diffusion_steps=7,
+    #                                        embedding_scale=1))
+    #     x = np.concatenate(x)
+
+    # return overlay(x, sound_background)
+
+    return sound_background
 
-# voices = {}
-# import phonemizer
-# global_phonemizer = phonemizer.backend.EspeakBackend(language='en-us', preserve_punctuation=True, with_stress=True)
 
 app = Flask(__name__)
 cors = CORS(app)

@@ -120,40 +98,18 @@ def serve_wav():
 
     args = SimpleNamespace(
         text=None if r.get('text') is None else r.get('text'),  # string, not file?
-        voice=r.get('voice')[0],
-        native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/", ""),
-        affective=r.get('affective')[0],
         scene=r.get('scene')[0]
     )
     # print('\n==RECOMPOSED as \n', request.data, request.form, '\n==')
 
-    print(args, 'ENTER Script')
-    do_video_dub = False
-
-    # ====STYLE VECTOR====
-
-    precomputed_style_vector = None
-    # NOTE: style vector may be None
 
-    if precomputed_style_vector is None:
-        if 'en_US' in args.voice or 'en_UK' in args.voice:
-            _dir = '/' if args.affective else '_v2/'
-            precomputed_style_vector = msinference.compute_style(
-                'assets/wavs/style_vector' + _dir + args.voice.replace(
-                    '/', '_').replace(
-                    '#', '_').replace(
-                    'cmu-arctic', 'cmu_arctic').replace(
-                    '_low', '') + '.wav')
-    print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
 
-    x = tts_multi_sentence(
-
-        voice=args.voice,
-        scene=args.scene)
+    x = tts_multi_sentence(args.scene)
+    # print('\n\n\n\n Obtained TTS output shape', x.shape)
     OUT_FILE = 'tmp.wav'
     soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
 
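
After this commit the live path is AudioGen-only: the StyleTTS2/mimic3 branches are commented out and `tts_multi_sentence` returns just the generated scene. Below is a minimal standalone sketch of that remaining path; it reuses the repo's own import line (whether that module re-exports `audio_write` is an assumption carried over from live_api.py, upstream audiocraft ships it in `audiocraft.data.audio`).

```python
# Hedged sketch of the server's sound-scene path, not the repo's exact code.
from audiocraft.audiogen import AudioGen, audio_write  # import mirrors live_api.py

sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
sound_generator.set_generation_params(duration=4)  # 4-second clips: the "4s" of the commit title

scene = 'calm background sounds of a castle'
sound_background = sound_generator.generate([scene])[0]  # tensor of shape (channels, samples)
audio_write('scene', sound_background.cpu(), 24000,      # live_api.py labels its output as 24 kHz
            strategy="loudness", loudness_compressor=True)
```

The deleted `overlay`/`_shift` helpers implemented the speech/background mix that this commit drops: the background was tiled from randomly rolled copies until it covered the speech, then mixed at 74% speech / 26% background. A sketch reconstructed from the removed lines (the input arrays are hypothetical 1-D numpy signals):

```python
import numpy as np

def _shift(x):
    # roll by a random offset so tiled copies do not repeat in lockstep
    return np.roll(x, np.random.randint(0, len(x)))

def overlay(speech, background):
    # tile shifted copies of the background until it covers the speech, then mix
    if len(speech) > len(background):
        n_repeat = len(speech) // len(background) + 1
        background = np.concatenate([_shift(background) for _ in range(n_repeat)])
    return .74 * speech + .26 * background[:len(speech)]
```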
live_demo.py
CHANGED

@@ -1,29 +1,9 @@
-import numpy as np
 import argparse
 import os
 import requests
 import subprocess
 
 
-# SSH AGENT
-# eval $(ssh-agent -s)
-# ssh-add ~/.ssh/id_ed25519_github2024
-#
-# git remote set-url origin git@github.com:audeering/shift
-
-
-# https://stackoverflow.com/questions/57158779/how-to-stop-audio-with-playsound-module
-# import multiprocessing
-# from playsound import playsound
-
-# p = multiprocessing.Process(target=playsound, args=("file.mp3",))
-# p.start()
-# input("press ENTER to stop playback")
-# p.terminate()
-# from playsound import playsound
-# playsound('/path/to/a/sound/file/you/want/to/play.mp3')
 
 
 def command_line_args():

@@ -44,98 +24,20 @@ def command_line_args():
     parser.add_argument(
         '--text',
         help="Text to be synthesized.",
-        default='
+        default='How is hoowl',
-    parser.add_argument(
-        '--native',
-        help="""
-        --native: (without argument) a flag to do voice cloning using the speech from --video,
-        --native my_voice.wav: voice cloning from user-provided audio""",
-        # nargs='?',
-        # const=None,
-        # default=False  # default has to be None
-    )
-    parser.add_argument(
-        '--voice',
-        help="TTS voice - available voices: https://audeering.github.io/shift/",
-        default="en_US/m-ailabs_low#judy_bieber",  # 'en_US/cmu-arctic_low#lnh',
-        type=str,
-    )
-    parser.add_argument(
-        '--image',
-        help="If provided, is set as background for the output video, see --text",
-        type=str,
-    )
-    parser.add_argument(
-        '--video',
-        help="Video file for video translation. Voice cloned from the video",
         type=str,
     )
-    parser.add_argument(
-        '--out_file',
-        help="Output file name.",
-        type=str,
-        default='b6'
-    )
-    parser.add_argument(
-        '--scene',
-        help='Sound scene description.',
-        type=str,
-        default='calm background sounds of a castle'
-    )
     return parser
 
 def send_to_server(args):
     url = "http://192.168.88.209:5000"
 
     payload = {
-        'affective': args.affective,
-        'voice': args.voice,
-        'native': args.native,
         'text': args.text,
-        '
-        'video': args.video,
-        'scene': args.scene,
-        # 'out_file': args.out_file  # let the server save as temp
+        'scene': args.scene
     }
 
-    # In files= send actual files if provided
-    text_file = open(args.text, 'rb')
-
-    image_file, video_file, native_file = None, None, None
-    if args.image is not None:
-        print('\nLOADING IMAGE\n')
-        try:
-            image_file = open(args.image, 'rb')
-        except FileNotFoundError:
-            pass
-
-    if args.video is not None:
-        print('\nLOADING vid\n')
-        try:
-            video_file = open(args.video, 'rb')
-        except FileNotFoundError:
-            pass
-
-    if args.native is not None:
-        print('\nLOADING natv\n')
-        try:
-            native_file = open(args.native, 'rb')
-        except FileNotFoundError:
-            pass
-
-    # --------------------- send this extra
-    print('Sending...\n')
-
-    response = requests.post(url, data=payload,
-                             files=[(args.image, image_file)])  # Nones do not arrive to the server's dict
+    response = requests.post(url, data=payload)  # Nones do not arrive to the server's dict
 
     # Check the response from the server
     if response.status_code == 200:

@@ -152,22 +54,21 @@ def send_to_server(args):
 def cli():  # args.out_file is not sent to the server - the server writes tmp - copied by the client
     parser = command_line_args()
     args = parser.parse_args()
+    os.system('cls' if os.name == 'nt' else 'clear')
     while True:
-        args.text = input("
+        args.text = input("\n\n\n\nDescribe Any Sound: \n\n\n\n")
+        # _text, _scene = args.text.split('|')
+        # args.text = _text
+        args.scene = args.text  # _scene
         response = send_to_server(args)
-        out_file =
-
+        out_file = '_gen_.wav'  # + response.headers['suffix-file-type'].split('.')[-1]
+
         with open(out_file, 'wb') as f:
             f.write(response.content)
-        print('Response at client\n----------------------------', response.headers)
-
+        # print('Response at client\n----------------------------', response.headers)
+
         subprocess.run(["paplay", out_file])
 
 
 if __name__ == '__main__':
     cli()
-
-# assume also video and text; for video we have to write some classes for video for audiocraft
-# then call tts.py on this video with nonempty labels - thus calls audiocraft
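
The loop reuses the single typed line for both `text` and `scene`; the commented lines in `cli()` hint at a `text|scene` split protocol. A hedged sketch of how that commented variant would parse the input (variable names are illustrative):

```python
# Sketch of the commented-out "text|scene" input protocol from cli().
raw = input("Describe Any Sound: ")
if '|' in raw:
    text, scene = raw.split('|', 1)  # e.g. "Hello there|wind in the trees"
else:
    text = scene = raw               # current behaviour: one string fills both fields
```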