dkounadis
/

artificial-styletts2

@@ -21,7 +21,8 @@ from audiocraft.audiogen import AudioGen, audio_write
 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
 sound_generator.set_generation_params(duration=6)
-Path('./flask_cache').mkdir(parents=True, exist_ok=True)
 # SSH AGENT
 #   eval $(ssh-agent -s)
@@ -127,15 +128,15 @@ def serve_wav():
     # Physically Save Client Files
     for filename, obj in request.files.items():
-        obj.save(f'flask_cache/{filename.replace("/","")}')
     print('Saved all files on Server Side\n\n')
-    args = SimpleNamespace(text=None if r.get('text') is None else 'flask_cache/' + r.get('text')[0].replace("/",""),
-                video=None if r.get('video') is None else 'flask_cache/' + r.get('video')[0].replace("/",""),
-                image=None if r.get('image') is None else 'flask_cache/' + r.get('image')[0].replace("/",""),
                 voice=r.get('voice')[0],
-                native=None if r.get('native') is None else 'flask_cache/' + r.get('native')[0].replace("/",""),
                 affective = r.get('affective')[0],
                 scene=r.get('scene')[0]
                 )
@@ -291,7 +292,7 @@ def serve_wav():
         # ==== TTS .srt ====
         if do_video_dub:
-            OUT_FILE = './flask_cache/tmp.mp4' #args.out_file + '_video_dub.mp4'
             subtitles = text
             MAX_LEN = int(subtitles[-1][2] + 17) * 24000
             # 17 extra seconds as a fail-safe in case the last segment runs long
@@ -321,7 +322,7 @@ def serve_wav():
                             (.64 * total + .27 * x_native)[:, None],
                             24000)
         else:  # Video from plain (.txt)
-            OUT_FILE = './flask_cache/tmp.mp4' #args.out_file + '_video_from_txt.mp4'
             x = tts_multi_sentence(text=text,
                                precomputed_style_vector=precomputed_style_vector,
                                voice=args.voice,
@@ -333,7 +334,7 @@ def serve_wav():
     if args.image is not None:
         STATIC_FRAME = args.image  # 'assets/image_from_T31.jpg'
-        OUT_FILE = './flask_cache/tmp.mp4' #args.out_file + '_image_to_speech.mp4'
         # SILENT CLIP
@@ -346,7 +347,7 @@ def serve_wav():
                                scene=args.scene
                                )
         soundfile.write(AUDIO_TRACK, x, 24000)
-    elif args.video or args.image:
         # write final output video
         subprocess.call(
             ["ffmpeg",
@@ -361,7 +362,7 @@ def serve_wav():
                 "0:v:0",
                 "-map",
                 " 1:a:0",
-                OUT_FILE])
         print(f'\noutput video is saved as {OUT_FILE}')
@@ -372,8 +373,8 @@ def serve_wav():
                                precomputed_style_vector=precomputed_style_vector,
                                voice=args.voice,
                                scene=args.scene)
-        OUT_FILE = './flask_cache/tmp.wav' #args.out_file + '.wav'
-        soundfile.write(OUT_FILE, x, 24000)
@@ -393,7 +394,7 @@ def serve_wav():
     # send server's output as default file -> srv_result.xx
     print(f'\n=SERVER saved as {OUT_FILE=}\n')
-    response = send_from_directory('flask_cache/', path=OUT_FILE)
     response.headers['suffix-file-type'] = OUT_FILE
     return response

 sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
 sound_generator.set_generation_params(duration=6)
+CACHE_DIR = 'flask_cache/'
+Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
 # SSH AGENT
 #   eval $(ssh-agent -s)
     # Physically Save Client Files
     for filename, obj in request.files.items():
+        obj.save(f'{CACHE_DIR}{filename.replace("/","")}')
     print('Saved all files on Server Side\n\n')
+    args = SimpleNamespace(text=None if r.get('text') is None else CACHE_DIR + r.get('text')[0].replace("/",""),
+                video=None if r.get('video') is None else CACHE_DIR + r.get('video')[0].replace("/",""),
+                image=None if r.get('image') is None else CACHE_DIR + r.get('image')[0].replace("/",""),
                 voice=r.get('voice')[0],
+                native=None if r.get('native') is None else CACHE_DIR + r.get('native')[0].replace("/",""),
                 affective = r.get('affective')[0],
                 scene=r.get('scene')[0]
                 )
         # ==== TTS .srt ====
         if do_video_dub:
+            OUT_FILE = 'tmp.mp4' #args.out_file + '_video_dub.mp4'
             subtitles = text
             MAX_LEN = int(subtitles[-1][2] + 17) * 24000
             # 17 extra seconds as a fail-safe in case the last segment runs long
                             (.64 * total + .27 * x_native)[:, None],
                             24000)
         else:  # Video from plain (.txt)
+            OUT_FILE = 'tmp.mp4'
             x = tts_multi_sentence(text=text,
                                precomputed_style_vector=precomputed_style_vector,
                                voice=args.voice,
     if args.image is not None:
         STATIC_FRAME = args.image  # 'assets/image_from_T31.jpg'
+        OUT_FILE = 'tmp.mp4' #args.out_file + '_image_to_speech.mp4'
         # SILENT CLIP
                                scene=args.scene
                                )
         soundfile.write(AUDIO_TRACK, x, 24000)
+    if args.video or args.image:
         # write final output video
         subprocess.call(
             ["ffmpeg",
                 "0:v:0",
                 "-map",
                 " 1:a:0",
+                CACHE_DIR + OUT_FILE])
         print(f'\noutput video is saved as {OUT_FILE}')
                                precomputed_style_vector=precomputed_style_vector,
                                voice=args.voice,
                                scene=args.scene)
+        OUT_FILE = 'tmp.wav'
+        soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
     # send server's output as default file -> srv_result.xx
     print(f'\n=SERVER saved as {OUT_FILE=}\n')
+    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
     response.headers['suffix-file-type'] = OUT_FILE
     return response

landscape2soundscape.py CHANGED Viewed

@@ -56,7 +56,7 @@ DESCRIPTIONS = [
         '01_Schick_AII840_001.jpg',                               # image
         '01_Schick_AII840_001.txt',                               # text
         'Statue in shire hill on autumn beach.',                  # audiocraft
-        'Gottlieb Chick - Bildnis der Heinrike Dannecker - 1802', # cv2 puttext title
         'en_US/m-ailabs_low#mary_ann',
      ],
     # 2
@@ -156,7 +156,7 @@ SILENT_VIDEO = '_silent_video.mp4'
 # SILENT CLIP
-for _img_, _text_, soundscape_text, _title_, _voice_ in DESCRIPTIONS[:1]:
     # cv2put txt
     im = cv2.imread(PIC_DIR + _img_)  # IMG must have EVEN shape

         '01_Schick_AII840_001.jpg',                               # image
         '01_Schick_AII840_001.txt',                               # text
         'Statue in shire hill on autumn beach.',                  # audiocraft
+        'Gottlieb Schick - Bildnis der Heinrike Dannecker - 1802', # cv2 puttext title
         'en_US/m-ailabs_low#mary_ann',
      ],
     # 2
 # SILENT CLIP
+for _img_, _text_, soundscape_text, _title_, _voice_ in DESCRIPTIONS[:20]:
     # cv2put txt
     im = cv2.imread(PIC_DIR + _img_)  # IMG must have EVEN shape