Spaces:

gsarti
/

local-tts

Running

App Files Files Community

gsarti committed on Jan 15

Commit

2efd326

1 Parent(s): 885609d

Added streaming

Browse files

Files changed (2) hide show

app.py +63 -35
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -1,7 +1,10 @@
 import os
 import spaces
 import tempfile
-import soundfile as sf
 import requests
 from markdown import Markdown
 from io import StringIO
@@ -17,6 +20,31 @@ voices = {
     "en-gb": ['bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
 }
 def unmark_element(element, stream=None):
     if stream is None:
         stream = StringIO()
@@ -39,28 +67,6 @@ def markdown2text(text):
     return __md.convert(text)
-@spaces.GPU
-def text_to_speech(text, voice, speed, lang):
-    try:
-        # Generate audio
-        samples, sample_rate = kokoro.create(
-            text,
-            voice=voice,
-            speed=float(speed),
-            lang=lang
-        )
-        # Create temporary file
-        temp_dir = tempfile.mkdtemp()
-        temp_path = os.path.join(temp_dir, "output.wav")
-        # Save to temporary file
-        sf.write(temp_path, samples, sample_rate)
-        return temp_path
-    except Exception as e:
-        return f"Error: {str(e)}"
 def create_temp_html_from_url(url: str) -> str:
     try:
         response = requests.get(url)
@@ -68,7 +74,6 @@ def create_temp_html_from_url(url: str) -> str:
         html = response.text
         temp_dir = tempfile.mkdtemp()
         temp_path = os.path.join(temp_dir, "output.html")
         with open(temp_path, "w") as f:
             f.write(html)
     except Exception as e:
@@ -76,7 +81,7 @@ def create_temp_html_from_url(url: str) -> str:
     return temp_path
-def process_input(input_type, url_input, file_input, text_input, voice, speed, lang):
     if input_type in ["URL", "File"]:
         if input_type == "URL":
             filepath = create_temp_html_from_url(url_input)
@@ -84,17 +89,31 @@ def process_input(input_type, url_input, file_input, text_input, voice, speed, l
             filepath = file_input
         print(filepath)
         markdown = md.convert(filepath).text_content
-        text = markdown2text(markdown)
     else:
         markdown = text_input
-        text = text_input
-    audio_path = text_to_speech(text, voice, speed, lang)
-    return markdown, audio_path
 with gr.Blocks() as demo:
     gr.Markdown(
-        "# Local TTS demo 🗣️ \nProvide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
     )
     with gr.Row():
@@ -105,7 +124,7 @@ with gr.Blocks() as demo:
                 lang = gr.Dropdown(choices=voices.keys(), label="Language", value="en-us")
                 voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
         with gr.Column():
-            url_input = gr.Textbox(label="Enter URL")
             file_input = gr.File(label="Upload File", visible=False)
             text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)
@@ -121,14 +140,23 @@ with gr.Blocks() as demo:
     lang.change(update_lang, lang, [voice])
     with gr.Accordion("Markdown output", open=False):
         output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
-    output_audio = gr.Audio(label="Generated Audio")
     submit_button = gr.Button("Convert")
     submit_button.click(
-        process_input,
-        inputs=[input_type, url_input, file_input, text_input, voice, speed, lang],
-        outputs=[output_markdown, output_audio],
     )
 demo.launch()

 import os
+import io
 import spaces
 import tempfile
+#import soundfile as sf
+import numpy as np
+from pydub import AudioSegment
 import requests
 from markdown import Markdown
 from io import StringIO
     "en-gb": ['bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis']
 }
+def numpy_to_mp3(audio_array, sampling_rate):
+    """Encode a single-channel NumPy audio buffer as MP3 bytes.
+
+    Args:
+        audio_array: NumPy array of audio samples. Floating-point input is
+            peak-normalised and converted to 16-bit integers; any other dtype
+            is passed through unchanged, with its item size used as the
+            sample width.
+        sampling_rate: Frame rate handed to ``AudioSegment``.
+
+    Returns:
+        The MP3-encoded audio as ``bytes``.
+    """
+    # Normalize audio_array if it's floating-point
+    if np.issubdtype(audio_array.dtype, np.floating):
+        # An empty chunk has no maximum and an all-silent chunk has a peak of
+        # zero; dividing by that peak would produce NaN/inf samples that turn
+        # into garbage once cast to int16, so only rescale on a real peak.
+        max_val = np.max(np.abs(audio_array)) if audio_array.size else 0.0
+        if max_val > 0:
+            audio_array = (audio_array / max_val) * 32767  # Normalize to 16-bit range
+        audio_array = audio_array.astype(np.int16)
+    # Create an audio segment from the numpy array
+    audio_segment = AudioSegment(
+        audio_array.tobytes(),
+        frame_rate=sampling_rate,
+        sample_width=audio_array.dtype.itemsize,
+        channels=1
+    )
+    # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality.
+    # The context manager releases the in-memory buffer even if the export fails.
+    with io.BytesIO() as mp3_io:
+        audio_segment.export(mp3_io, format="mp3", bitrate="320k")
+        return mp3_io.getvalue()
 def unmark_element(element, stream=None):
     if stream is None:
         stream = StringIO()
     return __md.convert(text)
 def create_temp_html_from_url(url: str) -> str:
     try:
         response = requests.get(url)
         html = response.text
         temp_dir = tempfile.mkdtemp()
         temp_path = os.path.join(temp_dir, "output.html")
         with open(temp_path, "w") as f:
             f.write(html)
     except Exception as e:
     return temp_path
+def parse(input_type, url_input, file_input, text_input):
     if input_type in ["URL", "File"]:
         if input_type == "URL":
             filepath = create_temp_html_from_url(url_input)
             filepath = file_input
         print(filepath)
         markdown = md.convert(filepath).text_content
     else:
         markdown = text_input
+    return markdown
+def clean(output_markdown):
+    """Return ``output_markdown`` after conversion by ``markdown2text``."""
+    plain_text = markdown2text(output_markdown)
+    return plain_text
+@spaces.GPU
+async def text_to_speech(output_text, voice, speed, lang):
+    """Stream synthesized speech for ``output_text`` as MP3 chunks.
+
+    Asynchronous generator: every ``(samples, sample_rate)`` pair produced by
+    ``kokoro.create_stream`` is encoded with ``numpy_to_mp3`` and yielded.
+    ``speed`` is converted to ``float`` before it is passed on.
+    """
+    synthesis_options = {"voice": voice, "speed": float(speed), "lang": lang}
+    async for chunk, rate in kokoro.create_stream(output_text, **synthesis_options):
+        yield numpy_to_mp3(chunk, sampling_rate=rate)
 with gr.Blocks() as demo:
     gr.Markdown(
+        "# Stream Local TTS with Kokoro-82M 🗣️\n"
+        "Provide a URL or upload a file to convert its content into speech using [Markitdown](https://github.com/microsoft/markitdown) and [Kokoro-ONNX](https://github.com/thewh1teagle/kokoro-onnx)."
     )
     with gr.Row():
                 lang = gr.Dropdown(choices=voices.keys(), label="Language", value="en-us")
                 voice = gr.Dropdown(choices=voices[lang.value], label="Voice", value=voices[lang.value][0])
         with gr.Column():
+            url_input = gr.Textbox(label="Enter URL", lines=1)
             file_input = gr.File(label="Upload File", visible=False)
             text_input = gr.Textbox(label="Text", visible=False, lines=5, placeholder="Enter text here", show_label=False, interactive=True)
     lang.change(update_lang, lang, [voice])
     with gr.Accordion("Markdown output", open=False):
+        output_text = gr.Textbox(visible=False)
         output_markdown = gr.Markdown("Parsed markdown will appear here", label="Parsed Text", show_copy_button=True)
+    output_audio = gr.Audio(label="Generated Audio", streaming=True, autoplay=True, loop=False)
     submit_button = gr.Button("Convert")
     submit_button.click(
+        parse,
+        inputs=[input_type, url_input, file_input, text_input],
+        outputs=[output_markdown],
+    ).success(
+        clean,
+        inputs=[output_markdown],
+        outputs=[output_text],
+    ).success(
+        text_to_speech,
+        inputs=[output_text, voice, speed, lang],
+        outputs=[output_audio],
     )
 demo.launch()

requirements.txt CHANGED Viewed

@@ -1,4 +1,5 @@
 spaces
 gradio>=5.3.0
 kokoro-onnx>=0.2.7
-markitdown>=0.0.1a3

 spaces
 gradio>=5.3.0
 kokoro-onnx>=0.2.7
+markitdown>=0.0.1a3
+soundfile>=0.13.0