gradio MCP mode ready
gradio_app.py CHANGED (+64, -14)
@@ -72,6 +72,23 @@ def separate_speakers_core(audio_path):
 
 @spaces.GPU()
 def separate_dnr(audio_file):
+    """
+    Perform Dialog, Effects, and Music (DnR) separation on an uploaded audio file.
+
+    Args:
+        audio_file (str): File path to the input WAV audio file.
+            This should be a mixed audio track containing dialog, background music, and sound effects.
+
+    Returns:
+        Tuple[str, str, str]: Paths to the separated audio files:
+            - Dialog-only audio (dialog.wav)
+            - Sound effects-only audio (effect.wav)
+            - Background music-only audio (music.wav)
+
+    This function uses a pretrained DnR model (TIGER-DnR) to isolate the components in the audio.
+    It is intended for tasks such as improving intelligibility or remixing.
+    """
+
     audio, sr = torchaudio.load(audio_file)
     audio = audio.to(device)
 
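With mcp_server=True (enabled at the bottom of this diff), Gradio reads docstrings like the one above to generate the descriptions and schemas of the tools it advertises, so this wording is user-facing. The endpoint also stays callable over the regular Gradio API; below is a hedged client-side sketch, where the Space id is a placeholder and "/separate_dnr" assumes Gradio's default api_name derived from the function name:

    # Hypothetical caller; "user/TIGER" is a placeholder Space id, not from this commit.
    from gradio_client import Client, handle_file

    client = Client("user/TIGER")
    dialog, effects, music = client.predict(
        handle_file("mixture.wav"),   # mixed track with dialog, effects, and music
        api_name="/separate_dnr",     # assumes Gradio's default endpoint name
    )
    print(dialog, effects, music)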
@@ -96,6 +113,21 @@ def separate_dnr(audio_file):
 
 @spaces.GPU()
 def separate_speakers(audio_path):
+    """
+    Perform speaker separation on a mixed audio file containing multiple speakers.
+
+    Args:
+        audio_path (str): File path to the audio WAV file containing overlapping speech from multiple people.
+
+    Returns:
+        List[gr.update]: A list of Gradio update objects, each containing:
+            - A separate audio file for each identified speaker (up to MAX_SPEAKERS)
+            - Visibility and label updates for the UI
+
+    This function internally calls a pretrained speech separation model (TIGER-speech)
+    and isolates individual speaker tracks from the input audio.
+    """
+
     output_files = separate_speakers_core(audio_path)
     updates = []
     for i in range(MAX_SPEAKERS):
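The "List[gr.update]" return described above follows a fixed-slot pattern: each of the MAX_SPEAKERS output components gets one update, and slots beyond the number of detected speakers are hidden. A minimal sketch of that pattern, mirroring the loop this hunk touches rather than the file's exact code:

    import gradio as gr

    def updates_for(output_files, max_speakers):
        updates = []
        for i in range(max_speakers):
            if i < len(output_files):
                # Reveal the slot and load the separated track for speaker i.
                updates.append(gr.update(value=output_files[i], visible=True, label=f"Speaker {i + 1}"))
            else:
                # Hide slots with no corresponding speaker.
                updates.append(gr.update(value=None, visible=False))
        return updates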
@@ -107,6 +139,22 @@ def separate_speakers(audio_path):
 
 @spaces.GPU()
 def separate_dnr_video(video_path):
+    """
+    Separate dialog, effects, and music from the audio of an uploaded video file and reattach them to the original video.
+
+    Args:
+        video_path (str): File path to the input video file (e.g., MP4 or MOV).
+            The video should contain a composite audio track with dialog, effects, and music.
+
+    Returns:
+        Tuple[str, str, str]: Paths to the output videos with:
+            - Only dialog audio track (dialog_video.mp4)
+            - Only effects audio track (effect_video.mp4)
+            - Only music audio track (music_video.mp4)
+
+    The audio is extracted from the video, separated using the DnR model, and then reattached to the original video visuals.
+    """
+
     audio_path, video = extract_audio_from_video(video_path, 44100)
     dialog_path, effect_path, music_path = separate_dnr(audio_path)
 
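The "reattached to the original video visuals" step is not shown in this hunk; one common way to do it is an ffmpeg mux that copies the video stream unchanged and swaps in the separated track. The sketch below is an illustration under that assumption, not the helper this file actually uses:

    import subprocess

    def mux_audio(video_in, audio_in, video_out):
        # Keep the original video stream, replace the audio with the separated track.
        subprocess.run([
            "ffmpeg", "-y",
            "-i", video_in, "-i", audio_in,
            "-map", "0:v:0", "-map", "1:a:0",
            "-c:v", "copy", "-c:a", "aac",
            "-shortest", video_out,
        ], check=True)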
@@ -120,19 +168,24 @@ def separate_dnr_video(video_path):
 
     return dialog_video, effect_video, music_video
 
-def convert_to_ffmpeg_friendly(input_wav, output_wav):
-    subprocess.run([
-        "ffmpeg", "-y",
-        "-i", input_wav,
-        "-ar", str(TARGET_SR),
-        "-ac", "1",
-        "-sample_fmt", "s16",
-        output_wav
-    ], check=True)
-
 
 @spaces.GPU()
 def separate_speakers_video(video_path):
+    """
+    Separate individual speakers from the audio track of a video and reattach each speaker’s voice to a copy of the original video.
+
+    Args:
+        video_path (str): File path to a video file with overlapping speech from multiple speakers.
+
+    Returns:
+        List[gr.update]: A list of Gradio update objects, each containing:
+            - A new video file where the audio consists of only one speaker's voice
+            - Visibility and label information for UI display
+
+    The function extracts audio from the video, separates individual speakers using a pretrained model,
+    and generates one video per speaker by replacing the audio in the original video.
+    """
+
     audio_path, video = extract_audio_from_video(video_path, 16000)
     output_files = separate_speakers_core(audio_path)
 
@@ -155,9 +208,6 @@ def separate_speakers_video(video_path):
     return updates
 
 
-
-
-
 # --- Gradio UI ---
 with gr.Blocks() as demo:
     gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
@@ -209,4 +259,4 @@ with gr.Blocks() as demo:
     vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)
 
 if __name__ == "__main__":
-    demo.launch(ssr_mode=False)
+    demo.launch(ssr_mode=False, mcp_server=True)
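With mcp_server=True, recent Gradio (installed with the mcp extra, pip install "gradio[mcp]") serves an MCP endpoint alongside the web UI, by default at /gradio_api/mcp/sse, and each documented function above becomes a tool whose description is taken from its docstring. A hedged sketch of listing those tools with the reference MCP Python SDK; the host URL is a placeholder:

    import asyncio
    from mcp import ClientSession
    from mcp.client.sse import sse_client

    async def main():
        url = "https://example-space.hf.space/gradio_api/mcp/sse"  # placeholder host
        async with sse_client(url) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                tools = await session.list_tools()
                print([tool.name for tool in tools.tools])

    asyncio.run(main())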