gradio MCP mode ready
gradio_app.py CHANGED (+64, -14)
@@ -72,6 +72,23 @@ def separate_speakers_core(audio_path):
 
 @spaces.GPU()
 def separate_dnr(audio_file):
+    """
+    Perform Dialog, Effects, and Music (DnR) separation on an uploaded audio file.
+
+    Args:
+        audio_file (str): File path to the input WAV audio file.
+            This should be a mixed audio track containing dialog, background music, and sound effects.
+
+    Returns:
+        Tuple[str, str, str]: Paths to the separated audio files:
+            - Dialog-only audio (dialog.wav)
+            - Sound effects-only audio (effect.wav)
+            - Background music-only audio (music.wav)
+
+    This function uses a pretrained DnR model (TIGER-DnR) to isolate the components in the audio.
+    It is intended for tasks such as improving intelligibility or remixing.
+    """
+
     audio, sr = torchaudio.load(audio_file)
     audio = audio.to(device)
 
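With mcp_server=True (enabled at the bottom of this diff), Gradio reads docstrings like the one above to generate the descriptions and schemas of the tools it advertises, so this wording is user-facing. The endpoint also stays callable over the regular Gradio API; below is a hedged client-side sketch, where the Space id is a placeholder and "/separate_dnr" assumes Gradio's default api_name derived from the function name:

    # Hypothetical caller; "user/TIGER" is a placeholder Space id, not from this commit.
    from gradio_client import Client, handle_file

    client = Client("user/TIGER")
    dialog, effects, music = client.predict(
        handle_file("mixture.wav"),   # mixed track with dialog, effects, and music
        api_name="/separate_dnr",     # assumes Gradio's default endpoint name
    )
    print(dialog, effects, music)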
@@ -96,6 +113,21 @@ def separate_dnr(audio_file):
 
 @spaces.GPU()
 def separate_speakers(audio_path):
+    """
+    Perform speaker separation on a mixed audio file containing multiple speakers.
+
+    Args:
+        audio_path (str): File path to the audio WAV file containing overlapping speech from multiple people.
+
+    Returns:
+        List[gr.update]: A list of Gradio update objects, each containing:
+            - A separate audio file for each identified speaker (up to MAX_SPEAKERS)
+            - Visibility and label updates for the UI
+
+    This function internally calls a pretrained speech separation model (TIGER-speech)
+    and isolates individual speaker tracks from the input audio.
+    """
+
     output_files = separate_speakers_core(audio_path)
     updates = []
     for i in range(MAX_SPEAKERS):
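The "List[gr.update]" return described above follows a fixed-slot pattern: each of the MAX_SPEAKERS output components gets one update, and slots beyond the number of detected speakers are hidden. A minimal sketch of that pattern, mirroring the loop this hunk touches rather than the file's exact code:

    import gradio as gr

    def updates_for(output_files, max_speakers):
        updates = []
        for i in range(max_speakers):
            if i < len(output_files):
                # Reveal the slot and load the separated track for speaker i.
                updates.append(gr.update(value=output_files[i], visible=True, label=f"Speaker {i + 1}"))
            else:
                # Hide slots with no corresponding speaker.
                updates.append(gr.update(value=None, visible=False))
        return updates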
@@ -107,6 +139,22 @@ def separate_speakers(audio_path):
 
 @spaces.GPU()
 def separate_dnr_video(video_path):
+    """
+    Separate dialog, effects, and music from the audio of an uploaded video file and reattach them to the original video.
+
+    Args:
+        video_path (str): File path to the input video file (e.g., MP4 or MOV).
+            The video should contain a composite audio track with dialog, effects, and music.
+
+    Returns:
+        Tuple[str, str, str]: Paths to the output videos with:
+            - Only dialog audio track (dialog_video.mp4)
+            - Only effects audio track (effect_video.mp4)
+            - Only music audio track (music_video.mp4)
+
+    The audio is extracted from the video, separated using the DnR model, and then reattached to the original video visuals.
+    """
+
     audio_path, video = extract_audio_from_video(video_path, 44100)
     dialog_path, effect_path, music_path = separate_dnr(audio_path)
 
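The "reattached to the original video visuals" step is not shown in this hunk; one common way to do it is an ffmpeg mux that copies the video stream unchanged and swaps in the separated track. The sketch below is an illustration under that assumption, not the helper this file actually uses:

    import subprocess

    def mux_audio(video_in, audio_in, video_out):
        # Keep the original video stream, replace the audio with the separated track.
        subprocess.run([
            "ffmpeg", "-y",
            "-i", video_in, "-i", audio_in,
            "-map", "0:v:0", "-map", "1:a:0",
            "-c:v", "copy", "-c:a", "aac",
            "-shortest", video_out,
        ], check=True)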
@@ -120,19 +168,24 @@ def separate_dnr_video(video_path):
 
     return dialog_video, effect_video, music_video
 
-def convert_to_ffmpeg_friendly(input_wav, output_wav):
-    subprocess.run([
-        "ffmpeg", "-y",
-        "-i", input_wav,
-        "-ar", str(TARGET_SR),
-        "-ac", "1",
-        "-sample_fmt", "s16",
-        output_wav
-    ], check=True)
-
 
 @spaces.GPU()
 def separate_speakers_video(video_path):
+    """
+    Separate individual speakers from the audio track of a video and reattach each speaker’s voice to a copy of the original video.
+
+    Args:
+        video_path (str): File path to a video file with overlapping speech from multiple speakers.
+
+    Returns:
+        List[gr.update]: A list of Gradio update objects, each containing:
+            - A new video file where the audio consists of only one speaker's voice
+            - Visibility and label information for UI display
+
+    The function extracts audio from the video, separates individual speakers using a pretrained model,
+    and generates one video per speaker by replacing the audio in the original video.
+    """
+
     audio_path, video = extract_audio_from_video(video_path, 16000)
     output_files = separate_speakers_core(audio_path)
 
@@ -155,9 +208,6 @@ def separate_speakers_video(video_path):
     return updates
 
 
-
-
-
 # --- Gradio UI ---
 with gr.Blocks() as demo:
     gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
@@ -209,4 +259,4 @@ with gr.Blocks() as demo:
     vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)
 
 if __name__ == "__main__":
-    demo.launch(ssr_mode=False)
+    demo.launch(ssr_mode=False, mcp_server=True)
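With mcp_server=True, recent Gradio (installed with the mcp extra, pip install "gradio[mcp]") serves an MCP endpoint alongside the web UI, by default at /gradio_api/mcp/sse, and each documented function above becomes a tool whose description is taken from its docstring. A hedged sketch of listing those tools with the reference MCP Python SDK; the host URL is a placeholder:

    import asyncio
    from mcp import ClientSession
    from mcp.client.sse import sse_client

    async def main():
        url = "https://example-space.hf.space/gradio_api/mcp/sse"  # placeholder host
        async with sse_client(url) as (read, write):
            async with ClientSession(read, write) as session:
                await session.initialize()
                tools = await session.list_tools()
                print([tool.name for tool in tools.tools])

    asyncio.run(main())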