import gradio as gr
from gradio_client import Client
import torch
import os
from scipy.io.wavfile import write

hf_token = os.environ.get('HF_TOKEN')

#splt_client = Client("https://fffiloni-splittrack2musicgen.hf.space/")

def split_process(audio, chosen_out_track):
    # audio is a Gradio (sample_rate, numpy_data) tuple; write it to disk,
    # then separate it into stems with Demucs (mdx_extra_q model).
    os.makedirs("out", exist_ok=True)
    write('test.wav', audio[0], audio[1])
    os.system("python3 -m demucs.separate -n mdx_extra_q -j 4 test.wav -o out")
    #return "./out/mdx_extra_q/test/vocals.wav","./out/mdx_extra_q/test/bass.wav","./out/mdx_extra_q/test/drums.wav","./out/mdx_extra_q/test/other.wav"
    if chosen_out_track == "vocals":
        return "./out/mdx_extra_q/test/vocals.wav"
    elif chosen_out_track == "bass":
        return "./out/mdx_extra_q/test/bass.wav"
    elif chosen_out_track == "drums":
        return "./out/mdx_extra_q/test/drums.wav"
    elif chosen_out_track == "other":
        return "./out/mdx_extra_q/test/other.wav"
    elif chosen_out_track == "all-in":
        return "test.wav"

from transformers import pipeline
from transformers.pipelines.audio_utils import ffmpeg_read
import tempfile

MODEL_NAME = "openai/whisper-large-v3-turbo"
BATCH_SIZE = 8
FILE_LIMIT_MB = 1000
YT_LENGTH_LIMIT_S = 3600  # limit to 1 hour YouTube files

device = 0 if torch.cuda.is_available() else "cpu"

pipe = pipeline(
    task="automatic-speech-recognition",
    model=MODEL_NAME,
    chunk_length_s=30,
    device=device,
    token=hf_token,
)

#@spaces.GPU
def transcribe(inputs, task):
    if inputs is None:
        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")

    text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
    return text

import re

def format_lyrics(text):
    # Use regex to find parts that start with a capital letter and insert a newline
    formatted_text = re.sub(r'(?<!^)(?=[A-Z])', '\n', text)
    return formatted_text
Send the audio file of your favorite song, and get the lyrics!
Under the hood, we split the audio file into stems, keep the vocals track, and send it to Whisper for transcription.
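For illustration, here is a minimal sketch of that flow, assuming the functions defined above are in scope; the file name my_song.wav is a placeholder:

from scipy.io import wavfile

sample_rate, data = wavfile.read("my_song.wav")             # load the song as (rate, samples)
vocals_path = split_process((sample_rate, data), "vocals")  # -> ./out/mdx_extra_q/test/vocals.wav
lyrics = transcribe(vocals_path, task="transcribe")         # Whisper transcription of the vocals stem
print(format_lyrics(lyrics))                                # one lyric line per capitalized phrase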