Spaces:
Runtime error
Runtime error
| import json | |
| import os | |
| import subprocess | |
| from pathlib import Path | |
| import gradio as gr | |
| import librosa | |
| import numpy as np | |
| import torch | |
| from demucs.apply import apply_model | |
| from demucs.pretrained import DEFAULT_MODEL, get_model | |
| from huggingface_hub import hf_hub_download, list_repo_files | |
| from so_vits_svc_fork.hparams import HParams | |
| from so_vits_svc_fork.inference.core import Svc | |
###################################################################
# REPLACE THESE VALUES TO CHANGE THE MODEL REPO/CKPT NAME/SETTINGS
###################################################################
# The Hugging Face Hub repo ID
repo_id = "dog/kanye"
# If None, uses the latest ckpt in the repo
ckpt_name = None
# If None, uses "kmeans.pt" if it exists in the repo
cluster_model_name = None
# Set the default f0 type to use - use the one the model was trained on.
# The default for so-vits-svc-fork is "dio".
# Options: "crepe", "crepe-tiny", "parselmouth", "dio", "harvest"
default_f0_method = "crepe"
# The default ratio of cluster inference to SVC inference.
# If cluster_model_name is not found in the repo, this is set to 0.
default_cluster_infer_ratio = 0.5
# Limit on the duration of audio at inference time. Increase it if you can.
# In the parent app, the limit is set to 30 seconds with an env var.
# If you didn't set the env var and you go OOM, try changing 9e9 to <=300ish.
# Parsed via float() first so values such as "30.0" or "1e2" are accepted too.
duration_limit = int(float(os.environ.get("MAX_DURATION_SECONDS", 9e9)))
###################################################################
# List the repo contents once and reuse the result for every lookup below,
# instead of hitting the Hub again for each membership test.
repo_files = list_repo_files(repo_id)

# Figure out the latest generator by taking the highest-numbered one.
# Ex. if the repo has: G_0.pth, G_100.pth, G_200.pth, we'd use G_200.pth
if ckpt_name is None:
    generator_steps = [
        int(Path(name).stem.split("_")[1])
        for name in repo_files
        if name.startswith("G_") and name.endswith(".pth")
    ]
    if not generator_steps:
        # Fail with an actionable message rather than a bare IndexError.
        raise FileNotFoundError(f"No generator checkpoint (G_*.pth) found in {repo_id}")
    ckpt_name = f"G_{max(generator_steps)}.pth"

# The cluster model is optional: fall back to no clustering when it is absent.
cluster_model_name = cluster_model_name or "kmeans.pt"
if cluster_model_name in repo_files:
    print(f"Found Cluster model - Downloading {cluster_model_name} from {repo_id}")
    cluster_model_path = hf_hub_download(repo_id, cluster_model_name)
else:
    print(f"Could not find {cluster_model_name} in {repo_id}. Using None")
    cluster_model_path = None
# Without a cluster model the ratio has to be 0.
default_cluster_infer_ratio = default_cluster_infer_ratio if cluster_model_path else 0

generator_path = hf_hub_download(repo_id, ckpt_name)
config_path = hf_hub_download(repo_id, "config.json")
hparams = HParams(**json.loads(Path(config_path).read_text()))
speakers = list(hparams.spk.keys())
device = "cuda" if torch.cuda.is_available() else "cpu"
model = Svc(net_g_path=generator_path, config_path=config_path, device=device, cluster_model_path=cluster_model_path)
demucs_model = get_model(DEFAULT_MODEL)
def extract_vocal_demucs(model, filename, sr=44100, device=None, shifts=1, split=True, overlap=0.25, jobs=0):
    """Separate an audio file into a vocal track and an instrumental track.

    Args:
        model: Demucs separation model passed to ``apply_model``.
        filename: Path of the audio file to load.
        sr: Sample rate the file is resampled to when loading.
        device: Forwarded to ``apply_model`` as ``device``.
        shifts: Forwarded to ``apply_model`` as ``shifts``.
        split: Forwarded to ``apply_model`` as ``split``.
        overlap: Forwarded to ``apply_model`` as ``overlap``.
        jobs: Forwarded to ``apply_model`` as ``num_workers``.

    Returns:
        A ``(vocal_wav, instrumental_wav)`` tuple of NumPy arrays. The vocal
        track is downmixed to mono; the instrumental track is the transposed
        sum of all non-vocal stems.
    """
    wav, sr = librosa.load(filename, mono=False, sr=sr)
    wav = torch.tensor(wav)
    if wav.ndim == 1:
        # A mono file loads as a 1-D array; duplicate it across channels so the
        # per-channel statistics and the model input below have a channel axis.
        # NOTE(review): assumes the model exposes `audio_channels` (2 if absent).
        wav = wav[None].repeat(getattr(model, "audio_channels", 2), 1)

    # Normalize with the statistics of the channel-averaged signal, and undo
    # the normalization on the separated stems afterwards.
    ref = wav.mean(0)
    wav = (wav - ref.mean()) / ref.std()
    sources = apply_model(
        model, wav[None], device=device, shifts=shifts, split=split, overlap=overlap, progress=True, num_workers=jobs
    )[0]
    sources = sources * ref.std() + ref.mean()

    # Look the vocals stem up by name when the model lists its sources; fall
    # back to the last stem, which is where the default model keeps vocals.
    stem_names = list(getattr(model, "sources", ()))
    vocal_idx = stem_names.index("vocals") if "vocals" in stem_names else sources.shape[0] - 1

    vocal_wav = sources[vocal_idx]
    # Same normalization the so-vits model requires: only scale down when the
    # (slightly padded) peak would exceed 1.
    vocal_wav = vocal_wav / max(1.01 * vocal_wav.abs().max(), 1)
    vocal_wav = librosa.to_mono(vocal_wav.numpy())

    # Everything that is not the vocals stem makes up the instrumental.
    other_stems = torch.cat([sources[:vocal_idx], sources[vocal_idx + 1:]])
    instrumental_wav = other_stems.sum(0).numpy().T
    return vocal_wav, instrumental_wav
def download_youtube_clip(
    video_identifier,
    start_time,
    end_time,
    output_filename,
    num_attempts=5,
    url_base="https://www.youtube.com/watch?v=",
    quiet=False,
    force=False,
):
    """Download a section of a video's audio as WAV using ``yt-dlp``.

    Args:
        video_identifier: Video ID appended to ``url_base``.
        start_time: Start of the section, as accepted by ``--download-sections``.
        end_time: End of the section, as accepted by ``--download-sections``.
        output_filename: Output path handed to ``yt-dlp -o``.
        num_attempts: Maximum number of download attempts (at least one is made).
        url_base: URL prefix the identifier is appended to.
        quiet: If true, pass ``--quiet --no-warnings`` to ``yt-dlp``.
        force: If true, delete an existing output file and download again.

    Returns:
        ``Path`` of the output file, or ``None`` if every attempt failed,
        ``yt-dlp`` could not be started, or no file was produced.
    """
    output_path = Path(output_filename)
    if output_path.exists():
        if not force:
            return output_path
        output_path.unlink()

    # Build an argument list and run without a shell, so caller-supplied
    # values can never be interpreted as shell syntax.
    command = ["yt-dlp"]
    if quiet:
        command += ["--quiet", "--no-warnings"]
    command += [
        "-x",
        "--audio-format",
        "wav",
        "-f",
        "bestaudio",
        "-o",
        str(output_filename),
        "--download-sections",
        f"*{start_time}-{end_time}",
        f"{url_base}{video_identifier}",
    ]

    for _ in range(max(1, num_attempts)):
        try:
            subprocess.check_output(command, stderr=subprocess.STDOUT)
        except subprocess.CalledProcessError:
            # yt-dlp ran but failed; try again until attempts run out.
            continue
        except OSError:
            # yt-dlp could not be launched at all; retrying cannot help.
            return None
        break
    else:
        return None

    return output_path if output_path.exists() else None
def predict(
    speaker,
    audio,
    transpose: int = 0,
    auto_predict_f0: bool = False,
    cluster_infer_ratio: float = 0,
    noise_scale: float = 0.4,
    f0_method: str = "crepe",
    db_thresh: int = -40,
    pad_seconds: float = 0.5,
    chunk_seconds: float = 0.5,
    absolute_thresh: bool = False,
):
    """Convert the given audio file to the target speaker's voice.

    The file is loaded at the model's target sample rate, limited to
    ``duration_limit`` seconds, and run through ``model.infer_silence``.
    All remaining arguments are forwarded to that call unchanged.

    Returns:
        A ``(sample_rate, audio)`` tuple, where ``sample_rate`` is
        ``model.target_sample`` and ``audio`` is the inference result.
    """
    waveform, _ = librosa.load(audio, sr=model.target_sample, duration=duration_limit)

    # Gather the inference settings in one place before handing them over.
    inference_options = {
        "speaker": speaker,
        "transpose": transpose,
        "auto_predict_f0": auto_predict_f0,
        "cluster_infer_ratio": cluster_infer_ratio,
        "noise_scale": noise_scale,
        "f0_method": f0_method,
        "db_thresh": db_thresh,
        "pad_seconds": pad_seconds,
        "chunk_seconds": chunk_seconds,
        "absolute_thresh": absolute_thresh,
    }
    converted = model.infer_silence(waveform.astype(np.float32), **inference_options)
    return model.target_sample, converted
# ID of the Space itself; used to build the "Duplicate Space" link below.
SPACE_ID = "nateraw/voice-cloning"

# Markdown/HTML shown above the interface. The "Train Your Own" link uses the
# Colab badge as its label; with an empty label the link renders as nothing.
description = f"""
# Attention - This Space may be slow in the shared UI if there is a long queue. To speed it up, you can duplicate and use it with a paid private T4 GPU.
<center><a class="duplicate-button" style="display:inline-block" target="_blank" href="https://huggingface.co/spaces/{SPACE_ID}?duplicate=true"><img src="https://img.shields.io/badge/-Duplicate%20Space-blue?labelColor=white&style=flat&logo=&logoWidth=14" alt="Duplicate Space"></a></center>
#### This app uses models trained with [so-vits-svc-fork](https://github.com/voicepaw/so-vits-svc-fork) to clone a voice. Model currently being used is https://hf.co/{repo_id}. To change the model being served, duplicate the space and update the `repo_id`/other settings in `app.py`.
#### Train Your Own: [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/nateraw/voice-cloning/blob/main/training_so_vits_svc_fork.ipynb)
""".strip()

# HTML shown below the interface.
article = """
<p style='text-align: center'>
<a href='https://github.com/voicepaw/so-vits-svc-fork' target='_blank'>Github Repo</a>
</p>
""".strip()
def _build_interface(audio_source):
    """Build the voice-cloning interface for one audio input source.

    Both tabs share every control and differ only in where the source audio
    comes from, so they are built from this single definition.

    Args:
        audio_source: Value passed as ``source`` to ``gr.Audio``
            ("microphone" or "upload").

    Returns:
        A ``gr.Interface`` wired to ``predict``.
    """
    return gr.Interface(
        predict,
        inputs=[
            gr.Dropdown(speakers, value=speakers[0], label="Target Speaker"),
            gr.Audio(type="filepath", source=audio_source, label="Source Audio"),
            gr.Slider(-12, 12, value=0, step=1, label="Transpose (Semitones)"),
            gr.Checkbox(False, label="Auto Predict F0"),
            gr.Slider(0.0, 1.0, value=default_cluster_infer_ratio, step=0.1, label="cluster infer ratio"),
            gr.Slider(0.0, 1.0, value=0.4, step=0.1, label="noise scale"),
            gr.Dropdown(
                choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                value=default_f0_method,
                label="f0 method",
            ),
        ],
        outputs="audio",
        title="Voice Cloning",
        description=description,
        article=article,
    )


interface_mic = _build_interface("microphone")
interface_file = _build_interface("upload")
interface = gr.TabbedInterface(
    [interface_mic, interface_file],
    ["Clone From Mic", "Clone From File"],
)

if __name__ == "__main__":
    interface.launch()