import sys, os

sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from vits.models import SynthesizerInfer
from omegaconf import OmegaConf
import torchcrepe
import torch
import io
import gradio as gr
import librosa
import numpy as np
import soundfile
import logging

logging.getLogger('numba').setLevel(logging.WARNING)
logging.getLogger('markdown_it').setLevel(logging.WARNING)
logging.getLogger('urllib3').setLevel(logging.WARNING)
logging.getLogger('matplotlib').setLevel(logging.WARNING)
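
# Copy only the generator weights ("model_g") from a so-vits-svc checkpoint
# into the given model instance, key by key.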
def load_svc_model(checkpoint_path, model):
    assert os.path.isfile(checkpoint_path)
    checkpoint_dict = torch.load(checkpoint_path, map_location="cpu")
    saved_state_dict = checkpoint_dict["model_g"]
    state_dict = model.state_dict()
    new_state_dict = {}
    for k, v in state_dict.items():
        new_state_dict[k] = saved_state_dict[k]
    model.load_state_dict(new_state_dict)
    return model
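
# Estimate F0 with torchcrepe at 16 kHz using a 320-sample (20 ms) hop, then
# repeat each frame so the pitch track matches the model's 10 ms feature rate.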
def compute_f0_nn(filename, device):
    audio, sr = librosa.load(filename, sr=16000)
    assert sr == 16000
    # Load audio
    audio = torch.tensor(np.copy(audio))[None]
    # Here we'll use a 20 millisecond hop length
    hop_length = 320
    # Provide a sensible frequency range for your domain (upper limit is 2006 Hz)
    # This would be a reasonable range for speech
    fmin = 50
    fmax = 1000
    # Select a model capacity: "tiny" or "full"
    model = "full"
    # Pick a batch size that doesn't cause memory errors on your gpu
    batch_size = 512
    # Compute pitch and periodicity on the selected device
    pitch, periodicity = torchcrepe.predict(
        audio,
        sr,
        hop_length,
        fmin,
        fmax,
        model,
        batch_size=batch_size,
        device=device,
        return_periodicity=True,
    )
    pitch = np.repeat(pitch, 2, -1)  # 320 -> 160 * 2
    periodicity = np.repeat(periodicity, 2, -1)  # 320 -> 160 * 2
    # CREPE was not trained on silent audio, so smooth out spurious values
    # produced in silent regions
    periodicity = torchcrepe.filter.median(periodicity, 9)
    pitch = torchcrepe.filter.mean(pitch, 3)
    # pitch[periodicity < 0.1] = 0
    pitch = pitch.squeeze(0)
    return pitch
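
# Start-up: pick a device, read the model config, build the synthesizer, and
# load the pretrained sovits5.0_bigvgan_mix checkpoint once at launch.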
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hp = OmegaConf.load("configs/base.yaml")
model = SynthesizerInfer(
    hp.data.filter_length // 2 + 1,
    hp.data.segment_size // hp.data.hop_length,
    hp)
load_svc_model("vits_pretrain/sovits5.0_bigvgan_mix.pth", model)
model.eval()
model.to(device)
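
# Full conversion pipeline for one utterance: extract Whisper PPG and HuBERT
# content vectors with the repo's helper scripts, align them with the F0 track,
# and synthesize audio for the target speaker chunk by chunk.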
def svc_change(argswave, argsspk):
    argsppg = "svc_tmp.ppg.npy"
    os.system(f"python whisper/inference.py -w {argswave} -p {argsppg}")
    argsvec = "svc_tmp.vec.npy"
    os.system(f"python hubert/inference.py -w {argswave} -v {argsvec}")

    spk = np.load(argsspk)
    spk = torch.FloatTensor(spk)

    ppg = np.load(argsppg)
    ppg = np.repeat(ppg, 2, 0)  # 320 PPG -> 160 * 2
    ppg = torch.FloatTensor(ppg)

    vec = np.load(argsvec)
    vec = np.repeat(vec, 2, 0)  # 320 vec -> 160 * 2
    vec = torch.FloatTensor(vec)

    pit = compute_f0_nn(argswave, device)
    pit = torch.FloatTensor(pit)

    # Trim all feature streams to the same number of frames
    len_pit = pit.size()[0]
    len_vec = vec.size()[0]
    len_ppg = ppg.size()[0]
    len_min = min(len_pit, len_vec)
    len_min = min(len_min, len_ppg)
    pit = pit[:len_min]
    vec = vec[:len_min, :]
    ppg = ppg[:len_min, :]
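    # Inference only from here on: turn the F0 contour into a source/excitation
    # signal with pitch2source, then synthesize the waveform in chunks.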
    with torch.no_grad():
        spk = spk.unsqueeze(0).to(device)
        source = pit.unsqueeze(0).to(device)
        source = model.pitch2source(source)

        hop_size = hp.data.hop_length
        all_frame = len_min
        hop_frame = 10
        out_chunk = 2500  # 2500 frames ~= 25 s of output per chunk
        out_index = 0
        out_audio = []
        has_audio = False
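
        # Synthesize in chunks of out_chunk frames, adding hop_frame frames of
        # context on each side where available and trimming the corresponding
        # samples from the output to avoid seams at chunk boundaries.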
        while (out_index + out_chunk < all_frame):
            has_audio = True
            if (out_index == 0):  # start frame
                cut_s = out_index
                cut_s_out = 0
            else:
                cut_s = out_index - hop_frame
                cut_s_out = hop_frame * hop_size
            if (out_index + out_chunk + hop_frame > all_frame):  # end frame
                cut_e = out_index + out_chunk
                cut_e_out = None  # no trailing context to trim; keep to the end
            else:
                cut_e = out_index + out_chunk + hop_frame
                cut_e_out = -1 * hop_frame * hop_size
            sub_ppg = ppg[cut_s:cut_e, :].unsqueeze(0).to(device)
            sub_vec = vec[cut_s:cut_e, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:cut_e].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([cut_e - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:cut_e * hop_size].to(device)
            sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()
            sub_out = sub_out[cut_s_out:cut_e_out]
            out_audio.extend(sub_out)
            out_index = out_index + out_chunk
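
        # Handle the frames left over after the last full chunk (or the whole
        # clip if it is shorter than one chunk).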
        if (out_index < all_frame):
            if (has_audio):
                cut_s = out_index - hop_frame
                cut_s_out = hop_frame * hop_size
            else:
                cut_s = 0
                cut_s_out = 0
            sub_ppg = ppg[cut_s:, :].unsqueeze(0).to(device)
            sub_vec = vec[cut_s:, :].unsqueeze(0).to(device)
            sub_pit = pit[cut_s:].unsqueeze(0).to(device)
            sub_len = torch.LongTensor([all_frame - cut_s]).to(device)
            sub_har = source[:, :, cut_s * hop_size:].to(device)
            sub_out = model.inference(sub_ppg, sub_vec, sub_pit, spk, sub_len, sub_har)
            sub_out = sub_out[0, 0].data.cpu().detach().numpy()
            sub_out = sub_out[cut_s_out:]
            out_audio.extend(sub_out)

    out_audio = np.asarray(out_audio)
    return out_audio
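
# Gradio callback: normalize the uploaded audio to mono float32 at 16 kHz,
# cap it at 100 seconds, write it to a temporary wav, and run the conversion
# with the .npy speaker file for the selected singer.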
def svc_main(sid, input_audio):
    if input_audio is None:
        return "You need to upload an audio file", None
    sampling_rate, audio = input_audio
    audio = (audio / np.iinfo(audio.dtype).max).astype(np.float32)
    if len(audio.shape) > 1:
        audio = librosa.to_mono(audio.transpose(1, 0))
    if sampling_rate != 16000:
        audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=16000)
    if (len(audio) > 16000 * 100):  # limit input to 100 seconds
        audio = audio[:16000 * 100]
    wav_path = "temp.wav"
    soundfile.write(wav_path, audio, 16000, format="wav")
    out_audio = svc_change(wav_path, f"configs/singers/singer00{sid}.npy")
    return "Success", (32000, out_audio)
app = gr.Blocks()
with app:
    with gr.Tabs():
        with gr.TabItem("sovits 5.0"):
            gr.Markdown(value="""
                Based on the open-source Multi-Singer dataset:
                https://github.com/Multi-Singer/Multi-Singer.github.io
                Final version:
                1. mix_encoder (whisper + hubert): enables cross-language conversion and training on plain spoken (non-singing) data
                2. fixes F0 artifacts
                """)
            sid = gr.Dropdown(label="Speaker", choices=["22", "33", "47", "51"], value="47")
            vc_input3 = gr.Audio(label="Upload audio")
            vc_submit = gr.Button("Convert", variant="primary")
            vc_output1 = gr.Textbox(label="Status")
            vc_output2 = gr.Audio(label="Converted audio")
            vc_submit.click(svc_main, [sid, vc_input3], [vc_output1, vc_output2])
app.launch()