From caf873cf43acab029bdbd60514241cb9d5758995 Mon Sep 17 00:00:00 2001
From: ylzz1997
Date: Mon, 17 Apr 2023 18:21:00 +0800
Subject: [PATCH] Update threshold parameter

---
 inference/infer_tool.py | 15 +++++++++------
 inference_main.py       |  7 ++++++-
 utils.py                |  4 ++--
 webUI.py                | 13 +++++++------
 4 files changed, 24 insertions(+), 15 deletions(-)

diff --git a/inference/infer_tool.py b/inference/infer_tool.py
index d42ed273..91561cfb 100644
--- a/inference/infer_tool.py
+++ b/inference/infer_tool.py
@@ -152,12 +152,12 @@ def load_model(self):

-    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling):
+    def get_unit_f0(self, in_path, tran, cluster_infer_ratio, speaker, f0_filter ,F0_mean_pooling,cr_threshold=0.05):

         wav, sr = librosa.load(in_path, sr=self.target_sample)

         if F0_mean_pooling == True:
-            f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev)
+            f0, uv = utils.compute_f0_uv_torchcrepe(torch.FloatTensor(wav), sampling_rate=self.target_sample, hop_length=self.hop_size,device=self.dev,cr_threshold = cr_threshold)
             if f0_filter and sum(f0) == 0:
                 raise F0FilterException("No voice detected")
         f0 = torch.FloatTensor(list(f0))
@@ -193,7 +193,8 @@ def infer(self, speaker, tran, raw_path,
              noice_scale=0.4,
              f0_filter=False,
              F0_mean_pooling=False,
-             enhancer_adaptive_key = 0
+             enhancer_adaptive_key = 0,
+             cr_threshold = 0.05
              ):

        speaker_id = self.spk2id.__dict__.get(speaker)
@@ -201,7 +202,7 @@ def infer(self, speaker, tran, raw_path,
            if len(self.spk2id.__dict__) >= speaker:
                speaker_id = speaker
        sid = torch.LongTensor([int(speaker_id)]).to(self.dev).unsqueeze(0)
-        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling)
+        c, f0, uv = self.get_unit_f0(raw_path, tran, cluster_infer_ratio, speaker, f0_filter,F0_mean_pooling,cr_threshold=cr_threshold)
        if "half" in self.net_g_path and torch.cuda.is_available():
            c = c.half()
        with torch.no_grad():
@@ -245,7 +246,8 @@ def slice_inference(self,
                        lg_num=0,
                        lgr_num =0.75,
                        F0_mean_pooling = False,
-                        enhancer_adaptive_key = 0
+                        enhancer_adaptive_key = 0,
+                        cr_threshold = 0.05
                        ):
        wav_path = raw_audio_path
        chunks = slicer.cut(wav_path, db_thresh=slice_db)
@@ -285,7 +287,8 @@ def slice_inference(self,
                                                auto_predict_f0=auto_predict_f0,
                                                noice_scale=noice_scale,
                                                F0_mean_pooling = F0_mean_pooling,
-                                                enhancer_adaptive_key = enhancer_adaptive_key
+                                                enhancer_adaptive_key = enhancer_adaptive_key,
+                                                cr_threshold = cr_threshold
                                                )
                _audio = out_audio.cpu().numpy()
                pad_len = int(self.target_sample * pad_seconds)
diff --git a/inference_main.py b/inference_main.py
index eabca1ae..7f9bb507 100644
--- a/inference_main.py
+++ b/inference_main.py
@@ -65,6 +65,9 @@ def main():
                        help='Proportion of cross length retention, range (0-1]. After forced slicing, the beginning and end of each segment need to be discarded.')
    parser.add_argument('-eak', '--enhancer_adaptive_key', type=int, default=0,
                        help='Adapt the enhancer to a higher range of sound. The unit is the semitones, default 0.')
+    parser.add_argument('-ft', '--F0_filter_threshold', type=float, default=0.05,
+                        help='F0 filtering threshold: valid only when f0_mean_pooling is enabled. Values range from 0 to 1. Reducing this value lowers the probability of going out of tune, but increases muteness.')
+
    args = parser.parse_args()
@@ -83,6 +86,7 @@ def main():
    F0_mean_pooling = args.f0_mean_pooling
    enhance = args.enhance
    enhancer_adaptive_key = args.enhancer_adaptive_key
+    cr_threshold = args.F0_filter_threshold

    svc_model = Svc(args.model_path, args.config_path, args.device, args.cluster_model_path,enhance)
    infer_tool.mkdir(["raw", "results"])
@@ -132,7 +136,8 @@ def main():
                                                auto_predict_f0=auto_predict_f0,
                                                noice_scale=noice_scale,
                                                F0_mean_pooling = F0_mean_pooling,
-                                                enhancer_adaptive_key = enhancer_adaptive_key
+                                                enhancer_adaptive_key = enhancer_adaptive_key,
+                                                cr_threshold = cr_threshold
                                                )
                _audio = out_audio.cpu().numpy()
                pad_len = int(svc_model.target_sample * pad_seconds)
diff --git a/utils.py b/utils.py
index 775abed9..326a6ef8 100644
--- a/utils.py
+++ b/utils.py
@@ -80,7 +80,7 @@ def normalize_f0(f0, x_mask, uv, random_scale=True):
        exit(0)
    return f0_norm * x_mask

-def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512,device=None):
+def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_length=512,device=None,cr_threshold=0.05):
    from modules.crepe import CrepePitchExtractor
    x = wav_numpy
    if p_len is None:
@@ -90,7 +90,7 @@ def compute_f0_uv_torchcrepe(wav_numpy, p_len=None, sampling_rate=44100, hop_len
    f0_min = 50
    f0_max = 1100

-    F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device)
+    F0Creper = CrepePitchExtractor(hop_length=hop_length,f0_min=f0_min,f0_max=f0_max,device=device,threshold=cr_threshold)
    f0,uv = F0Creper(x[None,:].float(),sampling_rate,pad_to=p_len)
    return f0,uv
diff --git a/webUI.py b/webUI.py
index 7cec50dd..499ff0ba 100644
--- a/webUI.py
+++ b/webUI.py
@@ -106,7 +106,7 @@ def modelUnload():
        return sid.update(choices = [],value=""),"模型卸载完毕!"
-def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key):
+def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
    global model
    try:
        if input_audio is None:
@@ -120,7 +120,7 @@ def vc_fn(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise
            audio = librosa.to_mono(audio.transpose(1, 0))
        temp_path = "temp.wav"
        soundfile.write(temp_path, audio, sampling_rate, format="wav")
-        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key)
+        _audio = model.slice_inference(temp_path, sid, vc_transform, slice_db, cluster_ratio, auto_f0, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
        model.clear_empty()
        os.remove(temp_path)
        #构建保存文件的路径,并保存到results文件夹内
@@ -166,7 +166,7 @@ def tts_func(_text,_rate,_voice):
def text_clear(text):
    return re.sub(r"[\n\,\(\) ]", "", text)

-def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key):
+def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold):
    #使用edge-tts把文字转成音频
    text2tts=text_clear(text2tts)
    output_file=tts_func(text2tts,tts_rate,tts_voice)
@@ -184,7 +184,7 @@ def vc_fn2(sid, input_audio, vc_transform, auto_f0,cluster_ratio, slice_db, nois
    sample_rate, data=gr_pu.audio_from_file(save_path2)
    vc_input=(sample_rate, data)
-    a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key)
+    a,b=vc_fn(sid, vc_input, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold)
    os.remove(output_file)
    os.remove(save_path2)
    return a,b
@@ -242,6 +242,7 @@ def debug_change():
                    lg_num = gr.Number(label="两端音频切片的交叉淡入长度,如果自动切片后出现人声不连贯可调整该数值,如果连贯建议采用默认值0,注意,该设置会影响推理速度,单位为秒/s", value=0)
                    lgr_num = gr.Number(label="自动音频切片后,需要舍弃每段切片的头尾。该参数设置交叉长度保留的比例,范围0-1,左开右闭", value=0.75)
                    enhancer_adaptive_key = gr.Number(label="使增强器适应更高的音域(单位为半音数)|默认为0", value=0)
+                    cr_threshold = gr.Number(label="F0过滤阈值,只有启动f0_mean_pooling时有效. 数值范围从0-1. 降低该值可减少跑调概率,但会增加哑音", value=0.05)
                with gr.Tabs():
                    with gr.TabItem("音频转音频"):
                        vc_input3 = gr.Audio(label="选择音频")
@@ -299,8 +300,8 @@ def debug_change():
        WebUI设置
        """)
        debug_button = gr.Checkbox(label="Debug模式,如果向社区反馈BUG需要打开,打开后控制台可以显示具体错误提示", value=debug)
-        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2])
-        vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key], [vc_output1, vc_output2])
+        vc_submit.click(vc_fn, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
+        vc_submit2.click(vc_fn2, [sid, vc_input3, vc_transform,auto_f0,cluster_ratio, slice_db, noise_scale,pad_seconds,cl_num,lg_num,lgr_num,text2tts,tts_rate,tts_voice,F0_mean_pooling,enhancer_adaptive_key,cr_threshold], [vc_output1, vc_output2])
        debug_button.change(debug_change,[],[])
        model_load_button.click(modelAnalysis,[model_path,config_path,cluster_model_path,device,enhance],[sid,sid_output])
        model_unload_button.click(modelUnload,[],[sid,sid_output])
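
Usage sketch (editorial note, not part of the patch): cr_threshold is forwarded through get_unit_f0 and compute_f0_uv_torchcrepe into CrepePitchExtractor as its voicing-confidence threshold, so it only takes effect on the F0_mean_pooling (torchcrepe) path. The snippet below is a minimal illustration of calling the updated API; the checkpoint, config, speaker name, and audio paths are hypothetical, and any defaults not visible in this diff may differ by repository version.

    # Sketch: Svc.slice_inference with the new cr_threshold parameter.
    import soundfile
    from inference.infer_tool import Svc

    model = Svc("logs/44k/G_30400.pth",   # hypothetical generator checkpoint
                "configs/config.json",    # hypothetical config path
                None,                     # device: None lets Svc choose cuda/cpu
                "",                       # no cluster model in this sketch
                False)                    # NSF-HiFiGAN enhancer disabled
    audio = model.slice_inference(
        "raw/example.wav",  # hypothetical input clip
        "speaker0",         # hypothetical speaker name from the config
        0,                  # transpose, in semitones
        -40,                # slice_db threshold for silence slicing
        0,                  # cluster_infer_ratio
        False,              # auto_predict_f0
        0.4,                # noice_scale (spelled as in the code base)
        0.5,                # pad_seconds
        0, 0, 0.75,         # cl_num, lg_num, lgr_num
        F0_mean_pooling=True,  # cr_threshold is only consulted on this path
        cr_threshold=0.05,     # lower = less off-pitch, but more muted frames
    )
    soundfile.write("results/example_out.wav", audio, model.target_sample)

    # Equivalent CLI flag added by this patch (other inference_main.py flags omitted):
    #   python inference_main.py ... --f0_mean_pooling -ft 0.05

Design note: 0.05 is the default carried through every signature above; raising it toward torchcrepe's usual 0.21 marks more frames as voiced (fewer mute gaps, higher risk of off-pitch frames), while lowering it does the opposite, matching the trade-off described in the -ft help text.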