From babdc2f7fd9b24903d56e0a587b343aad21df03e Mon Sep 17 00:00:00 2001
From: Blaise
Date: Sun, 22 Dec 2024 22:48:27 +0100
Subject: [PATCH] improve preprocess readability + soxr_vhq resample

---
 requirements.txt                   |   1 +
 rvc/train/preprocess/preprocess.py | 126 ++++++++++++++++++-----------
 2 files changed, 82 insertions(+), 45 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 78cf0a35..5167f376 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,6 +16,7 @@ soundfile==0.12.1
 noisereduce
 pedalboard
 stftpitchshift
+soxr
 
 # Machine learning and deep learning
 omegaconf>=2.0.6; sys_platform == 'darwin'
diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py
index b117a91a..71c5c899 100644
--- a/rvc/train/preprocess/preprocess.py
+++ b/rvc/train/preprocess/preprocess.py
@@ -11,6 +11,7 @@ import librosa
 import multiprocessing
 import noisereduce as nr
+import soxr
 
 now_directory = os.getcwd()
 sys.path.append(now_directory)
 
@@ -18,22 +19,31 @@ from rvc.lib.utils import load_audio
 from rvc.train.preprocess.slicer import Slicer
 
-# Remove colab logs
 import logging
 
 logging.getLogger("numba.core.byteflow").setLevel(logging.WARNING)
 logging.getLogger("numba.core.ssa").setLevel(logging.WARNING)
 logging.getLogger("numba.core.interpreter").setLevel(logging.WARNING)
 
-# Constants
 OVERLAP = 0.3
 MAX_AMPLITUDE = 0.9
 ALPHA = 0.75
 HIGH_PASS_CUTOFF = 48
 SAMPLE_RATE_16K = 16000
+RES_TYPE = "soxr_vhq"
+
 
 class PreProcess:
     def __init__(self, sr: int, exp_dir: str, per: float):
+        self.sr = sr
+        self.per = per
+        self.exp_dir = exp_dir
+        self.device = "cpu"
+        self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
+        self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
+        os.makedirs(self.gt_wavs_dir, exist_ok=True)
+        os.makedirs(self.wavs16k_dir, exist_ok=True)
+
         self.slicer = Slicer(
             sr=sr,
             threshold=-42,
@@ -42,17 +52,9 @@ def __init__(self, sr: int, exp_dir: str, per: float):
             hop_size=15,
             max_sil_kept=500,
         )
-        self.sr = sr
         self.b_high, self.a_high = signal.butter(
             N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr
         )
-        self.per = per
-        self.exp_dir = exp_dir
-        self.device = "cpu"
-        self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
-        self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
-        os.makedirs(self.gt_wavs_dir, exist_ok=True)
-        os.makedirs(self.wavs16k_dir, exist_ok=True)
 
     def _normalize_audio(self, audio: np.ndarray):
         tmp_max = np.abs(audio).max()
@@ -76,7 +78,10 @@ def process_audio_segment(
             normalized_audio.astype(np.float32),
         )
         audio_16k = librosa.resample(
-            normalized_audio, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K
+            normalized_audio,
+            orig_sr=self.sr,
+            target_sr=SAMPLE_RATE_16K,
+            res_type=RES_TYPE,
         )
         wavfile.write(
             os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{idx1}.wav"),
@@ -84,27 +89,39 @@
             audio_16k.astype(np.float32),
         )
 
-    def simple_cut(self, audio: np.ndarray, sid: int, idx0: int, chunk_len: float, overlap_len: float):
-        chunk_length = int(self.sr * chunk_len)
-        overlap_length = int(self.sr * overlap_len)
-        i = 0
-        while i < len(audio):
-            chunk = audio[i:i + chunk_length]
-            if len(chunk) == chunk_length:
+    def simple_cut(
+        self,
+        audio: np.ndarray,
+        sid: int,
+        idx0: int,
+        chunk_len: float,
+        overlap_len: float,
+    ):
+        chunk_samples = int(self.sr * chunk_len)
+        overlap_samples = int(self.sr * overlap_len)
+        step = chunk_samples - overlap_samples
+        num_chunks = (len(audio) - chunk_samples) // step + 1
+        for i in range(num_chunks):
+            start = i * step
+            end = start + chunk_samples
+            if end <= len(audio):
+                chunk = audio[start:end]
+                file_index = i
                 # full SR for training
                 wavfile.write(
-                    os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav"),
+                    os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{file_index}.wav"),
                     self.sr,
                     chunk.astype(np.float32),
                 )
                 # 16KHz for feature extraction
-                chunk_16k = librosa.resample(chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K)
+                chunk_16k = librosa.resample(
+                    chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type=RES_TYPE
+                )
                 wavfile.write(
-                    os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav"),
+                    os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{file_index}.wav"),
                     SAMPLE_RATE_16K,
                     chunk_16k.astype(np.float32),
                 )
-            i += chunk_length - overlap_length
 
     def process_audio(
         self,
@@ -127,28 +144,38 @@
             audio = signal.lfilter(self.b_high, self.a_high, audio)
             audio = self._normalize_audio(audio)
             if noise_reduction:
-                audio = nr.reduce_noise(y=audio, sr=self.sr, prop_decrease=reduction_strength)
+                audio = nr.reduce_noise(
+                    y=audio,
+                    sr=self.sr,
+                    prop_decrease=reduction_strength,
+                    n_fft=2048,
+                    hop_length=512,
+                )
             if cut_preprocess == "Skip":
-                # no cutting
-                self.process_audio_segment(audio, sid, idx0, 0,)
+                self.process_audio_segment(audio, sid, idx0, 0)
             elif cut_preprocess == "Simple":
-                # simple
                 self.simple_cut(audio, sid, idx0, chunk_len, overlap_len)
             elif cut_preprocess == "Automatic":
+                segments = self.slicer.slice(audio)
                 idx1 = 0
-                # legacy
-                for audio_segment in self.slicer.slice(audio):
-                    i = 0
-                    while True:
-                        start = int(self.sr * (self.per - OVERLAP) * i)
-                        i += 1
-                        if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr:
-                            tmp_audio = audio_segment[start : start + int(self.per * self.sr)]
-                            self.process_audio_segment(tmp_audio, sid, idx0, idx1, )
+                for audio_segment in segments:
+                    segment_length = len(audio_segment)
+                    per_samples = int(self.sr * self.per)
+                    overlap_samples_segment = int(self.sr * OVERLAP)
+                    step = per_samples - overlap_samples_segment
+                    # round up and keep one extra window so the trailing
+                    # partial segment is reachable and short segments still emit
+                    num_sub_segments = max(1, (segment_length - per_samples + step - 1) // step + 1)
+
+                    for i in range(num_sub_segments):
+                        start = i * step
+                        end = start + per_samples
+                        if end <= segment_length:
+                            tmp_audio = audio_segment[start:end]
+                            self.process_audio_segment(tmp_audio, sid, idx0, idx1)
                             idx1 += 1
-                        else:
+                        elif start < segment_length:
                             tmp_audio = audio_segment[start:]
-                            self.process_audio_segment(tmp_audio, sid, idx0, idx1,)
+                            self.process_audio_segment(tmp_audio, sid, idx0, idx1)
                             idx1 += 1
                             break
 
@@ -156,6 +183,7 @@ def process_audio(
             print(f"Error processing audio: {error}")
         return audio_length
 
+
 def format_duration(seconds):
     hours = int(seconds // 3600)
     minutes = int((seconds % 3600) // 60)
@@ -182,9 +210,16 @@ def save_dataset_duration(file_path, dataset_duration):
 
 
 def process_audio_wrapper(args):
-    pp, file, cut_preprocess, process_effects, noise_reduction, reduction_strength, chunk_len, overlap_len = (
-        args
-    )
+    (
+        pp,
+        file,
+        cut_preprocess,
+        process_effects,
+        noise_reduction,
+        reduction_strength,
+        chunk_len,
+        overlap_len,
+    ) = args
     file_path, idx0, sid = file
     return pp.process_audio(
         file_path,
@@ -198,6 +233,7 @@ def process_audio_wrapper(args):
         overlap_len,
     )
 
+
 def preprocess_training_set(
     input_root: str,
     sr: int,
@@ -209,7 +245,7 @@ def preprocess_training_set(
     noise_reduction: bool,
     reduction_strength: float,
     chunk_len: float,
-    overlap_len: float, 
+    overlap_len: float,
 ):
     start_time = time.time()
     pp = PreProcess(sr, exp_dir, per)
@@ -230,7 +266,6 @@ def preprocess_training_set(
                 f'Speaker ID folder is expected to be integer, got "{os.path.basename(root)}" instead.'
             )
 
-    # print(f"Number of files: {len(files)}")
     audio_length = []
     with tqdm(total=len(files)) as pbar:
         with concurrent.futures.ProcessPoolExecutor(
@@ -256,15 +291,16 @@
             audio_length.append(future.result())
             pbar.update(1)
 
-    audio_length = sum(audio_length)
+    total_audio_length = sum(audio_length)
     save_dataset_duration(
-        os.path.join(exp_dir, "model_info.json"), dataset_duration=audio_length
+        os.path.join(exp_dir, "model_info.json"), dataset_duration=total_audio_length
     )
     elapsed_time = time.time() - start_time
     print(
-        f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(audio_length)} seconds of audio."
+        f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(total_audio_length)} seconds of audio."
     )
 
+
 if __name__ == "__main__":
     experiment_directory = str(sys.argv[1])
     input_root = str(sys.argv[2])
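
Editor's note: below is a minimal, standalone sketch (not part of the patch) of the two changes it makes: the fixed-step chunking arithmetic used by simple_cut(), and the "soxr_vhq" resampler that librosa.resample() now delegates to through the soxr package added to requirements.txt. The sample rate, chunk length, and overlap values here are illustrative assumptions, not values taken from the repository.

import librosa
import numpy as np
import soxr

SR = 40000          # assumed source sample rate for this sketch
SR_16K = 16000
CHUNK_LEN = 3.0     # seconds; stands in for the chunk_len argument
OVERLAP_LEN = 0.3   # seconds; stands in for the overlap_len argument

# Ten seconds of a 220 Hz sine as stand-in audio.
audio = np.sin(2 * np.pi * 220 * np.arange(SR * 10) / SR).astype(np.float32)

chunk_samples = int(SR * CHUNK_LEN)
overlap_samples = int(SR * OVERLAP_LEN)
step = chunk_samples - overlap_samples              # consecutive chunks share overlap_samples
num_chunks = (len(audio) - chunk_samples) // step + 1  # full-length chunks only, as in simple_cut

for i in range(num_chunks):
    chunk = audio[i * step : i * step + chunk_samples]
    # What the patch does: librosa hands soxr_* res_type strings to the soxr library.
    chunk_16k = librosa.resample(
        chunk, orig_sr=SR, target_sr=SR_16K, res_type="soxr_vhq"
    )
    # Equivalent direct call through python-soxr's own API.
    chunk_16k_direct = soxr.resample(chunk, SR, SR_16K, quality="VHQ")
    # Both paths run the same VHQ resampler, so the outputs should agree.
    assert np.allclose(chunk_16k, chunk_16k_direct, atol=1e-6)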