revert preprocess changes

IAHispano · Dec 22, 2024 · e96a659
1 parent d535029
commit e96a659
Showing 1 changed file with 59 additions and 47 deletions.
diff --git a/rvc/train/preprocess/preprocess.py b/rvc/train/preprocess/preprocess.py
@@ -35,15 +35,6 @@
 
 class PreProcess:
     def __init__(self, sr: int, exp_dir: str, per: float):
-        self.sr = sr
-        self.per = per
-        self.exp_dir = exp_dir
-        self.device = "cpu"
-        self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
-        self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
-        os.makedirs(self.gt_wavs_dir, exist_ok=True)
-        os.makedirs(self.wavs16k_dir, exist_ok=True)
-
         self.slicer = Slicer(
             sr=sr,
             threshold=-42,
@@ -52,9 +43,17 @@ def __init__(self, sr: int, exp_dir: str, per: float):
             hop_size=15,
             max_sil_kept=500,
         )
+        self.sr = sr
         self.b_high, self.a_high = signal.butter(
             N=5, Wn=HIGH_PASS_CUTOFF, btype="high", fs=self.sr
         )
+        self.per = per
+        self.exp_dir = exp_dir
+        self.device = "cpu"
+        self.gt_wavs_dir = os.path.join(exp_dir, "sliced_audios")
+        self.wavs16k_dir = os.path.join(exp_dir, "sliced_audios_16k")
+        os.makedirs(self.gt_wavs_dir, exist_ok=True)
+        os.makedirs(self.wavs16k_dir, exist_ok=True)
 
     def _normalize_audio(self, audio: np.ndarray):
         tmp_max = np.abs(audio).max()
@@ -97,19 +96,18 @@ def simple_cut(
         chunk_len: float,
         overlap_len: float,
     ):
-        chunk_samples = int(self.sr * chunk_len)
-        overlap_samples = int(self.sr * overlap_len)
-        step = chunk_samples - overlap_samples
-        num_chunks = (len(audio) - chunk_samples) // step + 1
-        for i in range(num_chunks):
-            start = i * step
-            end = start + chunk_samples
-            if end <= len(audio):
-                chunk = audio[start:end]
-                file_index = i
+        chunk_length = int(self.sr * chunk_len)
+        overlap_length = int(self.sr * overlap_len)
+        i = 0
+        while i < len(audio):
+            chunk = audio[i : i + chunk_length]
+            if len(chunk) == chunk_length:
                 # full SR for training
                 wavfile.write(
-                    os.path.join(self.gt_wavs_dir, f"{sid}_{idx0}_{file_index}.wav"),
+                    os.path.join(
+                        self.gt_wavs_dir,
+                        f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav",
+                    ),
                     self.sr,
                     chunk.astype(np.float32),
                 )
@@ -118,10 +116,14 @@ def simple_cut(
                     chunk, orig_sr=self.sr, target_sr=SAMPLE_RATE_16K, res_type=RES_TYPE
                 )
                 wavfile.write(
-                    os.path.join(self.wavs16k_dir, f"{sid}_{idx0}_{file_index}.wav"),
+                    os.path.join(
+                        self.wavs16k_dir,
+                        f"{sid}_{idx0}_{i // (chunk_length - overlap_length)}.wav",
+                    ),
                     SAMPLE_RATE_16K,
                     chunk_16k.astype(np.float32),
                 )
+            i += chunk_length - overlap_length
 
     def process_audio(
         self,
@@ -145,37 +147,46 @@ def process_audio(
                 audio = self._normalize_audio(audio)
             if noise_reduction:
                 audio = nr.reduce_noise(
-                    y=audio,
-                    sr=self.sr,
-                    prop_decrease=reduction_strength,
-                    n_fft=2048,
-                    hop_length=512,
+                    y=audio, sr=self.sr, prop_decrease=reduction_strength
                 )
             if cut_preprocess == "Skip":
-                self.process_audio_segment(audio, sid, idx0, 0)
+                # no cutting
+                self.process_audio_segment(
+                    audio,
+                    sid,
+                    idx0,
+                    0,
+                )
             elif cut_preprocess == "Simple":
+                # simple
                 self.simple_cut(audio, sid, idx0, chunk_len, overlap_len)
             elif cut_preprocess == "Automatic":
-                segments = self.slicer.slice(audio)
                 idx1 = 0
-                for audio_segment in segments:
-                    segment_length = len(audio_segment)
-                    per_samples = int(self.sr * self.per)
-                    overlap_samples_segment = int(self.sr * OVERLAP)
-                    step = per_samples - overlap_samples_segment
-
-                    num_sub_segments = (segment_length - per_samples + step - 1) // step
-
-                    for i in range(num_sub_segments):
-                        start = i * step
-                        end = start + per_samples
-                        if end <= segment_length:
-                            tmp_audio = audio_segment[start:end]
-                            self.process_audio_segment(tmp_audio, sid, idx0, idx1)
+                # legacy
+                for audio_segment in self.slicer.slice(audio):
+                    i = 0
+                    while True:
+                        start = int(self.sr * (self.per - OVERLAP) * i)
+                        i += 1
+                        if len(audio_segment[start:]) > (self.per + OVERLAP) * self.sr:
+                            tmp_audio = audio_segment[
+                                start : start + int(self.per * self.sr)
+                            ]
+                            self.process_audio_segment(
+                                tmp_audio,
+                                sid,
+                                idx0,
+                                idx1,
+                            )
                             idx1 += 1
-                        elif start < segment_length:
+                        else:
                             tmp_audio = audio_segment[start:]
-                            self.process_audio_segment(tmp_audio, sid, idx0, idx1)
+                            self.process_audio_segment(
+                                tmp_audio,
+                                sid,
+                                idx0,
+                                idx1,
+                            )
                             idx1 += 1
                             break
 
@@ -266,6 +277,7 @@ def preprocess_training_set(
                 f'Speaker ID folder is expected to be integer, got "{os.path.basename(root)}" instead.'
             )
 
+    # print(f"Number of files: {len(files)}")
     audio_length = []
     with tqdm(total=len(files)) as pbar:
         with concurrent.futures.ProcessPoolExecutor(
@@ -291,13 +303,13 @@ def preprocess_training_set(
                 audio_length.append(future.result())
                 pbar.update(1)
 
-    total_audio_length = sum(audio_length)
+    audio_length = sum(audio_length)
     save_dataset_duration(
-        os.path.join(exp_dir, "model_info.json"), dataset_duration=total_audio_length
+        os.path.join(exp_dir, "model_info.json"), dataset_duration=audio_length
     )
     elapsed_time = time.time() - start_time
     print(
-        f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(total_audio_length)} seconds of audio."
+        f"Preprocess completed in {elapsed_time:.2f} seconds on {format_duration(audio_length)} seconds of audio."
     )