From 8d6398b2afdfd15c9bdd3faa435bcf72227f4017 Mon Sep 17 00:00:00 2001
From: AznamirWoW <101997116+AznamirWoW@users.noreply.github.com>
Date: Fri, 24 Jan 2025 17:37:28 -0500
Subject: [PATCH] simplified pitch guidance method; added low-pass filter in
 an attempt to suppress aliasing

---
 rvc/lib/algorithm/generators/refinegan.py | 94 +++++++++++++----------
 1 file changed, 53 insertions(+), 41 deletions(-)

diff --git a/rvc/lib/algorithm/generators/refinegan.py b/rvc/lib/algorithm/generators/refinegan.py
index c2aba01d..432a33ef 100644
--- a/rvc/lib/algorithm/generators/refinegan.py
+++ b/rvc/lib/algorithm/generators/refinegan.py
@@ -1,3 +1,4 @@
+import math
 import numpy as np
 import torch
 from torch import nn
@@ -311,9 +312,10 @@ def __init__(
         start_channels: int = 16,
         gin_channels: int = 256,
         checkpointing: bool = False,
+        upsample_initial_channel = 512
     ):
         super().__init__()
-        self.downsample_rates = downsample_rates
+
         self.upsample_rates = upsample_rates
         self.leaky_relu_slope = leaky_relu_slope
         self.checkpointing = checkpointing
@@ -321,11 +323,11 @@ def __init__(
         self.upp = np.prod(upsample_rates)
         self.m_source = SineGenerator(sample_rate)
 
-        # expands
-        self.source_conv = weight_norm(
+        # expands the f0 sine-generator output to the same channel count as mel_conv
+        self.pre_conv = weight_norm(
             nn.Conv1d(
                 in_channels=1,
-                out_channels=start_channels,
+                out_channels=upsample_initial_channel // 2,
                 kernel_size=7,
                 stride=1,
                 padding=3,
@@ -333,30 +335,36 @@ def __init__(
             )
         )
 
-        channels = start_channels
-        self.downsample_blocks = nn.ModuleList([])
-        for rate in downsample_rates:
-            new_channels = channels * 2
+        stride_f0s = [
+            math.prod(upsample_rates[i + 1 :]) if i + 1 < len(upsample_rates) else 1
+            for i in range(len(upsample_rates))
+        ]
+
+        channels = upsample_initial_channel
 
+        self.downsample_blocks = nn.ModuleList([])
+        for i, u in enumerate(upsample_rates):
+            # handling odd upsampling rates
+            stride = stride_f0s[i]
+            kernel = 1 if stride == 1 else stride * 2 - stride % 2
+            padding = 0 if stride == 1 else (kernel - stride) // 2
+
+            # f0 input is upscaled to the full segment size; each strided conv below downscales it back to the length of upscale step i
+            
             self.downsample_blocks.append(
-                nn.Sequential(
-                    nn.Upsample(scale_factor=1 / rate, mode="linear"),
-                    ResBlock(
-                        in_channels=channels,
-                        out_channels=new_channels,
-                        kernel_size=7,
-                        dilation=(1, 3, 5),
-                        leaky_relu_slope=leaky_relu_slope,
-                    ),
+                nn.Conv1d(
+                    in_channels=1,
+                    out_channels=channels // 2 ** (i + 2),
+                    kernel_size=kernel,
+                    stride=stride,
+                    padding = padding
                 )
             )
 
-            channels = new_channels
-
         self.mel_conv = weight_norm(
             nn.Conv1d(
                 in_channels=num_mels,
-                out_channels=channels,
+                out_channels=channels // 2,
                 kernel_size=7,
                 stride=1,
                 padding=3,
@@ -364,18 +372,29 @@ def __init__(
         )
 
         if gin_channels != 0:
-            self.cond = nn.Conv1d(256, channels, 1)
-
-        channels *= 2
+            self.cond = nn.Conv1d(256, channels // 2, 1)
 
         self.upsample_blocks = nn.ModuleList([])
         self.upsample_conv_blocks = nn.ModuleList([])
+        self.filters = nn.ModuleList([])
 
         for rate in upsample_rates:
             new_channels = channels // 2
 
             self.upsample_blocks.append(nn.Upsample(scale_factor=rate, mode="linear"))
 
+            low_pass = nn.Conv1d(
+                channels,
+                channels,
+                kernel_size=15,
+                padding=7, 
+                groups=channels,
+                bias=False)
+
+            low_pass.weight.data.fill_(1.0 / 15)
+            
+            self.filters.append(low_pass)
+
             self.upsample_conv_blocks.append(
                 ParallelResBlock(
                     in_channels=channels + channels // 4,
@@ -397,6 +416,7 @@ def __init__(
                 padding=3,
             )
         )
+        
 
     def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None):
 
@@ -405,20 +425,8 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None):
         )
         har_source = self.m_source(f0.transpose(1, 2)).transpose(1, 2)
 
-        # expanding pitch source to 16 channels
-        # new tensor
-        x = self.source_conv(har_source)
-        # making a downscaled version to match upscaler stages
-        downs = []
-        for i, block in enumerate(self.downsample_blocks):
-            # in-place call
-            x = F.leaky_relu_(x, self.leaky_relu_slope)
-            downs.append(x)
-            if self.training and self.checkpointing:
-                x = checkpoint(block, x, use_reentrant=False)
-            else:
-                x = block(x)
-
+        x = self.pre_conv(har_source)
+        x = F.interpolate(x, size=mel.shape[-1], mode="linear")
         # expanding spectrogram from 192 to 256 channels
         mel = self.mel_conv(mel)
 
@@ -427,22 +435,26 @@ def forward(self, mel: torch.Tensor, f0: torch.Tensor, g: torch.Tensor = None):
             mel += self.cond(g)
         x = torch.cat([mel, x], dim=1)
 
-        for ups, res, down in zip(
+        for ups, res, down, flt in zip(
             self.upsample_blocks,
             self.upsample_conv_blocks,
-            reversed(downs),
+            self.downsample_blocks,
+            self.filters,
         ):
             # in-place call
             x = F.leaky_relu_(x, self.leaky_relu_slope)
 
             if self.training and self.checkpointing:
                 x = checkpoint(ups, x, use_reentrant=False)
-                x = torch.cat([x, down], dim=1)
+                x = checkpoint(flt, x, use_reentrant=False)
+                x = torch.cat([x, down(har_source)], dim=1)
                 x = checkpoint(res, x, use_reentrant=False)
             else:
                 x = ups(x)
-                x = torch.cat([x, down], dim=1)
+                x = flt(x)
+                x = torch.cat([x, down(har_source)], dim=1)
                 x = res(x)
+                
         # in-place call
         x = F.leaky_relu_(x, self.leaky_relu_slope)
         x = self.conv_post(x)