Merge pull request #1 from maks00170/project

review passed
deepvk · Jan 16, 2024 · 3bea291 · 3bea291
2 parents e72f16a + 26b6b2d
commit 3bea291
Show file tree

Hide file tree

Showing 21 changed files with 3,143 additions and 0 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -0,0 +1,9 @@
+.github
+**/__pycache__/
+separator/inference/
+streaming/weights/
+streaming/input/
+streaming/streams/
+streaming/model/
+streaming/tflite_model/
+
diff --git a/.github/workflows/main.yaml b/.github/workflows/main.yaml
@@ -0,0 +1,21 @@
+# Formatting gate: runs `black --check` on every push and pull request.
+name: Main
+
+on: [push, pull_request]
+
+jobs:
+  main:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v3
+        with:
+          # Quoted so YAML always reads the version as a string (a two-part version such as 3.10 would parse as the float 3.1).
+          python-version: "3.10.12"
+          cache: "pip"
+      - name: "installation"
+        run: |
+          pip install -r requirements-dev.txt
+      - name: "black"
+        # The exclude regex is quoted so the shell passes it to black verbatim instead of trying to glob-expand `.*`.
+        run: black . --check --diff --color --exclude '.*/config/'
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,8 @@
+.vscode
+**/__pycache__/
+separator/inference/
+streaming/weights
+streaming/input/
+streaming/streams/
+streaming/model/
+streaming/tflite_model/
diff --git a/Dockerfile b/Dockerfile
@@ -0,0 +1,26 @@
+# Image for PM-Unet: TensorRT base image plus ffmpeg, pip and the Python requirements.
+FROM nvcr.io/nvidia/tensorrt:22.08-py3
+
+ENV PYTHONUNBUFFERED=1
+
+# Refresh the package index and install in the same layer so a cached, stale
+# index can never be paired with a newer install step; drop the apt lists in
+# that layer too so they do not end up in the image.
+RUN apt-get -y update \
+    && apt-get -y upgrade \
+    && apt-get install -y --no-install-recommends ffmpeg \
+    && apt-get install -y python3-pip \
+    && rm -rf /var/lib/apt/lists/*
+
+# Interactive-shell conveniences. NCCL_SOCKET_IFNAME must be exported,
+# otherwise it stays a shell-local variable and child processes never see it.
+RUN echo 'alias python=python3' >> ~/.bashrc \
+    && echo 'export NCCL_SOCKET_IFNAME=lo' >> ~/.bashrc
+
+WORKDIR /app
+# Copy only the manifest so the dependency layer is rebuilt only when it changes.
+COPY requirements.txt requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Drops into a shell; run_docker.sh bind-mounts the project directory at /app.
+ENTRYPOINT [ "bash" ]
diff --git a/README.md b/README.md
@@ -0,0 +1,73 @@
+# PM-Unet: phase and magnitude aware model for music source separation
+ [![githubio](https://img.shields.io/badge/GitHub.io-Audio_Samples-blue?logo=Github&style=flat-square)](https://d-a-yakovlev.github.io/test/)
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1OXlCZgd5KidMDZDUItOIT9ZA4IUJHXsZ?usp=sharing)
+
+## Navigation
+1. [Structure](#structure)
+2. [Docker](#docker)
+3. [Training](#training)
+4. [Inference](#inference)
+
+## Structure
+- [`separator`](./separator) ‒ main source code with model and dataset implementations and code to train model.
+- [`streaming`](./streaming/demo) ‒ source code for inference with the TF-Lite version of the model.
+
+## Docker
+#### To set up environment with Docker
+
+If you don't have Docker installed, please follow the links to find installation instructions for [Ubuntu](https://docs.docker.com/desktop/install/linux-install/), [Mac](https://docs.docker.com/desktop/install/mac-install/) or [Windows](https://docs.docker.com/desktop/install/windows-install/).
+
+Build docker image:
+
+    docker build -t pmunet .
+
+Run docker image:
+
+    bash run_docker.sh
+
+## Data
+The dataset used is [MUSDB18-HQ](https://sigsep.github.io/datasets/musdb.html#musdb18-hq-uncompressed-wav).
+
+[![Download dataset](https://img.shields.io/badge/Download%20dataset-65c73b)](https://zenodo.org/record/3338373/files/musdb18hq.zip?download=1)
+
+The dataset consists of
+150 full-length stereo tracks sampled at 44.1 kHz, providing a
+complete audio mix and four main elements: "vocals", "bass",
+"drums" and "other" for each sample, which can be considered as targets in the context of source separation. The dataset
+structure offers 100 training compositions and 50 validation
+compositions.
+
+## Training
+1. Configure arguments in `separator/config/config.py`.
+2. `cd separator`.
+3. Run `python3 pl_model.py`.
+
+## Inference
+
+### Auto local
+1. Configure arguments in `separator/config/config.py`.
+2. `cd separator`.
+3. `python3 inference.py [-IO]`
+    - `-I` path to the mixture,
+    - `-O` output directory; both are optional.
+
+By default, the script downloads the `.pt` weights file and `sample.wav` from Google Drive.
+
+#### For example
+``` 
+python3 inference.py -I path/to/mix -O out_dir
+```
+After a successful run, four audio files (`vocals.wav`, `drums.wav`, `bass.wav` and `other.wav`) will be written to `out_dir` (by default, `separator/inference/output`).
+
+**You can download weights manually**
+
+Download one of the `.pt` files below:
+ * [LSTM-bottleneck version](https://drive.google.com/file/d/18jT2TYffdRD1fL7wecAiM5nJPM_OKpNB/view?usp=drive_link)
+ * [Without LSTM-bottleneck version](https://drive.google.com/file/d/1VO07OYbsnCuEJYRSuA8HhjlQnx6dbWX7/view?usp=drive_link)
+
+### Streaming
+The streaming section contains scripts to convert the model to `tflite` format and to run the `tflite` model in `"stream mode"`.
+
+1. Configure arguments in `streaming/config/config.py`.
+2. `cd streaming`.
+3. `python3 runner.py`
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -0,0 +1,3 @@
+black
+mypy
+pytest
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,19 @@
+ffmpeg-python==0.2.0
+gdown==4.6.3
+julius==0.2.7
+lpips==0.1.4
+musdb==0.4.0
+nobuco
+omegaconf==2.3.0
+openunmix==1.2.1
+soundfile==0.12.1
+sox==1.4.1
+stempeg==0.2.3
+sympy==1.12
+tensorflow>=2.13.0
+torch==2.0.1
+torch-audiomentations==0.11.0
+torchaudio==2.0.2
+torchmetrics==0.11.4
+pytorch-lightning==2.0.3
+tqdm==4.65.0
diff --git a/run_docker.sh b/run_docker.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Start an interactive, auto-removed "pmunet" container with GPU access and the current directory mounted at /app.
+app=$(pwd)
+# The bind mount is quoted so a checkout path containing spaces is passed to docker as a single argument.
+docker run --name pmunet -it --rm \
+    --net=host --ipc=host \
+    --gpus "all" \
+    -v "${app}:/app" \
+    pmunet
diff --git a/separator/config/config.py b/separator/config/config.py
@@ -0,0 +1,101 @@
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+
+
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Union
+
+
+@dataclass
+class TrainConfig:
+    # Training settings grouped by purpose: data locations, model shape, optimisation and augmentation.
+    # DATA OPTIONS
+    musdb_path          : str = "musdb18hq" # Directory path where the MUSDB18-HQ dataset is stored.
+    metadata_train_path : str = "metadata"  # Directory path for saving training metadata, like track names and lengths.
+    metadata_test_path  : str = "metadata1" # Directory path for saving testing metadata.
+    segment             : int = 5           # Length (in seconds) of each audio segment used during training.
+
+    # MODEL OPTIONS
+    model_source   : tuple = ("drums", "bass", "other", "vocals") # Sources to target in source separation.
+    model_depth    : int   = 4                                    # The depth of the U-Net architecture.
+    model_channel  : int   = 28                                   # Number of initial channels in U-Net layers.
+    is_mono        : bool  = False                                # Indicates whether the input audio should be treated as mono (True) or stereo (False).
+    mask_mode      : bool  = False                                # Whether to utilize masking within the model.
+    skip_mode      : str   = "concat"                             # Mode of skip connections in U-Net ('concat' for concatenation, 'add' for summation).
+    nfft           : int   = 4096                                 # Number of bins used in STFT.
+    bottlneck_lstm : bool  = True                                 # Determines whether to use LSTM layers as bottleneck in the U-Net architecture.
+    layers         : int   = 2                                    # Number of LSTM layers if bottleneck.
+    stft_flag      : bool  = True                                 # Flag controlling whether the STFT is applied (original note: required for tflite; TODO confirm).
+
+    # TRAIN OPTIONS
+    device                   : str  = "cuda"       # The computing platform for training: 'cuda' for NVIDIA GPUs or 'cpu'.
+    batch_size               : int  = 6            # Batch size for training.
+    shuffle_train            : bool = True         # Whether to shuffle the training dataset.
+    shuffle_valid            : bool = False        # Whether to shuffle the valid dataset.
+    drop_last                : bool = True         # Whether to drop the last incomplete batch in train data.
+    num_workers              : int  = 2            # Number of worker processes used for loading data.
+    metric_monitor_mode      : str  = "min"        # Strategy for monitoring metrics to save model checkpoints.
+    save_top_k_model_weights : int  = 1            # Number of best-performing model weights to save based on the monitored metric.
+
+    factor                   : int = 1             # Factors for different components of the loss function.
+    c_factor                 : int = 1             # Second loss factor; presumably weights another loss term (confirm in the loss code).
+
+    loss_nfft                : tuple = (4096,)     # Number of FFT bins for calculating loss.
+    gamma                    : float = 0.3         # Gamma parameter for adjusting the focus of the loss on certain aspects of the audio spectrum.
+    lr                       : float = 0.5 * 3e-3  # Learning rate for the optimizer.
+    T_0                      : int   = 40          # Period of the cosine annealing schedule in learning rate adjustment.
+    max_epochs               : int   = 100         # Maximum number of training epochs.
+    precision                : str   = 16          # Precision of training computations. NOTE(review): annotated as str but the default is the int 16.
+    grad_clip                : float = 0.5         # Gradient clipping value.
+
+    # AUGMENTATION OPTIONS
+    proba_shift           : float = 0.5        # Probability of applying the shift.
+    shift                 : int   = 8192       # Maximum number of samples for the shift.
+    proba_flip_channel    : float = 1          # Probability of applying the flip left-right channels.
+    proba_flip_sign       : float = 1          # Probability of applying the sign flip.
+    pitchshift_proba      : float = 0.2        # Probability of applying pitch shift.
+    vocals_min_semitones  : int   = -5         # The lower limit of vocal semitones.
+    vocals_max_semitones  : int   = 5          # The upper limit of vocal semitones.
+    other_min_semitones   : int   = -2         # The lower limit of non-vocal semitones.
+    other_max_semitones   : int   = 2          # The upper limit of non-vocal semitones.
+    pitchshift_flag_other : bool  = False      # Flag to enable pitch shift augmentation on non-vocal sources.
+    time_change_proba     : float = 0.2        # Probability of applying time stretching.
+    time_change_factors   : tuple = (0.8, 0.85, 0.9, 0.95, 1.05, 1.1, 1.15, 1.2, 1.25, 1.3) # Factors for time stretching/compression, defining the range and intensity of this augmentation.
+    remix_proba           : float = 1          # Probability of remixing audio tracks.
+    remix_group_size      : int   = batch_size # Size of groups within which shuffling occurs. The default is bound to the class-level batch_size (6) at definition time and does not follow a per-instance batch_size.
+    scale_proba           : float = 1          # Probability of applying the scaling.
+    scale_min             : float = 0.25       # Minimum scaling factor.
+    scale_max             : float = 1.25       # Maximum scaling factor.
+    fade_mask_proba       : float = 0.1        # Probability of applying a fade effect.
+    double_proba          : float = 0.1        # Probability of doubling one channel's audio to both channels.
+    reverse_proba         : float = 0.2        # Probability of reversing a segment of the audio track.
+    mushap_proba          : float = 0.0        # Probability of creating mashups.
+    mushap_depth          : int   = 2          # Number of tracks to mix.
+
+
+@dataclass
+class InferenceConfig:  # Inference settings: weight locations, segmenting parameters and input/output paths.
+    GDRIVE_PREFIX = "https://drive.google.com/uc?id=" # Google Drive URL prefix; unannotated, so it is a class attribute rather than a dataclass field
+
+    # MODEL OPTIONS
+    weights_dir           : Path = Path("/app/separator/inference/weights")            # Directory where weights are saved
+    weights_LSTM_filename : str  = "weight_LSTM.pt"                                    # File name of the model weights with LSTM
+    weights_conv_filename : str  = "weight_conv.pt"                                    # File name of the model weights without LSTM
+    gdrive_weights_LSTM   : str  = f"{GDRIVE_PREFIX}1uhAVMvW3x-KL2T2-VkjKjn9K7dTJnoyo" # Google Drive URL of the LSTM weights
+    gdrive_weights_conv   : str  = f"{GDRIVE_PREFIX}1VO07OYbsnCuEJYRSuA8HhjlQnx6dbWX7" # Google Drive URL of the weights without LSTM
+    device                : str  = "cpu"                                               # The computing platform for inference
+
+    # INFERENCE OPTIONS
+    segment            : int              = 7                                 # Length (in seconds) of each audio segment used during inference.
+    overlap            : float            = 0.2                               # Overlap of segments at the beginning and at the end of the track
+    offset             : Union[int, None] = None                              # Start (in seconds) of the segment to split
+    duration           : Union[int, None] = None                              # Duration (in seconds) of the segment to split, use with `offset`
+    sample_rate        : int              = 44100                             # Sample rate of the track
+    num_channels       : int              = 2                                 # Number of channels in the audio track
+    default_result_dir : str              = "/app/separator/inference/output" # Directory for the output tracks
+    default_input_dir  : str              = "/app/separator/inference/input"  # Directory for the input track
+
+    # TEST TRACK
+    gdrive_mix : str = f"{GDRIVE_PREFIX}1zJpyW1fYxHKXDcDH9s5DiBCYiRpraDB3" # Google Drive URL of the test track
-Original file line number
+Diff line change
@@ -0,0 +1,3 @@
+    black
+    mypy
+    pytest