rakuri255 · BWagener · Jul 20, 2024 · Jul 20, 2024 · Jul 20, 2024 · Jul 20, 2024
diff --git a/.gitignore b/.gitignore
@@ -15,3 +15,6 @@ test_output
 output
 /UltraSinger*.spec
 /registry_path.txt
+
+# evaluation resources
+/evaluation
diff --git a/evaluation/README.md b/evaluation/README.md
@@ -0,0 +1,107 @@
+# UltraSinger evaluation
+
+This tool exists to measure the accuracy of UltraSinger.
+
+It takes a directory of known-good UltraStar format files, runs them through UltraSinger, and compares the output to the
+original files.
+
+The idea is that, as you make changes to UltraSinger, you can run this tool to see how those changes affect its
+accuracy. The tool reuses cached files from previous runs as long as the configuration used to generate them is unchanged.
+
+## Measurements taken
+
+### Pitch
+
+#### Base measurements
+
+| measurement                          | description                                                                                                                       |
+|--------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------|
+| input_match_ratio                    | ratio of how many of the pitch datapoints in the **input** can be found as an exact match in the _output_                         |
+| output_match_ratio                   | ratio of how many of the pitch datapoints in the _output_ can be found as an exact match in the **input**                         |
+| no_pitch_where_should_be_pitch_ratio | ratio of how many of the datapoints in the **input** have a pitch, where the corresponding datapoint in the _output_ has no pitch |
+| pitch_where_should_be_no_pitch_ratio | ratio of how many of the datapoints in the _output_ have a pitch, where the corresponding datapoint in the **input** has no pitch |
+
+#### Measurements after transposing the output
+
+For these measurements the output is transposed by up to 12 half-steps, and the octave is ignored when comparing
+to the input. Whichever half-step value scores highest is used. This accounts for octave mismatches and wrongly
+transposed inputs.
+
+| measurement                                        | description                                                                                                                        |
+|----------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------|
+| best_input_pitch_shift_match_ratio                 | same as input_match_ratio but after transposing the _output_ to achieve the highest possible input_match_ratio                     |
+| matching_input_best_output_pitch_shift_match_ratio | the corresponding output_match_ratio when transposing the same number of half-steps as used for best_input_pitch_shift_match_ratio |
+| best_output_pitch_shift_match_ratio                | same as output_match_ratio but after transposing the _output_ to achieve the highest possible output_match_ratio                   |
+| matching_output_best_input_pitch_shift_match_ratio | the corresponding input_match_ratio when transposing the same number of half-steps as used for best_output_pitch_shift_match_ratio |
+
+
+
+## Running the evaluation
+
+- Copy the `example.local.py` file in the `evaluation/input/config` directory and name it `local.py`. This file is used to configure the evaluation tool.
+- Add songs to the `evaluation/input/songs` directory. You can use the songs from https://github.com/UltraStar-Deluxe/songs.
+- Simply run `py UltraSingerEvaluation.py` after following the "How to use this source code/Run" instructions in the root README.md.
+- The evaluation tool will create a directory in the `evaluation/output` directory with the current date and time as the name. The output of the evaluation will be stored in this directory.
+
+### Comparing runs
+
+- To compare the results of all runs in the `evaluation/output` folder, run `py UltraSingerMetaEvaluation.py`. This will output each run's measurements to the console.
+
+## Directory structure
+
+```
+evaluation
+├───input
+│   ├───config # programmatic configuration of UltraSingerEvaluation
+│   │   │   example.local.py # example configuration file, copy this and name it local.py
+│   │   │   local.py # your configuration file, UltraSingerEvaluation will look for this file
+│   │   │
+│   └───songs # this is the directory that contains the known-good songs to run through UltraSinger and then compare against
+│       ├───Jonathan Coulton - A Talk with George
+│       │   │   audio.mp3
+│       │   │   background.jpg
+│       │   │   cover.jpg
+│       │   │   license.txt
+│       │   │   song.txt # known good input UltraStar txt file. UltraSingerEvaluation compares this to the output of UltraSinger
+│       │   │
+│       │   └───cache # this cache will be reused for subsequent evaluation runs
+│       │       │   crepe_False_full_10_cuda.json # the cached file's name contains the configuration used to generate it
+│       │       │   Jonathan Coulton - A Talk with George.wav
+│       │       │   Jonathan Coulton - A Talk with George_denoised.wav
+│       │       │   Jonathan Coulton - A Talk with George_mono.wav
+│       │       │   Jonathan Coulton - A Talk with George_mute.wav
+│       │       │   whisper_large-v2_cuda_None_None_16_None_en.json # the cached file's name contains the configuration used to generate it
+│       │       │
+│       │       └───separated
+│       │           └───htdemucs
+│       │               └───audio
+│       │                       no_vocals.wav
+│       │                       vocals.wav
+│       │
+│       ├───...
+│       │   │   ...
+│       │
+│       └───Many - Songs
+│           │   ...
+│
+└───output
+    └───2024-07-27_16-58-27
+        │   run.json
+        │
+        └───songs
+            ├───Jonathan Coulton - A Talk with George
+            │       Jonathan Coulton - A Talk with George.txt # UltraStar txt file generated by UltraSinger
+            │
+            ├───...
+            │       ....txt # UltraStar txt file generated by UltraSinger
+            │
+            └───Many - Songs
+                    Many - Songs.txt # UltraStar txt file generated by UltraSinger
+```
+
+## TODO
+
+- automate comparison in [UltraSingerMetaEvaluation.py](../src/UltraSingerMetaEvaluation.py) instead of just printing each run's measurements
+- currently only pitch accuracy is measured; text accuracy should be measured as well
+- the cached files' configuration is encoded in their filenames; this will quickly become unmanageable, so a better way to store this information should be found
+- the tool could verify that there are no uncommitted changes according to git and store the latest commit hash for a test run ([TestRun.py](../src/modules/Evaluation/TestRun.py))
diff --git a/evaluation/input/.gitkeep b/evaluation/input/.gitkeep
diff --git a/evaluation/input/config/example.local.py b/evaluation/input/config/example.local.py
@@ -0,0 +1,41 @@
+# programmatically customize settings for evaluation runs
+
+import os
+
+from Settings import Settings
+
+
+def init_settings() -> Settings:  # build the Settings object for evaluation runs; edit the values below to customize
+    settings = Settings()
+    settings.language = "en"
+    # settings.pitch_loudness_threshold = 10000
+    settings.create_midi = False
+    settings.create_plot = False
+    settings.calculate_score = True
+    settings.create_karaoke = False
+    settings.keep_cache = True
+    settings.ignore_audio = False
+    # settings.whisper_batch_size = 12
+    # settings.whisper_compute_type = "int8"
+    # settings.test_songs_input_folder = "C:/Users/Benedikt/git/songs/Creative Commons"
+    # settings.skip_cache_vocal_separation = True
+    # settings.skip_cache_denoise_vocal_audio = True
+    # settings.skip_cache_transcription = True
+    # settings.skip_cache_pitch_detection = True
+
+    # Optional dedicated song folder: leave empty to keep settings.test_songs_input_folder untouched.
+    dedicated_test_folder = ""
+    # dedicated_test_folder = "C:/My/Dedicated/Test/songs"
+    dedicated_test_songs_exist = False
+    if os.path.isdir(dedicated_test_folder):  # False for "" or any path that is not an existing directory
+        for item in os.listdir(dedicated_test_folder):
+            if os.path.isdir(os.path.join(dedicated_test_folder, item)):
+                dedicated_test_songs_exist = True
+    # Only override the input folder when the dedicated folder holds at least one sub-directory.
+    if dedicated_test_songs_exist:
+        settings.test_songs_input_folder = dedicated_test_folder
+
+    return settings
+
+
+user_settings = init_settings()
diff --git a/evaluation/input/songs/.gitkeep b/evaluation/input/songs/.gitkeep
diff --git a/evaluation/output/.gitkeep b/evaluation/output/.gitkeep
diff --git a/pytest/modules/UltraStar/converter/test_ultrastar_converter.py b/pytest/modules/UltraStar/converter/test_ultrastar_converter.py
@@ -1,6 +1,6 @@
 """Tests for ultrastar_converter.py"""
 
-from modules.Ultrastar.coverter.ultrastar_converter import real_bpm_to_ultrastar_bpm
+from modules.Ultrastar.converter.ultrastar_converter import real_bpm_to_ultrastar_bpm
 
 
 def test_real_bpm_to_ultrastar_bpm():

diff --git a/pytest/modules/UltraStar/converter/test_ultrastar_txt_converter.py b/pytest/modules/UltraStar/converter/test_ultrastar_txt_converter.py
@@ -1,7 +1,7 @@
 """Tests for ultrastar_txt_converter.py"""
 
 import unittest
-from modules.Ultrastar.coverter.ultrastar_txt_converter import extract_year
+from modules.Ultrastar.converter.ultrastar_txt_converter import extract_year
 
 
 class TestUltrastarTxtConverter(unittest.TestCase):

diff --git a/src/Settings.py b/src/Settings.py
@@ -1,4 +1,5 @@
-from dataclasses import dataclass
+from dataclasses import dataclass, field
+from typing import Optional
 
 from dataclasses_json import dataclass_json
 
@@ -10,55 +11,56 @@
 @dataclass_json
 @dataclass
 class Settings:
-    APP_VERSION = "0.0.12-dev2"
+    APP_VERSION: str = "0.0.12-dev2"
 
-    create_midi = True
-    create_plot = False
-    create_audio_chunks = False
-    hyphenation = True
-    use_separated_vocal = True
-    create_karaoke = True
-    ignore_audio = False
-    input_file_is_ultrastar_txt = False # todo: to process_data
-    keep_cache = False
+    create_midi: bool = True
+    create_plot: bool = False
+    create_audio_chunks: bool = False
+    hyphenation: bool = True
+    use_separated_vocal: bool = True
+    create_karaoke: bool = True
+    ignore_audio: Optional[bool] = None
+    input_file_is_ultrastar_txt: bool = False # todo: to process_data
+    keep_cache: bool = False
 
     # Process data Paths
-    input_file_path = ""
-    output_folder_path = ""
-    
-    language = None
-    format_version = FormatVersion.V1_0_0
+    input_file_path: str = ""
+    output_folder_path: str = ""
+
+    language: Optional[str] = None
+    format_version: str = FormatVersion.V1_0_0
 
     # Demucs
-    demucs_model = DemucsModel.HTDEMUCS  # htdemucs|htdemucs_ft|htdemucs_6s|hdemucs_mmi|mdx|mdx_extra|mdx_q|mdx_extra_q|SIG
+    demucs_model: str = DemucsModel.HTDEMUCS  # htdemucs|htdemucs_ft|htdemucs_6s|hdemucs_mmi|mdx|mdx_extra|mdx_q|mdx_extra_q|SIG
 
     # Whisper
-    transcriber = "whisper"  # whisper
-    whisper_model = WhisperModel.LARGE_V2  # Multilingual model tiny|base|small|medium|large-v1|large-v2|large-v3
+    transcriber: str = "whisper"  # whisper
+    whisper_model: str = WhisperModel.LARGE_V2  # Multilingual model tiny|base|small|medium|large-v1|large-v2|large-v3
     # English-only model tiny.en|base.en|small.en|medium.en
-    whisper_align_model = None   # Model for other languages from huggingface.co e.g -> "gigant/romanian-wav2vec2"
-    whisper_batch_size = 16   # reduce if low on GPU mem
-    whisper_compute_type = None   # change to "int8" if low on GPU mem (may reduce accuracy)
+    whisper_align_model: Optional[str] = None   # Model for other languages from huggingface.co e.g -> "gigant/romanian-wav2vec2"
+    whisper_batch_size: int = 16   # reduce if low on GPU mem
+    whisper_compute_type: Optional[str] = None   # change to "int8" if low on GPU mem (may reduce accuracy)
 
     # Pitch
-    crepe_model_capacity = "full"  # tiny|small|medium|large|full
-    crepe_step_size = 10 # in miliseconds
+    crepe_model_capacity: str = "full"  # tiny|small|medium|large|full
+    crepe_step_size: int = 10 # in milliseconds
+    pitch_loudness_threshold: int = -60
 
     # Device
-    pytorch_device = 'cpu'  # cpu|cuda
-    tensorflow_device = 'cpu'  # cpu|cuda
-    force_cpu = False
-    force_whisper_cpu = False
-    force_crepe_cpu = False
+    pytorch_device: str = "cpu"  # cpu|cuda
+    tensorflow_device: str = "cpu"  # cpu|cuda
+    force_cpu: bool = False
+    force_whisper_cpu: bool = False
+    force_crepe_cpu: bool = False
 
     # MuseScore
-    musescore_path = None
+    musescore_path: Optional[str] = None
 
     # UltraSinger Evaluation Configuration
-    test_songs_input_folder = None
-    cache_override_path = None
-    skip_cache_vocal_separation = False
-    skip_cache_denoise_vocal_audio = False
-    skip_cache_transcription = False
-    skip_cache_pitch_detection = False
-    calculate_score = True
+    test_songs_input_folder: Optional[str] = None
+    cache_override_path: Optional[str] = None
+    skip_cache_vocal_separation: bool = False
+    skip_cache_denoise_vocal_audio: bool = False
+    skip_cache_transcription: bool = False
+    skip_cache_pitch_detection: bool = False
+    calculate_score: bool = True
diff --git a/src/UltraSinger.py b/src/UltraSinger.py
@@ -8,7 +8,7 @@
 
 from packaging import version
 
-from modules import os_helper
+from modules import os_helper, timer
 from modules.Audio.denoise import denoise_vocal_audio
 from modules.Audio.separation import separate_vocal_from_audio
 from modules.Audio.vocal_chunks import (
@@ -50,7 +50,7 @@
 from modules.Speech_Recognition.TranscribedData import TranscribedData
 from modules.Ultrastar.ultrastar_score_calculator import Score, calculate_score_points
 from modules.Ultrastar.ultrastar_txt import FILE_ENCODING, FormatVersion
-from modules.Ultrastar.coverter.ultrastar_txt_converter import from_ultrastar_txt, \
+from modules.Ultrastar.converter.ultrastar_txt_converter import from_ultrastar_txt, \
     create_ultrastar_txt_from_midi_segments, create_ultrastar_txt_from_automation
 from modules.Ultrastar.ultrastar_parser import parse_ultrastar_txt
 from modules.common_print import print_support, print_help, print_version
@@ -212,7 +212,8 @@ def InitProcessData():
         process_data.basename = basename
         process_data.process_data_paths.audio_output_file_path = audio_file_path
         # todo: ignore transcribe
-        settings.ignore_audio = True
+        if settings.ignore_audio is None:
+            settings.ignore_audio = True
 
     elif settings.input_file_path.startswith("https:"):
         # Youtube
@@ -295,9 +296,9 @@ def CreateUltraStarTxt(process_data: ProcessData):
     if settings.calculate_score:
         simple_score, accurate_score = calculate_score_points(process_data, ultrastar_file_output)
 
-    # Add calculated score to Ultrastar txt
+        # Add calculated score to Ultrastar txt
     #Todo: Missing Karaoke
-    ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score)
+        ultrastar_writer.add_score_to_ultrastar_txt(ultrastar_file_output, simple_score)
     return accurate_score, simple_score, ultrastar_file_output