Music-and-Culture-Technology-Lab · e7mac · Aug 18, 2023 · Aug 18, 2023 · yoyolicoris · Sep 5, 2023
diff --git a/cog.yaml b/cog.yaml
@@ -1,19 +1,17 @@
 build:
   gpu: true
-  python_version: "3.8"
+  python_version: 3.8
   system_packages:
-    - "libgl1-mesa-glx"
-    - "libglib2.0-0"
-    - "libsndfile1-dev"
-    - "ffmpeg"
+    - libsndfile1-dev
+    - ffmpeg
+    - libfluidsynth3
+
   python_packages:
-    - "ipython==7.30.1"
-    - "numpy==1.21.4"
+    - numpy==1.21.4
   run:
     - pip install -U pip
     - pip install --upgrade cython
-    - pip install omnizart
-    - apt-get update && apt-get install -y fluidsynth
     - pip install pyfluidsynth
+    - pip install git+https://github.com/e7mac/omnizart.git
 
-predict: "scripts/predict.py:Predictor"
+predict: scripts/predict.py:Predictor
diff --git a/pyproject.toml b/pyproject.toml
@@ -37,7 +37,7 @@ vamp = "^1.1.0"
 urllib3 = "1.26.4"
 spleeter = "^2.3.0"
 mir_eval = "^0.6"
-tensorflow = "2.5.0"
+tensorflow = "2.13.0"
 
 [tool.poetry.dev-dependencies]
 pytest = "^5.2"

diff --git a/scripts/predict.py b/scripts/predict.py
@@ -9,7 +9,8 @@
 import shutil
 from pathlib import Path
 
-import cog
+from cog import BaseModel, BasePredictor, Path, Input
+from typing import Optional
 import scipy.io.wavfile as wave
 
 from omnizart.remote import download_large_file_from_google_drive
@@ -20,8 +21,13 @@
 from omnizart.vocal import app as vapp
 from omnizart.vocal_contour import app as vcapp
 
+class Output(BaseModel):
+    midi: Path
+    wav: Optional[Path]
+    csv: Optional[Path]
+
+class Predictor(BasePredictor):
 
-class Predictor(cog.Predictor):
     def setup(self):
         self.SF2_FILE = "general_soundfont.sf2"
         if not os.path.exists(self.SF2_FILE):
@@ -34,19 +40,12 @@ def setup(self):
         self.app = {"music": mapp, "chord": capp, "drum": dapp, "vocal": vapp, "vocal-contour": vcapp, "beat": bapp}
         self.model_path = {"piano": "Piano", "piano-v2": "PianoV2", "assemble": "Stream", "pop-song": "Pop", "": None}
 
-    @cog.input(
-        "audio",
-        type=Path,
-        help="Path to the input music. Supports mp3 and wav format.",
-    )
-    @cog.input(
-        "mode",
-        type=str,
-        default="music-piano-v2",
-        options=["music-piano", "music-piano-v2", "music-assemble", "chord", "drum", "vocal", "vocal-contour", "beat"],
-        help="Transcription mode",
-    )
-    def predict(self, audio, mode):
+    def predict(self,
+            audio: Path = Input(description="Path to the input music. Supports mp3 and wav format."),
+            mode: str = Input(default="music-piano-v2", description="Transcription mode", choices=["music-piano", "music-piano-v2", "music-assemble", "chord", "drum", "vocal", "vocal-contour", "beat"]),
+            render_audio: bool = Input(default=False, description="Option to render to mp3"),
+    ) -> Output:
+        """Run a single prediction on the model"""
         assert str(audio).endswith(".mp3") or str(audio).endswith(".wav"), "Please upload mp3 or wav file."
         temp_folder = "cog_temp"
         os.makedirs(temp_folder, exist_ok=True)
@@ -65,22 +64,45 @@ def predict(self, audio, mode):
 
             app = self.app[mode]
             model_path = self.model_path[model]
-            midi = app.transcribe(wav_file_path, model_path=model_path)
+            midi_path = f"{temp_folder}/{audio_name}.mid"
+            midi = app.transcribe(wav_file_path, model_path=model_path, output=midi_path)
+
+            mid_out_path = None
+            audio_out_path = None
+            csv_out_path = None
 
-            if mode == "vocal-contour":
-                out_name = f"{audio_name}_trans.wav"
-            else:
-                print("Synthesizing MIDI...")
-                out_name = f"{temp_folder}/{audio_name}_synth.wav"
-                raw_wav = midi.fluidsynth(fs=44100, sf2_path=self.SF2_FILE)
-                wave.write(out_name, 44100, raw_wav)
+            if render_audio == True:
+                if mode == "vocal-contour":
+                    out_name = f"{audio_name}_trans.wav"
+                else:
+                    print("Synthesizing MIDI...")
+                    out_name = f"{temp_folder}/{audio_name}_synth.wav"
+                    raw_wav = midi.fluidsynth(fs=44100, sf2_path=self.SF2_FILE)
+                    wave.write(out_name, 44100, raw_wav)
 
-            out_path = Path(tempfile.mkdtemp()) / "out.mp3"  # out_path is automatically cleaned up by cog
-            subprocess.run(["ffmpeg", "-y", "-i", out_name, str(out_path)])
+                audio_out_path = Path(tempfile.mkdtemp()) / "out.mp3"  # out_path is automatically cleaned up by cog
+                subprocess.run(["ffmpeg", "-y", "-i", out_name, str(audio_out_path)])
+
+            mid_out_path = Path(tempfile.mkdtemp()) / "out.mid"  # out_path is automatically cleaned up by cog
+            shutil.copyfile(midi_path, mid_out_path)
+            if mode == "chord" :
+                csv_in_path = str(midi_path).replace(".mid", ".csv")
+                csv_out_path = str(mid_out_path).replace(".mid", ".csv")
+                shutil.copyfile(csv_in_path, csv_out_path)
+                csv_out_path = Path(csv_out_path)
         finally:
             shutil.rmtree(temp_folder)
             if os.path.exists(f"{audio_name}.mid"):
                 os.remove(f"{audio_name}.mid")
             if os.path.exists(f"{audio_name}_trans.wav"):
                 os.remove(f"{audio_name}_trans.wav")
-        return out_path
+            if os.path.exists(f"{audio_name}.csv"):
+                os.remove(f"{audio_name}.csv")
+        if mode == "chord":
+            if render_audio == True:
+                return Output(midi=mid_out_path, wav=audio_out_path, csv=csv_out_path)
+            else:
+                return Output(midi=mid_out_path, csv=csv_out_path)
+        if render_audio == True:
+            return Output(midi=mid_out_path, wav=audio_out_path)
+        return Output(midi=mid_out_path)