add multi-speaker support

devidw · Jan 5, 2024 · 6576107 · 6576107
1 parent 04ad13b
commit 6576107
Showing 4 changed files with 128 additions and 35 deletions.
diff --git a/dswav/__main__.py b/dswav/__main__.py
@@ -71,6 +71,7 @@ def upload_handler(project_name: str, scp_cmd: str):
     args = scp_cmd.replace("%", f"./projects/{project_name}/ds.zip").split(" ")
     print(args)
     subprocess.run(args)
+    print("done")
 
 
 if __name__ == "__main__":
@@ -144,7 +145,7 @@ def upload_handler(project_name: str, scp_cmd: str):
             button.click(add_ending_silence_handler, inputs=[project_name])
 
         with gr.Tab("mp3 to wav @ sr"):
-            sr = gr.Textbox(label="Sample Rate", value="22050")
+            sr = gr.Textbox(label="Sample Rate", value="24000")
             input_path = gr.Textbox(label="input mp3s path")
             output_path = gr.Textbox(label="output wavs path")
             button = gr.Button()

diff --git a/dswav/ds.py b/dswav/ds.py
@@ -1,7 +1,7 @@
 import json
 import shutil
 import subprocess
-from typing import List, Optional
+from typing import Dict, List, Optional
 from dswav.config import Config
 from concurrent.futures import ThreadPoolExecutor
 import os
@@ -16,10 +16,16 @@
     read_sentences,
 )
 from dswav.styletts2 import text_to_phonemes
+from nltk.tokenize import word_tokenize
+import random
 
 
+# MAX_LEN = 512
+MAX_LEN = 400
 SILENCE_EOS = " …"
 SILENCE_LENGTH_MS = 100
+# DS_SIZE_LIMIT = 17
+DS_SIZE_LIMIT = None
 
 
 def add_silence_if_needed(project_name: str):
@@ -53,38 +59,116 @@ def combine_many(project_name: str, merges: List[str]):
         with open(f"{merge}/index.json", "r") as f:
             index = json.loads(f.read())
         sentences.extend(
-            list(map(lambda x: Sentence(x["id"], x["content"], []), index))
+            list(
+                map(
+                    lambda x: Sentence(
+                        x["id"],
+                        x["content"],
+                        [],
+                        speaker_id=x["speaker_id"],
+                    ),
+                    index,
+                )
+            )
         )
-        copy_files(f"{merge}/wavs", f"./projects/{project_name}/ds/wavs")
+        # copy_files(f"{merge}/wavs", f"./projects/{project_name}/ds/wavs")
     write_sentences(project_name, sentences)
 
 
 def build_ds(project_name: str, add_ending_silence: bool):
     sentences: List[Sentence] = read_sentences(project_name)
 
-    train_list, val_list = split_list(sentences, 0.99)
+    if DS_SIZE_LIMIT:
+        random.shuffle(sentences)
+        sentences = sentences[:DS_SIZE_LIMIT]
 
-    with open(f"./projects/{project_name}/ds/metadata.csv", "w") as f:
-        csv_content = "\n".join(
-            [f"{s.id}|{s.sentence}|{s.sentence}" for s in sentences]
-        )
-        f.write(csv_content)
+    SPEAKER_IDS: Dict[str, int] = {}
+
+    for sentence in sentences:
+        if sentence.speaker_id in SPEAKER_IDS:
+            continue
+        SPEAKER_IDS[sentence.speaker_id] = len(SPEAKER_IDS.keys()) + 1
+
+    print(SPEAKER_IDS)
+
+    # with open(f"./projects/{project_name}/ds/metadata.csv", "w") as f:
+    #     csv_content = "\n".join(
+    #         [f"{s.id}|{s.sentence}|{s.sentence}" for s in sentences]
+    #     )
+    #     f.write(csv_content)
 
     def get_phonemes(sentence: Sentence):
-        return text_to_phonemes(
-            sentence.sentence
-            if not add_ending_silence
-            else f"{sentence.sentence}{SILENCE_EOS}"
+        try:
+            the_text = (
+                sentence.sentence
+                if not add_ending_silence
+                else f"{sentence.sentence}{SILENCE_EOS}"
+            )
+            if the_text == "":
+                return None
+            # token_count = len(word_tokenize(the_text))
+            # if token_count > MAX_LEN:
+            #     return None
+            if len(the_text) > MAX_LEN:
+                return None
+            return text_to_phonemes(the_text)
+        except:
+            return None
+
+    l0 = len(sentences)
+    lines = list(
+        filter(
+            lambda x: x["content"] != None,
+            map(
+                lambda x: {
+                    "sentence": x,
+                    "id": x.id,
+                    "content": get_phonemes(x),
+                    "speaker_id": SPEAKER_IDS[x.speaker_id],
+                },
+                sentences,
+            ),
+        )
+    )
+    l1 = len(lines)
+    print(
+        f"{l0-l1} samples dropped due to being too long or too short, <=0 or >{MAX_LEN}"
+    )
+
+    train_list, val_list = split_list(lines, 0.99)
+
+    with open(f"./projects/{project_name}/debug.json", "w") as f:
+        json.dump(
+            list(
+                map(
+                    lambda x: {
+                        **x,
+                        "sentence": x["sentence"].content,
+                    },
+                    lines,
+                )
+            ),
+            f,
+            indent=4,
+            ensure_ascii=False,
         )
 
     with open(f"./projects/{project_name}/ds/train_list.txt", "w") as f:
         data = "\n".join(
-            [f"{line.id}.wav|{get_phonemes(line)}|0" for line in train_list]
+            [
+                f"{line['id']}.wav|{line['content']}|{line['speaker_id']}"
+                for line in train_list
+            ]
         )
         f.write(data)
 
     with open(f"./projects/{project_name}/ds/val_list.txt", "w") as f:
-        data = "\n".join([f"{line.id}.wav|{get_phonemes(line)}|0" for line in val_list])
+        data = "\n".join(
+            [
+                f"{line['id']}.wav|{line['content']}|{line['speaker_id']}"
+                for line in val_list
+            ]
+        )
         f.write(data)
 
     shutil.make_archive(

diff --git a/dswav/sentence.py b/dswav/sentence.py
@@ -26,18 +26,27 @@ def to_dict(self):
 
 
 class Sentence:
+    _id: Optional[str] = None
     words: List[Word]
     content: str = ""
-    _id: Optional[str] = None
-
-    def __init__(self, id: Optional[str], content: str, words: List[Word]) -> None:
+    speaker_id: str = ""
+
+    def __init__(
+        self,
+        id: Optional[str],
+        content: str,
+        words: List[Word],
+        speaker_id: str,
+    ) -> None:
         self._id = id if id else None
-        self.content = content
         self.words = words
+        self.content = content
+        self.speaker_id = speaker_id
 
     def to_dict(self):
         return {
             "id": self.id,
+            "speaker_id": self.speaker_id,
             "content": self.content,
             "words": [word.to_dict() for word in self.words],
         }
@@ -97,9 +106,17 @@ def read_sentences(project_name: str):
 
     for single in raw:
         words = list(
-            map(lambda x: Word(x["word"], x["start"], x["end"]), single["words"])
+            map(
+                lambda x: Word(x["word"], x["start"], x["end"]),
+                single["words"],
+            )
+        )
+        sentence = Sentence(
+            single["id"],
+            single["content"],
+            words,
+            speaker_id=single["speaker_id"],
         )
-        sentence = Sentence(single["id"], single["content"], words)
         sentences.append(sentence)
 
     return sentences
@@ -118,7 +135,7 @@ def write_sentences(project_name: str, sentences: List[Sentence]):
 def compute_sentences(words: List[Word]):
     """ """
     sentences: List[Sentence] = []
-    tmp: Sentence = Sentence(None, "", [])
+    tmp: Sentence = Sentence(None, "", [], "")
     is_multi = False
 
     for word in words:
@@ -151,7 +168,7 @@ def compute_sentences(words: List[Word]):
                 continue
 
             sentences.append(tmp)
-            tmp = Sentence(None, "", [])
+            tmp = Sentence(None, "", [], "")
             is_multi = False
 
     return sentences
diff --git a/scripts/mp3-to-wav.sh b/scripts/mp3-to-wav.sh
@@ -4,15 +4,6 @@ IN_DIR=$1
 OUT_DIR=$2
 SR=$3
 
+export OUT_DIR SR  # Export these variables so they're available to subshells
 
-# Loop through each MP3 file in the directory
-for file in "$IN_DIR"/*.mp3; do
-    # Skip if not a file
-    [ -f "$file" ] || continue
-
-    # Construct the WAV filename
-    base_name=$(basename "$file" .mp3)
-
-    # Convert MP3 to WAV using FFmpeg
-    ffmpeg -i "$file" -ar $SR "$OUT_DIR/$base_name.wav"
-done
+find "$IN_DIR" -name '*.mp3' | parallel -I% --max-args 1 ffmpeg -i % -ar $SR "$OUT_DIR/{/.}.wav"