Skip to content

Commit

Permalink
add multi-speaker support
Browse files: browse the repository at this point in the history
devidw committed Jan 5, 2024
1 parent 04ad13b commit 6576107
Showing 4 changed files with 128 additions and 35 deletions.
3 changes: 2 additions & 1 deletion dswav/__main__.py
Original file line number | Diff line number | Diff line change
@@ -71,6 +71,7 @@ def upload_handler(project_name: str, scp_cmd: str):
args = scp_cmd.replace("%", f"./projects/{project_name}/ds.zip").split(" ")
print(args)
subprocess.run(args)
print("done")


if __name__ == "__main__":
@@ -144,7 +145,7 @@ def upload_handler(project_name: str, scp_cmd: str):
button.click(add_ending_silence_handler, inputs=[project_name])

with gr.Tab("mp3 to wav @ sr"):
sr = gr.Textbox(label="Sample Rate", value="22050")
sr = gr.Textbox(label="Sample Rate", value="24000")
input_path = gr.Textbox(label="input mp3s path")
output_path = gr.Textbox(label="output wavs path")
button = gr.Button()
114 changes: 99 additions & 15 deletions dswav/ds.py
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
import json
import shutil
import subprocess
from typing import List, Optional
from typing import Dict, List, Optional
from dswav.config import Config
from concurrent.futures import ThreadPoolExecutor
import os
@@ -16,10 +16,16 @@
read_sentences,
)
from dswav.styletts2 import text_to_phonemes
from nltk.tokenize import word_tokenize
import random


# MAX_LEN = 512
MAX_LEN = 400
SILENCE_EOS = " …"
SILENCE_LENGTH_MS = 100
# DS_SIZE_LIMIT = 17
DS_SIZE_LIMIT = None


def add_silence_if_needed(project_name: str):
@@ -53,38 +59,116 @@ def combine_many(project_name: str, merges: List[str]):
with open(f"{merge}/index.json", "r") as f:
index = json.loads(f.read())
sentences.extend(
list(map(lambda x: Sentence(x["id"], x["content"], []), index))
list(
map(
lambda x: Sentence(
x["id"],
x["content"],
[],
speaker_id=x["speaker_id"],
),
index,
)
)
)
copy_files(f"{merge}/wavs", f"./projects/{project_name}/ds/wavs")
# copy_files(f"{merge}/wavs", f"./projects/{project_name}/ds/wavs")
write_sentences(project_name, sentences)


def build_ds(project_name: str, add_ending_silence: bool):
sentences: List[Sentence] = read_sentences(project_name)

train_list, val_list = split_list(sentences, 0.99)
if DS_SIZE_LIMIT:
random.shuffle(sentences)
sentences = sentences[:DS_SIZE_LIMIT]

with open(f"./projects/{project_name}/ds/metadata.csv", "w") as f:
csv_content = "\n".join(
[f"{s.id}|{s.sentence}|{s.sentence}" for s in sentences]
)
f.write(csv_content)
SPEAKER_IDS: Dict[str, int] = {}

for sentence in sentences:
if sentence.speaker_id in SPEAKER_IDS:
continue
SPEAKER_IDS[sentence.speaker_id] = len(SPEAKER_IDS.keys()) + 1

print(SPEAKER_IDS)

# with open(f"./projects/{project_name}/ds/metadata.csv", "w") as f:
# csv_content = "\n".join(
# [f"{s.id}|{s.sentence}|{s.sentence}" for s in sentences]
# )
# f.write(csv_content)

def get_phonemes(sentence: Sentence):
return text_to_phonemes(
sentence.sentence
if not add_ending_silence
else f"{sentence.sentence}{SILENCE_EOS}"
try:
the_text = (
sentence.sentence
if not add_ending_silence
else f"{sentence.sentence}{SILENCE_EOS}"
)
if the_text == "":
return None
# token_count = len(word_tokenize(the_text))
# if token_count > MAX_LEN:
# return None
if len(the_text) > MAX_LEN:
return None
return text_to_phonemes(the_text)
except:
return None

l0 = len(sentences)
lines = list(
filter(
lambda x: x["content"] != None,
map(
lambda x: {
"sentence": x,
"id": x.id,
"content": get_phonemes(x),
"speaker_id": SPEAKER_IDS[x.speaker_id],
},
sentences,
),
)
)
l1 = len(lines)
print(
f"{l0-l1} samples dropped due to being too long or too short, <=0 or >{MAX_LEN}"
)

train_list, val_list = split_list(lines, 0.99)

with open(f"./projects/{project_name}/debug.json", "w") as f:
json.dump(
list(
map(
lambda x: {
**x,
"sentence": x["sentence"].content,
},
lines,
)
),
f,
indent=4,
ensure_ascii=False,
)

with open(f"./projects/{project_name}/ds/train_list.txt", "w") as f:
data = "\n".join(
[f"{line.id}.wav|{get_phonemes(line)}|0" for line in train_list]
[
f"{line['id']}.wav|{line['content']}|{line['speaker_id']}"
for line in train_list
]
)
f.write(data)

with open(f"./projects/{project_name}/ds/val_list.txt", "w") as f:
data = "\n".join([f"{line.id}.wav|{get_phonemes(line)}|0" for line in val_list])
data = "\n".join(
[
f"{line['id']}.wav|{line['content']}|{line['speaker_id']}"
for line in val_list
]
)
f.write(data)

shutil.make_archive(
33 changes: 25 additions & 8 deletions dswav/sentence.py
Original file line number | Diff line number | Diff line change
@@ -26,18 +26,27 @@ def to_dict(self):


class Sentence:
_id: Optional[str] = None
words: List[Word]
content: str = ""
_id: Optional[str] = None

def __init__(self, id: Optional[str], content: str, words: List[Word]) -> None:
speaker_id: str = ""

def __init__(
self,
id: Optional[str],
content: str,
words: List[Word],
speaker_id: str,
) -> None:
self._id = id if id else None
self.content = content
self.words = words
self.content = content
self.speaker_id = speaker_id

def to_dict(self):
return {
"id": self.id,
"speaker_id": self.speaker_id,
"content": self.content,
"words": [word.to_dict() for word in self.words],
}
@@ -97,9 +106,17 @@ def read_sentences(project_name: str):

for single in raw:
words = list(
map(lambda x: Word(x["word"], x["start"], x["end"]), single["words"])
map(
lambda x: Word(x["word"], x["start"], x["end"]),
single["words"],
)
)
sentence = Sentence(
single["id"],
single["content"],
words,
speaker_id=single["speaker_id"],
)
sentence = Sentence(single["id"], single["content"], words)
sentences.append(sentence)

return sentences
@@ -118,7 +135,7 @@ def write_sentences(project_name: str, sentences: List[Sentence]):
def compute_sentences(words: List[Word]):
""" """
sentences: List[Sentence] = []
tmp: Sentence = Sentence(None, "", [])
tmp: Sentence = Sentence(None, "", [], "")
is_multi = False

for word in words:
@@ -151,7 +168,7 @@ def compute_sentences(words: List[Word]):
continue

sentences.append(tmp)
tmp = Sentence(None, "", [])
tmp = Sentence(None, "", [], "")
is_multi = False

return sentences
13 changes: 2 additions & 11 deletions scripts/mp3-to-wav.sh
Original file line number | Diff line number | Diff line change
@@ -4,15 +4,6 @@ IN_DIR=$1
OUT_DIR=$2
SR=$3

export OUT_DIR SR # Export these variables so they're available to subshells

# Loop through each MP3 file in the directory
for file in "$IN_DIR"/*.mp3; do
# Skip if not a file
[ -f "$file" ] || continue

# Construct the WAV filename
base_name=$(basename "$file" .mp3)

# Convert MP3 to WAV using FFmpeg
ffmpeg -i "$file" -ar $SR "$OUT_DIR/$base_name.wav"
done
find "$IN_DIR" -name '*.mp3' | parallel -I% --max-args 1 ffmpeg -i % -ar $SR "$OUT_DIR/{/.}.wav"

0 comments on commit 6576107

Please sign in to comment.