stitching vocab to generate wakeword samples #66

Merged
merged 14 commits on Apr 7, 2021
10 changes: 8 additions & 2 deletions README.md
@@ -63,7 +63,7 @@ client.start().join()

Assuming MFA is installed via `download_mfa.sh` and the [Common Voice dataset](https://commonvoice.mozilla.org/) has already been downloaded, one can easily generate a dataset for a custom wakeword using the `generate_dataset.sh` script.
```bash
./generate_dataset.sh <common voice dataset path> <underscore separated wakeword (e.g. hey_fire_fox)> <inference sequence (e.g. [0,1,2])>
./generate_dataset.sh <common voice dataset path> <underscore separated wakeword (e.g. hey_fire_fox)> <inference sequence (e.g. [0,1,2])> <(Optional) "true" to skip negative dataset generation>
```

In the example that follows, we describe the process of generating a dataset for the word "fire."
@@ -102,10 +102,16 @@ mfa_align data/fire-positive/audio eng.dict pretrained_models/english.zip output
DATASET_PATH=data/fire-positive python -m training.run.attach_alignment --align-type mfa -i output-folder
```

8. (Optional) Stitch vocab samples of aligned dataset to generate wakeword samples

```bash
VOCAB='["fire"]' INFERENCE_SEQUENCE=[0] python -m training.run.stitch_vocab_samples --aligned-dataset "data/fire-positive" --stitched-dataset "data/fire-stitched"
```

### Training and Running a Model

1. Source the relevant environment variables for training the `res8` model: `source envs/res8.env`.
2. Train the model: `python -m training.run.train -i data/fire-positive data/fire-negative --model res8 --workspace workspaces/fire-res8`.
2. Train the model: `python -m training.run.train -i data/fire-positive data/fire-negative data/fire-stitched --model res8 --workspace workspaces/fire-res8`.
3. For the CLI demo, run `python -m training.run.demo --model res8 --workspace workspaces/fire-res8`.

`train_model.sh` is also available, which encapsulates the individual commands in a single bash script
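For intuition about what the new stitching step produces: word-aligned segments are cut out of existing positive samples and concatenated to synthesize wakeword utterances. Below is a minimal sketch of that idea, assuming 16 kHz mono audio and millisecond alignment timestamps; the helper names are illustrative, not `stitch_vocab_samples`'s actual internals.

```python
import numpy as np

SAMPLE_RATE = 16000  # assumed: howl datasets are mono 16 kHz

def slice_word(audio: np.ndarray, start_ms: float, end_ms: float) -> np.ndarray:
    """Cut one aligned word out of a raw audio buffer (timestamps in ms)."""
    start = int(start_ms / 1000 * SAMPLE_RATE)
    end = int(end_ms / 1000 * SAMPLE_RATE)
    return audio[start:end]

def stitch_sample(word_clips: list) -> np.ndarray:
    """Concatenate per-word clips (e.g. "hey", "fire", "fox" segments) into one utterance."""
    return np.concatenate(word_clips)

# hypothetical usage: synthesize a "fire" wakeword sample from an aligned clip
clip = np.zeros(SAMPLE_RATE)  # 1 s of silent audio as a placeholder
stitched = stitch_sample([slice_word(clip, 0.0, 480.0)])
```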
32 changes: 21 additions & 11 deletions generate_dataset.sh
@@ -1,14 +1,18 @@
#!/bin/bash
# TODO:: enable this flag after fixing segfault issue of create_new_dataset
# set -e
set -e

COMMON_VOICE_DATASET_PATH=${1} # common voice dataset path
DATASET_NAME=${2} # underscore separated wakeword (e.g. hey_fire_fox)
INFERENCE_SEQUENCE=${3} # inference sequence (e.g. [0,1,2])
#${4} pass true to skip generating negative dataset

if [ $# -lt 3 ]; then
echo 1>&2 "invalid arguments: ./generate_dataset.sh <common voice dataset path> <underscore separated wakeword> <inference sequence>"
exit 2
echo 1>&2 "invalid arguments: ./generate_dataset.sh <common voice dataset path> <underscore separated wakeword> <inference sequence>"
exit 2
elif [ $# -eq 4 ]; then
SKIP_NEG_DATASET=${4}
else
SKIP_NEG_DATASET="false"
fi

echo "COMMON_VOICE_DATASET_PATH: ${COMMON_VOICE_DATASET_PATH}"
@@ -27,13 +31,15 @@ DATASET_FOLDER="data/${DATASET_NAME}"
echo ">>> generating datasets for ${VOCAB} at ${DATASET_FOLDER}"
mkdir -p "${DATASET_FOLDER}"

NEG_DATASET_PATH="${DATASET_FOLDER}/negative"
echo ">>> generating negative dataset: ${NEG_DATASET_PATH}"
mkdir -p "${NEG_DATASET_PATH}"
time VOCAB=${VOCAB} INFERENCE_SEQUENCE=${INFERENCE_SEQUENCE} DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.create_raw_dataset -i ${COMMON_VOICE_DATASET_PATH} --positive-pct 0 --negative-pct 5
if [ ${SKIP_NEG_DATASET} != "true" ]; then
NEG_DATASET_PATH="${DATASET_FOLDER}/negative"
echo ">>> generating negative dataset: ${NEG_DATASET_PATH}"
mkdir -p "${NEG_DATASET_PATH}"
time VOCAB=${VOCAB} INFERENCE_SEQUENCE=${INFERENCE_SEQUENCE} DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.create_raw_dataset -i ${COMMON_VOICE_DATASET_PATH} --positive-pct 0 --negative-pct 5

echo ">>> generating mock alignment for the negative set"
time DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.attach_alignment --align-type stub
echo ">>> generating mock alignment for the negative set"
time DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.attach_alignment --align-type stub
fi

POS_DATASET_PATH="${DATASET_FOLDER}/positive"
echo ">>> generating positive dataset: ${POS_DATASET_PATH}"
@@ -53,6 +59,10 @@ time yes n | ./bin/mfa_align --verbose --clean --num_jobs 12 "../${POS_DATASET_P
popd

echo ">>> attaching the MFA alignment to the positive dataset"
DATASET_PATH=${POS_DATASET_PATH} python -m training.run.attach_alignment --align-type mfa -i "${POS_DATASET_ALIGNMENT}"
time DATASET_PATH=${POS_DATASET_PATH} python -m training.run.attach_alignment --align-type mfa -i "${POS_DATASET_ALIGNMENT}"

STITCHED_DATASET="${DATASET_FOLDER}/stitched"
echo ">>> stitching vocab samples to generate a dataset of stitched wakeword samples: ${STITCHED_DATASET}"
time VOCAB=${VOCAB} INFERENCE_SEQUENCE=${INFERENCE_SEQUENCE} python -m training.run.stitch_vocab_samples --aligned-dataset "${POS_DATASET_PATH}" --stitched-dataset "${STITCHED_DATASET}"

echo ">>> Dataset is ready for ${VOCAB}"
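For context on the raw-dataset step the script drives: `create_raw_dataset` partitions Common Voice clips into positive and negative pools according to whether the transcript contains the wakeword vocabulary. A rough sketch of that predicate, assuming a simple word-boundary match (the module's exact filtering logic may differ):

```python
import re
from typing import Iterable, List, Tuple

def partition_clips(transcripts: Iterable[Tuple[str, str]], vocab: List[str]):
    """Split (path, transcript) pairs into positive/negative pools by vocab match."""
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, vocab)) + r')\b', re.IGNORECASE)
    positives, negatives = [], []
    for path, transcript in transcripts:
        (positives if pattern.search(transcript) else negatives).append((path, transcript))
    return positives, negatives

# hypothetical usage with VOCAB='["fire"]'
pos, neg = partition_clips([("a.mp3", "fire in the hole"), ("b.mp3", "hello world")], ["fire"])
assert len(pos) == 1 and len(neg) == 1
```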
13 changes: 7 additions & 6 deletions howl/context.py
@@ -48,19 +48,20 @@ def __init__(self,
elif token_type == 'word':
self.add_vocab(vocab)

# initialize vocab set for the system
self.negative_label = len(self.adjusted_vocab)
self.vocab = Vocab({word: idx for idx, word in enumerate(
self.adjusted_vocab)}, oov_token_id=self.negative_label)

# initialize labeler; make sure this is located before adding other labels
if token_type == 'phone':
phone_phrases = [PhonePhrase.from_string(
x) for x in self.adjusted_vocab]
self.labeler = PhoneticFrameLabeler(phone_phrases)
elif token_type == 'word':
print('labeler vocab: ', self.adjusted_vocab)
self.labeler = WordFrameLabeler(self.adjusted_vocab)
self.labeler = WordFrameLabeler(self.vocab)

# initialize vocab set for the system and add negative label
self.negative_label = len(self.adjusted_vocab)
self.vocab = Vocab({word: idx for idx, word in enumerate(
self.adjusted_vocab)}, oov_token_id=self.negative_label)
# add negative label
self.add_vocab(['[OOV]'])

# initialize TranscriptSearcher with the processed targets
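The reordering above matters because `WordFrameLabeler` now receives the `Vocab` object instead of a plain word list, so the vocab must be constructed first. A stand-in sketch of the lookup behavior this relies on (not howl's actual `Vocab` class; the lowercasing is an assumption):

```python
class Vocab:
    """Stand-in for howl.data.tokenize.Vocab: maps a word to a label, with an OOV fallback."""
    def __init__(self, word2idx: dict, oov_token_id: int):
        self.word2idx = word2idx
        self.oov_token_id = oov_token_id

    def __getitem__(self, word: str) -> int:
        return self.word2idx.get(word.lower(), self.oov_token_id)

adjusted_vocab = ["fire"]
negative_label = len(adjusted_vocab)  # 1
vocab = Vocab({w: i for i, w in enumerate(adjusted_vocab)}, oov_token_id=negative_label)
assert vocab["fire"] == 0 and vocab["hello"] == 1  # OOV words map to the negative label
```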
9 changes: 7 additions & 2 deletions howl/data/dataset/base.py
@@ -3,7 +3,7 @@
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Generic, List, Mapping, Optional, TypeVar
from typing import Generic, List, Mapping, Optional, Tuple, TypeVar

import torch
from pydantic import BaseModel
@@ -31,6 +31,8 @@
@dataclass
class FrameLabelData:
timestamp_label_map: Mapping[float, int]
start_timestamp: List[Tuple[int, float]]
char_indices: List[Tuple[int, List[int]]]


@dataclass
@@ -158,7 +160,10 @@ def emplaced_audio_data(self,
new: bool = False) -> 'WakeWordClipExample':
ex = super().emplaced_audio_data(audio_data, scale, bias, new)
label_data = {} if new else {scale * k + bias: v for k, v in self.label_data.timestamp_label_map.items()}
return WakeWordClipExample(FrameLabelData(label_data), ex.metadata, audio_data, self.sample_rate)
return WakeWordClipExample(FrameLabelData(label_data, self.label_data.start_timestamp, self.label_data.char_indices),
ex.metadata,
audio_data,
self.sample_rate)


@dataclass
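To make the two new `FrameLabelData` fields concrete, here is a hypothetical instance for the one-word transcript "fire" (the timestamp values are invented for illustration):

```python
from howl.data.dataset.base import FrameLabelData

# label 0 = "fire"; timestamps in milliseconds (example values only)
label_data = FrameLabelData(
    timestamp_label_map={480.0: 0},    # "fire" ends at 480 ms -> label 0
    start_timestamp=[(0, 0.0)],        # (label, start time): the first word starts at 0.0
    char_indices=[(0, [0, 1, 2, 3])],  # (label, character indices of "fire" in the transcript)
)
```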
37 changes: 22 additions & 15 deletions howl/data/dataset/labeller.py
@@ -3,6 +3,7 @@
from typing import List

from howl.data.dataset.phone import PhonePhrase
from howl.data.tokenize import Vocab

from .base import AudioClipMetadata, FrameLabelData

@@ -38,22 +39,28 @@ def compute_frame_labels(self, metadata: AudioClipMetadata) -> FrameLabelData:


class WordFrameLabeler(FrameLabeler):
def __init__(self, words: List[str], ceil_word_boundary: bool = False):
self.words = words
def __init__(self, vocab: Vocab, ceil_word_boundary: bool = False):
self.vocab = vocab
self.ceil_word_boundary = ceil_word_boundary

def compute_frame_labels(self, metadata: AudioClipMetadata) -> FrameLabelData:
frame_labels = dict()
t = f' {metadata.transcription} '
start = 0
for idx, word in enumerate(self.words):
while True:
try:
start = t.index(word, start)
except ValueError:
break
while self.ceil_word_boundary and start + len(word) < len(t) - 1 and t[start + len(word)] != ' ':
start += 1
frame_labels[metadata.end_timestamps[start + len(word.rstrip()) - 2]] = idx
start += 1
return FrameLabelData(frame_labels)
char_indices = []
start_timestamp = []

char_idx = 0
for word in metadata.transcription.split():
vocab_found, remaining_transcript = self.vocab.trie.max_split(word)
word_size = len(word.rstrip())

# if the current word is in the vocab, store the necessary information
if vocab_found and remaining_transcript == "":
label = self.vocab[word]
end_timestamp = metadata.end_timestamps[char_idx + word_size - 1]
frame_labels[end_timestamp] = label
char_indices.append((label, list(range(char_idx, char_idx + word_size))))
start_timestamp.append((label, metadata.end_timestamps[char_idx-1] if char_idx > 0 else 0.0))

char_idx += word_size + 1 # space

return FrameLabelData(frame_labels, start_timestamp, char_indices)
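A quick trace of the new matching loop on a toy transcript, using plain data in place of the real `Vocab` and `AudioClipMetadata` classes (per-character end timestamps in milliseconds are assumed):

```python
# toy stand-ins: transcript "fire truck", one end timestamp per character (ms)
transcription = 'fire truck'
end_timestamps = [120.0, 240.0, 360.0, 480.0, 520.0, 620.0, 720.0, 820.0, 920.0, 1020.0]
vocab = {'fire': 0}  # word -> label

frame_labels, start_timestamp, char_indices = {}, [], []
char_idx = 0
for word in transcription.split():
    word_size = len(word)
    if word in vocab:  # the real code uses vocab.trie.max_split(word)
        label = vocab[word]
        frame_labels[end_timestamps[char_idx + word_size - 1]] = label
        char_indices.append((label, list(range(char_idx, char_idx + word_size))))
        start_timestamp.append((label, end_timestamps[char_idx - 1] if char_idx > 0 else 0.0))
    char_idx += word_size + 1  # +1 for the separating space

assert frame_labels == {480.0: 0}  # "fire" is labeled at its last character's timestamp
```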
33 changes: 18 additions & 15 deletions howl/data/dataset/serialize.py
@@ -1,24 +1,23 @@
import json
import logging
from collections import defaultdict
from copy import deepcopy
from functools import partial
from typing import Tuple, TypeVar, List
from pathlib import Path
from multiprocessing import Pool
import json
import logging
from pathlib import Path
from typing import List, Tuple, TypeVar

from tqdm import tqdm
import pandas as pd
import soundfile

from .base import DatasetType, AudioClipMetadata, UNKNOWN_TRANSCRIPTION
from .dataset import AudioClipDataset, WakeWordDataset, AudioClassificationDataset, AudioDataset, \
HonkSpeechCommandsDataset
from howl.registered import RegisteredObjectBase
from howl.utils.audio import silent_load
from howl.utils.hash import sha256_int
from howl.utils.transcribe import SpeechToText
from tqdm import tqdm

from .base import UNKNOWN_TRANSCRIPTION, AudioClipMetadata, DatasetType
from .dataset import (AudioClassificationDataset, AudioClipDataset,
AudioDataset, HonkSpeechCommandsDataset, WakeWordDataset)

__all__ = ['AudioDatasetWriter',
'AudioClipDatasetLoader',
@@ -58,14 +57,15 @@ def __exit__(self, *args):


class AudioDatasetWriter:
def __init__(self, dataset: AudioClipDataset, mode: str = 'w', print_progress: bool = True):
def __init__(self, dataset: AudioClipDataset, prefix: str = '', mode: str = 'w', print_progress: bool = True):
self.dataset = dataset
self.print_progress = print_progress
self.mode = mode
self.prefix = prefix

def write(self, folder: Path):
def process(metadata: AudioClipMetadata):
new_path = audio_folder / metadata.path.with_suffix('.wav').name
new_path = (audio_folder / metadata.audio_id).with_suffix('.wav')
if not new_path.exists():
audio_data = silent_load(str(metadata.path), self.dataset.sr, self.dataset.mono)
soundfile.write(str(new_path), audio_data, self.dataset.sr)
@@ -75,7 +75,7 @@ def process(metadata: AudioClipMetadata):
folder.mkdir(exist_ok=True)
audio_folder = folder / 'audio'
audio_folder.mkdir(exist_ok=True)
with AudioDatasetMetadataWriter(folder, self.dataset.set_type, mode=self.mode) as writer:
with AudioDatasetMetadataWriter(folder, self.dataset.set_type, prefix=self.prefix, mode=self.mode) as writer:
for metadata in tqdm(self.dataset.metadata_list, disable=not self.print_progress, desc='Writing files'):
try:
process(metadata)
@@ -133,15 +133,17 @@ class WakeWordDatasetLoader(MetadataLoaderMixin, PathDatasetLoader):
dataset_class = WakeWordDataset
metadata_class = AudioClipMetadata


def transcribe_hey_snips_audio(path, metadata):
stt = SpeechToText()
path = (path / metadata['audio_file_path']).absolute()
transcription = 'hey snips'
if metadata['is_hotword'] == 0: # negative sample
if metadata['is_hotword'] == 0: # negative sample
transcription = stt.transcribe(path)

return path, transcription


class HeySnipsWakeWordLoader(RegisteredPathDatasetLoader, name='hey-snips'):
def __init__(self, num_processes=8):
self.stt = SpeechToText()
@@ -192,7 +194,7 @@ def load(filename, set_type):
return (load('train.json', DatasetType.TRAINING),
load('dev.json', DatasetType.DEV),
load('test.json', DatasetType.TEST))


class GoogleSpeechCommandsDatasetLoader(RegisteredPathDatasetLoader, name='gsc'):
def __init__(self, vocab: List[str] = None, use_bg_noise: bool = False):
@@ -237,7 +239,8 @@ def load(filename, set_type):
df = pd.read_csv(str(path / filename), sep='\t', quoting=3, na_filter=False)
metadata_list = []
for tup in df.itertuples():
metadata_list.append(AudioClipMetadata(path=(path / 'clips' / tup.path).absolute(), transcription=tup.sentence))
metadata_list.append(AudioClipMetadata(
path=(path / 'clips' / tup.path).absolute(), transcription=tup.sentence))
return AudioClipDataset(metadata_list=metadata_list, set_type=set_type, **dataset_kwargs)

assert path.exists(), 'dataset path doesn\'t exist'
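The switch from the source filename to `audio_id` keeps written filenames unique even when many clips derive from the same source, as stitched samples do. A tiny stand-in showing the path construction before and after (the `audio_id` value here is assumed):

```python
from pathlib import Path

audio_folder = Path('data/fire-stitched/audio')
old_source = Path('/datasets/common_voice/clips/common_voice_en_123.mp3')
audio_id = 'common_voice_en_123'  # assumed: a stable per-clip identifier

old_path = audio_folder / old_source.with_suffix('.wav').name  # keyed to the source file name
new_path = (audio_folder / audio_id).with_suffix('.wav')       # keyed to the clip's audio_id
assert old_path == new_path  # identical here, but audio_id stays unique for stitched clips
```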