stitching vocab to generate wakeword samples #66

Merged
merged 14 commits on Apr 7, 2021
10 changes: 8 additions & 2 deletions README.md
@@ -63,7 +63,7 @@ client.start().join()

Assuming MFA is installed via `download_mfa.sh` and the [Common Voice dataset](https://commonvoice.mozilla.org/) has already been downloaded, one can easily generate a dataset for a custom wakeword using the `generate_dataset.sh` script.
```bash
./generate_dataset.sh <common voice dataset path> <underscore separated wakeword (e.g. hey_fire_fox)> <inference sequence (e.g. [0,1,2])>
./generate_dataset.sh <common voice dataset path> <underscore separated wakeword (e.g. hey_fire_fox)> <inference sequence (e.g. [0,1,2])> <(Optional) "true" to skip negative dataset generation>
```

In the example that follows, we describe the process of generating a dataset for the word "fire."
@@ -102,10 +102,16 @@ mfa_align data/fire-positive/audio eng.dict pretrained_models/english.zip output
DATASET_PATH=data/fire-positive python -m training.run.attach_alignment --align-type mfa -i output-folder
```

8. (Optional) Stitch vocab samples of aligned dataset to generate wakeword samples

```bash
VOCAB='["fire"]' INFERENCE_SEQUENCE=[0] python -m training.run.stitch_vocab_samples --aligned-dataset "data/fire-positive" --stitched-dataset "data/fire-stitched"
```

### Training and Running a Model

1. Source the relevant environment variables for training the `res8` model: `source envs/res8.env`.
2. Train the model: `python -m training.run.train -i data/fire-positive data/fire-negative --model res8 --workspace workspaces/fire-res8`.
2. Train the model: `python -m training.run.train -i data/fire-positive data/fire-negative data/fire-stitched --model res8 --workspace workspaces/fire-res8`.
3. For the CLI demo, run `python -m training.run.demo --model res8 --workspace workspaces/fire-res8`.

`train_model.sh` is also available, which encapsulates the individual commands in a single bash script
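For intuition about what the new stitching step produces: word-aligned segments are cut out of existing positive samples and concatenated to synthesize wakeword utterances. Below is a minimal sketch of that idea, assuming 16 kHz mono audio and millisecond alignment timestamps; the helper names are illustrative, not `stitch_vocab_samples`'s actual internals.

```python
import numpy as np

SAMPLE_RATE = 16000  # assumed: howl datasets are mono 16 kHz

def slice_word(audio: np.ndarray, start_ms: float, end_ms: float) -> np.ndarray:
    """Cut one aligned word out of a raw audio buffer (timestamps in ms)."""
    start = int(start_ms / 1000 * SAMPLE_RATE)
    end = int(end_ms / 1000 * SAMPLE_RATE)
    return audio[start:end]

def stitch_sample(word_clips: list) -> np.ndarray:
    """Concatenate per-word clips (e.g. "hey", "fire", "fox" segments) into one utterance."""
    return np.concatenate(word_clips)

# hypothetical usage: synthesize a "fire" wakeword sample from an aligned clip
clip = np.zeros(SAMPLE_RATE)  # 1 s of silent audio as a placeholder
stitched = stitch_sample([slice_word(clip, 0.0, 480.0)])
```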
32 changes: 21 additions & 11 deletions generate_dataset.sh
@@ -1,14 +1,18 @@
#!/bin/bash
# TODO:: enable this flag after fixing segfault issue of create_new_dataset
# set -e
set -e

COMMON_VOICE_DATASET_PATH=${1} # common voice dataset path
DATASET_NAME=${2} # underscore separated wakeword (e.g. hey_fire_fox)
INFERENCE_SEQUENCE=${3} # inference sequence (e.g. [0,1,2])
#${4} pass true to skip generating negative dataset

if [ $# -lt 3 ]; then
echo 1>&2 "invalid arguments: ./generate_dataset.sh <common voice dataset path> <underscore separated wakeword> <inference sequence>"
exit 2
echo 1>&2 "invalid arguments: ./generate_dataset.sh <common voice dataset path> <underscore separated wakeword> <inference sequence>"
exit 2
elif [ $# -eq 4 ]; then
SKIP_NEG_DATASET=${4}
else
SKIP_NEG_DATASET="false"
fi

echo "COMMON_VOICE_DATASET_PATH: ${COMMON_VOICE_DATASET_PATH}"
@@ -27,13 +31,15 @@ DATASET_FOLDER="data/${DATASET_NAME}"
echo ">>> generating datasets for ${VOCAB} at ${DATASET_FOLDER}"
mkdir -p "${DATASET_FOLDER}"

NEG_DATASET_PATH="${DATASET_FOLDER}/negative"
echo ">>> generating negative dataset: ${NEG_DATASET_PATH}"
mkdir -p "${NEG_DATASET_PATH}"
time VOCAB=${VOCAB} INFERENCE_SEQUENCE=${INFERENCE_SEQUENCE} DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.create_raw_dataset -i ${COMMON_VOICE_DATASET_PATH} --positive-pct 0 --negative-pct 5
if [ ${SKIP_NEG_DATASET} != "true" ]; then
NEG_DATASET_PATH="${DATASET_FOLDER}/negative"
echo ">>> generating negative dataset: ${NEG_DATASET_PATH}"
mkdir -p "${NEG_DATASET_PATH}"
time VOCAB=${VOCAB} INFERENCE_SEQUENCE=${INFERENCE_SEQUENCE} DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.create_raw_dataset -i ${COMMON_VOICE_DATASET_PATH} --positive-pct 0 --negative-pct 5

echo ">>> generating mock alignment for the negative set"
time DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.attach_alignment --align-type stub
echo ">>> generating mock alignment for the negative set"
time DATASET_PATH=${NEG_DATASET_PATH} python -m training.run.attach_alignment --align-type stub
fi

POS_DATASET_PATH="${DATASET_FOLDER}/positive"
echo ">>> generating positive dataset: ${POS_DATASET_PATH}"
@@ -53,6 +59,10 @@ time yes n | ./bin/mfa_align --verbose --clean --num_jobs 12 "../${POS_DATASET_P
popd

echo ">>> attaching the MFA alignment to the positive dataset"
DATASET_PATH=${POS_DATASET_PATH} python -m training.run.attach_alignment --align-type mfa -i "${POS_DATASET_ALIGNMENT}"
time DATASET_PATH=${POS_DATASET_PATH} python -m training.run.attach_alignment --align-type mfa -i "${POS_DATASET_ALIGNMENT}"

STITCHED_DATASET="${DATASET_FOLDER}/stitched"
echo ">>> stitching vocab samples to generate a dataset of stitched wakeword samples: ${STITCHED_DATASET}"
time VOCAB=${VOCAB} INFERENCE_SEQUENCE=${INFERENCE_SEQUENCE} python -m training.run.stitch_vocab_samples --aligned-dataset "${POS_DATASET_PATH}" --stitched-dataset "${STITCHED_DATASET}"

echo ">>> Dataset is ready for ${VOCAB}"
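For context on the raw-dataset step the script drives: `create_raw_dataset` partitions Common Voice clips into positive and negative pools according to whether the transcript contains the wakeword vocabulary. A rough sketch of that predicate, assuming a simple word-boundary match (the module's exact filtering logic may differ):

```python
import re
from typing import Iterable, List, Tuple

def partition_clips(transcripts: Iterable[Tuple[str, str]], vocab: List[str]):
    """Split (path, transcript) pairs into positive/negative pools by vocab match."""
    pattern = re.compile(r'\b(' + '|'.join(map(re.escape, vocab)) + r')\b', re.IGNORECASE)
    positives, negatives = [], []
    for path, transcript in transcripts:
        (positives if pattern.search(transcript) else negatives).append((path, transcript))
    return positives, negatives

# hypothetical usage with VOCAB='["fire"]'
pos, neg = partition_clips([("a.mp3", "fire in the hole"), ("b.mp3", "hello world")], ["fire"])
assert len(pos) == 1 and len(neg) == 1
```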
13 changes: 7 additions & 6 deletions howl/context.py
@@ -48,19 +48,20 @@ def __init__(self,
elif token_type == 'word':
self.add_vocab(vocab)

# initialize vocab set for the system
self.negative_label = len(self.adjusted_vocab)
self.vocab = Vocab({word: idx for idx, word in enumerate(
self.adjusted_vocab)}, oov_token_id=self.negative_label)

# initialize labeler; make sure this is located before adding other labels
if token_type == 'phone':
phone_phrases = [PhonePhrase.from_string(
x) for x in self.adjusted_vocab]
self.labeler = PhoneticFrameLabeler(phone_phrases)
elif token_type == 'word':
print('labeler vocab: ', self.adjusted_vocab)
self.labeler = WordFrameLabeler(self.adjusted_vocab)
self.labeler = WordFrameLabeler(self.vocab)

# initialize vocab set for the system and add negative label
self.negative_label = len(self.adjusted_vocab)
self.vocab = Vocab({word: idx for idx, word in enumerate(
self.adjusted_vocab)}, oov_token_id=self.negative_label)
# add negative label
self.add_vocab(['[OOV]'])

# initialize TranscriptSearcher with the processed targets
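The reordering above matters because `WordFrameLabeler` now receives the `Vocab` object instead of a plain word list, so the vocab must be constructed first. A stand-in sketch of the lookup behavior this relies on (not howl's actual `Vocab` class; the lowercasing is an assumption):

```python
class Vocab:
    """Stand-in for howl.data.tokenize.Vocab: maps a word to a label, with an OOV fallback."""
    def __init__(self, word2idx: dict, oov_token_id: int):
        self.word2idx = word2idx
        self.oov_token_id = oov_token_id

    def __getitem__(self, word: str) -> int:
        return self.word2idx.get(word.lower(), self.oov_token_id)

adjusted_vocab = ["fire"]
negative_label = len(adjusted_vocab)  # 1
vocab = Vocab({w: i for i, w in enumerate(adjusted_vocab)}, oov_token_id=negative_label)
assert vocab["fire"] == 0 and vocab["hello"] == 1  # OOV words map to the negative label
```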
9 changes: 7 additions & 2 deletions howl/data/dataset/base.py
@@ -3,7 +3,7 @@
from copy import deepcopy
from dataclasses import dataclass
from pathlib import Path
from typing import Generic, List, Mapping, Optional, TypeVar
from typing import Generic, List, Mapping, Optional, Tuple, TypeVar

import torch
from pydantic import BaseModel
@@ -31,6 +31,8 @@
@dataclass
class FrameLabelData:
timestamp_label_map: Mapping[float, int]
start_timestamp: List[Tuple[int, float]]
char_indices: List[Tuple[int, List[int]]]


@dataclass
@@ -158,7 +160,10 @@ def emplaced_audio_data(self,
new: bool = False) -> 'WakeWordClipExample':
ex = super().emplaced_audio_data(audio_data, scale, bias, new)
label_data = {} if new else {scale * k + bias: v for k, v in self.label_data.timestamp_label_map.items()}
return WakeWordClipExample(FrameLabelData(label_data), ex.metadata, audio_data, self.sample_rate)
return WakeWordClipExample(FrameLabelData(label_data, self.label_data.start_timestamp, self.label_data.char_indices),
ex.metadata,
audio_data,
self.sample_rate)


@dataclass
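To make the two new `FrameLabelData` fields concrete, here is a hypothetical instance for the one-word transcript "fire" (the timestamp values are invented for illustration):

```python
from howl.data.dataset.base import FrameLabelData

# label 0 = "fire"; timestamps in milliseconds (example values only)
label_data = FrameLabelData(
    timestamp_label_map={480.0: 0},    # "fire" ends at 480 ms -> label 0
    start_timestamp=[(0, 0.0)],        # (label, start time): the first word starts at 0.0
    char_indices=[(0, [0, 1, 2, 3])],  # (label, character indices of "fire" in the transcript)
)
```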
37 changes: 22 additions & 15 deletions howl/data/dataset/labeller.py
@@ -3,6 +3,7 @@
from typing import List

from howl.data.dataset.phone import PhonePhrase
from howl.data.tokenize import Vocab

from .base import AudioClipMetadata, FrameLabelData

@@ -38,22 +39,28 @@ def compute_frame_labels(self, metadata: AudioClipMetadata) -> FrameLabelData:


class WordFrameLabeler(FrameLabeler):
def __init__(self, words: List[str], ceil_word_boundary: bool = False):
self.words = words
def __init__(self, vocab: Vocab, ceil_word_boundary: bool = False):
self.vocab = vocab
self.ceil_word_boundary = ceil_word_boundary

def compute_frame_labels(self, metadata: AudioClipMetadata) -> FrameLabelData:
frame_labels = dict()
t = f' {metadata.transcription} '
start = 0
for idx, word in enumerate(self.words):
while True:
try:
start = t.index(word, start)
except ValueError:
break
while self.ceil_word_boundary and start + len(word) < len(t) - 1 and t[start + len(word)] != ' ':
start += 1
frame_labels[metadata.end_timestamps[start + len(word.rstrip()) - 2]] = idx
start += 1
return FrameLabelData(frame_labels)
char_indices = []
start_timestamp = []

char_idx = 0
for word in metadata.transcription.split():
vocab_found, remaining_transcript = self.vocab.trie.max_split(word)
word_size = len(word.rstrip())

# if the current word is in the vocab, store the necessary information
if vocab_found and remaining_transcript == "":
label = self.vocab[word]
end_timestamp = metadata.end_timestamps[char_idx + word_size - 1]
frame_labels[end_timestamp] = label
char_indices.append((label, list(range(char_idx, char_idx + word_size))))
start_timestamp.append((label, metadata.end_timestamps[char_idx-1] if char_idx > 0 else 0.0))

char_idx += word_size + 1 # space

return FrameLabelData(frame_labels, start_timestamp, char_indices)
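A quick trace of the new matching loop on a toy transcript, using plain data in place of the real `Vocab` and `AudioClipMetadata` classes (per-character end timestamps in milliseconds are assumed):

```python
# toy stand-ins: transcript "fire truck", one end timestamp per character (ms)
transcription = 'fire truck'
end_timestamps = [120.0, 240.0, 360.0, 480.0, 520.0, 620.0, 720.0, 820.0, 920.0, 1020.0]
vocab = {'fire': 0}  # word -> label

frame_labels, start_timestamp, char_indices = {}, [], []
char_idx = 0
for word in transcription.split():
    word_size = len(word)
    if word in vocab:  # the real code uses vocab.trie.max_split(word)
        label = vocab[word]
        frame_labels[end_timestamps[char_idx + word_size - 1]] = label
        char_indices.append((label, list(range(char_idx, char_idx + word_size))))
        start_timestamp.append((label, end_timestamps[char_idx - 1] if char_idx > 0 else 0.0))
    char_idx += word_size + 1  # +1 for the separating space

assert frame_labels == {480.0: 0}  # "fire" is labeled at its last character's timestamp
```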
33 changes: 18 additions & 15 deletions howl/data/dataset/serialize.py
@@ -1,24 +1,23 @@
import json
import logging
from collections import defaultdict
from copy import deepcopy
from functools import partial
from typing import Tuple, TypeVar, List
from pathlib import Path
from multiprocessing import Pool
import json
import logging
from pathlib import Path
from typing import List, Tuple, TypeVar

from tqdm import tqdm
import pandas as pd
import soundfile

from .base import DatasetType, AudioClipMetadata, UNKNOWN_TRANSCRIPTION
from .dataset import AudioClipDataset, WakeWordDataset, AudioClassificationDataset, AudioDataset, \
HonkSpeechCommandsDataset
from howl.registered import RegisteredObjectBase
from howl.utils.audio import silent_load
from howl.utils.hash import sha256_int
from howl.utils.transcribe import SpeechToText
from tqdm import tqdm

from .base import UNKNOWN_TRANSCRIPTION, AudioClipMetadata, DatasetType
from .dataset import (AudioClassificationDataset, AudioClipDataset,
AudioDataset, HonkSpeechCommandsDataset, WakeWordDataset)

__all__ = ['AudioDatasetWriter',
'AudioClipDatasetLoader',
@@ -58,14 +57,15 @@ def __exit__(self, *args):


class AudioDatasetWriter:
def __init__(self, dataset: AudioClipDataset, mode: str = 'w', print_progress: bool = True):
def __init__(self, dataset: AudioClipDataset, prefix: str = '', mode: str = 'w', print_progress: bool = True):
self.dataset = dataset
self.print_progress = print_progress
self.mode = mode
self.prefix = prefix

def write(self, folder: Path):
def process(metadata: AudioClipMetadata):
new_path = audio_folder / metadata.path.with_suffix('.wav').name
new_path = (audio_folder / metadata.audio_id).with_suffix('.wav')
if not new_path.exists():
audio_data = silent_load(str(metadata.path), self.dataset.sr, self.dataset.mono)
soundfile.write(str(new_path), audio_data, self.dataset.sr)
@@ -75,7 +75,7 @@ def process(metadata: AudioClipMetadata):
folder.mkdir(exist_ok=True)
audio_folder = folder / 'audio'
audio_folder.mkdir(exist_ok=True)
with AudioDatasetMetadataWriter(folder, self.dataset.set_type, mode=self.mode) as writer:
with AudioDatasetMetadataWriter(folder, self.dataset.set_type, prefix=self.prefix, mode=self.mode) as writer:
for metadata in tqdm(self.dataset.metadata_list, disable=not self.print_progress, desc='Writing files'):
try:
process(metadata)
@@ -133,15 +133,17 @@ class WakeWordDatasetLoader(MetadataLoaderMixin, PathDatasetLoader):
dataset_class = WakeWordDataset
metadata_class = AudioClipMetadata


def transcribe_hey_snips_audio(path, metadata):
stt = SpeechToText()
path = (path / metadata['audio_file_path']).absolute()
transcription = 'hey snips'
if metadata['is_hotword'] == 0: # negative sample
if metadata['is_hotword'] == 0: # negative sample
transcription = stt.transcribe(path)

return path, transcription


class HeySnipsWakeWordLoader(RegisteredPathDatasetLoader, name='hey-snips'):
def __init__(self, num_processes=8):
self.stt = SpeechToText()
@@ -192,7 +194,7 @@ def load(filename, set_type):
return (load('train.json', DatasetType.TRAINING),
load('dev.json', DatasetType.DEV),
load('test.json', DatasetType.TEST))


class GoogleSpeechCommandsDatasetLoader(RegisteredPathDatasetLoader, name='gsc'):
def __init__(self, vocab: List[str] = None, use_bg_noise: bool = False):
@@ -237,7 +239,8 @@ def load(filename, set_type):
df = pd.read_csv(str(path / filename), sep='\t', quoting=3, na_filter=False)
metadata_list = []
for tup in df.itertuples():
metadata_list.append(AudioClipMetadata(path=(path / 'clips' / tup.path).absolute(), transcription=tup.sentence))
metadata_list.append(AudioClipMetadata(
path=(path / 'clips' / tup.path).absolute(), transcription=tup.sentence))
return AudioClipDataset(metadata_list=metadata_list, set_type=set_type, **dataset_kwargs)

assert path.exists(), 'dataset path doesn\'t exist'
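The switch from the source filename to `audio_id` keeps written filenames unique even when many clips derive from the same source, as stitched samples do. A tiny stand-in showing the path construction before and after (the `audio_id` value here is assumed):

```python
from pathlib import Path

audio_folder = Path('data/fire-stitched/audio')
old_source = Path('/datasets/common_voice/clips/common_voice_en_123.mp3')
audio_id = 'common_voice_en_123'  # assumed: a stable per-clip identifier

old_path = audio_folder / old_source.with_suffix('.wav').name  # keyed to the source file name
new_path = (audio_folder / audio_id).with_suffix('.wav')       # keyed to the clip's audio_id
assert old_path == new_path  # identical here, but audio_id stays unique for stitched clips
```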