forked from gooofy/zamia-speech
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspeech_audio_scan.py
executable file
·234 lines (170 loc) · 7.31 KB
/
speech_audio_scan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2018 Marc Puels
# Copyright 2016, 2017 Guenter Bartsch
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Lesser General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
#
# Scan directory for audio files and convert them to wav files
#
# For each speech corpus `speech_corpus`
#
# 1. the resulting wav files are written to the directory
# `.speechrc.wav16`/<speech_corpus>/
#
# 2. the transcripts in data/src/speech/<speech_corpus>/transcripts_*.csv are
# updated.
#
import logging
import os
import subprocess
import sys
from optparse import OptionParser

from nltools import misc

from speech_transcripts import Transcripts
PROC_TITLE = 'speech_audio_scan'
def exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora):
    """Abort the program when any requested corpus directory is absent.

    Every name in `speech_corpora` is looked up as a directory directly
    below `speech_corpora_dir`. If one or more of them is not a directory,
    a single error naming all of them is logged and the process exits with
    status 1. Otherwise the function returns normally.
    """
    candidate_dirs = ('%s/%s' % (speech_corpora_dir, name)
                      for name in speech_corpora)
    absent = [path for path in candidate_dirs if not os.path.isdir(path)]

    if not absent:
        return

    listing = ", ".join(absent)
    logging.error(
        "Could not find the following directories. Please update the var "
        "`speech_corpora` in ~/.speechrc or move the missing corpus under "
        "the directory set by `speech_corpora`. Missing directories: " +
        listing)
    sys.exit(1)
def _load_prompts(promptsfn):
    """Read a prompts file into a dict mapping audio file name to prompt.

    Each non-blank line holds an audio file name followed by its prompt
    text, separated by a tab if the line contains one, otherwise by the
    first space. Semicolons in the prompt are replaced by commas. A missing
    file yields an empty dict.
    """
    prompts = {}
    if not os.path.isfile(promptsfn):
        return prompts

    # Read bytes and decode explicitly so this works the same on Python 2
    # and 3; undecodable bytes are dropped instead of aborting the scan.
    with open(promptsfn, 'rb') as promptsf:
        for raw in promptsf:
            line = raw.decode('utf8', errors='ignore').rstrip()
            if not line:
                # blank line inside the file: nothing to record
                continue
            sep = '\t' if '\t' in line else ' '
            afn, _, ts = line.partition(sep)
            prompts[afn] = ts.replace(';', ',')

    return prompts


def scan_audiodir(audiodir, transcripts, out_wav16_subdir):
    """Register all audio files below `audiodir` and convert them to wav16.

    Every entry of `audiodir` whose name contains a '-' is treated as a
    submission directory. Audio files are looked up in its `wav` and `flac`
    subdirectories, prompts in `etc/prompts-original`.

    For each audio file the corpus file name `<subdir>_<basename>` is
    computed. If `transcripts` has no entry under that name yet, one is
    created with the prompt (empty if none is known), an empty transcript,
    quality 0 and the speaker taken from the part of the name before the
    first '-'. `audio_convert` is then called for the file.

    Finally a warning is logged for every entry in `transcripts` for which
    no audio file was seen.

    Args:
        audiodir: root directory of the corpus to scan.
        transcripts: mapping from corpus file name to transcript entry;
            updated in place.
        out_wav16_subdir: directory the converted files are written to.
    """
    # keep track of all cfns we have audio files for
    cfn_audio = set()

    for subdir in os.listdir(audiodir):

        if '-' not in subdir:
            logging.warning(
                'skipping %s as it does not match our naming scheme', subdir)
            continue

        logging.debug("scanning %s in %s", subdir, audiodir)

        subdirfn = '%s/%s' % (audiodir, subdir)
        prompts = _load_prompts('%s/etc/prompts-original' % subdirfn)

        for audiodirfn in ('%s/wav' % subdirfn, '%s/flac' % subdirfn):

            if not os.path.isdir(audiodirfn):
                continue

            for audiofullfn in os.listdir(audiodirfn):

                audiofn = os.path.splitext(audiofullfn)[0]
                cfn = '%s_%s' % (subdir, audiofn)
                cfn_audio.add(cfn)

                if cfn not in transcripts:
                    prompt = prompts.get(audiofn, '')
                    logging.info(
                        "new audio found: %s %s %s", cfn, audiofn, prompt)

                    transcripts[cfn] = {
                        'dirfn': os.path.basename(os.path.normpath(subdirfn)),
                        'audiofn': audiofn,
                        'prompt': prompt,
                        'ts': '',
                        'quality': 0,
                        'spk': cfn.split('-')[0],
                    }

                # Attempted for every audio file, not only new ones:
                # audio_convert itself skips files that are already converted.
                audio_convert(cfn, subdir, audiofn, audiodir, out_wav16_subdir)

    # report missing audio files
    for cfn in sorted(transcripts):
        if cfn not in cfn_audio:
            logging.warning('audio file missing for %s', cfn)
def audio_convert(cfn, subdir, fn, audiodir, wav16_dir):
    """Convert one recording to a 16 kHz, 16 bit, mono WAV file using sox.

    The target is `<wav16_dir>/<cfn>.wav`. If it already exists nothing is
    done. Otherwise the source is `<audiodir>/<subdir>/wav/<fn>.wav` or, if
    that does not exist, `<audiodir>/<subdir>/flac/<fn>.flac`.

    Args:
        cfn: corpus file name, used as the base name of the target file.
        subdir: submission directory name below `audiodir`.
        fn: base name (without extension) of the source audio file.
        audiodir: root directory of the corpus.
        wav16_dir: directory the converted file is written to.

    Returns:
        True if the target already exists or sox converted the source
        successfully; False if neither source file exists, sox could not be
        started, or sox exited with a non-zero status.
    """
    w16filename = "%s/%s.wav" % (wav16_dir, cfn)
    if os.path.isfile(w16filename):
        # already converted
        return True

    wavfilename = "%s/%s/wav/%s.wav" % (audiodir, subdir, fn)
    flacfilename = "%s/%s/flac/%s.flac" % (audiodir, subdir, fn)

    # prefer the WAV source, fall back to FLAC
    if os.path.isfile(wavfilename):
        srcfilename = wavfilename
    elif os.path.isfile(flacfilename):
        srcfilename = flacfilename
    else:
        print(" WAV file '%s' does not exist, neither does FLAC file '%s' => skipping submission." % (
            wavfilename, flacfilename))
        return False

    print("%-20s: converting %s => %s (16kHz mono)" % (
        cfn, srcfilename, w16filename))

    # Pass an argument list rather than a shell string, so paths containing
    # spaces, quotes or other shell metacharacters are handed to sox as-is.
    cmd = ['sox', srcfilename, '-r', '16000', '-b', '16', '-c', '1',
           w16filename]
    try:
        status = subprocess.call(cmd)
    except OSError as err:
        # e.g. sox is not installed
        logging.error("could not run sox for %s: %s", cfn, err)
        return False

    if status != 0:
        logging.error("sox exited with status %d while converting %s",
                      status, srcfilename)
        return False

    return True
if __name__ == "__main__":
    # Command line entry point: scan each speech corpus named on the command
    # line, convert its audio files and save the updated transcripts.

    misc.init_app(PROC_TITLE)

    #
    # config
    #

    config = misc.load_config('.speechrc')

    speech_corpora_dir = config.get("speech", "speech_corpora")
    wav16 = config.get("speech", "wav16")

    # Corpora listed in the usage text: every directory below the corpora
    # root. A missing root yields an empty list here; the missing-corpus
    # check below then reports the problem instead of a raw traceback.
    speech_corpora_available = []
    if os.path.isdir(speech_corpora_dir):
        speech_corpora_available = sorted(
            corpus for corpus in os.listdir(speech_corpora_dir)
            if os.path.isdir('%s/%s' % (speech_corpora_dir, corpus)))

    #
    # commandline
    #

    parser = OptionParser("usage: %%prog [options] <speech_corpora>\n speech_corpora: one or more of %s" % ", ".join(speech_corpora_available))

    parser.add_option("-v", "--verbose", action="store_true", dest="verbose",
                      help="verbose output")

    (options, speech_corpora) = parser.parse_args()

    if options.verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    if len(speech_corpora) < 1:
        logging.error("At least one speech corpus must be provided.")
        sys.exit(1)

    exit_if_corpus_is_missing(speech_corpora_dir, speech_corpora)

    for speech_corpus in speech_corpora:
        transcripts = Transcripts(corpus_name=speech_corpus, create_db=True)

        out_wav16_subdir = '%s/%s' % (wav16, speech_corpus)
        misc.mkdirs(out_wav16_subdir)

        in_root_corpus_dir = '%s/%s' % (speech_corpora_dir, speech_corpus)

        scan_audiodir(str(in_root_corpus_dir), transcripts, str(out_wav16_subdir))

        transcripts.save()

        # single-argument print(...) emits the same text on Python 2 and 3
        print("%s new transcripts saved." % speech_corpus)
        print("")