-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbin_tools.py
106 lines (90 loc) · 3.53 KB
/
bin_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
"""Tools relevant to BIN"""
# -*- coding: utf-8 -*-
#
# Copyright 2020 Cadia - Language and Voice Lab
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bisect
import re
from tqdm import tqdm
from conf import ICE_ALPHABET, OTHER_CHARS, BIN_LIST_SORTED_PATH
# Matches any single character that is neither in ICE_ALPHABET nor in
# OTHER_CHARS. Both values are interpolated verbatim into a negated
# character class.
# NOTE(review): the values are not escaped, so this presumably relies on
# them containing no ']' or '\' and not starting with '^' - confirm in conf.
SUB_PATTERN = re.compile(r'[^{}{}]'.format(ICE_ALPHABET, OTHER_CHARS))
# Matches any single character that is not in ICE_ALPHABET; used to strip
# everything else from a token before it is looked up in the word list.
LOOKUP_PATTERN = re.compile(r'[^{}]'.format(ICE_ALPHABET))
def bin2list(in_path: str, out_path: str):
    '''
    Save a lower-case only version of all cases of all words
    in the BIN index.

    Every non-blank input line is split on ';' and its fifth field
    (index 4) is lower-cased and written on a line of its own. Entries
    are written in input order; the output is neither sorted nor
    de-duplicated.

    Input arguments:
    * in_path (string): A path to the BIN index (read as UTF-8)
    * out_path (string): A target path to save the formatted
    index (written as UTF-8)

    Raises IndexError if a non-blank line has fewer than five
    ';'-separated fields.
    '''
    # The encoding is explicit because the data is Icelandic text; the
    # platform default is not UTF-8 everywhere.
    with open(in_path, encoding='utf-8') as i_f, \
            open(out_path, 'w', encoding='utf-8') as o_f:
        for line in tqdm(i_f):
            if not line.strip():
                # A blank line (e.g. at the end of the file) has no entry.
                continue
            # Drop the line terminator first so that it cannot end up in
            # the output when the word form is the last field on the line.
            fields = line.rstrip('\n').split(';')
            o_f.write(f'{fields[4].lower()}\n')
def bin_verify(bin_path: str, in_path: str, out_path: str, bad_path: str):
    '''
    Split an utterance list into utterances whose words are all found
    in the BIN word list and utterances that contain an unknown word.

    Only the text before the first tab of each input line is checked.
    It is lower-cased and split on whitespace, and every character
    matched by SUB_PATTERN is removed from each word before lookup.
    Lines that pass are copied unchanged to out_path. For a line that
    fails, the lower-cased words joined by single spaces are written to
    bad_path, followed by a tab and "BIN-<word>-<i>", where <word> is
    the first word not found and <i> is its insertion index in the
    sorted word list.

    Input arguments:
    * bin_path (string): A path to the index generated by bin2list
    * in_path (string): A path to an utterance list
    * out_path (string): A target path to save the filtered
    utterance list
    * bad_path (string): A target path to save the utterances that
    include words not found in the BIN list

    All files are read and written as UTF-8.
    '''
    with open(bin_path, encoding='utf-8') as i_f:
        stripped = (line.strip() for line in tqdm(i_f))
        # Blank lines are not words; keeping them would let a token that
        # is empty after clean-up count as "found".
        bin_words = [entry for entry in stripped if entry]
    # bisect is only correct on a list ordered by Python's own string
    # comparison. bin2list does not sort its output, and an externally
    # sorted file may follow a different collation, so enforce the order
    # here (cheap when the list is already sorted).
    bin_words.sort()

    # Count the lines up front so tqdm can show a total; the handle is
    # closed again and the file is never held in memory as a whole.
    with open(in_path, encoding='utf-8') as i_f:
        num_lines = sum(1 for _ in i_f)

    with open(in_path, encoding='utf-8') as i_f, \
            open(out_path, 'w', encoding='utf-8') as o_f, \
            open(bad_path, 'w', encoding='utf-8') as b_f:
        for line in tqdm(i_f, total=num_lines):
            words = line.split('\t')[0].strip().lower().split()
            valid = True
            for word in words:
                # NOTE(review): characters in OTHER_CHARS are kept here,
                # whereas BinVerifer.in_bin strips them before lookup -
                # confirm that this difference is intended.
                word = SUB_PATTERN.sub('', word)
                i = bisect.bisect_left(bin_words, word)
                if i == len(bin_words) or bin_words[i] != word:
                    # word not in BIN
                    b_f.write(f'{" ".join(words)}\tBIN-{word}-{i}\n')
                    valid = False
                    break
            if valid:
                o_f.write(line)
class BinVerifer:
    """A class for verifying if sentences are in BIN

    The word list is loaded once on construction and kept in
    ``self.bin_words`` as a sorted list, so each lookup is a binary
    search.
    """

    def __init__(self, path: str = BIN_LIST_SORTED_PATH):
        '''
        Load the word list, one word per line, from a UTF-8 file.

        Input arguments:
        * path (string): A path to the word list
        '''
        # The encoding is explicit because the list is Icelandic text; the
        # platform default is not UTF-8 everywhere.
        with open(path, encoding='utf-8') as i_f:
            stripped = (line.strip() for line in i_f)
            # Blank lines are not words; keeping them would let a token
            # made up only of punctuation count as "in BIN".
            self.bin_words = [entry for entry in stripped if entry]
        # bisect needs the list ordered by Python's own string comparison.
        # A file sorted by an external tool may use another collation, so
        # enforce the order here (cheap when already sorted).
        self.bin_words.sort()

    def in_bin(self, word: str) -> bool:
        '''
        Return True if the lower-cased word is found in the word list.

        If the word contains any character other than the Icelandic
        alphabet or the characters in OTHER_CHARS (per the original
        note: '.', ',', ';', '?') we throw it out. Otherwise everything
        that is not in the Icelandic alphabet is removed before the
        lookup.
        '''
        word = word.lower()
        if SUB_PATTERN.search(word):
            # at least one character is not allowed at all
            return False
        word = LOOKUP_PATTERN.sub('', word)
        i = bisect.bisect_left(self.bin_words, word)
        return i < len(self.bin_words) and self.bin_words[i] == word

    def check_utt(self, utt: str) -> bool:
        """Returns True if all words in the sentence are in BIN

        The sentence is split on whitespace; a sentence without any
        words yields True.
        """
        return all(self.in_bin(word) for word in utt.split())