bin.py
#!/usr/bin/env python
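"""Translate the SignWriting annotations of an ELAN file into written text.

The script reads signs from the 'SIGN' tier, groups them by the sentence
boundaries of every 'SENTENCE_<language>' tier, translates each sentence with
the sign/sockeye-signwriting-to-text model, and writes the translations back
into the same ELAN file.

Usage:
    python bin.py --elan path/to/file.eaf
"""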
import argparse
import json
import os
import re
import subprocess
import tempfile

import pympi
from huggingface_hub import snapshot_download
from signwriting.tokenizer import SignWritingTokenizer


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--elan', required=True, type=str, help='path to elan file')
    return parser.parse_args()


def main():
    args = get_args()

    print('Downloading model...')
    path_to_model = snapshot_download(repo_id="sign/sockeye-signwriting-to-text")

    print('Loading ELAN file...')
    eaf = pympi.Elan.Eaf(file_path=args.elan, author="sign-language-processing/signbank-plus")
    sign_annotations = eaf.get_annotation_data_for_tier('SIGN')
    if len(sign_annotations) == 0:
        print('No signs available to translate')
        return

    sign_language_code = dict(eaf.get_properties()).get('language', None)
    if sign_language_code is None:
        print('No language code available')
        return

    tokenizer = SignWritingTokenizer()

    # Group the signs by the sentence boundaries of every SENTENCE_<language> tier
    sentence_to_translate = {}
    for tier in eaf.get_tier_names():
        if tier.startswith('SENTENCE_'):
            spoken_language_code = tier.split('_')[1]
            sentence_boundaries = eaf.get_annotation_data_for_tier(tier)
            for (sentence_start, sentence_end, _) in sentence_boundaries:
                # Keep every sign whose time span overlaps the sentence span
                signs_in_sentence = [sign for sign_start, sign_end, sign in sign_annotations
                                     if sentence_start <= sign_end and sentence_end >= sign_start]
                if len(signs_in_sentence) == 0:
                    continue
                signs_tokens = " ".join(tokenizer.text_to_tokens(' '.join(signs_in_sentence), box_position=False))
                # Prefix the tokens with the spoken and signed language tags
                input_sentence = f"${spoken_language_code} ${sign_language_code} {signs_tokens}"
                sentence_to_translate[(tier, sentence_start, sentence_end)] = input_sentence

    print(f"Translating {len(sentence_to_translate)} sentences...")

    # Write sentences to a temporary file
    temp_input_file = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
    for sentence in sentence_to_translate.values():
        temp_input_file.write(sentence + "\n")
    temp_input_file.close()

    temp_output_file = tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False)
    temp_output_file.close()

    # Call the translation script
    cmd = ['python', '-m', 'sockeye.translate', '-m', path_to_model,
           '--input', temp_input_file.name, '--output', temp_output_file.name, '--nbest-size=5']
    print(' '.join(cmd))
    subprocess.run(cmd, check=True)  # raise if the translation process fails

    # Read the output file
    with open(temp_output_file.name, 'r') as f:
        lines = f.readlines()

    # Remove BPE tokenization
    lines = [re.sub('(@@ |@@$)', '', line.strip()) for line in lines]
    # Each output line is a JSON object holding the n-best "translations" list
    translations = [" / ".join(json.loads(line)["translations"]) for line in lines]

    print('Adding translations to ELAN file...')
    for (tier, sentence_start, sentence_end), translation in zip(sentence_to_translate.keys(), translations):
        eaf.remove_annotation(tier, sentence_start, sentence_end)
        print(f"{tier} {sentence_start} {sentence_end} {translation}")
        eaf.add_annotation(tier, sentence_start, sentence_end, translation)
    eaf.to_file(args.elan)

    print('Cleaning up...')
    os.unlink(temp_input_file.name)
    os.unlink(temp_output_file.name)


if __name__ == '__main__':
    main()