-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathtruncate
executable file
·35 lines (28 loc) · 909 Bytes
/
truncate
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python3
"""
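Truncates each line read from standard input to at most a given number of
tokens (whitespace-separated words by default) and writes it to standard output.
If "-l zh" is passed, each Han character and each space counts as one token.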
"""
import sys
import regex as re
def _truncate_han(text, length):
    """Return the prefix of *text* holding at most *length* counted units.

    Each Han character and each space counts as one unit; every other
    character is copied through without being counted. A negative *length*
    yields an empty string.
    """
    if length < 0:
        return ""
    # `re` is the third-party `regex` package (imported under that name at
    # the top of the file); the \p{Han} script property relies on it.
    is_han = re.compile(r'\p{Han}').match
    count = 0
    for index, char in enumerate(text):
        if char == " " or is_han(char):
            count += 1
            if count > length:
                # The count never decreases, so nothing from here on could
                # be kept: stop scanning instead of walking the whole line.
                return text[:index]
    return text


def _truncate_tokens(text, length):
    """Return the first *length* whitespace-separated tokens of *text*.

    The kept tokens are re-joined with single spaces. A negative *length* is
    treated as zero so it cannot turn into a "drop the last n tokens" slice.
    """
    # TODO: should preserve the particular whitespaces
    return " ".join(text.split()[:max(length, 0)])


def main(args):
    """Truncate every line of standard input and print the result.

    Args:
        args: object with two attributes:
            ``length`` (int) -- maximum number of tokens kept per line; a
            negative value produces empty output lines.
            ``lang`` (str) -- ``"zh"`` counts Han characters and spaces as
            tokens; any other value counts whitespace-separated words.

    One output line is written to standard output for each input line.
    """
    for line in sys.stdin:
        text = line.strip()
        if args.lang == "zh":
            print(_truncate_han(text, args.length))
        else:
            print(_truncate_tokens(text, args.length))
if __name__ == "__main__":
    # Command-line entry point: one positional token limit plus an optional
    # language switch, handed straight to main().
    from argparse import ArgumentParser

    cli = ArgumentParser()
    cli.add_argument("length", type=int)  # maximum tokens kept per line
    cli.add_argument("--lang", "-l", default="en")  # "zh" selects Han counting
    main(cli.parse_args())