-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathmycut
executable file
·88 lines (66 loc) · 2.6 KB
/
mycut
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env python3
"""
Addresses limitations in cut's field handling.
* Allows redundant output of columns
* Columns can be output in any order
* Supports intermingled ranges (x-y) and lone columns (x).
* Pads input stream to requested column
* Supports open-ended ranges at the end (x-)
* Supports negative column indexing
* Use -w for whitespace splitting
* Delimiter can be multi-char and even a regex
"""
import re
import sys
import gzip
def _parse_fields(spec):
    """Parse a comma-separated field spec into column selectors.

    Each token is either a lone column (``3``, or ``-1`` for a negative
    index) or a range (``2-5``, or ``2-`` for an open-ended range).

    Returns:
        A ``(selectors, max_index)`` pair. ``selectors`` holds an ``int`` for
        a lone column and a ``(start, stop)`` tuple for a range, where a
        ``stop`` of ``-1`` marks an open-ended range. ``max_index`` is the
        largest positive column index mentioned (0 if none).

    Raises:
        ValueError: if a token is not an integer or a well-formed,
            non-decreasing range.
    """
    selectors = []
    max_index = 0
    for token in spec.split(","):
        # Only look for "-" after the first character so that a leading "-"
        # keeps meaning "negative index" rather than "range".
        if "-" not in token[1:]:
            column = int(token)
            selectors.append(column)
            max_index = max(max_index, column)
            continue

        parts = token.split("-")
        if len(parts) != 2:
            raise ValueError(
                f"invalid range {token}: expected 'start-stop' or 'start-'"
            )
        # With exactly two parts the start is never empty: a leading "-"
        # plus a second "-" would have produced three parts.
        start_text, stop_text = parts
        start = int(start_text)
        if stop_text:
            stop = int(stop_text)
            if stop < start:
                raise ValueError(
                    f"invalid range {token}: must be non-decreasing"
                )
        else:
            stop = -1  # open-ended; resolved per line to the last column
        selectors.append((start, stop))
        max_index = max(max_index, start, stop)
    return selectors, max_index


def main(args):
    """Print the requested columns of every line in ``args.infile``.

    Args:
        args: namespace with the attributes
            ``fields`` (comma-separated field spec, see ``_parse_fields``),
            ``infile`` (iterable of text lines),
            ``delimiter`` (regex used to split each line), and
            ``whitespace`` (if true, lines are trimmed before splitting).

    Output columns are joined with ``args.delimiter``; in whitespace mode,
    where the delimiter is a regex rather than literal text, a single space
    is used instead.

    Raises:
        ValueError: if ``args.fields`` is malformed.
    """
    fields, maxfieldi = _parse_fields(args.fields)
    # Compile once instead of handing the pattern to re.split per line.
    splitter = re.compile(args.delimiter)
    out_sep = " " if args.whitespace else args.delimiter

    for line in args.infile:
        if args.whitespace:
            line = line.strip()
        else:
            # Drop only the line terminator: a bare rstrip() would also eat
            # trailing tab/space delimiters and lose trailing empty fields.
            line = line.rstrip("\r\n")

        # front-pad for 1-indexing
        columns = [""] + splitter.split(line)
        # pad columns so the largest requested positive index exists
        shortfall = maxfieldi - len(columns) + 1
        if shortfall > 0:
            columns.extend([""] * shortfall)

        output = []
        for field in fields:
            if isinstance(field, tuple):
                start, stop = field
                if stop == -1:
                    stop = len(columns) - 1
                output.extend(columns[start:stop + 1])
            else:
                # Negative values index from the end of the (padded) list.
                output.append(columns[field])
        print(*output, sep=out_sep)
def handle_sigpipe(signum, frame):
    """Signal handler that ends the process quietly with exit status 0.

    Both arguments are the standard signal-handler parameters and are
    ignored.
    """
    raise SystemExit(0)
if __name__ == "__main__":
    import argparse
    import signal

    # Don't complain when downstream CLI tools quit.
    # SIGPIPE is only defined on Unix; referencing it unconditionally raises
    # AttributeError elsewhere, so install the handler only when it exists.
    if hasattr(signal, "SIGPIPE"):
        signal.signal(signal.SIGPIPE, handle_sigpipe)

    parser = argparse.ArgumentParser()
    # Input defaults to stdin when no file is named.
    parser.add_argument("infile", nargs="?", type=argparse.FileType("r"), default=sys.stdin)
    parser.add_argument("--delimiter", "-d", default="\t")
    parser.add_argument("--whitespace", "-w", action="store_true", help="Trim line and split on whitespace")
    parser.add_argument("--fields", "-f", default="1")
    args = parser.parse_args()

    # -w overrides any -d value with a whitespace-run regex.
    if args.whitespace:
        args.delimiter = r'\s+'

    main(args)