-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathunpaste
executable file
·69 lines (55 loc) · 2.6 KB
/
unpaste
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Takes a tab-delimited stream on STDIN and writes the columns in turn to the files specified as
arguments (i.e., the opposite of UNIX `paste`). If there are more fields than files, the last file
will get all remaining fields. If there are more files than fields, files beyond the last field will
get blank lines. A delimiter other than tab can be chosen with `--delimiter`, and output files whose
names end in `.gz` are written gzip-compressed.

Example:

    paste file1 file2 [...] | ./unpaste file1.copy file2.copy [...]

Author: Matt Post <[email protected]>
"""
import argparse
import contextlib
import multiprocessing
import sys
def _build_parser():
    """Build the command-line parser for the unpaste tool."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--append', '-a', action='store_true', default=False,
                        help='Append to files instead of overwriting.')
    parser.add_argument('--tee', '-t', action='store_true', default=False,
                        help='Also echo to STDOUT')
    parser.add_argument('--delimiter', '-d', type=str, default='\t',
                        help='Delimiter to use')
    parser.add_argument('files', nargs='+',
                        help='File(s) to write to')
    parser.add_argument("--blocksize", "-b", type=int, default=2*10**8,
                        help="Blocksize for pgzip")
    parser.add_argument("--num-threads", "-j", type=int, default=multiprocessing.cpu_count(),
                        help="Number of threads for pgzip. Default: %(default)s")
    return parser


def _threads_per_file(num_threads, num_files):
    """Split the thread budget evenly across output files, never below one.

    Plain integer division yields 0 as soon as there are more files than
    threads; that 0 would be handed to pgzip as its thread count, so the
    result is floored at 1.
    """
    return max(1, num_threads // max(1, num_files))


def _writeto(filename, append=False, threads_per_file=1, blocksize=2 * 10**8):
    """Open ``filename`` for text output, compressing when it ends in ``.gz``.

    Args:
        filename: Path of the output file.
        append: Open in append mode instead of truncating.
        threads_per_file: Thread count passed to pgzip (ignored otherwise).
        blocksize: Block size passed to pgzip (ignored otherwise).

    Returns:
        A writable text-mode file object. ``.gz`` targets use pgzip when it
        is importable and fall back to the standard ``gzip`` module when not.
    """
    mode = 'at' if append else 'wt'
    if not filename.endswith('.gz'):
        return open(filename, mode)

    # Only the optional import is guarded, so an error raised while actually
    # opening the file is never mistaken for "pgzip is not installed".
    try:
        import pgzip
    except ModuleNotFoundError:
        print("pgzip not found, using much slower gzip instead.", file=sys.stderr)
        print("We recommend you install it: 'pip install pgzip'", file=sys.stderr)
        import gzip
        return gzip.open(filename, mode)

    print(f"Using pgzip with {threads_per_file} threads.", file=sys.stderr)
    return pgzip.open(filename, mode, thread=threads_per_file, blocksize=blocksize)


def _unpaste(lines, fhs, delimiter='\t', tee=None):
    """Distribute the fields of each input line over the output handles.

    Args:
        lines: Iterable of input lines (trailing newline optional).
        fhs: Non-empty list of writable text file objects, one per column.
        delimiter: Field separator.
        tee: Optional file object that additionally receives every line.
    """
    # Splitting at most len(fhs) - 1 times leaves any surplus fields joined
    # together in the last column.
    max_splits = len(fhs) - 1
    for line in lines:
        line = line.rstrip('\n')
        fields = line.split(delimiter, maxsplit=max_splits)
        # Pad short rows so every file receives exactly one line per input line.
        fields.extend([''] * (len(fhs) - len(fields)))
        for fh, field in zip(fhs, fields):
            print(field, file=fh)
        if tee is not None:
            print(line, file=tee)


def main(argv=None, stdin=None):
    """Run the unpaste command line.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
        stdin: Iterable of input lines; defaults to ``sys.stdin``.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)
    if not args.delimiter:
        # str.split rejects an empty separator; fail before touching any file.
        parser.error('--delimiter must not be empty')

    threads = _threads_per_file(args.num_threads, len(args.files))
    source = sys.stdin if stdin is None else stdin

    # ExitStack closes every handle opened so far, even when a later open()
    # fails or an error interrupts the streaming loop.
    with contextlib.ExitStack() as stack:
        fhs = []
        for name in args.files:
            fh = _writeto(name, args.append, threads, args.blocksize)
            stack.callback(fh.close)
            fhs.append(fh)
        _unpaste(source, fhs, args.delimiter, sys.stdout if args.tee else None)


if __name__ == '__main__':
    main()