-
Notifications
You must be signed in to change notification settings - Fork 44
/
Copy pathhtml_table_converter
executable file
·161 lines (143 loc) · 4.7 KB
/
html_table_converter
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @license AGPLv3 <https://www.gnu.org/licenses/agpl-3.0.html>
# @author Copyright (C) 2015 Robin Schneider <[email protected]>
"""
Parse HTML tables and write them to a CSV file.
"""
__version__ = '0.8'
# modules {{{
# std {{{
import re
import csv
import logging
# }}}
import urllib.request
from html_table_parser import HTMLTableParser
# }}}
def convert_td_to_float(table: list, decimal_separator: str = '.') -> list:
    """
    Normalize grouped decimal numbers found in the cells of *table*.

    A cell that starts with digits in the German grouping style
    (``1.234,56``) or in the English grouping style (``1,234.56``) is parsed
    as a float and replaced by the ``str()`` form of that float, with the
    dot swapped for *decimal_separator*.  Before parsing, every character
    other than digits and the dot is dropped from such a cell, so trailing
    text (for example a unit) is lost.  Cells matching neither pattern, and
    cells whose cleaned-up text still cannot be parsed, are left unchanged.

    :param table: List of rows, each row being a list of cell strings.
    :param decimal_separator: String written in place of the decimal dot.
        A falsy value (``None`` or ``''``) disables the conversion.
    :returns: The same *table* object; rows are modified in place.
    """
    if not decimal_separator:
        return table

    # Resolve the logger here instead of relying on a module-level name that
    # is only bound when the file is executed as a script; this keeps the
    # function usable when the module is imported.
    log = logging.getLogger(__name__)

    for row in table:
        for td_ind, td in enumerate(row):
            # re.match() anchors at the start of the cell only.
            if re.match(r'\d+\.\d+,\d+', td):
                log.debug(
                    "Found German floating number format: {}".format(td))
                # Drop the thousands dots, then turn the decimal comma into
                # a dot so float() can read it.
                candidate = td.replace('.', '').replace(',', '.')
            elif re.match(r'\d+,\d+\.\d+', td):
                log.debug(
                    "Found English floating number format: {}".format(td))
                candidate = td
            else:
                continue

            try:
                number = float(re.sub(r'[^.0-9]', '', candidate))
            except ValueError:
                # Best effort: keep the original cell text.
                continue
            row[td_ind] = str(number).replace('.', decimal_separator)

    return table
def write_csv_file(filepath: str, data: list) -> None:
    """
    Write the rows in *data* to *filepath* as comma-delimited CSV.

    The file is opened for writing as UTF-8 text, so existing content is
    replaced.  ``newline=''`` leaves line termination to the csv writer.

    :param filepath: Path of the file to write.
    :param data: Iterable of rows; each row is an iterable of cell values.
    """
    with open(filepath, mode='w', encoding='utf-8', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter=',')
        for record in data:
            writer.writerow(record)
# main {{{
if __name__ == '__main__':
    # Command line entry point: read HTML from a file or URL, extract every
    # table, optionally normalize decimal numbers and either write one CSV
    # file per table or pretty-print the tables to stdout.
    from argparse import ArgumentParser
    from pprint import pprint

    # Script Arguments {{{
    args_parser = ArgumentParser(
        description=__doc__,
    )
    args_parser.add_argument(
        '-V', '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__)
    )
    # -d and -v share one destination; the default comes from -d.
    args_parser.add_argument(
        '-d', '--debug',
        help="Print lots of debugging statements",
        action="store_const", dest="loglevel", const=logging.DEBUG,
        default=logging.WARNING,
    )
    args_parser.add_argument(
        '-v', '--verbose',
        help="Be verbose",
        action="store_const", dest="loglevel", const=logging.INFO,
    )
    # Only used as the file name extension of the written files.
    args_parser.add_argument(
        '-f', '--output-format',
        help="Specify output format. Default is %(default)s.",
        default='csv',
    )
    # Exactly one input source has to be given.
    args_group = args_parser.add_mutually_exclusive_group(required=True)
    args_group.add_argument(
        '-i', '--input-file',
        help="Specified the input file.",
    )
    args_group.add_argument(
        '-u', '--input-url',
        help="Specified the input URL.",
    )
    args_parser.add_argument(
        '-o', '--output',
        help="Write output to OUTPUT instead of stdout.",
    )
    args_parser.add_argument(
        '-s', '--decimal-separator',
        help="Decimal separator."
        " Default is to not touch the decimal separator format.",
        default=None,
    )
    args_parser.add_argument(
        '-e', '--decode-html-entities',
        action='store_true',
        help="Decode HTML entities.",
    )
    args_parser.add_argument(
        '-j', '--data-separator',
        help="If there are multiple tags inside an td or th,"
        " join there content with the given string.",
        default=' ',
    )
    args = args_parser.parse_args()

    # Module-level name: convert_td_to_float() may look it up as a global.
    logger = logging.getLogger(__file__)
    logging.basicConfig(
        format='%(levelname)s: %(message)s',
        level=args.loglevel,
    )
    # }}}

    if args.input_url is not None:
        with urllib.request.urlopen(args.input_url) as response:
            # Honour the charset announced in the Content-Type header and
            # only fall back to UTF-8 when the server does not declare one.
            charset = response.headers.get_content_charset() or 'utf-8'
            html_data = response.read().decode(charset)
    else:
        with open(args.input_file, encoding='utf-8') as fh:
            html_data = fh.read()

    p = HTMLTableParser(
        decode_html_entities=args.decode_html_entities,
        data_separator=args.data_separator,
    )
    p.feed(html_data)

    tables = [convert_td_to_float(table, args.decimal_separator)
              for table in p.tables]

    if args.output:
        # One file per table, named OUTPUT.<index>.<output format>.
        for index, table in enumerate(tables):
            write_csv_file(
                '.'.join((args.output, str(index), args.output_format)),
                table,
            )
    else:
        pprint(tables)