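"""cpcgr.py -- scrape every game review from cpcgamereviews.com (index pages
a-z) into a pandas DataFrame; when run as a script, dump it to review.csv."""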
from urllib import request
import logging

import pandas as pd


class ParseCPCGR:
    root_path = 'http://cpcgamereviews.com/%s/index.html'
    decode_ref = 'utf-8'
    map_sym = {'\n': '', '\t': ''}  # characters stripped before parsing
    line_skip = '<br />'

    def __init__(self):
        # Generate the index-page URL for each letter a-z.
        logging.info('Generate all paths')
        self.all_paths = [self.root_path % chr(ord('a') + i) for i in range(26)]
        self.reviews = None

    @staticmethod
    def extractRange(data, tag, tagval=None):
        """Return the text between the first '<tag tagval>' and the next
        '</tag>', or an empty string if the opening tag is not found."""
        taglng = '<%s' % tag + (' %s>' % tagval if tagval else '>')
        part = data.split(taglng)
        part = part if len(part) > 1 else ['', '']
        return part[1].split('</%s>' % tag)[0]
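
    # For illustration (hypothetical markup, not taken from the site):
    #   extractRange('<p class="x">hi</p><p>bye</p>', 'p', 'class="x"')  -> 'hi'
    #   extractRange('<p>bye</p>', 'p', 'class="x"')                     -> ''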

    @staticmethod
    def replaceSym(data, sym_map):
        for sym, rep in sym_map.items():
            data = data.replace(sym, rep)
        return data

    def loadurl(self, path, clean=True):
        # Fetch a page; by default strip newlines and tabs so the
        # string-splitting parser sees one continuous run of markup.
        with request.urlopen(path) as url:
            data = url.read().decode(self.decode_ref)
        return self.replaceSym(data, self.map_sym) if clean else data

    def readIndex(self, data):
        # The first row of the "page-index" table lists a letter's subpages.
        # Replacing '/td' first turns '</td>' into '<td>', so the second
        # substitution removes opening and closing cell tags alike, leaving
        # the entries separated by '<br />'; whatever follows the final
        # '<br />' is discarded.
        ind = self.extractRange(data, 'table', 'class="page-index"')
        ind = self.replaceSym(self.extractRange(ind, 'tr'), {'/td': 'td', '<td>': ''})
        return ind.split(self.line_skip)[:-1]

    def htmlToDict(self, txt):
        """Minimal recursive tag parser: turn a run of single-attribute tags
        into a nested dict. Anchors ('a') are keyed by their attribute name
        (e.g. 'id', 'href'); other tags are keyed by their attribute value
        (e.g. a class name). Plain text ends up in the 'nontag' list."""
        all_tag = {}
        still_good = True
        non_tag = []
        while still_good:
            # Collect any plain text that precedes the next tag.
            check_non_tag = txt.split('<')
            if check_non_tag[0] != '':
                non_tag += [check_non_tag[0]]
                txt = '<' + '<'.join(check_non_tag[1:])
            txttag = txt.split('>')
            if len(txttag) > 1:
                txt = '>'.join(txttag[1:])
                tag_ref = txttag[0].replace('<', '')
                # Split '<name attr="value"' into tag name and attribute pair;
                # split(' ', 1) keeps the full tag name even when a tag
                # carries no attribute.
                tag_parts = tag_ref.split(' ', 1)
                tag_name = tag_parts[0]
                tag_info = tag_parts[1].split('=') if len(tag_parts) > 1 else ['', '""']
                if tag_name == 'img':
                    # <img> is self-closing: no body to capture.
                    tag_value = ''
                else:
                    # Everything up to the matching close tag is the body;
                    # recurse if it contains nested markup.
                    end_tag = txt.split('</%s>' % tag_name)
                    txt = ('</%s>' % tag_name).join(end_tag[1:])
                    tag_value = end_tag[0]
                    tag_value = self.htmlToDict(tag_value) if (tag_value.find('<') >= 0) else tag_value
                if tag_name == 'a':
                    temp_dict = {tag_info[0]: {'name': tag_info[1].replace('"', ''),
                                               'value': tag_value
                                               }
                                 }
                else:
                    tag_key = tag_info[1].replace('"', '')
                    temp_dict = {tag_key: {'type': tag_info[0],
                                           'value': tag_value
                                           }
                                 }
                all_tag.update(temp_dict)
            else:
                still_good = False
        all_tag['nontag'] = non_tag
        return all_tag
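
    # For illustration, a hypothetical 'gamedetails' fragment such as
    #   <a id="rick"></a><span class="gametitle"><a href="rick.html">Rick Dangerous</a></span>
    # parses to:
    #   {'id': {'name': 'rick', 'value': ''},
    #    'gametitle': {'type': 'class',
    #                  'value': {'href': {'name': 'rick.html',
    #                                     'value': 'Rick Dangerous'},
    #                            'nontag': []}},
    #    'nontag': []}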

    def parseSingleReview(self, review, path):
        game_info = self.extractRange(review, 'div', 'class="gamedetails"')
        game_info_dict = self.htmlToDict(game_info)
        # The rating paragraph appears under one of two class names.
        score = self.htmlToDict(self.extractRange(review, 'p', 'class="rating"'))
        score = score if score['nontag'] else self.htmlToDict(
            self.extractRange(review, 'p', 'class="rating no-line-break"'))
        # The publisher field looks like '(Publisher, Year)'; fall back to
        # 'Unknown' when no year is given.
        publ = self.replaceSym(game_info_dict['publisher']['value'], {'(': '', ')': ''})
        publ = publ.split(',')
        publ = publ if len(publ) >= 2 else [publ[0], 'Unknown']
        valid_output = {'ref': '#'.join([path, game_info_dict['id']['name']]),
                        'name': game_info_dict['gametitle']['value']['href']['value'],
                        'publisher': publ[0],
                        'year': publ[1].replace(' ', ''),
                        'score': int(score['nontag'][0])
                        }
        return valid_output
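
    # Each review thus becomes one flat record, e.g. (hypothetical values):
    #   {'ref': 'http://cpcgamereviews.com/r/index.html#rick',
    #    'name': 'Rick Dangerous', 'publisher': 'Firebird',
    #    'year': '1989', 'score': 9}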

    def parsePageReviews(self, data, path):
        # Each <tr> of the "reviews" table holds one review.
        review_data = self.extractRange(data, 'table', 'class="reviews"')
        review_split = self.replaceSym(review_data, {'</tr>': '', '</td>': ''})
        review_split = review_split.split('<tr>')[1:]
        all_reviews = []
        for review in review_split:
            all_reviews += [self.parseSingleReview(review, path)]
        return all_reviews

    def parsePage(self, path):
        data = self.loadurl(path)
        index = self.readIndex(data)
        logging.info('Checking %s', path)
        logging.info(' Found -> %i subpage(s)', len(index))
        all_page_data = []
        for i in range(len(index)):
            # The first subpage is index.html itself; the rest follow the
            # pattern index2.html, index3.html, ...
            path_page = path if i == 0 else path.replace('.html', '%i.html' % (i + 1))
            logging.info(' Parsing -> %s', path_page)
            page_data = data if i == 0 else self.loadurl(path_page)
            all_page_data += self.parsePageReviews(page_data, path_page)
        return all_page_data

    def parseAll(self):
        all_res = []
        for path in self.all_paths:
            all_res += self.parsePage(path)
        self.reviews = all_res
        res_tbl = pd.DataFrame(all_res)
        return res_tbl


if __name__ == '__main__':
    # basicConfig attaches a handler; setting the level alone would leave
    # INFO messages swallowed by the default WARNING-level handler.
    logging.basicConfig(level=logging.INFO)
    reviews = ParseCPCGR()
    reviews_results = reviews.parseAll()
    reviews_results.to_csv('review.csv')