-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path6-spells.py
148 lines (122 loc) · 4.2 KB
/
6-spells.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/python3
# -*- coding: utf-8 -*-
########## For read json file ##########
# import json
# with open('Cours/BDD/monsters.txt', 'r') as f:
# monsters = json.load(f)
# spark = [[k,v] for monster in monsters for k,v in monster.items()]
###### Import ########
import json
import re
import requests
import signal
import sqlite3
import string
import sys
from bs4 import BeautifulSoup
###### Variable ######
# Output file the collected spells are written to as JSON
output = 'spells.json'
# URL template for parsing; {0} is filled with one lowercase index letter
url = 'https://www.d20pfsrd.com/magic/all-spells/{0}'
# Accumulator for the per-spell dictionaries
spells = list()
# Debug counters (only referenced by the commented-out debug code in the main loop)
i, n = 0, 5
###### Function ######
def signal_exit(sig, frame):
    """Handle SIGINT (Ctrl + C): report it, release SQLite if possible, exit 0.

    Args:
        sig: signal number passed by the ``signal`` module (unused).
        frame: interrupted stack frame passed by the ``signal`` module (unused).

    Raises:
        SystemExit: always, with exit status 0.
    """
    print('You pressed Ctrl + C !')
    # sys.argv holds strings and may have no second element, so read the
    # mode defensively and compare against string values.
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    if mode in ('2', '3'):
        # NOTE(review): closeSQLite / sqliteCon are not defined in the visible
        # part of this script, so they are looked up at call time and only
        # used when both actually exist.
        close_db = globals().get('closeSQLite')
        connection = globals().get('sqliteCon')
        if callable(close_db) and connection is not None:
            close_db(connection)
    sys.exit(0)
# Install signal_exit as the handler for SIGINT (Ctrl + C).
signal.signal(signal.SIGINT, signal_exit)
def get_name(sub_soup):
    """Return the spell name taken from the first <h1> of a spell page.

    Typographic and mis-decoded apostrophes are normalised to ``'`` before
    the name is title-cased, so "mage's armor" becomes "Mage's Armor"
    (``str.title()`` would give "Mage'S Armor").

    Args:
        sub_soup: parsed page exposing ``select('h1')``; the first result's
            ``text`` is used.

    Returns:
        The stripped, apostrophe-normalised, title-cased name.

    Raises:
        IndexError: if the page has no <h1> element.
    """
    raw = sub_soup.select('h1')[0].text.strip()
    # The three-character mojibake sequence is listed before the lone "â" so
    # that it is replaced as a whole instead of leaving two stray characters.
    normalised = re.sub("(\u00e2\u0080\u0099|â|\u2018|\u2019)", "'", raw)
    # Capitalise word by word, treating an inner apostrophe as part of the word.
    return re.sub(
        r"[^\W\d_]+(?:'[^\W\d_]+)?",
        lambda match: match.group(0).capitalize(),
        normalised,
    )
def get_values(all_p):
    """Extract the caster classes and the spell level from a page's paragraphs.

    Only the first paragraph whose text contains "Level" is examined. Its
    level section is split on whitespace; tokens without digits are class
    names ("a/b" tokens are split into several classes) and tokens with a
    digit provide the level.

    Args:
        all_p: iterable of paragraph objects exposing a ``text`` string.

    Returns:
        A ``(classes, level)`` tuple. ``classes`` is a sorted list of unique,
        title-cased class names. ``level`` is the first digit of the last
        token containing a digit, or 0 when there is none. ``([], 0)`` is
        returned when no paragraph mentions "Level".
    """
    level_texts = [p.text for p in all_p if re.search('Level', p.text)]
    if not level_texts:
        # Nothing to parse on this page: report "no data" rather than
        # failing with IndexError.
        return [], 0

    parts = level_texts[0].split(";")
    if len(parts) > 1:
        # "...; Level ..." layout: keep the segment that carries the level
        # and drop its first six characters (presumably the " Level" prefix).
        segment = [part for part in parts if re.search('Level', part)][0]
        tokens = segment[6:].split()
    elif re.findall('[:]', parts[0]):
        # NOTE(review): this also drops six characters after the colon,
        # which looks layout-specific; kept as is, confirm against real pages.
        tokens = parts[0].split(":")[1][6:].split()
    else:
        tokens = parts[0].split("Level")[1].split()

    classes = []
    digits = []
    for token in tokens:
        if re.findall('[/]', token):
            classes.extend(name.title().strip() for name in token.split("/"))
            continue
        found = re.findall('[0-9]', token)
        if found:
            # The last token holding a digit wins.
            digits = found
        else:
            classes.append(token.title().strip())

    level = int(digits[0]) if digits else 0
    # Sorted so the result does not depend on set iteration order.
    return sorted(set(classes)), level
def get_components(all_p):
    """Extract the spell component codes (V, S, M, DF, ...) from a page.

    Only the first paragraph whose text contains "Components" is examined,
    and only the text following that word.

    Args:
        all_p: iterable of paragraph objects exposing a ``text`` string.

    Returns:
        A sorted list of unique component codes, or ``[]`` when no paragraph
        mentions "Components".
    """
    matching = [p.text for p in all_p if re.search('Components', p.text)]
    if not matching:
        return []

    section = matching[0].split("Components")[1]
    # Drop parenthesised text (material descriptions): capital V/S/M inside
    # those words would otherwise be reported as components.
    section = re.sub(r'\([^)]*\)', ' ', section)

    components = set()
    for token in section.split():
        if re.findall('[/]', token):
            # "M/DF," style alternatives: keep each side, minus punctuation.
            for part in token.split('/'):
                cleaned = part.strip(',;.')
                if cleaned:
                    components.add(cleaned)
            continue
        codes = re.findall('V|S|M|DF', token)
        if codes:
            components.add(''.join(codes))
    # Sorted so the result does not depend on set iteration order.
    return sorted(components)
def get_description(sub_soup):
    """Return the text of the paragraph that follows the DESCRIPTION header.

    The header is searched first as ``<p class="divider">`` and, failing
    that, as a ``<p>`` carrying the inline style used by the other page
    layout. Typographic and mis-decoded apostrophes are normalised to ``'``.

    Args:
        sub_soup: parsed page exposing ``find(...)``; the header it returns
            must expose ``find_next('p')`` whose result has a ``text`` string.

    Returns:
        The cleaned description, or ``''`` when neither header layout (or
        its following paragraph) is found.
    """
    header_lookups = (
        {'class_': 'divider'},
        {'style': 'font-size: 12px;font-weight: bold;border-top: thin solid;border-bottom: thin solid'},
    )
    for attrs in header_lookups:
        try:
            raw = sub_soup.find('p', text='DESCRIPTION', **attrs).find_next('p').text
        except AttributeError:
            # A missing header or paragraph shows up as None; try the next
            # layout. Only AttributeError is caught so that SystemExit and
            # KeyboardInterrupt (Ctrl + C) are no longer swallowed here.
            continue
        # The three-character mojibake sequence is listed before the lone "â"
        # so that it is replaced as a whole.
        desc = re.sub("(\u00e2\u0080\u0099|â|\u2018|\u2019)", "'", raw)
        # NOTE(review): as written, every "â" has already been replaced above,
        # so this dash substitution cannot match; kept for fidelity - confirm
        # the intended pattern against the original file.
        return re.sub("(â)|â", "–", desc)
    return ''
def spells_in_json(name, level, classes, components, description, url):
    """Bundle the scraped fields of one spell into a dictionary.

    The keys are inserted in the order name, level, classes, components,
    description, url; the values are stored as given.
    """
    keys = ('name', 'level', 'classes', 'components', 'description', 'url')
    fields = (name, level, classes, components, description, url)
    return dict(zip(keys, fields))
###### Program #######
# Walk the alphabetical index pages (a-z), scrape every spell page they link
# to, then dump the collected spells to the output file as JSON.
for c in string.ascii_lowercase:
    index_url = url.format(c)
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        page = requests.get(index_url, timeout=30)
    except requests.RequestException as err:
        print(f'Skipping index {index_url}: {err}', file=sys.stderr)
        continue
    soup = BeautifulSoup(page.text, 'html.parser')
    # Keep links that have visible text and point below this index page.
    # Plain substring matching avoids treating the URL as a regex (where "."
    # matches any character); dict.fromkeys removes duplicates, keeping order.
    prefix = f'{index_url}/'
    links = list(dict.fromkeys(
        a['href'] for a in soup.find_all('a', href=True)
        if a.text and prefix in a['href']
    ))
    for l in links:
        print(l)
        try:
            sub_page = requests.get(l, timeout=30)
        except requests.RequestException as err:
            print(f'Skipping spell {l}: {err}', file=sys.stderr)
            continue
        sub_soup = BeautifulSoup(sub_page.text, 'html.parser')
        all_p = sub_soup.find_all('p')
        name = get_name(sub_soup)
        classes, level = get_values(all_p)
        components = get_components(all_p)
        description = get_description(sub_soup)
        spells.append(spells_in_json(name, level, classes, components, description, l))
        print(name, level, classes, components, description, l)
        # # For Debug
        # if i == n:
        #     break
        # i = i + 1
    # break
with open(output, "w") as f:
    json.dump(spells, f, indent=3, sort_keys=False)
# sys.exit does not depend on the "exit" helper injected by the site module.
sys.exit(0)