-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path4-monters.py
109 lines (87 loc) · 3 KB
/
4-monters.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#!/usr/bin/python3
# -*- coding: utf-8 -*-
########## To read a JSON file ##########
# import json
# with open('Cours/BDD/monsters.txt', 'r') as f:
# monsters = json.load(f)
# spark = [[k,v] for monster in monsters for k,v in monster.items()]
###### Import ########
import json
import re
import requests
import signal
import sqlite3
import string
import sys
from bs4 import BeautifulSoup
###### Variable ######
# Accumulates one {"name": ..., "spells": [...]} record per scraped monster
monsters = []
# Root URL of the alphabetical bestiary index where the scrape starts
url = "https://www.d20pfsrd.com/bestiary/bestiary-alphabetical/"
# Index page name template; not referenced by any active code in this script
index = "index-monsters-{0}"
# Path of the JSON file the collected records are written to
output = "monsters.json"
# Debug counter and limit; not referenced by any active code in this script
i = 0
n = 5
###### Function ######
def signal_exit(sig, frame):
    """Handle SIGINT (Ctrl + C): report it, close SQLite if in use, exit 0.

    Args:
        sig: Signal number passed by the ``signal`` module (unused).
        frame: Interrupted stack frame passed by the ``signal`` module (unused).

    Raises:
        SystemExit: Always, with exit status 0.
    """
    print('You pressed Ctrl + C !')
    # Command-line arguments are strings, and the first one may be absent
    # when the script is started without arguments.
    mode = sys.argv[1] if len(sys.argv) > 1 else None
    if mode in ("2", "3"):
        # closeSQLite / sqliteCon are not defined in this script, so they are
        # looked up defensively instead of raising NameError in the handler.
        close_db = globals().get("closeSQLite")
        connection = globals().get("sqliteCon")
        if callable(close_db) and connection is not None:
            close_db(connection)
    # sys.exit is always available; the bare exit() helper comes from `site`.
    sys.exit(0)


signal.signal(signal.SIGINT, signal_exit)
def get_name(mod_soup):
    """Return the monster's display name from a parsed monster page.

    The first element whose CSS class is exactly ``title`` or ``text`` is
    used as the heading. Everything from the first "CR" onwards is dropped,
    surrounding whitespace is stripped, and curly or mis-decoded apostrophes
    are replaced with a plain "'".

    Args:
        mod_soup: Parsed page exposing BeautifulSoup's ``find`` method.

    Returns:
        The cleaned name, or None when the page has no such element.
    """
    # Search once and reuse the match, so the element that is read is the
    # same one that satisfied the existence check.
    heading = mod_soup.find(class_=re.compile("(^title$|^text$)"))
    if heading is None:
        return None
    raw_name = heading.get_text().split("CR")[0].strip()
    # Alternation is tried left to right: the three-character mojibake must
    # come before the lone U+00E2, otherwise U+0080 U+0099 are left behind.
    return re.sub("(\u00e2\u0080\u0099|\u00e2|\u2018|\u2019)", "'", raw_name)
def get_spells(mod_soup):
    """Return the distinct spell names listed on a parsed monster page.

    Spells are the elements with class ``spell`` found inside ``<p>`` tags
    whose ``style`` attribute contains ``margin-left``. Each name is
    title-cased, stripped, and has curly or mis-decoded apostrophes replaced
    with a plain "'".

    Args:
        mod_soup: Parsed page exposing BeautifulSoup's ``find_all`` method.

    Returns:
        A list of unique spell names in order of first appearance; an empty
        list when the page lists no spells.
    """
    spells = []
    # ":*" means "zero or more colons", so this matches any style attribute
    # that contains "margin-left".
    for paragraph in mod_soup.find_all('p', style=re.compile("margin-left:*")):
        for spell in paragraph.find_all(class_="spell"):
            # NOTE(review): str.title() also capitalises the letter after an
            # apostrophe ("Bull'S Strength"); kept as-is because consumers of
            # the JSON may already match on this spelling - confirm.
            text = spell.get_text().title().strip()
            # The three-character mojibake must be tried before the lone
            # U+00E2, otherwise U+0080 U+0099 are left in the name.
            spells.append(
                re.sub("(\u00e2\u0080\u0099|\u00e2|\u2018|\u2019)", "'", text)
            )
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is identical from run to run (a set's order is not).
    return list(dict.fromkeys(spells))
def m(name, spells):
    """Build the JSON-serialisable record for one monster.

    Args:
        name: Monster name stored under the ``name`` key.
        spells: Spell list stored (not copied) under the ``spells`` key.

    Returns:
        A dict with the keys ``name`` and ``spells``, in that order.
    """
    return {'name': name, 'spells': spells}
###### Program #######
# Seconds to wait for the server before a request is abandoned; without a
# timeout a stalled connection would block the scrape indefinitely.
REQUEST_TIMEOUT = 30


def _fetch_soup(session, link):
    """Download *link* and return it parsed with BeautifulSoup's html.parser.

    Raises:
        requests.RequestException: On timeout, connection failure or an
            HTTP error status.
    """
    response = session.get(link, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


# A single session reuses the connection across the many page requests.
with requests.Session() as session:
    # A failure on the root index is fatal: there is nothing to scrape.
    soup = _fetch_soup(session, url)
    for alpha in soup.find_all(class_='page new parent'):
        # Entries without a usable link cannot be followed.
        if alpha.a is None or not alpha.a.get('href'):
            continue
        try:
            sub_soup = _fetch_soup(session, alpha.a['href'])
        except requests.RequestException as err:
            print(f"Skipping index page {alpha.a['href']}: {err}", file=sys.stderr)
            continue
        # The first 'monster-listings' link of every index page is skipped
        # (presumably a navigation link rather than a monster - confirm).
        first = True
        for sub_alpha in sub_soup.find_all('a'):
            href = sub_alpha.get('href')
            if href is None or 'monster-listings' not in href:
                continue
            if first:
                first = False
                continue
            print(href)
            try:
                mod_soup = _fetch_soup(session, href)
            except requests.RequestException as err:
                # One broken page must not discard everything collected so far.
                print(f"Skipping monster page {href}: {err}", file=sys.stderr)
                continue
            name = get_name(mod_soup)
            spells = get_spells(mod_soup)
            print(f"[{name},{spells}]")
            if name:
                monsters.append(m(name, spells))

with open(output, "w", encoding="utf-8") as f:
    json.dump(monsters, f, indent=3, sort_keys=False)
sys.exit(0)