# sacred_texts_scraper.py -- 120 lines (104 loc), 4.42 KB
# Repository forked from ekelen/tarot-api.
# For reference only; this script is no longer in use.
# A first effort at scraping with Python.
# Use at your own risk. :)
import json
import re
import requests
from bs4 import BeautifulSoup
# URL prefix for the per-card Minor Arcana pages; get_minors() appends a
# suit code, a rank code and ".htm" to it.
base_url = 'http://www.sacred-texts.com/tarot/pkt/pkt'
# Single page fetched by get_majors() and scanned for Major Arcana entries.
majors_url = 'http://www.sacred-texts.com/tarot/pkt/pkt0303.htm'
# Accumulates the to_JSON() dict of every scraped card (majors and minors);
# written to card_data_tmp.json at the end of the script.
cards = []
# Raw paragraph text plus identifying fields for each minor card;
# written to min_text.json.
minorText = []
# Raw paragraph text plus identifying fields for each major card;
# written to maj_text.json.
majorText = []
class Card:
    """Common fields shared by every scraped tarot card.

    String identifiers are normalised on construction: ``value`` and
    ``name_short`` are lower-cased and ``name`` is title-cased. The numeric
    value and the two meaning texts are stored exactly as given.
    """

    def __init__(self, value, value_int, name, name_short, meaning_up, meaning_rev):
        """Store the card fields, normalising the case of the identifiers.

        Args:
            value: Card value as text; stored lower-cased.
            value_int: Numeric card value; stored unchanged.
            name: Card name; stored title-cased.
            name_short: Short identifier; stored lower-cased.
            meaning_up: Upright meaning text; stored unchanged.
            meaning_rev: Reversed meaning text; stored unchanged.
        """
        # Case-normalised identifiers.
        self.name = name.title()
        self.name_short = name_short.lower()
        self.value = value.lower()

        # Fields kept verbatim.
        self.value_int = value_int
        self.meaning_up = meaning_up
        self.meaning_rev = meaning_rev
class Major(Card):
    """A ``Card`` tagged with ``type == "major"``."""

    def __init__(self, *args):
        """Forward all positional arguments to ``Card.__init__`` and set the type tag."""
        super().__init__(*args)
        self.type = "major"

    def to_JSON(self):
        """Return the card's fields as a plain dict, in a fixed key order."""
        keys = (
            'name',
            'name_short',
            'value',
            'value_int',
            'meaning_up',
            'meaning_rev',
            'type',
        )
        return {key: getattr(self, key) for key in keys}
class Minor(Card):
    """A ``Card`` tagged with ``type == "minor"`` that also has a suit and a description."""

    def __init__(self, suit, desc, *args):
        """Build a minor card.

        Args:
            suit: Suit name; stored lower-cased and used (capitalised) in ``name``.
            desc: Descriptive text; stored unchanged.
            *args: Remaining positional arguments, forwarded to ``Card.__init__``.
        """
        super().__init__(*args)
        self.suit = suit.lower()
        self.desc = desc
        self.type = "minor"
        # Replaces the name set by Card.__init__ with "<Value> of <Suit>".
        self.name = ' of '.join((self.value.capitalize(), suit.capitalize()))

    def to_JSON(self):
        """Return the card's fields as a plain dict, in a fixed key order."""
        keys = (
            'value',
            'value_int',
            'name',
            'name_short',
            'suit',
            'meaning_up',
            'meaning_rev',
            'type',
            'desc',
        )
        return {key: getattr(self, key) for key in keys}
def get_majors():
    """Scrape the page at ``majors_url`` and record every major card found.

    Each ``<p>`` element whose text starts with a card number (digits or the
    word ``ZERO``) followed by a period-delimited name is treated as a card.
    For every such paragraph the serialised ``Major`` is appended to ``cards``
    and an entry holding the raw paragraph text is appended to ``majorText``.

    Returns:
        None. Results are accumulated in the module-level lists.

    Raises:
        requests.exceptions.RequestException: If the page cannot be fetched,
            including when the request times out.
    """
    # Compiled once, outside the loop. Group 1 is the number (or "ZERO"),
    # group 3 is a period plus the text up to, but excluding, the next period.
    header = re.compile(r'([0-9]+|(ZERO))(\..+?(?=\.))')

    # A timeout keeps the script from hanging forever on a stalled connection.
    majs = requests.get(majors_url, timeout=30)
    soup = BeautifulSoup(majs.content, 'html.parser')

    for p in soup.find_all('p'):
        line = p.text
        m = header.match(line)
        if not m:
            continue

        value = m[1]
        value_int = 0 if value == 'ZERO' else int(value)
        # Drop the period and the character after it (presumably a space --
        # confirm against the page) that the pattern captures before the name.
        name = m[3][2:]
        name_short = 'ar' + '{:02}'.format(value_int)

        # partition() instead of find(): when "Reversed" is missing, find()
        # returns -1, which silently chopped the last character off the
        # upright meaning and turned the reversed meaning into line[7:].
        # Now the upright text is kept whole and the reversed text is empty.
        before_reversed, _, meaning_rev = line.partition("Reversed")
        # Skip the matched header plus three more characters (presumably the
        # ".--" separator after the name -- confirm against the page).
        meaning_up = before_reversed[len(m[0]) + 3:]

        c = Major(value, value_int, name, name_short, meaning_up, meaning_rev)
        entry = {'name_short': name_short, 'name': name, 'text': line, 'value': value}
        majorText.append(entry)
        cards.append(c.to_JSON())
        print('Added major card', c.name)
def get_minors():
    """Scrape one page per minor card (4 suits x 14 ranks) and record each card.

    The page URL is ``base_url`` + suit code + rank code + ``".htm"``. The
    text of the first element matching ``p:nth-of-type(3)`` is split into a
    description, an upright meaning and a reversed meaning using the markers
    ``"Divinatory Meanings"`` and ``"Reversed"``. The serialised ``Minor`` is
    appended to ``cards`` and an entry holding the raw paragraph text is
    appended to ``minorText``. Pages without a matching paragraph are skipped.

    Returns:
        None. Results are accumulated in the module-level lists.

    Raises:
        requests.exceptions.RequestException: If a page cannot be fetched,
            including when the request times out.
    """
    # [url code, suit name]
    suits_tup = [["wa", "wands"], ["cu", "cups"], ["pe", "pentacles"], ["sw", "swords"]]
    # [url code, rank name, numeric value]
    mins_tup = [
        ["pa", "page", 11], ["kn", "knight", 12], ["qu", "queen", 13], ["ki", "king", 14],
        ["ac", "ace", 1], ["02", "Two", 2], ["03", "Three", 3], ["04", "Four", 4],
        ["05", "Five", 5], ["06", "Six", 6], ["07", "Seven", 7], ["08", "Eight", 8],
        ["09", "Nine", 9], ["10", "Ten", 10],
    ]

    for suit_code, suit_long in suits_tup:
        for value_code, value_long, value_int in mins_tup:
            page_url = base_url + suit_code + value_code + ".htm"
            # A timeout keeps the script from hanging forever on a stalled connection.
            card_page = requests.get(page_url, timeout=30)
            soup = BeautifulSoup(card_page.content, 'html.parser')
            res = soup.select_one("p:nth-of-type(3)")
            if res is None:
                # No matching paragraph on this page: nothing to parse.
                continue

            name_short = suit_code + value_code
            name_long = value_long + ' of ' + suit_long
            line = res.text
            entry = {
                'name_short': name_short,
                'text': line,
                'value_long': value_long,
                'value_int': value_int,
                'name': name_long,
            }
            minorText.append(entry)

            # partition() instead of find(): a missing marker made find()
            # return -1, which silently produced off-by-one and garbage
            # slices. A missing marker now leaves the text before it whole
            # and the text after it empty. "Reversed" is searched only after
            # "Divinatory Meanings", so an occurrence inside the description
            # can no longer be mistaken for the section marker.
            desc, _, meanings = line.partition("Divinatory Meanings")
            meaning_up, _, meaning_rev = meanings.partition("Reversed")

            c = Minor(suit_long, desc, value_long, value_int, name_long, name_short, meaning_up, meaning_rev)
            cards.append(c.to_JSON())
            print('Added minor card ', c.name)
# Scrape everything first: majors, then minors.
get_majors()
get_minors()

# Write the three JSON outputs: the combined card data, then the raw
# paragraph text collected for the minor and the major cards.
entry = {'count': len(cards), 'cards': cards}
for out_path, payload in (
    ('card_data_tmp.json', entry),
    ('min_text.json', minorText),
    ('maj_text.json', majorText),
):
    with open(out_path, mode='w', encoding='utf-8') as f:
        json.dump(payload, f)