-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathscrape-reddit-parts
executable file
·69 lines (50 loc) · 2.07 KB
/
scrape-reddit-parts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
import itertools
import json
import os
import praw
# Reddit accounts to process; main() iterates over these in order.
USERNAMES = ('_9MOTHER9HORSE9EYES9', 'karen_castillo')

# Follow API guidelines: https://github.com/reddit/reddit/wiki/API#rules
USER_AGENT = '9m9h9e9 e-book script:v0.3 (by /u/cryzed-)'
def _first_available(thing, primary, fallback):
    """Return ``thing.<primary>`` if that attribute exists.

    Otherwise return ``thing.<fallback>``, or ``None`` when neither attribute
    is present. An existing but falsy primary value is returned as-is.
    """
    alternative = getattr(thing, fallback, None)
    return getattr(thing, primary, alternative)


def scrape_parts(client, username):
    """Collect the usable comments and submissions of one redditor.

    Every item yielded by the redditor's ``get_comments`` and
    ``get_submitted`` listings (in that order, both unlimited) is inspected.
    Items without HTML content and items whose text is exactly ``[removed]``
    are reported on stdout and skipped.

    Args:
        client: Object providing ``get_redditor(username)``.
        username: Name of the account to scrape.

    Returns:
        A dict mapping each kept item's ``id`` to a
        ``(created, html, text)`` tuple.
    """
    redditor = client.get_redditor(username)
    collected = {}

    print('-', username)
    print(' - Scraping parts...')

    comments = redditor.get_comments(limit=None)
    submissions = redditor.get_submitted(limit=None)

    for thing in itertools.chain(comments, submissions):
        # Presumably comments carry body/body_html and submissions carry
        # selftext/selftext_html - the first attribute present wins.
        html = _first_available(thing, 'body_html', 'selftext_html')
        if not html:
            print(' - Skipping %s (%d): no content' % (thing.permalink, thing.created))
            continue

        text = _first_available(thing, 'body', 'selftext')
        if text == '[removed]':
            print(' - Skipping %s (%d): removed' % (thing.permalink, thing.created))
            continue

        print(' - Adding: %s...' % thing.id)
        collected[thing.id] = (thing.created, html, text)

    return collected
def main():
    """Scrape every account in USERNAMES and write the results to disk.

    For each username the parts returned by ``scrape_parts`` are stored in
    ``parts/<username>/`` as ``<id>.html`` and ``<id>.txt`` (UTF-8). A
    ``spine.json`` file in the same directory maps part id to its ``created``
    value; an existing spine is loaded first so that new entries are merged
    into it rather than replacing it.
    """
    reddit = praw.Reddit(user_agent=USER_AGENT)

    for username in USERNAMES:
        scraped = scrape_parts(reddit, username)

        user_dir = os.path.join('parts', username)
        os.makedirs(user_dir, exist_ok=True)

        # Start from the spine of a previous run, if there is one.
        spine_file = os.path.join(user_dir, 'spine.json')
        spine = {}
        if os.path.exists(spine_file):
            with open(spine_file) as stream:
                spine = json.load(stream)

        print(' - Saving parts...')
        for part_id, (created, html, text) in scraped.items():
            # HTML is written before the plain text, as two sibling files.
            for extension, content in (('.html', html), ('.txt', text)):
                target = os.path.join(user_dir, part_id + extension)
                with open(target, 'w', encoding='UTF-8') as stream:
                    stream.write(content)
            spine[part_id] = created

        print(' - Saving spine...')
        with open(spine_file, 'w') as stream:
            json.dump(spine, stream)

    print('Finished!')
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()