-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
117 lines (95 loc) · 3.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
import json
import os
import requests
# Site favicon; used as the feed icon/favicon and as the fallback article image.
ICON = "https://assets.newrepublic.com/assets/favicons/apple-touch-icon-180x180.png"
# Index page listing book-related articles to scrape.
URL = "https://newrepublic.com/culture/tags/books"
# Browser-like User-Agent header — presumably so the site serves the full page
# to the scraper rather than blocking it (TODO confirm the site requires this).
HEADERS = {
    'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 '
        'Safari/605.1.15'
}
def log(message):
    """Print *message* prefixed with the current US-Eastern timestamp.

    Args:
        message: Text to write to stdout (captured by CloudWatch when run
            inside Lambda).
    """
    now = datetime.now(ZoneInfo('America/New_York'))
    # f-string with a format spec replaces the original strftime + '+' concat.
    print(f"{now:%Y-%m-%d %H:%M:%S} {message}")
# noinspection PyUnusedLocal
def lambda_handler(event, context):
    """AWS Lambda entry point: return the generated feed as an HTTP response.

    Args:
        event: Lambda event payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        An API-Gateway-style response dict carrying the JSON feed body.
    """
    response_headers = {
        'Content-Type': 'application/json',
        'Cache-Control': 'max-age=720',
    }
    body = get_json_feed(False)
    return {'statusCode': 200, 'headers': response_headers, 'body': body}
def sorter(item):
    """Sort key for a feed item: its ISO publication-date string."""
    published = item["date_published"]
    return published
def get_json_feed(debug):
    """Scrape the New Republic books index and build a JSON Feed string.

    Fetches the index page, then each recent article page, and assembles a
    jsonfeed.org version-1.1 document with full article bodies.

    Args:
        debug: When True, dump the fetched index HTML to data.html.

    Returns:
        The feed serialized as a JSON string (2-space indented).
    """
    log("HTTP Start " + URL)
    # Timeout so a hung server cannot stall the caller indefinitely
    # (the original call had no timeout and could block forever).
    html = requests.get(URL, headers=HEADERS, timeout=30)
    log("HTTP End")
    if debug:
        # Context manager + explicit encoding replace the bare open/close.
        with open("data.html", "w", encoding="utf-8") as post_file:
            post_file.write(html.text)
    log("Parse Start")
    page = BeautifulSoup(html.text, 'html.parser')
    log("Parse End")
    # Only keep articles published within the last 65 days.
    oldest = datetime.now(ZoneInfo('America/New_York')) - timedelta(days=65)
    feed_items = []
    for article in page.find_all('div', {'class': 'articleResults__result'}):
        article_date_string = article.find('div', {'class': 'articleResults__date'}).text
        article_date = datetime.strptime(article_date_string, "%B %d, %Y").astimezone(ZoneInfo('America/New_York'))
        if article_date < oldest:
            continue
        article_heading = article.find('a', {'class': 'Hed'})
        article_title = article.find('div', {'class': 'Hed'}).text
        article_url = 'https://newrepublic.com' + article_heading.get(key='href')
        article_author = article.find('div', {'class': 'articleResults__byline'}).text.strip()
        # Fetch the full article; send the same browser User-Agent as the index
        # request (the original inconsistently omitted HEADERS here) and a timeout.
        article_page = BeautifulSoup(
            requests.get(article_url, headers=HEADERS, timeout=30).content,
            'html.parser')
        article_body = str(article_page.find('div', {'class': 'article-body'}))
        article_image = ICON  # fallback when no lede image is found
        lede = article_page.find('div', {'class': 'article-lede'})
        if lede is not None:
            picture = lede.find('picture')
            if picture is not None:
                source = picture.find('source')
                if source is not None:
                    srcset = source.get('data-srcset')
                    # Guard: the attribute may be missing; original would
                    # raise AttributeError on None.split.
                    if srcset:
                        # First URL of the srcset; it is protocol-relative.
                        article_image = 'https:' + srcset.split(' ', 1)[0]
        log(article_title)
        feed_article = {
            'id': article_url,
            'title': article_title,
            'authors': [{'name': article_author}],
            'url': article_url,
            'content_html': article_body,
            'date_published': article_date.isoformat(),
            'image': article_image,
            'banner_image': article_image,
        }
        feed_items.append(feed_article)
    feed = {
        'version': 'https://jsonfeed.org/version/1.1',
        'title': 'New Republic Books',
        'home_page_url': URL,
        'user_comment': 'Generated by https://github.com/prenagha/newrepublic-books-jsonfeed',
        'icon': ICON,
        'favicon': ICON,
        # Newest first.
        'items': sorted(feed_items, key=sorter, reverse=True)
    }
    log("END")
    return json.dumps(feed, indent=2)
def test_feed():
    """Smoke test: build the feed and check it contains published items."""
    log('TEST START')
    # Outside Lambda (no LAMBDA_NAME env var) run in debug mode so the
    # intermediate HTML and final feed are written to local files.
    debug = 'LAMBDA_NAME' not in os.environ
    feed_str = get_json_feed(debug)
    if debug:
        # Context manager + explicit encoding replace the bare open/close.
        with open("feed.json", "w", encoding="utf-8") as feed_file:
            feed_file.write(feed_str)
    # At least one item made it into the feed.
    assert 'date_published' in feed_str
    log('TEST END')
# Allow running the scraper directly from the command line.
if __name__ == '__main__':
    test_feed()