-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
117 lines (95 loc) · 3.63 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from zoneinfo import ZoneInfo
import json
import os
import requests
# Site favicon; used as the feed icon/favicon and as the fallback article image.
ICON = "https://assets.newrepublic.com/assets/favicons/apple-touch-icon-180x180.png"
# Index page listing book-related articles to scrape.
URL = "https://newrepublic.com/culture/tags/books"
# Browser-like User-Agent header — presumably so the site serves the full page
# to the scraper rather than blocking it (TODO confirm the site requires this).
HEADERS = {
    'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/18.3 '
        'Safari/605.1.15'
}
def log(message):
    """Print *message* prefixed with the current US-Eastern timestamp.

    Args:
        message: Text to write to stdout (captured by CloudWatch when run
            inside Lambda).
    """
    now = datetime.now(ZoneInfo('America/New_York'))
    # f-string with a format spec replaces the original strftime + '+' concat.
    print(f"{now:%Y-%m-%d %H:%M:%S} {message}")
# noinspection PyUnusedLocal
def lambda_handler(event, context):
    """AWS Lambda entry point: return the generated feed as an HTTP response.

    Args:
        event: Lambda event payload (unused).
        context: Lambda runtime context (unused).

    Returns:
        An API-Gateway-style response dict carrying the JSON feed body.
    """
    response_headers = {
        'Content-Type': 'application/json',
        'Cache-Control': 'max-age=720',
    }
    body = get_json_feed(False)
    return {'statusCode': 200, 'headers': response_headers, 'body': body}
def sorter(item):
    """Sort key for a feed item: its ISO publication-date string."""
    published = item["date_published"]
    return published
def get_json_feed(debug):
    """Scrape the New Republic books index and build a JSON Feed string.

    Fetches the index page, then each recent article page, and assembles a
    jsonfeed.org version-1.1 document with full article bodies.

    Args:
        debug: When True, dump the fetched index HTML to data.html.

    Returns:
        The feed serialized as a JSON string (2-space indented).
    """
    log("HTTP Start " + URL)
    # Timeout so a hung server cannot stall the caller indefinitely
    # (the original call had no timeout and could block forever).
    html = requests.get(URL, headers=HEADERS, timeout=30)
    log("HTTP End")
    if debug:
        # Context manager + explicit encoding replace the bare open/close.
        with open("data.html", "w", encoding="utf-8") as post_file:
            post_file.write(html.text)
    log("Parse Start")
    page = BeautifulSoup(html.text, 'html.parser')
    log("Parse End")
    # Only keep articles published within the last 65 days.
    oldest = datetime.now(ZoneInfo('America/New_York')) - timedelta(days=65)
    feed_items = []
    for article in page.find_all('div', {'class': 'articleResults__result'}):
        article_date_string = article.find('div', {'class': 'articleResults__date'}).text
        article_date = datetime.strptime(article_date_string, "%B %d, %Y").astimezone(ZoneInfo('America/New_York'))
        if article_date < oldest:
            continue
        article_heading = article.find('a', {'class': 'Hed'})
        article_title = article.find('div', {'class': 'Hed'}).text
        article_url = 'https://newrepublic.com' + article_heading.get(key='href')
        article_author = article.find('div', {'class': 'articleResults__byline'}).text.strip()
        # Fetch the full article; send the same browser User-Agent as the index
        # request (the original inconsistently omitted HEADERS here) and a timeout.
        article_page = BeautifulSoup(
            requests.get(article_url, headers=HEADERS, timeout=30).content,
            'html.parser')
        article_body = str(article_page.find('div', {'class': 'article-body'}))
        article_image = ICON  # fallback when no lede image is found
        lede = article_page.find('div', {'class': 'article-lede'})
        if lede is not None:
            picture = lede.find('picture')
            if picture is not None:
                source = picture.find('source')
                if source is not None:
                    srcset = source.get('data-srcset')
                    # Guard: the attribute may be missing; original would
                    # raise AttributeError on None.split.
                    if srcset:
                        # First URL of the srcset; it is protocol-relative.
                        article_image = 'https:' + srcset.split(' ', 1)[0]
        log(article_title)
        feed_article = {
            'id': article_url,
            'title': article_title,
            'authors': [{'name': article_author}],
            'url': article_url,
            'content_html': article_body,
            'date_published': article_date.isoformat(),
            'image': article_image,
            'banner_image': article_image,
        }
        feed_items.append(feed_article)
    feed = {
        'version': 'https://jsonfeed.org/version/1.1',
        'title': 'New Republic Books',
        'home_page_url': URL,
        'user_comment': 'Generated by https://github.com/prenagha/newrepublic-books-jsonfeed',
        'icon': ICON,
        'favicon': ICON,
        # Newest first.
        'items': sorted(feed_items, key=sorter, reverse=True)
    }
    log("END")
    return json.dumps(feed, indent=2)
def test_feed():
    """Smoke test: build the feed and check it contains published items."""
    log('TEST START')
    # Outside Lambda (no LAMBDA_NAME env var) run in debug mode so the
    # intermediate HTML and final feed are written to local files.
    debug = 'LAMBDA_NAME' not in os.environ
    feed_str = get_json_feed(debug)
    if debug:
        # Context manager + explicit encoding replace the bare open/close.
        with open("feed.json", "w", encoding="utf-8") as feed_file:
            feed_file.write(feed_str)
    # At least one item made it into the feed.
    assert 'date_published' in feed_str
    log('TEST END')
# Allow running the scraper directly from the command line.
if __name__ == '__main__':
    test_feed()