-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwikiquote.py
executable file
·128 lines (118 loc) · 4.13 KB
/
wikiquote.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python
#
# Description:
# Outputs random quote from list of Wikiquote pages.
#
# Requirements:
# requests to fetch Wikiquote pages; bs4 (BeautifulSoup) to parse them.
#
# Configuration:
# max_length (int)
# Max number of characters to display from a quote.
#
# output_width (int)
# Wrap quote at nth character (multi-line output).
#
# pages (nested list of strings)
# List of Wikiquote pages to scrape quotes from.
# Inner lists having two strings: [Wikiquote page slug, author name].
# e.g. [
# ["Leo_Tolstoy", "Tolstoy"],
# ["Robert_Anton_Wilson", "RAW"]
# ]
#
# How It Works:
# BeautifulSoup (BS) does the heavy lifting. Picks a random author from the
# pages variable, builds a Wikiquote URL from it, and requests the page
# contents from that Wikiquote URL. Parses that Wikiquote page content for
# quotes, adds them all to a list, and then selects one at random. Finally,
# outputs that quote and its author's name.
#
# Identifying quotes from the Wikiquote page content:
# Wikiquote doesn't have a standardized page structure. Based on test
# data, the author's quotes appear after the second <h2> element. They
# are contained within <ul> list items, and each quote's source is also
# contained within that list, but as its own <ul> list. The script
# identifies the second <h2> element to determine where the quotes start
# and then iterates through each <ul> list where the quotes appear. If
# there's a nested source/attribution <ul> list, it's first stripped out
# and the parent <ul> list's text is stored in the script's quote list.
#
# Example page source:
# <h2>Quotes</h2> <-- 2nd occurrence of h2 element
# <ul>
# <li>"Quote."</li> <-- Quote we care about
# <ul>
# <li>Quote source</li> <-- Attribution to ignore
# </ul>
# </ul>
# <ul>
# <li>"Quote."</li> <-- Another quote we care about
# <ul>
# <li>Quote source</li> <-- Attribution to ignore
# </ul>
# </ul>
# <h2>Quotes about [Author Name]</h2>
# [etc.]
#
import random
import requests
import textwrap
import bs4
# Configuration -- user-tunable settings (described in the header comment)
max_length = 512    # quotes longer than this many characters are shortened
output_width = 79   # column used for wrapping the quote and aligning the author
pages = [
    ["Alex_Jones", "Alex Jones"],
    ["Bobby_Fischer", "Bobby Fischer"],
    ["L._Ron_Hubbard", "L. Ron Hubbard"],
]

# Constants
base = "https://en.wikiquote.org/wiki/"
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/114.0.0.0 Safari/537.36"
    ),
}

# The author is picked once, at import time; the same index is used to build
# the page URL here and to look up the display name for the attribution line.
random_author = random.randrange(len(pages))
url = "".join((base, pages[random_author][0]))
# Functions
def get_page(url, headers, timeout=10):
    """Fetch a Wikiquote page and return it parsed.

    Args:
        url: Full URL of the page to request.
        headers: Dict of HTTP request headers to send.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the script indefinitely.

    Returns:
        A bs4.BeautifulSoup tree built from the response body with the
        "html.parser" backend.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail here on 4xx/5xx instead of trying to parse an error page for quotes.
    response.raise_for_status()
    return bs4.BeautifulSoup(response.content, "html.parser")
def get_quote(soup):
    """Return a random quote from parsed Wikiquote page content.

    Quotes are collected from the <ul> elements that follow the second <h2>
    element as siblings, stopping at the next <h2> sibling. For each such
    list, the first <li> is taken and any nested <ul> inside it (the source
    attribution) is removed before its text is read.

    Args:
        soup: Parsed page content, as returned by get_page().

    Returns:
        The selected quote with surrounding whitespace stripped, shortened to
        max_length characters (ending in "...") if it is longer, and wrapped
        at output_width columns.

    Raises:
        IndexError: If the page has fewer than two <h2> elements.
        ValueError: If no quote text is found in the section.
    """
    headings = soup.find_all("h2")
    if len(headings) < 2:
        raise IndexError("expected at least two <h2> elements on the page")

    # NOTE(review): this relies on the quote lists being siblings of the <h2>
    # element itself -- confirm against the current Wikiquote markup.
    quotes = []
    for sibling in headings[1].find_next_siblings():
        if sibling.name == "h2":
            break
        if sibling.name != "ul":
            continue
        item = sibling.find("li")
        if item is None:
            # A list without items holds no quote.
            continue
        if item.ul is not None:
            # Drop the nested attribution list so only the quote text remains.
            item.ul.decompose()
        text = item.get_text().strip()
        if text:
            quotes.append(text)

    if not quotes:
        raise ValueError("no quotes found after the second <h2> element")

    quote = random.choice(quotes)
    if len(quote) > max_length:
        quote = textwrap.shorten(quote, width=max_length, placeholder="...")
    return textwrap.fill(quote, width=output_width)
def get_author(random_author):
    """Build the right-aligned attribution line for the chosen author.

    random_author is an index into pages. The display name stored at
    position 1 of that entry is prefixed with "-- " and right-justified to
    output_width columns.
    """
    display_name = pages[random_author][1]
    return ("-- " + display_name).rjust(output_width)
def main():
    """Fetch the selected page, then print a quote followed by its author."""
    soup = get_page(url, headers)
    # Both lines are built before anything is printed, quote first.
    for line in (get_quote(soup), get_author(random_author)):
        print(line)
# Execute only when run as a script, so that importing this module does not
# fetch a page and print a quote as a side effect.
if __name__ == "__main__":
    main()