-
Notifications
You must be signed in to change notification settings - Fork 35
/
Copy pathutil.py
34 lines (31 loc) · 922 Bytes
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# %%
import requests
import bs4
import re
from googlesearch import search
import re
# %%
def _clean_paragraph(raw):
    """Strip bracketed reference markers from *raw* and normalise whitespace."""
    without_refs = re.sub(r'\[\w+\]', '', raw)
    # Collapse each whitespace run (newlines included) to ONE space. Deleting
    # the run outright glues neighbouring words together, e.g. the gap left
    # behind by a removed "[1]" marker in "foo [1] bar" would yield "foobar".
    return re.sub(r'\s+', ' ', without_refs).strip()


def get_url_text(text, timeout=10):
    """Search for *text* and return the paragraphs of the first Wikipedia hit.

    The query is passed to ``googlesearch.search``; the first result whose
    URL contains the substring ``'wikipedia'`` is downloaded and its ``<p>``
    elements are extracted.

    Args:
        text: The search query.
        timeout: Seconds to wait for the page download before ``requests``
            gives up. Without a timeout a stalled server would block the
            caller indefinitely.

    Returns:
        ``(link, lines)`` where ``link`` is the matched URL and ``lines`` is
        a list of cleaned paragraph strings, keeping only paragraphs longer
        than five words. ``lines`` is empty when the page answers with an
        HTTP error status. ``(None, None)`` when no search result URL
        contains ``'wikipedia'``.

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (connection failure, timeout, ...).
    """
    urls = search(text, tld='com', lang='en-US', safe='on', stop=15)
    # NOTE(review): this is a plain substring test, so any URL that merely
    # mentions "wikipedia" is accepted -- confirm that is intended.
    link = next((url for url in urls if 'wikipedia' in url), None)
    if link is None:
        return None, None

    response = requests.get(link, timeout=timeout)
    # requests.get never returns None; an error page (4xx/5xx) must not be
    # parsed as if it were the article.
    if not response.ok:
        return link, []

    html = bs4.BeautifulSoup(response.text, 'html.parser')
    lines = []
    for para in html.select("p"):
        cleaned = _clean_paragraph(para.text)
        # Skip very short paragraphs (five words or fewer).
        if len(cleaned.split()) > 5:
            lines.append(cleaned)
    return link, lines