Skip to content

Commit

Permalink
🧹🧹🧹
Browse files Browse the repository at this point in the history
  • Loading branch information
sinanatra committed Mar 6, 2021
1 parent 019a1ae commit 9316eb7
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 15 deletions.
20 changes: 6 additions & 14 deletions html_proc.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,48 +11,43 @@
import lxml.etree as ET
from utils.tweet import updateStatus

# Connect to the MongoDB instance; the connection string is read from the
# MONGO environment variable (raises KeyError if it is unset).
client = pymongo.MongoClient(os.environ['MONGO'])
db = client.ilpost      # 'ilpost' database
words = db.words        # collection of words encountered so far (see insert_one below)


# Download the newspaper's RSS feed and parse it as XML.
opener = urllib.request.build_opener()
tree = ET.parse(opener.open('https://www.ilpost.it/feed/'))

# Walk every article link in the feed and extract its text.
for link in tree.findall('channel/item/link'):

    print('href: ', link.text)
    try:
        # Fetch the article page and parse the HTML.
        innerHtml = urllib.request.urlopen(link.text).read()
        innerSoup = BeautifulSoup(innerHtml, features="lxml")
        # Drop all script and style elements so their contents do not
        # leak into the extracted text.
        for script in innerSoup(["script", "style"]):
            script.extract()

        # Drop embedded tweets and Instagram posts (blockquote embeds).
        # NOTE(review): the original comment said "iframes", but the code
        # actually removes <blockquote> embeds.
        for div in innerSoup.find_all("blockquote", {'class':'twitter-tweet'}):
            div.decompose()

        for div in innerSoup.find_all("blockquote", {'class':'instagram-media'}):
            div.decompose()

        # Drop category/tag links attached to the article.
        for div in innerSoup.find_all("a", {'rel':'tag'}):
            div.decompose()

        # Article headline.
        title = innerSoup.find("h1", {'class':'entry-title'}).get_text()

        # Raw article body text.
        text = innerSoup.find('article').get_text()

        # Normalise whitespace: strip each line, split it on spaces, and
        # rejoin the non-empty pieces one per line.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip()
            for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        # Tokenize on word characters (\w+), dropping punctuation.
        # NOTE(review): case is NOT normalised here despite the original
        # comment -- 'Parola' and 'parola' count as distinct tokens.
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)

        # Deduplicate tokens within this article.
        tokens = list(set(tokens))
Expand All @@ -65,20 +60,17 @@
                # Token already known: skip it.
                continue
            else:
                print('new token!', token)

                # Record the new word in MongoDB.
                x = words.insert_one({ "word": token })

                # Build a context snippet of ~50 characters on either side
                # of the first occurrence of the token in the article text.
                range_snippet = 50
                start_index = text.find(token)
                # NOTE(review): if find() ever returned -1 (it should not,
                # since the token came from this text) the indices below
                # would be wrong.

                end_index = start_index + len(token)
                snippet = ''
                # NOTE(review): negative indices here wrap to the END of the
                # string (wrong snippet for tokens near the start), and
                # indices past the end raise IndexError, silently caught by
                # the outer except. Slicing text[max(0, start):end] would be
                # safer than this char-by-char loop.
                for i in range(start_index - range_snippet, end_index + range_snippet):
                    snippet += text[i]

                # Drop the (likely partial) first and last words and append
                # an ellipsis.
                finalsnippet = ' '.join(snippet.split()[1:-1])+ ' ...'

                # Tweet the new word together with the article link, title
                # and snippet.
                updateStatus(token, link.text, title, finalsnippet)
                # Pause between posts -- presumably rate limiting; confirm
                # against the Twitter API limits used in utils/tweet.py.
                time.sleep(5)
        except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Basic architecture

Il Post first said is essentially a single script which runs every two hours as a cron job on GitHub.

`html_proc.py` parses the xml provided by the newspaper `https://www.ilpost.it/feed/`. It opens every new article url, retrieves the article text, tokenize every word and tweets new words using `utils/tweet.py`. It also append every new word to a Mongo DB instance.
`html_proc.py` parses the xml provided by the newspaper `https://www.ilpost.it/feed/`. It opens every new article URL, retrieves the article text, tokenizes every word and tweets new words using `utils/tweet.py`. It also appends every new word to a Mongo DB instance.


Requisites
Expand Down

0 comments on commit 9316eb7

Please sign in to comment.