Skip to content

Commit

Permalink
🧹🧹🧹
Browse files Browse the repository at this point in the history
  • Loading branch information
sinanatra committed Mar 6, 2021
1 parent 019a1ae commit 9316eb7
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 15 deletions.
20 changes: 6 additions & 14 deletions html_proc.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,48 +11,43 @@
import lxml.etree as ET
from utils.tweet import updateStatus

# Connect to the MongoDB instance; the connection string is read from the
# MONGO environment variable (raises KeyError if it is unset).
client = pymongo.MongoClient(os.environ['MONGO'])
db = client.ilpost      # 'ilpost' database
words = db.words        # collection of words encountered so far (see insert_one below)


# Download the newspaper's RSS feed and parse it as XML.
opener = urllib.request.build_opener()
tree = ET.parse(opener.open('https://www.ilpost.it/feed/'))

# Walk every article link in the feed and extract its text.
for link in tree.findall('channel/item/link'):

    print('href: ', link.text)
    try:
        # Fetch the article page and parse the HTML.
        innerHtml = urllib.request.urlopen(link.text).read()
        innerSoup = BeautifulSoup(innerHtml, features="lxml")
        # Drop all script and style elements so their contents do not
        # leak into the extracted text.
        for script in innerSoup(["script", "style"]):
            script.extract()

        # Drop embedded tweets and Instagram posts (blockquote embeds).
        # NOTE(review): the original comment said "iframes", but the code
        # actually removes <blockquote> embeds.
        for div in innerSoup.find_all("blockquote", {'class':'twitter-tweet'}):
            div.decompose()

        for div in innerSoup.find_all("blockquote", {'class':'instagram-media'}):
            div.decompose()

        # Drop category/tag links attached to the article.
        for div in innerSoup.find_all("a", {'rel':'tag'}):
            div.decompose()

        # Article headline.
        title = innerSoup.find("h1", {'class':'entry-title'}).get_text()

        # Raw article body text.
        text = innerSoup.find('article').get_text()

        # Normalise whitespace: strip each line, split it on spaces, and
        # rejoin the non-empty pieces one per line.
        lines = (line.strip() for line in text.splitlines())
        chunks = (phrase.strip()
            for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        # Tokenize on word characters (\w+), dropping punctuation.
        # NOTE(review): case is NOT normalised here despite the original
        # comment -- 'Parola' and 'parola' count as distinct tokens.
        tokenizer = RegexpTokenizer(r'\w+')
        tokens = tokenizer.tokenize(text)

        # Deduplicate tokens within this article.
        tokens = list(set(tokens))
Expand All @@ -65,20 +60,17 @@
                # Token already known: skip it.
                continue
            else:
                print('new token!', token)

                # Record the new word in MongoDB.
                x = words.insert_one({ "word": token })

                # Build a context snippet of ~50 characters on either side
                # of the first occurrence of the token in the article text.
                range_snippet = 50
                start_index = text.find(token)
                # NOTE(review): if find() ever returned -1 (it should not,
                # since the token came from this text) the indices below
                # would be wrong.

                end_index = start_index + len(token)
                snippet = ''
                # NOTE(review): negative indices here wrap to the END of the
                # string (wrong snippet for tokens near the start), and
                # indices past the end raise IndexError, silently caught by
                # the outer except. Slicing text[max(0, start):end] would be
                # safer than this char-by-char loop.
                for i in range(start_index - range_snippet, end_index + range_snippet):
                    snippet += text[i]

                # Drop the (likely partial) first and last words and append
                # an ellipsis.
                finalsnippet = ' '.join(snippet.split()[1:-1])+ ' ...'

                # Tweet the new word together with the article link, title
                # and snippet.
                updateStatus(token, link.text, title, finalsnippet)
                # Pause between posts -- presumably rate limiting; confirm
                # against the Twitter API limits used in utils/tweet.py.
                time.sleep(5)
        except Exception as e:
Expand Down
2 changes: 1 addition & 1 deletion readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Basic architecture

Il Post first said is essentially a single script which runs every two hours as a cron job on GitHub.

`html_proc.py` parses the xml provided by the newspaper `https://www.ilpost.it/feed/`. It opens every new article url, retrieves the article text, tokenize every word and tweets new words using `utils/tweet.py`. It also append every new word to a Mongo DB instance.
`html_proc.py` parses the xml provided by the newspaper `https://www.ilpost.it/feed/`. It opens every new article URL, retrieves the article text, tokenizes every word and tweets new words using `utils/tweet.py`. It also appends every new word to a Mongo DB instance.


Requisites
Expand Down

0 comments on commit 9316eb7

Please sign in to comment.