-
Notifications
You must be signed in to change notification settings - Fork 2
/
scrapper_article.py
88 lines (74 loc) · 2.98 KB
/
scrapper_article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
import json
from typing import Dict, List
import logging
import time
from requests_html import HTMLSession, Element
import requests_html
# Send log records to app_scrapper.log; filemode 'w' truncates the file on
# every run, and each line is prefixed with the record's timestamp.
logging.basicConfig(
    format='%(asctime)s %(message)s',
    filename="app_scrapper.log",
    filemode='w',
)

# Limit the requests_html library's logger to WARNING and above.
_requests_html_logger = logging.getLogger(requests_html.__name__)
_requests_html_logger.setLevel(logging.WARNING)

# This script's own logger passes everything from DEBUG upwards.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
# Load the JSON object stored in article2user.json. Only its keys are used:
# each key is later fetched as an article address.
logger.info('INFO: Reading article file')
print('INFO: Reading article file')
# JSON text is UTF-8 by specification, so the encoding is pinned instead of
# relying on the platform default (which is not UTF-8 on every system).
with open('article2user.json', 'r', encoding='utf-8') as f_in:
    data: Dict = json.load(f_in)

# Iterating a dict yields its keys, in insertion order.
articles = list(data)
logger.info(f'INFO: Found {len(articles)} articles')
print(f'INFO: Found {len(articles)} articles')
# Session used for the HTTP requests that follow.
session = HTMLSession()
_session_notice = 'INFO: Created session'
logger.info(_session_notice)
print(_session_notice)

# Scraped fields per article, keyed by the article's position in `articles`.
text_collection: Dict[int, Dict[str, str]] = {}
# Number of articles whose processing raised an exception.
count_error: int = 0
# After every _REST_EVERY articles the scraper pauses for _REST_SECONDS and
# starts over with a fresh session.
_REST_EVERY = 50
_REST_SECONDS = 10


def _render_with_retry(response) -> None:
    """Render the fetched page, retrying once if the first attempt fails.

    A failure of the second attempt propagates to the caller, which counts
    the article as not scraped.
    """
    try:
        response.html.render()
    except Exception:
        print('ERROR: Exception occurred for rendering. Retrying ...')
        logger.exception('ERROR: Exception occurred for rendering. Retrying ...')
        response.html.render()
    print('INFO: HTML render successful')
    logger.info('INFO: HTML render successful')


def _first_text(elements: List[Element], label: str, article) -> str:
    """Return the text of the first element with newlines turned into spaces.

    When *elements* is empty, report the miss (naming *label* and *article*)
    on the console and in the log, and return an empty string.
    """
    if elements:
        return str(elements[0].text).replace('\n', ' ')
    print(f'INFO: No {label} found for {article}')
    logger.info(f'INFO: No {label} found for {article}')
    return ""


for index, article in enumerate(articles):
    try:
        # NOTE(review): no timeout is passed, so a stalled server can block
        # the run indefinitely - consider adding one.
        response = session.get(str(article).strip(' \n'))
        _render_with_retry(response)
        text_collection[index] = {
            'text': _first_text(response.html.find('.body'), 'body', article),
            'head': _first_text(response.html.find('.title'), 'head', article),
            'lead': _first_text(response.html.find('.lead'), 'lead', article),
            'article': article,
        }
    except Exception:
        # Best effort: one failing article must not stop the run. Catching
        # Exception (not a bare except) keeps Ctrl-C and SystemExit working.
        print(f'ERROR: Exception occurred for {article}')
        logger.exception(f'ERROR: Exception occurred for {article}')
        count_error += 1

    # `index and ...` skips index 0, which would otherwise trigger a rest
    # immediately after the very first article.
    if index and index % _REST_EVERY == 0:
        print(f'INFO: Taking rest for {_REST_SECONDS}s')
        logger.info(f'INFO: Taking rest for {_REST_SECONDS}s')
        time.sleep(_REST_SECONDS)
        # Close the old session before replacing it; otherwise the browser it
        # started for rendering (and its connections) would be left open.
        try:
            session.close()
        except Exception:
            print('ERROR: Exception occurred while closing session')
            logger.exception('ERROR: Exception occurred while closing session')
        session = HTMLSession()
        logger.info('INFO: Created new session')
        print('INFO: Created new session')
# Report how many articles raised during scraping, in the log and on the console.
_failure_summary = f'INFO: Articles not scraped count {count_error}'
logger.info(_failure_summary)
print(_failure_summary)

# Write the collected records as a single JSON object; json.dump emits the
# integer keys of text_collection as strings.
with open('article_text.json', mode='w') as f_out:
    json.dump(text_collection, fp=f_out)

_saved_notice = 'INFO: Articles text saved'
logger.info(_saved_notice)
print(_saved_notice)