internet_conn.py
import requests as _req
from bs4 import BeautifulSoup as bs
from TFSummarizer import FrequencySummarizer as fs
import wikipedia as _wk
from re import findall as _findall
from LexRankSummarizer import summary_from_lex_text
from BERTSummarizer import bertSummariser
from TextRankSummarizer import textRankSummarizer
_guardian_key = "f26b6c0c-379f-4961-a47f-1dfaf40b9aa2"
_nyt_key = "02bcc44cbbf544b5927be9b458490c6f"
#_guardian_url = "http://content.guardianapis.com/search"
_guardian_url = "http://content.guardianapis.com/search?api-key=f26b6c0c-379f-4961-a47f-1dfaf40b9aa2"
_nyt_url = "https://api.nytimes.com/svc/search/v2/articlesearch.json"
# Speech URL
class MediaAggregatorMixin:
    """Base interface for news aggregators: subclasses implement get_news() and get_limit()."""
    def __init__(self):
        print("inside MediaAggregatorMixin")

    def get_news(self, query):
        pass

    def get_limit(self):
        pass
class GuardianAggregator:
    """Fetches article URLs from the Guardian content search API."""
    def __init__(self):
        self._params = {"q": "", "api-key": _guardian_key}
        self._limit = None

    def get_news(self, query):
        # Query the Guardian search endpoint and keep only plain news articles,
        # skipping Media-section pieces and quizzes.
        self._params["q"] = str(query)
        response = _req.get(_guardian_url, params=self._params)
        return [x["webUrl"] for x in response.json()["response"]["results"]
                if x["type"] == "article"
                and x["sectionName"] != u"Media"
                and "quiz" not in x["webUrl"].lower()]

    def get_limit(self):
        # Fall back to a dummy query if no limit has been recorded yet.
        if not self._limit:
            self.get_news("test")
        return self._limit
class NYTAggregator:
    """Fetches article URLs from the New York Times Article Search API."""
    def __init__(self):
        self._params = {"q": "", "api-key": _nyt_key}
        self._limit = None  # initialised so get_limit() does not raise AttributeError

    def get_news(self, query):
        self._params["q"] = str(query)
        response = _req.get(_nyt_url, params=self._params)
        return [x["web_url"] for x in response.json()["response"]["docs"]
                if x["type_of_material"] == "News"]

    def get_limit(self):
        return self._limit
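
# Illustrative usage sketch (an assumption, not part of the original module): both
# aggregators expose get_news(query) and return a list of article URLs. This
# hypothetical helper only documents the expected call pattern; actually running it
# requires network access and valid API keys.
def _demo_aggregators(query="technology"):
    guardian_urls = GuardianAggregator().get_news(query)
    nyt_urls = NYTAggregator().get_news(query)
    return guardian_urls + nyt_urls
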
def get_gkg(query):
    """Look up a query on Wikipedia and return a 5-sentence summary of the page."""
    print("inside get_gkg")
    try:
        page_object = _wk.page(query)
        s = page_object.content
        return summ_from_text(s, 5)
    except _wk.DisambiguationError:
        return False
# Text summarizer call
def summ_from_text(text, length):
    """Summarise raw text into `length` sentences using the frequency summarizer."""
    summary = fs().summarize(text, length)
    return ' '.join(summary)
# URL Func
def shorten_news(url, n):
    """Fetch an article URL and return an n-sentence frequency-based summary."""
    response = _req.get(url)
    if not response.ok:
        return False
    page = response.content
    soup = bs(page, "lxml")
    # Keep only paragraph text with more than one word.
    data = "\n".join([x.text for x in soup.findAll("p") if len(x.text.split()) > 1])
    summary = fs().summarize(data, n)
    # summary.insert(0, soup.title.text)
    return ' '.join(summary)
# Summary for LexRank Algorithm
def shorten_lex_text(url, n):
    """Fetch an article URL and return an n-sentence LexRank summary."""
    n = int(n)
    response = _req.get(url)
    if not response.ok:
        return False
    page = response.content
    soup = bs(page, "lxml")
    data = "\n".join([x.text for x in soup.findAll("p") if len(x.text.split()) > 1])
    summary = summary_from_lex_text(data, n)
    return summary
# Summary for BERT Sum Algorithm
def shorten_bert_text(url, n):
    """Fetch an article URL and return an n-sentence BERT-based summary."""
    n = int(n)
    response = _req.get(url)
    if not response.ok:
        return False
    page = response.content
    soup = bs(page, "lxml")
    data = "\n".join([x.text for x in soup.findAll("p") if len(x.text.split()) > 1])
    summary = bertSummariser(data.encode('utf-8'), n)
    return summary
# Summary for Text Rank Algorithm
def shorten_text_rank(url, n):
    """Fetch an article URL and return an n-sentence TextRank summary."""
    n = int(n)
    response = _req.get(url)
    if not response.ok:
        return False
    page = response.content
    soup = bs(page, "lxml")
    data = "\n".join([x.text for x in soup.findAll("p") if len(x.text.split()) > 1])
    summary = textRankSummarizer(data, n)
    return summary
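
# Minimal command-line sketch (an assumption, not part of the original file): fetch
# Guardian results for a query and print a 3-sentence frequency-based summary of the
# first article. Running it requires network access, a valid API key, and the local
# summarizer modules imported above.
if __name__ == "__main__":
    articles = GuardianAggregator().get_news("technology")
    if articles:
        print(shorten_news(articles[0], 3))
    else:
        print("No articles found.")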