-
Notifications
You must be signed in to change notification settings - Fork 93
/
Copy pathtext_url_summary_transformer.py
89 lines (64 loc) · 2.43 KB
/
text_url_summary_transformer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
"""Extract text from URL and summarizes it"""
from h2oaicore.transformer_utils import CustomTransformer
import datatable as dt
import numpy as np
class TextURLSummaryTransformer(CustomTransformer):
    """Replace a single text column of URLs with summaries of the linked pages.

    Each URL is downloaded, the text of its ``<p>`` elements is joined and
    passed to gensim's ``summarize``. Any URL that cannot be fetched, parsed
    or summarized yields an empty string, so the output always has exactly
    one row per input row.
    """

    _unsupervised = True
    _numeric_output = False
    _testing_can_skip_failure = False  # ensure tested as if shouldn't fail
    # NOTE(review): ``parse_url`` imports ``gensim.summarization``, which was
    # removed from gensim in 4.0, yet gensim==4.3.2 is pinned here. That
    # import is expected to fail with this pin -- confirm, then pin gensim<4
    # or switch to another summarizer.
    _modules_needed_by_name = ["gensim==4.3.2", "beautifulsoup4==4.12.3"]
    _display_name = 'TextURLSummaryTransformer'

    @staticmethod
    def is_enabled():
        """Return True: this recipe is enabled."""
        return True

    @staticmethod
    def do_acceptance_test():
        """Return False: no acceptance test is requested for this recipe."""
        return False

    @staticmethod
    def can_use(accuracy, interpretability, **kwargs):
        """Return False regardless of the accuracy/interpretability settings."""
        return False

    @staticmethod
    def get_default_properties():
        """Return the column requirements: exactly one text column."""
        return dict(col_type="text", min_cols=1, max_cols=1, relative_importance=1)

    def parse_url(self, url, timeout=30.0):
        """Download ``url`` and summarize the text of its paragraphs.

        Args:
            url: The URL to fetch. Surrounding single/double quotes are
                stripped. Values that are not strings (``None``, NaN, ...)
                are treated as missing.
            timeout: Seconds to wait for the server before giving up, passed
                to ``requests.get``. Without it a stalled server would block
                the worker forever.

        Returns:
            A one-element list holding the summary, or ``['']`` when the URL
            is missing or anything goes wrong while fetching, parsing or
            summarizing (deliberately best-effort).
        """
        # Imported lazily so the module can be loaded before the recipe's
        # extra packages are installed.
        from bs4 import BeautifulSoup
        from gensim.summarization.summarizer import summarize
        import requests

        # A missing cell previously either crashed on ``.strip`` (NaN) or
        # fell through to a failing request (None); both now map to ''.
        if not isinstance(url, str):
            return ['']

        headers = {
            'User-Agent': (
                'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/70.0.3538.77 Safari/537.36'
            ),
        }

        # URL cleaning: drop quotes wrapped around the value.
        url = url.strip('\'"')

        try:
            # The context manager releases the streamed connection even when
            # parsing or summarizing fails.
            with requests.get(url, headers=headers, stream=True,
                              timeout=timeout) as page:
                soup = BeautifulSoup(page.content, "lxml")
            text = ' '.join(p.text for p in soup.find_all('p'))
            text_summary = summarize(text)
        except Exception:
            # Best-effort: a bad URL must not abort the whole column, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            text_summary = ''

        return [text_summary]

    def fit_transform(self, X: dt.Frame, y: np.ndarray = None):
        """Return ``transform(X)``; nothing is learned, so ``y`` is ignored."""
        return self.transform(X)

    def transform(self, X: dt.Frame):
        """Summarize every URL in the first column of ``X`` in parallel.

        Args:
            X: Frame whose first column holds the URLs.

        Returns:
            A pandas DataFrame with one column, ``'URLSummary'``, holding one
            summary string per input row (empty string on failure).
        """
        import multiprocessing as mp
        import pandas as pd

        urls = X.to_pandas().iloc[:, 0].values

        # Never start more processes than there are URLs; Pool needs >= 1.
        num_workers = max(1, min(mp.cpu_count(), len(urls)))

        pool = mp.Pool(num_workers)
        try:
            # One call to parse_url per URL; results keep the input order.
            all_summaries = pool.map(self.parse_url, urls)
        finally:
            # Always reap the workers, even when map() raises.
            pool.terminate()
            pool.join()

        # Flatten the list of one-element lists.
        summaries = [ent for sublist in all_summaries for ent in sublist]

        return pd.DataFrame({'URLSummary': summaries})