-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsummarizer.py
55 lines (44 loc) · 1.73 KB
/
summarizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# -*- coding: utf-8 -*-
"""Summarizer.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1PZeJspKhkv8_-I-bREFGws_j8nlj-7-y
"""
#function to summarize text
from heapq import nlargest
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from string import punctuation
# Load the spaCy pipeline "en_core_web_sm" once, at import time;
# text_summarizer runs every input through this shared object.
nlp = spacy.load("en_core_web_sm")
def text_summarizer(raw_docx, num_sentences=4, max_sentence_words=40):
    """Return an extractive summary of ``raw_docx``.

    Every token that is not punctuation, whitespace or a stop word is counted
    case-insensitively. The counts are divided by the largest count, and each
    sentence is scored as the sum of the scaled counts of its tokens. The
    highest-scoring sentences are joined with single spaces, best first.

    Args:
        raw_docx: Text to summarize; it is run through the module-level
            ``nlp`` pipeline.
        num_sentences: Maximum number of sentences in the summary.
        max_sentence_words: Sentences with this many or more space-separated
            words are never selected.

    Returns:
        The selected sentences joined by spaces, or ``''`` when the text
        contains no scorable words (empty input, only stop words, ...).
    """
    docx = nlp(raw_docx)

    # Keys are lower-cased here because the scoring loop below looks words up
    # lower-cased as well; mixing cased keys with lower-cased lookups silently
    # drops every capitalised word.
    word_frequencies = {}
    for token in docx:
        # Punctuation would otherwise dominate the counts and the maximum.
        if token.is_punct or token.is_space:
            continue
        key = token.text.lower()
        if key in STOP_WORDS:
            continue
        word_frequencies[key] = word_frequencies.get(key, 0) + 1

    # Nothing to score: avoid max() on an empty sequence.
    if not word_frequencies:
        return ''

    maximum_frequency = max(word_frequencies.values())
    word_frequencies = {
        word: count / maximum_frequency
        for word, count in word_frequencies.items()
    }

    # Score sentences in document order so that ties keep that order.
    scored_sentences = []
    for sent in docx.sents:
        if len(sent.text.split(' ')) >= max_sentence_words:
            continue
        weights = [
            word_frequencies[token.text.lower()]
            for token in sent
            if token.text.lower() in word_frequencies
        ]
        # Sentences without any scored word are never candidates.
        if weights:
            scored_sentences.append((sum(weights), sent))

    # Rank on the score only; the sentence objects themselves are not ordered.
    best = nlargest(num_sentences, scored_sentences, key=lambda pair: pair[0])
    return ' '.join(sent.text for _, sent in best)