# untitled.py

# -*- coding: utf-8 -*-
"""Untitled1.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1rBmAc0MEudmEyGv9umsbCngJ56qsrZdy

# **ROHIT BISHT 102053006 2COE2**

> Indented block
"""
import pip
import sys
import os
import math
import subprocess
# Install the third-party dependencies at start-up so the script can run on a
# fresh interpreter.  scikit-learn has to be requested under its real
# distribution name: the "sklearn" name on PyPI is a deprecated placeholder
# that refuses to install.
for _package in ('pandas', 'numpy', 'spacy', 'scikit-learn'):
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', _package])

# Third-party imports are placed after the pip installs above so that freshly
# installed packages can be imported in the same run.
import nltk
# Download the NLTK resources required by the tokenizers, the stop-word list
# and the POS tagger used further down.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords
import csv
from nltk.tag import pos_tag # used by the proper-noun feature
from nltk.tokenize import word_tokenize, sent_tokenize
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
import re
import spacy
from spacy import displacy

# spaCy pipeline used by the named-entity and date features.
# NOTE(review): nothing visible in this script installs the "en_core_web_sm"
# model -- confirm it is provisioned separately.
NER = spacy.load("en_core_web_sm")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE(review): `tfidf` is not referenced anywhere else in the visible script;
# the TF-IDF scores below are computed by hand.
tfidf = TfidfVectorizer()


# Command line: <input_file> <output_file>.  Both arguments are validated up
# front so a missing output path is reported before any expensive processing.
if len(sys.argv) < 3:
    sys.exit("usage: %s <input_file> <output_file>" % sys.argv[0])

# Read the whole input document.  The context manager guarantees the handle
# is closed even when read() raises (e.g. on a decoding error).
filename = sys.argv[1]
with open(filename, encoding="utf8") as f:
    text = f.read()

# Tokenise the document once.  `sent_tokens` drives every per-sentence
# feature below; the word-level lists describe the document as a whole.
sent_tokens = nltk.sent_tokenize(text)
word_tokens = nltk.word_tokenize(text)
word_tokens_lower = [word.lower() for word in word_tokens]
# Kept as a set so each membership test is O(1) rather than a linear scan of
# the stop-word list for every token.
stopWords = set(stopwords.words("english"))
word_tokens_refined = [x for x in word_tokens_lower if x not in stopWords]

# ---------------------- Feature 1 (cue phrases): vocabulary ----------------------
# Earlier, generic cue-phrase list kept for reference:
#QPhrases=["incidentally", "example" ,"anyway","furthermore","according","first","second","then","now","thus","moreover","therefore","hence","lastly","finally","summary"]
# Active vocabulary: dominated by COVID-19 / public-health terms.  Entries are
# stored in mixed case and include punctuation tokens (';', curly quotes, '—').
# NOTE(review): some entries are fused words ('purposesFlattening',
# '2009.Patient', 'CDC.PUI', ...) -- these look like extraction artifacts;
# confirm whether they should be split.
QPhrases=['Acute', 'respiratory', 'stress', 'syndrome', 'ARDS', 'condition', 'oxygen', 'fatal', 'Asymptomatic', 'presenting', 'symptoms', 'disease', 'In', 'case', 'COVID-19', 'means', 'absence', 'fever', 'dry', 'cough', 'sore', 'throat', 'shortness', 'breath', 'body', 'aches', 'among', 'less', 'common', 'Notably', 'recommended', 'individuals', 'get', 'tested', 'unless', 'exhibit', 'risk', 'false', 'negatives', 'words', 'tests', 'accurate', 'present', 'Case', 'fatality', 'rate', 'ratio', 'deaths', 'total', 'number', 'diagnosed', 'Clinical', 'trial', 'research', 'experiments', 'human', 'participants', 'designed', 'answer', 'questions', 'new', 'treatments', ';', 'coronaviruses', 'safety', 'efficacy', 'potential', 'vaccine', 'Community', 'spread', 'contagious', 'geographic', 'area', 'knowledge', 'someone', 'contracted', 'known', 'contact', 'traced', 'infected', 'Confirmed', 'positive', 'contrast', 'presumptive', 'confirmation', 'Centers', 'Disease', 'Control', 'Prevention', 'CDC', 'test', 'individual', 'Contact', 'tracing', 'identifying', 'monitoring', 'people', 'may', 'infectious', 'person', 'usually', 'involves', 'self', 'quarantine', 'effort', 'control', 'Contactless', 'without', 'example', '“', 'contactless', 'delivery', '”', 'would', 'include', 'leaving', 'purchased', 'items', 'entryway', 'home', 'rather', 'handing', 'directly', 'Containment', 'geographical', 'zone', 'limited', 'access', 'contain', 'outbreak', 'Coronavirus', 'family', 'viruses', 'SARS', 'severe', 'acute', 'MERS', 'Middle', 'East', 'well', 'illnesses', 'A', 'CoV', 'typically', 'animals', 'humans', '—', 'event', 'zoonotic', 'transfer', 'named', 'term', 'corona”—Latin', 'crown', 'refers', 'shape', 'virus', 'observed', 'microscopically', 'stands', 'novel', 'coronavirus', '2019', 'year', 'initial', 'detection', 'illness', 'related', 'current', 'pandemic', 'caused', 'CoV-2', '2).Epidemic', 'widespread', 'occurrence', 'community', 'Epidemic', 'curve', 'graph', 'chart', 'depicting', 'progression', 
'particular', 'population', 'Epidemiology', 'branch', 'medicine', 'deals', 'largely', 'public', 'health', 'including', 'incidence', 'distribution', 'analysis', 'diseases', 'Essential', 'business', 'although', 'definition', 'varies', 'cities', 'states', 'based', 'restrictions', 'essential', 'businesses', 'serve', 'critical', 'purpose', 'grocery', 'stores', 'pharmacies', 'waste', 'collection', 'providers', 'gas', 'stations', 'banks', 'transportation', 'agriculture', 'services', 'This', 'contrasts', 'recreational', 'purposesFlattening', 'attempt', 'create', 'gradual', 'uptick', 'cases', 'steep', 'rise', 'avoid', 'overburdening', 'system', 'flattening', 'necessarily', 'decrease', 'projected', 'spreads', 'period', 'time', 'Forehead', 'thermometer', 'device', 'measures', 'temperature', 'hovering', 'near', 'forehead', 'traditional', 'insertion', 'Herd', 'immunity', 'reduction', 'infection', 'within', 'often', 'previous', 'exposure', 'vaccination', 'Hydroxychloroquine', 'oral', 'used', 'treat', 'malaria', 'rheumatoid', 'arthritis', 'lupus', 'Its', 'effectiveness', 'treating', 'patients', 'still', 'question', 'Immune', 'surveillance', 'process', 'immune', '’s', 'activities', 'destruction', 'foreign', 'substances', 'cells', 'tissues', 'Immunosuppressed', 'experiences', 'reduced', 'result', 'conditions', 'People', 'immunosuppressed', 'greater', 'hospitalization', 'sickness', 'Incubation', 'first', 'exposed', 'appearance', 'level', 'contagion', 'arise', 'experts', 'believe', 'begin', 'exhibiting', 'Index', 'documented', 'patient', 'epidemic', 'Interchangeable', 'zero', 'Intensivist', 'physician', 'specializes', 'intensive', 'units', 'Lockdown', 'emergency', 'measure', 'restricted', 'certain', 'areas', 'transmission', 'lockdown', 'encouraged', 'stay', 'National', 'state', 'resulting', 'global', 'threat', 'On', 'March', '13', '2020', 'President', 'Trump', 'issued', 'national', 'concerning', 'allowed', 'loosened', 'requirements', 'hospitals', 'allow', 'respond', 'crisis', 
'Novel', 'strain', 'nCoV', 'never', 'detected', 'Pandemic', 'worldwide', 'larger', 'reach', 'Until', 'last', 'H1N1', 'influenza', '2009.Patient', 'Person', 'physical', 'coughing', 'sneezing', 'via', 'contaminated', 'objects', 'surfaces', 'Physical', 'distancing', 'practice', 'maintaining', 'space', 'oneself', 'others', 'and/or', 'avoiding', 'direct', 'PPE', 'personal', 'protective', 'equipment', 'specialized', 'clothing', 'safeguard', 'hazards', 'airborne', 'particles', 'protect', 'parts', 'normal', 'attire', 'nose', 'mouth', 'eyes', 'hands', 'feet', 'N95', 'respirators', 'considered', 'ideal', 'workers', 'CoV-2.Pre', 'symptomatic', 'yet', 'displaying', 'Presumptive', 'local', 'lab', 'whose', 'results', 'awaiting', 'CDC.PUI', 'investigation', 'PUI', 'suspected', 'potentially', 'COVID-19.Remdesivir', 'investigational', 'antiviral', 'administered', 'intravenously', 'inhibits', 'viral', 'replication', 'It', 'promising', 'treatment', 'developed', 'Ebola', 'Respirator', 'inhaling', 'something', 'hazardous', 'air', 'particulate', 'CoV2', 'fully', 'defined', '2', 'causes', 'COVID-19.Screening', 'act', 'verifying', 'testing', 'Self', 'isolation', 'separating', 'refraining', 'two', 'weeks', 'observe', 'whether', 'Shelter', 'place', 'government', 'shelter', 'asks', 'residents', 'remain', 'leave', 'perform', 'duties', 'deemed', 'slow', 'Social', 'remaining', 'physically', 'apart', 'stem', 'move', 'remote', 'work', 'cancellation', 'events', 'least', 'six', 'away', 'Spanish', 'flu', '1918', 'recent', 'history', 'according', 'estimated', '500', 'million', 'infections', '50', 'genes', 'avian', 'origin', 'Super', 'spreader', 'highly', 'large', 'uninfected', 'network', 'contacts', 'Symptomatic', 'showing', 'Health', 'officials', 'transmitting', 'highest', 'Vaccine', 'biological', 'preparation', 'organisms', 'provides', 'Currently', 'COVID-19.Ventilator', 'machine', 'lungs', 'unable', 'breathe', 'breathing', 'Because', 'cause', 'lower', 'ventilators']
# Feature 1 -- cue phrases: count, per sentence, the tokens listed in
# QPhrases, then scale the counts to [0, 1] by the largest count.
#
# QPhrases stores many entries in mixed case ('ARDS', 'COVID-19', 'CDC', ...),
# so a token is accepted when either its exact form or its lower-cased form
# is listed; testing only the lower-cased form can never match those entries.
cue_lookup = set(QPhrases)  # O(1) membership instead of scanning the list
cue_phrases = {}
for sentence in sent_tokens:
    cue_phrases[sentence] = sum(
        1
        for word in nltk.word_tokenize(sentence)
        if word in cue_lookup or word.lower() in cue_lookup
    )

# default=0 keeps an empty document from raising ValueError in max(); when no
# sentence contains a cue phrase the raw zero counts are left untouched.
maximum_frequency = max(cue_phrases.values(), default=0)
if maximum_frequency:
    for k in cue_phrases:
        cue_phrases[k] = round(cue_phrases[k] / maximum_frequency, 3)

# print([sent_tokens[4]])
# print(cue_phrases[sent_tokens[4]])

# Feature 2 -- numerical data: count, per sentence, the tokens made up
# entirely of digits (str.isdigit), then scale the counts to [0, 1].
numeric_data = {}
for sentence in sent_tokens:
    numeric_data[sentence] = sum(
        1 for word in nltk.word_tokenize(sentence) if word.isdigit()
    )

# default=0 keeps an empty document from raising ValueError in max(); when no
# sentence contains a number the raw zero counts are left untouched.
maximum_frequency = max(numeric_data.values(), default=0)
if maximum_frequency:
    for k in numeric_data:
        numeric_data[k] = round(numeric_data[k] / maximum_frequency, 3)

# Feature 3 -- sentence length: sentences of 10-19 tokens score 1; shorter
# and longer ones lose 0.05 per token of distance from that band.  The score
# is not clamped, so it turns negative beyond 40 tokens.
sent_len_score = {}
for sentence in sent_tokens:
    token_count = len(nltk.word_tokenize(sentence))
    if token_count < 10:
        length_score = 1 - 0.05 * (10 - token_count)
    elif token_count < 20:
        length_score = 1
    else:
        length_score = 1 - 0.05 * (token_count - 20)
    sent_len_score[sentence] = round(length_score, 4)

# Feature 4 -- sentence position: a sentence scores the larger of
# 1/(distance from the start) and 1/(distance from the end), both 1-based,
# so the first and last sentences score 1 and the middle ones score least.
sentence_position = {}
no_of_sent = len(sent_tokens)
for rank, sentence in enumerate(sent_tokens, start=1):
    from_start = 1 / rank
    from_end = 1 / (no_of_sent - rank + 1)
    sentence_position[sentence] = round(max(from_start, from_end), 3)

# Feature 5 -- upper case: count, per sentence, the tokens for which
# str.isupper() is true (all cased characters upper-case), then scale the
# counts to [0, 1] by the largest count.
upper_case = {}
for sentence in sent_tokens:
    upper_case[sentence] = sum(
        1 for token in nltk.word_tokenize(sentence) if token.isupper()
    )

# default=0 keeps an empty document from raising ValueError in max(); when no
# sentence has an upper-case token the raw zero counts are left untouched.
maximum_frequency = max(upper_case.values(), default=0)
if maximum_frequency:
    for k in upper_case:
        upper_case[k] = round(upper_case[k] / maximum_frequency, 3)

# Feature 6 -- proper nouns: count, per sentence, the tokens tagged as
# proper nouns, then scale the counts to [0, 1] by the largest count.
#
# The tagger is fed word_tokenize() output (as every other feature is) so
# punctuation is not glued to the words, and both Penn Treebank proper-noun
# tags are counted: NNP (singular) and NNPS (plural).
PROPER_NOUN_TAGS = ('NNP', 'NNPS')
proper_noun = {}
for sentence in sent_tokens:
    tagged_sent = pos_tag(nltk.word_tokenize(sentence))
    proper_noun[sentence] = sum(
        1 for _word, pos in tagged_sent if pos in PROPER_NOUN_TAGS
    )

# default=0 keeps an empty document from raising ValueError in max(); when no
# sentence has a proper noun the raw zero counts are left untouched.
maximum_frequency = max(proper_noun.values(), default=0)
if maximum_frequency:
    for k in proper_noun:
        proper_noun[k] = round(proper_noun[k] / maximum_frequency, 3)

# Noise penalty: a sentence gets -5 when it contains an e-mail-like string,
# a URL, or a '?' preceded by a run of letters/whitespace; otherwise 0.
# The patterns are compiled once, outside the per-sentence loop.
email_pattern = re.compile(r'([a-zA-Z0-9+._-]+@[a-zA-Z0-9+._-]+\.[a-zA-Z0-9+._-])')
url_pattern = re.compile(r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))")
question_pattern = re.compile(r'\s[A-Za-z\s]*\?')

noise = {}
for sentence in sent_tokens:
    is_noisy = (
        email_pattern.search(sentence)
        or url_pattern.search(sentence)
        or question_pattern.search(sentence)
    )
    noise[sentence] = -5 if is_noisy else 0

# Feature -- named entities: count, per sentence, the entities the spaCy
# pipeline reports with a non-empty label, then scale the counts to [0, 1]
# by the largest count.
named_Entities = {}
for sentence in sent_tokens:
    doc = NER(sentence)
    named_Entities[sentence] = sum(1 for ent in doc.ents if ent.label_ != "")

# default=0 keeps an empty document from raising ValueError in max(); when no
# sentence has an entity the raw zero counts are left untouched.
maxer = max(named_Entities.values(), default=0)
if maxer:
    for k in named_Entities:
        named_Entities[k] = round(named_Entities[k] / maxer, 3)

# Feature -- dates: count, per sentence, the entities the spaCy pipeline
# labels "DATE", then scale the counts to [0, 1] by the largest count.
dates = {}
for sentence in sent_tokens:
    doc = NER(sentence)
    dates[sentence] = sum(1 for ent in doc.ents if ent.label_ == "DATE")

# default=0 keeps an empty document from raising ValueError in max(); when no
# sentence has a date the raw zero counts are left untouched.
maxda = max(dates.values(), default=0)
if maxda:
    for k in dates:
        dates[k] = round(dates[k] / maxda, 3)


# TF-IDF step 1: per sentence, count how often each stem occurs.  Every
# sentence is treated as one "document"; tokens are lower-cased and
# Porter-stemmed, and stop words are NOT filtered out.
total_documents = len(sent_tokens)
ps = PorterStemmer()
stopWords = set(stopwords.words("english"))

frequency_matrix = {}
for sent in sent_tokens:
    stem_counts = {}
    for token in word_tokenize(sent):
        stem = ps.stem(token.lower())
        stem_counts[stem] = stem_counts.get(stem, 0) + 1
    frequency_matrix[sent] = stem_counts
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


# TF-IDF step 2: term frequency = occurrences of a stem in the sentence
# divided by the total number of terms in that sentence.  The divisor is the
# sum of the counts, not the number of distinct stems, so the frequencies of
# one sentence add up to 1.
tf_matrix = {}
for sent, f_table in frequency_matrix.items():
    total_terms = sum(f_table.values())
    tf_matrix[sent] = {
        word: count / total_terms for word, count in f_table.items()
    }

#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# TF-IDF step 3: document frequency -- for every stem, the number of
# sentences whose frequency table contains it.
word_per_doc_table = {}
for stems_in_sentence in frequency_matrix.values():
    for stem in stems_in_sentence:
        word_per_doc_table[stem] = word_per_doc_table.get(stem, 0) + 1


#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# TF-IDF step 4: inverse document frequency,
# log10(number of sentences / number of sentences containing the stem),
# stored per sentence with the same keys as its frequency table.
idf_matrix = {}
for sent, stems_in_sentence in frequency_matrix.items():
    idf_matrix[sent] = {
        stem: math.log10(total_documents / float(word_per_doc_table[stem]))
        for stem in stems_in_sentence
    }

#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>

# TF-IDF step 5: multiply each stem's term frequency by its inverse document
# frequency.  Both matrices share the same sentence and stem keys, so the
# IDF value is looked up by key.
tf_idf_matrix = {}
for sent, tf_table in tf_matrix.items():
    idf_table = idf_matrix[sent]
    tf_idf_matrix[sent] = {
        stem: float(tf_value * idf_table[stem])
        for stem, tf_value in tf_table.items()
    }


#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>


# TF-IDF step 6: score each sentence as the mean TF-IDF weight over its
# distinct stems, then scale the scores to [0, 1] by the largest score.
sentenceValue = {}
for sent, f_table in tf_idf_matrix.items():
    # A sentence with an empty table scores 0 instead of dividing by zero.
    sentenceValue[sent] = sum(f_table.values()) / len(f_table) if f_table else 0

# default=0 keeps an empty document from raising ValueError in max(); when
# every score is 0 (e.g. a single-sentence document, where every IDF is
# log10(1) = 0) the raw scores are left untouched.
maxsv = max(sentenceValue.values(), default=0)
if maxsv:
    for k in sentenceValue:
        sentenceValue[k] = round(sentenceValue[k] / maxsv, 3)


# Combine the features into one score per sentence: the nine unit-weight
# features are added in a fixed order, then the TF-IDF score is added with
# weight 5.
unit_weight_features = (
    cue_phrases,
    numeric_data,
    sent_len_score,
    sentence_position,
    upper_case,
    proper_noun,
    noise,
    named_Entities,
    dates,
)
total_score = {}
for sent in cue_phrases:
    combined = unit_weight_features[0][sent]
    for feature in unit_weight_features[1:]:
        combined = combined + feature[sent]
    total_score[sent] = combined + (sentenceValue[sent] * 5)

# Select the summary: keep, in document order, every sentence whose total
# score exceeds 1.2 times the (truncated) mean score.
sumValues = 0
for score in total_score.values():
    sumValues += score

# An empty document has no scores; use 0 instead of dividing by zero.
# NOTE(review): int() truncates the mean toward zero, which makes the
# threshold coarse (e.g. a mean of 0.9 becomes 0).  Kept as is because it
# determines which sentences are selected -- confirm whether it is intended.
average = int(sumValues / len(total_score)) if total_score else 0
threshold = 1.2 * average

# Each selected sentence is prefixed with a single space; join() builds the
# string in one pass instead of repeated concatenation.
summary = ''.join(
    " " + sentence
    for sentence in sent_tokens
    if sentence in total_score and total_score[sentence] > threshold
)


# Write the summary to the path given as the second command-line argument.
# Mode 'w' already truncates an existing file.  UTF-8 is requested explicitly
# to match the input encoding, so non-ASCII characters (curly quotes, dashes)
# do not fail under a different platform default.  The context manager closes
# the handle even if the write raises.
filename = sys.argv[2]
with open(filename, 'w', encoding="utf8") as f2:
    f2.write(summary)