-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathweb scraping.py
136 lines (71 loc) · 2.26 KB
/
web scraping.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python
# coding: utf-8
# In[4]:
from newsapi import NewsApiClient
newsapi = NewsApiClient(api_key='862c6dd645dd4a78ab20df9e09bc6705')
news_sources = newsapi.get_sources()
for source in news_sources['sources']:
print(source['name']) #prints the sources we are gonnna get our news from
data = newsapi.get_everything(q='Salman Khan',language='en',)
print(data)
#print(type(data))
#print(data.keys())
#print(type(data['articles']))
print(data['articles'][0])
for article in data['articles']:
print('Title : ',article['title'])
print('Description : ',article['description'],'\n\n')
print('publised at :' ,article['publishedAt'])
def get_data(keyword):
news = []
allarticles = newsapi.get_everything(
q=keyword,
language='en',
)
articles = allarticles["articles"]
for i in articles:
article = {'title':i["title"],'description' : i["description"],'link':i['url'],'published':i['publishedAt']}
news.append(article)
return news
data = get_data('salman khan')
data
date_time = data[0]["published"]
date_time
import datetime
format = "%Y-%m-%dT%I%M53Z"
from dateutil import parser
DT = parser.parse(date_time)
print(DT)
import pandas as pd
df = pd.DataFrame(data)
df
get_ipython().system('pip install bs4')
get_ipython().system('pip install beautifulsoup4')
from bs4 import BeautifulSoup
get_ipython().system('pip install requests')
get_ipython().system('pip install html5lib')
get_ipython().system('pip install nltk')
import requests
import re
url= 'https://indianexpress.com/article/entertainment/web-series/johnny-lever-and-saurabh-shukla-parody-pathaan-scene-ft-shah-rukh-salman-khan-kapil-sharma-diss-8475319/'
#for i in df:
r= requests.get(url)
htmlcontent = r.content
soup= BeautifulSoup(htmlcontent, 'html.parser')
soup.prettify
print(soup.find_all('p'))
# In[98]:
for element in soup.find_all('p'):
final_data= element.get_text()
print(final_data)
# In[88]:
import spacy
from spacy.lang.en.examples import sentences
nlp = spacy.load("en_core_web_sm")
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
get_ipython().system('pip install -U spacy')
get_ipython().system('python -m spacy download en_core_web_sm')
# In[84]:
nlp = spacy.load("en_core_web_sm")