imdb_scraper.py
# -*- coding: utf-8 -*-
import urllib.request as req
from urllib.error import HTTPError

from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

title_list = []     # scraped show/movie titles
combine_list2 = []  # one row per title: rank, title, year, genre, IMDb rating
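
# Builds a DataFrame from the scraped IMDb rows, attaches the Rotten Tomatoes
# score columns and writes everything to output.xlsx.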
def writeToExcel(data):
    # rows from getTopThrillers carry five fields: rank, title, year, genre, IMDb rating
    df = pd.DataFrame(data, columns=['Rank', 'Title', 'Year', 'Genre', 'IMDB rating'])
    df['tomato_meter'] = tomato_meter
    df['audience_rotten_tomatoes'] = audi_score
    df.to_excel("output.xlsx")
    print(".... written to excel")

# Fetches the raw page source for the given request URL.
def getResponse(reqURL):
    print("in getResponse()")
    response = req.urlopen(reqURL)  # open the URL
    data = response.read()          # read the whole page source
    return data
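
# Parses one page of IMDb search results: collects the rank, title, year, genre
# and IMDb rating of every lister item and appends one row per title to
# combine_list2.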
def getTopThrillers(url):
    print(url)
    print("in getTopThrillers()")
    data = getResponse(url)
    soup = BeautifulSoup(data, 'html.parser')
    #print(soup.prettify())  # displays the page source in readable form

    # Grab the IMDb rating for every lister item; titles still "Filming" have no rating yet.
    movie_name_list = []
    ratings_list = []
    for show in soup.find_all('div', {'class': 'lister-item-content'}):
        if "Filming" not in str(show):
            for rating_div in show.find_all('div', {'class': 'inline-block ratings-imdb-rating'}):
                ratings_list.append(rating_div.text.strip().split("\n"))
        else:
            print("no rating yet (still filming)")
            ratings_list.append("-")

    # The <h3> header text is "<rank>.\n<title>\n<year>".
    for show in soup.find_all('h3', {'class': 'lister-item-header'}):
        movie_name_list.append(show.text.strip().split("\n")[1])
        title_list.append(show.text.strip().split("\n")[1])
    rows = [show.text.strip().split("\n")[0] for show in soup.find_all('h3', {'class': 'lister-item-header'})]
    year_list = [show.text.strip().split("\n")[2] for show in soup.find_all('h3', {'class': 'lister-item-header'})]
    genre_list = [show.text.strip().split("\n") for show in soup.find_all('span', {'class': 'genre'})]

    # One row per title: rank, title, year, genre, IMDb rating.
    combine_list1 = []
    for row, m, y, g, r in zip(rows, movie_name_list, year_list, genre_list, ratings_list):
        combine_list1.extend((row.strip(), m.strip(' '), y.strip(' '),
                              str(g).strip('[]').replace("'", ""),
                              str(r).strip('[]').replace("'", "")))
        combine_list2.append(combine_list1)
        combine_list1 = []
    for ele in combine_list2:
        print(ele)
    # writeToExcel(combine_list2)
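
# Rotten Tomatoes URLs below are built straight from the title, so titles with
# non-ASCII characters are skipped.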
def is_ascii(s):
    return all(ord(c) < 128 for c in s)
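
# Fetches the Rotten Tomatoes page for a title: tries the TV path first and, on
# a 404, falls back to the movie path; returns "-" when no page is found.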
def getResponse_rotten_tomato(url, title):
    print("in getResponse_rotten_tomato")
    if not is_ascii(title):
        return "-"
    try:
        final_url = url + "tv/" + title   # try the TV-series path first
        print(final_url)
        response_tv = req.urlopen(final_url)
        return response_tv.read()
    except HTTPError as e:
        if e.code == 404:
            # not a TV show: retry under the movie path
            try:
                final_url = url + "m/" + title
                print(final_url)
                response_m = req.urlopen(final_url)
                data = response_m.read()
                if len(data) != 0:
                    return data
                else:
                    return "-"
            except HTTPError:
                return "-"
        # any other HTTP error: treat it as "no page found"
        return "-"

tomato_meter = []
audi_score = []
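
# Parses a Rotten Tomatoes page and records the Tomatometer and audience score,
# using "-" for whichever score is missing.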
def getRottenTomatoesRating(data):
    print("in getRottenTomatoesRating()")
    soup = BeautifulSoup(data, 'html.parser')
    rating = [percent.text.strip() for percent in soup.find_all('span', {'class': 'mop-ratings-wrap__percentage'})]
    print(rating)
    if len(rating) == 2:
        tomato_meter.append(rating[0])
        audi_score.append(rating[1])
    elif len(rating) == 1:
        tomato_meter.append("-")
        audi_score.append(rating[0])
    else:
        tomato_meter.append("-")
        audi_score.append("-")
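
# Scrapes the first page of IMDb's thriller search results. The Rotten Tomatoes
# lookup over those titles is kept below, commented out, together with cached
# results from an earlier run.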
def main():
    page = [i for i in range(1, 50, 50)]  # start offsets; this yields just [1], i.e. only the first results page
    print(page)
    for pg in page:
        base_url = "https://www.imdb.com/search/title/?genres=thriller&start=" + str(pg) + "&ref_=adv_nxt"
        getTopThrillers(base_url)

#title_list = ['Ozark', 'Élite', 'La Casa de Papel', 'El hoyo', 'The Walking Dead', 'Contagion', 'Bad Boys for Life', 'Killing Eve', 'The Invisible Man', 'Underwater']
#base_url = "https://www.rottentomatoes.com/search?search="
# combine_list2 = [['Ozark', '(2017– )', 'Crime, Drama, Thriller', '8.4'], ['La Casa de Papel', '(2017– )', 'Action, Crime, Mystery', '8.5'], ['El hoyo', '(2019)', 'Horror, Sci-Fi, Thriller', '7.0'], ['The Walking Dead', '(2010– )', 'Drama, Horror, Thriller', '8.2'], ['Contagion', '(2011)', 'Action, Drama, Thriller', '6.7'], ['Bad Boys for Life', '(2020)', 'Action, Comedy, Crime', '6.8'], ['Killing Eve', '(2018– )', 'Action, Adventure, Drama', '8.3'], ['The Invisible Man', '(2020)', 'Horror, Mystery, Sci-Fi', '7.2'], ['Underwater', '(2020)', 'Action, Drama, Horror', '5.9'], ['Knives Out', '(2019)', 'Comedy, Crime, Drama', '8.0'], ['Gisaengchung', '(2019)', 'Comedy, Drama, Thriller', '8.6'], ['Homeland', '(2011–2020)', 'Crime, Drama, Mystery', '8.3'], ['Breaking Bad', '(2008–2013)', 'Crime, Drama, Thriller', '9.5'], ['The Hunt', '(II) (2020)', 'Action, Horror, Thriller', '6.4'], ['Supernatural', '(2005– )', 'Drama, Fantasy, Horror', '8.4'], ['The Blacklist', '(2013– )', 'Crime, Drama, Mystery', '8.0'], ['Élite', '(2018– )', 'Crime, Drama, Thriller', '7.6'], ['Stranger Things', '(2016– )', 'Drama, Fantasy, Horror', '8.8'], ['Altered Carbon', '(2018– )', 'Action, Drama, Sci-Fi', '8.1'], ['Black Mirror', '(2011– )', 'Drama, Sci-Fi, Thriller', '8.8'], ['The Outsider', '(2020– )', 'Crime, Drama, Mystery', '8.0'], ['Spenser Confidential', '(2020)', 'Action, Comedy, Crime', '6.2'], ['Uncut Gems', '(2019)', 'Crime, Drama, Thriller', '7.5'], ['Criminal Minds', '(2005–2020)', 'Crime, Drama, Mystery', '8.1'], ['Devs', '(2020– )', 'Drama, Mystery, Sci-Fi', '7.9'], ['Joker', '(2019)', 'Crime, Drama, Thriller', '8.5'], ['Asur: Welcome to Your Dark Side', '(2020– )', 'Crime, Drama, Thriller', '8.6'], ['Law & Order: Special Victims Unit', '(1999– )', 'Crime, Drama, Mystery', '8.0'], ['Locke & Key', '(2020– )', 'Drama, Fantasy, Horror', '7.4'], ['Midsommar', '(2019)', 'Drama, Horror, Mystery', '7.1'], ['Fargo', '(2014– )', 'Crime, Drama, Thriller', '8.9'], ['The Wire', '(2002–2008)', 'Crime, Drama, Thriller', '9.3'], ['The Sinner', '(2017– )', 'Crime, Drama, Mystery', '8.0'], ['American Horror Story', '(2011– )', 'Drama, Horror, Thriller', '8.1'], ['Dark', '(2017– )', 'Crime, Drama, Mystery', '8.7'], ['You', '(2018– )', 'Crime, Drama, Romance', '7.8'], ['NCIS: Naval Criminal Investigative Service', '(2003– )', 'Action, Crime, Drama', '7.8'], ["The Handmaid's Tale", '(2017– )', 'Drama, Sci-Fi, Thriller', '8.5'], ['The Stranger', '(I) (2020– )', 'Mystery, Thriller', '7.4'], ['Toy Boy', '(2019)', 'Crime, Drama, Thriller', '6.5'], ['Hogar', '(2020)', 'Drama, Thriller', '6.4'], ['Defending Jacob', '(2020– )', 'Mystery, Thriller', '8.1'], ['Chicago P.D.', '(2014– )', 'Action, Crime, Drama', '8.1'], ['How to Get Away with Murder', '(2014– )', 'Crime, Drama, Mystery', '9.4'], ['Chernobyl', '(2019)', 'Drama, History, Thriller', '8.3'], ['Prison Break', '(2005–2017)', 'Action, Crime, Drama', '9.0'], ['True Detective', '(2014– )', 'Crime, Drama, Mystery', '8.3'], ['Kingdom', '(2019– )', 'Action, Drama, Horror', '9.1'], ['Sherlock', '(2010– )', 'Crime, Drama, Mystery', '8.8']]
# base_url = "https://www.rottentomatoes.com/"
# titles = np.array(combine_list2)
# for title in titles[:,0]:
# print()
# print(title)
# #title = str(title.replace(" ","_").encode('utf-8'))
# title = title.replace(" ","_")
# ret = getResponse_rotten_tomato(base_url,title)
# #print(type(ret))
# if not isinstance(ret,type(None)):
# getRottenTomatoesRating(ret)
# else:
# tomato_meter.append("-")
# audi_score.append("-")
# # print(tomato_meter)
# # print(audi_score)
# # print(combine_list2)
# # tomato_meter = ['81%', '90%', '-', '81%', '-', '77%', '87%', '50%', '-', '97%', '-', '85%', '96%', '-', '93%', '91%', '-', '93%', '76%', '83%', '82%', '39%', '92%', '-', '82%', '-', '-', '-', '-', '83%', '96%', '94%', '95%', '-', '94%', '90%', '-', '-', '-', '-', '-', '-', '-', '90%', '96%', '60%', '78%', '80%', '78%', '89%']
# # audi_score = ['92%', '84%', '-', '78%', '-', '96%', '89%', '50%', '15%', '92%', '-', '86%', '98%', '100%', '86%', '89%', '-', '91%', '63%', '81%', '85%', '57%', '52%', '87%', '68%', '33%', '-', '-', '-', '63%', '93%', '97%', '75%', '76%', '94%', '81%', '-', '-', '83%', '40%', '100%', '-', '-', '86%', '98%', '77%', '78%', '89%', '83%', '94%']
# print(len(tomato_meter))
# print(len(audi_score))
    print(len(combine_list2))
    # writeToExcel(combine_list2)


if __name__ == "__main__":
    main()
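
# To run (assumes beautifulsoup4 and pandas are installed; an Excel writer such
# as openpyxl is only needed if the writeToExcel call is re-enabled):
#   python imdb_scraper.py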