-
Notifications
You must be signed in to change notification settings - Fork 0
/
code2.R
54 lines (44 loc) · 1.68 KB
/
code2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
library(tidyverse)
library(jsonlite)
# ---- Load and reshape the movie data ----
# Read the raw movie table. The `genres` column holds JSON text that is
# parsed per row below.
movie <- read.csv("dataset/tmdb_5000_movies.csv", header = TRUE)

# Expand the table to one row per (movie, genre) pair and keep only the
# columns used by the plots.
movie <- movie %>%
  # Move the movie's own `id` out of the way by name (not by column position)
  # so it cannot collide with columns produced by unnesting the parsed JSON.
  # rename() fails loudly if the column is missing.
  # NOTE(review): assumes the 4th CSV column, previously renamed by index,
  # is called `id` -- confirm against the dataset header.
  rename(movie_id = id) %>%
  mutate(genres = as.character(genres)) %>%
  # Strings of two characters or fewer (e.g. "[]") carry no genre entries.
  filter(nchar(genres) > 2) %>%
  # Parse each JSON string; every element of `js` is one movie's genre table.
  mutate(js = lapply(genres, fromJSON)) %>%
  unnest(js) %>%
  # Restore the movie identifier as `id` and expose the parsed `name` field
  # as `genre`.
  select(
    id = movie_id, original_title, genre = name,
    runtime, vote_average, vote_count, title, budget
  )

# Keep movies with a positive runtime and at least 10 votes.
movie <- subset(movie, runtime > 0 & vote_count > 9)
# Exclude the "Foreign" and "TV Movie" genres from the analysis.
movie <- subset(movie, !(genre %in% c("Foreign", "TV Movie")))
## RATING vs GENRE
# Horizontal boxplots of the average rating, one box per genre.
# The y-axis limits are the genres sorted in decreasing (reverse alphabetical)
# order -- presumably so the axis reads alphabetically from the top.
ggplot(movie, aes(y = genre, x = vote_average, fill = genre)) +
  geom_boxplot(
    alpha = 0.7,
    outlier.colour = "black", outlier.shape = 20
  ) +
  xlab("Avg rating (out of 10)") +
  scale_y_discrete(
    limits = sort(unique(movie$genre), decreasing = TRUE),
    name = "Genre"
  ) +
  ggtitle("Boxplot of avg. rating by Genre") +
  # Centre the plot title.
  theme(plot.title = element_text(hjust = 0.5))

# Density of the average rating, drawn in a separate facet for each genre.
ggplot(movie, aes(x = vote_average, fill = genre)) +
  geom_density() +
  facet_wrap(~genre) +
  ggtitle("Probability density plot for avg. RATING by different GENREs")
## BUDGET vs GENRE
# Keep only rows whose budget exceeds 100,000, then rescale the budget to
# millions so the axis matches its "million USD" label.
budget_movie <- subset(movie, budget > 100000)
budget_movie$budget <- budget_movie$budget / 1000000

# Horizontal boxplots of budget, one box per genre.
# NOTE(review): the axis limits are taken from `movie`, not `budget_movie`,
# so a genre with no row above the budget cutoff would still get an (empty)
# row on the axis -- confirm this is intended.
ggplot(budget_movie, aes(y = genre, x = budget, fill = genre)) +
  geom_boxplot(
    alpha = 0.7,
    outlier.colour = "#1F3552", outlier.shape = 20
  ) +
  xlab("Budget (million USD)") +
  scale_y_discrete(
    limits = sort(unique(movie$genre), decreasing = TRUE),
    name = "Genre"
  ) +
  ggtitle("Boxplot of Budget by Genre")