-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathimdb_data.py
36 lines (30 loc) · 1.19 KB
/
imdb_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
import datetime
import math
import pandas as pd
def get_imdb_data(csv_path: str = "ratings.csv") -> pd.DataFrame:
    """Load an IMDb ratings export and add the derived analysis columns.

    Only rows whose ``Title Type`` equals ``"Movie"`` are kept; the
    ``Title Type`` column itself is dropped afterwards.

    Args:
        csv_path: Location of the ratings CSV, handed to ``pandas.read_csv``.
            Defaults to ``"ratings.csv"`` (relative to the working directory).

    Returns:
        A DataFrame in which ``Release Date`` and ``Date Rated`` are parsed
        to datetimes, ``Year`` is coerced to numeric (unparseable values
        become NaN), and these columns are added:

        * ``Diff in ratings`` - ``IMDb Rating`` minus ``Your Rating``,
          rounded to one decimal place.
        * ``Link`` - an HTML anchor built from ``URL`` and ``Title``.
        * ``Decade`` - categorical bucket such as ``"1990's"``.
        * ``genre_list`` - ``Genres`` split on ``", "``.
        * ``Days not rated`` - whole days between ``Release Date`` and
          ``Date Rated``.

    Raises:
        FileNotFoundError: If ``csv_path`` does not exist (from pandas).
        KeyError: If one of the expected columns is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Keep movies only; the filter column is then constant, so drop it.
    # The trailing copy() keeps later column assignments off a view.
    df = df[df["Title Type"] == "Movie"].drop(columns="Title Type").copy()

    df["Release Date"] = pd.to_datetime(df["Release Date"])
    df["Date Rated"] = pd.to_datetime(df["Date Rated"])
    df["Diff in ratings"] = (df["IMDb Rating"] - df["Your Rating"]).round(1)

    # The attribute value must be wrapped in plain ASCII quotes and the
    # element must be closed, otherwise the markup is not a usable link.
    df["Link"] = (
        '<a href="'
        + df["URL"].astype(str)
        + '">'
        + df["Title"].astype(str)
        + "</a>"
    )

    df["Year"] = pd.to_numeric(df["Year"], errors="coerce")
    known_years = df["Year"].dropna()
    current_year = datetime.date.today().year
    if known_years.empty:
        # No usable year at all: fall back to a fixed starting decade.
        min_year = 1900
        max_year = current_year
    else:
        min_year = int(known_years.min())
        # Cover the data as well as today, so a year beyond the current
        # decade still gets a bucket and the edge list is never empty.
        max_year = max(current_year, int(known_years.max()))

    # Edges are decade starts; "+ 11" guarantees one edge past max_year.
    decade_edges = list(
        range(math.floor(min_year / 10) * 10, max_year + 11, 10)
    )
    # One label per interval, i.e. every edge except the last.
    decade_labels = [str(start) + "'s" for start in decade_edges[:-1]]
    df["Decade"] = pd.cut(
        df["Year"],
        bins=decade_edges,
        labels=decade_labels,
        include_lowest=True,
        right=False,  # intervals are [start, start + 10)
    )

    df["genre_list"] = df["Genres"].str.split(", ")
    df["Days not rated"] = (df["Date Rated"] - df["Release Date"]).dt.days
    return df