Merge pull request #32 from noisebridge/pipfile
Update Pipfile
Showing 19 changed files with 403 additions and 77 deletions.
@@ -0,0 +1,8 @@
{
  "recommendations": [
    "charliermarsh.ruff"
  ],
  "unwantedRecommendations": [

  ]
}
@@ -0,0 +1,9 @@
{
  "[python]": {
    "editor.formatOnSave": true,
    "editor.defaultFormatter": "charliermarsh.ruff",
    "editor.codeActionsOnSave": {
      "source.organizeImports": "explicit"
    }
  }
}
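Taken together with the extension recommendation above, these workspace settings make VS Code format Python files with the recommended charliermarsh.ruff extension on every save and run its organize-imports code action, which should mirror the I001 import-sorting rule enabled in the Ruff config added at the end of these changes.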
Large diffs are not rendered by default.
@@ -1 +1 @@
# Configuration settings (e.g., MongoDB URI, paths)
# Configuration settings (e.g., MongoDB URI, paths)
@@ -1 +1 @@
# Scripts to build interaction and feature matrices
# Scripts to build interaction and feature matrices
@@ -1 +1 @@
# Data preprocessing scripts (e.g., feature extraction)
# Data preprocessing scripts (e.g., feature extraction)
@@ -1,45 +1,61 @@
import requests
import csv
import os
import sys
import time

import requests
from tqdm import tqdm
import sys


class WikidataServiceTimeoutException(Exception):
    pass

data_dir = os.path.join(os.path.dirname(__file__), '../../data')
out_dir = os.path.join(os.path.dirname(__file__), '../../out')
user_agent = 'Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>'

data_dir = os.path.join(os.path.dirname(__file__), "../../data")
out_dir = os.path.join(os.path.dirname(__file__), "../../out")
user_agent = "Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>"


# Reading netflix text file
def read_netflix_txt(txt_file, test):
    num_rows = None
    if test == True:
    if test:
        num_rows = 100

    with open(txt_file, "r", encoding = "ISO-8859-1") as netflix_data:
    with open(txt_file, "r", encoding="ISO-8859-1") as netflix_data:
        for i, line in enumerate(netflix_data):
            if num_rows is not None and i >= num_rows:
                break
            yield line.rstrip().split(',', 2)
            yield line.rstrip().split(",", 2)


# Writing netflix csv file
def create_netflix_csv(csv_name, data_list):
    with open(csv_name, 'w') as netflix_csv:
def create_netflix_csv(csv_name, data_list):
    with open(csv_name, "w") as netflix_csv:
        csv.writer(netflix_csv).writerows(data_list)


# Extracting movie info from Wiki data
def wiki_feature_info(data, key):
    if len(data['results']['bindings']) < 1 or key not in data['results']['bindings'][0]:
    if (
        len(data["results"]["bindings"]) < 1
        or key not in data["results"]["bindings"][0]
    ):
        return None
    if key == 'genreLabel':
        return list({d['genreLabel']['value'] for d in data['results']['bindings'] if 'genreLabel' in d})
    return data['results']['bindings'][0][key]['value'].split('/')[-1]
    if key == "genreLabel":
        return list(
            {
                d["genreLabel"]["value"]
                for d in data["results"]["bindings"]
                if "genreLabel" in d
            }
        )
    return data["results"]["bindings"][0][key]["value"].split("/")[-1]


# Formatting SPARQL query for Wiki data
def format_sparql_query(title, year):
    QUERY = '''
    QUERY = """
SELECT * WHERE {
  SERVICE wikibase:mwapi {
    bd:serviceParam wikibase:api "EntitySearch" ;
@@ -83,15 +99,16 @@ def format_sparql_query(title, year):
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
    return QUERY % {'Title': title, 'Year': year}
"""
    return QUERY % {"Title": title, "Year": year}


# Getting list of movie IDs, genre IDs, and director IDs from request
def wiki_query(data_csv, user_agent):
    wiki_movie_ids = []
    wiki_genres = []
    wiki_directors = []

    for row in tqdm(data_csv):
        if row[1] is None:
            continue
@@ -101,61 +118,76 @@ def wiki_query(data_csv, user_agent):
        tries = 0
        while True:
            try:
                response = requests.post('https://query.wikidata.org/sparql',
                    headers={'User-Agent': user_agent},
                    data={
                        'query': SPARQL,
                        'format': 'json',
                    },
                    timeout=20,
                response = requests.post(
                    "https://query.wikidata.org/sparql",
                    headers={"User-Agent": user_agent},
                    data={
                        "query": SPARQL,
                        "format": "json",
                    },
                    timeout=20,
                )
                break
            except requests.exceptions.Timeout:
                wait_time = 2 ** tries * 5
                wait_time = 2**tries * 5
                time.sleep(wait_time)
                tries += 1
                if tries > 5:
                    raise WikidataServiceTimeoutException(
                        f'Tried {tries} time, could not reach Wikidata '
                        f'(movie: {row[2]} {row[1]})'
                        f"Tried {tries} time, could not reach Wikidata "
                        f"(movie: {row[2]} {row[1]})"
                    )

        response.raise_for_status()
        data = response.json()
        wiki_movie_ids.append(wiki_feature_info(data, 'item'))
        wiki_genres.append(wiki_feature_info(data, 'genreLabel'))
        wiki_directors.append(wiki_feature_info(data, 'directorLabel'))

        wiki_movie_ids.append(wiki_feature_info(data, "item"))
        wiki_genres.append(wiki_feature_info(data, "genreLabel"))
        wiki_directors.append(wiki_feature_info(data, "directorLabel"))

    return wiki_movie_ids, wiki_genres, wiki_directors


# Calling all functions
def process_data(test=False):
    missing_count = 0
    processed_data = []

    netflix_data = read_netflix_txt(os.path.join(data_dir, 'movie_titles.txt'), test)
    netflix_data = read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)

    netflix_csv = os.path.join(out_dir, 'movie_titles.csv')
    netflix_csv = os.path.join(out_dir, "movie_titles.csv")

    wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(netflix_data, user_agent)
    wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(
        netflix_data, user_agent
    )

    num_rows = len(wiki_movie_ids_list)

    for index, row in enumerate(netflix_data):
        netflix_id, year, title = row
        if wiki_movie_ids_list[index] is None:
            missing_count += 1
        movie = [netflix_id, wiki_movie_ids_list[index], title, year, wiki_genres_list[index], wiki_directors_list[index]]
        movie = [
            netflix_id,
            wiki_movie_ids_list[index],
            title,
            year,
            wiki_genres_list[index],
            wiki_directors_list[index],
        ]
        processed_data.append(movie)

    create_netflix_csv(netflix_csv, processed_data)

    print(f'missing: {missing_count} ({missing_count / num_rows * 100}%)')
    print(f'found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)')
    print(f'total: {num_rows}')
    print(f"missing: {missing_count} ({missing_count / num_rows * 100}%)")
    print(
        f"found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)"
    )
    print(f"total: {num_rows}")


if __name__ == '__main__':
if __name__ == "__main__":
    # Test is true if no argument is passed or if the first argument is not '--prod'.
    test = len(sys.argv) < 2 or sys.argv[1] != '--prod'
    test = len(sys.argv) < 2 or sys.argv[1] != "--prod"
    process_data(test=test)
    process_data(test=test)
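Two notes on the script above (not part of the diff itself): the __main__ block keeps it in test mode, reading only the first 100 titles, unless its first argument is --prod; and on a Wikidata timeout, wiki_query sleeps 2**tries * 5 seconds and raises WikidataServiceTimeoutException once tries exceeds 5. A minimal, illustrative sketch of that backoff schedule:

# Illustrative only: the backoff schedule used in wiki_query above.
# Each successive timeout sleeps 2**tries * 5 seconds before the exception is raised.
for tries in range(6):
    print(f"timeout #{tries + 1}: sleep {2**tries * 5} s")  # 5, 10, 20, 40, 80, 160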
@@ -1,6 +1,7 @@
from wiki_to_netflix import format_sparql_query, wiki_query, process_data
from wiki_to_netflix import format_sparql_query
from wiki_to_netflix_test_data import EXPECTED_SPARQL_QUERY


def test_format_sparql_query():
    QUERY = format_sparql_query("The Room", 2003)
    assert QUERY == EXPECTED_SPARQL_QUERY
    assert QUERY == EXPECTED_SPARQL_QUERY
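The two imports dropped from this test module (wiki_query and process_data) were unused, which is the kind of issue the newly selected Pyflakes (F) rules report as F401.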
@@ -1 +1 @@
# MongoDB connection setup
# MongoDB connection setup
@@ -1 +1 @@
# Functions to query MongoDB for movies and interactions
# Functions to query MongoDB for movies and interactions
@@ -1,4 +1,4 @@
from mediabridge.data_processing import wiki_to_netflix

q = wiki_to_netflix.format_sparql_query('The Room', 2003)
q = wiki_to_netflix.format_sparql_query("The Room", 2003)
print(q)
@@ -1 +1 @@
# Script to make predictions using the trained model
# Script to make predictions using the trained model
@@ -1 +1 @@
# Script to train the LightFM model
# Script to train the LightFM model
@@ -1 +1 @@
# Utility functions (e.g., for building matrices)
# Utility functions (e.g., for building matrices)
@@ -0,0 +1,2 @@
# Default selections for ruff, plus isort.
lint.select = ["E4", "E7", "E9", "F", "I001"]
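For reference: E4, E7, and E9 are the pycodestyle error groups in Ruff's default selection (import placement, statement issues such as comparisons to True like the one rewritten in the Wikidata script, and runtime/syntax errors), F is the Pyflakes rules (e.g. unused imports), and I001 adds isort-style import sorting, which is presumably what regrouped the imports at the top of that script. An illustrative before/after for I001 (not taken from the commit):

# Before: standard-library and third-party imports interleaved (flagged by I001).
import requests
import csv
import os

# After `ruff check --fix`: standard library first, a blank line, then third-party packages.
import csv
import os

import requests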