Commit

Merge branch 'main' into interaction-matrix-of-user
siddz415 authored Oct 25, 2024
2 parents b5143b0 + a9ad482 commit 3f6bc40
Showing 19 changed files with 345 additions and 2 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,25 @@
name: CI

on: [push, pull_request]

jobs:
  python-tests:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pipenv
          pipenv install --dev
      - name: Run tests
        run: pipenv run pytest
8 changes: 8 additions & 0 deletions .gitignore
@@ -1,4 +1,12 @@
interaction-matrix-of-user
mv_0000001.txt
mv_0000002.txt
movie_titles.txt
myenv/

data
out
.env
.pytest_cache
__pycache__
main
15 changes: 15 additions & 0 deletions Pipfile
@@ -0,0 +1,15 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
requests = "==2.26.0"
python-dotenv = "==1.0.1"
pytest = "==8.3.3"
pytest-cov = "==5.0.0"

[dev-packages]

[requires]
python_version = "3.12"
71 changes: 71 additions & 0 deletions Pipfile.lock

Some generated files are not rendered by default.

23 changes: 21 additions & 2 deletions README.md
@@ -1,3 +1,22 @@
# Noisebridge Python Project
# What is MediaBridge?

https://www.noisebridge.net/wiki/Python_Project_Meetup
MediaBridge is a project being developed at the [Noisebridge](https://github.com/noisebridge) hackerspace in San Francisco, CA, USA. See also the [Noisebridge homepage](https://www.noisebridge.net/wiki/Noisebridge) and the [wiki entry for this project](https://www.noisebridge.net/wiki/Python_Project_Meetup).

MediaBridge is in a _very_ early stage of development. Its intended functionality is to provide recommendations that _bridge_ media types. For example, you might say you're interested in the film _Saw_, and MediaBridge might recommend the video game _Silent Hill_ or a Stephen King book. For now, we are working on simply returning recommendations for movies, based on the [Netflix Prize dataset](https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data).

Currently, we are only accepting contributions from members of the project who meet in person at Noisebridge.

## Development

This code uses Python 3; the `Pipfile` pins the interpreter at Python 3.12.

To install the project dependencies, first install pipenv globally with `pip install pipenv`. Then create a virtual env/install dependencies with `pipenv install`.

To run code in the project, prefix your command with `pipenv run`, e.g. `pipenv run python -m mediabridge.main`.

## Testing

To run unit tests,

1. Ensure `pipenv` is installed
2. Run `pipenv run pytest`
1 change: 1 addition & 0 deletions mediabridge/config/setting.py
@@ -0,0 +1 @@
# Configuration settings (e.g., MongoDB URI, paths)
1 change: 1 addition & 0 deletions mediabridge/data_processing/build_matrices.py
@@ -0,0 +1 @@
# Scripts to build interaction and feature matrices
1 change: 1 addition & 0 deletions mediabridge/data_processing/credentials
@@ -0,0 +1 @@
user_agent = "INSERT UNIQUE CREDENTIALS" #put unique credentials here
1 change: 1 addition & 0 deletions mediabridge/data_processing/preprocess.py
@@ -0,0 +1 @@
# Data preprocessing scripts (e.g., feature extraction)
139 changes: 139 additions & 0 deletions mediabridge/data_processing/wiki_to_netflix.py
@@ -0,0 +1,139 @@
import requests
import csv
import os

data_dir = os.path.join(os.path.dirname(__file__), '../../data')
out_dir = os.path.join(os.path.dirname(__file__), '../../out')
user_agent = 'Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>'

# Reading netflix text file
def read_netflix_txt(txt_file, test):
    num_rows = None
    if test:
        num_rows = 100

    with open(txt_file, 'r', encoding='ISO-8859-1') as netflix_data:
        for i, line in enumerate(netflix_data):
            if num_rows is not None and i >= num_rows:
                break
            yield line.rstrip().split(',', 2)

# Writing netflix csv file
def create_netflix_csv(csv_name, data_list):
    with open(csv_name, 'w', newline='') as netflix_csv:  # newline='' keeps the csv module from writing blank rows on Windows
        csv.writer(netflix_csv).writerows(data_list)

# Extracting movie info from Wiki data
def wiki_feature_info(data, key):
    if len(data['results']['bindings']) < 1 or key not in data['results']['bindings'][0]:
        return None
    if key == 'genreLabel':
        return list({d['genreLabel']['value'] for d in data['results']['bindings'] if 'genreLabel' in d})
    return data['results']['bindings'][0][key]['value'].split('/')[-1]

# Formatting SPARQL query for Wiki data
def format_sparql_query(title, year):
    QUERY = '''
SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" ;
wikibase:endpoint "www.wikidata.org" ;
mwapi:search "%(Title)s" ;
mwapi:language "en" .
?item wikibase:apiOutputItem mwapi:item .
}
?item wdt:P31/wdt:P279* wd:Q11424 .
{
# Get US release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
?releaseDateStatement pq:P291 wd:Q30 .
}
UNION
{
# Get unspecified release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
FILTER NOT EXISTS { ?releaseDateStatement pq:P291 ?country }
}
FILTER (YEAR(?releaseDate) = %(Year)d) .
?item rdfs:label ?itemLabel .
FILTER (lang(?itemLabel) = "en") .
OPTIONAL {
?item wdt:P136 ?genre .
?genre rdfs:label ?genreLabel .
FILTER (lang(?genreLabel) = "en") .
}
OPTIONAL {?item wdt:P57 ?director.
?director rdfs:label ?directorLabel.
FILTER (lang(?directorLabel) = "en")}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
    return QUERY % {'Title': title, 'Year': year}

# Getting list of movie IDs, genre IDs, and director IDs from request
def wiki_query(data_csv, user_agent):
    wiki_movie_ids = []
    wiki_genres = []
    wiki_directors = []

    for row in data_csv:
        if row[1] is None:
            continue

        SPARQL = format_sparql_query(row[2], int(row[1]))

        response = requests.post(
            'https://query.wikidata.org/sparql',
            headers={'User-Agent': user_agent},
            data={
                'query': SPARQL,
                'format': 'json',
            },
        )
        response.raise_for_status()

        data = response.json()

        wiki_movie_ids.append(wiki_feature_info(data, 'item'))
        wiki_genres.append(wiki_feature_info(data, 'genreLabel'))
        wiki_directors.append(wiki_feature_info(data, 'directorLabel'))

    return wiki_movie_ids, wiki_genres, wiki_directors

# Calling all functions
def process_data(test=False):
    missing_count = 0
    processed_data = []

    # Materialize the generator so it can be iterated twice:
    # once by wiki_query() and once by the enumerate() loop below.
    netflix_data = list(read_netflix_txt(os.path.join(data_dir, 'movie_titles.txt'), test))

    netflix_csv = os.path.join(out_dir, 'movie_titles.csv')

    wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(netflix_data, user_agent)

    num_rows = len(wiki_movie_ids_list)

    for index, row in enumerate(netflix_data):
        netflix_id, year, title = row
        if wiki_movie_ids_list[index] is None:
            missing_count += 1
        movie = [netflix_id, wiki_movie_ids_list[index], title, year, wiki_genres_list[index], wiki_directors_list[index]]
        processed_data.append(movie)

    create_netflix_csv(netflix_csv, processed_data)

    print(f'missing: {missing_count} ({missing_count / num_rows * 100}%)')
    print(f'found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)')
    print(f'total: {num_rows}')

if __name__ == '__main__':
    process_data(True)
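
For context, wiki_feature_info expects the standard Wikidata SPARQL JSON shape, a dict of the form {'results': {'bindings': [...]}}. Below is a minimal sketch of how it digests such a payload; the binding values are invented for illustration, and the import assumes the module is importable the same way the test file below imports it.

from wiki_to_netflix import wiki_feature_info

# Trimmed-down stand-in for response.json(): each binding maps SPARQL
# variable names to {'type': ..., 'value': ...} dicts.
fake_data = {
    'results': {
        'bindings': [
            {
                'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q123'},
                'genreLabel': {'type': 'literal', 'value': 'drama film'},
            },
        ]
    }
}

print(wiki_feature_info(fake_data, 'item'))           # 'Q123' (last segment of the entity URI)
print(wiki_feature_info(fake_data, 'genreLabel'))     # ['drama film'] (deduplicated list)
print(wiki_feature_info(fake_data, 'directorLabel'))  # None (key absent from the first binding)
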
6 changes: 6 additions & 0 deletions mediabridge/data_processing/wiki_to_netflix_test.py
@@ -0,0 +1,6 @@
from wiki_to_netflix import format_sparql_query, wiki_query, process_data
from wiki_to_netflix_test_data import EXPECTED_SPARQL_QUERY

def test_format_sparql_query():
    QUERY = format_sparql_query("The Room", 2003)
    assert QUERY == EXPECTED_SPARQL_QUERY
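
A natural companion test, sketched here as an illustration only, would exercise wiki_query without network access by replacing requests.post through pytest's monkeypatch fixture; the payload, row, and user agent string below are invented.

import wiki_to_netflix


class FakeResponse:
    """Minimal stand-in for requests.Response."""

    def __init__(self, payload):
        self._payload = payload

    def raise_for_status(self):
        pass  # pretend the HTTP request succeeded

    def json(self):
        return self._payload


def test_wiki_query_offline(monkeypatch):
    payload = {'results': {'bindings': [
        {'item': {'value': 'http://www.wikidata.org/entity/Q123'}},
    ]}}
    monkeypatch.setattr(wiki_to_netflix.requests, 'post',
                        lambda *args, **kwargs: FakeResponse(payload))

    rows = [['1', '2003', 'The Room']]
    movie_ids, genres, directors = wiki_to_netflix.wiki_query(rows, 'test-agent')

    assert movie_ids == ['Q123']
    assert genres == [None]
    assert directors == [None]
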
45 changes: 45 additions & 0 deletions mediabridge/data_processing/wiki_to_netflix_test_data.py
@@ -0,0 +1,45 @@
EXPECTED_SPARQL_QUERY = '''
SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" ;
wikibase:endpoint "www.wikidata.org" ;
mwapi:search "The Room" ;
mwapi:language "en" .
?item wikibase:apiOutputItem mwapi:item .
}
?item wdt:P31/wdt:P279* wd:Q11424 .
{
# Get US release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
?releaseDateStatement pq:P291 wd:Q30 .
}
UNION
{
# Get unspecified release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
FILTER NOT EXISTS { ?releaseDateStatement pq:P291 ?country }
}
FILTER (YEAR(?releaseDate) = 2003) .
?item rdfs:label ?itemLabel .
FILTER (lang(?itemLabel) = "en") .
OPTIONAL {
?item wdt:P136 ?genre .
?genre rdfs:label ?genreLabel .
FILTER (lang(?genreLabel) = "en") .
}
OPTIONAL {?item wdt:P57 ?director.
?director rdfs:label ?directorLabel.
FILTER (lang(?directorLabel) = "en")}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
1 change: 1 addition & 0 deletions mediabridge/db/connect.py
@@ -0,0 +1 @@
# MongoDB connection setup
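
As a rough sketch of what this stub might become, assuming pymongo were added as a dependency and a MONGO_URI variable were defined in the gitignored .env file (both are assumptions, not yet part of the project):

import os

from dotenv import load_dotenv   # python-dotenv is already pinned in the Pipfile
from pymongo import MongoClient  # assumption: pymongo is not yet a declared dependency


def get_db(name='mediabridge'):
    """Return a MongoDB database handle, reading the connection URI from the environment."""
    load_dotenv()  # pick up the gitignored .env file
    uri = os.environ.get('MONGO_URI', 'mongodb://localhost:27017')  # MONGO_URI is an assumed name
    return MongoClient(uri)[name]
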
1 change: 1 addition & 0 deletions mediabridge/db/queries.py
@@ -0,0 +1 @@
# Functions to query MongoDB for movies and interactions
4 changes: 4 additions & 0 deletions mediabridge/main.py
@@ -0,0 +1,4 @@
from mediabridge.data_processing import wiki_to_netflix

q = wiki_to_netflix.format_sparql_query('The Room', 2003)
print(q)
1 change: 1 addition & 0 deletions mediabridge/models/predict.py
@@ -0,0 +1 @@
# Script to make predictions using the trained model
1 change: 1 addition & 0 deletions mediabridge/models/train_model.py
@@ -0,0 +1 @@
# Script to train the LightFM model
1 change: 1 addition & 0 deletions mediabridge/models/utils.py
@@ -0,0 +1 @@
# Utility functions (e.g., for building matrices)
2 changes: 2 additions & 0 deletions pytest.ini
@@ -0,0 +1,2 @@
[pytest]
python_files = *_test.py
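
This overrides pytest's default test_*.py discovery pattern so that files such as wiki_to_netflix_test.py above are collected. Any new test file only needs to follow the same suffix convention, for example (hypothetical filename and contents):

# example_test.py
def test_sanity():
    assert 1 + 1 == 2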
