Commit

Merge branch 'main' into interaction-matrix-of-user
siddz415 authored Oct 25, 2024
2 parents b5143b0 + a9ad482 commit 3f6bc40
Showing 19 changed files with 345 additions and 2 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/ci.yml
@@ -0,0 +1,25 @@
name: CI

on: [push, pull_request]

jobs:
  python-tests:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.x'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install pipenv
          pipenv install --dev
      - name: Run tests
        run: pipenv run pytest
8 changes: 8 additions & 0 deletions .gitignore
@@ -1,4 +1,12 @@
interaction-matrix-of-user
mv_0000001.txt
mv_0000002.txt
movie_titles.txt
myenv/

data
out
.env
.pytest_cache
__pycache__
main
15 changes: 15 additions & 0 deletions Pipfile
@@ -0,0 +1,15 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
requests = "==2.26.0"
python-dotenv = "==1.0.1"
pytest = "==8.3.3"
pytest-cov = "==5.0.0"

[dev-packages]

[requires]
python_version = "3.12"
71 changes: 71 additions & 0 deletions Pipfile.lock

Some generated files are not rendered by default.

23 changes: 21 additions & 2 deletions README.md
@@ -1,3 +1,22 @@
# Noisebridge Python Project
# What is MediaBridge?

https://www.noisebridge.net/wiki/Python_Project_Meetup
MediaBridge is a project being developed at the [Noisebridge](https://github.com/noisebridge) hackerspace in San Francisco, CA, USA. See also the [Noisebridge homepage](https://www.noisebridge.net/wiki/Noisebridge) and the [wiki entry for this project](https://www.noisebridge.net/wiki/Python_Project_Meetup).

MediaBridge is in a _very_ early stage of development. Its intended functionality is to provide recommendations that _bridge_ media types. For example, you might say you're interested in the film _Saw_, and MediaBridge might recommend the video game _Silent Hill_ or a Stephen King book. For now, we are working on simply returning recommendations for movies, based on the [Netflix Prize dataset](https://www.kaggle.com/datasets/netflix-inc/netflix-prize-data).

Currently, we are only accepting contributions from members of the project who meet in person at Noisebridge.

## Development

This code uses Python 3; the `Pipfile` pins the interpreter at Python 3.12.

To install the project dependencies, first install pipenv globally with `pip install pipenv`. Then create a virtual env/install dependencies with `pipenv install`.

To run code in the project, prefix your command with `pipenv run`, e.g. `pipenv run python -m mediabridge.main`.

## Testing

To run unit tests,

1. Ensure `pipenv` is installed
2. Run `pipenv run pytest`
1 change: 1 addition & 0 deletions mediabridge/config/setting.py
@@ -0,0 +1 @@
# Configuration settings (e.g., MongoDB URI, paths)
1 change: 1 addition & 0 deletions mediabridge/data_processing/build_matrices.py
@@ -0,0 +1 @@
# Scripts to build interaction and feature matrices
1 change: 1 addition & 0 deletions mediabridge/data_processing/credentials
@@ -0,0 +1 @@
user_agent = "INSERT UNIQUE CREDENTIALS" #put unique credentials here
1 change: 1 addition & 0 deletions mediabridge/data_processing/preprocess.py
@@ -0,0 +1 @@
# Data preprocessing scripts (e.g., feature extraction)
139 changes: 139 additions & 0 deletions mediabridge/data_processing/wiki_to_netflix.py
@@ -0,0 +1,139 @@
import requests
import csv
import os

data_dir = os.path.join(os.path.dirname(__file__), '../../data')
out_dir = os.path.join(os.path.dirname(__file__), '../../out')
user_agent = 'Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>'

# Reading netflix text file
def read_netflix_txt(txt_file, test):
    num_rows = None
    if test:
        num_rows = 100

    with open(txt_file, 'r', encoding='ISO-8859-1') as netflix_data:
        for i, line in enumerate(netflix_data):
            if num_rows is not None and i >= num_rows:
                break
            yield line.rstrip().split(',', 2)

# Writing netflix csv file
def create_netflix_csv(csv_name, data_list):
    with open(csv_name, 'w', newline='') as netflix_csv:  # newline='' keeps the csv module from writing blank rows on Windows
        csv.writer(netflix_csv).writerows(data_list)

# Extracting movie info from Wiki data
def wiki_feature_info(data, key):
    if len(data['results']['bindings']) < 1 or key not in data['results']['bindings'][0]:
        return None
    if key == 'genreLabel':
        return list({d['genreLabel']['value'] for d in data['results']['bindings'] if 'genreLabel' in d})
    return data['results']['bindings'][0][key]['value'].split('/')[-1]

# Formatting SPARQL query for Wiki data
def format_sparql_query(title, year):
    QUERY = '''
SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" ;
wikibase:endpoint "www.wikidata.org" ;
mwapi:search "%(Title)s" ;
mwapi:language "en" .
?item wikibase:apiOutputItem mwapi:item .
}
?item wdt:P31/wdt:P279* wd:Q11424 .
{
# Get US release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
?releaseDateStatement pq:P291 wd:Q30 .
}
UNION
{
# Get unspecified release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
FILTER NOT EXISTS { ?releaseDateStatement pq:P291 ?country }
}
FILTER (YEAR(?releaseDate) = %(Year)d) .
?item rdfs:label ?itemLabel .
FILTER (lang(?itemLabel) = "en") .
OPTIONAL {
?item wdt:P136 ?genre .
?genre rdfs:label ?genreLabel .
FILTER (lang(?genreLabel) = "en") .
}
OPTIONAL {?item wdt:P57 ?director.
?director rdfs:label ?directorLabel.
FILTER (lang(?directorLabel) = "en")}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
    return QUERY % {'Title': title, 'Year': year}

# Getting list of movie IDs, genre IDs, and director IDs from request
def wiki_query(data_csv, user_agent):
    wiki_movie_ids = []
    wiki_genres = []
    wiki_directors = []

    for row in data_csv:
        if row[1] is None:
            continue

        SPARQL = format_sparql_query(row[2], int(row[1]))

        response = requests.post(
            'https://query.wikidata.org/sparql',
            headers={'User-Agent': user_agent},
            data={
                'query': SPARQL,
                'format': 'json',
            },
        )
        response.raise_for_status()

        data = response.json()

        wiki_movie_ids.append(wiki_feature_info(data, 'item'))
        wiki_genres.append(wiki_feature_info(data, 'genreLabel'))
        wiki_directors.append(wiki_feature_info(data, 'directorLabel'))

    return wiki_movie_ids, wiki_genres, wiki_directors

# Calling all functions
def process_data(test=False):
    missing_count = 0
    processed_data = []

    # Materialize the generator so it can be iterated twice:
    # once by wiki_query() and once by the enumerate() loop below.
    netflix_data = list(read_netflix_txt(os.path.join(data_dir, 'movie_titles.txt'), test))

    netflix_csv = os.path.join(out_dir, 'movie_titles.csv')

    wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(netflix_data, user_agent)

    num_rows = len(wiki_movie_ids_list)

    for index, row in enumerate(netflix_data):
        netflix_id, year, title = row
        if wiki_movie_ids_list[index] is None:
            missing_count += 1
        movie = [netflix_id, wiki_movie_ids_list[index], title, year, wiki_genres_list[index], wiki_directors_list[index]]
        processed_data.append(movie)

    create_netflix_csv(netflix_csv, processed_data)

    print(f'missing: {missing_count} ({missing_count / num_rows * 100}%)')
    print(f'found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)')
    print(f'total: {num_rows}')

if __name__ == '__main__':
    process_data(True)
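
For context, wiki_feature_info expects the standard Wikidata SPARQL JSON shape, a dict of the form {'results': {'bindings': [...]}}. Below is a minimal sketch of how it digests such a payload; the binding values are invented for illustration, and the import assumes the module is importable the same way the test file below imports it.

from wiki_to_netflix import wiki_feature_info

# Trimmed-down stand-in for response.json(): each binding maps SPARQL
# variable names to {'type': ..., 'value': ...} dicts.
fake_data = {
    'results': {
        'bindings': [
            {
                'item': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q123'},
                'genreLabel': {'type': 'literal', 'value': 'drama film'},
            },
        ]
    }
}

print(wiki_feature_info(fake_data, 'item'))           # 'Q123' (last segment of the entity URI)
print(wiki_feature_info(fake_data, 'genreLabel'))     # ['drama film'] (deduplicated list)
print(wiki_feature_info(fake_data, 'directorLabel'))  # None (key absent from the first binding)
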
6 changes: 6 additions & 0 deletions mediabridge/data_processing/wiki_to_netflix_test.py
@@ -0,0 +1,6 @@
from wiki_to_netflix import format_sparql_query, wiki_query, process_data
from wiki_to_netflix_test_data import EXPECTED_SPARQL_QUERY

def test_format_sparql_query():
    QUERY = format_sparql_query("The Room", 2003)
    assert QUERY == EXPECTED_SPARQL_QUERY
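
A natural companion test, sketched here as an illustration only, would exercise wiki_query without network access by replacing requests.post through pytest's monkeypatch fixture; the payload, row, and user agent string below are invented.

import wiki_to_netflix


class FakeResponse:
    """Minimal stand-in for requests.Response."""

    def __init__(self, payload):
        self._payload = payload

    def raise_for_status(self):
        pass  # pretend the HTTP request succeeded

    def json(self):
        return self._payload


def test_wiki_query_offline(monkeypatch):
    payload = {'results': {'bindings': [
        {'item': {'value': 'http://www.wikidata.org/entity/Q123'}},
    ]}}
    monkeypatch.setattr(wiki_to_netflix.requests, 'post',
                        lambda *args, **kwargs: FakeResponse(payload))

    rows = [['1', '2003', 'The Room']]
    movie_ids, genres, directors = wiki_to_netflix.wiki_query(rows, 'test-agent')

    assert movie_ids == ['Q123']
    assert genres == [None]
    assert directors == [None]
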
45 changes: 45 additions & 0 deletions mediabridge/data_processing/wiki_to_netflix_test_data.py
@@ -0,0 +1,45 @@
EXPECTED_SPARQL_QUERY = '''
SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" ;
wikibase:endpoint "www.wikidata.org" ;
mwapi:search "The Room" ;
mwapi:language "en" .
?item wikibase:apiOutputItem mwapi:item .
}
?item wdt:P31/wdt:P279* wd:Q11424 .
{
# Get US release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
?releaseDateStatement pq:P291 wd:Q30 .
}
UNION
{
# Get unspecified release date
?item p:P577 ?releaseDateStatement .
?releaseDateStatement ps:P577 ?releaseDate .
FILTER NOT EXISTS { ?releaseDateStatement pq:P291 ?country }
}
FILTER (YEAR(?releaseDate) = 2003) .
?item rdfs:label ?itemLabel .
FILTER (lang(?itemLabel) = "en") .
OPTIONAL {
?item wdt:P136 ?genre .
?genre rdfs:label ?genreLabel .
FILTER (lang(?genreLabel) = "en") .
}
OPTIONAL {?item wdt:P57 ?director.
?director rdfs:label ?directorLabel.
FILTER (lang(?directorLabel) = "en")}
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
1 change: 1 addition & 0 deletions mediabridge/db/connect.py
@@ -0,0 +1 @@
# MongoDB connection setup
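
As a rough sketch of what this stub might become, assuming pymongo were added as a dependency and a MONGO_URI variable were defined in the gitignored .env file (both are assumptions, not yet part of the project):

import os

from dotenv import load_dotenv   # python-dotenv is already pinned in the Pipfile
from pymongo import MongoClient  # assumption: pymongo is not yet a declared dependency


def get_db(name='mediabridge'):
    """Return a MongoDB database handle, reading the connection URI from the environment."""
    load_dotenv()  # pick up the gitignored .env file
    uri = os.environ.get('MONGO_URI', 'mongodb://localhost:27017')  # MONGO_URI is an assumed name
    return MongoClient(uri)[name]
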
1 change: 1 addition & 0 deletions mediabridge/db/queries.py
@@ -0,0 +1 @@
# Functions to query MongoDB for movies and interactions
4 changes: 4 additions & 0 deletions mediabridge/main.py
@@ -0,0 +1,4 @@
from mediabridge.data_processing import wiki_to_netflix

q = wiki_to_netflix.format_sparql_query('The Room', 2003)
print(q)
1 change: 1 addition & 0 deletions mediabridge/models/predict.py
@@ -0,0 +1 @@
# Script to make predictions using the trained model
1 change: 1 addition & 0 deletions mediabridge/models/train_model.py
@@ -0,0 +1 @@
# Script to train the LightFM model
1 change: 1 addition & 0 deletions mediabridge/models/utils.py
@@ -0,0 +1 @@
# Utility functions (e.g., for building matrices)
2 changes: 2 additions & 0 deletions pytest.ini
@@ -0,0 +1,2 @@
[pytest]
python_files = *_test.py
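
This overrides pytest's default test_*.py discovery pattern so that files such as wiki_to_netflix_test.py above are collected. Any new test file only needs to follow the same suffix convention, for example (hypothetical filename and contents):

# example_test.py
def test_sanity():
    assert 1 + 1 == 2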
