Merge pull request #32 from noisebridge/pipfile
Update Pipfile
audiodude authored Nov 1, 2024
2 parents 08cf876 + 4aaac29 commit dfa61cc
Showing 19 changed files with 403 additions and 77 deletions.
21 changes: 21 additions & 0 deletions .github/workflows/ci.yml
@@ -23,3 +23,24 @@ jobs:
      - name: Run tests
        run: pipenv run pytest

  ruff:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install ruff
      # Update output format to enable automatic inline annotations.
      - name: Run Ruff
        run: ruff check --output-format=github .

8 changes: 8 additions & 0 deletions .vscode/extensions.json
@@ -0,0 +1,8 @@
{
  "recommendations": [
    "charliermarsh.ruff"
  ],
  "unwantedRecommendations": [

  ]
}
9 changes: 9 additions & 0 deletions .vscode/settings.json
@@ -0,0 +1,9 @@
{
  "[python]": {
    "editor.formatOnSave": true,
    "editor.defaultFormatter": "charliermarsh.ruff",
    "editor.codeActionsOnSave": {
      "source.organizeImports": "explicit"
    }
  }
}
11 changes: 6 additions & 5 deletions Pipfile
@@ -4,13 +4,14 @@ verify_ssl = true
name = "pypi"

[packages]
requests = "==2.26.0"
python-dotenv = "==1.0.1"
tqdm = "==4.66.5"
pytest = "==8.3.3"
pytest-cov = "==5.0.0"
python-dotenv = "~=1.0"
requests = "~=2.26"
ruff = "~=0.7"
tqdm = "~=4.66"

[dev-packages]
pytest = "~=8.3"
pytest-cov = "~=5.0"

[requires]
python_version = "3.12"
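
A quick way to see what the new compatible-release (`~=`) specifiers accept, sketched with the third-party `packaging` library (shown for illustration only; not a dependency declared in this Pipfile):

```python
# Hypothetical check, not part of this commit: how PEP 440
# compatible-release ("~=") specifiers behave.
from packaging.specifiers import SpecifierSet

reqs = SpecifierSet("~=2.26")  # equivalent to >=2.26, ==2.*
print("2.26.0" in reqs)  # True
print("2.32.3" in reqs)  # True  (newer 2.x releases are allowed)
print("3.0.0" in reqs)   # False (major version bumps are excluded)
```

So `requests = "~=2.26"` lets pipenv resolve any 2.x release from 2.26 up, instead of staying pinned to 2.26.0 exactly.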
272 changes: 257 additions & 15 deletions Pipfile.lock

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion README.md
@@ -10,7 +10,7 @@ Currently, we are only accepting contributions from members of the project who m

This code uses Python 3. It is tested on Python 3.12, but will probably work on versions back to 3.9.

To install the project dependencies, first install pipenv globally with `pip install pipenv`. Then create a virtual env/install dependencies with `pipenv install`.
To install the project dependencies, first install pipenv globally with `pip install pipenv`. Then create a virtual env/install dependencies with `pipenv install --dev`.

To run code in the project, prefix your command with `pipenv run`, a la `pipenv run python -m mediabridge.main`.

@@ -20,3 +20,13 @@ To run unit tests,

1. Ensure `pipenv` is installed
2. Run `pipenv run pytest`

There is a GitHub actions "check" for passing tests, which must pass for you to be able to merge your PR.

## Code formatting

We use [ruff](https://docs.astral.sh/ruff/) for code formatting, linting, and import sorting. If you've installed the project with the instructions above, you should have access to the `ruff` binary.

The repo comes with a `.vscode` directory that contains a recommended ruff extension, as well as settings to set ruff as your Python formatter and to format code and sort imports on save. If you're not using VSCode, you can run `ruff format` from the project root directory to format all Python code.

There is a GitHub actions "check" for code formatting, which will fail if you have unformatted code in your PR.
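
To reproduce the GitHub checks locally before opening a PR, a minimal sketch (a hypothetical helper, not part of this commit; assumes `pipenv` and `ruff` are installed as described above):

```python
# pre_push_check.py -- hypothetical helper, not part of this commit.
# Runs the same commands the CI checks run: pytest, then ruff.
import subprocess

for cmd in (
    ["pipenv", "run", "pytest"],         # tests check
    ["ruff", "check", "."],              # lint check (see ci.yml above)
    ["ruff", "format", "--check", "."],  # exits nonzero on unformatted code
):
    subprocess.run(cmd, check=True)  # raises CalledProcessError on failure
```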
2 changes: 1 addition & 1 deletion mediabridge/config/setting.py
@@ -1 +1 @@
# Configuration settings (e.g., MongoDB URI, paths)
# Configuration settings (e.g., MongoDB URI, paths)
2 changes: 1 addition & 1 deletion mediabridge/data_processing/build_matrices.py
@@ -1 +1 @@
# Scripts to build interaction and feature matrices
# Scripts to build interaction and feature matrices
2 changes: 1 addition & 1 deletion mediabridge/data_processing/preprocess.py
@@ -1 +1 @@
# Data preprocessing scripts (e.g., feature extraction)
# Data preprocessing scripts (e.g., feature extraction)
118 changes: 75 additions & 43 deletions mediabridge/data_processing/wiki_to_netflix.py
@@ -1,45 +1,61 @@
import requests
import csv
import os
import sys
import time

import requests
from tqdm import tqdm
import sys


class WikidataServiceTimeoutException(Exception):
    pass

data_dir = os.path.join(os.path.dirname(__file__), '../../data')
out_dir = os.path.join(os.path.dirname(__file__), '../../out')
user_agent = 'Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>'

data_dir = os.path.join(os.path.dirname(__file__), "../../data")
out_dir = os.path.join(os.path.dirname(__file__), "../../out")
user_agent = "Noisebridge MovieBot 0.0.1/Audiodude <[email protected]>"


# Reading netflix text file
def read_netflix_txt(txt_file, test):
    num_rows = None
    if test == True:
    if test:
        num_rows = 100

    with open(txt_file, "r", encoding = "ISO-8859-1") as netflix_data:
    with open(txt_file, "r", encoding="ISO-8859-1") as netflix_data:
        for i, line in enumerate(netflix_data):
            if num_rows is not None and i >= num_rows:
                break
            yield line.rstrip().split(',', 2)
            yield line.rstrip().split(",", 2)


# Writing netflix csv file
def create_netflix_csv(csv_name, data_list):
    with open(csv_name, 'w') as netflix_csv:
def create_netflix_csv(csv_name, data_list):
    with open(csv_name, "w") as netflix_csv:
        csv.writer(netflix_csv).writerows(data_list)


# Extracting movie info from Wiki data
def wiki_feature_info(data, key):
    if len(data['results']['bindings']) < 1 or key not in data['results']['bindings'][0]:
    if (
        len(data["results"]["bindings"]) < 1
        or key not in data["results"]["bindings"][0]
    ):
        return None
    if key == 'genreLabel':
        return list({d['genreLabel']['value'] for d in data['results']['bindings'] if 'genreLabel' in d})
    return data['results']['bindings'][0][key]['value'].split('/')[-1]
    if key == "genreLabel":
        return list(
            {
                d["genreLabel"]["value"]
                for d in data["results"]["bindings"]
                if "genreLabel" in d
            }
        )
    return data["results"]["bindings"][0][key]["value"].split("/")[-1]


# Formatting SPARQL query for Wiki data
def format_sparql_query(title, year):
    QUERY = '''
    QUERY = """
SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" ;
@@ -83,15 +99,16 @@ def format_sparql_query(title, year):
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
    return QUERY % {'Title': title, 'Year': year}
"""
    return QUERY % {"Title": title, "Year": year}


# Getting list of movie IDs, genre IDs, and director IDs from request
def wiki_query(data_csv, user_agent):
    wiki_movie_ids = []
    wiki_genres = []
    wiki_directors = []

    for row in tqdm(data_csv):
        if row[1] is None:
            continue
@@ -101,61 +118,76 @@ def wiki_query(data_csv, user_agent):
        tries = 0
        while True:
            try:
                response = requests.post('https://query.wikidata.org/sparql',
                                         headers={'User-Agent': user_agent},
                                         data={
                                             'query': SPARQL,
                                             'format': 'json',
                                         },
                                         timeout=20,
                response = requests.post(
                    "https://query.wikidata.org/sparql",
                    headers={"User-Agent": user_agent},
                    data={
                        "query": SPARQL,
                        "format": "json",
                    },
                    timeout=20,
                )
                break
            except requests.exceptions.Timeout:
                wait_time = 2 ** tries * 5
                wait_time = 2**tries * 5
                time.sleep(wait_time)
                tries += 1
                if tries > 5:
                    raise WikidataServiceTimeoutException(
                        f'Tried {tries} time, could not reach Wikidata '
                        f'(movie: {row[2]} {row[1]})'
                        f"Tried {tries} time, could not reach Wikidata "
                        f"(movie: {row[2]} {row[1]})"
                    )

        response.raise_for_status()
        data = response.json()
        wiki_movie_ids.append(wiki_feature_info(data, 'item'))
        wiki_genres.append(wiki_feature_info(data, 'genreLabel'))
        wiki_directors.append(wiki_feature_info(data, 'directorLabel'))

        wiki_movie_ids.append(wiki_feature_info(data, "item"))
        wiki_genres.append(wiki_feature_info(data, "genreLabel"))
        wiki_directors.append(wiki_feature_info(data, "directorLabel"))

    return wiki_movie_ids, wiki_genres, wiki_directors


# Calling all functions
def process_data(test=False):
    missing_count = 0
    processed_data = []

    netflix_data = read_netflix_txt(os.path.join(data_dir, 'movie_titles.txt'), test)
    netflix_data = read_netflix_txt(os.path.join(data_dir, "movie_titles.txt"), test)

    netflix_csv = os.path.join(out_dir, 'movie_titles.csv')
    netflix_csv = os.path.join(out_dir, "movie_titles.csv")

    wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(netflix_data, user_agent)
    wiki_movie_ids_list, wiki_genres_list, wiki_directors_list = wiki_query(
        netflix_data, user_agent
    )

    num_rows = len(wiki_movie_ids_list)

    for index, row in enumerate(netflix_data):
        netflix_id, year, title = row
        if wiki_movie_ids_list[index] is None:
            missing_count += 1
        movie = [netflix_id, wiki_movie_ids_list[index], title, year, wiki_genres_list[index], wiki_directors_list[index]]
        movie = [
            netflix_id,
            wiki_movie_ids_list[index],
            title,
            year,
            wiki_genres_list[index],
            wiki_directors_list[index],
        ]
        processed_data.append(movie)

    create_netflix_csv(netflix_csv, processed_data)

    print(f'missing: {missing_count} ({missing_count / num_rows * 100}%)')
    print(f'found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)')
    print(f'total: {num_rows}')
    print(f"missing: {missing_count} ({missing_count / num_rows * 100}%)")
    print(
        f"found: {num_rows - missing_count} ({(num_rows - missing_count) / num_rows * 100}%)"
    )
    print(f"total: {num_rows}")


if __name__ == '__main__':
if __name__ == "__main__":
    # Test is true if no argument is passed or if the first argument is not '--prod'.
    test = len(sys.argv) < 2 or sys.argv[1] != '--prod'
    test = len(sys.argv) < 2 or sys.argv[1] != "--prod"
    process_data(test=test)
    process_data(test=test)
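
The `while True` loop in `wiki_query` above retries timed-out Wikidata requests with exponential backoff (waits of 5, 10, 20, ... seconds), raising `WikidataServiceTimeoutException` once the retry budget is spent. A standalone sketch of the same pattern; `post_with_backoff` is a hypothetical helper, not part of this commit:

```python
# Hypothetical standalone version of the retry pattern used in wiki_query.
import time
import requests

def post_with_backoff(url, max_retries=5, **kwargs):
    tries = 0
    while True:
        try:
            # Any non-timeout error propagates immediately.
            return requests.post(url, timeout=20, **kwargs)
        except requests.exceptions.Timeout:
            time.sleep(2**tries * 5)  # 5s, 10s, 20s, ...
            tries += 1
            if tries > max_retries:
                raise
```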
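
A usage sketch for `wiki_feature_info`, using a simplified, made-up Wikidata SPARQL JSON response (real responses carry more fields per binding; the Q-id and genre below are invented for illustration):

```python
# Hypothetical response shape; values are made up.
sample = {
    "results": {
        "bindings": [
            {
                "item": {"value": "http://www.wikidata.org/entity/Q12345"},
                "genreLabel": {"value": "drama film"},
            }
        ]
    }
}

wiki_feature_info(sample, "item")           # -> "Q12345" (last URI segment)
wiki_feature_info(sample, "genreLabel")     # -> ["drama film"] (deduplicated)
wiki_feature_info(sample, "directorLabel")  # -> None (key absent)
```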
5 changes: 3 additions & 2 deletions mediabridge/data_processing/wiki_to_netflix_test.py
@@ -1,6 +1,7 @@
from wiki_to_netflix import format_sparql_query, wiki_query, process_data
from wiki_to_netflix import format_sparql_query
from wiki_to_netflix_test_data import EXPECTED_SPARQL_QUERY


def test_format_sparql_query():
    QUERY = format_sparql_query("The Room", 2003)
    assert QUERY == EXPECTED_SPARQL_QUERY
    assert QUERY == EXPECTED_SPARQL_QUERY
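
To run just this test file, pytest can also be invoked programmatically instead of via `pipenv run pytest`; a hedged sketch, not part of this commit:

```python
# Hypothetical snippet: run a single test file with verbose output.
import pytest

exit_code = pytest.main(
    ["mediabridge/data_processing/wiki_to_netflix_test.py", "-v"]
)
```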
4 changes: 2 additions & 2 deletions mediabridge/data_processing/wiki_to_netflix_test_data.py
@@ -1,4 +1,4 @@
EXPECTED_SPARQL_QUERY ='''
EXPECTED_SPARQL_QUERY = """
SELECT * WHERE {
SERVICE wikibase:mwapi {
bd:serviceParam wikibase:api "EntitySearch" ;
@@ -42,4 +42,4 @@
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" . }
}
'''
"""
2 changes: 1 addition & 1 deletion mediabridge/db/connect.py
@@ -1 +1 @@
# MongoDB connection setup
# MongoDB connection setup
2 changes: 1 addition & 1 deletion mediabridge/db/queries.py
@@ -1 +1 @@
# Functions to query MongoDB for movies and interactions
# Functions to query MongoDB for movies and interactions
2 changes: 1 addition & 1 deletion mediabridge/main.py
@@ -1,4 +1,4 @@
from mediabridge.data_processing import wiki_to_netflix

q = wiki_to_netflix.format_sparql_query('The Room', 2003)
q = wiki_to_netflix.format_sparql_query("The Room", 2003)
print(q)
2 changes: 1 addition & 1 deletion mediabridge/models/predict.py
@@ -1 +1 @@
# Script to make predictions using the trained model
# Script to make predictions using the trained model
2 changes: 1 addition & 1 deletion mediabridge/models/train_model.py
@@ -1 +1 @@
# Script to train the LightFM model
# Script to train the LightFM model
2 changes: 1 addition & 1 deletion mediabridge/models/utils.py
@@ -1 +1 @@
# Utility functions (e.g., for building matrices)
# Utility functions (e.g., for building matrices)
2 changes: 2 additions & 0 deletions ruff.toml
@@ -0,0 +1,2 @@
# Default selections for ruff, plus isort.
lint.select = ["E4", "E7", "E9", "F", "I001"]
