Stripped down tag_predictor module
Khemarato Bhikkhu committed Feb 2, 2024
1 parent ac91487 commit c2c70f3
Showing 4 changed files with 271 additions and 224 deletions.
8 changes: 6 additions & 2 deletions scripts/android_import_sutta.py
@@ -12,6 +12,8 @@
from parallels import get_parallels_yaml
from gdrive import upload_to_google_drive, get_gfolders_for_course, get_known_courses, create_drive_shortcut, DRIVE_LINK
from archivedotorg import save_url_to_archiveorg
+from pdfutils import readpdf
+from tag_predictor import TagPredictor

yaml_list_prefix = '\n - '
NONSC_TRANSLATORS = [{
@@ -232,6 +234,7 @@ def process_pdf(pdf_file):
print(f"Processing {pdf_file}...")
pdf_file = Path(pdf_file)
pages = get_page_count(pdf_file)
+    pdf_text = readpdf(pdf_file)
guess = guess_id_from_filename(pdf_file.stem)
while True:
sutta = input_with_prefill("Sutta ID? ", guess)
@@ -263,12 +266,14 @@ def process_pdf(pdf_file):
trans = nonsc_trans[transidx]
external_url = make_nonsc_url(trans['website_data'], book, nums)
trans = fill_in_trans_data(trans, external_url)
+    blurb = get_blurb_for_suttaid(slug)
+    course = TagPredictor.load().predict([blurb + ' ' + pdf_text])[0]
print(f"Going with {trans['author_short']}")
pali_name = input_with_prefill("Pāli name? ", scdata['original_title'].replace("sutta", " Sutta").strip())
eng_name = input_with_prefill("English title? ", scdata['translated_title'].strip())
title = f"{sutta} {pali_name}{': '+eng_name if eng_name else ''}"
filename = f"{title.replace(':','_')} - {trans['author']}.pdf"
-    course = input_with_tab_complete("course: ", get_known_courses())
+    course = input_with_tab_complete("course: ", get_known_courses(), prefill=course)
folder_id, shortcut_folder = get_gfolders_for_course(course)
drive_links = "drive_links"
if shortcut_folder and not folder_id:
@@ -353,7 +358,6 @@ def process_pdf(pdf_file):
coursefields = f"""course: {course}
status: featured
"""
-    blurb = get_blurb_for_suttaid(slug)
blurb = f"\n\n{blurb}\n<!---->" if blurb else ""
mdfile.write_text(f"""---
title: "{title}"
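
Taken together, the android_import_sutta.py changes move the blurb lookup earlier in process_pdf and use it, along with the PDF's extracted text, to pre-fill the course prompt with a model prediction. A minimal sketch of the resulting flow, using only names visible in this diff (surrounding prompts and error handling omitted):

    # Inside process_pdf: predict a course tag from the document text...
    pdf_text = readpdf(pdf_file)
    blurb = get_blurb_for_suttaid(slug)
    course = TagPredictor.load().predict([blurb + ' ' + pdf_text])[0]
    # ...then offer the prediction as an editable default instead of an empty prompt.
    course = input_with_tab_complete("course: ", get_known_courses(), prefill=course)

Offering the prediction as a prefill rather than a hard assignment means a wrong guess costs the user only a few keystrokes.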
7 changes: 5 additions & 2 deletions scripts/strutils.py
@@ -279,15 +279,18 @@ def hook():
readline.set_pre_input_hook()
return result

-def input_with_tab_complete(prompt, typeahead_suggestions, delims=None):
+def input_with_tab_complete(prompt, typeahead_suggestions, delims=None, prefill=None):
prev_complr = readline.get_completer()
prev_delims = readline.get_completer_delims()
readline.set_completer(lambda text, state: (
[s for s in typeahead_suggestions if s.startswith(text)][state]
))
readline.set_completer_delims(delims or ' /')
readline.parse_and_bind('tab: complete')
-    ret = input(prompt)
+    if prefill:
+        ret = input_with_prefill(prompt, prefill)
+    else:
+        ret = input(prompt)
readline.set_completer(prev_complr)
readline.set_completer_delims(prev_delims)
return ret
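
The new prefill parameter routes through input_with_prefill while the completer stays bound, so a caller can seed the prompt with a default answer and the user can still tab-complete over it. A hypothetical call (the course names here are illustrative, not from this commit):

    courses = ['meditation', 'right-speech', 'death-and-dying']
    # The prompt appears as "course: meditation" with the text already typed;
    # the user can accept it, edit it, or tab-complete something else.
    choice = input_with_tab_complete("course: ", courses, prefill='meditation')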
176 changes: 176 additions & 0 deletions scripts/tag_predictor.py
@@ -0,0 +1,176 @@
#!/usr/bin/env python3

from pathlib import Path
import json
import regex

from nltk.stem.snowball import SnowballStemmer
import numpy as np
from sklearn.utils.validation import (
check_X_y,
check_is_fitted,
check_array,
)
from sklearn.utils.multiclass import unique_labels
from sklearn.base import clone as sklearn_clone
from sklearn.feature_extraction.text import (
CountVectorizer,
TfidfTransformer,
)
from sklearn.pipeline import Pipeline
from sklearn.base import (
BaseEstimator,
ClassifierMixin,
TransformerMixin,
)
import joblib
from unidecode import unidecode

from strutils import (
git_root_folder,
)

# This config file hosts all essential configuration data
CONFIG_FILE = Path.home().joinpath('.auto_sort_unreads_rc.json')
CONFIG = dict()
DATA_DIRECTORY = ''
if CONFIG_FILE.exists():
CONFIG = json.loads(CONFIG_FILE.read_text())
DATA_DIRECTORY = CONFIG.get('data_directory')
if not DATA_DIRECTORY:
DATA_DIRECTORY = input("Please provide the absolute path to a directory to store all the data in: ")
CONFIG['data_directory'] = DATA_DIRECTORY
CONFIG_FILE.write_text(json.dumps(CONFIG))
DATA_DIRECTORY = Path(DATA_DIRECTORY)
MODELS_DIRECTORY = DATA_DIRECTORY.joinpath('models')

STOP_WORDS = set(git_root_folder.joinpath('scripts/stop_words.txt').read_text().split('\n'))
STOP_WORDS.update([w.lower() for w in STOP_WORDS])
stemmer = SnowballStemmer('english')
STOP_WORDS.update([stemmer.stem(word) for word in STOP_WORDS])

def normalize_text(text: str) -> str:
    text = unidecode(text).lower()
    # Keep only the stems of alphabetic words that are at least four
    # letters long and aren't stop words.
    text = (
        stemmer.stem(word)
        for word in regex.split(r"[^a-z]+", text)
        if len(word) >= 4 and word not in STOP_WORDS
    )
    return ' '.join(text)


class RemoveSparseFeatures(BaseEstimator, TransformerMixin):
def __init__(self, k=15):
self.k = k

    def fit(self, X, y=None):
        self.num_features_in = X.shape[1]
        # X is expected to be a scipy sparse matrix: np.sum(X != 0, axis=0)
        # returns a 1 x n_features matrix, so np.where's second element
        # gives the indices of columns with at least k nonzero entries.
        self.sparse_mask = np.where(np.sum(X != 0, axis=0) >= self.k)[1]
        self.num_features_out = self.sparse_mask.shape[0]
        return self

def transform(self, X):
if hasattr(self, 'sparse_mask'):
return X[:, self.sparse_mask]
else:
raise ValueError("The transformer has not been fitted yet.")


class ZeroLearningClassifier(BaseEstimator, ClassifierMixin):
    """A leaf node that always predicts its one label."""
    def __init__(self, label=None):
        self.label = label
        self.classes_ = []

    def fit(self, X, y=None, sample_weight=None):
        if self.label is None and len(y) > 0:
            self.label = y[0]
        self.classes_ = [self.label]
        return self

    def predict(self, X):
        return np.full(shape=(X.shape[0],), fill_value=self.label)

    def explain_yourself(self, *args):
        return f"I'm a leaf node that always predicts '{self.label}'"

class OBUNodeClassifier(BaseEstimator, ClassifierMixin):
"""
My custom sklearn classifier for making one step prediction
It takes a base_classifier instance (Logit by default)
and wraps it in a Pipeline that also does whatever last-minute
feature selection and normalization we need.
"""
def __init__(
self,
        base_classifier: BaseEstimator = None,
min_df=15,
) -> None:
super().__init__()
self.min_df = min_df
if isinstance(base_classifier, BaseEstimator):
self.base_classifier = sklearn_clone(base_classifier)
else:
raise ValueError("Need to pass a base classifier to NodeClassifier")

def fit(self, X, y, sample_weight=None):
X, y = check_X_y(X, y, accept_sparse=True)
self.classes_ = unique_labels(y)
self.N_ = len(y)
self.pipeline_ = Pipeline(steps=[
('filter_rare_words', RemoveSparseFeatures(k=self.min_df)),
('tfidf', TfidfTransformer()),
('classifier', self.base_classifier)
])
self.pipeline_.fit(X, y, classifier__sample_weight=sample_weight)
return self

def predict(self, X):
check_is_fitted(self)
X = check_array(X, accept_sparse=True)
return self.pipeline_.predict(X)

class TagPredictor:
"""
Loads a trained classifier from a pkl file and does Classification prediction tasks on text.
Usage
-------
big_classifier = OBUTopicClassifier.load(DATA_DIRECTORY.joinpath('models/default.pkl'))
tags = big_classifier.predict(['Introduction to Buddhism', 'How to Meditate: A Guide to Peace'])
# tags should now ~= ['buddhism', 'meditation']
"""
def __init__(
self,
vocabulary,
classifiers: dict[str, BaseEstimator],
) -> None:
self.classifiers_ = classifiers
self.vectorizer_ = CountVectorizer(lowercase=False, vocabulary=vocabulary)

def predict(self, X, normalized=False) -> list[str]:
"""Given an array of (normalized?) strings, predict the topics"""
if not normalized:
X = list(map(normalize_text, X))
X = self.vectorizer_.transform(X)
prev_prediction = ['']*X.shape[0]
curr_prediction = ['root']*X.shape[0]
        # Walk each document down the label hierarchy: re-classify it under
        # its current label until the prediction stops changing (a leaf
        # predicts itself, which ends the walk).
        predicting = True
while predicting:
next_prediction = []
predicting = False
for i in range(X.shape[0]):
if prev_prediction[i] == curr_prediction[i]:
next_prediction.append(curr_prediction[i])
else:
predicting = True
next_prediction.append(self.classifiers_[curr_prediction[i]].predict(X[i,:])[0])
prev_prediction = curr_prediction
curr_prediction = next_prediction
return curr_prediction

@classmethod
    def load(cls, filepath: Path | str | None = None):
        """Loads a new instance of TagPredictor from the given saved .pkl file"""
if not filepath:
filepath = MODELS_DIRECTORY.joinpath('default.pkl')
vocabulary, classifiers = joblib.load(filepath)
return cls(vocabulary, classifiers)
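
TagPredictor.predict() walks a dict of classifiers keyed by label: every reachable label needs an entry, and leaves must predict themselves (ZeroLearningClassifier) for the loop to terminate. A minimal end-to-end sketch under stated assumptions: the two-document corpus and the LogisticRegression base classifier are illustrative, and the (vocabulary, classifiers) tuple dumped at the end simply mirrors what load() unpacks, since the save_as counterpart isn't part of this commit:

    import joblib
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.linear_model import LogisticRegression
    # Note: importing tag_predictor may prompt for a data directory on first run.
    from tag_predictor import (
        OBUNodeClassifier, TagPredictor, ZeroLearningClassifier, normalize_text,
    )

    # Illustrative two-topic corpus (not from the repo).
    texts = ['how to meditate with the breath', 'a short history of buddhist monasticism']
    labels = ['meditation', 'buddhism']

    vectorizer = CountVectorizer(lowercase=False)
    X = vectorizer.fit_transform([normalize_text(t) for t in texts])

    # One decision node at the root, plus a self-predicting leaf per label
    # so the predict() loop terminates.
    root = OBUNodeClassifier(base_classifier=LogisticRegression(), min_df=1)
    root.fit(X, labels)
    classifiers = {
        'root': root,
        'meditation': ZeroLearningClassifier(label='meditation'),
        'buddhism': ZeroLearningClassifier(label='buddhism'),
    }

    predictor = TagPredictor(vectorizer.vocabulary_, classifiers)
    print(predictor.predict(['How to Meditate: A Guide to Peace']))  # likely ['meditation']

    # Persist in the shape load() expects:
    joblib.dump((vectorizer.vocabulary_, classifiers), 'default.pkl')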