
Commit

Merge pull request #68 from scaife-viewer/atlas/tokenizer-hookset
Improve text part ingestion, tokenizers and text annotations
jacobwegner authored Jan 3, 2024
2 parents e49994c + 2f511ed commit 57b4c73
Showing 14 changed files with 366 additions and 57 deletions.
Empty file modified: atlas/makemigrations.py (mode changed 100644 → 100755)
8 changes: 7 additions & 1 deletion atlas/scaife_viewer/atlas/constants.py
@@ -84,8 +84,9 @@
]

HUMAN_FRIENDLY_LANGUAGE_MAP = {
"eng": "English",
"ang": "English, Old (ca.450-1100)",
"ara": "Arabic",
"eng": "English",
"fa": "Farsi",
"far": "Farsi",
"fre": "French",
@@ -95,9 +96,14 @@
"lat": "Latin",
}

# TODO: Reconsider default
TEXT_ANNOTATION_KIND_SCHOLIA = "scholia"
TEXT_ANNOTATION_KIND_SYNTAX_TREE = "syntax-tree"
# TODO: Refactor as textual notes
# TODO: Reconsider this pattern
TEXT_ANNOTATION_KIND_COMMENTARY = "commentary"
TEXT_ANNOTATION_KIND_CHOICES = (
(TEXT_ANNOTATION_KIND_SCHOLIA, "Scholia"),
(TEXT_ANNOTATION_KIND_SYNTAX_TREE, "Syntax tree"),
(TEXT_ANNOTATION_KIND_COMMENTARY, "Commentary"),
)
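
A minimal sketch of how the new commentary kind might be queried, assuming the TextAnnotation model (not shown in this diff) exposes a kind field matching the kind= keyword the importer below passes:

    from scaife_viewer.atlas import constants
    from scaife_viewer.atlas.models import TextAnnotation

    # Select only the annotations ingested as commentaries; the field name
    # "kind" is an assumption based on the importer's kind= argument.
    commentaries = TextAnnotation.objects.filter(
        kind=constants.TEXT_ANNOTATION_KIND_COMMENTARY
    )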
35 changes: 34 additions & 1 deletion atlas/scaife_viewer/atlas/hooks.py
@@ -136,10 +136,38 @@ def run_ingestion_pipeline(self, outf):

return run_ingestion_pipeline(outf)

def get_token_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"token-annotations",
)

def isdir(path):
return path.is_dir()

return _get_annotation_paths(path, predicate=isdir)

def get_text_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(settings.SV_ATLAS_DATA_DIR, "annotations", "text-annotations",)
path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"text-annotations",
)
return _get_annotation_paths(path)

def get_commentary_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"commentaries",
)
return _get_annotation_paths(path)

def get_syntax_tree_annotation_paths(self):
Expand All @@ -163,6 +191,11 @@ def get_dictionary_annotation_paths(self):
predicate = lambda x: x.suffix == ".json" or x.is_dir() # noqa
return _get_annotation_paths(path, predicate=predicate)

def get_prepared_tokens(self, version_urn):
from .parallel_tokenizers import prepare_tokens

return prepare_tokens(version_urn)


class HookProxy:
def __getattr__(self, attr):
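
For context, a hypothetical sketch of what the _get_annotation_paths helper called above might look like; the actual helper in scaife_viewer.atlas.hooks is not part of this diff and may differ:

    from pathlib import Path

    def _get_annotation_paths(path, predicate=None):
        # Missing annotation directories yield no paths rather than erroring.
        if not path.exists():
            return []
        paths = []
        for entry in sorted(path.iterdir()):
            # The optional predicate narrows results, e.g. directories only for
            # token annotations, or ".json" files / directories for dictionaries.
            if predicate is None or predicate(entry):
                paths.append(entry)
        return paths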
10 changes: 10 additions & 0 deletions atlas/scaife_viewer/atlas/importers/text_annotations.py
@@ -7,6 +7,7 @@
from ..constants import (
TEXT_ANNOTATION_KIND_SCHOLIA,
TEXT_ANNOTATION_KIND_SYNTAX_TREE,
TEXT_ANNOTATION_KIND_COMMENTARY,
)
from ..hooks import hookset
from ..models import Node, TextAnnotation
@@ -84,6 +85,7 @@ def _resolve_text_annotation_text_parts(qs):
chunked_bulk_create(TextAnnotationThroughModel, prepared_objs)


# TODO: Break this part into individual pipelines
def import_text_annotations(reset=False):
if reset:
TextAnnotation.objects.all().delete()
@@ -98,6 +100,14 @@ def import_text_annotations(reset=False):
_prepare_text_annotations(path, counters, kind=TEXT_ANNOTATION_KIND_SCHOLIA)
)

commentary_annotation_paths = hookset.get_commentary_annotation_paths()
for path in commentary_annotation_paths:
to_create.extend(
_prepare_text_annotations(
path, counters, kind=TEXT_ANNOTATION_KIND_COMMENTARY
)
)

syntax_tree_annotation_paths = hookset.get_syntax_tree_annotation_paths()
for path in syntax_tree_annotation_paths:
to_create.extend(
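
A hedged usage sketch: with commentary files placed under SV_ATLAS_DATA_DIR/annotations/commentaries (the default location supplied by get_commentary_annotation_paths above), the importer can be re-run from a shell; a deployed project may route this through a management command instead:

    from scaife_viewer.atlas.importers.text_annotations import import_text_annotations

    # reset=True drops existing TextAnnotation rows; scholia, commentaries and
    # syntax trees are then re-ingested from the hookset-provided paths.
    import_text_annotations(reset=True)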
38 changes: 15 additions & 23 deletions atlas/scaife_viewer/atlas/importers/token_annotations.py
@@ -1,35 +1,21 @@
import csv
import os
import re
from pathlib import Path

import yaml

from scaife_viewer.atlas.conf import settings

from ..hooks import hookset
from ..models import Node, Token, TokenAnnotation, TokenAnnotationCollection


ANNOTATIONS_DATA_PATH = Path(
settings.SV_ATLAS_DATA_DIR, "annotations", "token-annotations"
)


VE_REF_PATTTERN = re.compile(r"(?P<ref>.*).t(?P<token>.*)")


def get_paths():
if not os.path.exists(ANNOTATIONS_DATA_PATH):
return []
for path in ANNOTATIONS_DATA_PATH.iterdir():
if not path.is_dir():
continue
yield path


def resolve_version(path):
versionish = f'{os.path.basename(path).split(".csv")[0]}:'
return Node.objects.filter(urn__endswith=versionish).get()
version_obj = Node.objects.filter(urn__endswith=versionish).first()
if not version_obj:
print(f'Could not resolve version for {path.name} [urn="{versionish}"]')
return version_obj


def extract_ref_and_token_position(row):
@@ -81,7 +67,11 @@ def create_token_annotations(collection, version, lookup, refs):
continue

to_create.append(
TokenAnnotation(token=token, data=data, collection=collection,)
TokenAnnotation(
token=token,
data=data,
collection=collection,
)
)
return len(TokenAnnotation.objects.bulk_create(to_create))

@@ -92,9 +82,9 @@ def apply_token_annotations(reset=True):
want to revisit how this entire extraction works in the future
"""

paths = get_paths()
paths = hookset.get_token_annotation_paths()
for path in paths:
metadata_path = Path(path, "metadata.yml")
metadata_path = path / "metadata.yml"
collection = yaml.safe_load(metadata_path.open())

if reset:
@@ -105,10 +95,12 @@
if not values or not values.endswith("csv"):
continue

values_path = Path(path, values)
values_path = path / values
lookup, refs = extract_lookup_and_refs(values_path)
# TODO: Move this to metadata and or values
version = resolve_version(values_path)
if not version:
continue

# TODO: Set attribution information
metadata = collection.pop("metadata", {})
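
A usage sketch under an assumed data layout: each directory returned by get_token_annotation_paths holds a metadata.yml describing the collection plus a CSV of per-token values whose filename encodes the version URN (directory and file names below are illustrative):

    # <SV_ATLAS_DATA_DIR>/annotations/token-annotations/
    #     my-collection/
    #         metadata.yml                       # collection metadata; references the CSV
    #         tlg0012.tlg001.perseus-grc2.csv    # rows keyed by ve_ref, e.g. "1.1.t1"
    from scaife_viewer.atlas.importers.token_annotations import apply_token_annotations

    # reset=True deletes each collection's existing annotations before re-creating
    # them; versions that cannot be resolved are now skipped instead of raising.
    apply_token_annotations(reset=True)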
108 changes: 100 additions & 8 deletions atlas/scaife_viewer/atlas/importers/versions.py
@@ -1,18 +1,24 @@
import logging
from collections import defaultdict

from django.conf import settings
from django.db import IntegrityError
from django.utils.translation import ugettext_noop

from tqdm import tqdm
from treebeard.exceptions import PathOverflow

from scaife_viewer.atlas import constants
from scaife_viewer.atlas.parallel_tokenizers import tokenize_text_parts_parallel

from ..hooks import hookset
from ..models import Node
from ..models import Node, Token
from ..urn import URN
from ..utils import chunked_bulk_create, get_lowest_citable_depth
from ..utils import (
chunked_bulk_create,
chunked_bulk_delete,
get_lowest_citable_depth,
)


logger = logging.getLogger(__name__)
@@ -45,14 +51,20 @@ class CTSImporter:
CTS_URN_SCHEME_EXEMPLAR = constants.CTS_URN_NODES

def __init__(
self, library, version_data, nodes=dict(), node_last_child_lookup=None
self,
library,
version_data,
nodes=dict(),
node_last_child_lookup=None,
partial_ingestion=False,
):
self.library = library
self.version_data = version_data
self.nodes = nodes
# TODO: Decouple "version_data" further
self.urn = URN(self.version_data["urn"].strip())
self.work_urn = self.urn.up_to(self.urn.WORK)
self.partial_ingestion = partial_ingestion

try:
label = get_first_value_for_language(version_data["label"], "eng")
@@ -191,6 +203,13 @@ def generate_node(self, idx, node_data, parent_urn):
return Node.objects.get(urn=node_data["urn"])
parent = self.nodes.get(parent_urn)
if USE_BULK_INGESTION:
# NOTE: parent.pk will _only_ be populated if invoked with the
# partial_ingestion flag
if self.partial_ingestion and parent.pk and parent.depth < 4:
# If parent has a database id, and is a workpart,
# we should also add the child to the database.
# Typically this is going to be a work or version.
return self.add_child(parent, node_data)
return self.add_child_bulk(parent, node_data)
return self.add_child(parent, node_data)

@@ -243,6 +262,15 @@ def extract_urn_and_text_content(self, line):
urn = f"{self.urn}{ref}"
return URN(urn), text_content

def get_or_generate_node(self, idx, node_data, parent_urn):
try:
node = Node.objects.get(urn=node_data["urn"])
print(f'retrieved existing node from db: {node_data["urn"]}')
except Node.DoesNotExist:
node = self.generate_node(idx, node_data, parent_urn)
print(f'inserted node: {node_data["urn"]}')
return node

def generate_branch(self, urn=None, line=None):
if line:
node_urn, text_content = self.extract_urn_and_text_content(line)
@@ -258,7 +286,10 @@ def generate_branch(self, urn=None, line=None):
if node is None:
node_data.update({"idx": self.get_node_idx(node_data)})
parent_urn = self.get_parent_urn(idx, branch_data)
node = self.generate_node(idx, node_data, parent_urn)
if idx < 4 and self.partial_ingestion:
node = self.get_or_generate_node(idx, node_data, parent_urn)
else:
node = self.generate_node(idx, node_data, parent_urn)
self.nodes[node_data["urn"]] = node

def apply(self):
@@ -288,7 +319,9 @@ def get_first_value_for_language(values, lang, fallback=True):
return value.get("value")


def import_versions(reset=False, predicate=None):
# TODO: Determine best signature; do we decouple partial_ingestion or
# infer it based on the predicate?
def import_versions(reset=False, predicate=None, partial_ingestion=False):
if reset:
Node.objects.filter(kind="nid").delete()
# TODO: Wire up logging
@@ -304,9 +337,9 @@ def import_versions(reset=False, predicate=None):
if predicate:
"""
Suggested usage:
predicate = lambda x: x["urn"] == "urn:cts:greekLit:tlg0012.tlg001.parrish-eng1:"
predicate = lambda x: x["urn"].count("urn:cts:greekLit:tlg0012.tlg001.parrish-eng1:")
from scaife_viewer.atlas.importers.versions import *
import_versions(predicate=predicate)
import_versions(predicate=predicate, partial_ingestion=True)
"""
to_ingest = filter(predicate, to_ingest)

@@ -318,7 +351,13 @@ def import_versions(reset=False, predicate=None):
# counter from tqdm would be more useful.
with tqdm() as pbar:
for version_data in to_ingest:
importer = importer_class(library, version_data, nodes, lookup)
importer = importer_class(
library,
version_data,
nodes,
lookup,
partial_ingestion=partial_ingestion,
)
deferred_nodes = importer.apply()

to_defer.extend(deferred_nodes)
@@ -331,3 +370,56 @@ def import_versions(reset=False, predicate=None):
logger.info("Inserting Node tree")
chunked_bulk_create(Node, to_defer)
logger.info(f"{Node.objects.count()} total nodes on the tree.")


def reset_nodes(version_urn, fast_reset=False):
# FIXME: Remove customizations from Node so we can use default queryset methods?
nodes = Node.objects.filter(urn__startswith=version_urn).filter(numchild=0)
if not nodes:
return

parent = nodes.first().get_parent()

# NOTE: fast_reset doesn't work because of ForeignKey cascade issues
if fast_reset:
nodes._raw_delete(using=settings.SV_ATLAS_DB_LABEL)
else:
chunked_bulk_delete(nodes)

parent.numchild = parent.get_children().count()
parent.save()
return


def test_partial_ingestion(version_urn):
"""
Usage:
from scaife_viewer.atlas.importers.versions import test_partial_ingestion
test_partial_ingestion("urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:")
"""
print(f"Resetting nodes: {version_urn}")
reset_nodes(version_urn, fast_reset=False)
print("Done")

def predicate(obj):
return obj["urn"].count(version_urn)

print(f"Ingesting nodes matching {version_urn}")
import_versions(reset=False, predicate=predicate, partial_ingestion=True)


def test_partial_tokenizer(version_urn):
"""
Usage:
from scaife_viewer.atlas.importers.versions import test_partial_tokenizer
test_partial_tokenizer("urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:")
"""

assert (
Token.objects.filter(text_part__urn__startswith=version_urn).exists() is False
)

tokenize_text_parts_parallel([version_urn])
assert Token.objects.filter(text_part__urn__startswith=version_urn).exists()
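
Taken together, the two test helpers above suggest a re-ingestion workflow for a single version; a hedged sketch combining them with the parallel tokenizer (the URN value is illustrative):

    from scaife_viewer.atlas.importers.versions import import_versions, reset_nodes
    from scaife_viewer.atlas.parallel_tokenizers import tokenize_text_parts_parallel

    version_urn = "urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:"

    # Remove the version's leaf nodes; the fast path is avoided because of the
    # ForeignKey cascade issue noted in reset_nodes.
    reset_nodes(version_urn, fast_reset=False)

    # Re-ingest only the matching version, reusing work-part nodes already in
    # the database via partial_ingestion.
    import_versions(
        reset=False,
        predicate=lambda data: data["urn"].count(version_urn),
        partial_ingestion=True,
    )

    # Re-tokenize the freshly ingested text parts in parallel.
    tokenize_text_parts_parallel([version_urn])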
16 changes: 16 additions & 0 deletions atlas/scaife_viewer/atlas/managers.py
@@ -0,0 +1,16 @@
from django.db import models


class NodeManager(models.Manager):
"""
Overrides MP_NodeManager's custom delete method.
This is needed because we aren't setting `numchild`, so
the custom delete method fails.
FIXME: Remove overrides
"""

def get_queryset(self):
queryset = super().get_queryset()
return queryset.order_by("path")
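
A hypothetical sketch of how this manager might be attached to the treebeard-backed Node model; the real wiring lives in scaife_viewer.atlas.models and is not part of this diff:

    from treebeard.mp_tree import MP_Node

    from scaife_viewer.atlas.managers import NodeManager

    class NodeSketch(MP_Node):
        # A plain models.Manager ordered by "path" sidesteps MP_NodeManager's
        # delete handling, which assumes numchild is maintained (see the FIXME).
        objects = NodeManager()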
