
Commit

Merge pull request #68 from scaife-viewer/atlas/tokenizer-hookset
Improve text part ingestion, tokenizers and text annotations
jacobwegner authored Jan 3, 2024
2 parents e49994c + 2f511ed commit 57b4c73
Showing 14 changed files with 366 additions and 57 deletions.
Empty file modified: atlas/makemigrations.py (mode changed 100644 → 100755)
8 changes: 7 additions & 1 deletion atlas/scaife_viewer/atlas/constants.py
@@ -84,8 +84,9 @@
]

HUMAN_FRIENDLY_LANGUAGE_MAP = {
"eng": "English",
"ang": "English, Old (ca.450-1100)",
"ara": "Arabic",
"eng": "English",
"fa": "Farsi",
"far": "Farsi",
"fre": "French",
@@ -95,9 +96,14 @@
"lat": "Latin",
}

# TODO: Reconsider default
TEXT_ANNOTATION_KIND_SCHOLIA = "scholia"
TEXT_ANNOTATION_KIND_SYNTAX_TREE = "syntax-tree"
# TODO: Refactor as textual notes
# TODO: Reconsider this pattern
TEXT_ANNOTATION_KIND_COMMENTARY = "commentary"
TEXT_ANNOTATION_KIND_CHOICES = (
(TEXT_ANNOTATION_KIND_SCHOLIA, "Scholia"),
(TEXT_ANNOTATION_KIND_SYNTAX_TREE, "Syntax tree"),
(TEXT_ANNOTATION_KIND_COMMENTARY, "Commentary"),
)
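
A minimal sketch of how the new commentary kind might be queried, assuming the TextAnnotation model (not shown in this diff) exposes a kind field matching the kind= keyword the importer below passes:

    from scaife_viewer.atlas import constants
    from scaife_viewer.atlas.models import TextAnnotation

    # Select only the annotations ingested as commentaries; the field name
    # "kind" is an assumption based on the importer's kind= argument.
    commentaries = TextAnnotation.objects.filter(
        kind=constants.TEXT_ANNOTATION_KIND_COMMENTARY
    )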
35 changes: 34 additions & 1 deletion atlas/scaife_viewer/atlas/hooks.py
@@ -136,10 +136,38 @@ def run_ingestion_pipeline(self, outf):

return run_ingestion_pipeline(outf)

def get_token_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"token-annotations",
)

def isdir(path):
return path.is_dir()

return _get_annotation_paths(path, predicate=isdir)

def get_text_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(settings.SV_ATLAS_DATA_DIR, "annotations", "text-annotations",)
path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"text-annotations",
)
return _get_annotation_paths(path)

def get_commentary_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"commentaries",
)
return _get_annotation_paths(path)

def get_syntax_tree_annotation_paths(self):
Expand All @@ -163,6 +191,11 @@ def get_dictionary_annotation_paths(self):
predicate = lambda x: x.suffix == ".json" or x.is_dir() # noqa
return _get_annotation_paths(path, predicate=predicate)

def get_prepared_tokens(self, version_urn):
from .parallel_tokenizers import prepare_tokens

return prepare_tokens(version_urn)


class HookProxy:
def __getattr__(self, attr):
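
For context, a hypothetical sketch of what the _get_annotation_paths helper called above might look like; the actual helper in scaife_viewer.atlas.hooks is not part of this diff and may differ:

    from pathlib import Path

    def _get_annotation_paths(path, predicate=None):
        # Missing annotation directories yield no paths rather than erroring.
        if not path.exists():
            return []
        paths = []
        for entry in sorted(path.iterdir()):
            # The optional predicate narrows results, e.g. directories only for
            # token annotations, or ".json" files / directories for dictionaries.
            if predicate is None or predicate(entry):
                paths.append(entry)
        return paths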
10 changes: 10 additions & 0 deletions atlas/scaife_viewer/atlas/importers/text_annotations.py
@@ -7,6 +7,7 @@
from ..constants import (
TEXT_ANNOTATION_KIND_SCHOLIA,
TEXT_ANNOTATION_KIND_SYNTAX_TREE,
TEXT_ANNOTATION_KIND_COMMENTARY,
)
from ..hooks import hookset
from ..models import Node, TextAnnotation
@@ -84,6 +85,7 @@ def _resolve_text_annotation_text_parts(qs):
chunked_bulk_create(TextAnnotationThroughModel, prepared_objs)


# TODO: Break this part into individual pipelines
def import_text_annotations(reset=False):
if reset:
TextAnnotation.objects.all().delete()
@@ -98,6 +100,14 @@ def import_text_annotations(reset=False):
_prepare_text_annotations(path, counters, kind=TEXT_ANNOTATION_KIND_SCHOLIA)
)

commentary_annotation_paths = hookset.get_commentary_annotation_paths()
for path in commentary_annotation_paths:
to_create.extend(
_prepare_text_annotations(
path, counters, kind=TEXT_ANNOTATION_KIND_COMMENTARY
)
)

syntax_tree_annotation_paths = hookset.get_syntax_tree_annotation_paths()
for path in syntax_tree_annotation_paths:
to_create.extend(
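
A hedged usage sketch: with commentary files placed under SV_ATLAS_DATA_DIR/annotations/commentaries (the default location supplied by get_commentary_annotation_paths above), the importer can be re-run from a shell; a deployed project may route this through a management command instead:

    from scaife_viewer.atlas.importers.text_annotations import import_text_annotations

    # reset=True drops existing TextAnnotation rows; scholia, commentaries and
    # syntax trees are then re-ingested from the hookset-provided paths.
    import_text_annotations(reset=True)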
38 changes: 15 additions & 23 deletions atlas/scaife_viewer/atlas/importers/token_annotations.py
@@ -1,35 +1,21 @@
import csv
import os
import re
from pathlib import Path

import yaml

from scaife_viewer.atlas.conf import settings

from ..hooks import hookset
from ..models import Node, Token, TokenAnnotation, TokenAnnotationCollection


ANNOTATIONS_DATA_PATH = Path(
settings.SV_ATLAS_DATA_DIR, "annotations", "token-annotations"
)


VE_REF_PATTTERN = re.compile(r"(?P<ref>.*).t(?P<token>.*)")


def get_paths():
if not os.path.exists(ANNOTATIONS_DATA_PATH):
return []
for path in ANNOTATIONS_DATA_PATH.iterdir():
if not path.is_dir():
continue
yield path


def resolve_version(path):
versionish = f'{os.path.basename(path).split(".csv")[0]}:'
return Node.objects.filter(urn__endswith=versionish).get()
version_obj = Node.objects.filter(urn__endswith=versionish).first()
if not version_obj:
print(f'Could not resolve version for {path.name} [urn="{versionish}"]')
return version_obj


def extract_ref_and_token_position(row):
@@ -81,7 +67,11 @@ def create_token_annotations(collection, version, lookup, refs):
continue

to_create.append(
TokenAnnotation(token=token, data=data, collection=collection,)
TokenAnnotation(
token=token,
data=data,
collection=collection,
)
)
return len(TokenAnnotation.objects.bulk_create(to_create))

@@ -92,9 +82,9 @@ def apply_token_annotations(reset=True):
want to revisit how this entire extraction works in the future
"""

paths = get_paths()
paths = hookset.get_token_annotation_paths()
for path in paths:
metadata_path = Path(path, "metadata.yml")
metadata_path = path / "metadata.yml"
collection = yaml.safe_load(metadata_path.open())

if reset:
@@ -105,10 +95,12 @@
if not values or not values.endswith("csv"):
continue

values_path = Path(path, values)
values_path = path / values
lookup, refs = extract_lookup_and_refs(values_path)
# TODO: Move this to metadata and or values
version = resolve_version(values_path)
if not version:
continue

# TODO: Set attribution information
metadata = collection.pop("metadata", {})
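
A usage sketch under an assumed data layout: each directory returned by get_token_annotation_paths holds a metadata.yml describing the collection plus a CSV of per-token values whose filename encodes the version URN (directory and file names below are illustrative):

    # <SV_ATLAS_DATA_DIR>/annotations/token-annotations/
    #     my-collection/
    #         metadata.yml                       # collection metadata; references the CSV
    #         tlg0012.tlg001.perseus-grc2.csv    # rows keyed by ve_ref, e.g. "1.1.t1"
    from scaife_viewer.atlas.importers.token_annotations import apply_token_annotations

    # reset=True deletes each collection's existing annotations before re-creating
    # them; versions that cannot be resolved are now skipped instead of raising.
    apply_token_annotations(reset=True)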
108 changes: 100 additions & 8 deletions atlas/scaife_viewer/atlas/importers/versions.py
@@ -1,18 +1,24 @@
import logging
from collections import defaultdict

from django.conf import settings
from django.db import IntegrityError
from django.utils.translation import ugettext_noop

from tqdm import tqdm
from treebeard.exceptions import PathOverflow

from scaife_viewer.atlas import constants
from scaife_viewer.atlas.parallel_tokenizers import tokenize_text_parts_parallel

from ..hooks import hookset
from ..models import Node
from ..models import Node, Token
from ..urn import URN
from ..utils import chunked_bulk_create, get_lowest_citable_depth
from ..utils import (
chunked_bulk_create,
chunked_bulk_delete,
get_lowest_citable_depth,
)


logger = logging.getLogger(__name__)
@@ -45,14 +51,20 @@ class CTSImporter:
CTS_URN_SCHEME_EXEMPLAR = constants.CTS_URN_NODES

def __init__(
self, library, version_data, nodes=dict(), node_last_child_lookup=None
self,
library,
version_data,
nodes=dict(),
node_last_child_lookup=None,
partial_ingestion=False,
):
self.library = library
self.version_data = version_data
self.nodes = nodes
# TODO: Decouple "version_data" further
self.urn = URN(self.version_data["urn"].strip())
self.work_urn = self.urn.up_to(self.urn.WORK)
self.partial_ingestion = partial_ingestion

try:
label = get_first_value_for_language(version_data["label"], "eng")
@@ -191,6 +203,13 @@ def generate_node(self, idx, node_data, parent_urn):
return Node.objects.get(urn=node_data["urn"])
parent = self.nodes.get(parent_urn)
if USE_BULK_INGESTION:
# NOTE: parent.pk will _only_ be populated if invoked with the
# partial_ingestion flag
if self.partial_ingestion and parent.pk and parent.depth < 4:
# If parent has a database id, and is a workpart,
# we should also add the child to the database.
# Typically this is going to be a work or version.
return self.add_child(parent, node_data)
return self.add_child_bulk(parent, node_data)
return self.add_child(parent, node_data)

@@ -243,6 +262,15 @@ def extract_urn_and_text_content(self, line):
urn = f"{self.urn}{ref}"
return URN(urn), text_content

def get_or_generate_node(self, idx, node_data, parent_urn):
try:
node = Node.objects.get(urn=node_data["urn"])
print(f'retrieved existing node from db: {node_data["urn"]}')
except Node.DoesNotExist:
node = self.generate_node(idx, node_data, parent_urn)
print(f'inserted node: {node_data["urn"]}')
return node

def generate_branch(self, urn=None, line=None):
if line:
node_urn, text_content = self.extract_urn_and_text_content(line)
@@ -258,7 +286,10 @@ def generate_branch(self, urn=None, line=None):
if node is None:
node_data.update({"idx": self.get_node_idx(node_data)})
parent_urn = self.get_parent_urn(idx, branch_data)
node = self.generate_node(idx, node_data, parent_urn)
if idx < 4 and self.partial_ingestion:
node = self.get_or_generate_node(idx, node_data, parent_urn)
else:
node = self.generate_node(idx, node_data, parent_urn)
self.nodes[node_data["urn"]] = node

def apply(self):
@@ -288,7 +319,9 @@ def get_first_value_for_language(values, lang, fallback=True):
return value.get("value")


def import_versions(reset=False, predicate=None):
# TODO: Determine best signature; do we decouple partial_ingestion or
# infer it based on the predicate?
def import_versions(reset=False, predicate=None, partial_ingestion=False):
if reset:
Node.objects.filter(kind="nid").delete()
# TODO: Wire up logging
@@ -304,9 +337,9 @@ def import_versions(reset=False, predicate=None):
if predicate:
"""
Suggested usage:
predicate = lambda x: x["urn"] == "urn:cts:greekLit:tlg0012.tlg001.parrish-eng1:"
predicate = lambda x: x["urn"].count("urn:cts:greekLit:tlg0012.tlg001.parrish-eng1:")
from scaife_viewer.atlas.importers.versions import *
import_versions(predicate=predicate)
import_versions(predicate=predicate, partial_ingestion=True)
"""
to_ingest = filter(predicate, to_ingest)

@@ -318,7 +351,13 @@ def import_versions(reset=False, predicate=None):
# counter from tqdm would be more useful.
with tqdm() as pbar:
for version_data in to_ingest:
importer = importer_class(library, version_data, nodes, lookup)
importer = importer_class(
library,
version_data,
nodes,
lookup,
partial_ingestion=partial_ingestion,
)
deferred_nodes = importer.apply()

to_defer.extend(deferred_nodes)
@@ -331,3 +370,56 @@ def import_versions(reset=False, predicate=None):
logger.info("Inserting Node tree")
chunked_bulk_create(Node, to_defer)
logger.info(f"{Node.objects.count()} total nodes on the tree.")


def reset_nodes(version_urn, fast_reset=False):
# FIXME: Remove customizations from Node so we can use default queryset methods?
nodes = Node.objects.filter(urn__startswith=version_urn).filter(numchild=0)
if not nodes:
return

parent = nodes.first().get_parent()

# NOTE: fast_reset doesn't work because of ForeignKey cascade issues
if fast_reset:
nodes._raw_delete(using=settings.SV_ATLAS_DB_LABEL)
else:
chunked_bulk_delete(nodes)

parent.numchild = parent.get_children().count()
parent.save()
return


def test_partial_ingestion(version_urn):
"""
Usage:
from scaife_viewer.atlas.importers.versions import test_partial_ingestion
test_partial_ingestion("urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:")
"""
print(f"Resetting nodes: {version_urn}")
reset_nodes(version_urn, fast_reset=False)
print("Done")

def predicate(obj):
return obj["urn"].count(version_urn)

print(f"Ingesting nodes matching {version_urn}")
import_versions(reset=False, predicate=predicate, partial_ingestion=True)


def test_partial_tokenizer(version_urn):
"""
Usage:
from scaife_viewer.atlas.importers.versions import test_partial_tokenizer
test_partial_tokenizer("urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:")
"""

assert (
Token.objects.filter(text_part__urn__startswith=version_urn).exists() is False
)

tokenize_text_parts_parallel([version_urn])
assert Token.objects.filter(text_part__urn__startswith=version_urn).exists()
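
Taken together, the two test helpers above suggest a re-ingestion workflow for a single version; a hedged sketch combining them with the parallel tokenizer (the URN value is illustrative):

    from scaife_viewer.atlas.importers.versions import import_versions, reset_nodes
    from scaife_viewer.atlas.parallel_tokenizers import tokenize_text_parts_parallel

    version_urn = "urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:"

    # Remove the version's leaf nodes; the fast path is avoided because of the
    # ForeignKey cascade issue noted in reset_nodes.
    reset_nodes(version_urn, fast_reset=False)

    # Re-ingest only the matching version, reusing work-part nodes already in
    # the database via partial_ingestion.
    import_versions(
        reset=False,
        predicate=lambda data: data["urn"].count(version_urn),
        partial_ingestion=True,
    )

    # Re-tokenize the freshly ingested text parts in parallel.
    tokenize_text_parts_parallel([version_urn])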
16 changes: 16 additions & 0 deletions atlas/scaife_viewer/atlas/managers.py
@@ -0,0 +1,16 @@
from django.db import models


class NodeManager(models.Manager):
"""
Overrides MP_NodeManager's custom delete method.
This is needed because we aren't setting `numchild`, so
the custom delete method fails.
FIXME: Remove overrides
"""

def get_queryset(self):
queryset = super().get_queryset()
return queryset.order_by("path")
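
A hypothetical sketch of how this manager might be attached to the treebeard-backed Node model; the real wiring lives in scaife_viewer.atlas.models and is not part of this diff:

    from treebeard.mp_tree import MP_Node

    from scaife_viewer.atlas.managers import NodeManager

    class NodeSketch(MP_Node):
        # A plain models.Manager ordered by "path" sidesteps MP_NodeManager's
        # delete handling, which assumes numchild is maintained (see the FIXME).
        objects = NodeManager()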
