Improve text part ingestion, tokenizers and text annotations #68

Merged · Jan 3, 2024 · 27 commits

Commits
f4b4ecf
Initial pass at partial ingestion
jacobwegner Jun 21, 2023
e324cef
Add helper function to re-tokenize a text
jacobwegner Jun 21, 2023
5b3a269
Remove unused kwarg from tokenize_text_parts
jacobwegner Jun 21, 2023
946f11a
Add Arabic language mapping
jacobwegner Jun 22, 2023
10c1c6f
Fix tokenizer signature
jacobwegner Jun 22, 2023
6dbb5e7
Add prepare_version_tokens hook
jacobwegner Jul 13, 2023
32c97ba
Make makemigrations executable
jacobwegner Jul 18, 2023
4c42051
Add punctuation and space_after to Token
jacobwegner Jul 18, 2023
516f921
Apply linting fix
jacobwegner Jul 18, 2023
2d19a63
Remove punctuation field
jacobwegner Jul 18, 2023
3bbc893
Ensure default tokenizer populates space_after
jacobwegner Jul 18, 2023
3b53871
Rename to get_prepared_tokens
jacobwegner Jul 18, 2023
552ee49
Add helper to retrieve tokens from a CSV
jacobwegner Jul 18, 2023
1f636d9
Merge branch 'main' into atlas/tokenizer-hookset
jacobwegner Aug 30, 2023
bb463c8
Override Node manager to support bulk delete
jacobwegner Sep 1, 2023
5a8508d
Factor out tokenize_text_parts_parallel for use outside of pipelines
jacobwegner Sep 1, 2023
9e301a2
Update test_partial_ingestion and add test_partial_tokenizer
jacobwegner Sep 1, 2023
6f5feb3
Refactor CREATE_UPDATE_DELETE_BATCH_SIZE and add chunked_bulk_update
jacobwegner Sep 1, 2023
07b033c
Handle a few edge cases when not all data is ingested
jacobwegner Sep 8, 2023
ecaa76b
Ensure ATLAS "raw" database options target proper database
jacobwegner Sep 8, 2023
ec8866c
Handle edge cases in tokenizer if not all versions are being tokenized
jacobwegner Sep 8, 2023
58ed409
Factor token annotation paths out to hookset
jacobwegner Sep 12, 2023
608d7e6
Merge branch 'main' into atlas/tokenizer-hookset
jacobwegner Oct 24, 2023
06bb787
Add merge migration
jacobwegner Oct 24, 2023
1c55e53
fix: separate scholia from commentary text annotations
jacobwegner Nov 7, 2023
17bb7d5
Attach text annotations to ImageROI
jacobwegner Nov 7, 2023
2f511ed
Merge branch 'main' into atlas/tokenizer-hookset
jacobwegner Dec 28, 2023
Empty file modified atlas/makemigrations.py
100644 → 100755
Empty file.
8 changes: 7 additions & 1 deletion atlas/scaife_viewer/atlas/constants.py
@@ -84,8 +84,9 @@
]

HUMAN_FRIENDLY_LANGUAGE_MAP = {
"eng": "English",
"ang": "English, Old (ca.450-1100)",
"ara": "Arabic",
"eng": "English",
"fa": "Farsi",
"far": "Farsi",
"fre": "French",
@@ -95,9 +96,14 @@
"lat": "Latin",
}

# TODO: Reconsider default
TEXT_ANNOTATION_KIND_SCHOLIA = "scholia"
TEXT_ANNOTATION_KIND_SYNTAX_TREE = "syntax-tree"
# TODO: Refactor as textual notes
# TODO: Reconsider this pattern
TEXT_ANNOTATION_KIND_COMMENTARY = "commentary"
TEXT_ANNOTATION_KIND_CHOICES = (
(TEXT_ANNOTATION_KIND_SCHOLIA, "Scholia"),
(TEXT_ANNOTATION_KIND_SYNTAX_TREE, "Syntax tree"),
(TEXT_ANNOTATION_KIND_COMMENTARY, "Commentary"),
)
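
The new commentary kind lets downstream queries separate scholia from commentary notes. A minimal sketch, assuming the TextAnnotation model stores these choice values on a field named kind (the field name is not shown in this diff):

from scaife_viewer.atlas.constants import (
    TEXT_ANNOTATION_KIND_COMMENTARY,
    TEXT_ANNOTATION_KIND_SCHOLIA,
)
from scaife_viewer.atlas.models import TextAnnotation

# Commentary entries are now queryable independently of scholia.
commentaries = TextAnnotation.objects.filter(kind=TEXT_ANNOTATION_KIND_COMMENTARY)
scholia = TextAnnotation.objects.filter(kind=TEXT_ANNOTATION_KIND_SCHOLIA)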
35 changes: 34 additions & 1 deletion atlas/scaife_viewer/atlas/hooks.py
@@ -136,10 +136,38 @@ def run_ingestion_pipeline(self, outf):

return run_ingestion_pipeline(outf)

def get_token_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"token-annotations",
)

def isdir(path):
return path.is_dir()

return _get_annotation_paths(path, predicate=isdir)

def get_text_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(settings.SV_ATLAS_DATA_DIR, "annotations", "text-annotations",)
path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"text-annotations",
)
return _get_annotation_paths(path)

def get_commentary_annotation_paths(self):
from .conf import settings # noqa; avoids race condition

path = Path(
settings.SV_ATLAS_DATA_DIR,
"annotations",
"commentaries",
)
return _get_annotation_paths(path)

def get_syntax_tree_annotation_paths(self):
@@ -163,6 +191,11 @@ def get_dictionary_annotation_paths(self):
predicate = lambda x: x.suffix == ".json" or x.is_dir() # noqa
return _get_annotation_paths(path, predicate=predicate)

def get_prepared_tokens(self, version_urn):
from .parallel_tokenizers import prepare_tokens

return prepare_tokens(version_urn)


class HookProxy:
def __getattr__(self, attr):
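
These hooks make the annotation directories and token preparation pluggable per project. A minimal sketch of a downstream override; the base class name DefaultHookSet is an assumption (the class declaration is outside this hunk), and the commentary directory and .json filter are illustrative only:

from pathlib import Path

from scaife_viewer.atlas.hooks import DefaultHookSet  # assumed base class name


class MyProjectHookSet(DefaultHookSet):
    def get_commentary_annotation_paths(self):
        # Point commentary ingestion at a project-specific directory instead of
        # <SV_ATLAS_DATA_DIR>/annotations/commentaries.
        root = Path("/srv/data/my-commentaries")
        if not root.exists():
            return []
        return [path for path in root.iterdir() if path.suffix == ".json"]

    def get_prepared_tokens(self, version_urn):
        # Delegate to the default implementation, which calls
        # parallel_tokenizers.prepare_tokens for the given version.
        print(f"preparing tokens for {version_urn}")
        return super().get_prepared_tokens(version_urn)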
10 changes: 10 additions & 0 deletions atlas/scaife_viewer/atlas/importers/text_annotations.py
@@ -7,6 +7,7 @@
from ..constants import (
TEXT_ANNOTATION_KIND_SCHOLIA,
TEXT_ANNOTATION_KIND_SYNTAX_TREE,
TEXT_ANNOTATION_KIND_COMMENTARY,
)
from ..hooks import hookset
from ..models import Node, TextAnnotation
@@ -84,6 +85,7 @@ def _resolve_text_annotation_text_parts(qs):
chunked_bulk_create(TextAnnotationThroughModel, prepared_objs)


# TODO: Break this part into individual pipelines
def import_text_annotations(reset=False):
if reset:
TextAnnotation.objects.all().delete()
@@ -98,6 +100,14 @@ def import_text_annotations(reset=False):
_prepare_text_annotations(path, counters, kind=TEXT_ANNOTATION_KIND_SCHOLIA)
)

commentary_annotation_paths = hookset.get_commentary_annotation_paths()
for path in commentary_annotation_paths:
to_create.extend(
_prepare_text_annotations(
path, counters, kind=TEXT_ANNOTATION_KIND_COMMENTARY
)
)

syntax_tree_annotation_paths = hookset.get_syntax_tree_annotation_paths()
for path in syntax_tree_annotation_paths:
to_create.extend(
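
With the commentary hook wired in, one importer run now ingests scholia, commentary, and syntax-tree annotations from their respective hook-provided paths. A usage sketch; the directory-to-kind mapping is inferred from the default hooks above and the "separate scholia from commentary" commit:

# $SV_ATLAS_DATA_DIR/annotations/text-annotations/  -> kind="scholia"
# $SV_ATLAS_DATA_DIR/annotations/commentaries/      -> kind="commentary"
from scaife_viewer.atlas.importers.text_annotations import import_text_annotations

# reset=True deletes existing TextAnnotation rows before re-ingesting all kinds.
import_text_annotations(reset=True)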
38 changes: 15 additions & 23 deletions atlas/scaife_viewer/atlas/importers/token_annotations.py
@@ -1,35 +1,21 @@
import csv
import os
import re
from pathlib import Path

import yaml

from scaife_viewer.atlas.conf import settings

from ..hooks import hookset
from ..models import Node, Token, TokenAnnotation, TokenAnnotationCollection


ANNOTATIONS_DATA_PATH = Path(
settings.SV_ATLAS_DATA_DIR, "annotations", "token-annotations"
)


VE_REF_PATTTERN = re.compile(r"(?P<ref>.*).t(?P<token>.*)")


def get_paths():
if not os.path.exists(ANNOTATIONS_DATA_PATH):
return []
for path in ANNOTATIONS_DATA_PATH.iterdir():
if not path.is_dir():
continue
yield path


def resolve_version(path):
versionish = f'{os.path.basename(path).split(".csv")[0]}:'
return Node.objects.filter(urn__endswith=versionish).get()
version_obj = Node.objects.filter(urn__endswith=versionish).first()
if not version_obj:
print(f'Could not resolve version for {path.name} [urn="{versionish}"]')
return version_obj


def extract_ref_and_token_position(row):
@@ -81,7 +67,11 @@ def create_token_annotations(collection, version, lookup, refs):
continue

to_create.append(
TokenAnnotation(token=token, data=data, collection=collection,)
TokenAnnotation(
token=token,
data=data,
collection=collection,
)
)
return len(TokenAnnotation.objects.bulk_create(to_create))

@@ -92,9 +82,9 @@ def apply_token_annotations(reset=True):
want to revisit how this entire extraction works in the future
"""

paths = get_paths()
paths = hookset.get_token_annotation_paths()
for path in paths:
metadata_path = Path(path, "metadata.yml")
metadata_path = path / "metadata.yml"
collection = yaml.safe_load(metadata_path.open())

if reset:
@@ -105,10 +95,12 @@
if not values or not values.endswith("csv"):
continue

values_path = Path(path, values)
values_path = path / values
lookup, refs = extract_lookup_and_refs(values_path)
# TODO: Move this to metadata and or values
version = resolve_version(values_path)
if not version:
continue

# TODO: Set attribution information
metadata = collection.pop("metadata", {})
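
The token-annotation importer now resolves its source directories through the hookset and skips any collection whose version URN cannot be matched, instead of raising Node.DoesNotExist. A rough sketch of one collection directory and the re-import call; metadata.yml details beyond the CSV reference are assumptions inferred from the loop above:

# <token-annotations>/<collection-dir>/
#   metadata.yml                        # loaded with yaml.safe_load; values reference CSV files
#   tlg0012.tlg001.perseus-grc2.csv     # basename (minus ".csv") must end a version URN
from scaife_viewer.atlas.importers.token_annotations import apply_token_annotations

# reset defaults to True (the reset branch body is outside this hunk).
apply_token_annotations(reset=True)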
108 changes: 100 additions & 8 deletions atlas/scaife_viewer/atlas/importers/versions.py
@@ -1,18 +1,24 @@
import logging
from collections import defaultdict

from django.conf import settings
from django.db import IntegrityError
from django.utils.translation import ugettext_noop

from tqdm import tqdm
from treebeard.exceptions import PathOverflow

from scaife_viewer.atlas import constants
from scaife_viewer.atlas.parallel_tokenizers import tokenize_text_parts_parallel

from ..hooks import hookset
from ..models import Node
from ..models import Node, Token
from ..urn import URN
from ..utils import chunked_bulk_create, get_lowest_citable_depth
from ..utils import (
chunked_bulk_create,
chunked_bulk_delete,
get_lowest_citable_depth,
)


logger = logging.getLogger(__name__)
@@ -45,14 +51,20 @@ class CTSImporter:
CTS_URN_SCHEME_EXEMPLAR = constants.CTS_URN_NODES

def __init__(
self, library, version_data, nodes=dict(), node_last_child_lookup=None
self,
library,
version_data,
nodes=dict(),
node_last_child_lookup=None,
partial_ingestion=False,
):
self.library = library
self.version_data = version_data
self.nodes = nodes
# TODO: Decouple "version_data" further
self.urn = URN(self.version_data["urn"].strip())
self.work_urn = self.urn.up_to(self.urn.WORK)
self.partial_ingestion = partial_ingestion

try:
label = get_first_value_for_language(version_data["label"], "eng")
@@ -191,6 +203,13 @@ def generate_node(self, idx, node_data, parent_urn):
return Node.objects.get(urn=node_data["urn"])
parent = self.nodes.get(parent_urn)
if USE_BULK_INGESTION:
# NOTE: parent.pk will _only_ be populated if invoked with the
# partial_ingestion flag
if self.partial_ingestion and parent.pk and parent.depth < 4:
# If parent has a database id, and is a workpart,
# we should also add the child to the database.
# Typically this is going to be a work or version.
return self.add_child(parent, node_data)
return self.add_child_bulk(parent, node_data)
return self.add_child(parent, node_data)

@@ -243,6 +262,15 @@ def extract_urn_and_text_content(self, line):
urn = f"{self.urn}{ref}"
return URN(urn), text_content

def get_or_generate_node(self, idx, node_data, parent_urn):
try:
node = Node.objects.get(urn=node_data["urn"])
print(f'retrieved existing node from db: {node_data["urn"]}')
except Node.DoesNotExist:
node = self.generate_node(idx, node_data, parent_urn)
print(f'inserted node: {node_data["urn"]}')
return node

def generate_branch(self, urn=None, line=None):
if line:
node_urn, text_content = self.extract_urn_and_text_content(line)
@@ -258,7 +286,10 @@ def generate_branch(self, urn=None, line=None):
if node is None:
node_data.update({"idx": self.get_node_idx(node_data)})
parent_urn = self.get_parent_urn(idx, branch_data)
node = self.generate_node(idx, node_data, parent_urn)
if idx < 4 and self.partial_ingestion:
node = self.get_or_generate_node(idx, node_data, parent_urn)
else:
node = self.generate_node(idx, node_data, parent_urn)
self.nodes[node_data["urn"]] = node

def apply(self):
@@ -288,7 +319,9 @@ def get_first_value_for_language(values, lang, fallback=True):
return value.get("value")


def import_versions(reset=False, predicate=None):
# TODO: Determine best signature; do we decouple partial_ingestion or
# infer it based on the predicate?
def import_versions(reset=False, predicate=None, partial_ingestion=False):
if reset:
Node.objects.filter(kind="nid").delete()
# TODO: Wire up logging
@@ -304,9 +337,9 @@ def import_versions(reset=False, predicate=None):
if predicate:
"""
Suggested usage:
predicate = lambda x: x["urn"] == "urn:cts:greekLit:tlg0012.tlg001.parrish-eng1:"
predicate = lambda x: x["urn"].count("urn:cts:greekLit:tlg0012.tlg001.parrish-eng1:")
from scaife_viewer.atlas.importers.versions import *
import_versions(predicate=predicate)
import_versions(predicate=predicate, partial_ingestion=True)
"""
to_ingest = filter(predicate, to_ingest)

@@ -318,7 +351,13 @@
# counter from tqdm would be more useful.
with tqdm() as pbar:
for version_data in to_ingest:
importer = importer_class(library, version_data, nodes, lookup)
importer = importer_class(
library,
version_data,
nodes,
lookup,
partial_ingestion=partial_ingestion,
)
deferred_nodes = importer.apply()

to_defer.extend(deferred_nodes)
@@ -331,3 +370,56 @@
logger.info("Inserting Node tree")
chunked_bulk_create(Node, to_defer)
logger.info(f"{Node.objects.count()} total nodes on the tree.")


def reset_nodes(version_urn, fast_reset=False):
# FIXME: Remove customizations from Node so we can use default queryset methods?
nodes = Node.objects.filter(urn__startswith=version_urn).filter(numchild=0)
if not nodes:
return

parent = nodes.first().get_parent()

# NOTE: fast_reset doesn't work because of ForeignKey cascade issues
if fast_reset:
nodes._raw_delete(using=settings.SV_ATLAS_DB_LABEL)
else:
chunked_bulk_delete(nodes)

parent.numchild = parent.get_children().count()
parent.save()
return


def test_partial_ingestion(version_urn):
"""
Usage:

from scaife_viewer.atlas.importers.versions import test_partial_ingestion
test_partial_ingestion("urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:")
"""
print(f"Resetting nodes: {version_urn}")
reset_nodes(version_urn, fast_reset=False)
print("Done")

def predicate(obj):
return obj["urn"].count(version_urn)

print(f"Ingesting nodes matching {version_urn}")
import_versions(reset=False, predicate=predicate, partial_ingestion=True)


def test_partial_tokenizer(version_urn):
"""
Usage:

from scaife_viewer.atlas.importers.versions import test_partial_tokenizer
test_partial_tokenizer("urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:")
"""

assert (
Token.objects.filter(text_part__urn__startswith=version_urn).exists() is False
)

tokenize_text_parts_parallel([version_urn])
assert Token.objects.filter(text_part__urn__startswith=version_urn).exists()
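
Taken together, the helpers above support re-ingesting and re-tokenizing a single version in place. A combined sketch assembled from the docstrings above; this is not a documented entry point, just the same calls in sequence:

from scaife_viewer.atlas.importers.versions import import_versions, reset_nodes
from scaife_viewer.atlas.parallel_tokenizers import tokenize_text_parts_parallel

VERSION_URN = "urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:"

# 1. Drop the version's existing leaf nodes (fast_reset stays off because of
#    the ForeignKey cascade issue noted in reset_nodes).
reset_nodes(VERSION_URN, fast_reset=False)

# 2. Re-ingest only the matching version, reusing ancestor nodes already in
#    the database via partial_ingestion.
import_versions(
    reset=False,
    predicate=lambda data: data["urn"].count(VERSION_URN),
    partial_ingestion=True,
)

# 3. Rebuild tokens for just this version.
tokenize_text_parts_parallel([VERSION_URN])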
16 changes: 16 additions & 0 deletions atlas/scaife_viewer/atlas/managers.py
@@ -0,0 +1,16 @@
from django.db import models


class NodeManager(models.Manager):
"""
Overrides MP_NodeManager's custom delete method.

This is needed because we aren't setting `numchild`, so
the custom delete method fails.

FIXME: Remove overrides
"""

def get_queryset(self):
queryset = super().get_queryset()
return queryset.order_by("path")
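
Per the "Override Node manager to support bulk delete" commit, this manager presumably replaces MP_NodeManager on Node (the models.py wiring is not part of this diff). The practical effect is that filtered deletes fall back to Django's default QuerySet.delete() instead of treebeard's numchild-dependent one, e.g.:

# Sketch only: assumes NodeManager is attached as Node.objects.
from scaife_viewer.atlas.models import Node

# Deleting a version's leaf text parts no longer depends on numchild being
# maintained, because the stock QuerySet.delete() is used.
Node.objects.filter(
    urn__startswith="urn:cts:greekLit:tlg0012.tlg001.perseus-grc2:",
    numchild=0,
).delete()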