Skip to content

Commit

Permalink
Merge pull request #17 from scaife-viewer/atlas/cts-ingestion
Browse files Browse the repository at this point in the history
Refactor ATLAS config, provide isolated ATLAS database and support library ingestion from CTS Collections
  • Loading branch information
jacobwegner authored Sep 29, 2020
2 parents d0c7eef + c5a6dc9 commit cf1a5a9
Show file tree
Hide file tree
Showing 32 changed files with 641 additions and 137 deletions.
75 changes: 75 additions & 0 deletions atlas/README.md
Original file line number Diff line number Diff line change
@@ -1 +1,76 @@
# Aligned Text and Linguistic Annotation Server (ATLAS)

## Settings

Settings can be overridden at a project level via the `SV_ATLAS_<name>`
naming convention.

### Data model

**DATA_DIR**
Default: `None`

The path to the directory containing ATLAS data

**DATA_MODEL_ID**
Default: A base64 encoded representation of the last release (in `YYYY-MM-DD-###` format) where a
backwards incompatible schema change occurred.

Site developers can use the value of this setting to help determine when ATLAS content should be re-ingested
due to backwards-incompatible (BI) schema changes.

**INGESTION_CONCURRENCY**
Default: `None`

Sets the number of processes available to ProcessPoolExecutors during ingestion.

When `None`, defaults to the number of processors as reported by `multiprocessing.cpu_count()`.

**NODE_ALPHABET**
Default: `"0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"`

Used by `django-treebeard` to calculate the maximum path steps.

See the [django-treebeard docs](https://django-treebeard.readthedocs.io/en/latest/mp_tree.html#treebeard.mp_tree.MP_Node.alphabet) for more information.


### Database

**DB_LABEL**
Default: `"atlas"`

The label to use for the ATLAS-specific database (required when using the `ATLASRouter` database router)

**DB_PATH**
Default: `None`

The path to the SQLite database referenced by `DB_LABEL`.


### GraphQL

**IN_MEMORY_PASSAGE_CHUNK_MAX**
Default: `2500`

Sets the upper limit on the number of text parts used for in-memory passage chunking.

When the number of text parts exceeds this limit, ATLAS will fall back to a database-backed
chunking algorithm.

For most smaller passages, the in-memory chunking is faster than using the database.


### Other

**HOOKSET**
Default: `"scaife_viewer.atlas.hooks.DefaultHookSet"`

The path to a hookset that can be used to customize ATLAS functionality.

## GraphQL Endpoint
URL Name: `sv_atlas:graphql_endpoint`

Primary GraphQL endpoint for `scaife-viewer-atlas` projects.

When accessed [via a browser](https://github.com/graphql-python/graphene-django/blob/2e806384f60505a29745752bf9c477c71668f0fa/graphene_django/views.py#L154), delivers a [GraphiQL Playground](https://github.com/graphql/graphiql#graphiql) that can be used
to explore ATLAS GraphQL fields.
5 changes: 2 additions & 3 deletions atlas/scaife_viewer/atlas/apps.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from django.apps import AppConfig as BaseAppConfig
from django.conf import settings
from django.db.backends.signals import connection_created
from django.utils.translation import ugettext_lazy as _

Expand All @@ -14,9 +15,7 @@ def tweak_sqlite_pragma(sender, connection, **kwargs):
"""
Customize PRAGMA settings for SQLite
"""
# TODO: Bind this only to the ATLAS database,
# rather than assuming any SQLite connection
if connection.vendor == "sqlite":
if connection.vendor == "sqlite" and connection.alias == settings.SV_ATLAS_DB_LABEL:
cursor = connection.cursor()
cursor.execute("PRAGMA synchronous=OFF;")
cursor.execute("PRAGMA cache_size=100000;")
Expand Down
50 changes: 47 additions & 3 deletions atlas/scaife_viewer/atlas/conf.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,56 @@
import base64
import importlib

from django.conf import settings # noqa
from django.core.exceptions import ImproperlyConfigured

from appconf import AppConf


def load_path_attr(path):
    """
    Import and return the attribute named by a dotted path.

    `path` is split at its last "."; the part before it is imported as a
    module and the part after it is looked up on that module.

    Raises `ImproperlyConfigured` when `path` lacks a module or attribute
    component, when the module cannot be imported, or when the module does
    not define the attribute.
    """
    module, _, attr = path.rpartition(".")
    if not module or not attr:
        # Without this guard a dot-less path would be sliced into a
        # truncated module name, and a leading-dot path would surface as
        # an uncaught ValueError from import_module("").
        raise ImproperlyConfigured(
            "'{0}' is not a valid dotted path to an attribute".format(path)
        )
    try:
        mod = importlib.import_module(module)
    except ImportError as e:
        raise ImproperlyConfigured(
            "Error importing {0}: '{1}'".format(module, e)
        ) from e
    try:
        return getattr(mod, attr)
    except AttributeError as e:
        raise ImproperlyConfigured(
            "Module '{0}' does not define a '{1}'".format(module, attr)
        ) from e


class ATLASAppConf(AppConf):
    """
    Default values for the ATLAS settings.

    Each attribute is exposed on Django settings under the prefix declared
    in `Meta` (e.g. `DATA_DIR` is read as `settings.SV_ATLAS_DATA_DIR`).
    """

    # Data model
    DATA_DIR = None
    DATA_MODEL_ID = base64.b64encode(b"2020-09-08-001\n").decode()
    # `INGESTION_CONCURRENCY` defaults to number of processors
    # as reported by multiprocessing.cpu_count()
    INGESTION_CONCURRENCY = None
    NODE_ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"

    # GraphQL settings
    IN_MEMORY_PASSAGE_CHUNK_MAX = 2500

    # Database settings
    DB_LABEL = "atlas"
    DB_PATH = None

    # Other
    HOOKSET = "scaife_viewer.atlas.hooks.DefaultHookSet"

    class Meta:
        # A single assignment: the earlier "scaife_viewer_atlas" value was
        # immediately overwritten by this one and never took effect.
        prefix = "sv_atlas"

    def configure_hookset(self, value):
        """Resolve the dotted hookset path and return an instance of it."""
        return load_path_attr(value)()

    def configure_data_dir(self, value):
        """
        Return `value` unchanged, raising `ImproperlyConfigured` if it is None.
        """
        # NOTE: We've chosen an explicit `configure` method
        # vs making `DATA_DIR` a required field so we can check
        # that DATA_DIR is a non-None value.
        if value is None:
            msg = f"{self._meta.prefixed_name('DATA_DIR')} must be defined"
            raise ImproperlyConfigured(msg)
        return value
10 changes: 10 additions & 0 deletions atlas/scaife_viewer/atlas/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,3 +81,13 @@
(NAMED_ENTITY_KIND_PERSON, "Person"),
(NAMED_ENTITY_KIND_PLACE, "Place"),
]

# Maps short language codes to their human-readable English names.
HUMAN_FRIENDLY_LANGUAGE_MAP = {
    "eng": "English",
    "fa": "Farsi",
    "fre": "French",
    "ger": "German",
    "grc": "Greek",
    "heb": "Hebrew",
    "lat": "Latin",
}
54 changes: 54 additions & 0 deletions atlas/scaife_viewer/atlas/db_routers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from scaife_viewer.atlas.conf import ATLASAppConf


# Read the ATLAS database label once, at import time, so the router
# methods can compare against a plain module-level constant.
atlas_conf = ATLASAppConf()
ATLAS_DB_LABEL = atlas_conf.DB_LABEL


class ATLASRouter:
    """
    Database router that sends every operation on the routed app labels
    to the database named by `ATLAS_DB_LABEL`.
    """

    route_app_labels = {"scaife_viewer_atlas"}

    def _is_routed(self, app_label):
        """Return True when `app_label` is one of the routed app labels."""
        return app_label in self.route_app_labels

    def db_for_read(self, model, **hints):
        """
        Reads of routed models use ATLAS_DB_LABEL; other models get no opinion.
        """
        return ATLAS_DB_LABEL if self._is_routed(model._meta.app_label) else None

    def db_for_write(self, model, **hints):
        """
        Writes of routed models use ATLAS_DB_LABEL; other models get no opinion.
        """
        return ATLAS_DB_LABEL if self._is_routed(model._meta.app_label) else None

    def allow_relation(self, obj1, obj2, **hints):
        """
        Permit the relation when either object belongs to a routed app;
        otherwise express no opinion.
        """
        involved = any(
            self._is_routed(obj._meta.app_label) for obj in (obj1, obj2)
        )
        return True if involved else None

    def allow_migrate(self, db, app_label, model_name=None, **hints):
        """
        Routed apps migrate only on ATLAS_DB_LABEL, and ATLAS_DB_LABEL accepts
        only routed apps. Any other combination gets no opinion.
        """
        routed = self._is_routed(app_label)
        if db == ATLAS_DB_LABEL:
            return routed
        if routed:
            # A routed app on any other database is refused.
            return False
        return None
78 changes: 78 additions & 0 deletions atlas/scaife_viewer/atlas/hooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from . import constants
from .resolvers.default import resolve_library


def ensure_trailing_colon(urn):
    """Return `urn` with a ":" appended unless it already ends with one."""
    return urn if urn.endswith(":") else f"{urn}:"


class DefaultHookSet:
    """
    Default implementations of the ATLAS hooks.

    This is the class the `HOOKSET` setting points at by default.
    """

    def resolve_library(self):
        """Delegate to the default library resolver."""
        return resolve_library()

    def can_access_urn(self, request, urn):
        """Allow access to every URN."""
        return True

    def get_human_lang(self, value):
        """
        Return the human-friendly name for a language code, or the code
        itself when no name is known.
        """
        return constants.HUMAN_FRIENDLY_LANGUAGE_MAP.get(value, value)

    def get_importer_class(self):
        """Return the importer class used for version ingestion."""
        from .importers.versions import CTSImporter  # noqa: avoids circular import

        return CTSImporter

    @staticmethod
    def _localized(text):
        """Build a `{"lang", "value"}` entry from a language-tagged text object."""
        # TODO: provide a better api for reading the language of a
        # label / description
        return {"lang": text._language, "value": str(text)}

    def extract_cts_text_group_metadata(self, text_group):
        """Build the metadata dict for a CTS text group."""
        return {
            "urn": f"{ensure_trailing_colon(text_group.urn)}",
            "name": [{"lang": "eng", "value": str(text_group.label)}],
        }

    def extract_cts_work_metadata(self, work):
        """Build the metadata dict for a CTS work."""
        # FIXME: backport `lang` attr to scaife-viewer-core
        lang = getattr(work, "lang", work.metadata.lang)
        return {
            "urn": f"{ensure_trailing_colon(work.urn)}",
            "lang": lang,
            "title": [self._localized(work.label)],
        }

    def extract_cts_version_metadata(self, version):
        """Build the metadata dict for a CTS version."""
        return {
            "urn": f"{ensure_trailing_colon(version.urn)}",
            "version_kind": version.kind,
            # TODO: provide first_passage_urn
            "citation_scheme": [c.name for c in version.metadata.citation],
            "label": [self._localized(version.label)],
            "description": [self._localized(version.description)],
            "lang": version.lang,
        }


class HookProxy:
    """
    Forward attribute lookups to the hookset held in `settings.SV_ATLAS_HOOKSET`.
    """

    def __getattr__(self, attr):
        # Imported on each lookup rather than at module import time.
        from .conf import settings  # noqa; avoids race condition

        active_hookset = settings.SV_ATLAS_HOOKSET
        return getattr(active_hookset, attr)


hookset = HookProxy()
5 changes: 2 additions & 3 deletions atlas/scaife_viewer/atlas/importers/alignments.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,8 @@
import os
from collections import defaultdict

from django.conf import settings

from scaife_viewer.atlas.backports.scaife_viewer.cts.utils import natural_keys
from scaife_viewer.atlas.conf import settings
from scaife_viewer.atlas.urn import URN

from ..models import (
Expand All @@ -17,7 +16,7 @@


ANNOTATIONS_DATA_PATH = os.path.join(
settings.ATLAS_CONFIG["DATA_DIR"], "annotations", "text-alignments"
settings.SV_ATLAS_DATA_DIR, "annotations", "text-alignments"
)
RAW_PATH = os.path.join(ANNOTATIONS_DATA_PATH, "raw")

Expand Down
4 changes: 2 additions & 2 deletions atlas/scaife_viewer/atlas/importers/audio_annotations.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import csv
import os

from django.conf import settings
from scaife_viewer.atlas.conf import settings

from ..models import AudioAnnotation

Expand All @@ -10,7 +10,7 @@
COPYRIGHT_FRAGMENT = "© 2016 David Chamberlain under CC BY 4.0 License, https://creativecommons.org/licenses/by/4.0/"

ANNOTATIONS_DATA_PATH = os.path.join(
settings.ATLAS_CONFIG["DATA_DIR"], "annotations", "audio-annotations"
settings.SV_ATLAS_DATA_DIR, "annotations", "audio-annotations"
)

CITE_IDENTIFIER = "urn:cite2:exploreHomer:audio.v1:"
Expand Down
4 changes: 2 additions & 2 deletions atlas/scaife_viewer/atlas/importers/image_annotations.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import json
import os

from django.conf import settings
from scaife_viewer.atlas.conf import settings

from ..models import (
IMAGE_ANNOTATION_KIND_CANVAS,
Expand All @@ -12,7 +12,7 @@


ANNOTATIONS_DATA_PATH = os.path.join(
settings.ATLAS_CONFIG["DATA_DIR"], "annotations", "image-annotations"
settings.SV_ATLAS_DATA_DIR, "annotations", "image-annotations"
)


Expand Down
4 changes: 2 additions & 2 deletions atlas/scaife_viewer/atlas/importers/metrical_annotations.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import csv
import os

from django.conf import settings
from scaife_viewer.atlas.conf import settings

from ..models import MetricalAnnotation

Expand All @@ -10,7 +10,7 @@
COPYRIGHT_FRAGMENT = "© 2016 David Chamberlain under CC BY 4.0 License, https://creativecommons.org/licenses/by/4.0/"

ANNOTATIONS_DATA_PATH = os.path.join(
settings.ATLAS_CONFIG["DATA_DIR"], "annotations", "metrical-annotations"
settings.SV_ATLAS_DATA_DIR, "annotations", "metrical-annotations"
)

CITE_IDENTIFIER = "urn:cite2:exploreHomer:metrical_annotation.v1:"
Expand Down
6 changes: 3 additions & 3 deletions atlas/scaife_viewer/atlas/importers/named_entities.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
import csv
import os

from django.conf import settings

import logfmt

from scaife_viewer.atlas.conf import settings

from ..models import NamedEntity, Node


NAMED_ENTITIES_DATA_PATH = os.path.join(
settings.ATLAS_CONFIG["DATA_DIR"], "annotations", "named-entities"
settings.SV_ATLAS_DATA_DIR, "annotations", "named-entities"
)
ENTITIES_DIR = os.path.join(NAMED_ENTITIES_DATA_PATH, "processed", "entities")
STANDOFF_DIR = os.path.join(NAMED_ENTITIES_DATA_PATH, "processed", "standoff")
Expand Down
Loading

0 comments on commit cf1a5a9

Please sign in to comment.