Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Tjmadonna/252 vitessce config speedup #256

Merged
merged 12 commits into from
Jan 19, 2024
Merged
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.2.24
1.3.1
40 changes: 4 additions & 36 deletions ingest-api-spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ openapi: 3.0.0
info:
description: |
A RESTful web service exposing calls needed for the SenNet Data Sharing Portal.
version: 1.2.24
version: 1.3.1
title: SenNet Ingest API
contact:
name: SenNet Help Desk
Expand Down Expand Up @@ -349,41 +349,9 @@ components:
- Hold
- Invalid
description: 'One of: New|Processing|QA|Published|Error|Hold|Invalid'
data_types:
type: array
items:
type: string
enum:
- 10x-multiome
- bulk-RNA
- CITE-Seq
- CODEX
- codex_cytokit
- codex_cytokit_v1
- CosMX (RNA)
- DBiT-seq
- FACS - Fluorescence-activated Cell Sorting
- GeoMX (RNA)
- image_pyramid
- LC-MS
- Lightsheet
- MIBI
- mibi_deepcell
- Mint-ChIP
- publication
- publication_ancillary
- salmon_rnaseq_10x
- salmon_rnaseq_bulk
- salmon_sn_rnaseq_10x
- SASP
- scRNA-seq
- sn_atac_seq
- snATAC-seq
- snRNA-seq
- snRNAseq-10xGenomics-v3
- Stained Slides
- Visium
description: The data or assay types contained in this dataset as a json array of strings. Each is an assay code from [assay types](https://ontology.api.hubmapconsortium.org/datasets?application_context=Sennet).
dataset_type:
type: string
description: "The data or assay type contained in this dataset. Must be one of the values found in: [dataset types](https://ontology-api.dev.hubmapconsortium.org/valueset?parent_sab=SENNET&parent_code=C003041&child_sabs=SENNET)."
local_directory_rel_path:
type: string
readOnly: true
Expand Down
49 changes: 48 additions & 1 deletion src/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import argparse
from flask import Flask, g
from pymemcache import serde
from pymemcache.client.base import PooledClient

# HuBMAP commons
from hubmap_commons.hm_auth import AuthHelper
Expand All @@ -25,18 +27,28 @@
# Local Modules
from lib.file_upload_helper import UploadFileHelper
from lib.neo4j_helper import Neo4jHelper
from lib.vitessce import VitessceConfigCache

# Set logging format and level (default is warning)
# All the API logging is forwarded to the uWSGI server and gets written into the log file `uwsgi-ingest-api.log`
# Log rotation is handled via logrotate on the host system with a configuration file
# Do NOT handle log file and rotation via the Python logging to avoid issues with multi-worker processes
logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.DEBUG, datefmt='%Y-%m-%d %H:%M:%S')
logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

# Specify the absolute path of the instance folder and use the config file relative to the instance path
app = Flask(__name__, instance_path=os.path.join(os.path.abspath(os.path.dirname(__file__)), 'instance'), instance_relative_config=True)
app.config.from_pyfile('app.cfg')

# No Vitessce config cache is available until the Memcached client initialization assigns one
app.vitessce_cache = None

# Caching is opt-in: it stays disabled when MEMCACHED_MODE is absent from app.cfg
MEMCACHED_MODE = app.config.get('MEMCACHED_MODE', False)
# Use prefix to distinguish the cached data of same source across different deployments
# Fall back to 'NONE' rather than raising KeyError at import time when MEMCACHED_PREFIX is not configured
MEMCACHED_PREFIX = app.config.get('MEMCACHED_PREFIX', 'NONE')

app.register_blueprint(auth_blueprint)
app.register_blueprint(status_blueprint)
app.register_blueprint(privs_blueprint)
Expand Down Expand Up @@ -130,6 +142,41 @@
# Log the full stack trace, prepend a line with our message
logger.exception(msg)

####################################################################################################
## Memcached client initialization
####################################################################################################

memcached_client_instance = None

if MEMCACHED_MODE:
    try:
        # Use client pool to maintain a pool of already-connected clients for improved performance
        # The uwsgi config launches the app across multiple threads (8) inside each process (32), making essentially 256 processes
        # Set the connect_timeout and timeout to avoid blocking the process when memcached is slow, defaults to "forever"
        # connect_timeout: seconds to wait for a connection to the memcached server
        # timeout: seconds to wait for send or receive calls on the socket connected to memcached
        # Use the ignore_exc flag to treat memcache/network errors as cache misses on calls to the get* methods
        # Set the no_delay flag to set TCP_NODELAY (disable Nagle's algorithm to improve TCP/IP networks and decrease the number of packets)
        # If you intend to use anything but str as a value, it is a good idea to use a serializer
        # NOTE(review): pickle_serde unpickles whatever the server returns, so the Memcached server must be a trusted one
        memcached_client_instance = PooledClient(app.config['MEMCACHED_SERVER'],
                                                 max_pool_size=256,
                                                 connect_timeout=1,
                                                 timeout=30,
                                                 ignore_exc=True,
                                                 no_delay=True,
                                                 serde=serde.pickle_serde)

        # memcached_client_instance can be instantiated without connecting to the Memcached server
        # A version() call will throw error (e.g., timeout) when failed to connect to server
        # Need to convert the version in bytes to string
        memcached_version = memcached_client_instance.version().decode()

        # Only expose the cache on the app once the server is known to be reachable,
        # so a failed connection never leaves a cache wrapper behind
        app.vitessce_cache = VitessceConfigCache(memcached_client_instance, MEMCACHED_PREFIX)

        logger.info(f'Connected to Memcached server {memcached_version} successfully :)')
    except Exception:
        msg = 'Failed to connect to the Memcached server :('
        # Log the full stack trace, prepend a line with our message
        logger.exception(msg)
        # Turn off the caching
        MEMCACHED_MODE = False
        # Keep the app-level cache handle consistent with the disabled mode
        app.vitessce_cache = None

"""
Close the current neo4j connection at the end of every request
Expand Down
6 changes: 6 additions & 0 deletions src/instance/app.cfg.example
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ NEO4J_SERVER = 'bolt://sennet-neo4j:7687'
NEO4J_USERNAME = 'neo4j'
NEO4J_PASSWORD = '123'

# Set MEMCACHED_MODE to False to disable the caching for local development
MEMCACHED_MODE = True
MEMCACHED_SERVER = 'host:11211'
# Change prefix based on deployment environment, default for DEV
MEMCACHED_PREFIX = 'sn_ingest_dev_'

# Globus App ID and secret
APP_CLIENT_ID = ''
APP_CLIENT_SECRET = ''
Expand Down
6 changes: 0 additions & 6 deletions src/lib/datacite_doi_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,6 @@

requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Set logging format and level (default is warning)
# All the API logging is forwarded to the uWSGI server and gets written into the log file `uwsgi-entity-api.log`
# Log rotation is handled via logrotate on the host system with a configuration file
# Do NOT handle log file and rotation via the Python logging to avoid issues with multi-worker processes
logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)


Expand Down
7 changes: 0 additions & 7 deletions src/lib/dataset_helper.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import os
import sys
from array import array

import yaml
Expand All @@ -17,12 +16,6 @@

requests.packages.urllib3.disable_warnings(category=InsecureRequestWarning)

# Set logging format and level (default is warning)
# All the API logging is forwarded to the uWSGI server and gets written into the log file `uwsgi-entity-api.log`
# Log rotation is handled via logrotate on the host system with a configuration file
# Do NOT handle log file and rotation via the Python logging to avoid issues with multi-worker processes
logging.basicConfig(format='[%(asctime)s] %(levelname)s in %(module)s: %(message)s', level=logging.DEBUG,
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger(__name__)

# In Python, "privacy" depends on "consenting adults'" levels of agreement, we can't force it.
Expand Down
6 changes: 3 additions & 3 deletions src/lib/ontology.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from atlas_consortia_commons.ubkg.ubkg_sdk import UbkgSDK
from flask import current_app


def get_organ_types_ep():
    """Return what ``UbkgSDK.get_endpoint`` yields for the app's ``ubkg.organ_types`` entry."""
    organ_types = current_app.ubkg.organ_types
    return UbkgSDK.get_endpoint(organ_types)


def get_assay_types_ep():
    """Return what ``UbkgSDK.get_endpoint`` yields for the app's ``ubkg.assay_types`` entry."""
    assay_types = current_app.ubkg.assay_types
    return UbkgSDK.get_endpoint(assay_types)
def get_dataset_types_ep():
    """Return what ``UbkgSDK.get_endpoint`` yields for the app's ``ubkg.dataset_types`` entry."""
    dataset_types = current_app.ubkg.dataset_types
    return UbkgSDK.get_endpoint(dataset_types)


class Ontology(UbkgSDK):
Expand All @@ -15,4 +16,3 @@ def assay_types_ext():
Ontology.Ops.key = 'data_type'
Ontology.Ops.url_params = '&dataset_provider=external'
return Ontology.transform_ontology(current_app.ubkg.assay_types, 'AssayTypesExt')

4 changes: 0 additions & 4 deletions src/lib/rule_chain.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import json
import logging
from pathlib import Path
from sys import stdout
import urllib.request

from flask import current_app
Expand All @@ -26,9 +25,6 @@ def initialize_rule_chain():
except json.decoder.JSONDecodeError as excp:
raise RuleSyntaxException(excp) from excp
rule_chain = RuleLoader(json_rules).load()
print("RULE CHAIN FOLLOWS")
rule_chain.dump(stdout)
print("RULE CHAIN ABOVE")


def calculate_assay_info(metadata: dict) -> dict:
Expand Down
28 changes: 28 additions & 0 deletions src/lib/vitessce.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import json
from typing import Optional

from pymemcache.client.base import PooledClient


class VitessceConfigCache:
    """Memcached wrapper for Vitessce configuration.

    Entries are stored under the key ``<memcached_prefix>_vitessce_<uuid>``.
    A configuration is only written to the cache when its JSON form does not
    contain the caller's groups token.
    """

    def __init__(self, memcached_client: "PooledClient", memcached_prefix: str):
        """Keep the client and derive the key prefix used for every entry.

        Args:
            memcached_client: Client used for the get/set/delete calls.
            memcached_prefix: Deployment-specific prefix; ``_vitessce`` is
                appended to it.
        """
        self._memcached_client = memcached_client
        self._memcached_prefix = f"{memcached_prefix}_vitessce"

    def _key(self, uuid: str) -> str:
        """Build the cache key for ``uuid``."""
        return f"{self._memcached_prefix}_{uuid}"

    def get(self, uuid: str) -> Optional[dict]:
        """Return whatever the client holds for ``uuid``.

        The value is passed through unchanged from the client's ``get``;
        presumably ``None`` on a miss (pymemcache default) -- confirm against
        the client configuration.
        """
        return self._memcached_client.get(self._key(uuid))

    def set(self, uuid: str, config: dict, groups_token: Optional[str]):
        """Cache ``config`` under ``uuid`` unless it embeds ``groups_token``.

        Args:
            uuid: Identifier used to build the cache key.
            config: Vitessce configuration; must be JSON-serializable when a
                non-empty token is given, because it is dumped to JSON for the
                token check.
            groups_token: Token that must not end up in the cache. ``None`` or
                an empty string means there is nothing to look for.
        """
        if self._should_cache(config, groups_token):
            self._memcached_client.set(self._key(uuid), config)

    def delete(self, uuid: str):
        """Delete the entry for ``uuid`` and return the client's result.

        ``noreply=False`` is passed so the client waits for the server's
        answer instead of firing and forgetting.
        """
        return self._memcached_client.delete(self._key(uuid), noreply=False)

    def _should_cache(self, config: dict, groups_token: Optional[str]) -> bool:
        """Return True when ``config`` can be cached without storing the token."""
        # Without a token there is nothing that could leak. This guard is also
        # needed for correctness: `"" in text` is always True (which would
        # disable caching entirely) and `None in text` raises TypeError.
        if not groups_token:
            return True
        # Don't cache if the config contains the groups token
        return groups_token not in json.dumps(config, separators=(",", ":"))
5 changes: 4 additions & 1 deletion src/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ globus_sdk==2.0.1
hubmap-sdk==1.0.4
Werkzeug==2.3.7

# For interacting with memcached
pymemcache==4.0.0

# The commons package requires requests>=2.22.0
requests==2.27.1

Expand All @@ -11,7 +14,7 @@ requests==2.27.1
# Default is main branch specified in docker-compose.development.yml if not set
# git+https://github.com/hubmapconsortium/commons.git@${COMMONS_BRANCH}#egg=hubmap-commons
hubmap-commons==2.1.13
atlas-consortia-commons==1.0.5
atlas-consortia-commons==1.0.6

# For assay type rules
rule_engine==4.1.0
Expand Down
Loading