Updating instances of ingest_metadata.metadata and ingest_metadata.fi… #581

Open
wants to merge 7 commits into base: main
3 changes: 3 additions & 0 deletions src/instance/app.cfg.example
@@ -107,5 +107,8 @@ UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020"}'

# UBKG Integration Configs for Rule Chain
APPLICATION_CONTEXT = 'SENNET'

# URI from which to load the assay classifier rules.
RULE_CHAIN_URI = 'https://raw.githubusercontent.com/sennetconsortium/ingest-api/main/src/routes/assayclassifier/testing_rule_chain.json'
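For context on how the new value is consumed: the rule chain reads APPLICATION_CONTEXT (alongside the existing UBKG_SERVER) through Flask's current_app.config. A minimal sketch of loading this instance config, assuming the standard Flask instance-folder pattern; the loading code itself is not part of this PR:

    from flask import Flask

    app = Flask(__name__, instance_relative_config=True)
    # app.cfg is the instance config that this example file gets copied to (assumed path)
    app.config.from_pyfile("app.cfg")

    with app.app_context():
        # Values used by lib/rule_chain.py in this PR
        print(app.config["APPLICATION_CONTEXT"])  # 'SENNET'
        print(app.config["UBKG_SERVER"])          # 'https://ontology.api.hubmapconsortium.org/'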
4 changes: 2 additions & 2 deletions src/lib/dataset_helper.py
@@ -320,7 +320,7 @@ def get_dataset_ingest_update_record(self, json_data):

        metadata = json_data['metadata']
        if 'files_info_alt_path' in metadata:
-            metadata['files'] = self.get_file_list(metadata['files_info_alt_path'])
+            update_record['files'] = self.get_file_list(metadata['files_info_alt_path'])

        if 'overwrite_metadata' in json_data and json_data['overwrite_metadata'] == False:
            raise ValueError("overwrite_metadata set to False, merging of metadata is not supported on update")
@@ -375,7 +375,7 @@ def get_dataset_ingest_update_record(self, json_data):

        metadata['metadata'] = meta_lvl3

-        update_record[HubmapConst.DATASET_INGEST_METADATA_ATTRIBUTE] = metadata
+        update_record['metadata'] = metadata

        if not antibodies is None:
            update_record['antibodies'] = antibodies
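The practical effect of these two edits is that the file list and the assembled metadata dict are attached directly to the top-level update record (update_record['files'] and update_record['metadata']) instead of only inside the nested object stored under HubmapConst.DATASET_INGEST_METADATA_ATTRIBUTE. A rough sketch of the resulting top level; the field values are made up for illustration:

    # Illustrative shape of the update record after this change (values are placeholders):
    update_record = {
        "metadata": {"metadata": {"assay_type": "..."}},   # assembled dict; meta_lvl3 sits under its 'metadata' key
        "files": [{"rel_path": "raw/example.fastq.gz"}],
        "antibodies": [],                                  # only set when antibodies are present
    }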
105 changes: 88 additions & 17 deletions src/lib/rule_chain.py
@@ -15,9 +15,41 @@
SCHEMA_FILE = "rule_chain_schema.json"
SCHEMA_BASE_URI = "http://schemata.hubmapconsortium.org/"


rule_chain = None

# Have to translate pre-UBKG keys to UBKG keys
# Format is:
#   "Key before UBKG integration": "UBKG Key"
pre_integration_to_ubkg_translation = {
    'vitessce-hints': 'vitessce_hints',
    'dir-schema': 'dir_schema',
    'tbl-schema': 'tbl_schema',
    'contains-pii': 'contains_full_genetic_sequences',
    'dataset-type': 'dataset_type',
    'is-multi-assay': 'is_multiassay',
    'pipeline-shorthand': 'pipeline_shorthand',
    'must-contain': 'must_contain',
}

# These are the keys returned by the rule chain before UBKG integration.
# We will return the UBKG data in this format as well for the MVP,
# to avoid too much churn for end users.
# We set 'primary' manually, so it is ignored here.
pre_integration_keys = [
    'assaytype',
    'vitessce-hints',
    'dir-schema',
    'tbl-schema',
    'contains-pii',
    # 'primary',
    'dataset-type',
    'description',
    'is-multi-assay',
    'pipeline-shorthand',
    'must-contain',
    'process_state',
]
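For reference, this is roughly the shape the rule chain returned before UBKG integration, which standardize_results() below preserves for callers. All values here are illustrative, not taken from the real rule chain:

    legacy_style_result = {
        "assaytype": "af",
        "vitessce-hints": [],
        "dir-schema": "af-v1",
        "tbl-schema": "af-v1",
        "contains-pii": False,
        "primary": True,
        "dataset-type": "Autofluorescence",
        "description": "Autofluorescence Microscopy",
        "is-multi-assay": False,
        "pipeline-shorthand": "",
        "must-contain": [],
        "process_state": "primary",
    }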


def initialize_rule_chain():
"""Initialize the rule chain from the source URI.
@@ -79,9 +111,9 @@ def calculate_data_types(entity: Entity) -> list[str]:
    # the data_types field is not empty and not a list of empty strings
    # If it has a value it must be an old derived dataset so use that to match the rules
    if (
        hasattr(entity, "data_types")
        and entity.data_types
        and set(entity.data_types) != {""}
    ):
        data_types = entity.data_types
    # Moving forward (2024) we are no longer using data_types for derived datasets.
@@ -114,26 +146,27 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:

    metadata = {}
    dag_prov_list = []
-    if hasattr(entity, "ingest_metadata"):
-        # This if block should catch primary datasets because primary datasets should
-        # their metadata ingested as part of the reorganization.
-        if "metadata" in entity.ingest_metadata:
-            metadata = entity.ingest_metadata["metadata"]
-        else:
-            # If there is no ingest-metadata, then it must be a derived dataset
-            metadata["data_types"] = calculate_data_types(entity)

+    # This if block should catch primary datasets because primary datasets should
+    # have their metadata ingested as part of the reorganization.
+    if hasattr(entity, "metadata"):
+        metadata = entity.metadata
+    else:
+        # If there is no metadata, then it must be a derived dataset
+        metadata["data_types"] = calculate_data_types(entity)

    if hasattr(entity, "ingest_metadata"):
        dag_prov_list = [elt['origin'] + ':' + elt['name']
                         for elt in entity.ingest_metadata.get('dag_provenance_list',
                                                               [])
                         if 'origin' in elt and 'name' in elt
                         ]

    # In the case of Publications, we must also set the data_types.
    # The primary publication will always have metadata,
    # so we have to do the association here.
    if entity.entity_type == "Publication":
        metadata["data_types"] = calculate_data_types(entity)

    # If there is no metadata, then it must be a derived dataset
    else:
@@ -149,6 +182,44 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:
    return metadata


def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
    # If we get more complicated transformations we should consider refactoring.
    # For now, this should suffice.
    if source_type.upper() == "MOUSE":
        rule_value_set["contains-pii"] = False

    return rule_value_set


def get_data_from_ubkg(ubkg_code: str) -> dict:
    query = urllib.parse.urlencode({"application_context": current_app.config['APPLICATION_CONTEXT']})
    ubkg_api_url = f"{current_app.config['UBKG_SERVER']}assayclasses/{ubkg_code}?{query}"
    req = urllib.request.Request(ubkg_api_url)
    try:
        with urllib.request.urlopen(req) as response:
            response_data = response.read().decode("utf-8")
    except urllib.error.URLError as excp:
        print(f"Error getting extra info from UBKG {excp}")
        return {}

    return json.loads(response_data)
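The assayclasses endpoint is expected to return a JSON object whose 'value' member carries the assay class fields; the exact payload is defined by UBKG, so the shape below is only an illustrative assumption based on how the response is consumed here (standardize_results reads value['process_state'] and the snake_case keys from the translation table):

    # Hypothetical, trimmed response for get_data_from_ubkg("C200001"); not a real UBKG record.
    example_ubkg_response = {
        "code": "C200001",
        "value": {
            "process_state": "primary",
            "dataset_type": "Autofluorescence",
            "description": "Autofluorescence Microscopy",
            "contains_full_genetic_sequences": False,
            "dir_schema": "af-v1",
            "vitessce_hints": [],
        },
    }
    ubkg_value_json = example_ubkg_response.get("value", {})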


def standardize_results(rule_chain_json: dict, ubkg_json: dict) -> dict:
    # Initialize this with conditional logic to set 'primary' true or false.
    ubkg_transformed_json = {
        "primary": ubkg_json.get("process_state") == "primary"
    }

    for pre_integration_key in pre_integration_keys:
        ubkg_key = pre_integration_to_ubkg_translation.get(pre_integration_key, pre_integration_key)
        ubkg_value = ubkg_json.get(ubkg_key)
        if ubkg_value is not None:
            ubkg_transformed_json[pre_integration_key] = ubkg_value

    return rule_chain_json | ubkg_transformed_json
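A small worked example of the merge, using made-up inputs; the rule-chain values survive only where UBKG supplies no value, because the dict union gives precedence to the right-hand operand:

    # Made-up inputs to illustrate the merge; the right-hand operand of "|" wins on key collisions.
    rule_chain_json = {"assaytype": "af", "contains-pii": True, "ubkg_code": "C200001"}
    ubkg_json = {"process_state": "primary",
                 "contains_full_genetic_sequences": False,
                 "description": "Autofluorescence Microscopy"}

    merged = standardize_results(rule_chain_json, ubkg_json)
    # merged == {
    #     "assaytype": "af",
    #     "contains-pii": False,          # UBKG's contains_full_genetic_sequences overrides the rule value
    #     "ubkg_code": "C200001",
    #     "primary": True,                # derived from process_state == "primary"
    #     "description": "Autofluorescence Microscopy",
    #     "process_state": "primary",
    # }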


class NoMatchException(Exception):
pass

26 changes: 17 additions & 9 deletions src/routes/assayclassifier/__init__.py
@@ -16,6 +16,8 @@
    build_entity_metadata,
    calculate_assay_info,
    initialize_rule_chain,
+    get_data_from_ubkg,
+    standardize_results
)
from lib.services import get_entity

@@ -30,7 +32,7 @@ def get_ds_assaytype(ds_uuid: str):
        token = get_token()
        entity = get_entity(ds_uuid, token)
        metadata = build_entity_metadata(entity)
-        rule_value_set = calculate_assay_info(metadata)
+        rules_json = calculate_assay_info(metadata)

        if sources := entity.sources:
            source_type = ""
@@ -39,9 +41,12 @@
                # If there is a single Human source_type, treat this as a Human case
                if source_type.upper() == "HUMAN":
                    break
-            apply_source_type_transformations(source_type, rule_value_set)
+            apply_source_type_transformations(source_type, rules_json)

-        return jsonify(rule_value_set)
+        ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
+        merged_json = standardize_results(rules_json, ubkg_value_json)
+        merged_json["ubkg_json"] = ubkg_value_json
+        return jsonify(merged_json)
    except ValueError as excp:
        logger.error(excp, exc_info=True)
        return Response(f"Bad parameter: {excp}", 400)
@@ -97,21 +102,21 @@ def get_ds_rule_metadata(ds_uuid: str):
)


-def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
+def apply_source_type_transformations(source_type: str, rules_json: dict) -> dict:
    # If we get more complicated transformations we should consider refactoring.
    # For now, this should suffice.
    if "MOUSE" in source_type.upper():
-        rule_value_set["contains-pii"] = False
+        rules_json["contains-pii"] = False

-    return rule_value_set
+    return rules_json


@assayclassifier_blueprint.route("/assaytype", methods=["POST"])
@require_valid_token()
@require_json(param="metadata")
def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
    try:
-        rule_value_set = calculate_assay_info(metadata)
+        rules_json = calculate_assay_info(metadata)

        if parent_sample_ids := metadata.get("parent_sample_id"):
            source_type = ""
@@ -123,8 +128,11 @@ def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
                if source_type.upper() == "HUMAN":
                    break

-            apply_source_type_transformations(source_type, rule_value_set)
-        return jsonify(rule_value_set)
+            apply_source_type_transformations(source_type, rules_json)
+        ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
+        merged_json = standardize_results(rules_json, ubkg_value_json)
+        merged_json["ubkg_json"] = ubkg_value_json
+        return jsonify(merged_json)
    except ResponseException as re:
        logger.error(re, exc_info=True)
        return re.response
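For completeness, a sketch of calling the POST route with a metadata payload, using the requests library. The host, auth header, and metadata fields are placeholders, and the exact body shape expected by the require_json decorator should be confirmed against the deployment:

    import requests

    # Hypothetical call to the POST /assaytype route; values below are illustrative only.
    resp = requests.post(
        "https://ingest.api.example.org/assaytype",
        headers={"Authorization": "Bearer <groups-token>"},
        json={"assay_type": "AF", "parent_sample_id": "SNT123.ABCD.456"},
    )
    print(resp.json().get("assaytype"), resp.json().get("ubkg_json"))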