Updating instances of ingest_metadata.metadata and ingest_metadata.fi… #581

Open
wants to merge 7 commits into base: main
3 changes: 3 additions & 0 deletions src/instance/app.cfg.example
@@ -107,5 +107,8 @@ UBKG_SERVER = 'https://ontology.api.hubmapconsortium.org/'
UBKG_ENDPOINT_VALUESET = 'valueset?parent_sab=SENNET&parent_code={code}&child_sabs=SENNET'
UBKG_CODES = '{"specimen_categories":"C020076", "organ_types":{"code": "C000008", "key": "organs", "endpoint": "organs?application_context=SENNET"}, "entities": "C000012", "source_types":"C050020"}'

# UBKG Integration Configs for Rule Chain
APPLICATION_CONTEXT = 'SENNET'

# URI from which to load the assay classifier rules.
RULE_CHAIN_URI = 'https://raw.githubusercontent.com/sennetconsortium/ingest-api/main/src/routes/assayclassifier/testing_rule_chain.json'
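For context on how the new value is consumed: the rule chain reads APPLICATION_CONTEXT (alongside the existing UBKG_SERVER) through Flask's current_app.config. A minimal sketch of loading this instance config, assuming the standard Flask instance-folder pattern; the loading code itself is not part of this PR:

    from flask import Flask

    app = Flask(__name__, instance_relative_config=True)
    # app.cfg is the instance config that this example file gets copied to (assumed path)
    app.config.from_pyfile("app.cfg")

    with app.app_context():
        # Values used by lib/rule_chain.py in this PR
        print(app.config["APPLICATION_CONTEXT"])  # 'SENNET'
        print(app.config["UBKG_SERVER"])          # 'https://ontology.api.hubmapconsortium.org/'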
4 changes: 2 additions & 2 deletions src/lib/dataset_helper.py
@@ -320,7 +320,7 @@ def get_dataset_ingest_update_record(self, json_data):

        metadata = json_data['metadata']
        if 'files_info_alt_path' in metadata:
-            metadata['files'] = self.get_file_list(metadata['files_info_alt_path'])
+            update_record['files'] = self.get_file_list(metadata['files_info_alt_path'])

        if 'overwrite_metadata' in json_data and json_data['overwrite_metadata'] == False:
            raise ValueError("overwrite_metadata set to False, merging of metadata is not supported on update")
@@ -375,7 +375,7 @@ def get_dataset_ingest_update_record(self, json_data):

        metadata['metadata'] = meta_lvl3

-        update_record[HubmapConst.DATASET_INGEST_METADATA_ATTRIBUTE] = metadata
+        update_record['metadata'] = metadata

        if not antibodies is None:
            update_record['antibodies'] = antibodies
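The practical effect of these two edits is that the file list and the assembled metadata dict are attached directly to the top-level update record (update_record['files'] and update_record['metadata']) instead of only inside the nested object stored under HubmapConst.DATASET_INGEST_METADATA_ATTRIBUTE. A rough sketch of the resulting top level; the field values are made up for illustration:

    # Illustrative shape of the update record after this change (values are placeholders):
    update_record = {
        "metadata": {"metadata": {"assay_type": "..."}},   # assembled dict; meta_lvl3 sits under its 'metadata' key
        "files": [{"rel_path": "raw/example.fastq.gz"}],
        "antibodies": [],                                  # only set when antibodies are present
    }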
105 changes: 88 additions & 17 deletions src/lib/rule_chain.py
@@ -15,9 +15,41 @@
SCHEMA_FILE = "rule_chain_schema.json"
SCHEMA_BASE_URI = "http://schemata.hubmapconsortium.org/"


rule_chain = None

# Have to translate pre-UBKG keys to UBKG keys
# Format is:
#   "Key before UBKG integration": "UBKG Key"
pre_integration_to_ubkg_translation = {
    'vitessce-hints': 'vitessce_hints',
    'dir-schema': 'dir_schema',
    'tbl-schema': 'tbl_schema',
    'contains-pii': 'contains_full_genetic_sequences',
    'dataset-type': 'dataset_type',
    'is-multi-assay': 'is_multiassay',
    'pipeline-shorthand': 'pipeline_shorthand',
    'must-contain': 'must_contain',
}

# These are the keys returned by the rule chain before UBKG integration.
# We will return the UBKG data in this format as well for the MVP,
# to avoid too much churn for end users.
# We set 'primary' manually, so it is ignored here.
pre_integration_keys = [
    'assaytype',
    'vitessce-hints',
    'dir-schema',
    'tbl-schema',
    'contains-pii',
    # 'primary',
    'dataset-type',
    'description',
    'is-multi-assay',
    'pipeline-shorthand',
    'must-contain',
    'process_state',
]
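For reference, this is roughly the shape the rule chain returned before UBKG integration, which standardize_results() below preserves for callers. All values here are illustrative, not taken from the real rule chain:

    legacy_style_result = {
        "assaytype": "af",
        "vitessce-hints": [],
        "dir-schema": "af-v1",
        "tbl-schema": "af-v1",
        "contains-pii": False,
        "primary": True,
        "dataset-type": "Autofluorescence",
        "description": "Autofluorescence Microscopy",
        "is-multi-assay": False,
        "pipeline-shorthand": "",
        "must-contain": [],
        "process_state": "primary",
    }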


def initialize_rule_chain():
"""Initialize the rule chain from the source URI.
@@ -79,9 +111,9 @@ def calculate_data_types(entity: Entity) -> list[str]:
    # the data_types field is not empty and not a list of empty strings
    # If it has a value it must be an old derived dataset so use that to match the rules
    if (
        hasattr(entity, "data_types")
        and entity.data_types
        and set(entity.data_types) != {""}
    ):
        data_types = entity.data_types
    # Moving forward (2024) we are no longer using data_types for derived datasets.
@@ -114,26 +146,27 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:

    metadata = {}
    dag_prov_list = []
-    if hasattr(entity, "ingest_metadata"):
-        # This if block should catch primary datasets because primary datasets should
-        # their metadata ingested as part of the reorganization.
-        if "metadata" in entity.ingest_metadata:
-            metadata = entity.ingest_metadata["metadata"]
-        else:
-            # If there is no ingest-metadata, then it must be a derived dataset
-            metadata["data_types"] = calculate_data_types(entity)

+    # This if block should catch primary datasets because primary datasets should
+    # have their metadata ingested as part of the reorganization.
+    if hasattr(entity, "metadata"):
+        metadata = entity.metadata
+    else:
+        # If there is no metadata, then it must be a derived dataset
+        metadata["data_types"] = calculate_data_types(entity)

    if hasattr(entity, "ingest_metadata"):
        dag_prov_list = [elt['origin'] + ':' + elt['name']
                         for elt in entity.ingest_metadata.get('dag_provenance_list',
                                                               [])
                         if 'origin' in elt and 'name' in elt
                         ]

    # In the case of Publications, we must also set the data_types.
    # The primary publication will always have metadata,
    # so we have to do the association here.
    if entity.entity_type == "Publication":
        metadata["data_types"] = calculate_data_types(entity)

    # If there is no metadata, then it must be a derived dataset
    else:
@@ -149,6 +182,44 @@ def build_entity_metadata(entity: Union[Entity, dict]) -> dict:
    return metadata


def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
    # If we get more complicated transformations we should consider refactoring.
    # For now, this should suffice.
    if source_type.upper() == "MOUSE":
        rule_value_set["contains-pii"] = False

    return rule_value_set


def get_data_from_ubkg(ubkg_code: str) -> dict:
    query = urllib.parse.urlencode({"application_context": current_app.config['APPLICATION_CONTEXT']})
    ubkg_api_url = f"{current_app.config['UBKG_SERVER']}assayclasses/{ubkg_code}?{query}"
    req = urllib.request.Request(ubkg_api_url)
    try:
        with urllib.request.urlopen(req) as response:
            response_data = response.read().decode("utf-8")
    except urllib.error.URLError as excp:
        print(f"Error getting extra info from UBKG {excp}")
        return {}

    return json.loads(response_data)
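The assayclasses endpoint is expected to return a JSON object whose 'value' member carries the assay class fields; the exact payload is defined by UBKG, so the shape below is only an illustrative assumption based on how the response is consumed here (standardize_results reads value['process_state'] and the snake_case keys from the translation table):

    # Hypothetical, trimmed response for get_data_from_ubkg("C200001"); not a real UBKG record.
    example_ubkg_response = {
        "code": "C200001",
        "value": {
            "process_state": "primary",
            "dataset_type": "Autofluorescence",
            "description": "Autofluorescence Microscopy",
            "contains_full_genetic_sequences": False,
            "dir_schema": "af-v1",
            "vitessce_hints": [],
        },
    }
    ubkg_value_json = example_ubkg_response.get("value", {})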


def standardize_results(rule_chain_json: dict, ubkg_json: dict) -> dict:
    # Initialize this with conditional logic to set 'primary' true or false.
    ubkg_transformed_json = {
        "primary": ubkg_json.get("process_state") == "primary"
    }

    for pre_integration_key in pre_integration_keys:
        ubkg_key = pre_integration_to_ubkg_translation.get(pre_integration_key, pre_integration_key)
        ubkg_value = ubkg_json.get(ubkg_key)
        if ubkg_value is not None:
            ubkg_transformed_json[pre_integration_key] = ubkg_value

    return rule_chain_json | ubkg_transformed_json
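A small worked example of the merge, using made-up inputs; the rule-chain values survive only where UBKG supplies no value, because the dict union gives precedence to the right-hand operand:

    # Made-up inputs to illustrate the merge; the right-hand operand of "|" wins on key collisions.
    rule_chain_json = {"assaytype": "af", "contains-pii": True, "ubkg_code": "C200001"}
    ubkg_json = {"process_state": "primary",
                 "contains_full_genetic_sequences": False,
                 "description": "Autofluorescence Microscopy"}

    merged = standardize_results(rule_chain_json, ubkg_json)
    # merged == {
    #     "assaytype": "af",
    #     "contains-pii": False,          # UBKG's contains_full_genetic_sequences overrides the rule value
    #     "ubkg_code": "C200001",
    #     "primary": True,                # derived from process_state == "primary"
    #     "description": "Autofluorescence Microscopy",
    #     "process_state": "primary",
    # }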


class NoMatchException(Exception):
pass

26 changes: 17 additions & 9 deletions src/routes/assayclassifier/__init__.py
@@ -16,6 +16,8 @@
    build_entity_metadata,
    calculate_assay_info,
    initialize_rule_chain,
+    get_data_from_ubkg,
+    standardize_results
)
from lib.services import get_entity

@@ -30,7 +32,7 @@ def get_ds_assaytype(ds_uuid: str):
        token = get_token()
        entity = get_entity(ds_uuid, token)
        metadata = build_entity_metadata(entity)
-        rule_value_set = calculate_assay_info(metadata)
+        rules_json = calculate_assay_info(metadata)

        if sources := entity.sources:
            source_type = ""
@@ -39,9 +41,12 @@
                # If there is a single Human source_type, treat this as a Human case
                if source_type.upper() == "HUMAN":
                    break
-            apply_source_type_transformations(source_type, rule_value_set)
+            apply_source_type_transformations(source_type, rules_json)

-        return jsonify(rule_value_set)
+        ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
+        merged_json = standardize_results(rules_json, ubkg_value_json)
+        merged_json["ubkg_json"] = ubkg_value_json
+        return jsonify(merged_json)
    except ValueError as excp:
        logger.error(excp, exc_info=True)
        return Response(f"Bad parameter: {excp}", 400)
@@ -97,21 +102,21 @@ def get_ds_rule_metadata(ds_uuid: str):
)


-def apply_source_type_transformations(source_type: str, rule_value_set: dict) -> dict:
+def apply_source_type_transformations(source_type: str, rules_json: dict) -> dict:
    # If we get more complicated transformations we should consider refactoring.
    # For now, this should suffice.
    if "MOUSE" in source_type.upper():
-        rule_value_set["contains-pii"] = False
+        rules_json["contains-pii"] = False

-    return rule_value_set
+    return rules_json


@assayclassifier_blueprint.route("/assaytype", methods=["POST"])
@require_valid_token()
@require_json(param="metadata")
def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
    try:
-        rule_value_set = calculate_assay_info(metadata)
+        rules_json = calculate_assay_info(metadata)

        if parent_sample_ids := metadata.get("parent_sample_id"):
            source_type = ""
@@ -123,8 +128,11 @@ def get_assaytype_from_metadata(token: str, user: User, metadata: dict):
                if source_type.upper() == "HUMAN":
                    break

-            apply_source_type_transformations(source_type, rule_value_set)
-        return jsonify(rule_value_set)
+            apply_source_type_transformations(source_type, rules_json)
+        ubkg_value_json = get_data_from_ubkg(rules_json.get("ubkg_code")).get("value", {})
+        merged_json = standardize_results(rules_json, ubkg_value_json)
+        merged_json["ubkg_json"] = ubkg_value_json
+        return jsonify(merged_json)
    except ResponseException as re:
        logger.error(re, exc_info=True)
        return re.response
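For completeness, a sketch of calling the POST route with a metadata payload, using the requests library. The host, auth header, and metadata fields are placeholders, and the exact body shape expected by the require_json decorator should be confirmed against the deployment:

    import requests

    # Hypothetical call to the POST /assaytype route; values below are illustrative only.
    resp = requests.post(
        "https://ingest.api.example.org/assaytype",
        headers={"Authorization": "Bearer <groups-token>"},
        json={"assay_type": "AF", "parent_sample_id": "SNT123.ABCD.456"},
    )
    print(resp.json().get("assaytype"), resp.json().get("ubkg_json"))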