From 3d3ee51262e6dd0d66d7128833647546362b7041 Mon Sep 17 00:00:00 2001 From: IgorRodchenkov Date: Sun, 16 Jun 2024 23:19:38 -0400 Subject: [PATCH] Added a util method to auto-fix invalid URIs; use that in Normalizer. --- .../io/jsonld/JsonldBiopaxConverterTest.java | 6 +++--- .../src/test/resources/demo-pathway.owl | 8 ++++---- .../paxtools/normalizer/Normalizer.java | 14 +++++++------ .../paxtools/normalizer/NormalizerTest.java | 20 ------------------- .../paxtools/controller/ModelUtils.java | 16 ++++++++++++++- .../biopax/paxtools/io/SimpleIOHandler.java | 8 +++++++- .../paxtools/controller/ModelUtilsTest.java | 8 ++++---- 7 files changed, 41 insertions(+), 39 deletions(-) diff --git a/json-converter/src/test/java/org/biopax/paxtools/io/jsonld/JsonldBiopaxConverterTest.java b/json-converter/src/test/java/org/biopax/paxtools/io/jsonld/JsonldBiopaxConverterTest.java index 51f96e97..b36a1c8b 100644 --- a/json-converter/src/test/java/org/biopax/paxtools/io/jsonld/JsonldBiopaxConverterTest.java +++ b/json-converter/src/test/java/org/biopax/paxtools/io/jsonld/JsonldBiopaxConverterTest.java @@ -31,10 +31,10 @@ final void testSomePc14DemoPathway() throws IOException { // convert owl test file in resource directory to jsonld format InputStream in = getClass().getResourceAsStream("/demo-pathway.owl"); - //- there is no rdf:datatype=... anymore; should be fine as the datatypes are defined in the biopax-level3.owl spec! - //todo: for some reason, Jena libs v4 or v5 fail at e.g. rdf:about="TEST_CHEBI:cs_26d67131a0608673ae6a683d1dad18f7", + //there is no rdf:datatype=... anymore; should be fine as the datatypes are defined in the biopax-level3.owl spec; + //Jena libs v4, v5 fail at e.g. rdf:about="TEST_CHEBI:cs_26d67131a0608673ae6a683d1dad18f7" (not a valid URI due to '_' in the prefix), //but jena v3 just prints warnings, e.g.: org.apache.jena.riot - [line: 155, col: 82] {W107} Bad URI: Code: 0/ILLEGAL_CHARACTER in SCHEME: The character violates the grammar rules for URIs/IRIs. - //howver, removing the underscore from TEST_CHEBI - makes those warning/errors go away... + //however, removing the underscore (or replacing with '.') from TEST_CHEBI - makes those warning/errors go away - so I did in demo-pathway.owl ByteArrayOutputStream baos = new ByteArrayOutputStream(); converter.convertToJsonld(in, baos); diff --git a/json-converter/src/test/resources/demo-pathway.owl b/json-converter/src/test/resources/demo-pathway.owl index d4ebf8db..226913b3 100644 --- a/json-converter/src/test/resources/demo-pathway.owl +++ b/json-converter/src/test/resources/demo-pathway.owl @@ -147,7 +147,7 @@ hgnc.symbol - + InChI InChI=1S/C10H18O/c1-5-10(4,11)8-6-7-9(2)3/h5,7,11H,1,6,8H2,2-4H3/t10-/m0/s1 @@ -158,7 +158,7 @@ ncbigene - + InChI InChI=1S/C10H16/c1-7-8-4-5-9(6-8)10(7,2)3/h8-9H,1,4-6H2,2-3H3/t8-,9+/m0/s1 @@ -452,7 +452,7 @@ A linalool that has formula C10H18O. is_conjugate_acid_of 422 is_enantiomer_of 98 - + @@ -490,7 +490,7 @@ CC1(C)[C@@H]2CC[C@@H](C2)C1=C is_enantiomer_of 89 A camphene that has formula C10H16. - + diff --git a/normalizer/src/main/java/org/biopax/paxtools/normalizer/Normalizer.java b/normalizer/src/main/java/org/biopax/paxtools/normalizer/Normalizer.java index 1894efa3..b0c8ee7f 100644 --- a/normalizer/src/main/java/org/biopax/paxtools/normalizer/Normalizer.java +++ b/normalizer/src/main/java/org/biopax/paxtools/normalizer/Normalizer.java @@ -484,13 +484,15 @@ public void normalize(Model model) { */ public void normalize(Model model, boolean usePrefixAsDbName) { - if(model.getLevel() != BioPAXLevel.L3) + if(model.getLevel() != BioPAXLevel.L3) { throw new IllegalArgumentException("Not Level3 model. " + - "Consider converting it first (e.g., with the PaxTools)."); + "Consider converting it first (e.g., with the PaxTools)."); + } //if set, update the xml:base - if(xmlBase != null && !xmlBase.isEmpty()) + if(xmlBase != null && !xmlBase.isEmpty()) { model.setXmlBase(xmlBase); + } // Normalize/merge xrefs first and then - CVs // (xrefs could have URIs that should be instead used for CV, PR, SMR or BS biopax types) @@ -518,15 +520,15 @@ public void normalize(Model model, boolean usePrefixAsDbName) { log.info("Normalizing entity references..." + description); normalizeERs(model); + + log.info("Fixing invalid URIs if any..."); + ModelUtils.fixInvalidUris(model); // find/add lost (in replace) children log.info("Repairing..." + description); model.repair(); // it does not remove dangling utility class objects (can be done separately, later, if needed) - - log.info("Optional tasks (reasoning)..." + description); } - private void normalizeCVs(Model model) { NormalizerMap map = new NormalizerMap(model); diff --git a/normalizer/src/test/java/org/biopax/paxtools/normalizer/NormalizerTest.java b/normalizer/src/test/java/org/biopax/paxtools/normalizer/NormalizerTest.java index c0039ef0..d6925592 100644 --- a/normalizer/src/test/java/org/biopax/paxtools/normalizer/NormalizerTest.java +++ b/normalizer/src/test/java/org/biopax/paxtools/normalizer/NormalizerTest.java @@ -1,6 +1,5 @@ package org.biopax.paxtools.normalizer; - import org.apache.commons.lang3.StringUtils; import org.biopax.paxtools.io.SimpleIOHandler; import org.biopax.paxtools.model.BioPAXElement; @@ -366,23 +365,4 @@ void normalizeInoh() { e = model.getByID(model.getXmlBase() + "IMR_0100366_G_alpha_s_Canonical"); assertTrue(e instanceof ProteinReference); } - - private void print(XReferrable xr, Model m) { - System.out.println(); - System.out.println("model=" + m.contains(xr) + ":\t" - + xr.getUri() + - " is " + xr.getModelInterface().getSimpleName() - + " and has xrefs: "); - for(Xref x : xr.getXref()) { - System.out.println("model=" + m.contains(x) + ":\t" - +" " + x + " is " - + x.getModelInterface().getSimpleName() - + " - " + x.getUri() + ", db=" + x.getDb() - + ", id=" + x.getId() + ", idVer=" + x.getIdVersion()); - for(XReferrable rx : x.getXrefOf()) { - System.out.println("model=" + m.contains(rx) + ":\t" - + " xrefOf: " + rx); - } - } - } } diff --git a/paxtools-core/src/main/java/org/biopax/paxtools/controller/ModelUtils.java b/paxtools-core/src/main/java/org/biopax/paxtools/controller/ModelUtils.java index 98803c58..0da36d4f 100644 --- a/paxtools-core/src/main/java/org/biopax/paxtools/controller/ModelUtils.java +++ b/paxtools-core/src/main/java/org/biopax/paxtools/controller/ModelUtils.java @@ -1,5 +1,6 @@ package org.biopax.paxtools.controller; +import org.apache.commons.lang3.StringUtils; import org.biopax.paxtools.impl.BioPAXElementImpl; import org.biopax.paxtools.io.BioPAXIOHandler; import org.biopax.paxtools.io.SimpleIOHandler; @@ -15,6 +16,7 @@ import java.io.*; import java.lang.reflect.Method; +import java.net.URI; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; import java.util.*; @@ -1290,7 +1292,6 @@ public static void updateUri(Model model, BioPAXElement el, String newUri) { m.setAccessible(true); m.invoke(el, newUri); } catch (Exception e) { - e.printStackTrace(); throw new RuntimeException(e); } @@ -1342,4 +1343,17 @@ public static boolean isGeneric(BioPAXElement e) { ); //false when e==null } + + public static void fixInvalidUris(Model model) { + String prefix = StringUtils.isBlank(model.getXmlBase()) ? "" : model.getXmlBase(); + for(BioPAXElement bpe : new HashSet<>(model.getObjects())) { + try { + URI.create(bpe.getUri()); + } catch (IllegalArgumentException e) { + String uri = prefix + md5hex(bpe.getUri()); + LOG.info("Replaced invalid URI: '{}' with generated: '{}'", bpe.getUri(), uri); + updateUri(model, bpe, uri); + } + } + } } diff --git a/paxtools-core/src/main/java/org/biopax/paxtools/io/SimpleIOHandler.java b/paxtools-core/src/main/java/org/biopax/paxtools/io/SimpleIOHandler.java index 27424896..191f849a 100644 --- a/paxtools-core/src/main/java/org/biopax/paxtools/io/SimpleIOHandler.java +++ b/paxtools-core/src/main/java/org/biopax/paxtools/io/SimpleIOHandler.java @@ -20,6 +20,7 @@ import javax.xml.stream.XMLStreamException; import javax.xml.stream.XMLStreamReader; import java.io.*; +import java.net.URI; import java.nio.charset.StandardCharsets; import java.util.*; @@ -363,6 +364,11 @@ private String processIndividual(Model model) throws XMLStreamException throw new BioPaxIOException( String.format("Error processing %s%s (rdf:ID/rdf:about not found)", r.getNamespaceURI(), getXmlStreamInfo())); } + try { + URI.create(id); + } catch (IllegalArgumentException e) { + log.error("Invalid URI '{}' at {}{}", id, r.getNamespaceURI(), getXmlStreamInfo()); + } Class type; try { @@ -387,7 +393,7 @@ private String processIndividual(Model model) throws XMLStreamException } else { //abstract BioPAX types, e.g. Entity, UtilityClass, cannot be used directly in RDF+XML model/file! - log.error(String.format("Ignoring abstract %s, id: %s", (r.hasText()?r.getText():getXmlStreamInfo()), id)); + log.error("Ignoring abstract {}, id: {}", (r.hasText()?r.getText():getXmlStreamInfo()), id); //id = null; //todo: uncomment/test (currently, ignored object's uri can become parent's property value, e.g. CV term) //skip(); //was a bug - throws a misleading exception at the next element in some cases //todo: shall we instead throw an exception when e.g. ? diff --git a/paxtools-core/src/test/java/org/biopax/paxtools/controller/ModelUtilsTest.java b/paxtools-core/src/test/java/org/biopax/paxtools/controller/ModelUtilsTest.java index d6fde8f3..f69bfe3e 100644 --- a/paxtools-core/src/test/java/org/biopax/paxtools/controller/ModelUtilsTest.java +++ b/paxtools-core/src/test/java/org/biopax/paxtools/controller/ModelUtilsTest.java @@ -99,10 +99,10 @@ public final void mergeAndReplace() { assertEquals(8, m.getObjects().size()); // + pr3 assertTrue(m.contains(pr3)); // added! - assertTrue(m.contains(pr2)); // not deleted (may be dangling now)! - assertTrue(m.contains(x2)); // not deleted (may be dangling now)! - assertTrue(m.contains(pr1)); // not deleted (may be dangling now)! - assertTrue(m.contains(x1)); // not deleted (may be dangling now)! + assertTrue(m.contains(pr2)); // not deleted (maybe dangling now)! + assertTrue(m.contains(x2)); // not deleted (maybe dangling now)! + assertTrue(m.contains(pr1)); // not deleted (maybe dangling now)! + assertTrue(m.contains(x1)); // not deleted (maybe dangling now)! // delete dangling ModelUtils.removeObjectsIfDangling(m, ProteinReference.class);