diff --git a/pom.xml b/pom.xml index ac6e616d..cc66aaf3 100644 --- a/pom.xml +++ b/pom.xml @@ -29,8 +29,9 @@ UTF-8 9.7.0 -Xmx3g -Dfile.encoding=UTF-8 -ea --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED - - target/spring-instrument.jar + ${settings.localRepository}/org/springframework/spring-instrument/${spring-framework.version}/spring-instrument-${spring-framework.version}.jar + + diff --git a/src/main/java/cpath/cleaner/PathbankCleaner.java b/src/main/java/cpath/cleaner/PathbankCleaner.java index f904e3b5..6ddbc779 100644 --- a/src/main/java/cpath/cleaner/PathbankCleaner.java +++ b/src/main/java/cpath/cleaner/PathbankCleaner.java @@ -9,6 +9,8 @@ import org.biopax.paxtools.model.Model; import org.biopax.paxtools.model.level3.*; import org.biopax.paxtools.model.level3.Process; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.InputStream; import java.io.OutputStream; @@ -19,6 +21,8 @@ */ final class PathbankCleaner implements Cleaner { + private static final Logger log = LoggerFactory.getLogger(PathbankCleaner.class); + public void clean(InputStream data, OutputStream cleanedData) { // create bp model from dataFile SimpleIOHandler simpleReader = new SimpleIOHandler(BioPAXLevel.L3); @@ -28,59 +32,61 @@ public void clean(InputStream data, OutputStream cleanedData) { if (!model.containsID(model.getXmlBase() + "Reference/TAXONOMY_9606") && !model.containsID(model.getXmlBase() + "Reference/Taxonomy_9606") && !model.getObjects(BioSource.class).isEmpty()) + { throw new RuntimeException("Highly likely non-human datafile (skip)."); + } //since Apr-2018, top pathway URIs are "normalized" like: http://identifiers.org/smpdb/... //let's fix pathway uris base - use bioregistry.io/pathbank: instead CPathUtils.rebaseUris(model, "http://identifiers.org/smpdb/", "bioregistry.io/pathbank:"); - // Normalize Pathway URIs KEGG stable id, where possible - Set pathways = new HashSet<>(model.getObjects(Pathway.class)); - for (Pathway pw : pathways) { - - //smpdb/pathbank use pathwayOrder, but it's useless - no nextStep at all! - for (PathwayStep step : new HashSet<>(pw.getPathwayOrder())) { - if (step.getNextStep().isEmpty() && step.getNextStepOf().isEmpty()) { - for (Process process : step.getStepProcess()) - if (process instanceof Interaction && !Interaction.class.equals(process.getModelInterface())) - pw.addPathwayComponent(process); - pw.removePathwayOrder(step); + //remove pathways that have "SubPathway" name; + //though all these could be merged to become more informative pathways (once all the datafiles get merged), + //they add too much/unordered nesting/complexity to our model; not very helpful for the graph queries, SIF/GMT... + //due to the way pathwayOrder and pathwayComponent are used... + for (Pathway sp : new HashSet<>(model.getObjects(Pathway.class))) { + if (sp.getName().contains("SubPathway")) { + for (Pathway p : new HashSet<>(sp.getPathwayComponentOf())) { + p.removePathwayComponent(sp); } + model.remove(sp); } + } - //remove all Interaction.class (base) objects - for (Interaction it : new HashSet<>(model.getObjects(Interaction.class))) { - if (Interaction.class.equals(it.getModelInterface())) { - model.remove(it); - } + //remove all Interaction.class (base) objects + for (Interaction it : new HashSet<>(model.getObjects(Interaction.class))) { + if (Interaction.class.equals(it.getModelInterface())) { + model.remove(it); } + } - //remove sub-pathways that have "SubPathway" in names... - //forgot why we do this (likely due to same pathways were defined in other files and we merge all...) - for (Pathway pathway : new HashSet<>(model.getObjects(Pathway.class))) { - if (pathway.getName().contains("SubPathway")) { - model.remove(pathway); - for (Pathway pp : new HashSet<>(pathway.getPathwayComponentOf())) { - pp.removePathwayComponent(pathway); - } - for (PathwayStep ps : new HashSet<>(pathway.getStepProcessOf())) { - ps.removeStepProcess(pathway); + //smpdb/pathbank use pathwayOrder, but it seems useless/nonsense, - no nextStep, participants are also added as pathwayComponent... + //move reaction/control from PathwayStep having no nextStep to pathwayComponent property of the parent pw. + for (PathwayStep step : new HashSet<>(model.getObjects(PathwayStep.class))) { + Pathway p = step.getPathwayOrderOf(); + if (step.getNextStep().isEmpty() && step.getNextStepOf().isEmpty()) { //seems always TRUE (pathbank 2024/02)! + for (Process process : step.getStepProcess()) { + if (process instanceof Interaction && !Interaction.class.equals(process.getModelInterface())) { + p.addPathwayComponent(process); } } + step.getPathwayOrderOf().removePathwayOrder(step); + model.remove(step); + } else { + log.debug("keep pw step {} of pw {}", step.getUri(), p.getUri()); } } + //delete dummy names if any for (Named o : model.getObjects(Named.class)) { - //delete bogus dummy names for (String name : new HashSet<>(o.getName())) { if (StringUtils.startsWithIgnoreCase(name, "SubPathway")) { o.removeName(name); - o.addComment(name); } } } -// ModelUtils.replace(model, replacements); + //remove dangling, e.g., pathway steps, cv, xrefs, etc. ModelUtils.removeObjectsIfDangling(model, UtilityClass.class); // convert model back to OutputStream for return diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml index 13c3f6d3..83b6747f 100644 --- a/src/main/resources/logback.xml +++ b/src/main/resources/logback.xml @@ -6,7 +6,7 @@ - + @@ -16,10 +16,10 @@ - - + + diff --git a/src/test/java/cpath/cleaner/PathbankCleanerTest.java b/src/test/java/cpath/cleaner/PathbankCleanerTest.java index d9510667..acaee851 100644 --- a/src/test/java/cpath/cleaner/PathbankCleanerTest.java +++ b/src/test/java/cpath/cleaner/PathbankCleanerTest.java @@ -23,15 +23,12 @@ public class PathbankCleanerTest { public final void testClean() throws IOException { Cleaner cleaner = new PathbankCleaner(); String uri1 = "bioregistry.io/pathbank:SMP0000040"; //was "http://identifiers.org/smpdb/SMP0000040"; - String f1 = getClass().getClassLoader().getResource("").getPath() + File.separator + "testCleanPW000146.owl"; + String f1 = getClass().getClassLoader().getResource("").getPath() + File.separator + "PW000146.cleaned.owl"; cleaner.clean(new FileInputStream(getClass().getResource("/PW000146.owl").getFile()), new FileOutputStream(f1)); Model m1 = new SimpleIOHandler().convertFromOWL(new FileInputStream(f1)); - assertTrue(m1.containsID(uri1)); Pathway p1 = (Pathway)m1.getByID(uri1); - assertEquals("Glycolysis", p1.getDisplayName()); - String uri2 = "bioregistry.io/pathbank:SMP0000057"; //was "http://identifiers.org/smpdb/SMP0000057"; - String f2 = getClass().getClassLoader().getResource("").getPath() + File.separator + "testCleanPW000005.owl"; + String f2 = getClass().getClassLoader().getResource("").getPath() + File.separator + "PW000005.cleaned.owl"; cleaner.clean(new FileInputStream(getClass().getResource("/PW000005.owl").getFile()), new FileOutputStream(f2)); Model m2 = new SimpleIOHandler().convertFromOWL(new FileInputStream(f2)); @@ -40,17 +37,22 @@ public final void testClean() throws IOException { Model model = BioPAXLevel.L3.getDefaultFactory().createModel(); merger.merge(model, m2); merger.merge(model, m1); - assertTrue(model.containsID(uri1)); - assertTrue(model.containsID(uri2)); - new SimpleIOHandler().convertToOWL(model, new FileOutputStream( - getClass().getClassLoader().getResource("").getPath() - + File.separator + "testCleanSmpdbMergeOK.owl")); - Pathway pw = (Pathway) model.getByID(uri1); - assertEquals(37, pw.getPathwayComponent().size()); - assertTrue(pw.getPathwayOrder().isEmpty()); //smpdb/pathbank use pathwayOrder but it's useless - no nextStep at all! - assertEquals(2, model.getObjects(Pathway.class).size()); - assertTrue(model.getObjects(PathwayStep.class).isEmpty()); - } + //write the merged model (debug) + new SimpleIOHandler().convertToOWL(model, new FileOutputStream( + getClass().getClassLoader().getResource("").getPath() + + File.separator + "PW000005-000146.merged.owl")); + + assertAll( + () -> assertTrue(m1.containsID(uri1)), + () -> assertEquals("Glycolysis", p1.getDisplayName()), + () -> assertTrue(model.containsID(uri1)), + () -> assertTrue(model.containsID(uri2)), + () -> assertEquals(37, pw.getPathwayComponent().size()), + () -> assertTrue(pw.getPathwayOrder().isEmpty()), //smpdb/pathbank use pathwayOrder, but it's useless - no nextStep at all! + () -> assertEquals(2, model.getObjects(Pathway.class).size()), + () -> assertTrue(model.getObjects(PathwayStep.class).isEmpty()) + ); + } } diff --git a/src/main/resources/logback-test.xml b/src/test/resources/logback-test.xml similarity index 87% rename from src/main/resources/logback-test.xml rename to src/test/resources/logback-test.xml index e2721b6e..1075851e 100644 --- a/src/main/resources/logback-test.xml +++ b/src/test/resources/logback-test.xml @@ -6,7 +6,7 @@ - + @@ -16,8 +16,7 @@ - - + diff --git a/work/cpath2.sh b/work/cpath2.sh index c93bce4e..18b56fec 100644 --- a/work/cpath2.sh +++ b/work/cpath2.sh @@ -5,7 +5,7 @@ export CPATH2_HOME="." JDK_JAVA_OPTIONS="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED" DEBUG_OPTS="" #DEBUG_OPTS="-Dlogback.configurationFile=logback.xml -Xdebug -Xnoagent -Djava.compiler=NONE -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=12345" -BASE_OPTS="-Dfile.encoding=UTF-8 -Xss32m -Xmx60g -Dpaxtools.normalizer.use-latest-registry=true -Dpaxtools.core.use-latest-genenames=true" +BASE_OPTS="-Dlogback.configurationFile=logback.xml -Dfile.encoding=UTF-8 -Xss32m -Xmx60g -Dpaxtools.normalizer.use-latest-registry=true -Dpaxtools.core.use-latest-genenames=true" #use List instead Map collections for read-only BioPAX model if we start the Web app but not if building the instance/data EXTRA_OPTS="$BASE_OPTS -Dpaxtools.model.safeset=list -server" diff --git a/work/logback.xml b/work/logback.xml new file mode 100644 index 00000000..83b6747f --- /dev/null +++ b/work/logback.xml @@ -0,0 +1,26 @@ + + + + + true + + + + + + + + %d %-5level [%thread] %logger{25} - %msg%n + true + + + + + + + + + + + + \ No newline at end of file