Skip to content

Commit

Permalink
Simplified PathbankCleaner (there were unnecessary nested loops); log…
Browse files Browse the repository at this point in the history
…back;
  • Loading branch information
IgorRodchenkov committed Mar 5, 2024
1 parent 242960a commit 0717768
Show file tree
Hide file tree
Showing 7 changed files with 88 additions and 54 deletions.
5 changes: 3 additions & 2 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<lucene.version>9.7.0</lucene.version>
<jvm.options>-Xmx3g -Dfile.encoding=UTF-8 -ea --add-opens=java.base/java.lang=ALL-UNNAMED --add-opens=java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED</jvm.options>
<!-- <agent>${settings.localRepository}/org/springframework/spring-instrument/${spring-framework.version}/spring-instrument-${spring-framework.version}.jar</agent>-->
<agent>target/spring-instrument.jar</agent><!-- this copy is created by maven-dependency-plugin -->
<agent>${settings.localRepository}/org/springframework/spring-instrument/${spring-framework.version}/spring-instrument-${spring-framework.version}.jar</agent>
<!-- alternative: target/spring-instrument.jar (below) is a copy created by maven-dependency-plugin -->
<!-- <agent>target/spring-instrument.jar</agent>-->
</properties>

<issueManagement>
Expand Down
64 changes: 35 additions & 29 deletions src/main/java/cpath/cleaner/PathbankCleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.model.level3.Process;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.io.OutputStream;
Expand All @@ -19,6 +21,8 @@
*/
final class PathbankCleaner implements Cleaner {

private static final Logger log = LoggerFactory.getLogger(PathbankCleaner.class);

public void clean(InputStream data, OutputStream cleanedData) {
// create bp model from dataFile
SimpleIOHandler simpleReader = new SimpleIOHandler(BioPAXLevel.L3);
Expand All @@ -28,59 +32,61 @@ public void clean(InputStream data, OutputStream cleanedData) {
if (!model.containsID(model.getXmlBase() + "Reference/TAXONOMY_9606")
&& !model.containsID(model.getXmlBase() + "Reference/Taxonomy_9606")
&& !model.getObjects(BioSource.class).isEmpty())
{
throw new RuntimeException("Highly likely non-human datafile (skip).");
}

//since Apr-2018, top pathway URIs are "normalized" like: http://identifiers.org/smpdb/...
//let's fix pathway uris base - use bioregistry.io/pathbank: instead
CPathUtils.rebaseUris(model, "http://identifiers.org/smpdb/", "bioregistry.io/pathbank:");

// Normalize Pathway URIs KEGG stable id, where possible
Set<Pathway> pathways = new HashSet<>(model.getObjects(Pathway.class));
for (Pathway pw : pathways) {

//smpdb/pathbank use pathwayOrder, but it's useless - no nextStep at all!
for (PathwayStep step : new HashSet<>(pw.getPathwayOrder())) {
if (step.getNextStep().isEmpty() && step.getNextStepOf().isEmpty()) {
for (Process process : step.getStepProcess())
if (process instanceof Interaction && !Interaction.class.equals(process.getModelInterface()))
pw.addPathwayComponent(process);
pw.removePathwayOrder(step);
//remove pathways that have "SubPathway" name;
//though all these could be merged to become more informative pathways (once all the datafiles get merged),
//they add too much/unordered nesting/complexity to our model; not very helpful for the graph queries, SIF/GMT...
//due to the way pathwayOrder and pathwayComponent are used...
for (Pathway sp : new HashSet<>(model.getObjects(Pathway.class))) {
if (sp.getName().contains("SubPathway")) {
for (Pathway p : new HashSet<>(sp.getPathwayComponentOf())) {
p.removePathwayComponent(sp);
}
model.remove(sp);
}
}

//remove all Interaction.class (base) objects
for (Interaction it : new HashSet<>(model.getObjects(Interaction.class))) {
if (Interaction.class.equals(it.getModelInterface())) {
model.remove(it);
}
//remove all Interaction.class (base) objects
for (Interaction it : new HashSet<>(model.getObjects(Interaction.class))) {
if (Interaction.class.equals(it.getModelInterface())) {
model.remove(it);
}
}

//remove sub-pathways that have "SubPathway" in names...
//forgot why we do this (likely due to same pathways were defined in other files and we merge all...)
for (Pathway pathway : new HashSet<>(model.getObjects(Pathway.class))) {
if (pathway.getName().contains("SubPathway")) {
model.remove(pathway);
for (Pathway pp : new HashSet<>(pathway.getPathwayComponentOf())) {
pp.removePathwayComponent(pathway);
}
for (PathwayStep ps : new HashSet<>(pathway.getStepProcessOf())) {
ps.removeStepProcess(pathway);
//smpdb/pathbank uses pathwayOrder, but it seems useless/nonsense - there is no nextStep, and participants are also added as pathwayComponent...
//so move each reaction/control from a PathwayStep that has no nextStep to the pathwayComponent property of the parent pathway.
for (PathwayStep step : new HashSet<>(model.getObjects(PathwayStep.class))) {
Pathway p = step.getPathwayOrderOf();
if (step.getNextStep().isEmpty() && step.getNextStepOf().isEmpty()) { //seems always TRUE (pathbank 2024/02)!
for (Process process : step.getStepProcess()) {
if (process instanceof Interaction && !Interaction.class.equals(process.getModelInterface())) {
p.addPathwayComponent(process);
}
}
step.getPathwayOrderOf().removePathwayOrder(step);
model.remove(step);
} else {
log.debug("keep pw step {} of pw {}", step.getUri(), p.getUri());
}
}

//delete dummy names if any
for (Named o : model.getObjects(Named.class)) {
//delete bogus dummy names
for (String name : new HashSet<>(o.getName())) {
if (StringUtils.startsWithIgnoreCase(name, "SubPathway")) {
o.removeName(name);
o.addComment(name);
}
}
}

// ModelUtils.replace(model, replacements);
//remove dangling, e.g., pathway steps, cv, xrefs, etc.
ModelUtils.removeObjectsIfDangling(model, UtilityClass.class);

// convert model back to OutputStream for return
Expand Down
6 changes: 3 additions & 3 deletions src/main/resources/logback.xml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
</contextListener>

<!-- To enable JMX Management (also prints additional logback initializing info)-->
<jmxConfigurator/>
<!-- <jmxConfigurator/>-->

<appender name="console" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
Expand All @@ -16,10 +16,10 @@
</appender>

<logger name="org.bbop" level="error"/>
<logger name="org.biopax" level="warn"/>
<logger name="org.biopax.paxtools.util.BPCollections" level="info"/>
<logger name="org.biopax.paxtools.normalizer.Resolver" level="info"/>
<root level="info">
<logger name="cpath" level="info"/>
<root level="error">
<appender-ref ref="console"/>
</root>

Expand Down
34 changes: 18 additions & 16 deletions src/test/java/cpath/cleaner/PathbankCleanerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,12 @@ public class PathbankCleanerTest {
public final void testClean() throws IOException {
Cleaner cleaner = new PathbankCleaner();
String uri1 = "bioregistry.io/pathbank:SMP0000040"; //was "http://identifiers.org/smpdb/SMP0000040";
String f1 = getClass().getClassLoader().getResource("").getPath() + File.separator + "testCleanPW000146.owl";
String f1 = getClass().getClassLoader().getResource("").getPath() + File.separator + "PW000146.cleaned.owl";
cleaner.clean(new FileInputStream(getClass().getResource("/PW000146.owl").getFile()), new FileOutputStream(f1));
Model m1 = new SimpleIOHandler().convertFromOWL(new FileInputStream(f1));
assertTrue(m1.containsID(uri1));
Pathway p1 = (Pathway)m1.getByID(uri1);
assertEquals("Glycolysis", p1.getDisplayName());

String uri2 = "bioregistry.io/pathbank:SMP0000057"; //was "http://identifiers.org/smpdb/SMP0000057";
String f2 = getClass().getClassLoader().getResource("").getPath() + File.separator + "testCleanPW000005.owl";
String f2 = getClass().getClassLoader().getResource("").getPath() + File.separator + "PW000005.cleaned.owl";
cleaner.clean(new FileInputStream(getClass().getResource("/PW000005.owl").getFile()), new FileOutputStream(f2));
Model m2 = new SimpleIOHandler().convertFromOWL(new FileInputStream(f2));

Expand All @@ -40,17 +37,22 @@ public final void testClean() throws IOException {
Model model = BioPAXLevel.L3.getDefaultFactory().createModel();
merger.merge(model, m2);
merger.merge(model, m1);
assertTrue(model.containsID(uri1));
assertTrue(model.containsID(uri2));
new SimpleIOHandler().convertToOWL(model, new FileOutputStream(
getClass().getClassLoader().getResource("").getPath()
+ File.separator + "testCleanSmpdbMergeOK.owl"));

Pathway pw = (Pathway) model.getByID(uri1);
assertEquals(37, pw.getPathwayComponent().size());
assertTrue(pw.getPathwayOrder().isEmpty()); //smpdb/pathbank use pathwayOrder but it's useless - no nextStep at all!
assertEquals(2, model.getObjects(Pathway.class).size());
assertTrue(model.getObjects(PathwayStep.class).isEmpty());
}

//write the merged model (debug)
new SimpleIOHandler().convertToOWL(model, new FileOutputStream(
getClass().getClassLoader().getResource("").getPath()
+ File.separator + "PW000005-000146.merged.owl"));

assertAll(
() -> assertTrue(m1.containsID(uri1)),
() -> assertEquals("Glycolysis", p1.getDisplayName()),
() -> assertTrue(model.containsID(uri1)),
() -> assertTrue(model.containsID(uri2)),
() -> assertEquals(37, pw.getPathwayComponent().size()),
() -> assertTrue(pw.getPathwayOrder().isEmpty()), //smpdb/pathbank use pathwayOrder, but it's useless - no nextStep at all!
() -> assertEquals(2, model.getObjects(Pathway.class).size()),
() -> assertTrue(model.getObjects(PathwayStep.class).isEmpty())
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
</contextListener>

<!-- To enable JMX Management (also prints additional logback initializing info)-->
<jmxConfigurator/>
<!-- <jmxConfigurator/>-->

<appender name="console" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
Expand All @@ -16,8 +16,7 @@
</appender>

<logger name="cpath" level="debug" />
<logger name="org.bbop" level="error"/>
<root level="info">
<root level="error">
<appender-ref ref="console"/>
</root>

Expand Down
2 changes: 1 addition & 1 deletion work/cpath2.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ export CPATH2_HOME="."
JDK_JAVA_OPTIONS="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/sun.nio.ch=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED"
DEBUG_OPTS=""
#DEBUG_OPTS="-Dlogback.configurationFile=logback.xml -Xdebug -Xnoagent -Djava.compiler=NONE -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=12345"
BASE_OPTS="-Dfile.encoding=UTF-8 -Xss32m -Xmx60g -Dpaxtools.normalizer.use-latest-registry=true -Dpaxtools.core.use-latest-genenames=true"
BASE_OPTS="-Dlogback.configurationFile=logback.xml -Dfile.encoding=UTF-8 -Xss32m -Xmx60g -Dpaxtools.normalizer.use-latest-registry=true -Dpaxtools.core.use-latest-genenames=true"
#use List instead of Map collections for the read-only BioPAX model when starting the web app, but not when building the instance/data
EXTRA_OPTS="$BASE_OPTS -Dpaxtools.model.safeset=list -server"

Expand Down
26 changes: 26 additions & 0 deletions work/logback.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Logback configuration used when running from the work directory: cpath2.sh passes -Dlogback.configurationFile=logback.xml. -->
<!-- Everything is written to the console; the root logger is at ERROR and a few loggers are raised to INFO below. -->
<configuration>

<!-- NOTE(review): presumably propagates logback level changes to java.util.logging, with resetJUL resetting existing JUL levels first - confirm against the logback manual. -->
<contextListener class="ch.qos.logback.classic.jul.LevelChangePropagator">
<resetJUL>true</resetJUL>
</contextListener>

<!-- Uncomment the next line to enable JMX Management (also prints additional logback initializing info)-->
<!-- <jmxConfigurator/>-->

<!-- The only appender: writes formatted log events to the console. -->
<appender name="console" class="ch.qos.logback.core.ConsoleAppender">
<encoder>
<!-- Line layout: date, level (left-justified, padded to 5 chars), [thread], logger name abbreviated to about 25 chars, message, newline. -->
<pattern>%d %-5level [%thread] %logger{25} - %msg%n</pattern>
<!-- Also emit the pattern itself as a header at the start of the output. -->
<outputPatternAsHeader>true</outputPatternAsHeader>
</encoder>
</appender>

<!-- Per-logger levels; loggers not listed here inherit the root level (error). -->
<logger name="org.bbop" level="error"/>
<logger name="org.biopax.paxtools.util.BPCollections" level="info"/>
<logger name="org.biopax.paxtools.normalizer.Resolver" level="info"/>
<!-- Application loggers under "cpath" log at info and above. -->
<logger name="cpath" level="info"/>
<!-- Root logger: errors only, sent to the console appender. -->
<root level="error">
<appender-ref ref="console"/>
</root>

</configuration>

0 comments on commit 0717768

Please sign in to comment.