Removed duplicate/old code; moved IDFetcher to -core and Configurable…

…IDFetcher (uses Resolver) to -normalizer modules (trying to avoid circular dependencies in the future, i.e., core should not depend on any other module, and gsea shouldn't depend on pattern or vice versa; but it's ok for gsea and pattern to depend on core and normalizer...)
BioPAX · Apr 29, 2024 · 890f31f · 890f31f
1 parent 3ec3dc4
commit 890f31f
Show file tree

Hide file tree

Showing 15 changed files with 59 additions and 208 deletions.
diff --git a/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java b/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java
@@ -7,6 +7,7 @@
 import org.biopax.paxtools.model.BioPAXLevel;
 import org.biopax.paxtools.model.Model;
 import org.biopax.paxtools.model.level3.*;
+import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -17,36 +18,25 @@
 import java.util.*;
 
 /**
- * An advanced BioPAX to GMT format converter, which can output IDs of genetic elements or chemicals
- * (the output file can be loaded with the GSEA software if gene/protein IDs are used).
- * 
- * Each output entry (row) consists of three columns (tab separated):
- * name (URI), description, and the list of identifiers (of the same type).
- * For all ERs not associated with any pathway, "other" is used for name and uri.
+ * An experimental/advanced BioPAX to GMT converter
+ * that can output the desired type IDs for both genetic elements and chemicals
+ * (the output file can then be loaded with the GSEA software if gene/protein IDs were used).
  *
- * The list may have one or more IDs of the same type per Protein Reference (PR),
- * e.g., UniProt IDs or HGNC Symbols; PRs not having an xref of 
- * given db/id type are ignored. If there are less than three protein 
- * references per entry in total, it will not be printed.
+ * @see GSEAConverter
  *
- * Note, this code assumes that the model has successfully been validated
- * and perhaps normalized (using the BioPAX Validator, Paxtools Normalizer).
- * A BioPAX L1 or L2 model is first converted to the L3.
+ * TODO: finish, test, make it public...
  */
 final class GMTConverter {
 	private final static Logger LOG = LoggerFactory.getLogger(GMTConverter.class);
 
-	private final IdFetcher idFetcher;
+	private final IDFetcher idFetcher;
 	private boolean skipSubPathways;
 	private boolean skipOutsidePathways;
 	private int minNumIdsPerEntry;
 
-	/**
-	 * Constructor.
-	 */
 	public GMTConverter()
 	{
-		idFetcher = new IdFetcher().chemDbStartsWithOrEquals("chebi")
+		idFetcher = new ConfigurableIDFetcher().chemDbStartsWithOrEquals("chebi")
 			.seqDbStartsWithOrEquals("hgnc.symbol") //the order in the list does matter
 			.seqDbStartsWithOrEquals("hgnc symbol")
 			.seqDbStartsWithOrEquals("hgnc");

diff --git a/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GSEAConverter.java b/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GSEAConverter.java
@@ -19,9 +19,9 @@
 import java.util.*;
 
 /**
- * Converts a BioPAX model to the GMT format (used by GSEA software).
+ *   Converts a BioPAX model to the GMT format (used by GSEA software).
  * 
- * It creates GSEA entries from sequence entity reference xrefs
+ *   It creates GSEA entries from sequence entity reference xrefs
  * in the BioPAX model as follows:
  *
  *   Each entry (row) consists of three columns (tab separated):
@@ -35,14 +35,10 @@
  * given db/id type are ignored. Optionally, if there are less than three protein
  * references per entry, it will not be printed.
  * 
- * Note, to effectively enforce cross-species violation, 
- * 'organism' property and pathways must be set
- * to a BioSource object that has a valid unification xref: 
- * db="Taxonomy" and id= some valid taxonomy id.
- *
- * Note, this code assumes that the model has successfully been validated
- * and perhaps normalized (using the BioPAX Validator, Paxtools Normalizer).
- * A BioPAX L1 or L2 model is first converted to the L3.
+ * Notes:
+ *  - to effectively enforce the cross-species checks, the 'organism' property of pathways and participants must be set
+ * to a BioSource element with a valid ncbitaxon (taxonomy) unification xref;
+ *  - this assumes that the BioPAX Level3 model was validated and normalized (with the BioPAX Validator and Normalizer).
  */
 public class GSEAConverter
 {
@@ -98,7 +94,7 @@ public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled)
 	public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled, boolean skipSubPathways)
 	{
 		if(Resolver.isKnownNameOrVariant(idType)) {
-			this.idType = Resolver.getNamespace(idType, true).getPrefix().toLowerCase(); //(it must be already lowecase though)
+			this.idType = Resolver.getNamespace(idType).getPrefix().toLowerCase(); //(it must be already lowercase though)
 		} else {
 			this.idType = (StringUtils.isNotBlank(idType)) ? idType.toLowerCase() : idType;
 		}

diff --git a/.../pattern/miner/ConfigurableIDFetcher.java → ...ols/normalizer/ConfigurableIDFetcher.java b/.../pattern/miner/ConfigurableIDFetcher.java → ...ols/normalizer/ConfigurableIDFetcher.java
@@ -1,12 +1,11 @@
-package org.biopax.paxtools.pattern.miner;
+package org.biopax.paxtools.normalizer;
 
 import org.apache.commons.lang3.StringUtils;
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.model.BioPAXElement;
 import org.biopax.paxtools.model.level3.*;
-import org.biopax.paxtools.normalizer.Namespace;
-import org.biopax.paxtools.normalizer.Resolver;
-import org.biopax.paxtools.util.HGNC;
 import org.biopax.paxtools.util.ClassFilterSet;
+import org.biopax.paxtools.util.HGNC;
 
 import java.util.*;
 
@@ -87,25 +86,36 @@ public ConfigurableIDFetcher useNameWhenNoDbMatch(boolean useNameWhenNoDbMatch)
 	public Set<String> fetchID(BioPAXElement ele)
 	{
 		Set<String> set = new HashSet<>();
+		if(ele instanceof EntityReference) {
+			System.out.println("fetchID(ER): " + ele.getUri());
+		}
 		if(ele instanceof XReferrable) {
 			//Iterate the db priority list, match/filter all xrefs to collect the IDs of given type, until 'set' is not empty.
-			List<String> dbStartsWithOrEquals =	(ele instanceof SmallMoleculeReference || ele instanceof SmallMolecule)
+			List<String> dbStartsWithOrEquals =
+					(ele instanceof SmallMoleculeReference || ele instanceof SmallMolecule)
 							? chemDbStartsWithOrEquals : seqDbStartsWithOrEquals;
 			for (String dbStartsWith : dbStartsWithOrEquals) {
-				for (UnificationXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(),
-						UnificationXref.class)) {
-					collectXrefIdIfDbLike(x, dbStartsWith, set);
+				//a shortcut for normalized URIs; prevents collecting lots of secondary IDs of the same type
+				if(StringUtils.containsIgnoreCase(ele.getUri(),"identifiers.org/"+dbStartsWith)
+						|| StringUtils.containsIgnoreCase(ele.getUri(),"bioregistry.io/"+dbStartsWith)) {
+					//can be http://identifiers.org/hgnc.symbol:PCNA or http://bioregistry.io/chebi:20, etc.
+					set.add(StringUtils.substringAfterLast(ele.getUri(), "/"));
 				}
-				//if none were found in the unification xrefs, then try the relationship xrefs
-				if (set.isEmpty()) {
-					for (RelationshipXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(),
-							RelationshipXref.class)) {
+				else {
+					for (UnificationXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), UnificationXref.class)) {
 						collectXrefIdIfDbLike(x, dbStartsWith, set);
 					}
+					//if none found, try relationship xrefs
+					if (set.isEmpty()) {
+						for (RelationshipXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), RelationshipXref.class)) {
+							collectXrefIdIfDbLike(x, dbStartsWith, set);
+						}
+					}
 				}
-				//once we've found some ID, no need to try another id type
-				if (!set.isEmpty())
+				//once we've found some ID, no need to try next (lower priority) id type in the list
+				if (!set.isEmpty()) {
 					break;
+				}
 			}
 		}
 

diff --git a/normalizer/src/test/java/org/biopax/paxtools/normalizer/ResolverTest.java b/normalizer/src/test/java/org/biopax/paxtools/normalizer/ResolverTest.java
@@ -12,7 +12,7 @@ public final void getNamespace() {
 				() ->	Assertions.assertNotNull(Resolver.getNamespace("obo.mi")),//auto-detected as "mi"
 				() ->	Assertions.assertEquals(MI, Resolver.getNamespace("urn:miriam:mi").getName()),
 				() ->	Assertions.assertNotNull(Resolver.getNamespace("psi-mi")),//becomes "mi"
-				() ->	Assertions.assertNotNull(Resolver.getNamespace("MolecularInteractions Ontology", true)),//misspelling variant (allowed by default)
+				() ->	Assertions.assertNotNull(Resolver.getNamespace("MolecularInteractions Ontology", true)),//misspelling variant (allowed by default too)
 				() ->	Assertions.assertNull(Resolver.getNamespace("MolecularInteractions Ontology", false)),//null when spelling variants not allowed
 				() ->	Assertions.assertNotNull(Resolver.getNamespace("http://bioregistry.io/chebi")),
 				() ->	Assertions.assertNotNull(Resolver.getNamespace("bioregistry.io/uniprot")),

diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/constraint/HasAnID.java b/pattern/src/main/java/org/biopax/paxtools/pattern/constraint/HasAnID.java
@@ -1,8 +1,8 @@
 package org.biopax.paxtools.pattern.constraint;
 
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.model.BioPAXElement;
 import org.biopax.paxtools.pattern.Match;
-import org.biopax.paxtools.pattern.miner.IDFetcher;
 
 import java.util.Map;
 import java.util.Set;

diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/BlacklistGenerator.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/BlacklistGenerator.java
@@ -1,5 +1,6 @@
 package org.biopax.paxtools.pattern.miner;
 
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.model.BioPAXElement;
 import org.biopax.paxtools.model.Model;
 import org.biopax.paxtools.model.level3.SmallMoleculeReference;

diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/CommonIDFetcher.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/CommonIDFetcher.java
@@ -1,6 +1,7 @@
 package org.biopax.paxtools.pattern.miner;
 
 import org.apache.commons.lang3.StringUtils;
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.model.BioPAXElement;
 import org.biopax.paxtools.model.level3.*;
 import org.biopax.paxtools.normalizer.Resolver;
@@ -12,10 +13,9 @@
 
 /**
  * Tries to get Gene Symbols or UniProt IDs for genes
- * and - display names for small molecules;
- *
- * This id-fetcher is mainly to use with the BioPAX pathway data
- * from Pathway Commons (PC2) db (normalized and enriched with xrefs).
+ * and display names for small molecules.
+ * This is mainly for use with the BioPAX pathway data
+ * from the Pathway Commons db (normalized and enriched with xrefs).
  *
  * @author Ozgun Babur et al.
  */

diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/MinerAdapter.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/MinerAdapter.java
@@ -1,6 +1,7 @@
 package org.biopax.paxtools.pattern.miner;
 
 import org.biopax.paxtools.controller.PathAccessor;
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.model.BioPAXElement;
 import org.biopax.paxtools.model.level3.*;
 import org.biopax.paxtools.pattern.Match;

diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFMiner.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFMiner.java
@@ -1,5 +1,6 @@
 package org.biopax.paxtools.pattern.miner;
 
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.pattern.Match;
 import org.biopax.paxtools.pattern.util.Blacklist;
 

diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFSearcher.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFSearcher.java
@@ -1,5 +1,6 @@
 package org.biopax.paxtools.pattern.miner;
 
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.model.BioPAXElement;
 import org.biopax.paxtools.model.Model;
 import org.biopax.paxtools.pattern.Match;

diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SimpleIDFetcher.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SimpleIDFetcher.java
@@ -1,5 +1,6 @@
 package org.biopax.paxtools.pattern.miner;
 
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.model.BioPAXElement;
 
 import java.util.Collections;
@@ -15,5 +16,4 @@ public Set<String> fetchID(BioPAXElement ele)
 	{
 		return Collections.singleton(ele.getUri());
 	}
-
 }
diff --git a/pattern/src/test/java/org/biopax/paxtools/pattern/miner/SIFSearcherTest.java b/pattern/src/test/java/org/biopax/paxtools/pattern/miner/SIFSearcherTest.java
@@ -1,12 +1,12 @@
 package org.biopax.paxtools.pattern.miner;
 
+import org.biopax.paxtools.controller.IDFetcher;
 import org.biopax.paxtools.io.SimpleIOHandler;
 import org.biopax.paxtools.model.BioPAXLevel;
 import org.biopax.paxtools.model.Model;
 import org.biopax.paxtools.model.level3.*;
-import org.biopax.paxtools.pattern.Pattern;
+import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
 import org.biopax.paxtools.pattern.PatternBoxTest;
-import org.biopax.paxtools.pattern.constraint.IDConstraint;
 import org.biopax.paxtools.pattern.util.AdjacencyMatrix;
 import org.biopax.paxtools.pattern.util.Blacklist;
 import org.junit.jupiter.api.Disabled;

diff --git a/paxtools-console/src/main/java/org/biopax/paxtools/Commands.java b/paxtools-console/src/main/java/org/biopax/paxtools/Commands.java
@@ -21,6 +21,7 @@
 import org.biopax.paxtools.model.Model;
 import org.biopax.paxtools.model.level2.entity;
 import org.biopax.paxtools.model.level3.*;
+import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
 import org.biopax.paxtools.normalizer.Namespace;
 import org.biopax.paxtools.normalizer.Resolver;
 import org.biopax.paxtools.pattern.miner.*;
@@ -91,7 +92,6 @@ static void toGSEA(String[] argv) throws IOException {
 			}
 		}
 
-		// The Constructor args: GSEAConverter(idTypeNameOrPrefix, crossSpeciesCheckEnabled?, skipSubPathways?)
 		GSEAConverter gseaConverter = new GSEAConverter(argv[3], !crossSpecies, !subPathways);
 		gseaConverter.setSkipOutsidePathways(!notPathways);
 		gseaConverter.setAllowedOrganisms(organisms);//if organisms is empty then all species are allowed (no filtering)
@@ -333,8 +333,9 @@ static void toSifnx(String[] argv) throws IOException {
 
 		Model model = getModel(io, argv[1]);
 
-		if(mergeInteractions)
+		if(mergeInteractions) {
 			ModelUtils.mergeEquivalentInteractions(model);
+		}
 
 		//Create a new SIF searcher:
 		//set SIF miners to use (default is to use all types, given no include/exclude args provided)

diff --git a/...pax/paxtools/pattern/miner/IDFetcher.java → ...biopax/paxtools/controller/IDFetcher.java b/...pax/paxtools/pattern/miner/IDFetcher.java → ...biopax/paxtools/controller/IDFetcher.java
@@ -1,4 +1,4 @@
-package org.biopax.paxtools.pattern.miner;
+package org.biopax.paxtools.controller;
 
 import org.biopax.paxtools.model.BioPAXElement;
 
@@ -11,8 +11,8 @@ public interface IDFetcher
 {
 	/**
 	 * Finds a String ID for the given element.
-	 * @param ele element to fecth the ID from
-	 * @return ID
+	 * @param ele element to fetch the ID from
+	 * @return some identifiers
 	 */
-	public Set<String> fetchID(BioPAXElement ele);
+	Set<String> fetchID(BioPAXElement ele);
 }