Skip to content

Commit

Permalink
Removed duplicate/old code; moved IDFetcher to -core and Configurable…
Browse files Browse the repository at this point in the history
…IDFetcher (uses Resolver) to -normalizer modules

(trying to avoid circular dependencies in the future, i.e., core should not depend on any other module, and
gsea shouldn't depend on pattern or vice versa; but it's ok for gsea and pattern to depend on core and normalizer...)
  • Loading branch information
IgorRodchenkov committed Apr 29, 2024
1 parent 3ec3dc4 commit 890f31f
Show file tree
Hide file tree
Showing 15 changed files with 59 additions and 208 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.biopax.paxtools.model.BioPAXLevel;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

Expand All @@ -17,36 +18,25 @@
import java.util.*;

/**
* An advanced BioPAX to GMT format converter, which can output IDs of genetic elements or chemicals
* (the output file can be loaded with the GSEA software if gene/protein IDs are used).
*
* Each output entry (row) consists of three columns (tab separated):
* name (URI), description, and the list of identifiers (of the same type).
* For all ERs not associated with any pathway, "other" is used for name and uri.
* An experimental/advanced BioPAX to GMT converter
* that can output the desired type IDs for both genetic elements and chemicals
* (the output file can be then loaded with GSEA software if gene/protein IDs were used).
*
* The list may have one or more IDs of the same type per Protein Reference (PR),
* e.g., UniProt IDs or HGNC Symbols; PRs not having an xref of
* given db/id type are ignored. If there are less than three protein
* references per entry in total, it will not be printed.
* @see GSEAConverter
*
* Note, this code assumes that the model has successfully been validated
* and perhaps normalized (using the BioPAX Validator, Paxtools Normalizer).
* A BioPAX L1 or L2 model is first converted to the L3.
* TODO: finish, test, make it public...
*/
final class GMTConverter {
private final static Logger LOG = LoggerFactory.getLogger(GMTConverter.class);

private final IdFetcher idFetcher;
private final IDFetcher idFetcher;
private boolean skipSubPathways;
private boolean skipOutsidePathways;
private int minNumIdsPerEntry;

/**
* Constructor.
*/
public GMTConverter()
{
idFetcher = new IdFetcher().chemDbStartsWithOrEquals("chebi")
idFetcher = new ConfigurableIDFetcher().chemDbStartsWithOrEquals("chebi")
.seqDbStartsWithOrEquals("hgnc.symbol") //the order in the list does matter
.seqDbStartsWithOrEquals("hgnc symbol")
.seqDbStartsWithOrEquals("hgnc");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,9 @@
import java.util.*;

/**
* Converts a BioPAX model to the GMT format (used by GSEA software).
* Converts a BioPAX model to the GMT format (used by GSEA software).
*
* It creates GSEA entries from sequence entity reference xrefs
* It creates GSEA entries from sequence entity reference xrefs
* in the BioPAX model as follows:
*
* Each entry (row) consists of three columns (tab separated):
Expand All @@ -35,14 +35,10 @@
* given db/id type are ignored. Optionally, if there are fewer than three protein
* references per entry, the entry will not be printed.
*
* Note, to effectively enforce cross-species violation,
* 'organism' property and pathways must be set
* to a BioSource object that has a valid unification xref:
* db="Taxonomy" and id= some valid taxonomy id.
*
* Note, this code assumes that the model has successfully been validated
* and perhaps normalized (using the BioPAX Validator, Paxtools Normalizer).
* A BioPAX L1 or L2 model is first converted to the L3.
* Notes:
* - to effectively enforce the cross-species checks, the 'organism' property of pathways and participants must be set
* to a BioSource element with a valid ncbitaxon (taxonomy) unification xref;
* - this assumes that the BioPAX Level3 model was validated and normalized (with the BioPAX Validator and Normalizer).
*/
public class GSEAConverter
{
Expand Down Expand Up @@ -98,7 +94,7 @@ public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled)
public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled, boolean skipSubPathways)
{
if(Resolver.isKnownNameOrVariant(idType)) {
this.idType = Resolver.getNamespace(idType, true).getPrefix().toLowerCase(); //(it must be already lowercase though)
this.idType = Resolver.getNamespace(idType).getPrefix().toLowerCase(); //(it must be already lowercase though)
} else {
this.idType = (StringUtils.isNotBlank(idType)) ? idType.toLowerCase() : idType;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,12 +1,11 @@
package org.biopax.paxtools.pattern.miner;
package org.biopax.paxtools.normalizer;

import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.normalizer.Namespace;
import org.biopax.paxtools.normalizer.Resolver;
import org.biopax.paxtools.util.HGNC;
import org.biopax.paxtools.util.ClassFilterSet;
import org.biopax.paxtools.util.HGNC;

import java.util.*;

Expand Down Expand Up @@ -87,25 +86,36 @@ public ConfigurableIDFetcher useNameWhenNoDbMatch(boolean useNameWhenNoDbMatch)
public Set<String> fetchID(BioPAXElement ele)
{
Set<String> set = new HashSet<>();
if(ele instanceof EntityReference) {
System.out.println("fetchID(ER): " + ele.getUri());
}
if(ele instanceof XReferrable) {
//Iterate the db priority list, match/filter all xrefs to collect the IDs of given type, until 'set' is not empty.
List<String> dbStartsWithOrEquals = (ele instanceof SmallMoleculeReference || ele instanceof SmallMolecule)
List<String> dbStartsWithOrEquals =
(ele instanceof SmallMoleculeReference || ele instanceof SmallMolecule)
? chemDbStartsWithOrEquals : seqDbStartsWithOrEquals;
for (String dbStartsWith : dbStartsWithOrEquals) {
for (UnificationXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(),
UnificationXref.class)) {
collectXrefIdIfDbLike(x, dbStartsWith, set);
//a shortcut for normalized URIs; prevents collecting lots of secondary IDs of the same type
if(StringUtils.containsIgnoreCase(ele.getUri(),"identifiers.org/"+dbStartsWith)
|| StringUtils.containsIgnoreCase(ele.getUri(),"bioregistry.io/"+dbStartsWith)) {
//can be http://identifiers.org/hgnc.symbol:PCNA or http://bioregistry.io/chebi:20, etc.
set.add(StringUtils.substringAfterLast(ele.getUri(), "/"));
}
//if none were found in the unification xrefs, then try the relationship xrefs
if (set.isEmpty()) {
for (RelationshipXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(),
RelationshipXref.class)) {
else {
for (UnificationXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), UnificationXref.class)) {
collectXrefIdIfDbLike(x, dbStartsWith, set);
}
//if none found, try relationship xrefs
if (set.isEmpty()) {
for (RelationshipXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), RelationshipXref.class)) {
collectXrefIdIfDbLike(x, dbStartsWith, set);
}
}
}
//once we've found some ID, no need to try another id type
if (!set.isEmpty())
//once we've found some ID, no need to try next (lower priority) id type in the list
if (!set.isEmpty()) {
break;
}
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ public final void getNamespace() {
() -> Assertions.assertNotNull(Resolver.getNamespace("obo.mi")),//auto-detected as "mi"
() -> Assertions.assertEquals(MI, Resolver.getNamespace("urn:miriam:mi").getName()),
() -> Assertions.assertNotNull(Resolver.getNamespace("psi-mi")),//becomes "mi"
() -> Assertions.assertNotNull(Resolver.getNamespace("MolecularInteractions Ontology", true)),//misspelling variant (allowed by default)
() -> Assertions.assertNotNull(Resolver.getNamespace("MolecularInteractions Ontology", true)),//misspelling variant (allowed by default too)
() -> Assertions.assertNull(Resolver.getNamespace("MolecularInteractions Ontology", false)),//null when spelling variants not allowed
() -> Assertions.assertNotNull(Resolver.getNamespace("http://bioregistry.io/chebi")),
() -> Assertions.assertNotNull(Resolver.getNamespace("bioregistry.io/uniprot")),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
package org.biopax.paxtools.pattern.constraint;

import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.pattern.Match;
import org.biopax.paxtools.pattern.miner.IDFetcher;

import java.util.Map;
import java.util.Set;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.biopax.paxtools.pattern.miner;

import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.SmallMoleculeReference;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.biopax.paxtools.pattern.miner;

import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.normalizer.Resolver;
Expand All @@ -12,10 +13,9 @@

/**
* Tries to get Gene Symbols or UniProt IDs for genes
* and - display names for small molecules;
*
* This id-fetcher is mainly to use with the BioPAX pathway data
* from Pathway Commons (PC2) db (normalized and enriched with xrefs).
* and display names for small molecules.
* This is mainly intended for use with the BioPAX pathway data
* from the Pathway Commons db (normalized and enriched with xrefs).
*
* @author Ozgun Babur et al.
*/
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package org.biopax.paxtools.pattern.miner;

import org.biopax.paxtools.controller.PathAccessor;
import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.pattern.Match;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.biopax.paxtools.pattern.miner;

import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.pattern.Match;
import org.biopax.paxtools.pattern.util.Blacklist;

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.biopax.paxtools.pattern.miner;

import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.model.BioPAXElement;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.pattern.Match;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package org.biopax.paxtools.pattern.miner;

import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.model.BioPAXElement;

import java.util.Collections;
Expand All @@ -15,5 +16,4 @@ public Set<String> fetchID(BioPAXElement ele)
{
return Collections.singleton(ele.getUri());
}

}
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
package org.biopax.paxtools.pattern.miner;

import org.biopax.paxtools.controller.IDFetcher;
import org.biopax.paxtools.io.SimpleIOHandler;
import org.biopax.paxtools.model.BioPAXLevel;
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.pattern.Pattern;
import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
import org.biopax.paxtools.pattern.PatternBoxTest;
import org.biopax.paxtools.pattern.constraint.IDConstraint;
import org.biopax.paxtools.pattern.util.AdjacencyMatrix;
import org.biopax.paxtools.pattern.util.Blacklist;
import org.junit.jupiter.api.Disabled;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.biopax.paxtools.model.Model;
import org.biopax.paxtools.model.level2.entity;
import org.biopax.paxtools.model.level3.*;
import org.biopax.paxtools.normalizer.ConfigurableIDFetcher;
import org.biopax.paxtools.normalizer.Namespace;
import org.biopax.paxtools.normalizer.Resolver;
import org.biopax.paxtools.pattern.miner.*;
Expand Down Expand Up @@ -91,7 +92,6 @@ static void toGSEA(String[] argv) throws IOException {
}
}

// The Constructor args: GSEAConverter(idTypeNameOrPrefix, crossSpeciesCheckEnabled?, skipSubPathways?)
GSEAConverter gseaConverter = new GSEAConverter(argv[3], !crossSpecies, !subPathways);
gseaConverter.setSkipOutsidePathways(!notPathways);
gseaConverter.setAllowedOrganisms(organisms);//if organisms is empty then all species are allowed (no filtering)
Expand Down Expand Up @@ -333,8 +333,9 @@ static void toSifnx(String[] argv) throws IOException {

Model model = getModel(io, argv[1]);

if(mergeInteractions)
if(mergeInteractions) {
ModelUtils.mergeEquivalentInteractions(model);
}

//Create a new SIF searcher:
//set SIF miners to use (default is to use all types, given no include/exclude args provided)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.biopax.paxtools.pattern.miner;
package org.biopax.paxtools.controller;

import org.biopax.paxtools.model.BioPAXElement;

Expand All @@ -11,8 +11,8 @@ public interface IDFetcher
{
/**
* Finds a String ID for the given element.
* @param ele element to fecth the ID from
* @return ID
* @param ele element to fetch the ID from
* @return some identifiers
*/
public Set<String> fetchID(BioPAXElement ele);
Set<String> fetchID(BioPAXElement ele);
}
Loading

0 comments on commit 890f31f

Please sign in to comment.