From 890f31f5d56f8570317bc398a40dd138ba033688 Mon Sep 17 00:00:00 2001 From: IgorRodchenkov Date: Mon, 29 Apr 2024 02:43:56 -0400 Subject: [PATCH] Removed duplicate/old code; moved IDFetcher to -core and ConfigurableIDFetcher (uses Resolver) to -normalizer modules (trying to avoid circular dependencies in the future, i.e., core should not depend on any other module, gsea shouldn't depend on pattern and vice versa; but it's ok if gsea, pattern to depend on core and normalizer...) --- .../biopax/paxtools/io/gsea/GMTConverter.java | 26 +-- .../paxtools/io/gsea/GSEAConverter.java | 18 +-- .../normalizer}/ConfigurableIDFetcher.java | 38 +++-- .../paxtools/normalizer/ResolverTest.java | 2 +- .../paxtools/pattern/constraint/HasAnID.java | 2 +- .../pattern/miner/BlacklistGenerator.java | 1 + .../pattern/miner/CommonIDFetcher.java | 8 +- .../paxtools/pattern/miner/MinerAdapter.java | 1 + .../paxtools/pattern/miner/SIFMiner.java | 1 + .../paxtools/pattern/miner/SIFSearcher.java | 1 + .../pattern/miner/SimpleIDFetcher.java | 2 +- .../pattern/miner/SIFSearcherTest.java | 4 +- .../java/org/biopax/paxtools/Commands.java | 5 +- .../paxtools/controller}/IDFetcher.java | 8 +- .../biopax/paxtools/controller/IdFetcher.java | 150 ------------------ 15 files changed, 59 insertions(+), 208 deletions(-) rename {pattern/src/main/java/org/biopax/paxtools/pattern/miner => normalizer/src/main/java/org/biopax/paxtools/normalizer}/ConfigurableIDFetcher.java (80%) rename {pattern/src/main/java/org/biopax/paxtools/pattern/miner => paxtools-core/src/main/java/org/biopax/paxtools/controller}/IDFetcher.java (55%) delete mode 100644 paxtools-core/src/main/java/org/biopax/paxtools/controller/IdFetcher.java diff --git a/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java b/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java index d92c89068..294691955 100644 --- a/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java +++ b/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GMTConverter.java @@ -7,6 +7,7 @@ import org.biopax.paxtools.model.BioPAXLevel; import org.biopax.paxtools.model.Model; import org.biopax.paxtools.model.level3.*; +import org.biopax.paxtools.normalizer.ConfigurableIDFetcher; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -17,36 +18,25 @@ import java.util.*; /** - * An advanced BioPAX to GMT format converter, which can output IDs of genetic elements or chemicals - * (the output file can be loaded with the GSEA software if gene/protein IDs are used). - * - * Each output entry (row) consists of three columns (tab separated): - * name (URI), description, and the list of identifiers (of the same type). - * For all ERs not associated with any pathway, "other" is used for name and uri. + * An experimental/advanced BioPAX to GMT converter + * that can output the desired type IDs for both genetic elements and chemicals + * (the output file can be then loaded with GSEA software if gene/protein IDs were used). * - * The list may have one or more IDs of the same type per Protein Reference (PR), - * e.g., UniProt IDs or HGNC Symbols; PRs not having an xref of - * given db/id type are ignored. If there are less than three protein - * references per entry in total, it will not be printed. + * @see GSEAConverter * - * Note, this code assumes that the model has successfully been validated - * and perhaps normalized (using the BioPAX Validator, Paxtools Normalizer). - * A BioPAX L1 or L2 model is first converted to the L3. + * TODO: finish, test, make it public... */ final class GMTConverter { private final static Logger LOG = LoggerFactory.getLogger(GMTConverter.class); - private final IdFetcher idFetcher; + private final IDFetcher idFetcher; private boolean skipSubPathways; private boolean skipOutsidePathways; private int minNumIdsPerEntry; - /** - * Constructor. - */ public GMTConverter() { - idFetcher = new IdFetcher().chemDbStartsWithOrEquals("chebi") + idFetcher = new ConfigurableIDFetcher().chemDbStartsWithOrEquals("chebi") .seqDbStartsWithOrEquals("hgnc.symbol") //the order in the list does matter .seqDbStartsWithOrEquals("hgnc symbol") .seqDbStartsWithOrEquals("hgnc"); diff --git a/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GSEAConverter.java b/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GSEAConverter.java index 89935d2c9..cfda13587 100644 --- a/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GSEAConverter.java +++ b/gsea-converter/src/main/java/org/biopax/paxtools/io/gsea/GSEAConverter.java @@ -19,9 +19,9 @@ import java.util.*; /** - * Converts a BioPAX model to the GMT format (used by GSEA software). + * Converts a BioPAX model to the GMT format (used by GSEA software). * - * It creates GSEA entries from sequence entity reference xrefs + * It creates GSEA entries from sequence entity reference xrefs * in the BioPAX model as follows: * * Each entry (row) consists of three columns (tab separated): @@ -35,14 +35,10 @@ * given db/id type are ignored. Optionally, if there are less than three protein * references per entry, it will not be printed. * - * Note, to effectively enforce cross-species violation, - * 'organism' property and pathways must be set - * to a BioSource object that has a valid unification xref: - * db="Taxonomy" and id= some valid taxonomy id. - * - * Note, this code assumes that the model has successfully been validated - * and perhaps normalized (using the BioPAX Validator, Paxtools Normalizer). - * A BioPAX L1 or L2 model is first converted to the L3. + * Notes: + * - to effectively enforce the cross-species checks, 'organism' property of pathways and participants must be set + * to a BioSource element with valid ncbitaxon (taxonomy) unification xref; + * - this assumes that the BioPAX Level3 model was validated and normalized (with BioPAX Validator and Normalizer); */ public class GSEAConverter { @@ -98,7 +94,7 @@ public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled) public GSEAConverter(String idType, boolean crossSpeciesCheckEnabled, boolean skipSubPathways) { if(Resolver.isKnownNameOrVariant(idType)) { - this.idType = Resolver.getNamespace(idType, true).getPrefix().toLowerCase(); //(it must be already lowecase though) + this.idType = Resolver.getNamespace(idType).getPrefix().toLowerCase(); //(it must be already lowecase though) } else { this.idType = (StringUtils.isNotBlank(idType)) ? idType.toLowerCase() : idType; } diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/ConfigurableIDFetcher.java b/normalizer/src/main/java/org/biopax/paxtools/normalizer/ConfigurableIDFetcher.java similarity index 80% rename from pattern/src/main/java/org/biopax/paxtools/pattern/miner/ConfigurableIDFetcher.java rename to normalizer/src/main/java/org/biopax/paxtools/normalizer/ConfigurableIDFetcher.java index 53e296a29..24cb658e7 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/ConfigurableIDFetcher.java +++ b/normalizer/src/main/java/org/biopax/paxtools/normalizer/ConfigurableIDFetcher.java @@ -1,12 +1,11 @@ -package org.biopax.paxtools.pattern.miner; +package org.biopax.paxtools.normalizer; import org.apache.commons.lang3.StringUtils; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.model.BioPAXElement; import org.biopax.paxtools.model.level3.*; -import org.biopax.paxtools.normalizer.Namespace; -import org.biopax.paxtools.normalizer.Resolver; -import org.biopax.paxtools.util.HGNC; import org.biopax.paxtools.util.ClassFilterSet; +import org.biopax.paxtools.util.HGNC; import java.util.*; @@ -87,25 +86,36 @@ public ConfigurableIDFetcher useNameWhenNoDbMatch(boolean useNameWhenNoDbMatch) public Set fetchID(BioPAXElement ele) { Set set = new HashSet<>(); + if(ele instanceof EntityReference) { + System.out.println("fetchID(ER): " + ele.getUri()); + } if(ele instanceof XReferrable) { //Iterate the db priority list, match/filter all xrefs to collect the IDs of given type, until 'set' is not empty. - List dbStartsWithOrEquals = (ele instanceof SmallMoleculeReference || ele instanceof SmallMolecule) + List dbStartsWithOrEquals = + (ele instanceof SmallMoleculeReference || ele instanceof SmallMolecule) ? chemDbStartsWithOrEquals : seqDbStartsWithOrEquals; for (String dbStartsWith : dbStartsWithOrEquals) { - for (UnificationXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), - UnificationXref.class)) { - collectXrefIdIfDbLike(x, dbStartsWith, set); + //a shortcut for normalized URIs; prevents collecting lots of secondary IDs of the same type + if(StringUtils.containsIgnoreCase(ele.getUri(),"identifiers.org/"+dbStartsWith) + || StringUtils.containsIgnoreCase(ele.getUri(),"bioregistry.io/"+dbStartsWith)) { + //can be http://identifiers.org/hgnc.symbol:PCNA or http://bioregistry.io/chebi:20, etc. + set.add(StringUtils.substringAfterLast(ele.getUri(), "/")); } - //if none were found in the unification xrefs, then try the relationship xrefs - if (set.isEmpty()) { - for (RelationshipXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), - RelationshipXref.class)) { + else { + for (UnificationXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), UnificationXref.class)) { collectXrefIdIfDbLike(x, dbStartsWith, set); } + //if none found, try relationship xrefs + if (set.isEmpty()) { + for (RelationshipXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), RelationshipXref.class)) { + collectXrefIdIfDbLike(x, dbStartsWith, set); + } + } } - //once we've found some ID, no need to try another id type - if (!set.isEmpty()) + //once we've found some ID, no need to try next (lower priority) id type in the list + if (!set.isEmpty()) { break; + } } } diff --git a/normalizer/src/test/java/org/biopax/paxtools/normalizer/ResolverTest.java b/normalizer/src/test/java/org/biopax/paxtools/normalizer/ResolverTest.java index eedfb5e72..e7f4c63c2 100644 --- a/normalizer/src/test/java/org/biopax/paxtools/normalizer/ResolverTest.java +++ b/normalizer/src/test/java/org/biopax/paxtools/normalizer/ResolverTest.java @@ -12,7 +12,7 @@ public final void getNamespace() { () -> Assertions.assertNotNull(Resolver.getNamespace("obo.mi")),//auto-detected as "mi" () -> Assertions.assertEquals(MI, Resolver.getNamespace("urn:miriam:mi").getName()), () -> Assertions.assertNotNull(Resolver.getNamespace("psi-mi")),//becomes "mi" - () -> Assertions.assertNotNull(Resolver.getNamespace("MolecularInteractions Ontology", true)),//misspelling variant (allowed by default) + () -> Assertions.assertNotNull(Resolver.getNamespace("MolecularInteractions Ontology", true)),//misspelling variant (allowed by default too) () -> Assertions.assertNull(Resolver.getNamespace("MolecularInteractions Ontology", false)),//null when spelling variants not allowed () -> Assertions.assertNotNull(Resolver.getNamespace("http://bioregistry.io/chebi")), () -> Assertions.assertNotNull(Resolver.getNamespace("bioregistry.io/uniprot")), diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/constraint/HasAnID.java b/pattern/src/main/java/org/biopax/paxtools/pattern/constraint/HasAnID.java index 1dfc6f2db..984b69eeb 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/constraint/HasAnID.java +++ b/pattern/src/main/java/org/biopax/paxtools/pattern/constraint/HasAnID.java @@ -1,8 +1,8 @@ package org.biopax.paxtools.pattern.constraint; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.model.BioPAXElement; import org.biopax.paxtools.pattern.Match; -import org.biopax.paxtools.pattern.miner.IDFetcher; import java.util.Map; import java.util.Set; diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/BlacklistGenerator.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/BlacklistGenerator.java index d7e3cc534..035e2dd75 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/BlacklistGenerator.java +++ b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/BlacklistGenerator.java @@ -1,5 +1,6 @@ package org.biopax.paxtools.pattern.miner; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.model.BioPAXElement; import org.biopax.paxtools.model.Model; import org.biopax.paxtools.model.level3.SmallMoleculeReference; diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/CommonIDFetcher.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/CommonIDFetcher.java index 1efb1df0a..79c9d5e98 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/CommonIDFetcher.java +++ b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/CommonIDFetcher.java @@ -1,6 +1,7 @@ package org.biopax.paxtools.pattern.miner; import org.apache.commons.lang3.StringUtils; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.model.BioPAXElement; import org.biopax.paxtools.model.level3.*; import org.biopax.paxtools.normalizer.Resolver; @@ -12,10 +13,9 @@ /** * Tries to get Gene Symbols or UniProt IDs for genes - * and - display names for small molecules; - * - * This id-fetcher is mainly to use with the BioPAX pathway data - * from Pathway Commons (PC2) db (normalized and enriched with xrefs). + * and - get display names for small molecules. + * This is mainly to use with the BioPAX pathway data + * from Pathway Commons db (normalized and enriched with xrefs). * * @author Ozgun Babur et al. */ diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/MinerAdapter.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/MinerAdapter.java index 1fae59a00..9353be3ab 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/MinerAdapter.java +++ b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/MinerAdapter.java @@ -1,6 +1,7 @@ package org.biopax.paxtools.pattern.miner; import org.biopax.paxtools.controller.PathAccessor; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.model.BioPAXElement; import org.biopax.paxtools.model.level3.*; import org.biopax.paxtools.pattern.Match; diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFMiner.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFMiner.java index fc19ca496..2fa5d548a 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFMiner.java +++ b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFMiner.java @@ -1,5 +1,6 @@ package org.biopax.paxtools.pattern.miner; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.pattern.Match; import org.biopax.paxtools.pattern.util.Blacklist; diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFSearcher.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFSearcher.java index 7b0d9b36c..4069d600f 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFSearcher.java +++ b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SIFSearcher.java @@ -1,5 +1,6 @@ package org.biopax.paxtools.pattern.miner; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.model.BioPAXElement; import org.biopax.paxtools.model.Model; import org.biopax.paxtools.pattern.Match; diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SimpleIDFetcher.java b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SimpleIDFetcher.java index 96c2a558c..12b8b713b 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SimpleIDFetcher.java +++ b/pattern/src/main/java/org/biopax/paxtools/pattern/miner/SimpleIDFetcher.java @@ -1,5 +1,6 @@ package org.biopax.paxtools.pattern.miner; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.model.BioPAXElement; import java.util.Collections; @@ -15,5 +16,4 @@ public Set fetchID(BioPAXElement ele) { return Collections.singleton(ele.getUri()); } - } diff --git a/pattern/src/test/java/org/biopax/paxtools/pattern/miner/SIFSearcherTest.java b/pattern/src/test/java/org/biopax/paxtools/pattern/miner/SIFSearcherTest.java index 6b503cfcd..5f88d4590 100644 --- a/pattern/src/test/java/org/biopax/paxtools/pattern/miner/SIFSearcherTest.java +++ b/pattern/src/test/java/org/biopax/paxtools/pattern/miner/SIFSearcherTest.java @@ -1,12 +1,12 @@ package org.biopax.paxtools.pattern.miner; +import org.biopax.paxtools.controller.IDFetcher; import org.biopax.paxtools.io.SimpleIOHandler; import org.biopax.paxtools.model.BioPAXLevel; import org.biopax.paxtools.model.Model; import org.biopax.paxtools.model.level3.*; -import org.biopax.paxtools.pattern.Pattern; +import org.biopax.paxtools.normalizer.ConfigurableIDFetcher; import org.biopax.paxtools.pattern.PatternBoxTest; -import org.biopax.paxtools.pattern.constraint.IDConstraint; import org.biopax.paxtools.pattern.util.AdjacencyMatrix; import org.biopax.paxtools.pattern.util.Blacklist; import org.junit.jupiter.api.Disabled; diff --git a/paxtools-console/src/main/java/org/biopax/paxtools/Commands.java b/paxtools-console/src/main/java/org/biopax/paxtools/Commands.java index 92b9e73fd..62d85a78a 100644 --- a/paxtools-console/src/main/java/org/biopax/paxtools/Commands.java +++ b/paxtools-console/src/main/java/org/biopax/paxtools/Commands.java @@ -21,6 +21,7 @@ import org.biopax.paxtools.model.Model; import org.biopax.paxtools.model.level2.entity; import org.biopax.paxtools.model.level3.*; +import org.biopax.paxtools.normalizer.ConfigurableIDFetcher; import org.biopax.paxtools.normalizer.Namespace; import org.biopax.paxtools.normalizer.Resolver; import org.biopax.paxtools.pattern.miner.*; @@ -91,7 +92,6 @@ static void toGSEA(String[] argv) throws IOException { } } - // The Constructor args: GSEAConverter(idTypeNameOrPrefix, crossSpeciesCheckEnabled?, skipSubPathways?) GSEAConverter gseaConverter = new GSEAConverter(argv[3], !crossSpecies, !subPathways); gseaConverter.setSkipOutsidePathways(!notPathways); gseaConverter.setAllowedOrganisms(organisms);//if organisms is empty then all species are allowed (no filtering) @@ -333,8 +333,9 @@ static void toSifnx(String[] argv) throws IOException { Model model = getModel(io, argv[1]); - if(mergeInteractions) + if(mergeInteractions) { ModelUtils.mergeEquivalentInteractions(model); + } //Create a new SIF searcher: //set SIF miners to use (default is to use all types, given no include/exclude args provided) diff --git a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/IDFetcher.java b/paxtools-core/src/main/java/org/biopax/paxtools/controller/IDFetcher.java similarity index 55% rename from pattern/src/main/java/org/biopax/paxtools/pattern/miner/IDFetcher.java rename to paxtools-core/src/main/java/org/biopax/paxtools/controller/IDFetcher.java index dcbedfc62..3ded59a91 100644 --- a/pattern/src/main/java/org/biopax/paxtools/pattern/miner/IDFetcher.java +++ b/paxtools-core/src/main/java/org/biopax/paxtools/controller/IDFetcher.java @@ -1,4 +1,4 @@ -package org.biopax.paxtools.pattern.miner; +package org.biopax.paxtools.controller; import org.biopax.paxtools.model.BioPAXElement; @@ -11,8 +11,8 @@ public interface IDFetcher { /** * Finds a String ID for the given element. - * @param ele element to fecth the ID from - * @return ID + * @param ele element to fetch the ID from + * @return some identifiers */ - public Set fetchID(BioPAXElement ele); + Set fetchID(BioPAXElement ele); } diff --git a/paxtools-core/src/main/java/org/biopax/paxtools/controller/IdFetcher.java b/paxtools-core/src/main/java/org/biopax/paxtools/controller/IdFetcher.java deleted file mode 100644 index 526b64ab3..000000000 --- a/paxtools-core/src/main/java/org/biopax/paxtools/controller/IdFetcher.java +++ /dev/null @@ -1,150 +0,0 @@ -package org.biopax.paxtools.controller; - -import org.apache.commons.lang3.StringUtils; -import org.biopax.paxtools.model.BioPAXElement; -import org.biopax.paxtools.model.level3.*; -import org.biopax.paxtools.util.ClassFilterSet; - -import java.util.*; - -/** - * Tries to get preferred type IDs of an entity reference. - * - * Could be used, e.g., as part of a BioPAX to plain text (or SIF, GMT) converter. - */ -public class IdFetcher -{ - private final List seqDbStartsWithOrEquals; - private final List chemDbStartsWithOrEquals; - private boolean useNameWhenNoDbMatch; - - /** - * Constructor. - */ - public IdFetcher() { - seqDbStartsWithOrEquals = new ArrayList<>(); - chemDbStartsWithOrEquals = new ArrayList<>(); - useNameWhenNoDbMatch = false; - } - - /** - * Set to prefer collecting gene/sequence IDs of such Xrefs - * where the db starts with or equals given string, - * ignoring case. You can chain this method calls like - * seqDbStartsWithOrEquals(A).seqDbStartsWithOrEquals(B)... - - * it will try to match a xref.db and collect xref.id - * in the given order/priority. - * - * @param dbStartsWithOrEquals the Xref.db value or prefix (case-insensitive) - * @return this id-fetcher instance - */ - public IdFetcher seqDbStartsWithOrEquals(String dbStartsWithOrEquals) { - this.seqDbStartsWithOrEquals.add(dbStartsWithOrEquals.toLowerCase()); - return this; - } - - public List getSeqDbStartsWithOrEquals() { - return Collections.unmodifiableList(seqDbStartsWithOrEquals); - } - - /** - * Set to prefer collecting chemical IDs of such Xrefs - * where the small molecules db starts with or equals given string, - * ignoring case. You can chain this method calls like - * chemDbStartsWithOrEquals(A).chemDbStartsWithOrEquals(B)... - - * it will try to match a xref.db and collect xref.id - * in the given order/priority. - * - * @param dbStartsWithOrEquals the Xref.db value or prefix (case-insensitive) - * @return this id-fetcher instance - */ - public IdFetcher chemDbStartsWithOrEquals(String dbStartsWithOrEquals) { - this.chemDbStartsWithOrEquals.add(dbStartsWithOrEquals.toLowerCase()); - return this; - } - - public List getChemDbStartsWithOrEquals() { - return Collections.unmodifiableList(chemDbStartsWithOrEquals); - } - - /** - * Set the flag to use the entity reference's names - * when no desired ID type can be found (none of xref.db - * matched before, or there are no xrefs at all). - * - * @param useNameWhenNoDbMatch true/false (default is 'true' - when this method's never been called) - * @return this id-fetcher instance - */ - public IdFetcher useNameWhenNoDbMatch(boolean useNameWhenNoDbMatch) { - this.useNameWhenNoDbMatch = useNameWhenNoDbMatch; - return this; - } - - public Set fetchID(BioPAXElement ele) - { - Set set = new HashSet<>(); - if(ele instanceof XReferrable) { - //Iterate the db priority list, match/filter all xrefs to collect the IDs of given type, until 'set' is not empty. - List dbStartsWithOrEquals = - (ele instanceof SmallMoleculeReference || ele instanceof SmallMolecule) - ? chemDbStartsWithOrEquals : seqDbStartsWithOrEquals; - - for (String dbStartsWith : dbStartsWithOrEquals) { - //a shortcut for normalized URIs; prevents collecting lots of secondary IDs of the same type - if(StringUtils.containsIgnoreCase(ele.getUri(),"identifiers.org/"+dbStartsWith) - || StringUtils.containsIgnoreCase(ele.getUri(),"bioregistry.io/"+dbStartsWith)) { - set.add(ele.getUri().substring(ele.getUri().lastIndexOf("/") + 1)); - } - else { - for (UnificationXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), UnificationXref.class)) { - collectXrefIdIfDbLike(x, dbStartsWith, set); - } - //if none was found in unif. xrefs, try rel, xrefs - if (set.isEmpty()) { - for (RelationshipXref x : new ClassFilterSet<>(((XReferrable) ele).getXref(), - RelationshipXref.class)) { - collectXrefIdIfDbLike(x, dbStartsWith, set); - } - } - } - //once we've found some ID, no need to try another id type - if (!set.isEmpty()) { - break; - } - } - } - - if (set.isEmpty() && ele instanceof Named && useNameWhenNoDbMatch) - { - Named e = (Named) ele; - //avoid shortened/incomplete names - - if (e.getDisplayName() != null && !e.getDisplayName().contains("...")) - set.add(e.getDisplayName()); - else if (e.getStandardName() != null && !e.getStandardName().contains("...")) - set.add(e.getStandardName()); - else if (!e.getName().isEmpty()) { - Set names = new TreeSet<>(); - for(String name : e.getName()) { - if(!name.contains("...")) - names.add(name); - } - set.add(names.toString()); - } - } - - return set; - } - - private void collectXrefIdIfDbLike(final Xref x, final String dbStartsWith, final Set set) { - String db = x.getDb(); - String id = x.getId(); - if (db != null && id != null && !id.isEmpty()) { - db = db.toLowerCase(); - if (db.startsWith(dbStartsWith)) { - if (id != null) - set.add(id); - } - } - } - -}