Skip to content

Commit

Permalink
Polished Cleaner/Converter classes, etc.
Browse files Browse the repository at this point in the history
  • Loading branch information
IgorRodchenkov committed Mar 4, 2024
1 parent d710087 commit 18cb155
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 51 deletions.
12 changes: 9 additions & 3 deletions src/main/java/cpath/cleaner/HumanCycCleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,18 @@ protected void cleanXrefDBName(Model model)
for (Xref xr : model.getObjects(Xref.class))
{
if(xr.getDb() == null) {
if(!(xr instanceof PublicationXref))
if(!(xr instanceof PublicationXref))
LOG.warn(xr.getModelInterface().getSimpleName() + ".db is NULL; " + xr.getUri());
} else if(xr.getDb().startsWith("Entrez"))
}
else if(xr.getDb().startsWith("Entrez")) {
xr.setDb("genpept"); //Protein GenBank Identifier
else if(xr.getDb().equalsIgnoreCase("NCBI Taxonomy"))
}
else if(xr.getDb().equalsIgnoreCase("NCBI Taxonomy")) {
xr.setDb("ncbitaxon");
}
else if(xr.getDb().equalsIgnoreCase("Ensembl Human")) {
xr.setDb("ensembl");
}
}
}

Expand Down
60 changes: 27 additions & 33 deletions src/main/java/cpath/cleaner/ReactomeCleaner.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,36 +30,34 @@
* Removes "unstable" Reactome ID xref from objects where a stable ID is present.
*/
final class ReactomeCleaner implements Cleaner {
private static Logger log = LoggerFactory.getLogger(ReactomeCleaner.class);
private static Logger log = LoggerFactory.getLogger(ReactomeCleaner.class);

public void clean(InputStream data, OutputStream cleanedData)
public void clean(InputStream data, OutputStream cleanedData)
{
// import the original Reactome BioPAX model from file
log.info("Cleaning Reactome data...");
SimpleIOHandler simpleReader = new SimpleIOHandler(BioPAXLevel.L3);
Model model = simpleReader.convertFromOWL(data);

// Normalize pathway URIs, where possible, using Reactome stable IDs
// Since v54, Reactome stable ID format has been changed to like: "R-HSA-123456"
final Map<String, Entity> newUriToEntityMap = new HashMap<>();
final Set<Process> processes = new HashSet<>(model.getObjects(Process.class));

for(Process proc : processes) {
if (StringUtils.contains(proc.getUri(),"identifiers.org/reactome")
|| StringUtils.contains(proc.getUri(), "bioregistry.io/reactome"))
for(Process proc : processes)
{
if (StringUtils.containsAny(proc.getUri(),
"identifiers.org/reactome", "bioregistry.io/reactome")) {
continue; //skip for already normalized pathway or interaction

final Set<UnificationXref> uxrefs = new ClassFilterSet<>(
new HashSet<>(proc.getXref()), UnificationXref.class);
}
final Set<UnificationXref> uxrefs = new ClassFilterSet<>(new HashSet<>(proc.getXref()), UnificationXref.class);
for (UnificationXref x : uxrefs) {
if (x.getDb() != null && x.getDb().equalsIgnoreCase("Reactome")) {
if (StringUtils.equalsIgnoreCase(x.getDb(),"reactome")) {
String stableId = x.getId();
//remove 'REACTOME:' (length=9) prefix if present (it's optional - according to MIRIAM)
//remove 'REACTOME:' (length=9) prefix if present (it's optional - according to MIRIAM/Bioregistry)
if (stableId.startsWith("REACTOME:")) {
stableId = stableId.substring(9);
// stableID is like 'R-HSA-123456'
}

final String uri = "bioregistry.io/reactome:" + stableId;
if (!model.containsID(uri) && !newUriToEntityMap.containsKey(uri)) {
//save it in the map to replace the URI later (see below)
Expand All @@ -79,9 +77,10 @@ public void clean(InputStream data, OutputStream cleanedData)
}
}

// set standard URIs for selected entities;
for(String uri : newUriToEntityMap.keySet())
// set standard URIs for selected entities (processes);
for(String uri : newUriToEntityMap.keySet()) {
CPathUtils.replaceUri(model, newUriToEntityMap.get(uri), uri);
}

// All Conversions in Reactome are LEFT-TO-RIGHT,
// unless otherwise specified (confirmed with Guanming Wu, 2013/12)
Expand All @@ -91,29 +90,25 @@ public void clean(InputStream data, OutputStream cleanedData)
ent.setConversionDirection(ConversionDirectionType.LEFT_TO_RIGHT);
}

// Remove unstable UnificationXrefs like "Reactome Database ID Release XX"
// Remove unstable UnificationXrefs like "Reactome Database ID Release 65"
// if there is a stable xref in the same object
// Since Reactome v54, stable ID format is different (not like REACT_12345...)
final Set<Xref> xrefsToRemove = new HashSet<>();
for(Xref xref: new HashSet<>(model.getObjects(Xref.class))) {
if(xref.getDb() != null && xref.getDb().toLowerCase().startsWith("reactome database"))
for(Xref xref: new HashSet<>(model.getObjects(Xref.class)))
{
if(StringUtils.startsWithIgnoreCase(xref.getDb(),"reactome database"))
{
//remove the long comment (save some RAM)
if(!(xref instanceof PublicationXref))
if(!(xref instanceof PublicationXref)) {
xref.getComment().clear();

//proceed with a unification xref only...
if(xref instanceof UnificationXref) {
for(XReferrable owner : new HashSet<>(xref.getXrefOf())) {
for(Xref x : new HashSet<>(owner.getXref())) {
if(!(x instanceof UnificationXref) || x.equals(xref))
continue;
//another unif. xref present in the same owner object
if(x.getDb() != null && x.getDb().equalsIgnoreCase("reactome")) {
//remove the unstable ID ref from the object that has a stable id
owner.removeXref(xref);
xrefsToRemove.add(xref);
}
}
for(XReferrable owner : new HashSet<>(xref.getXrefOf())) {
for(Xref x : new HashSet<>(owner.getXref())) {
if(StringUtils.equalsIgnoreCase(x.getDb(), "reactome")) {
//if a standard "reactome" xref is also present in the same owner object,
//then remove the unstable ID xref from that object
owner.removeXref(xref);
xrefsToRemove.add(xref);
}
}
}
Expand All @@ -128,8 +123,7 @@ public void clean(InputStream data, OutputStream cleanedData)
try {
simpleReader.convertToOWL(model, cleanedData);
} catch (Exception e) {
throw new RuntimeException("clean(), Exception thrown while saving cleaned Reactome data", e);
throw new RuntimeException("clean(), failed saving the cleaned Reactome model", e);
}
}

}
8 changes: 4 additions & 4 deletions src/main/java/cpath/converter/UniprotConverter.java
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,13 @@ public void convert(InputStream is, OutputStream os) {
// GN gene symbols - to PR names and rel. xrefs
if (geneName != null) {
Collection<String> geneNames = getGeneSymbols(geneName.toString(), proteinReference);
// always use "HGNC Symbol" for rel. xrefs, despite it can be from MGI, RGD,.. (these are coordinated by HGNC)
// always use HGNC Symbol for rel. xrefs, even though it can come from MGI, RGD (these are coordinated by HGNC);
// (cannot do this in setXRefsFromDRs: no gene synonyms there, and organism specific db names like MGI)
for (String symbol : geneNames) {
// also add Gene Names to PR names (can be >1 due to isoforms)
proteinReference.addName(symbol);
RelationshipXref rXRef = CPathUtils
.findOrCreateRelationshipXref(RelTypeVocab.IDENTITY, "HGNC Symbol", symbol, model);
.findOrCreateRelationshipXref(RelTypeVocab.IDENTITY, "hgnc.symbol", symbol, model);
proteinReference.addXref(rXRef);
}
}
Expand Down Expand Up @@ -305,12 +305,12 @@ else if (db.equalsIgnoreCase("EMBL")) {
fixedDb = "Nucleotide Sequence Database";
//last ID in a HGNC line is in fact gene name
} else if(db.equalsIgnoreCase("HGNC") && !id.startsWith("HGNC:")) {
fixedDb = "HGNC Symbol";
fixedDb = "hgnc.symbol";
}
//remove .version from RefSeq IDs
else if (db.equalsIgnoreCase("REFSEQ")) {
// extract only RefSeq AC from AC.Version ID form
fixedDb = "RefSeq";
fixedDb = "refseq";
id = id.replaceFirst("\\.\\d+", "");
}

Expand Down
9 changes: 5 additions & 4 deletions src/main/java/cpath/service/BiopaxConverter.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import cpath.service.api.OutputFormat;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.biopax.paxtools.io.gsea.GSEAConverter;
import org.biopax.paxtools.io.jsonld.JsonldBiopaxConverter;
import org.biopax.paxtools.io.jsonld.JsonldConverter;
Expand Down Expand Up @@ -178,9 +179,9 @@ private void convertToSBGN(Model m, OutputStream stream, Blacklist blackList, bo
private void convertToGSEA(Model m, OutputStream stream, Map<String, String> options)
throws IOException {
String idType;
if ((idType = options.get("db")) == null)
idType = "hgnc symbol";

if ((idType = options.get("db")) == null) {
idType = "hgnc.symbol";
}

// It won't traverse into sub-pathways; will use only pre-defined organisms.
// GSEAConverter's 'skipSubPathways' option is a different beast from the PC web api's 'subpw':
Expand Down Expand Up @@ -215,7 +216,7 @@ private void convertToSIF(Model m, OutputStream out,
ConfigurableIDFetcher idFetcher = new ConfigurableIDFetcher();
idFetcher.chemDbStartsWithOrEquals("chebi");

if (db == null || db.isEmpty() || db.toLowerCase().startsWith("hgnc")) {
if (StringUtils.isBlank(db) || db.toLowerCase().startsWith("hgnc")) {
idFetcher.seqDbStartsWithOrEquals("hgnc");
} else if (db.toLowerCase().startsWith("uniprot")) {
idFetcher.seqDbStartsWithOrEquals("uniprot");
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/cpath/service/ConsoleApplication.java
Original file line number Diff line number Diff line change
Expand Up @@ -391,14 +391,14 @@ private void writeScriptCommands(String bpFilename, PrintWriter writer, boolean
final String commaSepTaxonomyIds = String.join(",", service.settings().getOrganismTaxonomyIds());
if (exportToGSEA) {
writer.println(String.format("%s %s '%s' '%s' %s 2>&1 &", javaRunPaxtools, "toGSEA", bpFilename,
prefix + "hgnc.gmt", "'hgnc symbol' 'organisms=" + commaSepTaxonomyIds + "'"));//'hgnc symbol' - important
prefix + "hgnc.gmt", "'hgnc.symbol' 'organisms=" + commaSepTaxonomyIds + "'"));//'hgnc symbol' - important
writer.println(String.format("%s %s '%s' '%s' %s 2>&1 &", javaRunPaxtools, "toGSEA", bpFilename,
prefix + "uniprot.gmt", "'uniprot' 'organisms=" + commaSepTaxonomyIds + "'"));
writer.println("wait"); //important
writer.println("echo \"Done converting " + bpFilename + " to GSEA.\"");
}
writer.println(String.format("%s %s '%s' '%s' %s 2>&1 &", javaRunPaxtools, "toSIF", bpFilename,
prefix + "hgnc.txt", "seqDb=hgnc -extended -andSif exclude=neighbor_of"));//'hgnc symbol' or 'hgnc' does not matter
prefix + "hgnc.txt", "seqDb=hgnc -extended -andSif exclude=neighbor_of"));
//UniProt ID based extended SIF files can be huge, take too long to generate; skip for now.
writer.println("wait"); //important
writer.println("echo \"Done converting " + bpFilename + " to SIF.\"");
Expand Down
11 changes: 6 additions & 5 deletions src/main/java/cpath/service/Merger.java
Original file line number Diff line number Diff line change
Expand Up @@ -606,8 +606,9 @@ else if(primaryACs.size() > maxNumXrefsToAdd) {
}

// map primary ACs to HGNC Symbols and generate RXs if not too many...
if (noneXrefDbStartsWith(bpe, "hgnc symbol"))
if (noneXrefDbStartsWith(bpe, "hgnc.symbol")) {
mayAddHgncXrefs(m, bpe, primaryACs, maxNumXrefsToAdd);
}
}

// For biopolymers, also map uniprot accessions to HGNC Symbols, and add the xrefs, if possible -
Expand All @@ -618,17 +619,17 @@ private void mayAddHgncXrefs(final Model m, final XReferrable bpe,
}
final Set<String> hgncSymbols = new HashSet<>();
for (String ac : accessions) {
ProteinReference canonicalPR =
(ProteinReference) warehouseModel.getByID("bioregistry.io/uniprot:" + ac);
ProteinReference canonicalPR = (ProteinReference) warehouseModel.getByID("bioregistry.io/uniprot:" + ac);
if (canonicalPR != null) {
for (Xref x : canonicalPR.getXref())
if (x.getDb().equalsIgnoreCase("hgnc symbol"))
if (x.getDb().equalsIgnoreCase("hgnc.symbol")) {
hgncSymbols.add(x.getId());
}
}
}
// add rel. xrefs only if there are not too many (otherwise there's a risk of producing nonsense SIF/GSEA exports...)
if (!hgncSymbols.isEmpty() && hgncSymbols.size() <= maxNumXrefsToAdd) {
addRelXrefs(m, bpe, "hgnc symbol", hgncSymbols, RelTypeVocab.ADDITIONAL_INFORMATION);
addRelXrefs(m, bpe, "hgnc.symbol", hgncSymbols, RelTypeVocab.ADDITIONAL_INFORMATION);
}
}

Expand Down

0 comments on commit 18cb155

Please sign in to comment.