Skip to content

Commit

Permalink
Updated Resolver, Normalizer, jsonld-converter due to Jena bug/featur…
Browse files Browse the repository at this point in the history
…e...

Jena (besides other bugs, depending on version) mishandles valid absolute URIs in RDF/XML
that have no namespace or scheme prefix ('http://' or 'foo:'), e.g. 'bioregistry.io/chebi:20',
and sometimes relative URIs as well...
  • Loading branch information
IgorRodchenkov committed Apr 23, 2024
1 parent 56d8a28 commit 94483aa
Show file tree
Hide file tree
Showing 11 changed files with 201 additions and 106 deletions.
2 changes: 1 addition & 1 deletion json-converter/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<dependency>
<groupId>org.apache.jena</groupId>
<artifactId>apache-jena-libs</artifactId>
<version>4.9.0</version>
<version>5.0.0</version>
<type>pom</type>
</dependency>
<dependency>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,6 @@
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import org.apache.jena.rdf.model.Model;
import org.apache.jena.rdf.model.ModelFactory;
Expand All @@ -24,103 +22,65 @@ public class JsonldBiopaxConverter implements JsonldConverter {
private final static Logger LOG = LoggerFactory.getLogger(JsonldBiopaxConverter.class);

/*
* Convert inputstream in owl/rdf format to outputsream in jsonld format
* Convert biopax owl (rdf/xml) to jsonld format.
*/
public void convertToJsonld(InputStream in, OutputStream os)
throws IOException {

File inputProcessedFile = preProcessFile(in);
LOG.info("OWl File processed successfully ");

// print current time
SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
LOG.info("Conversion RDF to JSONLD started "
+ sdf.format(Calendar.getInstance().getTime()));

// create an empty model
public void convertToJsonld(InputStream in, OutputStream os) throws IOException {
File tmpFile = preProcessFile(in); //in gets there closed
Model modelJena = ModelFactory.createDefaultModel();
InputStream internalInputStream = new FileInputStream(inputProcessedFile);
// read the RDF/XML file
RDFDataMgr.read(modelJena, internalInputStream, Lang.RDFXML);
LOG.info("Read into Model finished " + sdf.format(Calendar.getInstance().getTime()));

try { //close quietly and delete the temp. input file
internalInputStream.close();
inputProcessedFile.delete();
} catch(Exception e) {}

in = new FileInputStream(tmpFile);
RDFDataMgr.read(modelJena, in, Lang.RDFXML);
RDFDataMgr.write(os, modelJena, Lang.JSONLD);
LOG.info("Conversion RDF to JSONLD finished " + sdf.format(Calendar.getInstance().getTime()));
LOG.info(" JSONLD file " + " is written successfully.");

LOG.info("BioPAX RDFXML to JSONLD finished");
try { //close, flush quietly
in.close();
os.close();
tmpFile.delete();
} catch(Exception e) {}
}


/*
* Convert inputstream in jsonld format to outputsream if owl/rdf format
* Convert jsonld back to rdf/xml
* if that jsonld was converted from rdf/xml (e.g. biopax) originally
*/
public void convertFromJsonld(InputStream in, OutputStream out) {

Model modelJena = ModelFactory.createDefaultModel();

if (in == null) {
throw new IllegalArgumentException("Input File: " + " not found");
}
if (out == null) {
throw new IllegalArgumentException("Output File: " + " not found");
}

// read the JSONLD file
Model modelJena = ModelFactory.createDefaultModel();
modelJena.read(in, null, "JSONLD");

RDFDataMgr.write(out, modelJena, Lang.RDFXML);
LOG.info(" RDF file " + " is written successfully.");

LOG.info("JSONLD to RDFXML finished");
}

/**
* Converts the BioPAX data (stream) to an equivalent temporary
* BioPAX RDF/XML file that contains absolute instead of (possibly)
* relative URIs for all the BioPAX elements, and returns that file.
* This is required due to a bug in the Jena library that results in '#' being inserted inside the URIs...
*
* @param in biopax input stream
* @return a temporary file
* @throws IOException if the temporary file cannot be created or written
*/
public File preProcessFile(InputStream in) throws IOException {

SimpleDateFormat sdf = new SimpleDateFormat("HH:mm:ss");
LOG.info("BIOPAX Conversion started "
+ sdf.format(Calendar.getInstance().getTime()));

if (in == null) {
throw new IllegalArgumentException("Input File: " + " is not found");
}

SimpleIOHandler simpleIO = new SimpleIOHandler(BioPAXLevel.L3);

// create a Paxtools Model from the BioPAX L3 RDF/XML input file (stream)
org.biopax.paxtools.model.Model model = simpleIO.convertFromOWL(in);
// - and the input stream 'in' gets closed inside the above method call

// set for the IO to output full URIs:

simpleIO.absoluteUris(true);

File fullUriBiopaxInput = File.createTempFile("paxtools", ".owl");

fullUriBiopaxInput.deleteOnExit(); // delete on JVM exits
FileOutputStream outputStream = new FileOutputStream(fullUriBiopaxInput);

// create a Paxtools Model from the BioPAX RDF/XML input stream
org.biopax.paxtools.model.Model model = simpleIO.convertFromOWL(in);//also closes the input stream
// model.setXmlBase("");
simpleIO.absoluteUris(true); //forces absolute URIs in the output!
File tmpf = File.createTempFile("paxtools", ".owl");
tmpf.deleteOnExit(); // delete on JVM exits
FileOutputStream outputStream = new FileOutputStream(tmpf);
// write to an output stream (back to RDF/XML)

simpleIO.convertToOWL((org.biopax.paxtools.model.Model) model, outputStream); // it closes the stream internally

LOG.info("BIOPAX Conversion finished " + sdf.format(Calendar.getInstance().getTime()));
return fullUriBiopaxInput;
simpleIO.convertToOWL(model, outputStream); //also closes the output stream
return tmpf;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -2,30 +2,46 @@

import org.junit.jupiter.api.Test;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.*;
import java.net.URI;

import org.junit.jupiter.api.Assertions;

public class JsonldBiopaxConverterTest {

@Test
public final void test() throws IOException {
final void test() throws IOException {
File jsonldTestFileName = File.createTempFile("test", ".jsonld");
File rdfTestFileName = File.createTempFile("test", ".rdf");

JsonldConverter intf = new JsonldBiopaxConverter();

JsonldConverter converter = new JsonldBiopaxConverter();
// convert owl test file in resource directory to jsonld format
InputStream in = getClass().getResourceAsStream("/PC2v5test-Signaling-By-BMP-Pathway-REACT_12034.2.owl");
intf.convertToJsonld(in, new FileOutputStream(jsonldTestFileName));
converter.convertToJsonld(in, new FileOutputStream(jsonldTestFileName));

// convert jsonld test file back to rdf format
InputStream inputLD = new FileInputStream(jsonldTestFileName);
OutputStream outRDF = new FileOutputStream(rdfTestFileName);
intf.convertFromJsonld(inputLD, outRDF);
converter.convertFromJsonld(inputLD, outRDF);
}

/**
 * Converts the {@code /pc14-test.owl} test resource to JSON-LD in memory and checks
 * which {@code @id} values appear in the resulting text. The first four checks only
 * record how {@link URI#create(String)} treats the URI forms used in that resource.
 *
 * @throws IOException if the conversion fails
 */
@Test
final void test2() throws IOException {
    JsonldConverter jsonldConverter = new JsonldBiopaxConverter();
    // The JSON-LD output is collected in memory so it can be inspected as text.
    ByteArrayOutputStream jsonldBytes = new ByteArrayOutputStream();
    // The BioPAX OWL sample is read from the test resources.
    InputStream owlResource = getClass().getResourceAsStream("/pc14-test.owl");
    jsonldConverter.convertToJsonld(owlResource, jsonldBytes);
    final String jsonld = jsonldBytes.toString("UTF-8");

    Assertions.assertAll(
        // A scheme with nothing after it is a bad URI.
        () -> Assertions.assertThrows(IllegalArgumentException.class, () -> URI.create("http://")),
        // Parses as a URI, but is not a good Linked Data identifier.
        () -> Assertions.assertDoesNotThrow(() -> URI.create("bioregistry.io/chebi:18367")),
        // A CURIE parses as a URI as well.
        () -> Assertions.assertDoesNotThrow(() -> URI.create("chebi:18367")),
        // A proper absolute URI.
        () -> Assertions.assertDoesNotThrow(() -> URI.create("http://bioregistry.io/chebi:18367")),
        // Identifiers that start with 'http://' are expected in the output; per the
        // original author's note, a scheme-less absolute URI would fail here because
        // of a Jena bug.
        () -> Assertions.assertTrue(jsonld.contains("@id\": \"http://bioregistry.io/chebi:18367")),
        () -> Assertions.assertTrue(jsonld.contains("@id\": \"http://bioregistry.io/mi:0361")),
        // The CURIE-style identifier is expected to come through unchanged.
        () -> Assertions.assertTrue(jsonld.contains("@id\": \"chebi:18367"))
    );
}

}
117 changes: 117 additions & 0 deletions json-converter/src/test/resources/pc14-test.owl
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF
xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
xmlns:owl="http://www.w3.org/2002/07/owl#"
xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
xmlns:bp="http://www.biopax.org/release/biopax-level3.owl#"
xml:base="pc14:">
<owl:Ontology rdf:about="">
<owl:imports rdf:resource="http://www.biopax.org/release/biopax-level3.owl#" />
</owl:Ontology>
<bp:UnificationXref rdf:about="mi:0360">
<bp:id rdf:datatype = "xsd:string">MI:0360</bp:id>
<bp:db rdf:datatype = "xsd:string">mi</bp:db>
</bp:UnificationXref>
<bp:RelationshipXref rdf:ID="RX_chebi_CHEBI_14791_secondary-ac">
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0360" />
<bp:id rdf:datatype = "xsd:string">CHEBI:14791</bp:id>
<bp:db rdf:datatype = "xsd:string">chebi</bp:db>
</bp:RelationshipXref>
<bp:RelationshipXref rdf:ID="RX_chebi_CHEBI_45024_secondary-ac">
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0360" />
<bp:id rdf:datatype = "xsd:string">CHEBI:45024</bp:id>
<bp:db rdf:datatype = "xsd:string">chebi</bp:db>
</bp:RelationshipXref>
<bp:CellularLocationVocabulary rdf:ID="LV_go_GO_0005634">
<bp:xref rdf:resource="go:0005634" />
<bp:term rdf:datatype = "xsd:string">Nucleus</bp:term>
<bp:term rdf:datatype = "xsd:string">NUCLEUS</bp:term>
<bp:term rdf:datatype = "xsd:string">nucleus</bp:term>
</bp:CellularLocationVocabulary>
<bp:RelationshipTypeVocabulary rdf:about="http://bioregistry.io/mi:0829">
<bp:xref rdf:resource="mi:0829" />
<bp:term rdf:datatype = "xsd:string">multiple parent reference</bp:term>
<bp:term rdf:datatype = "xsd:string">multiple parent</bp:term>
</bp:RelationshipTypeVocabulary>
<bp:RelationshipXref rdf:ID="RX_chebi_CHEBI_43474_see-also">
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0361" />
<bp:id rdf:datatype = "xsd:string">CHEBI:43474</bp:id>
<bp:db rdf:datatype = "xsd:string">chebi</bp:db>
</bp:RelationshipXref>
<bp:UnificationXref rdf:about="mi:0361">
<bp:id rdf:datatype = "xsd:string">0361</bp:id>
<bp:db rdf:datatype = "xsd:string">mi</bp:db>
</bp:UnificationXref>
<bp:BindingFeature rdf:about="pathbank:Compound/PW_C001104/ProteinModification/48">
<bp:intraMolecular rdf:datatype = "xsd:boolean">true</bp:intraMolecular>
</bp:BindingFeature>
<bp:RelationshipXref rdf:ID="RX_chebi_CHEBI_7793_secondary-ac">
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0360" />
<bp:id rdf:datatype = "xsd:string">CHEBI:7793</bp:id>
<bp:db rdf:datatype = "xsd:string">chebi</bp:db>
</bp:RelationshipXref>
<bp:UnificationXref rdf:about="go:0005634">
<bp:id rdf:datatype = "xsd:string">GO:0005634</bp:id>
<bp:db rdf:datatype = "xsd:string">go</bp:db>
</bp:UnificationXref>
<bp:RelationshipXref rdf:ID="RX_chebi_CHEBI_79387_multiple_parent_reference">
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0829" />
<bp:id rdf:datatype = "xsd:string">CHEBI:79387</bp:id>
<bp:db rdf:datatype = "xsd:string">chebi</bp:db>
</bp:RelationshipXref>
<bp:RelationshipTypeVocabulary rdf:about="http://bioregistry.io/mi:0360">
<bp:xref rdf:resource="mi:0360" />
<bp:term rdf:datatype = "xsd:string">secondary-ac</bp:term>
</bp:RelationshipTypeVocabulary>
<bp:UnificationXref rdf:about="mi:0829">
<bp:id rdf:datatype = "xsd:string">MI:0829</bp:id>
<bp:db rdf:datatype = "xsd:string">mi</bp:db>
</bp:UnificationXref>
<bp:UnificationXref rdf:about="chebi:18367">
<bp:id rdf:datatype = "xsd:string">CHEBI:18367</bp:id>
<bp:db rdf:datatype = "xsd:string">chebi</bp:db>
</bp:UnificationXref>
<bp:SmallMoleculeReference rdf:about="http://bioregistry.io/chebi:18367">
<bp:standardName rdf:datatype = "xsd:string">phosphate</bp:standardName>
<bp:xref rdf:resource="#RX_chebi_CHEBI_14791_secondary-ac" />
<bp:xref rdf:resource="#RX_chebi_CHEBI_45024_secondary-ac" />
<bp:xref rdf:resource="chebi:18367" />
<bp:xref rdf:resource="#RX_chebi_CHEBI_43474_see-also" />
<bp:xref rdf:resource="#RX_chebi_CHEBI_7793_secondary-ac" />
<bp:xref rdf:resource="#RX_chebi_CHEBI_35780_multiple_parent_reference" />
<bp:xref rdf:resource="#RX_chebi_CHEBI_79387_multiple_parent_reference" />
<bp:displayName rdf:datatype = "xsd:string">phosphate(3-)</bp:displayName>
<bp:name rdf:datatype = "xsd:string">[PO4](3-)</bp:name>
<bp:name rdf:datatype = "xsd:string">tetraoxophosphate(3-)</bp:name>
<bp:name rdf:datatype = "xsd:string">tetraoxophosphate(V)</bp:name>
<bp:name rdf:datatype = "xsd:string">Phosphate</bp:name>
<bp:name rdf:datatype = "xsd:string">Orthophosphate</bp:name>
<bp:name rdf:datatype = "xsd:string">PHOSPHATE ION</bp:name>
<bp:name rdf:datatype = "xsd:string">tetraoxidophosphate(3-)</bp:name>
<bp:name rdf:datatype = "xsd:string">PO4(3-)</bp:name>
<bp:comment rdf:datatype = "xsd:string">A phosphate ion that is the conjugate base of hydrogenphosphate.</bp:comment>
<bp:comment rdf:datatype = "xsd:string">is_conjugate_base_of 43474</bp:comment>
</bp:SmallMoleculeReference>
<bp:Provenance rdf:ID="pathbank">
<bp:standardName rdf:datatype = "xsd:string">Pathbank</bp:standardName>
<bp:displayName rdf:datatype = "xsd:string">Pathbank</bp:displayName>
<bp:comment rdf:datatype = "xsd:string">Source http://pathbank.org/downloads/pathbank_primary_biopax.zip type: BIOPAX, Pathbank 2.0 BioPAX data (primary pathways, human data only), 16-Aug-2019</bp:comment>
</bp:Provenance>
<bp:RelationshipTypeVocabulary rdf:about="http://bioregistry.io/mi:0361">
<bp:xref rdf:resource="mi:0361" />
<bp:term rdf:datatype = "xsd:string">see-also</bp:term>
<bp:term rdf:datatype = "xsd:string">additional information</bp:term>
</bp:RelationshipTypeVocabulary>
<bp:SmallMolecule rdf:about="pathbank:Compound/PW_C001104_Nucleus">
<bp:displayName rdf:datatype = "xsd:string">Phosphate</bp:displayName>
<bp:cellularLocation rdf:resource="#LV_go_GO_0005634" />
<bp:feature rdf:resource="pathbank:Compound/PW_C001104/ProteinModification/48" />
<bp:entityReference rdf:resource="http://bioregistry.io/chebi:18367" />
<bp:dataSource rdf:resource="#pathbank" />
</bp:SmallMolecule>
<bp:RelationshipXref rdf:ID="RX_chebi_CHEBI_35780_multiple_parent_reference">
<bp:relationshipType rdf:resource="http://bioregistry.io/mi:0829" />
<bp:id rdf:datatype = "xsd:string">CHEBI:35780</bp:id>
<bp:db rdf:datatype = "xsd:string">chebi</bp:db>
</bp:RelationshipXref>
</rdf:RDF>
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ public class Resolver {
private static Map<String,String> spellmap; // compressed name -> prefix (combines custom spellmap.json and registry)
private static Map<String,String> synonymap;// synonym -> prefix (build from custom synonymap.json and registry)

public static final String BIOREGISTRY_IO = "bioregistry.io/";
public static final String BIOREGISTRY_IO = "http://bioregistry.io/";
public static final String BIOREGISTRY_JSON_URL =
"https://raw.githubusercontent.com/biopragmatics/bioregistry/main/exports/registry/registry.json";

Expand Down Expand Up @@ -183,7 +183,7 @@ public static Namespace getNamespace(String key, boolean allowVariants) {
}

/**
* Builds a URI of the bioentity (e.g., "bioregistry.io/go:0045202")
* Builds a URI of the bioentity (e.g., "http://bioregistry.io/go:0045202")
* from the collection name/synonym and bio id.
*
* @param name - name, URI, or ID of a data collection (examples: "ChEBI", "go")
Expand Down
Loading

0 comments on commit 94483aa

Please sign in to comment.