diff --git a/.gitignore b/.gitignore index 401baa4..2bff71c 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,3 @@ debug #Folders data/** *.ttl -*.json diff --git a/Cargo.lock b/Cargo.lock index 65646e9..66f0d84 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -216,6 +216,24 @@ version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "05ca71f324d19e85a2e976be04b5ecbb193253794a75adfe2e5044c8bef03f6a" +[[package]] +name = "converter" +version = "0.1.0" +dependencies = [ + "anyhow", + "catplus-common", + "clap", + "lazy_static", + "serde", + "serde_json", + "sophia", + "sophia_api", + "sophia_isomorphism", + "sophia_term", + "sophia_turtle", + "uuid", +] + [[package]] name = "cpufeatures" version = "0.2.17" @@ -1212,24 +1230,6 @@ dependencies = [ "unicode-ident", ] -[[package]] -name = "synth-converter" -version = "0.1.0" -dependencies = [ - "anyhow", - "catplus-common", - "clap", - "lazy_static", - "serde", - "serde_json", - "sophia", - "sophia_api", - "sophia_isomorphism", - "sophia_term", - "sophia_turtle", - "uuid", -] - [[package]] name = "thiserror" version = "1.0.69" diff --git a/Cargo.toml b/Cargo.toml index 2f561fd..693a61a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,7 +3,7 @@ resolver = "2" members = [ "src/catplus-common", - "src/synth-converter", + "src/converter", ] diff --git a/README.md b/README.md index 4111102..4a0ae29 100644 --- a/README.md +++ b/README.md @@ -2,51 +2,51 @@ ## About -This repository contains all the Zarr converters for the different data types in the Cat+ project (Agilent, UV, IR, etc.) -The data types are all in different formats, their data and metadata colluded together. The goal will be to convert the metadata to [an established ontology](https://github.com/sdsc-ordes/catplus-ontology/tree/main), and -as much as data format allow- convert the data in [Zarr array](https://zarr.readthedocs.io/en/stable/index.html). 
+This repository contains all the converters for the different data types in the Cat+ project (Agilent, UV, IR, etc.) +The data types are all in different formats, with their data and metadata mixed together. The goal will be to convert the metadata to [an established ontology](https://github.com/sdsc-ordes/catplus-ontology/tree/main), and provide the data in their original files. ## Tools -### synth-converter -The Synth-converter parses a json input into an rdf graph and serializes the graph to either turtle or jsonld. -It expects the input to conform to the cat+ ontology and the struct `synth-converter/src/batch.rs`. An example input file is provided in `example/1-Synth.json`. +### converter +The converter parses a json input into an rdf graph and serializes the graph to either turtle or jsonld. +It expects the input to conform to the cat+ ontology and the struct `src/catplus-common/src/models/types.rs`. Example input files are provided in the `examples` directory. #### Usage -The `synth-converter` has three parameters: +The `converter` has four arguments: +- input_type: currently `synth` (see `examples/1-Synth.json`) or `hci` (see `examples/0-HCI.json`) - inputfile: path to input file (relative to top level of the repo or absolute) - outputfile: path to output file (relative to top level of the repo or absolute) -- format: default is "ttl", the other option is jsonld +- format: rdf output format, currently `turtle` or `jsonld` -The `synth-converter` turns the inputfile into a rdf graph and serilizes it to either turtle or jsonld. The serialization is written to an outputfile. +The `converter` turns the inputfile into an rdf graph and serializes it to either turtle or jsonld. The serialization is written to the provided outputfile. 
+ +Examples ``` -just run example/1-Synth.json output.ttl -just run example/1-Synth.json output.json --format jsonld +just run synth examples/1-Synth.json examples/1-Synth.ttl turtle +just run hci examples/0-HCI.json examples/0-HCI.jsonld jsonld ``` ### Architecture -The json input is read with `serde_json`: the transformation of fields is described in the struct `synth-converter/src/batch.rs` - -The graph is build via `synth-converter/src/graph/graph_builder.rs` and uses `sophia_rs`. Besides `rdf` and `xsd` that have build in namespaces in `sophia_rs`, all namespaces and terms are provided in `synth-converter/src/graph/namespaces` as constants. This makes the code more readable and also ensures that the rdf iris and namespaces are controlled and spelt correctly. -Graph serializers and parsers are provided in `synth-converter/src/rdf`. The turtle serializer there is needed for the test. -The conversion is done in the public crate `synth-converter/src/convert.rs` +The json input is read with `serde_json`: the transformation into rdf is done by the `src/catplus-common` library. +It uses `sophia_rs`. The mapping is triggered by `src/catplus-common/src/models/types.rs` and makes use of the namespaces defined at `src/catplus-common/src/graph/namespaces`. ### Shacl Validation The rdf graph confirms to the cat+ ontology: https://github.com/sdsc-ordes/catplus-ontology. Currently rust offeres no Shacl Validation Library, but once such a library exists, it would make sense to add a Shacl Validation. -TheShacl Validation can be done manually here: https://www.itb.ec.europa.eu/shacl/any/upload +The Shacl Validation can be done manually here: https://www.itb.ec.europa.eu/shacl/any/upload ## Installation guidelines The repo is setup with nix. 
``` -git clone git@github.com:sdsc-ordes/catplus-zarr-converters.git -cd catplus-zarr-converters +git clone git@github.com:sdsc-ordes/catplus-converters.git +cd catplus-converters cargo build ``` @@ -57,17 +57,18 @@ The rust commands can be started via a justfile: ``` just --list Available recipes: - build *args # Build the synth-converter. - default # Default recipe to list all recipes. - nix-develop *args # Enter a Nix development shell. - run input_file output_file *args # Run the synth-converter. - test *args # Test the synth-converter. - fmt *arg # Format the synth-converter. + build *args # Build all crates + default # Default recipe to list all recipes. + format *args # Format all crates + fmt *args # alias for `format` + nix-develop *args # Enter a Nix development shell. + run input_type input_file output_file *args # Run the converter. + test *args # Test all crates ``` ### Tests -Run the tests with `just test`: only integration tests have been integrated that ensure that the serialized graph in turtle is isomorphic to an expected turtle serialization per valid substructure of the input data: this substructures are action that occur in the synthesis process. +Run the tests with `just test`: only integration tests have been implemented; they ensure that the serialized graph in turtle is isomorphic to an expected turtle serialization of the input data. 
### Contribute diff --git a/examples/0-HCI.json b/examples/0-HCI.json new file mode 100644 index 0000000..56bcd87 --- /dev/null +++ b/examples/0-HCI.json @@ -0,0 +1,98 @@ +{ + "hasCampaign": { + "campaignName": "Caffeine Synthesis", + "description": "1-step N-methylation of theobromine to caffeine", + "objective": "High caffeine yield at the end", + "campaignClass": "Standard Research", + "type": "optimization", + "reference": "Substitution reaction - SN2", + "hasBatch": { + "batchID": "23", + "batchName": "20240516", + "reactionType": "N-methylation", + "reactionName": "Caffeine synthesis", + "optimizationType": "Yield optimization", + "link": "https://www.sciencedirect.com/science/article/pii/S0187893X15720926" + }, + "hasObjective": { + "criteria": "Yield ≥ 90%", + "condition": "Reflux in acetone with methyl iodide and potassium carbonate", + "description": "Optimize reaction conditions to maximize caffeine yield from theobromine using methyl iodide", + "objectiveName": "Maximize caffeine formation" + }, + "hasChemical": [ + { + "chemicalID": "19", + "chemicalName": "Sodium methoxide", + "CASNumber": "124-41-4", + "molecularMass": { + "value": 54.024, + "unit": "g/mol" + }, + "smiles": "C[O-].[Na+]", + "swissCatNumber": "SwissCAT-10942334", + "keywords": "optional only in HCI file", + "Inchi": "InChI=1S/CH3O.Na/c1-2;/h1H3;/q-1;+1", + "molecularFormula": "CH3NaO", + "density": { + "value": 1.3, + "unit": "g/mL" + } + }, + { + "chemicalID": "36", + "chemicalName": "theobromine", + "CASNumber": "83-67-0", + "molecularMass": { + "value": 180.160, + "unit": "g/mol" + }, + "smiles": "CN1C=NC2=C1C(=O)NC(=O)N2C", + "swissCatNumber": "SwissCAT-5429", + "keywords": "optional only in HCI file", + "Inchi": "InChI=1S/C7H8N4O2/c1-10-3-8-5-4(10)6(12)9-7(13)11(5)2/h3H,1-2H3,(H,9,12,13)", + "molecularFormula": "C7H8N4O2", + "density": { + "value": 1.522, + "unit": "g/mL" + } + }, + { + "chemicalID": "25", + "chemicalName": "methyl iodide", + "CASNumber": "74-88-4", + 
"molecularMass": { + "value": 141.939, + "unit": "g/mol" + }, + "smiles": "CI", + "swissCatNumber": "SwissCAT-6328", + "keywords": "optional only in HCI file", + "Inchi": "InChI=1S/CH3I/c1-2/h1H3", + "molecularFormula": "CH3I", + "density": { + "value": 2.28, + "unit": "g/mL" + } + }, + { + "chemicalID": "79", + "chemicalName": "methanol", + "CASNumber": "67-56-1", + "molecularMass": { + "value": 32.042, + "unit": "g/mol" + }, + "smiles": "CO", + "swissCatNumber": "SwissCAT-887", + "keywords": "optional only in HCI file", + "Inchi": "InChI=1S/CH4O/c1-2/h2H,1H3", + "molecularFormula": "CH4O", + "density": { + "value": 0.79, + "unit": "g/mL" + } + } + ] + } +} \ No newline at end of file diff --git a/justfile b/justfile index 8f4b316..f03874b 100644 --- a/justfile +++ b/justfile @@ -1,3 +1,4 @@ +#!/usr/bin/env bash set positional-arguments set shell := ["bash", "-cue"] @@ -20,10 +21,10 @@ alias fmt := format format *args: cargo fmt {{args}} -# Run the synth-converter. -run input_file output_file *args: - cd "{{root_dir}}/src/synth-converter" && \ - cargo run --bin synth-converter "{{root_dir}}/{{input_file}}" "{{root_dir}}/{{output_file}}" {{args}} +# Run the converter. +run input_type input_file output_file *args: + cd "{{root_dir}}/src/converter" && \ + cargo run --bin converter "{{input_type}}" "{{root_dir}}/{{input_file}}" "{{root_dir}}/{{output_file}}" {{args}} # Enter a Nix development shell. nix-develop *args: diff --git a/src/catplus-common/src/graph/namespaces/allocom.rs b/src/catplus-common/src/graph/namespaces/allocom.rs new file mode 100644 index 0000000..c16af53 --- /dev/null +++ b/src/catplus-common/src/graph/namespaces/allocom.rs @@ -0,0 +1,10 @@ +use lazy_static::lazy_static; +use sophia::api::ns::Namespace; +use sophia_api::namespace; +namespace! { + "http://purl.allotrope.org/ontologies/common#", + AFC_0000090 +} +lazy_static! 
{ + pub static ref ns: Namespace<&'static str> = Namespace::new(PREFIX.as_str()).unwrap(); +} diff --git a/src/catplus-common/src/graph/namespaces/allohdf.rs b/src/catplus-common/src/graph/namespaces/allohdf.rs new file mode 100644 index 0000000..8749666 --- /dev/null +++ b/src/catplus-common/src/graph/namespaces/allohdf.rs @@ -0,0 +1,10 @@ +use lazy_static::lazy_static; +use sophia::api::ns::Namespace; +use sophia_api::namespace; +namespace! { + "http://purl.allotrope.org/ontologies/hdf5/1.8#", + HardLink +} +lazy_static! { + pub static ref ns: Namespace<&'static str> = Namespace::new(PREFIX.as_str()).unwrap(); +} diff --git a/src/catplus-common/src/graph/namespaces/allores.rs b/src/catplus-common/src/graph/namespaces/allores.rs index da06138..bf2a7f2 100644 --- a/src/catplus-common/src/graph/namespaces/allores.rs +++ b/src/catplus-common/src/graph/namespaces/allores.rs @@ -3,18 +3,19 @@ use sophia::api::ns::Namespace; use sophia_api::namespace; namespace! { "http://purl.allotrope.org/ontologies/result#", + AFR_0001606, + AFR_0001723, + AFR_0001952, + AFR_0002036, AFR_0002240, - AFR_0002296, - AFR_0002295, AFR_0002294, + AFR_0002295, + AFR_0002296, + AFR_0002423, AFR_0002464, + AFR_0002764, AFRE_0000001, - AFX_0000622, - AFR_0002423, - AFR_0001606, - AFR_0001723, - AFR_0001952, - AFR_0002036 + AFX_0000622 } lazy_static! { pub static ref ns: Namespace<&'static str> = Namespace::new(PREFIX.as_str()).unwrap(); diff --git a/src/catplus-common/src/graph/namespaces/cat.rs b/src/catplus-common/src/graph/namespaces/cat.rs index 1797476..ff1ef7c 100644 --- a/src/catplus-common/src/graph/namespaces/cat.rs +++ b/src/catplus-common/src/graph/namespaces/cat.rs @@ -6,34 +6,41 @@ namespace! 
{ AddAction, Batch, Campaign, - ContainerPositionAndQuantity, - Experiment, - FiltrateAction, - Observation, - Sample, - SetPressureAction, - SetTemperatureAction, - SetVacuumAction, - ShakeAction, - speedTumbleStirrerShape, + campaignClass, + campaignType, casNumber, chemicalName, containerBarcode, containerID, + ContainerPositionAndQuantity, + criteria, dispenseType, errorMargin, expectedDatum, + Experiment, + FiltrateAction, + genericObjective, hasBatch, hasCampaign, + hasChemical, hasContainerPositionAndQuantity, + hasObjective, hasSample, - hasChemical, internalBarCode, measuredQuantity, Objective, + Observation, + optimizationType, + reactionSubType, + reactionType, role, - setTemperatureAction, + Sample, + SetPressureAction, + SetTemperatureAction, + SetVacuumAction, + ShakeAction, speedInRPM, + speedTumbleStirrerShape, subEquipmentName, swissCatNumber, temperatureShakerShape, diff --git a/src/catplus-common/src/graph/namespaces/mod.rs b/src/catplus-common/src/graph/namespaces/mod.rs index 99e061d..bd5521c 100644 --- a/src/catplus-common/src/graph/namespaces/mod.rs +++ b/src/catplus-common/src/graph/namespaces/mod.rs @@ -1,3 +1,5 @@ +pub mod allocom; +pub mod allohdf; pub mod alloproc; pub mod alloqual; pub mod allores; diff --git a/src/catplus-common/src/graph/namespaces/obo.rs b/src/catplus-common/src/graph/namespaces/obo.rs index 6ebe990..5c7f7c8 100644 --- a/src/catplus-common/src/graph/namespaces/obo.rs +++ b/src/catplus-common/src/graph/namespaces/obo.rs @@ -4,6 +4,7 @@ use sophia_api::namespace; namespace! { "http://purl.obolibrary.org/obo/", CHEBI_25367, + IAO_0000005, PATO_0001019 } lazy_static! { diff --git a/src/catplus-common/src/graph/namespaces/qudt.rs b/src/catplus-common/src/graph/namespaces/qudt.rs index b6fd98d..7789934 100644 --- a/src/catplus-common/src/graph/namespaces/qudt.rs +++ b/src/catplus-common/src/graph/namespaces/qudt.rs @@ -3,9 +3,9 @@ use sophia::api::ns::Namespace; use sophia_api::namespace; namespace! 
{ "http://qudt.org/schema/qudt/", + quantity, unit, - value, - quantity + value } lazy_static! { pub static ref ns: Namespace<&'static str> = Namespace::new(PREFIX.as_str()).unwrap(); diff --git a/src/catplus-common/src/graph/namespaces/schema.rs b/src/catplus-common/src/graph/namespaces/schema.rs index aac82ce..dc81c87 100644 --- a/src/catplus-common/src/graph/namespaces/schema.rs +++ b/src/catplus-common/src/graph/namespaces/schema.rs @@ -3,8 +3,9 @@ use sophia::api::ns::Namespace; use sophia_api::namespace; namespace! { "https://schema.org/", - name, - keywords + description, + keywords, + name } lazy_static! { pub static ref ns: Namespace<&'static str> = Namespace::new(PREFIX.as_str()).unwrap(); diff --git a/src/catplus-common/src/graph/namespaces/unit.rs b/src/catplus-common/src/graph/namespaces/unit.rs index 86e06b6..6f03ee8 100644 --- a/src/catplus-common/src/graph/namespaces/unit.rs +++ b/src/catplus-common/src/graph/namespaces/unit.rs @@ -9,9 +9,9 @@ namespace! { "https://qudt.org/vocab/unit/", Bar, DegC, - MilliGM, GMPerMilliL, GMPerMol, + MilliGM, MolPerL, RevPerMin } diff --git a/src/catplus-common/src/graph/prefix_map.rs b/src/catplus-common/src/graph/prefix_map.rs index 2cbec19..d798083 100644 --- a/src/catplus-common/src/graph/prefix_map.rs +++ b/src/catplus-common/src/graph/prefix_map.rs @@ -1,4 +1,6 @@ -use crate::graph::namespaces::{alloproc, alloqual, allores, cat, obo, purl, qudt, schema, unit}; +use crate::graph::namespaces::{ + allocom, allohdf, alloproc, alloqual, allores, cat, obo, purl, qudt, schema, unit, +}; use sophia_api::{prefix::Prefix, prelude::Iri}; use lazy_static::lazy_static; @@ -11,26 +13,43 @@ lazy_static! { Namespace::new("http://www.w3.org/2001/XMLSchema#").unwrap(); } +macro_rules! ns_entries_direct { // For rdf and xsd + ($msg:expr, $($ns:ident),*) => { + vec![ + $( + (stringify!($ns), $ns.get("").expect(&$msg)), + )* + ] + }; +} + +macro_rules! 
ns_entries_module { // For the other modules + ($msg:expr, $($module:ident),*) => { + vec![ + $( + (stringify!($module), $module::ns.get("").expect(&$msg)), + )* + ] + }; +} + pub fn generate_prefix_map() -> Vec<(Prefix>, Iri>)> { - vec![ - ("rdf", rdf.get("").expect("Namespace URI should always be valid")), - ("cat", cat::ns.get("").expect("Namespace URI should always be valid")), - ("schema", schema::ns.get("").expect("Namespace URI should always be valid")), - ("unit", unit::ns.get("").expect("Namespace URI should always be valid")), - ("allores", allores::ns.get("").expect("Namespace URI should always be valid")), - ("alloproc", alloproc::ns.get("").expect("Namespace URI should always be valid")), - ("qudt", qudt::ns.get("").expect("Namespace URI should always be valid")), - ("alloqual", alloqual::ns.get("").expect("Namespace URI should always be valid")), - ("purl", purl::ns.get("").expect("Namespace URI should always be valid")), - ("obo", obo::ns.get("").expect("Namespace URI should always be valid")), - ("xsd", xsd.get("").expect("Namespace URI should always be valid")), - ] - .into_iter() - .map(|(prefix, iri)| { - ( - Prefix::new(prefix.to_string().into_boxed_str()).expect("Invalid prefix"), - Iri::new(iri.to_string().into_boxed_str()).expect("Invalid IRI"), + let msg = "Namespace URI should always be valid"; + ns_entries_direct!(msg, rdf, xsd) // Correct call for rdf and xsd + .into_iter() + .chain( + ns_entries_module!( + // Correct call for the other modules + msg, cat, schema, unit, allores, alloproc, allocom, allohdf, qudt, alloqual, purl, + obo + ) + .into_iter(), ) - }) - .collect() + .map(|(prefix, iri)| { + ( + Prefix::new(prefix.to_string().into_boxed_str()).expect("Invalid prefix"), + Iri::new(iri.to_string().into_boxed_str()).expect("Invalid IRI"), + ) + }) + .collect() } diff --git a/src/catplus-common/src/models/types.rs b/src/catplus-common/src/models/types.rs index 34c8abb..fa6deb5 100644 --- a/src/catplus-common/src/models/types.rs +++ 
b/src/catplus-common/src/models/types.rs @@ -5,7 +5,7 @@ use crate::{ graph::{ insert_into::{InsertIntoGraph, Link}, - namespaces::{alloproc, alloqual, allores, cat, obo, purl, qudt, schema}, + namespaces::{alloproc, allocom, allohdf, alloqual, allores, cat, obo, purl, qudt, schema}, }, models::enums::{ActionName, Unit}, }; @@ -20,27 +20,109 @@ use sophia_api::{ term::{SimpleTerm, Term}, }; +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Campaign { + pub campaign_name: String, + pub description: String, + #[serde(rename = "objective")] + pub generic_objective: String, + pub campaign_class: String, + #[serde(rename = "type")] + pub campaign_type: String, + pub reference: String, + pub has_objective: Option, + pub has_batch: Batch, + pub has_chemical: Option>, +} + +impl InsertIntoGraph for Campaign { + fn insert_into(&self, graph: &mut LightGraph, iri: SimpleTerm) -> anyhow::Result<()> { + for (pred, value) in [ + (rdf::type_, &cat::Campaign.as_simple() as &dyn InsertIntoGraph), + (schema::name, &self.campaign_name.as_simple()), + (schema::description, &self.description.as_simple()), + (cat::genericObjective, &self.generic_objective.as_simple()), + (cat::campaignClass, &self.campaign_class.as_simple()), + (cat::campaignType, &self.campaign_type.as_simple()), + (allores::AFR_0002764, &self.reference.as_simple()), + (cat::hasObjective, &self.has_objective), + (cat::hasBatch, &self.has_batch), + (cat::hasChemical, &self.has_chemical), + ] { + value.attach_into( + graph, + Link { source_iri: iri.clone(), pred: pred.as_simple(), target_iri: None }, + )?; + } + Ok(()) + } +} + +#[derive(Clone, Debug, Serialize, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct Objective { + pub criteria: String, + pub condition: String, + pub description: String, + pub objective_name: String, +} + +impl InsertIntoGraph for Objective { + fn insert_into(&self, graph: &mut LightGraph, iri: SimpleTerm) -> anyhow::Result<()> { + for 
(pred, value) in [ + (rdf::type_, &obo::IAO_0000005.as_simple()), + (schema::name, &self.objective_name.as_simple()), + (schema::description, &self.description.as_simple()), + (cat::criteria, &self.criteria.as_simple()), + (allocom::AFC_0000090, &self.condition.as_simple()), + ] { + value.attach_into( + graph, + Link { source_iri: iri.clone(), pred: pred.as_simple(), target_iri: None }, + )?; + } + Ok(()) + } +} + +#[derive(Deserialize)] +pub struct CampaignWrapper { + #[serde(rename = "hasCampaign")] + pub has_campaign: Campaign, +} +impl InsertIntoGraph for CampaignWrapper { + fn insert_into(&self, graph: &mut LightGraph, iri: SimpleTerm) -> anyhow::Result<()> { + self.has_campaign.insert_into(graph, iri) + } +} + #[derive(Clone, Debug, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] pub struct Batch { #[serde(rename = "batchID")] pub batch_id: String, #[serde(rename = "Actions")] - pub actions: Vec, + pub actions: Option>, pub batch_name: Option, - #[serde(rename = "ReactionType")] pub reaction_type: Option, - #[serde(rename = "OptimizationType")] + pub reaction_name: Option, pub optimization_type: Option, - #[serde(rename = "Link")] pub link: Option, } impl InsertIntoGraph for Batch { fn insert_into(&self, graph: &mut LightGraph, iri: SimpleTerm) -> anyhow::Result<()> { - for (pred, value) in - [(rdf::type_, &cat::Batch.as_simple()), (schema::name, &self.batch_id.as_simple())] - { + for (pred, value) in [ + (rdf::type_, &cat::Batch.as_simple() as &dyn InsertIntoGraph), + (schema::name, &self.batch_id.as_simple()), + (allohdf::HardLink, &self.link.as_ref().clone().map(|s| s.as_simple())), + (cat::reactionType, &self.reaction_type.as_ref().clone().map(|s| s.as_simple())), + ( + cat::optimizationType, + &self.optimization_type.as_ref().clone().map(|s| s.as_simple()), + ), + ] { value.attach_into( graph, Link { source_iri: iri.clone(), pred: pred.as_simple(), target_iri: None }, @@ -48,10 +130,12 @@ impl InsertIntoGraph for Batch { } // NOTE: for 
actions, the direction is reversed (action hasbatch batch) - for action in &self.actions { - let action_uri = action.get_uri(); - graph.insert(&action_uri, cat::hasBatch.as_simple(), iri.clone())?; - action.insert_into(graph, action_uri)?; + if let Some(actions) = &self.actions { + for action in actions { + let action_uri = action.get_uri(); + graph.insert(&action_uri, cat::hasBatch.as_simple(), iri.clone())?; + action.insert_into(graph, action_uri)?; + } } Ok(()) diff --git a/src/synth-converter/Cargo.lock b/src/converter/Cargo.lock similarity index 100% rename from src/synth-converter/Cargo.lock rename to src/converter/Cargo.lock diff --git a/src/synth-converter/Cargo.toml b/src/converter/Cargo.toml similarity index 91% rename from src/synth-converter/Cargo.toml rename to src/converter/Cargo.toml index b8160e7..ad6601f 100644 --- a/src/synth-converter/Cargo.toml +++ b/src/converter/Cargo.toml @@ -1,5 +1,5 @@ [package] -name = "synth-converter" +name = "converter" version = "0.1.0" edition = "2021" @@ -7,7 +7,7 @@ edition = "2021" path = "src/lib.rs" [[bin]] -name = "synth-converter" +name = "converter" path = "src/main.rs" [dependencies] diff --git a/src/converter/src/convert.rs b/src/converter/src/convert.rs new file mode 100644 index 0000000..57dbd99 --- /dev/null +++ b/src/converter/src/convert.rs @@ -0,0 +1,49 @@ +use anyhow::{Context, Result}; +use catplus_common::graph::{graph_builder::GraphBuilder, insert_into::InsertIntoGraph}; +use serde::{de::DeserializeOwned, Deserialize}; + +// Derive Deserialize and ValueEnum +#[derive(Deserialize, Debug, clap::ValueEnum, Clone)] +pub enum RdfFormat { + Turtle, + Jsonld, +} + +/// Parses JSON and serializes the RDF graph to the specified format. +/// +/// This function can handle any struct that implements `serde::DeserializeOwned` and your `InsertIntoGraph` trait. +/// +/// # Arguments +/// - `input_content`: The JSON input as a string. +/// - `format`: The desired serialization format. 
+/// +/// # Returns +/// A `Result` containing the serialized graph as a string or an error. +pub fn json_to_rdf(input_content: &str, format: &RdfFormat) -> Result +where + T: DeserializeOwned + InsertIntoGraph, // Trait bounds +{ + let data: T = parse_json(input_content).context("Failed to parse JSON input")?; + + let mut graph_builder = GraphBuilder::new(); + graph_builder.insert(&data).context("Failed to build RDF graph")?; + + let serialized_graph = match format { + RdfFormat::Jsonld => { + graph_builder.serialize_to_jsonld().context("Failed to serialize to JSON-LD")? + } + RdfFormat::Turtle => { + graph_builder.serialize_to_turtle().context("Failed to serialize to Turtle")? + } + }; + + Ok(serialized_graph) +} + +/// Parses a JSON string into a struct of type T. +fn parse_json(json_data: &str) -> Result +where + T: DeserializeOwned, // Trait bound +{ + serde_json::from_str(json_data).map_err(|e| anyhow::Error::new(e)) +} diff --git a/src/synth-converter/src/lib.rs b/src/converter/src/lib.rs similarity index 100% rename from src/synth-converter/src/lib.rs rename to src/converter/src/lib.rs diff --git a/src/synth-converter/src/main.rs b/src/converter/src/main.rs similarity index 58% rename from src/synth-converter/src/main.rs rename to src/converter/src/main.rs index b370fcc..f5930c4 100644 --- a/src/synth-converter/src/main.rs +++ b/src/converter/src/main.rs @@ -1,28 +1,41 @@ use anyhow::{Context, Result}; +use catplus_common::models::types::{Batch, CampaignWrapper}; use clap::Parser; +use converter::convert::{json_to_rdf, RdfFormat}; +use serde::Deserialize; use std::{ fs::File, io::{Read, Write}, path::Path, }; -use synth_converter::convert::json_to_rdf; -/// Converts CAT+ Synthesis JSON input into RDF formats. +// Derive Deserialize and ValueEnum +#[derive(Deserialize, Debug, clap::ValueEnum, Clone)] +enum InputType { + Synth, + HCI, +} + +/// Converts CAT+ JSON input into RDF formats. 
/// -/// This tool expects Synthesis data similar to example/1-Synth.json -/// of a batch with actions. This data is then transformed to RDF and +/// This tool expects data similar to examples/1-Synth.json or examples/0-HCI.json +/// This data is then transformed to RDF and /// serialized as Turtle (ttl) or JSON-LD (jsonld). #[derive(Parser, Debug)] struct Args { - /// Path to the input JSON file: relative or absolute. + /// Type of input data: "Synth" or "HCI". + #[arg(value_enum)] + input_type: InputType, + + /// Path to the input JSON file. input_file: String, /// Path to the output RDF file. output_file: String, - /// Output format: "ttl" (Turtle) or "jsonld" (JSON-LD) - #[arg(short, long, default_value = "ttl")] - format: String, + /// Type of input data: "Turtle" or "Jsonld". + #[arg(value_enum)] + format: RdfFormat, } fn main() -> Result<()> { @@ -44,9 +57,12 @@ fn main() -> Result<()> { .read_to_string(&mut input_content) .with_context(|| format!("Failed to read input file '{}'", args.input_file))?; - // Use unified conversion function - let serialized_graph = json_to_rdf(&input_content, &args.format) - .with_context(|| format!("Failed to convert JSON to RDF format '{}'", args.format))?; + // Unified conversion function with type selection + let serialized_graph = match args.input_type { + InputType::Synth => json_to_rdf::(&input_content, &args.format), + InputType::HCI => json_to_rdf::(&input_content, &args.format), + } + .with_context(|| format!("Failed to convert JSON to RDF format '{:?}'", &args.format))?; println!("Conversion successful!"); diff --git a/src/synth-converter/tests/convert_tests.rs b/src/converter/tests/convert_tests.rs similarity index 68% rename from src/synth-converter/tests/convert_tests.rs rename to src/converter/tests/convert_tests.rs index f81102e..0d7e699 100644 --- a/src/synth-converter/tests/convert_tests.rs +++ b/src/converter/tests/convert_tests.rs @@ -1,9 +1,13 @@ -use catplus_common::rdf::rdf_parser::parse_turtle_to_graph; 
+use catplus_common::{ + models::types::{Batch, CampaignWrapper}, + rdf::rdf_parser::parse_turtle_to_graph, +}; +use converter::convert::{json_to_rdf, RdfFormat}; use sophia_isomorphism::isomorphic_graphs; -use synth_converter::convert::json_to_rdf; #[test] fn test_convert_filtrate_action() { + let output_format = RdfFormat::Turtle; let json_data = r#" { "batchID": "23", @@ -21,7 +25,7 @@ fn test_convert_filtrate_action() { ] } "#; - let result = json_to_rdf(json_data, "turtle"); + let result = json_to_rdf::(json_data, &output_format); let expected_ttl = r#" PREFIX rdf: PREFIX cat: @@ -54,6 +58,7 @@ fn test_convert_filtrate_action() { #[test] fn test_convert_pressure_action() { + let output_format = RdfFormat::Turtle; let json_data = r#" { "batchID": "23", @@ -79,7 +84,7 @@ fn test_convert_pressure_action() { ] } "#; - let result = json_to_rdf(json_data, "turtle"); + let result = json_to_rdf::(json_data, &output_format); let expected_ttl = r#" PREFIX rdf: PREFIX cat: @@ -119,6 +124,7 @@ fn test_convert_pressure_action() { #[test] fn test_convert_set_temperature_action() { + let output_format = RdfFormat::Turtle; let json_data = r#" { "batchID": "23", @@ -160,7 +166,7 @@ fn test_convert_set_temperature_action() { ] } "#; - let result = json_to_rdf(json_data, "turtle"); + let result = json_to_rdf::(json_data, &output_format); let expected_ttl = r#" PREFIX rdf: PREFIX cat: @@ -212,6 +218,7 @@ fn test_convert_set_temperature_action() { #[test] fn test_convert_add_action() { + let output_format = RdfFormat::Turtle; let json_data = r#" { "batchID": "23", @@ -310,7 +317,7 @@ fn test_convert_add_action() { ] } "#; - let result = json_to_rdf(json_data, "turtle"); + let result = json_to_rdf::(json_data, &output_format); let expected_ttl = r#" PREFIX rdf: PREFIX cat: @@ -404,6 +411,7 @@ fn test_convert_add_action() { #[test] fn test_convert_shake_action() { + let output_format = RdfFormat::Turtle; let json_data = r#" { "batchID": "23", @@ -445,7 +453,7 @@ fn 
test_convert_shake_action() { ] } "#; - let result = json_to_rdf(json_data, "turtle"); + let result = json_to_rdf::(json_data, &output_format); let expected_ttl = r#" PREFIX rdf: PREFIX cat: @@ -497,6 +505,7 @@ fn test_convert_shake_action() { #[test] fn test_convert_set_vacuum_action() { + let output_format = RdfFormat::Turtle; let json_data = r#" { "batchID": "23", @@ -522,7 +531,7 @@ fn test_convert_set_vacuum_action() { ] } "#; - let result = json_to_rdf(json_data, "turtle"); + let result = json_to_rdf::(json_data, &output_format); let expected_ttl = r#" PREFIX rdf: PREFIX cat: @@ -551,3 +560,207 @@ fn test_convert_set_vacuum_action() { let graphs_match = isomorphic_graphs(&result_graph, &expected_graph); assert_eq!(graphs_match.unwrap(), true); } + +#[test] +fn test_convert_campaign() { + let output_format = RdfFormat::Turtle; + let json_data = r#" + { + "hasCampaign": { + "campaignName": "Caffeine Synthesis", + "description": "1-step N-methylation of theobromine to caffeine", + "objective": "High caffeine yield at the end", + "campaignClass": "Standard Research", + "type": "optimization", + "reference": "Substitution reaction - SN2", + "hasBatch": { + "batchID": "23", + "batchName": "20240516", + "reactionType": "N-methylation", + "reactionName": "Caffeine synthesis", + "optimizationType": "Yield optimization", + "link": "https://www.sciencedirect.com/science/article/pii/S0187893X15720926" + }, + "hasObjective": { + "criteria": "Yield ≥ 90%", + "condition": "Reflux in acetone with methyl iodide and potassium carbonate", + "description": "Optimize reaction conditions to maximize caffeine yield from theobromine using methyl iodide", + "objectiveName": "Maximize caffeine formation" + }, + "hasChemical": [ + { + "chemicalID": "19", + "chemicalName": "Sodium methoxide", + "CASNumber": "124-41-4", + "molecularMass": { + "value": 54.024, + "unit": "g/mol" + }, + "smiles": "C[O-].[Na+]", + "swissCatNumber": "SwissCAT-10942334", + "keywords": "optional only in HCI 
file", + "Inchi": "InChI=1S/CH3O.Na/c1-2;/h1H3;/q-1;+1", + "molecularFormula": "CH3NaO", + "density": { + "value": 1.3, + "unit": "g/mL" + } + }, + { + "chemicalID": "36", + "chemicalName": "theobromine", + "CASNumber": "83-67-0", + "molecularMass": { + "value": 180.160, + "unit": "g/mol" + }, + "smiles": "CN1C=NC2=C1C(=O)NC(=O)N2C", + "swissCatNumber": "SwissCAT-5429", + "keywords": "optional only in HCI file", + "Inchi": "InChI=1S/C7H8N4O2/c1-10-3-8-5-4(10)6(12)9-7(13)11(5)2/h3H,1-2H3,(H,9,12,13)", + "molecularFormula": "C7H8N4O2", + "density": { + "value": 1.522, + "unit": "g/mL" + } + }, + { + "chemicalID": "25", + "chemicalName": "methyl iodide", + "CASNumber": "74-88-4", + "molecularMass": { + "value": 141.939, + "unit": "g/mol" + }, + "smiles": "CI", + "swissCatNumber": "SwissCAT-6328", + "keywords": "optional only in HCI file", + "Inchi": "InChI=1S/CH3I/c1-2/h1H3", + "molecularFormula": "CH3I", + "density": { + "value": 2.28, + "unit": "g/mL" + } + }, + { + "chemicalID": "79", + "chemicalName": "methanol", + "CASNumber": "67-56-1", + "molecularMass": { + "value": 32.042, + "unit": "g/mol" + }, + "smiles": "CO", + "swissCatNumber": "SwissCAT-887", + "keywords": "optional only in HCI file", + "Inchi": "InChI=1S/CH4O/c1-2/h2H,1H3", + "molecularFormula": "CH4O", + "density": { + "value": 0.79, + "unit": "g/mL" + } + } + ] + } + } + "#; + let result = json_to_rdf::(json_data, &output_format); + let expected_ttl = r#" + PREFIX rdf: + PREFIX cat: + PREFIX schema: + PREFIX unit: + PREFIX allores: + PREFIX alloproc: + PREFIX allocom: + PREFIX allohdf: + PREFIX qudt: + PREFIX alloqual: + PREFIX purl: + PREFIX obo: + PREFIX xsd: + + [] a cat:Campaign; + cat:campaignClass "Standard Research"; + cat:campaignType "optimization"; + cat:genericObjective "High caffeine yield at the end"; + cat:hasBatch [ a cat:Batch; + cat:optimizationType "Yield optimization"; + cat:reactionType "N-methylation"; + allohdf:HardLink 
"https://www.sciencedirect.com/science/article/pii/S0187893X15720926"; + schema:name "23"]; + cat:hasChemical [ a obo:CHEBI_25367; + cat:casNumber "124-41-4"; + cat:chemicalName "Sodium methoxide"; + cat:swissCatNumber "SwissCAT-10942334"; + purl:identifier "19"; + allores:AFR_0001952 "CH3NaO"; + allores:AFR_0002294 [ a cat:Observation; + qudt:unit unit:GM-PER-MOL; + qudt:value "54.024"^^xsd:double]; + allores:AFR_0002295 "C[O-].[Na+]"; + allores:AFR_0002296 "InChI=1S/CH3O.Na/c1-2;/h1H3;/q-1;+1"; + obo:PATO_0001019 [ a cat:Observation; + qudt:unit unit:GM-PER-MilliL; + qudt:value "1.3"^^xsd:double]; + schema:keywords "optional only in HCI file"], + [ a obo:CHEBI_25367; + cat:casNumber "83-67-0"; + cat:chemicalName "theobromine"; + cat:swissCatNumber "SwissCAT-5429"; + purl:identifier "36"; + allores:AFR_0001952 "C7H8N4O2"; + allores:AFR_0002294 [ a cat:Observation; + qudt:unit unit:GM-PER-MOL; + qudt:value "180.16"^^xsd:double]; + allores:AFR_0002295 "CN1C=NC2=C1C(=O)NC(=O)N2C"; + allores:AFR_0002296 "InChI=1S/C7H8N4O2/c1-10-3-8-5-4(10)6(12)9-7(13)11(5)2/h3H,1-2H3,(H,9,12,13)"; + obo:PATO_0001019 [ a cat:Observation; + qudt:unit unit:GM-PER-MilliL; + qudt:value "1.522"^^xsd:double]; + schema:keywords "optional only in HCI file"], + [ a obo:CHEBI_25367; + cat:casNumber "74-88-4"; + cat:chemicalName "methyl iodide"; + cat:swissCatNumber "SwissCAT-6328"; + purl:identifier "25"; + allores:AFR_0001952 "CH3I"; + allores:AFR_0002294 [ a cat:Observation; + qudt:unit unit:GM-PER-MOL; + qudt:value "141.939"^^xsd:double]; + allores:AFR_0002295 "CI"; + allores:AFR_0002296 "InChI=1S/CH3I/c1-2/h1H3"; + obo:PATO_0001019 [ a cat:Observation; + qudt:unit unit:GM-PER-MilliL; + qudt:value "2.28"^^xsd:double]; + schema:keywords "optional only in HCI file"], + [ a obo:CHEBI_25367; + cat:casNumber "67-56-1"; + cat:chemicalName "methanol"; + cat:swissCatNumber "SwissCAT-887"; + purl:identifier "79"; + allores:AFR_0001952 "CH4O"; + allores:AFR_0002294 [ a cat:Observation; + qudt:unit 
unit:GM-PER-MOL; + qudt:value "32.042"^^xsd:double]; + allores:AFR_0002295 "CO"; + allores:AFR_0002296 "InChI=1S/CH4O/c1-2/h2H,1H3"; + obo:PATO_0001019 [ a cat:Observation; + qudt:unit unit:GM-PER-MilliL; + qudt:value "0.79"^^xsd:double]; + schema:keywords "optional only in HCI file"]; + cat:hasObjective [ a obo:IAO_0000005; + cat:criteria "Yield ≥ 90%"; + allocom:AFC_0000090 "Reflux in acetone with methyl iodide and potassium carbonate"; + schema:description "Optimize reaction conditions to maximize caffeine yield from theobromine using methyl iodide"; + schema:name "Maximize caffeine formation"]; + allores:AFR_0002764 "Substitution reaction - SN2"; + schema:description "1-step N-methylation of theobromine to caffeine"; + schema:name "Caffeine Synthesis". + "#; + let expected_graph = parse_turtle_to_graph(&expected_ttl).unwrap(); + let result_ttl = result.as_ref().unwrap().as_str(); + let result_graph = parse_turtle_to_graph(&result_ttl).unwrap(); + let graphs_match = isomorphic_graphs(&result_graph, &expected_graph); + assert_eq!(graphs_match.unwrap(), true); +} diff --git a/src/synth-converter/src/convert.rs b/src/synth-converter/src/convert.rs deleted file mode 100644 index 56d24f4..0000000 --- a/src/synth-converter/src/convert.rs +++ /dev/null @@ -1,46 +0,0 @@ -use anyhow::{Context, Result}; -use catplus_common::{graph::graph_builder::GraphBuilder, models::types::Batch}; - -/// Parse JSON and serialize the RDF graph to the specified format -/// -/// The input JSON should conform to the structure defined in the `Batch` struct. -/// An example input file, `1-Synth.json`, is available in the `example` directory. -/// -/// # Arguments -/// - `input_content`: The JSON input as a string. -/// - `fmt`: The desired serialization format ("turtle" or "jsonld"). -/// If unspecified or empty, defaults to "turtle". -/// -/// # Returns -/// A `Result` containing the serialized graph as a string or an error if the process fails. 
-pub fn json_to_rdf(input_content: &str, fmt: &str) -> Result { - // Parse JSON into a Batch object - let batch = parse_json(input_content).context("Failed to parse JSON input")?; - - // Build the RDF graph - let mut graph_builder = GraphBuilder::new(); - graph_builder.insert(&batch).context("Failed to build RDF graph")?; - - // Serialize the RDF graph to the specified format - let serialized_graph = match fmt { - "jsonld" => graph_builder - .serialize_to_jsonld() - .context("Failed to serialize RDF graph to JSON-LD")?, - _ => graph_builder - .serialize_to_turtle() - .context("Failed to serialize RDF graph to Turtle")?, - }; - - Ok(serialized_graph) -} - -/// Parses a JSON string into a `Batch` struct -/// -/// # Arguments -/// - `json_data`: The JSON data as a string. -/// -/// # Returns -/// A `Result` containing the parsed `Batch` struct or an error. -fn parse_json(json_data: &str) -> Result { - serde_json::from_str(json_data).map_err(|e| anyhow::Error::new(e)) -}