From 92fb2a0500b2a29b78887bdc6a3e285b16caee96 Mon Sep 17 00:00:00 2001 From: ivan-aksamentov Date: Thu, 2 Jan 2025 21:49:20 +0100 Subject: [PATCH] feat: use json string to store additional metadata in fasta headers Proposal is to use stringified JSON to add any additional data to fasta headers as opposed to home-grown format. Advantages: - Easier to parse: the fasta header can be split on first space and then the second part can be parsed as json to extract all metadata, which is basically 2 lines of code in most languages. The home-grown format would either have to be parsed specifically or cannot be parsed reliably at all, especially if not documented. - Output contains field names, which might improve clarity if the meaning of values is not immediately obvious - New fields can be easily added and removed without any additional thinking about new delimiters, positions etc. - We could even define a schema and document the format to be super-strict and clear! (but probably too much for this particular case) - JSON is a compromise between human- and machine-readability Disadvantages: - Need to double check if fasta headers allow json characters - Output is longer than just ad-hoc values - Output contains field names, which might be excessive if the meaning is already clear - JSON is a compromise between human- and machine-readability Example outputs: Before: ``` >11571779012938514380 pCAV1344-40-1705098846223677255 [4961-9839|+] ``` After: ``` >11571779012938514380 {"path_name":"pCAV1344-40","block_id":1705098846223677255,"start":4961,"end":9839,"strand":"+"} ``` --- .../pangraph/src/pangraph/pangraph_block.rs | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/packages/pangraph/src/pangraph/pangraph_block.rs b/packages/pangraph/src/pangraph/pangraph_block.rs index 1e24ec70..a1442c6f 100644 --- a/packages/pangraph/src/pangraph/pangraph_block.rs +++ b/packages/pangraph/src/pangraph/pangraph_block.rs @@ -1,3 +1,4 @@ +use 
crate::io::json::{json_write_str, JsonPretty}; use crate::io::seq::reverse_complement; use crate::pangraph::edits::Edit; use crate::pangraph::pangraph::Pangraph; @@ -10,6 +11,7 @@ use getset::{CopyGetters, Getters}; use maplit::btreemap; use schemars::JsonSchema; use serde::{Deserialize, Serialize}; +use serde_json::json; use std::collections::{BTreeMap, BTreeSet}; use std::hash::Hash; @@ -154,7 +156,20 @@ impl PangraphBlock { let path_name = &graph.paths[&node.path_id()].name().as_ref().unwrap(); let (start, end) = node.position(); let strand = node.strand(); - format!("{node_id} {path_name}-{block_id} [{start}-{end}|{strand}]") + + let meta = json_write_str( + &json! ({ + "path_name": path_name, + "block_id": block_id, + "start": start, + "end": end, + "strand": strand, + }), + JsonPretty(false), + ) + .unwrap(); + + format!("{node_id} {meta}") } RecordNaming::Path => { let path_id = graph.nodes[node_id].path_id();