From 7ab810eae25dbe7413aaacebb25bcfccf6ee0986 Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:06:15 -0800 Subject: [PATCH 1/8] Make exon numbering more flexible Validate exon order by base-pair position instead of number attribute. --- src/transcriptome.cpp | 111 ++++++++++-------------------------------- 1 file changed, 26 insertions(+), 85 deletions(-) diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index 7fb7362360..cf3fd580a4 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -582,11 +582,8 @@ int32_t Transcriptome::parse_transcripts(vector * transcripts, uint3 string feature; string pos; string strand; - string attributes; string attribute; - bool zero_based_exon_number = false; - while (transcript_stream->good()) { line_number += 1; @@ -650,58 +647,8 @@ int32_t Transcriptome::parse_transcripts(vector * transcripts, uint3 transcript_line_ss.ignore(numeric_limits::max(), '\t'); string transcript_id = ""; - int32_t exon_number = -1; - - while (getline(transcript_line_ss, attribute, ';')) { - - if (attribute.empty()) { - - break; - } - - // Parse transcript ID. - if (transcript_id.empty()) { - - transcript_id = parse_attribute_value(attribute, transcript_tag); - } - - // Parse exon number. - if (exon_number < 0) { - - auto exon_number_str = parse_attribute_value(attribute, "exon_number"); - - if (exon_number_str.empty()) { - - // If not exon_number attribute try ID. - auto exon_id = parse_attribute_value(attribute, "ID"); - - if (count(exon_id.begin(), exon_id.end(), ':') == 2) { - - auto exon_id_ss = stringstream(exon_id); - - string element; - getline(exon_id_ss, element, ':'); - - if (element == "exon") { - - getline(exon_id_ss, element, ':'); - getline(exon_id_ss, element); - - exon_number = stoi(element); - } - } - - } else { - - exon_number = stoi(exon_number_str); - } - } - - if (!transcript_id.empty() && exon_number >= 0) { - - break; - } - } + getline(transcript_line_ss, attribute, ';'); + transcript_id = parse_attribute_value(attribute, transcript_tag); if (transcript_id.empty()) { @@ -728,25 +675,18 @@ int32_t Transcriptome::parse_transcripts(vector * transcripts, uint3 // Add exon to current transcript. add_exon(transcript, make_pair(spos, epos), graph_path_pos_overlay); } + } - // Check if exons are in correct order in file. - if (exon_number >= 0) { + for (auto & transcript: parsed_transcripts) { + // Reorder reversed order exons. + reorder_exons(&(transcript.second)); - // If first transcript and exon, set whether exon numbering is zero-based. - if (parsed_transcripts.size() == 1 && transcript->exons.size() == 1) { + // Exclude transcripts with exons in incorrect order according to bp. + if (has_incorrect_order_exons(transcript.second.exons)) { - zero_based_exon_number = (exon_number == 0) ? true : false; - } - if (transcript->exons.size() - static_cast(zero_based_exon_number) != exon_number) { - - // Exclude transcripts with exons in incorrect order according to attributes. - excluded_transcripts.emplace(transcript_id); - } + excluded_transcripts.emplace(transcript.first); } - } - - for (auto & transcript: parsed_transcripts) { // Exclude transcripts with overlapping exons. if (has_overlapping_exons(transcript.second.exons)) { @@ -759,13 +699,11 @@ int32_t Transcriptome::parse_transcripts(vector * transcripts, uint3 transcripts->reserve(transcripts->size() + parsed_transcripts.size() - excluded_transcripts.size()); + // Populate transcripts with parsed_transcripts not in excluded_transcripts. for (auto & transcript: parsed_transcripts) { if (excluded_transcripts.find(transcript.first) == excluded_transcripts.end()) { - // Reorder reversed order exons. - reorder_exons(&(transcript.second)); - transcripts->emplace_back(move(transcript.second)); } } @@ -884,7 +822,7 @@ void Transcriptome::reorder_exons(Transcript * transcript) const { if (transcript->is_reverse) { - // Is exons in reverse order. + // Are exons in reverse order? bool is_reverse_order = true; for (size_t i = 1; i < transcript->exons.size(); i++) { @@ -905,21 +843,24 @@ void Transcriptome::reorder_exons(Transcript * transcript) const { bool Transcriptome::has_overlapping_exons(const vector & exons) const { for (size_t i = 1; i < exons.size(); ++i) { + // Assumes that exons are in increasing coordinate order. + if (exons.at(i - 1).coordinates.second >= exons.at(i).coordinates.first) { + + return true; + } + } - // Is exons in reverse order. - if (exons.at(i - 1).coordinates.first <= exons.at(i).coordinates.first) { - - if (exons.at(i - 1).coordinates.second >= exons.at(i).coordinates.first) { - - return true; - } - - } else { + return false; +} - if (exons.at(i).coordinates.second >= exons.at(i - 1).coordinates.first) { +bool Transcriptome::has_incorrect_order_exons(const vector & exons) const { - return true; - } + for (size_t i = 1; i < exons.size(); ++i) { + // Assumes that exons are in increasing coordinate order. + if (exons.at(i - 1).coordinates.first > exons.at(i).coordinates.first + || exons.at(i - 1).coordinates.second > exons.at(i).coordinates.second) { + + return true; } } From 38095a5b1c9e24395c15c71e3e687af43d8b934c Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 14:07:51 -0800 Subject: [PATCH 2/8] new function for order validation --- src/transcriptome.hpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp index 42214df12c..fb8dafd438 100644 --- a/src/transcriptome.hpp +++ b/src/transcriptome.hpp @@ -295,6 +295,9 @@ class Transcriptome { /// are ordered in reverse. void reorder_exons(Transcript * transcript) const; + /// Checks whether any adjacent exons are out of (strictly increasing) order + bool has_incorrect_order_exons(const vector & exons) const; + /// Checks whether any adjacent exons overlap. bool has_overlapping_exons(const vector & exons) const; From e07d9b74b368cbd8f41dec6ef1f1a6d0c1da5d6b Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:04:58 -0800 Subject: [PATCH 3/8] add assumption --- src/transcriptome.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transcriptome.hpp b/src/transcriptome.hpp index fb8dafd438..8b3ae1aff8 100644 --- a/src/transcriptome.hpp +++ b/src/transcriptome.hpp @@ -295,10 +295,12 @@ class Transcriptome { /// are ordered in reverse. void reorder_exons(Transcript * transcript) const; - /// Checks whether any adjacent exons are out of (strictly increasing) order + /// Checks whether any adjacent exons are out of order + /// Assumes exons are in increasing order (to be correct) bool has_incorrect_order_exons(const vector & exons) const; /// Checks whether any adjacent exons overlap. + /// Assumes exons are in increasing order bool has_overlapping_exons(const vector & exons) const; /// Constructs edited reference transcript paths from a set of From 49a537a4d4228ea4cd569ea2400fcb3ecf9f420f Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 15:44:59 -0800 Subject: [PATCH 4/8] Restore original transcript ID parsing autoindex has difficulty with transcript ID, maybe this will fix it. --- src/transcriptome.cpp | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/src/transcriptome.cpp b/src/transcriptome.cpp index cf3fd580a4..6534203cd7 100644 --- a/src/transcriptome.cpp +++ b/src/transcriptome.cpp @@ -647,8 +647,24 @@ int32_t Transcriptome::parse_transcripts(vector * transcripts, uint3 transcript_line_ss.ignore(numeric_limits::max(), '\t'); string transcript_id = ""; - getline(transcript_line_ss, attribute, ';'); - transcript_id = parse_attribute_value(attribute, transcript_tag); + while (getline(transcript_line_ss, attribute, ';')) { + + if (attribute.empty()) { + + break; + } + + // Parse transcript ID. + if (transcript_id.empty()) { + + transcript_id = parse_attribute_value(attribute, transcript_tag); + } + + if (!transcript_id.empty()) { + + break; + } + } if (transcript_id.empty()) { From 553cf202d8c15c88d0ff21008de92be030da3a8e Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 16:40:20 -0800 Subject: [PATCH 5/8] Update unit test to new logic Now the one passing transcript is the second one. The first breaks because the exons are simply mixed up (even though their attribute numbers are correct), and the last breaks because forward strand genes can't be reversed. --- src/unittest/transcriptome.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/unittest/transcriptome.cpp b/src/unittest/transcriptome.cpp index dd8765d6b1..e6fa4fef79 100644 --- a/src/unittest/transcriptome.cpp +++ b/src/unittest/transcriptome.cpp @@ -499,13 +499,13 @@ namespace vg { SECTION("Transcriptome can parse gff3 file and exclude transcripts with incorrect exon order") { stringstream transcript_stream2; - transcript_stream2 << "path1\t.\texon\t2\t7\t.\t+\t.\ttranscript_id=transcript1;exon_number=2" << endl; - transcript_stream2 << "path1\t.\texon\t9\t10\t.\t+\t.\ttranscript_id=transcript1;exon_number=1" << endl; + transcript_stream2 << "path1\t.\texon\t9\t10\t.\t+\t.\ttranscript_id=transcript1;exon_number=2" << endl; + transcript_stream2 << "path1\t.\texon\t2\t7\t.\t+\t.\ttranscript_id=transcript1;exon_number=1" << endl; transcript_stream2 << "path1\t.\texon\t19\t21\t.\t+\t.\ttranscript_id=transcript1;exon_number=3" << endl; transcript_stream2 << "path1\t.\texon\t16\t21\t.\t-\t.\tID=exon:transcript2:0;transcript_id=transcript2;" << endl; transcript_stream2 << "path1\t.\texon\t2\t7\t.\t-\t.\tID=exon:transcript2:1;transcript_id=transcript2" << endl; - transcript_stream2 << "path1\t.\texon\t9\t11\t.\t+\t.\texon_number=1;transcript_id=transcript3" << endl; transcript_stream2 << "path1\t.\texon\t18\t21\t.\t+\t.\texon_number=2;transcript_id=transcript3" << endl; + transcript_stream2 << "path1\t.\texon\t9\t11\t.\t+\t.\texon_number=1;transcript_id=transcript3" << endl; transcriptome.add_reference_transcripts(vector({&transcript_stream2}), empty_haplotype_index, false, false); From 11904da42979b998d2a0632aefd9e451381c61d0 Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:11:43 -0800 Subject: [PATCH 6/8] use original passing transcript --- src/unittest/transcriptome.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/unittest/transcriptome.cpp b/src/unittest/transcriptome.cpp index e6fa4fef79..83ba9746b4 100644 --- a/src/unittest/transcriptome.cpp +++ b/src/unittest/transcriptome.cpp @@ -503,9 +503,12 @@ namespace vg { transcript_stream2 << "path1\t.\texon\t2\t7\t.\t+\t.\ttranscript_id=transcript1;exon_number=1" << endl; transcript_stream2 << "path1\t.\texon\t19\t21\t.\t+\t.\ttranscript_id=transcript1;exon_number=3" << endl; transcript_stream2 << "path1\t.\texon\t16\t21\t.\t-\t.\tID=exon:transcript2:0;transcript_id=transcript2;" << endl; - transcript_stream2 << "path1\t.\texon\t2\t7\t.\t-\t.\tID=exon:transcript2:1;transcript_id=transcript2" << endl; - transcript_stream2 << "path1\t.\texon\t18\t21\t.\t+\t.\texon_number=2;transcript_id=transcript3" << endl; + transcript_stream2 << "path1\t.\texon\t8\t9\t.\t-\t.\tID=exon:transcript2:1;transcript_id=transcript2" << endl; + transcript_stream2 << "path1\t.\texon\t2\t7\t.\t-\t.\tID=exon:transcript2:2;transcript_id=transcript2" << endl; transcript_stream2 << "path1\t.\texon\t9\t11\t.\t+\t.\texon_number=1;transcript_id=transcript3" << endl; + transcript_stream2 << "path1\t.\texon\t18\t21\t.\t+\t.\texon_number=2;transcript_id=transcript3" << endl; + transcript_stream2 << "path1\t.\texon\t8\t10\t.\t+\t.\ttranscript_id=transcript4" << endl; + transcript_stream2 << "path1\t.\texon\t3\t6\t.\t+\t.\ttranscript_id=transcript4" << endl; transcriptome.add_reference_transcripts(vector({&transcript_stream2}), empty_haplotype_index, false, false); From 105e5863da633d1ac77aefb751a23856cace5769 Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 18:03:44 -0800 Subject: [PATCH 7/8] oops actually break transcript2 --- src/unittest/transcriptome.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/unittest/transcriptome.cpp b/src/unittest/transcriptome.cpp index 83ba9746b4..78826a5eaa 100644 --- a/src/unittest/transcriptome.cpp +++ b/src/unittest/transcriptome.cpp @@ -503,10 +503,10 @@ namespace vg { transcript_stream2 << "path1\t.\texon\t2\t7\t.\t+\t.\ttranscript_id=transcript1;exon_number=1" << endl; transcript_stream2 << "path1\t.\texon\t19\t21\t.\t+\t.\ttranscript_id=transcript1;exon_number=3" << endl; transcript_stream2 << "path1\t.\texon\t16\t21\t.\t-\t.\tID=exon:transcript2:0;transcript_id=transcript2;" << endl; - transcript_stream2 << "path1\t.\texon\t8\t9\t.\t-\t.\tID=exon:transcript2:1;transcript_id=transcript2" << endl; - transcript_stream2 << "path1\t.\texon\t2\t7\t.\t-\t.\tID=exon:transcript2:2;transcript_id=transcript2" << endl; + transcript_stream2 << "path1\t.\texon\t2\t7\t.\t-\t.\tID=exon:transcript2:1;transcript_id=transcript2" << endl; + transcript_stream2 << "path1\t.\texon\t8\t9\t.\t-\t.\tID=exon:transcript2:2;transcript_id=transcript2" << endl; transcript_stream2 << "path1\t.\texon\t9\t11\t.\t+\t.\texon_number=1;transcript_id=transcript3" << endl; - transcript_stream2 << "path1\t.\texon\t18\t21\t.\t+\t.\texon_number=2;transcript_id=transcript3" << endl; + transcript_stream2 << "path1\t.\texon\t18\t21\t.\t-\t.\texon_number=2;transcript_id=transcript3" << endl; transcript_stream2 << "path1\t.\texon\t8\t10\t.\t+\t.\ttranscript_id=transcript4" << endl; transcript_stream2 << "path1\t.\texon\t3\t6\t.\t+\t.\ttranscript_id=transcript4" << endl; From 9711c5a07e6ee9897ab427f5c036919e45c5db94 Mon Sep 17 00:00:00 2001 From: Faith Okamoto <52177356+faithokamoto@users.noreply.github.com> Date: Fri, 6 Dec 2024 18:05:46 -0800 Subject: [PATCH 8/8] oops take 2 transcript3 strand --- src/unittest/transcriptome.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/unittest/transcriptome.cpp b/src/unittest/transcriptome.cpp index 78826a5eaa..ace9b83db8 100644 --- a/src/unittest/transcriptome.cpp +++ b/src/unittest/transcriptome.cpp @@ -506,7 +506,7 @@ namespace vg { transcript_stream2 << "path1\t.\texon\t2\t7\t.\t-\t.\tID=exon:transcript2:1;transcript_id=transcript2" << endl; transcript_stream2 << "path1\t.\texon\t8\t9\t.\t-\t.\tID=exon:transcript2:2;transcript_id=transcript2" << endl; transcript_stream2 << "path1\t.\texon\t9\t11\t.\t+\t.\texon_number=1;transcript_id=transcript3" << endl; - transcript_stream2 << "path1\t.\texon\t18\t21\t.\t-\t.\texon_number=2;transcript_id=transcript3" << endl; + transcript_stream2 << "path1\t.\texon\t18\t21\t.\t+\t.\texon_number=2;transcript_id=transcript3" << endl; transcript_stream2 << "path1\t.\texon\t8\t10\t.\t+\t.\ttranscript_id=transcript4" << endl; transcript_stream2 << "path1\t.\texon\t3\t6\t.\t+\t.\ttranscript_id=transcript4" << endl;