From d19ffe684b77f32725baf0c3a4cfe0f2c9e37af0 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 23 Jul 2024 18:45:38 +0200 Subject: [PATCH 01/14] avoid multi-patch overlaps --- src/common/wflign/src/wflign_patch.cpp | 28 +++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 65859d18..ff960406 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -196,6 +196,9 @@ void do_wfa_patch_alignment( } if (query_length >= min_inversion_length && target_length >= min_inversion_length) { + if (aln.ok) { + wf_aligner.setMaxAlignmentSteps(fwd_score * 2 + 64); + } // Try reverse complement alignment std::string rev_comp_query = reverse_complement(std::string(query + j, query_length)); const int rev_status = wf_aligner.alignEnd2End(target + i, target_length, rev_comp_query.c_str(), query_length); @@ -358,10 +361,10 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln) { bounds.target_start_offset = target_pos; found_first_match = true; } - bounds.query_end_offset = query_pos; - bounds.target_end_offset = target_pos; ++query_pos; ++target_pos; + bounds.query_end_offset = query_pos; + bounds.target_end_offset = target_pos; break; case 'I': ++query_pos; @@ -446,6 +449,8 @@ std::vector do_progressive_wfa_patch_alignment( uint64_t current_target_start = target_start; uint64_t remaining_query_length = query_length; uint64_t remaining_target_length = target_length; + const uint64_t query_end = query_start + query_length; + const uint64_t target_end = target_start + target_length; while (remaining_query_length >= min_inversion_length && remaining_target_length >= min_inversion_length) { alignment_t aln, rev_aln; @@ -521,9 +526,8 @@ std::vector do_progressive_wfa_patch_alignment( current_query_start += bounds.query_end_offset; } current_target_start += bounds.target_end_offset; - remaining_query_length -= (current_query_start - last_query_start); - remaining_target_length -= (current_target_start - last_target_start); - + remaining_query_length = query_end - current_query_start; + remaining_target_length = target_end - current_target_start; } else { // If the alignment was completely eroded, break the loop break; @@ -1252,6 +1256,16 @@ void write_merged_alignment( while (!multi_patch_alns.empty() && (multi_patch_alns.back().query_end() > aln.query_begin() || multi_patch_alns.back().target_end() > aln.target_begin())) { + // write some info about the popped alignment vs the current one + /* + std::cerr << std::endl + << "Popped alignment: " + << multi_patch_alns.back().query_begin() << "-" << multi_patch_alns.back().query_end() + << " vs " << aln.query_begin() << "-" << aln.query_end() << std::endl + // target + << multi_patch_alns.back().target_begin() << "-" << multi_patch_alns.back().target_end() + << " vs " << aln.target_begin() << "-" << aln.target_end() << std::endl; + */ multi_patch_alns.pop_back(); } multi_patch_alns.push_back(aln); @@ -1688,7 +1702,7 @@ void write_merged_alignment( #endif // normalize: sort so that I Date: Tue, 23 Jul 2024 23:01:40 +0200 Subject: [PATCH 02/14] single patching pass and cleaner patch alignment score handling --- src/common/wflign/src/wflign_alignment.cpp | 2 +- src/common/wflign/src/wflign_alignment.hpp | 1 + src/common/wflign/src/wflign_patch.cpp | 29 +++++++++++----------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/src/common/wflign/src/wflign_alignment.cpp b/src/common/wflign/src/wflign_alignment.cpp index 3bb42731..98471ef8 100644 --- a/src/common/wflign/src/wflign_alignment.cpp +++ b/src/common/wflign/src/wflign_alignment.cpp @@ -14,7 +14,7 @@ // Default constructor alignment_t::alignment_t() - : j(0), i(0), query_length(0), target_length(0), ok(false), keep(false) { + : j(0), i(0), query_length(0), target_length(0), score(std::numeric_limits::max()), ok(false), keep(false) { edit_cigar = {nullptr, 0, 0}; } diff --git a/src/common/wflign/src/wflign_alignment.hpp b/src/common/wflign/src/wflign_alignment.hpp index 23f00445..8a2127c5 100644 --- a/src/common/wflign/src/wflign_alignment.hpp +++ b/src/common/wflign/src/wflign_alignment.hpp @@ -35,6 +35,7 @@ class alignment_t { int i; int query_length; int target_length; + int score; bool ok; bool keep; bool is_rev; diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index ff960406..fa050908 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -166,8 +166,8 @@ void do_wfa_patch_alignment( wf_aligner.setMaxAlignmentSteps(max_score); - int fwd_score = std::numeric_limits::max(); - int rev_score = std::numeric_limits::max(); + //int fwd_score = std::numeric_limits::max(); + //int rev_score = std::numeric_limits::max(); const int status = wf_aligner.alignEnd2End(target + i, target_length, query + j, query_length); aln.ok = (status == WF_STATUS_ALG_COMPLETED); @@ -191,13 +191,13 @@ void do_wfa_patch_alignment( #endif wflign_edit_cigar_copy(wf_aligner, &aln.edit_cigar); - fwd_score = calculate_alignment_score(aln.edit_cigar, convex_penalties); + aln.score = calculate_alignment_score(aln.edit_cigar, convex_penalties); //std::cerr << "forward score is " << fwd_score << std::endl; } if (query_length >= min_inversion_length && target_length >= min_inversion_length) { if (aln.ok) { - wf_aligner.setMaxAlignmentSteps(fwd_score * 2 + 64); + wf_aligner.setMaxAlignmentSteps(aln.score); } // Try reverse complement alignment std::string rev_comp_query = reverse_complement(std::string(query + j, query_length)); @@ -225,7 +225,7 @@ void do_wfa_patch_alignment( } #endif - rev_score = calculate_alignment_score(rev_aln.edit_cigar, convex_penalties); + rev_aln.score = calculate_alignment_score(rev_aln.edit_cigar, convex_penalties); //std::cerr << "reverse score is " << rev_score << std::endl; rev_aln.j = j; rev_aln.i = i; @@ -236,7 +236,7 @@ void do_wfa_patch_alignment( rev_aln.ok = false; } - if (rev_aln.ok && rev_score < fwd_score) { + if (rev_aln.ok && rev_aln.score < aln.score) { rev_aln.ok = true; aln.ok = false; #ifdef WFLIGN_DEBUG @@ -246,8 +246,8 @@ void do_wfa_patch_alignment( << " chain_gap " << chain_gap << " max_patching_score " << max_patching_score << " max_score " << max_score - << " fwd_score " << fwd_score - << " rev_score " << rev_score + << " fwd_score " << aln.score + << " rev_score " << rev_aln.score << std::endl << "query " << std::string(query + j, query_length) << " target " << std::string(target + i, target_length) @@ -470,7 +470,7 @@ std::vector do_progressive_wfa_patch_alignment( min_inversion_length); if (aln.ok || rev_aln.ok) { - alignment_t& chosen_aln = (rev_aln.ok && (!aln.ok || calculate_alignment_score(rev_aln.edit_cigar, convex_penalties) < calculate_alignment_score(aln.edit_cigar, convex_penalties))) ? rev_aln : aln; + alignment_t& chosen_aln = (rev_aln.ok && (!aln.ok || rev_aln.score < aln.score)) ? rev_aln : aln; // Erode short matches throughout the alignment //erode_alignment(chosen_aln, erode_k); if (chosen_aln.query_length > 0 && chosen_aln.target_length > 0) { @@ -1677,13 +1677,13 @@ void write_merged_alignment( //std::cerr << "FIRST PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(erodev, pre_tracev, 4096, 32, 512, false); // In the 1st round, we patch more aggressively and don't save inversions + patching(erodev, tracev, 4096, 32, 512, true); // one patchin gpass #ifdef VALIDATE_WFA_WFLIGN - if (!validate_trace(pre_tracev, query, + if (!validate_trace(tracev, query, target - target_pointer_shift, query_length, target_length_mut, query_start, target_start)) { - std::cerr << "cigar failure in pre_tracev " + std::cerr << "cigar failure in tracev " << "\t" << query_name << "\t" << query_total_length << "\t" << query_offset + (query_is_rev @@ -1702,12 +1702,13 @@ void write_merged_alignment( #endif // normalize: sort so that I Date: Thu, 25 Jul 2024 17:22:25 +0200 Subject: [PATCH 03/14] seems to help and not hurt to make a light patching pass to clean up edges and small gaps --- src/common/wflign/src/wflign_patch.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index fa050908..a2e323bd 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -1677,13 +1677,13 @@ void write_merged_alignment( //std::cerr << "FIRST PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(erodev, tracev, 4096, 32, 512, true); // one patchin gpass + patching(erodev, pre_tracev, 4096, 32, 512, false); // one patching pass #ifdef VALIDATE_WFA_WFLIGN - if (!validate_trace(tracev, query, + if (!validate_trace(pre_tracev, query, target - target_pointer_shift, query_length, target_length_mut, query_start, target_start)) { - std::cerr << "cigar failure in tracev " + std::cerr << "cigar failure in pre_tracev " << "\t" << query_name << "\t" << query_total_length << "\t" << query_offset + (query_is_rev @@ -1702,13 +1702,13 @@ void write_merged_alignment( #endif // normalize: sort so that I Date: Thu, 25 Jul 2024 18:21:52 +0200 Subject: [PATCH 04/14] disable multipatch popping, guard against potential mistake for inclusion of patch --- src/common/wflign/src/wflign_patch.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index a2e323bd..3d3cb064 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -1215,8 +1215,7 @@ void write_merged_alignment( if (rev_patch_aln.ok && save_multi_patch_alns) { // do_progressive_patch = true; - } - if (patch_aln.ok) { + } else if (patch_aln.ok) { got_alignment = true; const int start_idx = patch_aln.edit_cigar.begin_offset; @@ -1253,11 +1252,11 @@ void write_merged_alignment( if (aln.ok) { // avoid overlapping patches: if alignments is non-empty, pop alignments off its back until // the current query and target start are greater than the query and target end of the last alignment + /* while (!multi_patch_alns.empty() && (multi_patch_alns.back().query_end() > aln.query_begin() || multi_patch_alns.back().target_end() > aln.target_begin())) { // write some info about the popped alignment vs the current one - /* std::cerr << std::endl << "Popped alignment: " << multi_patch_alns.back().query_begin() << "-" << multi_patch_alns.back().query_end() @@ -1265,9 +1264,9 @@ void write_merged_alignment( // target << multi_patch_alns.back().target_begin() << "-" << multi_patch_alns.back().target_end() << " vs " << aln.target_begin() << "-" << aln.target_end() << std::endl; - */ multi_patch_alns.pop_back(); } + */ multi_patch_alns.push_back(aln); } } @@ -1677,7 +1676,7 @@ void write_merged_alignment( //std::cerr << "FIRST PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - patching(erodev, pre_tracev, 4096, 32, 512, false); // one patching pass + patching(erodev, pre_tracev, 4096, 32, 512, false); #ifdef VALIDATE_WFA_WFLIGN if (!validate_trace(pre_tracev, query, @@ -1707,8 +1706,7 @@ void write_merged_alignment( //std::cerr << "SECOND PATCH ROUND" << std::endl; // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++\n"; - // in some tests long ago this appeared to help, but now it's unclear if it's necessary - patching(pre_tracev, tracev, 256, 8, 128, true); // In the 2nd round, we patch less aggressively but we save inversions + patching(pre_tracev, tracev, 256, 8, 128, true); } // normalize the indels From f31f6dd1a18bdaf5f260767433a7abf1a183342b Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Fri, 26 Jul 2024 23:08:26 +0200 Subject: [PATCH 05/14] simplify the multipatch --- src/common/wflign/src/wflign_alignment.cpp | 38 ++- src/common/wflign/src/wflign_alignment.hpp | 13 +- src/common/wflign/src/wflign_patch.cpp | 278 ++++++--------------- src/common/wflign/src/wflign_patch.hpp | 1 - 4 files changed, 120 insertions(+), 210 deletions(-) diff --git a/src/common/wflign/src/wflign_alignment.cpp b/src/common/wflign/src/wflign_alignment.cpp index 98471ef8..9b4d257e 100644 --- a/src/common/wflign/src/wflign_alignment.cpp +++ b/src/common/wflign/src/wflign_alignment.cpp @@ -14,7 +14,7 @@ // Default constructor alignment_t::alignment_t() - : j(0), i(0), query_length(0), target_length(0), score(std::numeric_limits::max()), ok(false), keep(false) { + : j(0), i(0), query_length(0), target_length(0), score(std::numeric_limits::max()), ok(false), keep(false), is_rev(false) { edit_cigar = {nullptr, 0, 0}; } @@ -25,7 +25,7 @@ alignment_t::~alignment_t() { // Copy constructor alignment_t::alignment_t(const alignment_t& other) - : j(other.j), i(other.i), query_length(other.query_length), + : j(other.j), i(other.i), query_length(other.query_length), is_rev(other.is_rev), target_length(other.target_length), ok(other.ok), keep(other.keep) { if (other.edit_cigar.cigar_ops) { edit_cigar.cigar_ops = (char*)malloc((other.edit_cigar.end_offset - other.edit_cigar.begin_offset) * sizeof(char)); @@ -40,7 +40,7 @@ alignment_t::alignment_t(const alignment_t& other) // Move constructor alignment_t::alignment_t(alignment_t&& other) noexcept - : j(other.j), i(other.i), query_length(other.query_length), + : j(other.j), i(other.i), query_length(other.query_length), is_rev(other.is_rev), target_length(other.target_length), ok(other.ok), keep(other.keep), edit_cigar(other.edit_cigar) { other.edit_cigar = {nullptr, 0, 0}; @@ -55,6 +55,7 @@ alignment_t& alignment_t::operator=(const alignment_t& other) { target_length = other.target_length; ok = other.ok; keep = other.keep; + is_rev = other.is_rev; free(edit_cigar.cigar_ops); if (other.edit_cigar.cigar_ops) { @@ -79,6 +80,7 @@ alignment_t& alignment_t::operator=(alignment_t&& other) noexcept { target_length = other.target_length; ok = other.ok; keep = other.keep; + is_rev = other.is_rev; free(edit_cigar.cigar_ops); edit_cigar = other.edit_cigar; @@ -87,19 +89,19 @@ alignment_t& alignment_t::operator=(alignment_t&& other) noexcept { return *this; } -int alignment_t::query_begin() { +int alignment_t::query_begin() const { return j; } -int alignment_t::query_end() { +int alignment_t::query_end() const { return j + query_length; } -int alignment_t::target_begin() { +int alignment_t::target_begin() const { return i; } -int alignment_t::target_end() { +int alignment_t::target_end() const { return i + target_length; } @@ -707,6 +709,28 @@ int calculate_alignment_score(const wflign_cigar_t& cigar, const wflign_penaltie return score; } +std::string cigar_to_string(const wflign_cigar_t& cigar) { + std::stringstream ss; + const int start_idx = cigar.begin_offset; + const int end_idx = cigar.end_offset; + for (int c = start_idx; c < end_idx; c++) { + ss << cigar.cigar_ops[c]; + } + return ss.str(); +} + +std::ostream& operator<<(std::ostream& os, const alignment_t& aln) { + return os << "Alignment: " + << "Query(" << aln.query_begin() << "-" << aln.query_end() << "/" << aln.query_length << ") " + << "Target(" << aln.target_begin() << "-" << aln.target_end() << "/" << aln.target_length << ") " + << "Score=" << aln.score << " " + << "Rev=" << (aln.is_rev ? "Yes" : "No") << " " + << "Status=" << (aln.ok ? "OK" : "NotOK") << " " + << "Keep=" << (aln.keep ? "Yes" : "No") << " " + << "CIGAR=" << cigar_to_string(aln.edit_cigar) << " " + << "Indices(i,j)=(" << aln.i << "," << aln.j << ")"; +} + /* // No more necessary bool hack_cigar(wfa::cigar_t &cigar, const char *query, const char *target, diff --git a/src/common/wflign/src/wflign_alignment.hpp b/src/common/wflign/src/wflign_alignment.hpp index 8a2127c5..09f076b5 100644 --- a/src/common/wflign/src/wflign_alignment.hpp +++ b/src/common/wflign/src/wflign_alignment.hpp @@ -3,6 +3,7 @@ #include #include +#include #include "WFA2-lib/bindings/cpp/WFAligner.hpp" /* @@ -57,12 +58,16 @@ class alignment_t { void trim_back(int query_trim); // query_begin, query_end, target_begin, target_end // Accessors - int query_begin(); - int query_end(); - int target_begin(); - int target_end(); + int query_begin() const; + int query_end() const; + int target_begin() const; + int target_end() const; }; +// debugging alignment writer +std::ostream& operator<<(std::ostream& os, const alignment_t& aln); +// debugging cigar writer + /* * Wflign Trace-Pos: Links a position in a traceback matrix to its edit diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 3d3cb064..f0a78792 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -378,58 +378,6 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln) { return bounds; } -void trim_alignment(alignment_t& aln) { - bool is_reverse = aln.is_rev; - if (!aln.ok || aln.edit_cigar.end_offset <= aln.edit_cigar.begin_offset) { - return; // Nothing to trim - } - - int64_t trim_start = 0; - int64_t trim_end = 0; - - // Trim start - for (int64_t i = aln.edit_cigar.begin_offset; i < aln.edit_cigar.end_offset; ++i) { - char op = aln.edit_cigar.cigar_ops[i]; - if (op != 'I' && op != 'D') { - break; - } - ++trim_start; - if (op == 'I') { - if (is_reverse) { - --aln.query_length; // Adjust query length for reverse alignment - } else { - ++aln.j; // Adjust query start for forward alignment - } - } else { // op == 'D' - ++aln.i; // Adjust target start (same for both orientations) - } - } - - // Trim end - for (int64_t i = aln.edit_cigar.end_offset - 1; i >= aln.edit_cigar.begin_offset + trim_start; --i) { - char op = aln.edit_cigar.cigar_ops[i]; - if (op != 'I' && op != 'D') { - break; - } - ++trim_end; - if (op == 'I') { - if (is_reverse) { - ++aln.j; // Adjust query start for reverse alignment - } else { - --aln.query_length; // Adjust query length for forward alignment - } - } else { // op == 'D' - --aln.target_length; // Adjust target length (same for both orientations) - } - } - - // Update CIGAR - if (trim_start > 0 || trim_end > 0) { - aln.edit_cigar.begin_offset += trim_start; - aln.edit_cigar.end_offset -= trim_end; - } -} - std::vector do_progressive_wfa_patch_alignment( const char* query, const uint64_t& query_start, @@ -452,8 +400,15 @@ std::vector do_progressive_wfa_patch_alignment( const uint64_t query_end = query_start + query_length; const uint64_t target_end = target_start + target_length; - while (remaining_query_length >= min_inversion_length && remaining_target_length >= min_inversion_length) { + //std::cerr << "BEGIN do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; + bool first = true; + + while (first || remaining_query_length >= min_inversion_length && remaining_target_length >= min_inversion_length) { + first = false; + //std::cerr << "do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; alignment_t aln, rev_aln; + aln.is_rev = false; + rev_aln.is_rev = true; do_wfa_patch_alignment( query, current_query_start, @@ -469,75 +424,74 @@ std::vector do_progressive_wfa_patch_alignment( max_patching_score, min_inversion_length); - if (aln.ok || rev_aln.ok) { - alignment_t& chosen_aln = (rev_aln.ok && (!aln.ok || rev_aln.score < aln.score)) ? rev_aln : aln; - // Erode short matches throughout the alignment - //erode_alignment(chosen_aln, erode_k); - if (chosen_aln.query_length > 0 && chosen_aln.target_length > 0) { + //std::cerr << "WFA fwd alignment: " << aln << std::endl; + //std::cerr << "WFA rev alignment: " << rev_aln << std::endl; + + if (rev_aln.ok && (!aln.ok || rev_aln.score < aln.score)) { + alignments.push_back(rev_aln); + } else if (aln.ok) { + alignments.push_back(aln); + //break; + } #ifdef VALIDATE_WFA_WFLIGN - { - std::string querystr = chosen_aln.is_rev ? - reverse_complement(std::string(query + current_query_start, chosen_aln.query_length)) - : std::string(query + current_query_start, chosen_aln.query_length); - std::string targetstr = std::string(target + current_target_start, chosen_aln.target_length); - if (!validate_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), - chosen_aln.query_length, chosen_aln.target_length, 0, 0)) { - std::cerr << "before trim cigar failure at " << current_query_start << " " << current_target_start << std::endl; - unpack_display_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), - chosen_aln.query_length, chosen_aln.target_length, 0, 0); - std::cerr << ">query" << std::endl << querystr << std::endl - << ">target" << std::endl << targetstr << std::endl; - assert(false); - exit(1); - } - } + { + std::string querystr = chosen_aln.is_rev ? + reverse_complement(std::string(query + current_query_start, chosen_aln.query_length)) + : std::string(query + current_query_start, chosen_aln.query_length); + std::string targetstr = std::string(target + current_target_start, chosen_aln.target_length); + if (!validate_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), + chosen_aln.query_length, chosen_aln.target_length, 0, 0)) { + std::cerr << "before trim cigar failure at " << current_query_start << " " << current_target_start << std::endl; + unpack_display_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), + chosen_aln.query_length, chosen_aln.target_length, 0, 0); + std::cerr << ">query" << std::endl << querystr << std::endl + << ">target" << std::endl << targetstr << std::endl; + assert(false); + exit(1); + } + } #endif - trim_alignment(chosen_aln); - auto bounds = find_alignment_bounds(chosen_aln); + if (alignments.empty()) { + break; + } + auto& chosen_aln = alignments.back(); + auto bounds = find_alignment_bounds(chosen_aln); #ifdef VALIDATE_WFA_WFLIGN - { - std::string querystr = chosen_aln.is_rev ? - reverse_complement(std::string(query + current_query_start, chosen_aln.query_length)) - : std::string(query + current_query_start, chosen_aln.query_length); - std::string targetstr = std::string(target + current_target_start, chosen_aln.target_length); - if (!validate_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), - chosen_aln.query_length, chosen_aln.target_length, 0, 0)) { - std::cerr << "after trim cigar failure at " << current_query_start << " " << current_target_start << std::endl; - unpack_display_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), - chosen_aln.query_length, chosen_aln.target_length, 0, 0); - std::cerr << ">query" << std::endl << querystr << std::endl - << ">target" << std::endl << targetstr << std::endl; - assert(false); - exit(1); - } - } + { + std::string querystr = chosen_aln.is_rev ? + reverse_complement(std::string(query + current_query_start, chosen_aln.query_length)) + : std::string(query + current_query_start, chosen_aln.query_length); + std::string targetstr = std::string(target + current_target_start, chosen_aln.target_length); + if (!validate_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), + chosen_aln.query_length, chosen_aln.target_length, 0, 0)) { + std::cerr << "after trim cigar failure at " << current_query_start << " " << current_target_start << std::endl; + unpack_display_cigar(chosen_aln.edit_cigar, querystr.c_str(), targetstr.c_str(), + chosen_aln.query_length, chosen_aln.target_length, 0, 0); + std::cerr << ">query" << std::endl << querystr << std::endl + << ">target" << std::endl << targetstr << std::endl; + assert(false); + exit(1); + } + } #endif - alignments.push_back(chosen_aln); - auto last_query_start = current_query_start; - auto last_target_start = current_target_start; + //alignments.push_back(chosen_aln); + auto last_query_start = current_query_start; + auto last_target_start = current_target_start; - // Update the start positions and remaining lengths for the next iteration - if (chosen_aln.is_rev) { - current_query_start += bounds.query_start_offset; - } else { - current_query_start += bounds.query_end_offset; - } - current_target_start += bounds.target_end_offset; - remaining_query_length = query_end - current_query_start; - remaining_target_length = target_end - current_target_start; - } else { - // If the alignment was completely eroded, break the loop - break; - } + // Update the start positions and remaining lengths for the next iteration + if (chosen_aln.is_rev) { + current_query_start += bounds.query_start_offset; } else { - // If no alignment was found, break the loop - break; + current_query_start += bounds.query_end_offset; } + current_target_start += bounds.target_end_offset; + remaining_query_length = query_end - current_query_start; + remaining_target_length = target_end - current_target_start; } - + //std::cerr << "END do_progressive_wfa_patch_alignment got " << alignments.size() << " alignments" << std::endl; return alignments; } @@ -1202,38 +1156,8 @@ void write_merged_alignment( size_region_to_repatch = 0; { - alignment_t patch_aln; - alignment_t rev_patch_aln; - bool do_progressive_patch = false; // WFA is only global - do_wfa_patch_alignment( - query, query_pos, query_delta, - target - target_pointer_shift, target_pos, target_delta, - wf_aligner, convex_penalties, patch_aln, rev_patch_aln, - chain_gap, max_patching_score, min_inversion_length); - // collect inverse patch alignments - if (rev_patch_aln.ok && save_multi_patch_alns) { - // - do_progressive_patch = true; - } else if (patch_aln.ok) { - got_alignment = true; - const int start_idx = - patch_aln.edit_cigar.begin_offset; - const int end_idx = - patch_aln.edit_cigar.end_offset; - for (int i = start_idx; i < end_idx; i++) { - patched.push_back(patch_aln.edit_cigar.cigar_ops[i]); - } - } - - if (do_progressive_patch) { - /* - std::cerr << "Starting progressive patching for region:" << std::endl; - std::cerr << "Query: " << query_pos << "-" << query_pos + query_delta << " (length: " << query_delta << ")" << std::endl; - std::cerr << "Target: " << target_pos << "-" << target_pos + target_delta << " (length: " << target_delta << ")" << std::endl; - */ - - auto patch_alignments = do_progressive_wfa_patch_alignment( + auto patch_alignments = do_progressive_wfa_patch_alignment( query, query_pos, query_delta, @@ -1246,64 +1170,22 @@ void write_merged_alignment( max_patching_score, min_inversion_length, erode_k); - - // save the ok multi-patch alignments into our list - for (auto& aln : patch_alignments) { - if (aln.ok) { - // avoid overlapping patches: if alignments is non-empty, pop alignments off its back until - // the current query and target start are greater than the query and target end of the last alignment - /* - while (!multi_patch_alns.empty() - && (multi_patch_alns.back().query_end() > aln.query_begin() - || multi_patch_alns.back().target_end() > aln.target_begin())) { - // write some info about the popped alignment vs the current one - std::cerr << std::endl - << "Popped alignment: " - << multi_patch_alns.back().query_begin() << "-" << multi_patch_alns.back().query_end() - << " vs " << aln.query_begin() << "-" << aln.query_end() << std::endl - // target - << multi_patch_alns.back().target_begin() << "-" << multi_patch_alns.back().target_end() - << " vs " << aln.target_begin() << "-" << aln.target_end() << std::endl; - multi_patch_alns.pop_back(); - } - */ - multi_patch_alns.push_back(aln); - } + if (patch_alignments.size() == 1 + && patch_alignments.front().ok + && !patch_alignments.front().is_rev) { + got_alignment = true; + auto& patch_aln = patch_alignments.front(); + const int start_idx = + patch_aln.edit_cigar.begin_offset; + const int end_idx = + patch_aln.edit_cigar.end_offset; + for (int i = start_idx; i < end_idx; i++) { + patched.push_back(patch_aln.edit_cigar.cigar_ops[i]); } - - /* - std::cerr << "Progressive patching found " << patch_alignments.size() << " alignments" << std::endl; - - for (size_t i = 0; i < patch_alignments.size(); ++i) { - const auto& aln = patch_alignments[i]; - std::cerr << "Alignment " << i + 1 << ":" << std::endl; - std::cerr << " Rev: " << (aln.is_rev?"true":"false") << std::endl; - std::cerr << " Query: " << aln.j << "-" << aln.j + aln.query_length << " (length: " << aln.query_length << ")" << std::endl; - std::cerr << " Target: " << aln.i << "-" << aln.i + aln.target_length << " (length: " << aln.target_length << ")" << std::endl; - std::cerr << " CIGAR: "; - for (int c = aln.edit_cigar.begin_offset; c < aln.edit_cigar.end_offset; ++c) { - std::cerr << aln.edit_cigar.cigar_ops[c]; - } - std::cerr << std::endl; - // Calculate and print alignment statistics - uint64_t matches = 0, mismatches = 0, insertions = 0, deletions = 0; - for (int c = aln.edit_cigar.begin_offset; c < aln.edit_cigar.end_offset; ++c) { - switch (aln.edit_cigar.cigar_ops[c]) { - case 'M': ++matches; break; - case 'X': ++mismatches; break; - case 'I': ++insertions; break; - case 'D': ++deletions; break; - } - } - std::cerr << " Stats: Matches=" << matches << ", Mismatches=" << mismatches - << ", Insertions=" << insertions << ", Deletions=" << deletions << std::endl; - // Print a snippet of the aligned sequences - const int snippet_length = 50; - std::cerr << " Query snippet: " << std::string(query + aln.j, std::min((int)aln.query_length, snippet_length)) << std::endl; - std::cerr << " Target snippet: " << std::string(target - target_pointer_shift + aln.i, std::min((int)aln.target_length, snippet_length)) << std::endl; - std::cerr << std::endl; + } else if (save_multi_patch_alns) { + for (auto& aln : patch_alignments) { + multi_patch_alns.push_back(aln); } - */ } #ifdef WFA_PNG_TSV_TIMING diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index 92ba32c5..26009a9d 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -72,7 +72,6 @@ namespace wflign { const int& max_patching_score, const uint64_t& min_inversion_length, const int& erode_k); - void trim_alignment(alignment_t& aln); void write_merged_alignment( std::ostream &out, const std::vector &trace, From c341e1e7b7985eed85d8a0bf943714f1cc323b95 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Fri, 26 Jul 2024 23:57:17 +0200 Subject: [PATCH 06/14] put back trimming --- src/common/wflign/src/wflign_patch.cpp | 33 ++++++++++++++++++++++++++ src/common/wflign/src/wflign_patch.hpp | 1 + 2 files changed, 34 insertions(+) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index f0a78792..7a9e7d37 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -378,6 +378,38 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln) { return bounds; } +void trim_alignment(alignment_t& aln) { + // Trim head + int head_trim_q = 0, head_trim_t = 0; + while (aln.edit_cigar.begin_offset < aln.edit_cigar.end_offset) { + char op = aln.edit_cigar.cigar_ops[aln.edit_cigar.begin_offset]; + if (op != 'I' && op != 'D') break; + if (op == 'I') head_trim_q++; + if (op == 'D') head_trim_t++; + aln.edit_cigar.begin_offset++; + } + + // Trim tail + int tail_trim_q = 0, tail_trim_t = 0; + while (aln.edit_cigar.end_offset > aln.edit_cigar.begin_offset) { + char op = aln.edit_cigar.cigar_ops[aln.edit_cigar.end_offset - 1]; + if (op != 'I' && op != 'D') break; + if (op == 'I') tail_trim_q++; + if (op == 'D') tail_trim_t++; + aln.edit_cigar.end_offset--; + } + + // Adjust coordinates + if (aln.is_rev) { + aln.j += tail_trim_q; // For reverse alignments, tail trim affects the start + } else { + aln.j += head_trim_q; + } + aln.i += head_trim_t; + aln.query_length -= (head_trim_q + tail_trim_q); + aln.target_length -= (head_trim_t + tail_trim_t); +} + std::vector do_progressive_wfa_patch_alignment( const char* query, const uint64_t& query_start, @@ -1184,6 +1216,7 @@ void write_merged_alignment( } } else if (save_multi_patch_alns) { for (auto& aln : patch_alignments) { + trim_alignment(aln); multi_patch_alns.push_back(aln); } } diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index 26009a9d..f5dc4c5a 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -59,6 +59,7 @@ namespace wflign { const int64_t& chain_gap, const int& max_patching_score, const uint64_t& min_inversion_length); + void trim_alignment(alignment_t& aln); std::vector do_progressive_wfa_patch_alignment( const char* query, const uint64_t& query_start, From 4f996e02e92e4a9ba20a13c7886515e186b5162b Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 27 Jul 2024 09:44:59 +0200 Subject: [PATCH 07/14] make debugging happy --- src/common/wflign/src/wflign_patch.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 7a9e7d37..5a1fa721 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -1223,10 +1223,12 @@ void write_merged_alignment( #ifdef WFA_PNG_TSV_TIMING if (emit_patching_tsv) { - *out_patching_tsv + for (auto& aln : patch_alignments) { + *out_patching_tsv << query_name << "\t" << query_pos << "\t" << query_pos + query_delta << "\t" << target_name << "\t" << (target_pos - target_pointer_shift) << "\t" << (target_pos - target_pointer_shift + target_delta) << "\t" - << patch_aln.ok << std::endl; + << aln.ok << std::endl; + } } #endif } From c44daa8ea0cf7c4866917a317c3d7b9fe175a659 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sat, 27 Jul 2024 22:49:59 +0200 Subject: [PATCH 08/14] forward strand inverse patches in sam --- src/common/wflign/src/wflign.cpp | 2 +- src/common/wflign/src/wflign_patch.cpp | 226 ++++++++++++++++++++++--- src/common/wflign/src/wflign_patch.hpp | 31 +++- 3 files changed, 232 insertions(+), 27 deletions(-) diff --git a/src/common/wflign/src/wflign.cpp b/src/common/wflign/src/wflign.cpp index 44cd1267..ddd89b47 100644 --- a/src/common/wflign/src/wflign.cpp +++ b/src/common/wflign/src/wflign.cpp @@ -1026,7 +1026,7 @@ void WFlign::wflign_affine_wavefront( } else { // todo old implementation (and SAM format is not supported) for (auto x = trace.rbegin(); x != trace.rend(); ++x) { - write_alignment( + write_alignment_paf( *out, **x, query_name, diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 5a1fa721..2d357971 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -2042,35 +2042,211 @@ query_start : query_end) // always clean up free(cigarv); - // write how many reverse complement alignments were found - //std::cerr << "got " << rev_patch_alns.size() << " rev patch alns" << std::endl; - for (auto& patch_aln : multi_patch_alns) { - bool rev_query_is_rev = !query_is_rev; // Flip the orientation - write_alignment( - out, - patch_aln, - query_name, - query_total_length, - query_offset, - query_length, - rev_query_is_rev, // Use the flipped orientation - target_name, - target_total_length, - target_offset, - target_length, - min_identity, - mashmap_estimated_identity, - false, // Don't add an endline after each alignment - true); // This is a reverse complement alignment - // write tag indicating that this is a multipatch alignment - out << "\t" << "patch:Z:true" << "\t" - // and if the patch is inverted as well - << "\t" << "inv:Z:" << (patch_aln.is_rev ? "true" : "false") << "\n"; + + if (!paf_format_else_sam) { + for (auto& patch_aln : multi_patch_alns) { + write_alignment_sam( + out, patch_aln, query_name, query_total_length, + query_offset, query_length, patch_aln.is_rev, + target_name, target_total_length, target_offset, target_length, + min_identity, mashmap_estimated_identity, + no_seq_in_sam, emit_md_tag, query, target, target_pointer_shift); + } + } else { + // write how many reverse complement alignments were found + //std::cerr << "got " << rev_patch_alns.size() << " rev patch alns" << std::endl; + for (auto& patch_aln : multi_patch_alns) { + bool rev_query_is_rev = !query_is_rev; // Flip the orientation + write_alignment_paf( + out, + patch_aln, + query_name, + query_total_length, + query_offset, + query_length, + rev_query_is_rev, // Use the flipped orientation + target_name, + target_total_length, + target_offset, + target_length, + min_identity, + mashmap_estimated_identity, + false, // Don't add an endline after each alignment + true); // This is a reverse complement alignment + // write tag indicating that this is a multipatch alignment + out << "\t" << "pt:Z:true" << "\t" + // and if the patch is inverted as well + << "\t" << "iv:Z:" << (patch_aln.is_rev ? "true" : "false") << "\n"; + } } out << std::flush; } -void write_alignment( +void write_tag_and_md_string( + std::ostream &out, + const char *cigar_ops, + const int cigar_start, + const int cigar_end, + const int target_start, + const char *target, + const int64_t target_offset, + const int64_t target_pointer_shift) { + + out << "MD:Z:"; + + char last_op = '\0'; + int last_len = 0; + int t_off = target_start, l_MD = 0; + int l = cigar_start; + int x = cigar_start; + + while (x < cigar_end) { + while (x < cigar_end && isdigit(cigar_ops[x])) + ++x; + char op = cigar_ops[x]; + int len = 0; + std::from_chars(cigar_ops + l, cigar_ops + x, len); + l = ++x; + if (last_len) { + if (last_op == op) { + len += last_len; + } else { + if (last_op == '=' || last_op == 'M') { + l_MD += last_len; + t_off += last_len; + } else if (last_op == 'X') { + for (uint64_t ii = 0; ii < last_len; ++ii) { + out << l_MD << target[t_off + ii - target_pointer_shift]; + l_MD = 0; + } + t_off += last_len; + } else if (last_op == 'D') { + out << l_MD << "^"; + for (uint64_t ii = 0; ii < last_len; ++ii) { + out << target[t_off + ii - target_pointer_shift]; + } + l_MD = 0; + t_off += last_len; + } + } + } + last_op = op; + last_len = len; + } + + if (last_len) { + if (last_op == '=' || last_op == 'M') { + out << last_len + l_MD; + } else if (last_op == 'X') { + for (uint64_t ii = 0; ii < last_len; ++ii) { + out << l_MD << target[t_off + ii - target_pointer_shift]; + l_MD = 0; + } + out << "0"; + } else if (last_op == 'I') { + out << l_MD; + } else if (last_op == 'D') { + out << l_MD << "^"; + for (uint64_t ii = 0; ii < last_len; ++ii) { + out << target[t_off + ii - target_pointer_shift]; + } + out << "0"; + } + } +} + +void write_alignment_sam( + std::ostream &out, + const alignment_t& patch_aln, + const std::string& query_name, + const uint64_t& query_total_length, + const uint64_t& query_offset, + const uint64_t& query_length, + const bool& query_is_rev, + const std::string& target_name, + const uint64_t& target_total_length, + const uint64_t& target_offset, + const uint64_t& target_length, + const float& min_identity, + const float& mashmap_estimated_identity, + const bool& no_seq_in_sam, + const bool& emit_md_tag, + const char* query, + const char* target, + const int64_t& target_pointer_shift) { + + uint64_t patch_matches = 0; + uint64_t patch_mismatches = 0; + uint64_t patch_insertions = 0; + uint64_t patch_inserted_bp = 0; + uint64_t patch_deletions = 0; + uint64_t patch_deleted_bp = 0; + uint64_t patch_refAlignedLength = 0; + uint64_t patch_qAlignedLength = 0; + + char* patch_cigar = wfa_alignment_to_cigar( + &patch_aln.edit_cigar, patch_refAlignedLength, patch_qAlignedLength, + patch_matches, patch_mismatches, patch_insertions, patch_inserted_bp, + patch_deletions, patch_deleted_bp); + + double patch_gap_compressed_identity = (double)patch_matches / + (double)(patch_matches + patch_mismatches + patch_insertions + patch_deletions); + double patch_block_identity = (double)patch_matches / + (double)(patch_matches + patch_mismatches + patch_inserted_bp + patch_deleted_bp); + + /* + const uint64_t query_start_pos = + (query_is_rev ? query_offset + query_length : query_offset); + const uint64_t query_end_pos = + (query_is_rev ? query_offset : query_offset + query_length); + */ + + if (patch_gap_compressed_identity >= min_identity) { + + out << query_name << "\t" + << (query_is_rev ? "16" : "0") << "\t" + << target_name << "\t" + << target_offset + patch_aln.i + 1 << "\t" + << std::round(float2phred(1.0 - patch_block_identity)) << "\t" + << patch_cigar << "\t" + << "*\t0\t0\t"; + + if (no_seq_in_sam) { + out << "*"; + } else { + std::stringstream seq; + for (uint64_t p = patch_aln.j; p < patch_aln.j + patch_aln.query_length; ++p) { + seq << query[p]; + } + if (query_is_rev) { + // reverse complement + out << reverse_complement(seq.str()); + } else { + out << seq.str(); + } + } + out << "\t*\t" + << "NM:i:" << (patch_mismatches + patch_inserted_bp + patch_deleted_bp) << "\t" + << "gi:f:" << patch_gap_compressed_identity << "\t" + << "bi:f:" << patch_block_identity << "\t" + << "md:f:" << mashmap_estimated_identity << "\t" + << "pt:Z:true" << "\t" + << "iv:Z:" << (patch_aln.is_rev ? "true" : "false"); + + if (emit_md_tag) { + out << "\t"; + write_tag_and_md_string(out, patch_cigar, 0, strlen(patch_cigar), + target_offset + patch_aln.i, + target, target_offset, target_pointer_shift); + } + + out << "\n"; + } + + free(patch_cigar); +} + +void write_alignment_paf( std::ostream& out, const alignment_t& aln, const std::string& query_name, diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index f5dc4c5a..e7f6892a 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "dna.hpp" @@ -114,7 +115,35 @@ namespace wflign { std::ostream* out_patching_tsv, #endif const bool& with_endline = true); - void write_alignment( + void write_tag_and_md_string( + std::ostream &out, + const char *cigar_ops, + const int cigar_start, + const int cigar_end, + const int target_start, + const char *target, + const int64_t target_offset, + const int64_t target_pointer_shift); + void write_alignment_sam( + std::ostream &out, + const alignment_t& patch_aln, + const std::string& query_name, + const uint64_t& query_total_length, + const uint64_t& query_offset, + const uint64_t& query_length, + const bool& query_is_rev, + const std::string& target_name, + const uint64_t& target_total_length, + const uint64_t& target_offset, + const uint64_t& target_length, + const float& min_identity, + const float& mashmap_estimated_identity, + const bool& no_seq_in_sam, + const bool& emit_md_tag, + const char* query, + const char* target, + const int64_t& target_pointer_shift); + void write_alignment_paf( std::ostream& out, const alignment_t& aln, const std::string& query_name, From 631cbbe729b68bea9c4ec2e2fe33a7823b1f4bb1 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 28 Jul 2024 16:58:58 +0200 Subject: [PATCH 09/14] going the right way in both orientations --- src/common/wflign/src/wflign_patch.cpp | 42 +++++++++++++++++++++----- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 2d357971..6162054e 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -2047,7 +2047,7 @@ query_start : query_end) for (auto& patch_aln : multi_patch_alns) { write_alignment_sam( out, patch_aln, query_name, query_total_length, - query_offset, query_length, patch_aln.is_rev, + query_offset, query_length, query_is_rev, target_name, target_total_length, target_offset, target_length, min_identity, mashmap_estimated_identity, no_seq_in_sam, emit_md_tag, query, target, target_pointer_shift); @@ -2056,7 +2056,6 @@ query_start : query_end) // write how many reverse complement alignments were found //std::cerr << "got " << rev_patch_alns.size() << " rev patch alns" << std::endl; for (auto& patch_aln : multi_patch_alns) { - bool rev_query_is_rev = !query_is_rev; // Flip the orientation write_alignment_paf( out, patch_aln, @@ -2064,7 +2063,7 @@ query_start : query_end) query_total_length, query_offset, query_length, - rev_query_is_rev, // Use the flipped orientation + query_is_rev, target_name, target_total_length, target_offset, @@ -2253,7 +2252,7 @@ void write_alignment_paf( const uint64_t& query_total_length, const uint64_t& query_offset, // query offset on the forward strand const uint64_t& query_length, // used to compute the coordinates for reversed alignments - const bool& query_is_rev, + const bool& query_is_rev, // if the base homology mapping is in the reverse complement orientation const std::string& target_name, const uint64_t& target_total_length, const uint64_t& target_offset, @@ -2288,16 +2287,45 @@ void write_alignment_paf( // PAF standard) if (gap_compressed_identity >= min_identity) { - uint64_t q_start; + //uint64_t q_start = query_offset; + /* if (query_is_rev && !is_rev_patch) { q_start = query_offset + (query_length - (aln.j + qAlignedLength)); } else { q_start = query_offset + aln.j; + }*/ + /* + << query_offset + + (query_is_rev ? query_length - query_end : query_start) + << "\t" + << query_offset + + (query_is_rev ? query_length - query_start : query_end) + */ + std::cerr << "query_is_rev: " << query_is_rev + << ", aln.is_rev: " << aln.is_rev + << ", query_offset: " << query_offset + << ", query_length: " << query_length + << ", aln.j: " << aln.j + << ", qAlignedLength: " << qAlignedLength + << ", aln.query_length: " << aln.query_length + << std::endl; + // four cases + // 1. query is forward, alignment is forward + // 2. query is forward, alignment is reverse + // 3. query is reverse, alignment is forward + // 4. query is reverse, alignment is reverse + uint64_t q_start, q_end; + if (query_is_rev) { + q_start = query_offset + (query_length - aln.j - qAlignedLength); + q_end = query_offset + (query_length - aln.j); + } else { + q_start = query_offset + aln.j; + q_end = query_offset + aln.j + qAlignedLength; } out << query_name << "\t" << query_total_length << "\t" << q_start - << "\t" << q_start + qAlignedLength << "\t" - << (query_is_rev ? "-" : "+") << "\t" << target_name << "\t" + << "\t" << q_end << "\t" + << (aln.is_rev ^ query_is_rev ? "-" : "+") << "\t" << target_name << "\t" << target_total_length << "\t" << target_offset + alignmentRefPos << "\t" << target_offset + alignmentRefPos + refAlignedLength << "\t" From ed507d67641f79d942e7868532774ddc8ac08182 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 28 Jul 2024 17:00:06 +0200 Subject: [PATCH 10/14] cleanup debug --- src/common/wflign/src/wflign_patch.cpp | 30 -------------------------- 1 file changed, 30 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 6162054e..99402ed5 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -2283,38 +2283,8 @@ void write_alignment_paf( double block_identity = (double)matches / (double)(matches + mismatches + inserted_bp + deleted_bp); - // convert our coordinates to be relative to forward strand (matching - // PAF standard) if (gap_compressed_identity >= min_identity) { - //uint64_t q_start = query_offset; - /* - if (query_is_rev && !is_rev_patch) { - q_start = - query_offset + (query_length - (aln.j + qAlignedLength)); - } else { - q_start = query_offset + aln.j; - }*/ - /* - << query_offset + - (query_is_rev ? query_length - query_end : query_start) - << "\t" - << query_offset + - (query_is_rev ? query_length - query_start : query_end) - */ - std::cerr << "query_is_rev: " << query_is_rev - << ", aln.is_rev: " << aln.is_rev - << ", query_offset: " << query_offset - << ", query_length: " << query_length - << ", aln.j: " << aln.j - << ", qAlignedLength: " << qAlignedLength - << ", aln.query_length: " << aln.query_length - << std::endl; - // four cases - // 1. query is forward, alignment is forward - // 2. query is forward, alignment is reverse - // 3. query is reverse, alignment is forward - // 4. query is reverse, alignment is reverse uint64_t q_start, q_end; if (query_is_rev) { q_start = query_offset + (query_length - aln.j - qAlignedLength); From 79e231397f5305b4f1bd5bc92ae843d28409ed5e Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 28 Jul 2024 17:13:25 +0200 Subject: [PATCH 11/14] get correct orientation in sam for patch alignments --- src/common/wflign/src/wflign_patch.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 99402ed5..60de95c2 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -2203,7 +2203,7 @@ void write_alignment_sam( if (patch_gap_compressed_identity >= min_identity) { out << query_name << "\t" - << (query_is_rev ? "16" : "0") << "\t" + << (query_is_rev ^ patch_aln.is_rev ? "16" : "0") << "\t" << target_name << "\t" << target_offset + patch_aln.i + 1 << "\t" << std::round(float2phred(1.0 - patch_block_identity)) << "\t" @@ -2217,7 +2217,7 @@ void write_alignment_sam( for (uint64_t p = patch_aln.j; p < patch_aln.j + patch_aln.query_length; ++p) { seq << query[p]; } - if (query_is_rev) { + if (patch_aln.is_rev) { // reverse complement out << reverse_complement(seq.str()); } else { From 5916ca0ef29be90b898cb16f35641538c2dcfa64 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Mon, 29 Jul 2024 19:59:40 +0200 Subject: [PATCH 12/14] almost progressively work on the largest unaligned query/target chunk --- src/common/wflign/src/wflign_patch.cpp | 218 +++++++++++++++++++++++-- 1 file changed, 207 insertions(+), 11 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 60de95c2..da33d0cc 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -340,7 +340,7 @@ struct AlignmentBounds { int64_t target_end_offset; }; -AlignmentBounds find_alignment_bounds(const alignment_t& aln) { +AlignmentBounds ok_find_alignment_bounds(const alignment_t& aln, const int& erode_k) { AlignmentBounds bounds; bounds.query_start_offset = 0; bounds.query_end_offset = 0; @@ -349,32 +349,187 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln) { int64_t query_pos = 0; int64_t target_pos = 0; - bool found_first_match = false; + int match_count = 0; + int reverse_match_count = 0; + bool found_start = false; + // Forward pass to find start bounds for (int i = aln.edit_cigar.begin_offset; i < aln.edit_cigar.end_offset; ++i) { char op = aln.edit_cigar.cigar_ops[i]; switch (op) { case 'M': + case '=': + ++match_count; + if (match_count >= erode_k && !found_start) { + bounds.query_start_offset = query_pos; + bounds.target_start_offset = target_pos; + found_start = true; + } + ++query_pos; + ++target_pos; + break; case 'X': - if (!found_first_match) { + match_count = 0; + ++query_pos; + ++target_pos; + break; + case 'I': + match_count = 0; + ++query_pos; + break; + case 'D': + match_count = 0; + ++target_pos; + break; + } + } + + // Reverse pass to find end bounds + query_pos = aln.query_length - 1; + target_pos = aln.target_length - 1; + for (int i = aln.edit_cigar.end_offset - 1; i >= aln.edit_cigar.begin_offset; --i) { + char op = aln.edit_cigar.cigar_ops[i]; + switch (op) { + case 'M': + case '=': + ++reverse_match_count; + if (reverse_match_count >= erode_k) { + bounds.query_end_offset = query_pos + 1; + bounds.target_end_offset = target_pos + 1; + return bounds; + } + --query_pos; + --target_pos; + break; + case 'X': + reverse_match_count = 0; + --query_pos; + --target_pos; + break; + case 'I': + reverse_match_count = 0; + --query_pos; + break; + case 'D': + reverse_match_count = 0; + --target_pos; + break; + } + } + + // If we didn't find end bounds, set them to the end of the alignment + if (bounds.query_end_offset == 0) { + bounds.query_end_offset = aln.query_length; + bounds.target_end_offset = aln.target_length; + } + + return bounds; +} + +AlignmentBounds find_alignment_bounds(const alignment_t& aln, const int& erode_k) { + AlignmentBounds bounds; + bounds.query_start_offset = 0; + bounds.query_end_offset = 0; + bounds.target_start_offset = 0; + bounds.target_end_offset = 0; + + int64_t query_pos = 0; + int64_t target_pos = 0; + int match_count = 0; + bool found_start = false; + bool found_end = false; + + // Forward pass + for (int i = aln.edit_cigar.begin_offset; i < aln.edit_cigar.end_offset; ++i) { + char op = aln.edit_cigar.cigar_ops[i]; + switch (op) { + case 'M': + case '=': + ++match_count; + if (match_count >= erode_k && !found_start) { bounds.query_start_offset = query_pos; bounds.target_start_offset = target_pos; - found_first_match = true; + found_start = true; } ++query_pos; ++target_pos; - bounds.query_end_offset = query_pos; - bounds.target_end_offset = target_pos; + break; + case 'X': + match_count = 0; + ++query_pos; + ++target_pos; break; case 'I': + match_count = 0; ++query_pos; break; case 'D': + match_count = 0; ++target_pos; break; } } + // Reverse pass + query_pos = aln.query_length - 1; + target_pos = aln.target_length - 1; + match_count = 0; + + for (int i = aln.edit_cigar.end_offset - 1; i >= aln.edit_cigar.begin_offset; --i) { + char op = aln.edit_cigar.cigar_ops[i]; + switch (op) { + case 'M': + case '=': + ++match_count; + if (match_count >= erode_k && !found_end) { + bounds.query_end_offset = query_pos + 1; + bounds.target_end_offset = target_pos + 1; + found_end = true; + } + --query_pos; + --target_pos; + break; + case 'X': + match_count = 0; + --query_pos; + --target_pos; + break; + case 'I': + match_count = 0; + --query_pos; + break; + case 'D': + match_count = 0; + --target_pos; + break; + } + } + + // If we didn't find bounds, set them to the full alignment length + if (!found_start) { + bounds.query_start_offset = 0; + bounds.target_start_offset = 0; + } else { + // heuristic: subtract erode_k + //bounds.query_start_offset = std::max((int64_t)0, bounds.query_start_offset - erode_k); + //bounds.target_start_offset = std::max((int64_t)0, bounds.target_start_offset - erode_k); + } + if (!found_end) { + bounds.query_end_offset = aln.query_length; + bounds.target_end_offset = aln.target_length; + } else { + // heuristic: add erode_k + //bounds.query_end_offset = std::min((int64_t)aln.query_length, bounds.query_end_offset + erode_k); + //bounds.target_end_offset = std::min((int64_t)aln.target_length, bounds.target_end_offset + erode_k); + } + + // Adjust bounds for reverse complement alignments + if (aln.is_rev) { + std::swap(bounds.query_start_offset, bounds.query_end_offset); + bounds.query_start_offset = aln.query_length - bounds.query_start_offset; + bounds.query_end_offset = aln.query_length - bounds.query_end_offset; + } + return bounds; } @@ -432,12 +587,12 @@ std::vector do_progressive_wfa_patch_alignment( const uint64_t query_end = query_start + query_length; const uint64_t target_end = target_start + target_length; - //std::cerr << "BEGIN do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; + std::cerr << "BEGIN do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; bool first = true; while (first || remaining_query_length >= min_inversion_length && remaining_target_length >= min_inversion_length) { first = false; - //std::cerr << "do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; + std::cerr << "do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; alignment_t aln, rev_aln; aln.is_rev = false; rev_aln.is_rev = true; @@ -456,8 +611,8 @@ std::vector do_progressive_wfa_patch_alignment( max_patching_score, min_inversion_length); - //std::cerr << "WFA fwd alignment: " << aln << std::endl; - //std::cerr << "WFA rev alignment: " << rev_aln << std::endl; + std::cerr << "WFA fwd alignment: " << aln << std::endl; + std::cerr << "WFA rev alignment: " << rev_aln << std::endl; if (rev_aln.ok && (!aln.ok || rev_aln.score < aln.score)) { alignments.push_back(rev_aln); @@ -489,7 +644,8 @@ std::vector do_progressive_wfa_patch_alignment( break; } auto& chosen_aln = alignments.back(); - auto bounds = find_alignment_bounds(chosen_aln); + auto bounds = find_alignment_bounds(chosen_aln, erode_k); + std::cerr << "bounds: " << bounds.query_start_offset << " " << bounds.query_end_offset << " " << bounds.target_start_offset << " " << bounds.target_end_offset << std::endl; #ifdef VALIDATE_WFA_WFLIGN { @@ -514,6 +670,7 @@ std::vector do_progressive_wfa_patch_alignment( auto last_target_start = current_target_start; // Update the start positions and remaining lengths for the next iteration + /* if (chosen_aln.is_rev) { current_query_start += bounds.query_start_offset; } else { @@ -522,6 +679,45 @@ std::vector do_progressive_wfa_patch_alignment( current_target_start += bounds.target_end_offset; remaining_query_length = query_end - current_query_start; remaining_target_length = target_end - current_target_start; + */ + // instead of the above, we want to progressively find the largest incomplete region to align + // we should work with total area of the alignment over query and target to determine what to do + // this will change the way we update tracking variables for the next iteration + // basically: we take the larger of the two regions that will be at the start and end of the alignment + // if there isn't any remaining target or query sequence on one end, obviously it's not an option + // if there is, we should align the largest possible region that is still incomplete + /// .... + // Calculate the sizes of unaligned regions using only the bounds + uint64_t left_query_size = bounds.query_start_offset; + uint64_t right_query_size = remaining_query_length - bounds.query_end_offset; + uint64_t left_target_size = bounds.target_start_offset; + uint64_t right_target_size = remaining_target_length - bounds.target_end_offset; + + uint64_t max_left_size = std::max(left_query_size, left_target_size); + uint64_t max_right_size = std::max(right_query_size, right_target_size); + + std::cerr << "left_query_size: " << left_query_size << " left_target_size: " << left_target_size << std::endl + << "right_query_size: " << right_query_size << " right_target_size: " << right_target_size << std::endl + << "max_left_size: " << max_left_size << " max_right_size: " << max_right_size << std::endl; + + if (max_left_size >= max_right_size && max_left_size > 0) { + std::cerr << "aligning left region" << std::endl + << "left_query_size: " << left_query_size << " left_target_size: " << left_target_size << std::endl; + // Align the left region + remaining_query_length = left_query_size; + remaining_target_length = left_target_size; + // current_query_start and current_target_start remain unchanged + } else if (max_right_size > 0) { + std::cerr << "aligning right region" << std::endl + << "right_query_size: " << right_query_size << " right_target_size: " << right_target_size << std::endl; + // Align the right region + current_query_start += bounds.query_end_offset; + current_target_start += bounds.target_end_offset; + remaining_query_length = right_query_size; + remaining_target_length = right_target_size; + } else { + break; // No more regions to align + } } //std::cerr << "END do_progressive_wfa_patch_alignment got " << alignments.size() << " alignments" << std::endl; return alignments; From 5ee879689744f6361ef7d6440209396fa1297451 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 30 Jul 2024 10:55:59 +0200 Subject: [PATCH 13/14] recursive progressive patching of largest remaining chunks --- src/common/wflign/src/wflign_patch.cpp | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index da33d0cc..85863e39 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -438,6 +438,7 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln, const int& erode_k int match_count = 0; bool found_start = false; bool found_end = false; + std::cerr << "erode_k = " << erode_k << std::endl; // Forward pass for (int i = aln.edit_cigar.begin_offset; i < aln.edit_cigar.end_offset; ++i) { @@ -455,16 +456,16 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln, const int& erode_k ++target_pos; break; case 'X': - match_count = 0; + //match_count = 0; ++query_pos; ++target_pos; break; case 'I': - match_count = 0; + //match_count = 0; ++query_pos; break; case 'D': - match_count = 0; + //match_count = 0; ++target_pos; break; } @@ -490,16 +491,16 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln, const int& erode_k --target_pos; break; case 'X': - match_count = 0; + //match_count = 0; --query_pos; --target_pos; break; case 'I': - match_count = 0; + //match_count = 0; --query_pos; break; case 'D': - match_count = 0; + //match_count = 0; --target_pos; break; } @@ -511,16 +512,16 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln, const int& erode_k bounds.target_start_offset = 0; } else { // heuristic: subtract erode_k - //bounds.query_start_offset = std::max((int64_t)0, bounds.query_start_offset - erode_k); - //bounds.target_start_offset = std::max((int64_t)0, bounds.target_start_offset - erode_k); + bounds.query_start_offset = std::max((int64_t)0, bounds.query_start_offset - erode_k); + bounds.target_start_offset = std::max((int64_t)0, bounds.target_start_offset - erode_k); } if (!found_end) { bounds.query_end_offset = aln.query_length; bounds.target_end_offset = aln.target_length; } else { // heuristic: add erode_k - //bounds.query_end_offset = std::min((int64_t)aln.query_length, bounds.query_end_offset + erode_k); - //bounds.target_end_offset = std::min((int64_t)aln.target_length, bounds.target_end_offset + erode_k); + bounds.query_end_offset = std::min((int64_t)aln.query_length, bounds.query_end_offset + erode_k); + bounds.target_end_offset = std::min((int64_t)aln.target_length, bounds.target_end_offset + erode_k); } // Adjust bounds for reverse complement alignments @@ -644,7 +645,9 @@ std::vector do_progressive_wfa_patch_alignment( break; } auto& chosen_aln = alignments.back(); - auto bounds = find_alignment_bounds(chosen_aln, erode_k); + //auto bounds = ok_find_alignment_bounds(chosen_aln, erode_k); + //auto bounds = find_alignment_bounds(chosen_aln, erode_k); + auto bounds = find_alignment_bounds(chosen_aln, 7); // very light erosion of bounds on ends to avoid single-match starts and ends std::cerr << "bounds: " << bounds.query_start_offset << " " << bounds.query_end_offset << " " << bounds.target_start_offset << " " << bounds.target_end_offset << std::endl; #ifdef VALIDATE_WFA_WFLIGN From 48bea20783498886b93fef2340a0de0516a8ed40 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Tue, 30 Jul 2024 17:02:36 +0200 Subject: [PATCH 14/14] tweaking recursive progressive patch Chew back a tiny bit of the end of the alignment when finding bounds, which avoids getting confused by a single or handful of bp match at the beginning of the patch. Only do recursive patching when we hit inversions (skip if we get just one forward alignment) because we only get overlapping alignments when we repeat the forward alignment multiple times. --- src/common/wflign/src/wflign_patch.cpp | 27 +++++++++++--------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 85863e39..9382bbdd 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -438,7 +438,6 @@ AlignmentBounds find_alignment_bounds(const alignment_t& aln, const int& erode_k int match_count = 0; bool found_start = false; bool found_end = false; - std::cerr << "erode_k = " << erode_k << std::endl; // Forward pass for (int i = aln.edit_cigar.begin_offset; i < aln.edit_cigar.end_offset; ++i) { @@ -588,12 +587,12 @@ std::vector do_progressive_wfa_patch_alignment( const uint64_t query_end = query_start + query_length; const uint64_t target_end = target_start + target_length; - std::cerr << "BEGIN do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; + //std::cerr << "BEGIN do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; bool first = true; while (first || remaining_query_length >= min_inversion_length && remaining_target_length >= min_inversion_length) { first = false; - std::cerr << "do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; + //std::cerr << "do_progressive_wfa_patch_alignment: " << current_query_start << " " << remaining_query_length << " " << current_target_start << " " << remaining_target_length << std::endl; alignment_t aln, rev_aln; aln.is_rev = false; rev_aln.is_rev = true; @@ -612,14 +611,16 @@ std::vector do_progressive_wfa_patch_alignment( max_patching_score, min_inversion_length); - std::cerr << "WFA fwd alignment: " << aln << std::endl; - std::cerr << "WFA rev alignment: " << rev_aln << std::endl; + //std::cerr << "WFA fwd alignment: " << aln << std::endl; + //std::cerr << "WFA rev alignment: " << rev_aln << std::endl; if (rev_aln.ok && (!aln.ok || rev_aln.score < aln.score)) { alignments.push_back(rev_aln); } else if (aln.ok) { alignments.push_back(aln); - //break; + if (alignments.size() == 1) { + break; + } } #ifdef VALIDATE_WFA_WFLIGN @@ -645,10 +646,8 @@ std::vector do_progressive_wfa_patch_alignment( break; } auto& chosen_aln = alignments.back(); - //auto bounds = ok_find_alignment_bounds(chosen_aln, erode_k); - //auto bounds = find_alignment_bounds(chosen_aln, erode_k); auto bounds = find_alignment_bounds(chosen_aln, 7); // very light erosion of bounds on ends to avoid single-match starts and ends - std::cerr << "bounds: " << bounds.query_start_offset << " " << bounds.query_end_offset << " " << bounds.target_start_offset << " " << bounds.target_end_offset << std::endl; + //std::cerr << "bounds: " << bounds.query_start_offset << " " << bounds.query_end_offset << " " << bounds.target_start_offset << " " << bounds.target_end_offset << std::endl; #ifdef VALIDATE_WFA_WFLIGN { @@ -699,20 +698,16 @@ std::vector do_progressive_wfa_patch_alignment( uint64_t max_left_size = std::max(left_query_size, left_target_size); uint64_t max_right_size = std::max(right_query_size, right_target_size); - std::cerr << "left_query_size: " << left_query_size << " left_target_size: " << left_target_size << std::endl - << "right_query_size: " << right_query_size << " right_target_size: " << right_target_size << std::endl - << "max_left_size: " << max_left_size << " max_right_size: " << max_right_size << std::endl; + //std::cerr << "left_query_size: " << left_query_size << " left_target_size: " << left_target_size << std::endl + // << "right_query_size: " << right_query_size << " right_target_size: " << right_target_size << std::endl + // << "max_left_size: " << max_left_size << " max_right_size: " << max_right_size << std::endl; if (max_left_size >= max_right_size && max_left_size > 0) { - std::cerr << "aligning left region" << std::endl - << "left_query_size: " << left_query_size << " left_target_size: " << left_target_size << std::endl; // Align the left region remaining_query_length = left_query_size; remaining_target_length = left_target_size; // current_query_start and current_target_start remain unchanged } else if (max_right_size > 0) { - std::cerr << "aligning right region" << std::endl - << "right_query_size: " << right_query_size << " right_target_size: " << right_target_size << std::endl; // Align the right region current_query_start += bounds.query_end_offset; current_target_start += bounds.target_end_offset;