From 565938031c6c50b92f083ee0eb96cb6a1c433396 Mon Sep 17 00:00:00 2001 From: Erik Garrison Date: Sun, 25 Aug 2024 14:25:58 +0200 Subject: [PATCH] global head/tail patching works with quirks --- src/common/wflign/src/wflign_patch.cpp | 109 ++++++++++++++++++++----- src/common/wflign/src/wflign_patch.hpp | 3 +- src/map/include/computeMap.hpp | 8 +- 3 files changed, 95 insertions(+), 25 deletions(-) diff --git a/src/common/wflign/src/wflign_patch.cpp b/src/common/wflign/src/wflign_patch.cpp index 2cd13675..e896070a 100644 --- a/src/common/wflign/src/wflign_patch.cpp +++ b/src/common/wflign/src/wflign_patch.cpp @@ -721,6 +721,68 @@ std::vector do_progressive_wfa_patch_alignment( return alignments; } +void erode_head(std::vector& unpatched, uint64_t& query_pos, uint64_t& target_pos, int erode_k) { + int match_count = 0; + auto it = unpatched.begin(); + uint64_t query_erased = 0, target_erased = 0; + + while (it != unpatched.end()) { + if (*it == 'M' || *it == 'X') { + match_count++; + if (match_count >= erode_k) { + break; + } + query_pos++; + target_pos++; + query_erased++; + target_erased++; + } else { + //match_count = 0; + if (*it == 'I') { + query_pos++; + query_erased++; + } + if (*it == 'D') { + target_pos++; + target_erased++; + } + } + it++; + } + + std::cerr << "erode_head: eroded " << query_erased << " query and " << target_erased << " target" << std::endl; + // Erase the eroded part + unpatched.erase(unpatched.begin(), it); +} + +void erode_tail(std::vector& unpatched, uint64_t& query_end, uint64_t& target_end, int erode_k) { + int match_count = 0; + auto it = unpatched.rbegin(); + uint64_t q_offset = 0, t_offset = 0; + + while (it != unpatched.rend()) { + if (*it == 'M' || *it == 'X') { + match_count++; + if (match_count >= erode_k) { + break; + } + q_offset++; + t_offset++; + } else { + //match_count = 0; + if (*it == 'I') q_offset++; + if (*it == 'D') t_offset++; + } + it++; + } + + // Erase the eroded part + std::cerr << "erode_tail: eroded " << q_offset << " query and " << t_offset << " target" << std::endl; + unpatched.erase(it.base(), unpatched.end()); + query_end -= q_offset; + target_end -= t_offset; +} + void write_merged_alignment( std::ostream &out, const std::vector &trace, @@ -859,20 +921,20 @@ void write_merged_alignment( }; auto patching = [&query, &query_name, &query_length, &query_start, - &query_offset, &target, &target_name, - &target_length, &target_start, &target_offset, - &target_total_length, &target_end, - &target_pointer_shift, - &wflign_max_len_major, - &wflign_max_len_minor, - &distance_close_big_enough_indels, &min_wf_length, - &max_dist_threshold, &wf_aligner, - &multi_patch_alns, - &convex_penalties, - &chain_gap, &max_patching_score, &min_inversion_length, &erode_k + &query_offset, &query_end, &target, &target_name, + &target_length, &target_start, &target_offset, + &target_total_length, &target_end, + &target_pointer_shift, + &wflign_max_len_major, + &wflign_max_len_minor, + &distance_close_big_enough_indels, &min_wf_length, + &max_dist_threshold, &wf_aligner, + &multi_patch_alns, + &convex_penalties, + &chain_gap, &max_patching_score, &min_inversion_length, &erode_k #ifdef WFA_PNG_TSV_TIMING - ,&emit_patching_tsv, - &out_patching_tsv + ,&emit_patching_tsv, + &out_patching_tsv #endif ](std::vector &unpatched, std::vector &patched, @@ -883,14 +945,20 @@ void write_merged_alignment( auto q = unpatched.begin(); - uint64_t query_pos = query_start; - uint64_t target_pos = target_start; - uint64_t query_delta = 0; uint64_t target_delta = 0; bool got_alignment = false; + // trim back small matches at the start + erode_head(unpatched, query_start, target_start, 7); + + uint64_t query_pos = query_start; + uint64_t target_pos = target_start; + + // Trim spurious matches off the tail of the alignment + erode_tail(unpatched, query_end, target_end, 7); + // Head patching if (query_start > 0 || target_start > 0) { alignment_t head_aln; @@ -1262,7 +1330,7 @@ void write_merged_alignment( query_length - query_pos, target, target_pos, - target_length - target_pos, + std::min(target_length, target_length - target_pos + (query_length - query_pos)), wf_aligner, convex_penalties, tail_aln, @@ -1281,7 +1349,6 @@ void write_merged_alignment( } } - #ifdef WFLIGN_DEBUG std::cerr << "[wflign::wflign_affine_wavefront] got unsorted " "patched traceback: "; @@ -1486,7 +1553,7 @@ void write_merged_alignment( std::cerr << std::endl; #endif -#ifdef VALIDATE_WFA_WFLIGN +//#ifdef VALIDATE_WFA_WFLIGN // std::cerr << "query_length: " << query_length << std::endl; // std::cerr << "target_length: " << target_length << // std::endl; std::cerr << "query_start: " << query_start << @@ -1494,7 +1561,7 @@ void write_merged_alignment( // std::endl; if (!validate_trace(tracev, query, target - target_pointer_shift, - query_length, target_length, query_start, + query_length, target_length + 2 * wflign_max_len_minor, query_start, target_start)) { std::cerr << "cigar failure at alignment (before head/tail del trimming) " @@ -1510,7 +1577,7 @@ void write_merged_alignment( << target_offset + target_end << std::endl; exit(1); } -#endif +//#endif // trim deletions at start and end of tracev uint64_t begin_offset; diff --git a/src/common/wflign/src/wflign_patch.hpp b/src/common/wflign/src/wflign_patch.hpp index e7f6892a..80530a54 100644 --- a/src/common/wflign/src/wflign_patch.hpp +++ b/src/common/wflign/src/wflign_patch.hpp @@ -59,7 +59,8 @@ namespace wflign { alignment_t& rev_aln, const int64_t& chain_gap, const int& max_patching_score, - const uint64_t& min_inversion_length); + const uint64_t& min_inversion_length, + bool ends_free); void trim_alignment(alignment_t& aln); std::vector do_progressive_wfa_patch_alignment( const char* query, diff --git a/src/map/include/computeMap.hpp b/src/map/include/computeMap.hpp index b314f1c6..0d94d723 100644 --- a/src/map/include/computeMap.hpp +++ b/src/map/include/computeMap.hpp @@ -1698,7 +1698,7 @@ namespace skch // tweak start and end positions of consecutive mappings // TODO: XXX double check that the consecutive mappings are not overlapping!!! // extra: force global alignment by patching head and tail to mapping start and end coordinates - adjustConsecutiveMappings(it, it_end, param.segLength); + //adjustConsecutiveMappings(it, it_end, param.segLength); // First pass: Mark cuttable positions const int consecutive_mappings_window = 4; // Configurable parameter @@ -1716,13 +1716,15 @@ namespace skch if (current == it || current == std::prev(it_end)) { continue; } - if (current->queryStartPos - std::prev(current)->queryEndPos > param.segLength / 2 - || current->refStartPos - std::prev(current)->refEndPos > param.segLength / 2) { + if (current->queryStartPos - std::prev(current)->queryEndPos > param.segLength / 5 + || current->refStartPos - std::prev(current)->refEndPos > param.segLength / 5) { is_cuttable[std::distance(it, current) - 1] = false; is_cuttable[std::distance(it, current)] = false; } } + adjustConsecutiveMappings(it, it_end, param.segLength); + auto fragment_start = it; auto current = it; offset_t accumulate_length = 0;