From d68b484ec36851026bad5253cd8d518fbb64bc72 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hannes=20P=C3=A9tur=20Eggertsson?= Date: Mon, 19 Oct 2020 09:51:01 +0000 Subject: [PATCH] v2.6 (#6) * Added option for extra call_only iteration. Added check for duplicated sample names. * Added coverage filter in the 'genotype_sv' subcommand. It is possible to turn the filter off with a new '--no_filter_on_coverage' command. * Another fix for a very rare bug in graph construction. Only affects very large-scale genotyping runs. WIP streamlined discovery. * Write CSI instead of TBI with the --csi option. * Added several per alt QC metrics to VCF output when calling genotype * optimization for vcf_merge * Added no_variant_overlapping option in force calling mode * updated how QD and QDalt are calculated * AAScore added in SNP/indel genotyping * bugfix: In very large genotyping runs some INFO fields can get larger than INT_MAX so I needed to change them to be strings instead, otherwise bcftools is not happy with me. --- CMakeLists.txt | 4 +- include/graphtyper/constants.hpp.in | 7 + .../graphtyper/graph/absolute_position.hpp | 2 +- include/graphtyper/graph/constructor.hpp | 5 + include/graphtyper/graph/haplotype.hpp | 4 +- include/graphtyper/typer/alignment.hpp | 39 +- include/graphtyper/typer/bucket.hpp | 67 + include/graphtyper/typer/caller.hpp | 9 + include/graphtyper/typer/event.hpp | 204 ++ include/graphtyper/typer/event_call.hpp | 32 + include/graphtyper/typer/genotype_paths.hpp | 21 +- .../graphtyper/typer/logistic_constants.hpp | 72 +- include/graphtyper/typer/read.hpp | 83 + include/graphtyper/typer/sample_call.hpp | 12 +- include/graphtyper/typer/var_stats.hpp | 60 +- include/graphtyper/typer/variant.hpp | 5 +- include/graphtyper/typer/variant_call.hpp | 14 + include/graphtyper/typer/vcf_operations.hpp | 1 + include/graphtyper/utilities/genotype_sv.hpp | 5 +- .../utilities/hts_parallel_reader.hpp | 5 +- include/graphtyper/utilities/hts_reader.hpp | 16 +- include/graphtyper/utilities/options.hpp | 4 +- paw | 2 +- seqan | 2 +- src/CMakeLists.txt | 3 + src/graph/constructor.cpp | 111 +- src/graph/graph.cpp | 69 +- src/graph/haplotype.cpp | 95 +- src/graph/sv.cpp | 42 +- src/main.cpp | 49 +- src/typer/alignment.cpp | 372 +++- src/typer/bucket.cpp | 207 ++ src/typer/caller.cpp | 1657 ++++++++++++++++- src/typer/event.cpp | 410 ++++ src/typer/genotype_paths.cpp | 20 +- src/typer/read.cpp | 103 + src/typer/sample_call.cpp | 38 +- src/typer/var_stats.cpp | 217 ++- src/typer/variant.cpp | 559 ++++-- src/typer/variant_map.cpp | 4 +- src/typer/vcf.cpp | 75 +- src/typer/vcf_operations.cpp | 387 +--- src/typer/vcf_writer.cpp | 43 +- src/utilities/bamshrink.cpp | 39 +- src/utilities/genotype.cpp | 168 +- src/utilities/genotype_camou.cpp | 14 +- src/utilities/genotype_sv.cpp | 32 +- src/utilities/hts_parallel_reader.cpp | 175 +- src/utilities/hts_reader.cpp | 93 +- src/utilities/type_conversions.cpp | 139 +- 50 files changed, 4686 insertions(+), 1110 deletions(-) create mode 100644 include/graphtyper/typer/bucket.hpp create mode 100644 include/graphtyper/typer/event.hpp create mode 100644 include/graphtyper/typer/event_call.hpp create mode 100644 include/graphtyper/typer/read.hpp create mode 100644 include/graphtyper/typer/variant_call.hpp create mode 100644 src/typer/bucket.cpp create mode 100644 src/typer/event.cpp create mode 100644 src/typer/read.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index c273872..139d3b8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,8 +5,8 @@ include(ExternalProject) # 
The version number set (graphtyper_VERSION_MAJOR 2) -set (graphtyper_VERSION_MINOR 5) -set (graphtyper_VERSION_PATCH 1) +set (graphtyper_VERSION_MINOR 6) +set (graphtyper_VERSION_PATCH 0) set(STATIC_DIR "" CACHE STRING "If set, GraphTyper will be built as a static binary using libraries from the given STATIC_DIR.") # Get the current working branch diff --git a/include/graphtyper/constants.hpp.in b/include/graphtyper/constants.hpp.in index 73141fe..b5b7805 100644 --- a/include/graphtyper/constants.hpp.in +++ b/include/graphtyper/constants.hpp.in @@ -44,6 +44,13 @@ uint32_t constexpr MAX_NUM_LOCATIONS_PER_PATH = 256; uint16_t constexpr EPSILON_0_EXPONENT = 12; int32_t constexpr INSERT_SIZE_WHEN_NOT_PROPER_PAIR = 0x7FFFFFFFl; +// alignments scores and penalties +uint8_t constexpr SCORE_MATCH = 1; +uint8_t constexpr SCORE_MISMATCH = 4; +uint8_t constexpr SCORE_GAP_OPEN = 7; +uint8_t constexpr SCORE_GAP_EXTEND = 1; +uint8_t constexpr SCORE_CLIP = 5; + /** * Flags diff --git a/include/graphtyper/graph/absolute_position.hpp b/include/graphtyper/graph/absolute_position.hpp index 4c27fe8..67feb4d 100644 --- a/include/graphtyper/graph/absolute_position.hpp +++ b/include/graphtyper/graph/absolute_position.hpp @@ -10,7 +10,7 @@ namespace gyper { -class Contig; +struct Contig; class AbsolutePosition { diff --git a/include/graphtyper/graph/constructor.hpp b/include/graphtyper/graph/constructor.hpp index 151ebbc..efb33f1 100644 --- a/include/graphtyper/graph/constructor.hpp +++ b/include/graphtyper/graph/constructor.hpp @@ -19,6 +19,11 @@ void construct_graph(std::string const & reference_filename, bool use_absolute_positions = true, bool check_index = true); +void +open_and_read_reference_genome(std::vector & reference_sequence, + std::string const & reference_fn, + GenomicRegion const & genomic_region); + std::vector get_variants_using_tabix(std::string const & vcf, GenomicRegion const & genomic_region); diff --git a/include/graphtyper/graph/haplotype.hpp b/include/graphtyper/graph/haplotype.hpp index 8b56138..96815ea 100644 --- a/include/graphtyper/graph/haplotype.hpp +++ b/include/graphtyper/graph/haplotype.hpp @@ -115,11 +115,13 @@ class Haplotype bool is_low_qual, std::size_t mismatches); - void clipped_reads_to_stats(bool fully_aligned); + void clipped_reads_to_stats(int clipped_bp, int read_length); void mapq_to_stats(uint8_t mapq); // void realignment_to_stats(uint32_t original_pos, uint32_t new_pos); void strand_to_stats(uint16_t const flags); + void mismatches_to_stats(uint8_t mismatches, int read_length); + void score_diff_to_stats(uint8_t score_diff); void coverage_to_gts(std::size_t pn_index, bool is_proper_pair); std::bitset explain_to_path_explain(); diff --git a/include/graphtyper/typer/alignment.hpp b/include/graphtyper/typer/alignment.hpp index ed544c2..8d6e187 100644 --- a/include/graphtyper/typer/alignment.hpp +++ b/include/graphtyper/typer/alignment.hpp @@ -20,36 +20,37 @@ align_read(bam1_t * rec, seqan::IupacString const & rseq, gyper::PHIndex const & ph_index); +//#ifndef NDEBUG +//GenotypePaths * +//update_unpaired_read_paths(std::pair & geno_paths, +// seqan::IupacString const & seq, +// seqan::IupacString const & rseq, +// bam1_t * rec); +//#else GenotypePaths * update_unpaired_read_paths(std::pair & geno_paths, bam1_t * rec); -void -further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, - seqan::IupacString const & seq, - seqan::IupacString const & rseq, - bam1_t * rec - ); +//#endif -#ifndef NDEBUG void -update_paths(std::pair & geno_paths, - 
seqan::IupacString const & seq, - seqan::IupacString const & rseq, - bam1_t * rec); -#endif // NDEBUG +further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, bam1_t * rec); + +//#ifndef NDEBUG +//void +//update_paths(std::pair & geno_paths, +// seqan::IupacString const & seq, +// seqan::IupacString const & rseq, +// bam1_t * rec); +//#endif // NDEBUG -#ifdef NDEBUG +//#ifdef NDEBUG void update_paths(std::pair & geno_paths, bam1_t * rec); -#endif // DEBUG +//#endif // DEBUG void -further_update_paths_for_discovery(std::pair & geno_paths, - seqan::IupacString const & seq, - seqan::IupacString const & rseq, - bam1_t * rec - ); +further_update_paths_for_discovery(std::pair & geno_paths, bam1_t * rec); std::pair get_better_paths(std::pair & geno_paths1, diff --git a/include/graphtyper/typer/bucket.hpp b/include/graphtyper/typer/bucket.hpp new file mode 100644 index 0000000..834adf6 --- /dev/null +++ b/include/graphtyper/typer/bucket.hpp @@ -0,0 +1,67 @@ +#pragma once + +#include + +#include +#include + + +namespace gyper +{ + +class BucketFirstPass +{ +public: + int32_t global_max_pos_end{-1};// Max pos end of alignments in this bucket and all previous buckets + int32_t max_pos_end{-1}; // Max pos end of alignments in this bucket + + std::map snps; + Tindel_events indel_events; // type is std::map +}; + + +class Bucket +{ +public: + int32_t global_max_pos_end{-1};// Max pos end of alignments in this bucket and all previous buckets + int32_t max_pos_end{-1}; // Max pos end of alignments in this bucket + + Tindel_events indel_events; // type is std::map + std::vector reads; + + std::string to_string() const; +}; + + +bool +is_indel_in_bucket(std::vector const & buckets, + IndelEvent const & indel_event, + long const region_begin, + long const BUCKET_SIZE); + + +std::map::iterator +add_snp_event_to_bucket(std::vector & buckets, + SnpEvent && event, + long const region_begin, + long const BUCKET_SIZE); + + +template +Tindel_events::iterator +add_indel_event_to_bucket(std::vector & buckets, + IndelEvent && event, + long const region_begin, + long const BUCKET_SIZE, + std::vector const & reference_sequence, + long ref_offset); + +//void +//add_base_to_bucket(std::vector & buckets, +// int32_t pos, +// char seq, +// char qual, +// long const region_begin, +// long const BUCKET_SIZE); + +} // namespace gyper diff --git a/include/graphtyper/typer/caller.hpp b/include/graphtyper/typer/caller.hpp index c7484eb..96f8600 100644 --- a/include/graphtyper/typer/caller.hpp +++ b/include/graphtyper/typer/caller.hpp @@ -7,11 +7,13 @@ namespace gyper { +class Vcf; class Primers; // returns the prefix to the output files std::vector call(std::vector const & hts_path, + std::vector const & avg_cov_by_readlen, std::string const & graph_path, PHIndex const & ph_index, std::string const & output_dir, @@ -34,4 +36,11 @@ discover_directly_from_bam(std::string const & graph_path, long minimum_variant_support, double minimum_variant_support_ratio); +void +streamlined_discovery(std::vector const & hts_paths, + std::string const & reference_fn, + std::string const & region_str, + std::string const & output_dir, + gyper::Vcf & vcf); + } // namespace gyper diff --git a/include/graphtyper/typer/event.hpp b/include/graphtyper/typer/event.hpp new file mode 100644 index 0000000..cf51f7b --- /dev/null +++ b/include/graphtyper/typer/event.hpp @@ -0,0 +1,204 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + + +namespace gyper +{ + +std::array constexpr index2base = {'A', 'C', 'G', 'T'}; 
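[Editor's note] The new event.hpp fixes the base encoding used by the streamlined discovery code: index2base maps indices 0..3 back to A/C/G/T, while base2index (declared just below, presumably defined in the new src/typer/event.cpp, which is not shown here) is its inverse. As a rough, hedged illustration only — the real definition is not part of this hunk — the inverse mapping is assumed to look like this, with non-ACGT characters mapped to a negative sentinel:

// Editorial sketch, not part of the patch: assumed inverse of index2base.
long base2index(char const base)
{
  switch (base)
  {
  case 'A': return 0;
  case 'C': return 1;
  case 'G': return 2;
  case 'T': return 3;
  default:  return -1; // 'N' and other ambiguous bases carry no index
  }
}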
+ +long +base2index(char const base); + +struct BaseCount +{ + std::array acgt; + std::array acgt_qualsum; + int32_t deleted{0}; + int32_t unknown{0}; + + long get_depth_without_deleted() const; + long get_depth_with_deleted() const; + std::string to_string() const; + + void add_base(char seq, char qual); + +}; + + +class IndelEvent +{ +public: + // Event type is the same as the CIGAR mappings, that is: + // I for insertion + // D for deletion + // X for mismatches + // other cigar mappings are not used in this context + uint32_t pos{0}; // event is between pos-1 and pos in a 1-based system + char type{'\0'}; + std::vector sequence{}; + + IndelEvent() = default; + + IndelEvent(uint32_t _pos, char _type) + : pos(_pos) + , type(_type) + {} + + IndelEvent(uint32_t _pos, char _type, std::vector && _sequence) + : pos{_pos} + , type{_type} + , sequence{std::forward >(_sequence)} + {} + + inline std::string + to_string() const + { + std::ostringstream ss; + ss << pos << " " << type << ' ' << std::string(sequence.begin(), sequence.end()); + return ss.str(); + } + + + /********************* + * OPERATOR OVERLOAD * + *********************/ + bool operator==(IndelEvent const & b) const; + bool operator!=(IndelEvent const & b) const; + bool operator<(IndelEvent const & b) const; +}; + + +class SnpEvent +{ +public: + uint32_t pos{0}; + char base{'N'}; + + SnpEvent() = default; + SnpEvent(uint32_t _pos, char _base) + : pos(_pos) + , base(_base) + {} + + inline + std::string + to_string() const + { + std::ostringstream ss; + ss << pos << " " << base; + return ss.str(); + } + + + /********************* + * OPERATOR OVERLOAD * + *********************/ + bool operator==(SnpEvent const & b) const; + bool operator!=(SnpEvent const & b) const; + bool operator<(SnpEvent const & b) const; +}; + + +class SnpEventInfo +{ +public: + uint16_t hq_count{0}; + uint16_t lq_count{0}; + uint16_t proper_pairs{0}; + uint16_t first_in_pairs{0}; + uint16_t sequence_reversed{0u}; + uint16_t clipped{0}; + uint8_t max_mapq{0}; + uint8_t max_distance{0}; + int32_t uniq_pos1{-1}; + int32_t uniq_pos2{-1}; + int32_t uniq_pos3{-1}; + std::map phase; + + double corrected_support() const; + bool has_good_support(long const cov) const; + std::string to_string() const; + +}; + + +std::vector get_phred_biallelic(uint32_t count, uint32_t anti_count, uint32_t eps); +uint32_t get_log_qual(uint32_t count, uint32_t anti_count, uint32_t eps = 7); +uint32_t get_log_qual_double(double count, double anti_count, double eps = 7.0); + + +class EventInfo +{ +public: + uint32_t count{0}; + uint32_t multi_count{0}; + uint32_t anti_count{0}; + uint16_t span{1}; + + // indels with realignment support are good enough to try to realign reads to them + bool has_realignment_support{false}; + bool has_good_support{false}; + + uint32_t max_log_qual{0}; + int max_log_qual_file_i{-1}; + + uint32_t log_qual(uint32_t eps = 7) const; + void clean_counts(); +}; + + +bool +apply_indel_event(std::vector & sequence, + std::vector & ref_pos, + IndelEvent const & indel_event, + long const offset, + bool const is_debug = false); + +IndelEvent +make_deletion_event(std::vector const & reference_sequence, long ref_offset, int32_t pos, long count); + +IndelEvent +make_insertion_event(int32_t pos, std::vector && event_sequence); + +} // namespace gyper + + +namespace std +{ + +template <> +struct hash +{ + size_t + operator()(gyper::IndelEvent const & event) + { + size_t h1 = std::hash()(event.pos); + size_t h2 = std::hash()(event.type); + size_t h3 = 42 ^ 
event.sequence.size(); + + for (auto const & seq : event.sequence) + h3 ^= std::hash()(seq); + + return h1 ^ (h2 << 1) ^ (h3 + 0x9e3779b9); + } + + +}; + +} // namespace std + + +namespace gyper +{ + +//using Tevents = phmap::node_hash_map; +using Tindel_events = std::map; // maps events to count + +} // namespace gyper diff --git a/include/graphtyper/typer/event_call.hpp b/include/graphtyper/typer/event_call.hpp new file mode 100644 index 0000000..e89aba7 --- /dev/null +++ b/include/graphtyper/typer/event_call.hpp @@ -0,0 +1,32 @@ +#pragma once + +#include +#include +#include + +#include +#include + + +namespace gyper +{ + +class EventCall +{ +public: + IndelEvent event; // includes pos, type and sequence + + uint32_t count{0}; + uint32_t anti_count{0}; + uint32_t multi_count{0}; + + uint16_t span{0}; +}; + +//class EventCalls +//{ +//public: +// +//}; + +} // namespace gyper diff --git a/include/graphtyper/typer/genotype_paths.hpp b/include/graphtyper/typer/genotype_paths.hpp index 943ea94..89c9ac2 100644 --- a/include/graphtyper/typer/genotype_paths.hpp +++ b/include/graphtyper/typer/genotype_paths.hpp @@ -22,7 +22,6 @@ struct GenotypePathsDetails { std::string query_name; std::string read_group; - uint32_t score_diff = 0u; }; @@ -32,12 +31,14 @@ class GenotypePaths std::vector read2; std::vector qual2; std::vector paths; - uint16_t read_length = 0; - uint16_t flags = 0; - uint32_t longest_path_length = 0; - uint32_t original_pos = 0; // 0-based position from global alignment - uint8_t mapq = 255; - int32_t ml_insert_size = INSERT_SIZE_WHEN_NOT_PROPER_PAIR; + uint16_t read_length{0}; + uint16_t flags{0}; + uint32_t longest_path_length{0}; + uint32_t original_pos{0}; // 0-based position from global alignment + uint8_t score_diff{0}; // AS-XS from bwa +// uint8_t original_clipped_bp{0}; + uint8_t mapq{255}; + int32_t ml_insert_size{INSERT_SIZE_WHEN_NOT_PROPER_PAIR}; #ifndef NDEBUG std::unique_ptr details; // Only used when statistics are kept @@ -60,14 +61,12 @@ class GenotypePaths void add_next_kmer_labels(std::vector const & ll, uint32_t start_index, uint32_t read_end_index, - int mismatches = 0 - ); + int mismatches = 0); void add_prev_kmer_labels(std::vector const & ll, uint32_t const read_start_index, uint32_t const read_end_index, - int const mismatches = 0 - ); + int const mismatches = 0); void clear_paths(); diff --git a/include/graphtyper/typer/logistic_constants.hpp b/include/graphtyper/typer/logistic_constants.hpp index e314c7d..092634e 100644 --- a/include/graphtyper/typer/logistic_constants.hpp +++ b/include/graphtyper/typer/logistic_constants.hpp @@ -6,6 +6,7 @@ namespace { +// logf double constexpr LOGF_INTERCEPT{-29.28908}; double constexpr LOGF_ABHom{23.12909}; double constexpr LOGF_CRBySeqDepth{-10.22658}; @@ -46,7 +47,7 @@ namespace gyper { inline double -get_logf(double ab_hom, +get_logf(double abhom, double cr_by_seqdepth, double mq, double pass_ratio, @@ -63,11 +64,76 @@ get_logf(double ab_hom, //BOOST_LOG_TRIVIAL(info) << __HERE__ << " " << ab_hom << " " << cr_by_seqdepth << " " << mq << " " << pass_ratio // << " " << gt_yield << " " << qd << " " << ab_het_bin << " " << sbalt_bin; - double const pwr = LOGF_INTERCEPT + ab_hom * LOGF_ABHom + cr_by_seqdepth * LOGF_CRBySeqDepth + + double const pwr = LOGF_INTERCEPT + abhom * LOGF_ABHom + cr_by_seqdepth * LOGF_CRBySeqDepth + mq * LOGF_MQ + pass_ratio * LOGF_PASS_ratio + gt_yield * LOGF_GTYield + qd * LOGF_QD + LOGF_ABHet[ab_het_bin] + LOGF_SBAlt[sbalt_bin]; //BOOST_LOG_TRIVIAL(info) << __HERE__ << " log model pwr = " << 
pwr; - return 1.0 / (1.0 + std::exp(-pwr)); + double const _exp = std::max(0.0, std::exp(-pwr)); + return 1.0 / (1.0 + _exp); +} + + +// aa score +double constexpr AA_INTERCEPT{-6.347426707}; +double constexpr AA_SB{-0.252334000}; +double constexpr AA_MM{-0.013766577 * 3.0}; +double constexpr AA_SD{0.014572295}; +double constexpr AA_QD{0.065221319}; +double constexpr AA_CR{-0.006449446 * 3.0}; +double constexpr AA_MQ{0.055973424}; + +std::array constexpr AA_ABHom{{0.0, + 1.304140117, + 1.681221065, + 2.214801195, + 3.930106559}}; + +inline double +get_aa_score(double abhom, double sb, double mm, long sd, double qd, double cr, long mq) +{ + long abhom_bin{0}; + + if (abhom <= 0.85) + abhom_bin = 0; + else if (abhom <= 0.94) + abhom_bin = 1; + else if (abhom <= 0.98) + abhom_bin = 2; + else if (abhom <= 0.99) + abhom_bin = 3; + else + abhom_bin = 4; + + if (mq > 60) + mq = 60; + + double const pwr = AA_INTERCEPT + + AA_ABHom[abhom_bin] + + sb * AA_SB + + mm * AA_MM + + sd * AA_SD + + qd * AA_QD + + cr * AA_CR + + mq * AA_MQ; + + double const _exp = std::max(0.0, std::exp(-pwr)); + + //if (sb > 0.8) + //{ + // BOOST_LOG_TRIVIAL(info) << __HERE__ + // << " abhom_bin=" << abhom_bin + // << " abhom=" << AA_ABHom[abhom_bin] + // << " sb=" << sb + // << " mm=" << mm + // << " sd=" << sd + // << " qd=" << qd + // << " cr=" << cr + // << " mq=" << mq + // << " pwr=" << pwr + // << " val=" << (1.0 / (1.0 + _exp)); + //} + + return 1.0 / (1.0 + _exp); } diff --git a/include/graphtyper/typer/read.hpp b/include/graphtyper/typer/read.hpp new file mode 100644 index 0000000..9528d2b --- /dev/null +++ b/include/graphtyper/typer/read.hpp @@ -0,0 +1,83 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include + + +namespace gyper +{ + +int32_t static constexpr READ_ANTI_SUPPORT{-1}; +int32_t static constexpr READ_MULTI_SUPPORT{-2}; + +struct ReadIndelEvent +{ +public: + int32_t read_pos{0}; + Tindel_events::iterator event_it; + + ReadIndelEvent(long _read_pos, Tindel_events::iterator _event_it) + : read_pos(_read_pos) + , event_it(_event_it) + {} + +}; + + +struct ReadSnpEvent +{ +public: + int32_t read_pos{0}; + int32_t ref_pos{0}; + + ReadSnpEvent(int32_t _read_pos, int32_t ref_pos) + : read_pos(_read_pos) + , ref_pos(ref_pos) + {} +}; + + +class Alignment +{ +public: + int32_t pos{-1}; + int32_t pos_end{-1}; + int32_t score{std::numeric_limits::min()}; // Alignment score + int16_t num_clipped_begin{0}; // Number of clipped bases at the begin of the read + int16_t num_clipped_end{0}; // Number of clipped bases at the end of the read + int16_t num_ins_begin{0}; // Number of bases in insertion at the begin of the read + //int16_t num_ins_end{0}; // Number of bases in insertion at the end of the read + //std::vector snp_events; // read pos containing snp events + //std::vector::iterator> snp_events; + std::vector indel_events; // pair of read_offset and event + + bool has_indel_event(Tindel_events::iterator) const; + bool is_clipped() const; + void add_indel_event(int32_t pos, Tindel_events::iterator indel_it); + void replace_indel_events(std::vector && new_indel_events); +}; + + +class Read +{ +public: + std::string name; + int32_t mate_pos{-1}; + uint16_t flags{0}; + uint8_t mapq{255}; + + std::vector sequence{}; + std::vector qual{}; + Alignment alignment{}; + + std::string to_string() const; +}; + +} // namespace gyper diff --git a/include/graphtyper/typer/sample_call.hpp b/include/graphtyper/typer/sample_call.hpp index 42ad3da..c43a774 100644 --- 
a/include/graphtyper/typer/sample_call.hpp +++ b/include/graphtyper/typer/sample_call.hpp @@ -28,17 +28,19 @@ class SampleCall // If R is the number of alleles, then phred should be of size R * (R + 1) / 2 and coverage of size R. std::vector phred; // GT and PL std::vector coverage; // AD and DP - uint16_t ref_total_depth = 0u; // Total reads that support the reference allele - uint16_t alt_total_depth = 0u; // Total reads that support alternative allele(s) - uint8_t ambiguous_depth = 0u; // - uint8_t alt_proper_pair_depth = 0u; // Total reads in proper pairs that support the alternative allele(s) + uint16_t ref_total_depth{0u}; // Total reads that support the reference allele + uint16_t alt_total_depth{0u}; // Total reads that support alternative allele(s) + uint8_t ambiguous_depth{0u}; // + uint8_t alt_proper_pair_depth{0u}; // Total reads in proper pairs that support the alternative allele(s) - mutable int8_t filter = -1; // -1 is unknown, 0 is PASS, 1 is GQ filter + mutable int8_t filter{-1}; // -1 is unknown, 0 is PASS, 1 is GQ filter uint32_t get_depth() const; uint32_t get_unique_depth() const; + uint32_t get_alt_depth() const; std::pair get_gt_call() const; uint8_t get_gq() const; + uint8_t get_lowest_phred_not_with(uint16_t allele) const; int8_t check_filter(long gq) const; private: diff --git a/include/graphtyper/typer/var_stats.hpp b/include/graphtyper/typer/var_stats.hpp index 06ba311..feec245 100644 --- a/include/graphtyper/typer/var_stats.hpp +++ b/include/graphtyper/typer/var_stats.hpp @@ -5,46 +5,74 @@ #include // std::string #include // std::vector +#include + + #include namespace gyper { +struct VarStatsPerAllele +{ + uint64_t clipped_bp{0u}; + uint64_t mapq_squared{0u}; + uint32_t score_diff{0u}; + uint32_t mismatches{0u}; + + template + void + serialize(Archive & ar, unsigned int /*version*/) + { + ar & clipped_bp; + ar & mapq_squared; + ar & score_diff; + ar & mismatches; + } + + +}; + + class VarStats { + friend class boost::serialization::access; + public: - /** Clipped reads */ - uint32_t clipped_reads = 0u; + /** Stats per allele (excluding ReadStrand) */ + std::vector per_allele{}; /** Strand bias per allele */ std::vector read_strand{}; + /** Clipped reads */ + uint32_t clipped_reads{0u}; + /** MapQ statistics */ - uint64_t mapq_squared = 0u; + uint64_t mapq_squared{0u}; /** * CONSTRUCTORS */ - VarStats(uint16_t allele_count) noexcept; - - /** - * MODIFIERS - */ - void add_mapq(uint8_t const new_mapq); + VarStats(std::size_t allele_count) noexcept; /** * CLASS INFORMATION */ - std::string get_forward_strand_bias() const; - std::string get_reverse_strand_bias() const; - // Read pair specfici biases - std::string get_r1_forward_strand_bias() const; - std::string get_r2_forward_strand_bias() const; - std::string get_r1_reverse_strand_bias() const; - std::string get_r2_reverse_strand_bias() const; + void write_stats(std::map & infos) const; + void write_per_allele_stats(std::map & infos) const; + void write_read_strand_stats(std::map & infos) const; + /** + * CLASS MODIFIERS + */ + void add_stats(VarStats const & stats); + //void read_stats(std::map const & infos); +private: + template + void serialize(Archive & ar, unsigned int version); }; template diff --git a/include/graphtyper/typer/variant.hpp b/include/graphtyper/typer/variant.hpp index a7b9a95..dfa825e 100644 --- a/include/graphtyper/typer/variant.hpp +++ b/include/graphtyper/typer/variant.hpp @@ -9,6 +9,7 @@ #include // gyper::Genotype #include // gyper::Haplotype #include // gyper::SampleCall +#include 
#include namespace gyper @@ -24,9 +25,11 @@ class Variant uint32_t abs_pos; std::vector > seqs; std::vector calls; + VarStats stats; std::map infos; std::string suffix_id; bool is_info_generated{false}; + char type{'.'}; Variant() noexcept; @@ -67,7 +70,7 @@ class Variant std::vector get_seq_depth_of_all_alleles() const; uint64_t get_qual() const; // Gets quality of a record double get_qual_by_depth() const; // Gets total quality by depth (QD) of a record - + std::vector get_qual_by_depth_per_alt_allele() const; /********************* * OPERATOR OVERLOAD * diff --git a/include/graphtyper/typer/variant_call.hpp b/include/graphtyper/typer/variant_call.hpp new file mode 100644 index 0000000..7ba8833 --- /dev/null +++ b/include/graphtyper/typer/variant_call.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include + +#include + + +class VariantCall +{ +public: + uint32_t pos{0}; + char type{'\0'}; + +}; diff --git a/include/graphtyper/typer/vcf_operations.hpp b/include/graphtyper/typer/vcf_operations.hpp index 1843947..524ea19 100644 --- a/include/graphtyper/typer/vcf_operations.hpp +++ b/include/graphtyper/typer/vcf_operations.hpp @@ -13,6 +13,7 @@ void vcf_concatenate(std::vector const & vcfs, std::string const & output, bool const SKIP_SORT, bool const SITES_ONLY, + bool const WRITE_TBI, std::string const & region); void vcf_break_down(std::string const & vcf, std::string const & output, std::string const & region); diff --git a/include/graphtyper/utilities/genotype_sv.hpp b/include/graphtyper/utilities/genotype_sv.hpp index 8dfa9d5..e01a5aa 100644 --- a/include/graphtyper/utilities/genotype_sv.hpp +++ b/include/graphtyper/utilities/genotype_sv.hpp @@ -13,14 +13,15 @@ void genotype_sv(std::string const & interval_fn, std::string const & ref_path, std::vector const & sams, + std::vector const & avg_cov_by_readlen, gyper::GenomicRegion const & genomic_region, - std::string const & output_path, - std::vector const & avg_cov_by_readlen); + std::string const & output_path); void genotype_sv_regions(std::string ref_path, std::string const & sv_vcf, std::vector const & sams, + std::vector const & avg_cov_by_readlen, std::vector const & regions, std::string const & output_path, bool const is_copy_reference); diff --git a/include/graphtyper/utilities/hts_parallel_reader.hpp b/include/graphtyper/utilities/hts_parallel_reader.hpp index 2a69dcd..1aa753b 100644 --- a/include/graphtyper/utilities/hts_parallel_reader.hpp +++ b/include/graphtyper/utilities/hts_parallel_reader.hpp @@ -25,8 +25,8 @@ class HtsParallelReader std::vector hts_files; std::vector heap; HtsStore store; - std::vector samples; - long num_rg = 0; // Number of read groups + std::vector samples; // List of sample names + long num_rg{0}; // Number of read groups public: HtsParallelReader() = default; @@ -68,6 +68,7 @@ class HtsParallelReader void parallel_reader_genotype_only(std::string * out_path, std::vector const * hts_paths_ptr, + std::vector const * avg_cov_ptr, std::string const * output_dir_ptr, std::string const * reference_fn_ptr, std::string const * region_ptr, diff --git a/include/graphtyper/utilities/hts_reader.hpp b/include/graphtyper/utilities/hts_reader.hpp index 67aefed..58a673c 100644 --- a/include/graphtyper/utilities/hts_reader.hpp +++ b/include/graphtyper/utilities/hts_reader.hpp @@ -22,8 +22,8 @@ class HtsReader hts_idx_t * hts_index{nullptr}; hts_itr_t * hts_iter{nullptr}; std::vector samples; - int sample_index_offset = 0; - int rg_index_offset = 0; + int sample_index_offset{0}; + int rg_index_offset{0}; private: int ret{0}; 
// return value of the most recently read record @@ -34,19 +34,23 @@ class HtsReader std::unordered_map rg2index; // associates read groups with indices of that rg std::vector rg2sample_i; // uses the rg index to determine the sample index + void set_reference(std::string const & reference_path); + public: HtsReader(HtsStore & _store); - void open(std::string const & path, std::string const & region); + void open(std::string const & path, std::string const & region, std::string const & reference); void close(); - int set_reference(std::string const & reference_path); void set_sample_index_offset(int new_sample_index_offset); void set_rg_index_offset(int new_rg_index_offset); - bam1_t * get_next_read(bam1_t * old_record); + // Get the read when reading a HTS file in order. It is probably never a good idea to call both get_next_read_in_order + // and get_next_read(). + bam1_t * get_next_read_in_order(bam1_t * old_record); + bam1_t * get_next_read_in_order(); bam1_t * get_next_read(); - bam1_t * get_next_pos(); + bam1_t * get_next_read(bam1_t * old_record); void get_sample_and_rg_index(long & sample_i, long & rg_i, bam1_t * rec) const; long get_num_rg() const; diff --git a/include/graphtyper/utilities/options.hpp b/include/graphtyper/utilities/options.hpp index ed1437a..567793d 100644 --- a/include/graphtyper/utilities/options.hpp +++ b/include/graphtyper/utilities/options.hpp @@ -80,14 +80,16 @@ class Options * CALLING OPTIONS * *******************/ bool hq_reads{false}; + bool is_csi{false}; int sam_flag_filter{3840}; - long max_files_open{1000}; // Maximum amount of SAM/BAM/CRAM files can be opened at the same time + long max_files_open{512}; // Maximum amount of SAM/BAM/CRAM files can be opened at the same time long soft_cap_of_variants_in_100_bp_window{22}; bool get_sample_names_from_filename{false}; bool output_all_variants{false}; bool is_one_genotype_per_haplotype{false}; std::string variant_suffix_id{}; std::string primer_bedpe{}; + bool is_extra_call_only_iteration{false}; // 7 and 0.26 for >= 1000 samples long genotype_aln_min_support{4}; diff --git a/paw b/paw index 2de577d..7b86062 160000 --- a/paw +++ b/paw @@ -1 +1 @@ -Subproject commit 2de577dc561d22af642aa3958ab0b8ebcaa9c018 +Subproject commit 7b860629eeee0cadd4fb17715503be9273ba8175 diff --git a/seqan b/seqan index 515b276..496ed88 160000 --- a/seqan +++ b/seqan @@ -1 +1 @@ -Subproject commit 515b2769ed6c6538a8b5512e16008ed71b5a0126 +Subproject commit 496ed88cc8af0c9426feb69f480ecab432fed1e9 diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4399e29..bc1fccb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -22,10 +22,13 @@ set(graphtyper_SOURCE_FILES index/kmer_label.cpp index/ph_index.cpp typer/alignment.cpp + typer/bucket.cpp typer/caller.cpp + typer/event.cpp typer/genotype_paths.cpp typer/path.cpp typer/primers.cpp + typer/read.cpp typer/sample_call.cpp typer/segment.cpp typer/var_stats.cpp diff --git a/src/graph/constructor.cpp b/src/graph/constructor.cpp index bc071a0..46242e0 100644 --- a/src/graph/constructor.cpp +++ b/src/graph/constructor.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -178,11 +179,11 @@ append_sv_tag_to_node(std::vector & alt) void open_tabix(seqan::Tabix & tabix_file, - std::string const & tabix_filename, - gyper::GenomicRegion const & genomic_region - ) + std::string const & vcf_filename, + gyper::GenomicRegion const & genomic_region) { - seqan::open(tabix_file, tabix_filename.c_str()); + std::string const ext = gyper::is_file(vcf_filename + 
".csi") ? ".csi" : ".tbi"; + seqan::open(tabix_file, vcf_filename.c_str(), "r", ext.c_str()); seqan::String header; seqan::getHeader(header, tabix_file); @@ -215,6 +216,8 @@ open_reference_genome(seqan::FaiIndex & fasta_index, std::string const & fasta_f ss >> new_contig.length; gyper::graph.contigs.push_back(std::move(new_contig)); } + + f.close(); } // Check if the total length of contigs is too large @@ -239,7 +242,8 @@ open_reference_genome(seqan::FaiIndex & fasta_index, std::string const & fasta_f BOOST_LOG_TRIVIAL(error) << "[" << __HERE__ << "] FASTA index could not be loaded or built."; std::exit(31); } - else if (!seqan::save(fasta_index)) + + if (!seqan::save(fasta_index)) { BOOST_LOG_TRIVIAL(error) << "[" << __HERE__ << "] FASTA index could not be saved to disk."; std::exit(32); @@ -271,8 +275,7 @@ read_reference_seq(std::vector & reference_sequence, seqan::FaiIndex const & fasta_index, unsigned const chrom_idx, uint32_t const begin, - uint32_t const length - ) + uint32_t const length) { seqan::Dna5String ref_seq; @@ -296,8 +299,7 @@ read_reference_genome(std::vector & reference_sequence, fasta_index, chrom_idx, genomic_region.begin, - genomic_region.end - genomic_region.begin - ); + genomic_region.end - genomic_region.begin); } @@ -331,14 +333,36 @@ read_reference_genome_ends(seqan::FaiIndex const & fasta_index, namespace gyper { +void +open_and_read_reference_genome(std::vector & reference_sequence, + std::string const & reference_fn, + GenomicRegion const & genomic_region) +{ + // Load the reference genome + seqan::FaiIndex fai_index; + + if (!seqan::open(fai_index, reference_fn.c_str())) + { + BOOST_LOG_TRIVIAL(error) << "[" << __HERE__ << "] FASTA index could not be loaded for " + << reference_fn; + std::exit(31); + } + + // Read the reference sequence + read_reference_genome(reference_sequence, fai_index, genomic_region); + + // Close the file + seqan::clear(fai_index); +} + + void add_sv_breakend(SV & sv, VarRecord & var, seqan::VcfRecord const & vcf_record, seqan::FaiIndex const & fasta_index, unsigned const chrom_idx, - uint32_t const EXTRA_SEQUENCE_LENGTH - ) + uint32_t const EXTRA_SEQUENCE_LENGTH) { // Read the first matching reference base read_reference_seq(var.ref, fasta_index, chrom_idx, var.pos, 1); @@ -358,7 +382,16 @@ add_sv_breakend(SV & sv, auto parse_chromosome_name = [&](std::vector const & seq, char const c) -> std::string { auto find_it = std::find(seq.begin(), seq.end(), c); - auto find_colon_it = std::find(find_it + 1, seq.end(), ':'); + auto find_last_colon_it = std::find(seq.rbegin(), seq.rend(), ':'); + + if (find_last_colon_it == seq.rend()) + invalid_bnd_alt(); + + long const colon_pos = seq.size() - + std::distance(seq.rbegin(), find_last_colon_it) - 1; + assert(colon_pos < static_cast(seq.size())); + assert(colon_pos >= 0); + auto find_colon_it = seq.begin() + colon_pos; if (find_colon_it == seq.end()) invalid_bnd_alt(); @@ -368,13 +401,23 @@ add_sv_breakend(SV & sv, auto parse_position = [&](std::vector const & seq, char const c) -> long { - auto find_colon_it = std::find(seq.begin() + 1, seq.end(), ':'); + auto find_last_colon_it = std::find(seq.rbegin(), seq.rend(), ':'); + + if (find_last_colon_it == seq.rend()) + invalid_bnd_alt(); + + long const colon_pos = seq.size() - + std::distance(seq.rbegin(), find_last_colon_it) - 1; + assert(colon_pos < static_cast(seq.size())); + assert(colon_pos >= 0); + auto find_colon_it = seq.begin() + colon_pos; + auto find_end_it = std::find(find_colon_it + 1, seq.end(), c); if (find_end_it == seq.end()) 
invalid_bnd_alt(); - long pos = 0; + long pos{0}; std::istringstream ss { std::string(find_colon_it + 1, find_end_it) }; @@ -419,9 +462,9 @@ add_sv_breakend(SV & sv, if (find2_it == alt.end()) invalid_bnd_alt(); - auto const len = EXTRA_SEQUENCE_LENGTH - std::distance(find2_it, alt.end()) - 1; + auto const len = EXTRA_SEQUENCE_LENGTH - std::distance(find2_it, alt.end()); std::vector seq; - read_reference_seq(seq, fasta_index, chrom_idx_2, pos, len); + read_reference_seq(seq, fasta_index, chrom_idx_2, pos - 1, len); // Complement the sequence std::transform(seq.begin(), seq.end(), seq.begin(), complement); @@ -453,7 +496,7 @@ add_sv_breakend(SV & sv, invalid_bnd_alt(); auto const len = EXTRA_SEQUENCE_LENGTH - std::distance(find2_it, alt.end()) - 1; - read_reference_seq(bnd, fasta_index, chrom_idx_2, pos - len - 1, len); + read_reference_seq(bnd, fasta_index, chrom_idx_2, pos - len, len); std::copy(find2_it + 1, alt.end(), std::back_inserter(bnd)); } else @@ -465,7 +508,7 @@ add_sv_breakend(SV & sv, std::copy(alt.begin() + 1, find_it, std::back_inserter(bnd)); // Extra insertion auto const len = EXTRA_SEQUENCE_LENGTH - bnd.size() + 1; std::vector seq; - read_reference_seq(seq, fasta_index, chrom_idx_2, pos - len - 1, len); + read_reference_seq(seq, fasta_index, chrom_idx_2, pos - len, len); // Copy the reverse complement std::transform(seq.begin(), seq.end(), seq.begin(), complement); @@ -487,8 +530,7 @@ add_sv_deletion(SV & sv, VarRecord & var, seqan::FaiIndex const & fasta_index, unsigned const chrom_idx, - uint32_t const EXTRA_SEQUENCE_LENGTH - ) + uint32_t const EXTRA_SEQUENCE_LENGTH) { // Read the first matching reference base read_reference_seq(var.ref, fasta_index, chrom_idx, var.pos, 1); @@ -529,8 +571,7 @@ add_sv_insertion(SV & sv, seqan::VcfRecord const & vcf_record, seqan::FaiIndex const & fasta_index, unsigned const chrom_idx, - uint32_t const EXTRA_SEQUENCE_LENGTH - ) + uint32_t const EXTRA_SEQUENCE_LENGTH) { assert(seqan::length(vcf_record.ref) >= 1); @@ -557,8 +598,7 @@ add_sv_insertion(SV & sv, { std::copy(sv.seq.begin(), sv.seq.begin() + EXTRA_SEQUENCE_LENGTH, - std::back_inserter(alt1) - ); + std::back_inserter(alt1)); append_sv_tag_to_node(alt1); sv.related_sv = static_cast(graph.SVs.size()) + 1; // Next SV is related @@ -568,8 +608,7 @@ add_sv_insertion(SV & sv, std::copy(sv.seq.end() - EXTRA_SEQUENCE_LENGTH, sv.seq.end(), - std::back_inserter(alt2) - ); + std::back_inserter(alt2)); sv.related_sv = static_cast(graph.SVs.size()) - 1; // Previous SV is related sv.model = "BREAKPOINT2"; @@ -1398,13 +1437,13 @@ add_var_record(std::vector & var_records, sv.length = (int)sv.ins_seq.size(); } - // If length is less than 50 then it is not an SV - if (sv.length < 50 && ((int)sv.ins_seq_left.size() + (int)sv.ins_seq_right.size()) < 20) - { - BOOST_LOG_TRIVIAL(warning) << "[" << __HERE__ << "] Ignored SV at " << var.pos - << " because it was less than 50 bp."; - return; - } + //// If length is less than 50 then it is not an SV + //if (sv.length < 50 && ((int)sv.ins_seq_left.size() + (int)sv.ins_seq_right.size()) < 20) + //{ + // BOOST_LOG_TRIVIAL(warning) << "[" << __HERE__ << "] Ignored SV at " << var.pos + // << " because it was less than 50 bp."; + // return; + //} } // Make sure sv.size is set @@ -1744,8 +1783,7 @@ construct_graph(std::string const & reference_filename, for (auto & var_record : var_records) { genomic_region.add_reference_to_record_if_they_have_a_matching_prefix(var_record, - reference_sequence - ); + reference_sequence); } #ifndef NDEBUG @@ -1765,8 +1803,7 @@ 
construct_graph(std::string const & reference_filename, graph.add_genomic_region(std::move(reference_sequence), std::move(var_records), - std::move(genomic_region) - ); + std::move(genomic_region)); #ifndef NDEBUG if (!graph.check()) diff --git a/src/graph/graph.cpp b/src/graph/graph.cpp index adc197d..afb5ea6 100644 --- a/src/graph/graph.cpp +++ b/src/graph/graph.cpp @@ -208,60 +208,7 @@ Graph::add_genomic_region(std::vector && reference_sequence, continue; } - std::vector suffix = var_records[i].get_common_suffix(); - - //if (var_records[i + 1].pos >= var_records[i].pos + var_records[i].ref.size() - suffix.size()) - //{ - // long const suffix_to_remove = var_records[i].pos + var_records[i].ref.size() - var_records[i + 1].pos; - // - // // Check if it is possible to remove enough suffix so we can safely join the two records - // if (suffix_to_remove <= 0 || suffix_to_remove > static_cast(suffix.size())) - // { - // var_records[i + 1].merge_one_path(std::move(var_records[i])); - // } - // else - // { - // // Erase from previous record so that it ends where the next record starts - // assert(static_cast(var_records[i].ref.size()) > suffix_to_remove); - // var_records[i].ref.erase(var_records[i].ref.end() - suffix_to_remove, var_records[i].ref.end()); - // - // for (auto & alt : var_records[i].alts) - // { - // assert(static_cast(alt.size()) > suffix_to_remove); - // alt.erase(alt.end() - suffix_to_remove, alt.end()); - // } - // - // assert(var_records[i + 1].pos == var_records[i].pos + var_records[i].ref.size()); - // - // // Merge the two records - // long const suffix_to_add = suffix_to_remove - static_cast(var_records[i + 1].ref.size()); - // - // var_records[i + 1].merge(std::move(var_records[i])); - // - // if (suffix_to_add > 0) - // { - // var_records[i + 1].ref.insert(var_records[i + 1].ref.end(), suffix.end() - suffix_to_add, suffix.end()); - // - // for (auto & alt : var_records[i + 1].alts) - // alt.insert(alt.end(), suffix.end() - suffix_to_add, suffix.end()); - // } - // } - //} - //else - { - // In a few extreme scenarios we cannot merge only one path here. 
- //BOOST_LOG_TRIVIAL(fatal) << "i : " << var_records[i].to_string(); - //BOOST_LOG_TRIVIAL(fatal) << "i+1 : " << var_records[i + 1].to_string(); - var_records[i + 1].merge(std::move(var_records[i]), 4); // last parameter is EXTRA_SUFFIX - //BOOST_LOG_TRIVIAL(fatal) << "after: " << var_records[i + 1].to_string(); - } - - if (var_records[i + 1].alts.size() >= (MAX_NUMBER_OF_HAPLOTYPES - 1)) - { - BOOST_LOG_TRIVIAL(warning) << "[" << __HERE__ << "] Found a variant with too many alleles."; - var_records[i + 1].alts.resize(MAX_NUMBER_OF_HAPLOTYPES - 2); - } - + var_records[i + 1].merge(std::move(var_records[i]), 4); var_records[i].clear(); // Clear variants that have been merged into others ++i; } // while @@ -290,6 +237,16 @@ Graph::add_genomic_region(std::vector && reference_sequence, } ), var_records.end()); + // Remove var_records with too many alternatives + for (auto & var_record : var_records) + { + if (var_record.alts.size() >= (MAX_NUMBER_OF_HAPLOTYPES - 1)) + { + BOOST_LOG_TRIVIAL(warning) << __HERE__ << " Found a variant with too many alleles."; + var_record.alts.resize(MAX_NUMBER_OF_HAPLOTYPES - 2); + } + } + // Remove common suffix for (auto & var_record : var_records) { @@ -1013,12 +970,12 @@ Graph::get_locations_of_an_actual_position(uint32_t pos, Path const & path, bool // Only add this node if the path has it auto find_it = std::find(path.var_order.cbegin(), path.var_order.cend(), - var_nodes[v].get_label().order - ); + var_nodes[v].get_label().order); long const j = std::distance(path.var_order.cbegin(), find_it); assert(j >= 0); assert(i == this->get_variant_num(v)); + assert(i < static_cast(MAX_NUMBER_OF_HAPLOTYPES)); if (path.is_empty() || (j < static_cast(path.nums.size()) && path.nums[j].test(i))) { diff --git a/src/graph/haplotype.cpp b/src/graph/haplotype.cpp index 3517432..7f940f7 100644 --- a/src/graph/haplotype.cpp +++ b/src/graph/haplotype.cpp @@ -273,17 +273,28 @@ Haplotype::add_coverage(uint32_t const i, uint16_t const c) void -Haplotype::clipped_reads_to_stats(bool const fully_aligned) +Haplotype::clipped_reads_to_stats(int const clipped_bp, int const read_length) { - if (!fully_aligned) + if (clipped_bp == 0) + return; + + assert(var_stats.size() == gts.size()); + assert(var_stats.size() == coverage.size()); + long const num_allele = var_stats.size(); + long const scaled_clipped_bp = (clipped_bp * 1000l) / read_length; + + for (long i = 0; i < num_allele; ++i) { - assert(var_stats.size() == gts.size()); - assert(var_stats.size() == coverage.size()); + auto & var_stat = var_stats[i]; + auto const & cov = coverage[i]; + + if (cov != NO_COVERAGE) + ++var_stats[i].clipped_reads; - for (long i = 0; i < static_cast(var_stats.size()); ++i) + if (cov < MULTI_REF_COVERAGE) // true when the coverage is unique to a particular allele { - if (coverage[i] != NO_COVERAGE) - ++var_stats[i].clipped_reads; + assert(cov < var_stat.per_allele.size()); + var_stat.per_allele[cov].clipped_bp += scaled_clipped_bp; } } } @@ -292,13 +303,27 @@ Haplotype::clipped_reads_to_stats(bool const fully_aligned) void Haplotype::mapq_to_stats(uint8_t const new_mapq) { + if (new_mapq == 255) + return; // means that MQ is unavailable + assert(var_stats.size() == gts.size()); assert(var_stats.size() == coverage.size()); + uint64_t const mapq_squared = static_cast(new_mapq) * static_cast(new_mapq); + long const num_allele = var_stats.size(); - for (long i = 0; i < static_cast(var_stats.size()); ++i) + for (long i = 0; i < num_allele; ++i) { - if (coverage[i] != NO_COVERAGE) - 
var_stats[i].add_mapq(new_mapq); + auto & var_stat = var_stats[i]; + auto const & cov = coverage[i]; + + if (cov != NO_COVERAGE) + var_stat.mapq_squared += mapq_squared; + + if (cov < MULTI_REF_COVERAGE) // true when the coverage is unique to a particular allele + { + assert(cov < var_stat.per_allele.size()); + var_stat.per_allele[cov].mapq_squared += mapq_squared; + } } } @@ -310,13 +335,14 @@ Haplotype::strand_to_stats(uint16_t const flags) bool const is_first_in_pair = (flags & IS_FIRST_IN_PAIR) != 0; assert(var_stats.size() == gts.size()); assert(var_stats.size() == coverage.size()); + long const num_allele = var_stats.size(); - for (long i = 0; i < static_cast(var_stats.size()); ++i) + for (long i = 0; i < num_allele; ++i) { auto & var_stat = var_stats[i]; auto const & cov = coverage[i]; - if (coverage[i] < MULTI_REF_COVERAGE) // true when the coverage is unique to a particular allele + if (cov < MULTI_REF_COVERAGE) // true when the coverage is unique to a particular allele { assert(cov < var_stat.read_strand.size()); @@ -339,6 +365,51 @@ Haplotype::strand_to_stats(uint16_t const flags) } +void +Haplotype::mismatches_to_stats(uint8_t const mismatches, int const read_length) +{ + if (mismatches == 0) + return; + + long const scaled_mismatches = (mismatches * 1000l) / read_length; + long const num_allele = var_stats.size(); + + for (long i = 0; i < num_allele; ++i) + { + auto & var_stat = var_stats[i]; + auto const & cov = coverage[i]; + + if (cov < MULTI_REF_COVERAGE) // true when the coverage is unique to a particular allele + { + assert(cov < var_stat.per_allele.size()); + var_stat.per_allele[cov].mismatches += scaled_mismatches; + } + } +} + + +void +Haplotype::score_diff_to_stats(uint8_t const score_diff) +{ + if (score_diff == 0) + return; + + long const num_allele = var_stats.size(); + + for (long i = 0; i < num_allele; ++i) + { + auto & var_stat = var_stats[i]; + auto const & cov = coverage[i]; + + if (cov < MULTI_REF_COVERAGE) // true when the coverage is unique to a particular allele + { + assert(cov < var_stat.per_allele.size()); + var_stat.per_allele[cov].score_diff += score_diff; + } + } +} + + void Haplotype::coverage_to_gts(std::size_t const pn_index, bool const is_proper_pair) { diff --git a/src/graph/sv.cpp b/src/graph/sv.cpp index 350ea84..d3cb85a 100644 --- a/src/graph/sv.cpp +++ b/src/graph/sv.cpp @@ -173,6 +173,7 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & continue; // Nothing to do, there are no SVs here } + /* auto merge_alt_info_lambda = [](Variant & new_var, std::string const & id, long const aa) { @@ -190,6 +191,7 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & values.resize(2); new_var.infos[id] = join_strand_bias(values); }; + */ auto make_new_sv_var = [&](Variant const & old_var, long const aa) -> Variant @@ -200,15 +202,16 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & new_var.seqs.push_back(old_var.seqs[0]); new_var.seqs.push_back(old_var.seqs[aa + 1]); new_var.infos = old_var.infos; + new_var.stats = old_var.stats; - merge_alt_info_lambda(new_var, "SBF", aa); - merge_alt_info_lambda(new_var, "SBR", aa); - merge_alt_info_lambda(new_var, "SBF1", aa); - merge_alt_info_lambda(new_var, "SBR1", aa); - merge_alt_info_lambda(new_var, "SBF2", aa); - merge_alt_info_lambda(new_var, "SBR2", aa); - //merge_alt_info_lambda(new_var, "RACount", aa); - //merge_alt_info_lambda(new_var, "RADist", aa); + // Set value at index 1 to the alternative allele stats and then shrink the vector with 
.resize() + new_var.stats.per_allele[1] = old_var.stats.per_allele[aa + 1]; + new_var.stats.read_strand[1] = old_var.stats.read_strand[aa + 1]; + + new_var.stats.per_allele.resize(2); + new_var.stats.read_strand.resize(2); + + assert(new_var.stats.per_allele.size() == new_var.seqs.size()); new_var.calls.reserve(old_var.calls.size()); @@ -413,6 +416,8 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & // Put first SV allele in new VCF records Variant new_sv_var = make_new_sv_var(var, aa); assert(new_sv_var.seqs.size() == 2); + assert(new_sv_var.stats.per_allele.size() == 2); + assert(new_sv_var.stats.read_strand.size() == 2); SV const & sv = graph.SVs[sv_ids[aa]]; if (sv.type != BND) @@ -431,8 +436,7 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & // Fix genotype likelihood for duplication if (sv.type == DUP && - (sv.model == "BREAKPOINT1" || sv.model == "BREAKPOINT2") - ) + (sv.model == "BREAKPOINT1" || sv.model == "BREAKPOINT2")) { for (auto & call : new_sv_var.calls) { @@ -445,8 +449,7 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & minus_10log10_one_third * static_cast(call.coverage[1]) + minus_10log10_two_third * - static_cast(call.coverage[0]) - ); + static_cast(call.coverage[0])); uint64_t gt_11 = 3ul * (call.coverage[0] + static_cast(call.coverage[1])); uint64_t min_gt = std::min(gt_00, std::min(gt_01, gt_11)); @@ -464,12 +467,13 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & // If the new variant is an insertion and this is the second breakpoint, make a combined // variant. Also do the same if it is a duplication and we have coverage turned off if ((sv.type == INS && related_svs_map.count(sv_ids[aa]) == 1) || - (sv.type == INV && related_svs_map.count(sv_ids[aa]) == 1) - ) + (sv.type == INV && related_svs_map.count(sv_ids[aa]) == 1)) { assert(new_vars.size() > 0); Variant const & var_bp1 = new_vars[related_svs_map.at(sv_ids[aa])]; + assert(var_bp1.seqs.size() == var_bp1.stats.per_allele.size()); Variant combined_var = make_variant_with_combined_calls(new_sv_var, var_bp1); + assert(combined_var.seqs.size() == combined_var.stats.per_allele.size()); add_sv_to_new_vars_vector(std::move(combined_var), sv, "AGGREGATED"); } @@ -479,14 +483,18 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & if ((sv.type == DEL || sv.type == DEL_ALU)) { Variant cov_var(new_sv_var); - assert(cov_var.seqs.size() == 2ul); + assert(cov_var.seqs.size() == 2); + assert(cov_var.stats.per_allele.size() == 2); + assert(cov_var.stats.read_strand.size() == 2); for (long pn_index = 0; pn_index < static_cast(cov_var.calls.size()); ++pn_index) cov_var.calls[pn_index] = make_call_based_on_coverage(pn_index, sv, reference_depth); // Make a combined variant with both breakpoint and coverage Variant combined_var = make_variant_with_combined_calls(new_sv_var, cov_var); - assert(combined_var.seqs.size() >= 2); + assert(combined_var.seqs.size() == 2); + assert(combined_var.stats.per_allele.size() == 2); + assert(combined_var.stats.read_strand.size() == 2); add_sv_to_new_vars_vector(std::move(combined_var), sv, "AGGREGATED"); assert(cov_var.seqs.size() >= 2); add_sv_to_new_vars_vector(std::move(cov_var), sv, "COVERAGE"); @@ -496,6 +504,7 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & // Second breakpoint of a duplication Variant cov_var(new_sv_var); assert(cov_var.seqs.size() == 2ul); + assert(cov_var.seqs.size() == cov_var.stats.per_allele.size()); for (long pn_index = 0; 
pn_index < static_cast(cov_var.calls.size()); ++pn_index) cov_var.calls[pn_index] = make_call_based_on_coverage(pn_index, sv, reference_depth); @@ -544,6 +553,7 @@ reformat_sv_vcf_records(std::vector & variants, ReferenceDepth const & continue; // Not SV } + assert(var.seqs.size() == var.stats.per_allele.size()); add_sv_variant(var, aa, related_svs); } diff --git a/src/main.cpp b/src/main.cpp index 455f9b6..3b1e88c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -680,6 +680,11 @@ subcmd_genotype(paw::Parser & parser) "genotype_dis_min_support_ratio", "(advanced) Minimum graph discovery support ratio for a variant so it can be added to the graph."); + parser.parse_option(opts.is_csi, + 'C', + "csi", + "(advanced) If set, graphtyper will make csi indices instead of tbi."); + parser.parse_option(opts.is_only_cigar_discovery, ' ', "is_only_cigar_discovery", @@ -724,6 +729,11 @@ subcmd_genotype(paw::Parser & parser) "(advanced) How many alleles each batch of internal VCFs has. Increasing this number inceases " "memory used, while slightly decreases compute time."); + parser.parse_option(opts.is_extra_call_only_iteration, + ' ', + "is_extra_call_only_iteration", + "(advanced) Set to add an extra call only iteration."); + #ifndef NDEBUG parser.parse_option(opts.stats, ' ', "stats", "(advanced) Directory for statistics files."); #endif // ifndef NDEBUG @@ -777,7 +787,7 @@ subcmd_genotype_sv(paw::Parser & parser) gyper::Options & opts = *(gyper::Options::instance()); std::string avg_cov_by_readlen_fn; - std::string output_dir = "sv_results"; + std::string output_dir{"sv_results"}; std::string opts_region{}; std::string opts_region_file{}; std::string ref_fn{}; @@ -787,17 +797,33 @@ subcmd_genotype_sv(paw::Parser & parser) bool force_copy_reference{false}; bool force_no_copy_reference{false}; + // Change the default values + opts.max_files_open = 128; + // Parse options + parser.parse_option(avg_cov_by_readlen_fn, + ' ', + "avg_cov_by_readlen", + "File with average coverage by read length (one value per line). " + "The values are used for subsampling regions with extremely high coverage and should be in the same " + "order as the BAM/CRAM list."); + parser.parse_option(force_copy_reference, ' ', "force_copy_reference", + "Force copy of the reference FASTA to temporary folder."); + parser.parse_option(force_no_copy_reference, ' ', "force_no_copy_reference", + "Force that the reference FASTA is NOT copied to temporary folder."); + parser.parse_option(opts.force_use_input_ref_for_cram_reading, ' ', "force_use_input_ref_for_cram_reading", + "Force using the input reference FASTA file when reading CRAMs."); + parser.parse_option(opts.is_csi, + 'C', + "csi", + "(advanced) If set, graphtyper will make csi indices instead of tbi."); + parser.parse_option(opts.max_files_open, ' ', "max_files_open", "Select how many files can be open at the same time."); parser.parse_option(opts.no_cleanup, ' ', "no_cleanup", "Set to skip removing temporary files. 
Useful for debugging."); - parser.parse_option(force_copy_reference, ' ', "force_copy_reference", - "Force copy of the reference FASTA to temporary folder."); - parser.parse_option(force_no_copy_reference, ' ', "force_no_copy_reference", - "Force that the reference FASTA is NOT copied to temporary folder."); parser.parse_option(output_dir, 'O', "output", "Output directory."); parser.parse_option(opts_region, 'r', "region", "Genomic region to genotype."); parser.parse_option(opts_region_file, 'R', "region_file", "File with genomic regions to genotype."); @@ -807,6 +833,10 @@ subcmd_genotype_sv(paw::Parser & parser) #ifndef NDEBUG parser.parse_option(opts.stats, ' ', "stats", "Directory for statistics files."); #endif // NDEBUG + parser.parse_option(opts.no_filter_on_coverage, + ' ', + "no_filter_on_coverage", + "(advanced) Set to disable filter on coverage."); //parser.parse_option(opts.vcf, ' ', "vcf", "Input VCF file with SNP/indel variant sites."); parser.parse_positional_argument(ref_fn, "REF.FA", "Reference genome in FASTA format."); @@ -826,8 +856,9 @@ subcmd_genotype_sv(paw::Parser & parser) // Get the genomic regions to process from the --region and --region_file options std::vector regions = get_regions(ref_fn, opts_region, opts_region_file, 1000000); - // Get the SAM/BAM/CRAM file names + // Get the BAM/CRAM file names std::vector sams_fn = get_sams(sam, sams); + std::vector avg_cov_by_readlen = get_avg_cov_by_readlen(avg_cov_by_readlen_fn, sams_fn.size()); // If neither force copy reference or force no copy reference we determine it from number of SAMs bool is_copy_reference = force_copy_reference || (!force_no_copy_reference && sams_fn.size() >= 100); @@ -835,9 +866,11 @@ subcmd_genotype_sv(paw::Parser & parser) gyper::genotype_sv_regions(ref_fn, sv_vcf, sams_fn, + avg_cov_by_readlen, regions, output_dir, is_copy_reference); + return 0; } @@ -965,18 +998,20 @@ subcmd_vcf_concatenate(paw::Parser & parser) std::string output_fn = "-"; bool sites_only{false}; std::string region; + bool write_tbi{false}; parser.parse_option(no_sort, ' ', "no_sort", "Set to skip sorting the variants."); parser.parse_option(output_fn, 'o', "output", "Output VCF file name."); parser.parse_option(sites_only, ' ', "sites_only", "Set to write only variant site information."); parser.parse_option(region, 'r', "region", "Region to print variant in."); + parser.parse_option(write_tbi, 't', "write_tbi", "Set to write TBI index."); parser.parse_remaining_positional_arguments(vcfs, "vcfs...", "VCFs to concatenate"); parser.finalize(); setup_logger(); - gyper::vcf_concatenate(vcfs, output_fn, no_sort, sites_only, region); + gyper::vcf_concatenate(vcfs, output_fn, no_sort, sites_only, write_tbi, region); return 0; } diff --git a/src/typer/alignment.cpp b/src/typer/alignment.cpp index 0746d0d..4837e55 100644 --- a/src/typer/alignment.cpp +++ b/src/typer/alignment.cpp @@ -27,8 +27,7 @@ void find_genotype_paths_of_one_of_the_sequences(seqan::IupacString const & read, gyper::GenotypePaths & geno, gyper::PHIndex const & ph_index, - gyper::Graph const & graph = gyper::graph - ) + gyper::Graph const & graph = gyper::graph) { using namespace gyper; @@ -54,7 +53,7 @@ find_genotype_paths_of_one_of_the_sequences(seqan::IupacString const & read, } { - uint32_t read_start_index = 0; + uint32_t read_start_index{0}; { assert(r_hamming0.size() == r_hamming1.size()); @@ -64,14 +63,12 @@ find_genotype_paths_of_one_of_the_sequences(seqan::IupacString const & read, geno.add_next_kmer_labels(r_hamming0[i], read_start_index, 
read_start_index + (K - 1), - 0 /*mismatches*/ - ); + 0 /*mismatches*/); geno.add_next_kmer_labels(r_hamming1[i], read_start_index, read_start_index + (K - 1), - 1 /*mismatches*/ - ); + 1 /*mismatches*/); read_start_index += (K - 1); } @@ -93,13 +90,18 @@ find_genotype_paths_of_one_of_the_sequences(seqan::IupacString const & read, geno.remove_fully_special_paths(); geno.remove_non_ref_paths_when_read_matches_ref(); // Should be the last check - geno.update_longest_path_size(); geno.remove_short_paths(); if (graph.is_sv_graph) geno.remove_support_from_read_ends(); + // store read sequence + geno.read2 = std::vector(seqan::begin(read), seqan::end(read)); + + // store score diff + + #ifndef NDEBUG /* if (!geno.check_no_variant_is_missing()) @@ -112,6 +114,7 @@ find_genotype_paths_of_one_of_the_sequences(seqan::IupacString const & read, } +/* bool is_clipped(bam1_t const & b) { @@ -141,6 +144,233 @@ is_clipped(bam1_t const & b) return false; } +*/ + +long +clipped_count(bam1_t const & b) +{ + long count{0}; + + if (b.core.n_cigar > 0) + { + auto it = b.data + b.core.l_qname; + + // Check first + uint32_t opAndCnt; + memcpy(&opAndCnt, it, sizeof(uint32_t)); + + if ((opAndCnt & 15) == 4) + { + uint32_t const cigar_count = opAndCnt >> 4; + count += cigar_count; + //std::cerr << "MIDNSHP=X*******"[opAndCnt & 15] << " first "; + return true; + } + + // Check last + memcpy(&opAndCnt, it + sizeof(uint32_t) * (b.core.n_cigar - 1), sizeof(uint32_t)); + + if ((opAndCnt & 15) == 4) + { + //std::cerr << "MIDNSHP=X*******"[opAndCnt & 15] << " last "; + uint32_t const cigar_count = opAndCnt >> 4; + count += cigar_count; + return true; + } + } + + return count; +} + + +uint8_t +get_score_diff(bam1_t * rec) +{ + uint8_t * it = bam_get_aux(rec); + auto const l_aux = bam_get_l_aux(rec); + unsigned i{0}; + int64_t as{-1}; + int64_t xs{-1}; + + while (i < l_aux) + { + i += 3; + char type = *(it + i - 1); + bool is_as = false; + bool is_xs = false; + + // Check for AS and XS + if (*(it + i - 2) == 'S') + { + if (*(it + i - 3) == 'A') + is_as = true; + else if (*(it + i - 3) == 'X') + is_xs = true; + } + + switch (type) + { + case 'A': + { + // A printable character + ++i; + break; + } + + case 'Z': + { + // A string! 
Let's loop it until qNULL + // Check for RG + while (*(it + i) != '\0' && *(it + i) != '\n') + ++i; + + ++i; + break; + } + + case 'c': + { + if (is_as) + { + int8_t num; + memcpy(&num, it + i, sizeof(int8_t)); + as = num; + } + else if (is_xs) + { + int8_t num; + memcpy(&num, it + i, sizeof(int8_t)); + xs = num; + } + + i += sizeof(int8_t); + break; + } + + case 'C': + { + if (is_as) + { + uint8_t num; + memcpy(&num, it + i, sizeof(uint8_t)); + as = num; + } + else if (is_xs) + { + uint8_t num; + memcpy(&num, it + i, sizeof(uint8_t)); + xs = num; + } + + i += sizeof(uint8_t); + break; + } + + case 's': + { + if (is_as) + { + int16_t num; + memcpy(&num, it + i, sizeof(int16_t)); + as = num; + } + else if (is_xs) + { + int16_t num; + memcpy(&num, it + i, sizeof(int16_t)); + xs = num; + } + + i += sizeof(int16_t); + break; + } + + case 'S': + { + if (is_as) + { + uint16_t num; + memcpy(&num, it + i, sizeof(uint16_t)); + as = num; + } + else if (is_xs) + { + uint16_t num; + memcpy(&num, it + i, sizeof(uint16_t)); + xs = num; + } + + i += sizeof(uint16_t); + break; + } + + case 'i': + { + if (is_as) + { + int32_t num; + memcpy(&num, it + i, sizeof(int32_t)); + as = num; + } + else if (is_xs) + { + int32_t num; + memcpy(&num, it + i, sizeof(int32_t)); + xs = num; + } + + i += sizeof(int32_t); + break; + } + + case 'I': + { + if (is_as) + { + uint32_t num; + memcpy(&num, it + i, sizeof(uint32_t)); + as = num; + } + else if (is_xs) + { + uint32_t num; + memcpy(&num, it + i, sizeof(uint32_t)); + xs = num; + } + + i += sizeof(uint32_t); + break; + } + + case 'f': + { + i += sizeof(float); + break; + } + + default: + { + i = l_aux; // Unkown tag, stop + break; + } + } + } + + + if (as == -1 || as < xs) + { + return 0; + } + else + { + if (xs == -1) + xs = 0; + + // check for overflow + long const diff = as - xs; + return diff < std::numeric_limits::max() ? 
diff : std::numeric_limits::max(); + } +} } // anon namespace @@ -164,12 +394,14 @@ align_read(bam1_t * rec, GenotypePaths(core.flag, core.l_qseq)); // Hard restriction on read length is 63 bp (2*32 - 1) - if (seqan::length(seq) >= (2 * K - 1)) - { - find_genotype_paths_of_one_of_the_sequences(seq, geno_paths.first, ph_index); - find_genotype_paths_of_one_of_the_sequences(rseq, geno_paths.second, ph_index); - } - + if (seqan::length(seq) < (2 * K - 1)) + return geno_paths; + + find_genotype_paths_of_one_of_the_sequences(seq, geno_paths.first, ph_index); + find_genotype_paths_of_one_of_the_sequences(rseq, geno_paths.second, ph_index); + //uint8_t const score_diff = get_score_diff(rec); + //geno_paths.first.score_diff = score_diff; + //geno_paths.second.score_diff = score_diff; return geno_paths; } @@ -178,55 +410,80 @@ GenotypePaths * update_unpaired_read_paths(std::pair & geno_paths, bam1_t * rec) { assert(rec); - auto const & core = rec->core; switch (compare_pair_of_genotype_paths(geno_paths.first, geno_paths.second)) { case 1: { - GenotypePaths * geno = &geno_paths.first; - geno->flags = core.flag & ~IS_PROPER_PAIR; // clear proper pair bit - geno->mapq = core.qual; + GenotypePaths & geno = geno_paths.first; + geno.flags = core.flag & ~IS_PROPER_PAIR; // clear proper pair bit + geno.mapq = core.qual; if (!(core.flag & IS_UNMAPPED)) // True if read was mapped - geno->original_pos = core.pos; + geno.original_pos = core.pos; if (core.qual < 25) // True if MQ is below 25 - set_bit(geno->flags, IS_MAPQ_BAD); + set_bit(geno.flags, IS_MAPQ_BAD); + + { + long const clipped_count_bp = clipped_count(*rec); - if (is_clipped(*rec)) - set_bit(geno->flags, IS_CLIPPED); + if (clipped_count_bp > 3) + set_bit(geno.flags, IS_CLIPPED); + } + + geno.score_diff = get_score_diff(rec); #ifndef NDEBUG if (Options::instance()->stats.size() > 0) - geno->details->query_name = std::string(reinterpret_cast(rec->data)); + { + geno.details->query_name = std::string(reinterpret_cast(rec->data)); + geno.qual2.resize(core.l_qseq); + auto it = bam_get_qual(rec); + + for (int i = 0; i < core.l_qseq; ++it, ++i) + geno.qual2[i] = static_cast(*it + 33); + } #endif // NDEBUG - return geno; + return &geno; } case 2: { - GenotypePaths * geno = &geno_paths.second; - geno->flags = (core.flag ^ IS_SEQ_REVERSED) & ~IS_PROPER_PAIR; // clear proper pair bit - geno->mapq = core.qual; + GenotypePaths & geno = geno_paths.second; + geno.flags = (core.flag ^ IS_SEQ_REVERSED) & ~IS_PROPER_PAIR; // clear proper pair bit + geno.mapq = core.qual; if (!(core.flag & IS_UNMAPPED)) // True if read was mapped - geno->original_pos = core.pos; + geno.original_pos = core.pos; if (core.qual < 25) // True if MQ is below 25 - set_bit(geno->flags, IS_MAPQ_BAD); + set_bit(geno.flags, IS_MAPQ_BAD); + + { + long const clipped_count_bp = clipped_count(*rec); + + if (clipped_count_bp > 3) + set_bit(geno.flags, IS_CLIPPED); + } - if (is_clipped(*rec)) - set_bit(geno->flags, IS_CLIPPED); + geno.score_diff = get_score_diff(rec); #ifndef NDEBUG if (Options::instance()->stats.size() > 0) - geno->details->query_name = std::string(reinterpret_cast(rec->data)); + { + geno.details->query_name = std::string(reinterpret_cast(rec->data)); + geno.qual2.resize(core.l_qseq); + auto it = bam_get_qual(rec); + + for (int i = (core.l_qseq - 1); i >= 0; ++it, --i) + geno.qual2[i] = static_cast(*it + 33); + } #endif // NDEBUG - return geno; + return &geno; } default: break; @@ -237,11 +494,7 @@ update_unpaired_read_paths(std::pair & geno_paths, void 
-further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, - seqan::IupacString const & seq, - seqan::IupacString const & rseq, - bam1_t * rec - ) +further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, bam1_t * rec) { assert(rec); auto const & core = rec->core; @@ -249,7 +502,6 @@ further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, if ((geno.flags & IS_SEQ_REVERSED) == (core.flag & IS_SEQ_REVERSED)) { // seq reversed bit has not been flipped - geno.read2 = std::vector(seqan::begin(seq), seqan::end(seq)); geno.qual2.resize(core.l_qseq); auto it = bam_get_qual(rec); @@ -261,7 +513,6 @@ further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, } else { - geno.read2 = std::vector(seqan::begin(rseq), seqan::end(rseq)); geno.qual2.resize(core.l_qseq); auto it = bam_get_qual(rec); @@ -275,16 +526,8 @@ further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, } -#ifndef NDEBUG -void -update_paths(std::pair & geno_paths, - seqan::IupacString const & seq, - seqan::IupacString const & rseq, - bam1_t * rec) -#else void update_paths(std::pair & geno_paths, bam1_t * rec) -#endif { assert(rec); auto const & core = rec->core; @@ -305,10 +548,20 @@ update_paths(std::pair & geno_paths, bam1_t * rec) if (core.qual < 25) set_bit(geno1.flags, IS_MAPQ_BAD); - if (is_clipped(*rec)) { - set_bit(geno1.flags, IS_CLIPPED); - set_bit(geno2.flags, IS_CLIPPED); + long const clipped_count_bp = clipped_count(*rec); + + if (clipped_count_bp > 3) + { + set_bit(geno1.flags, IS_CLIPPED); + set_bit(geno2.flags, IS_CLIPPED); + } + } + + { + auto const score_diff = get_score_diff(rec); + geno1.score_diff = score_diff; + geno2.score_diff = score_diff; } geno2.flags = (core.flag ^ IS_SEQ_REVERSED) & ~IS_PROPER_PAIR; @@ -321,14 +574,12 @@ update_paths(std::pair & geno_paths, bam1_t * rec) geno1.details->query_name = std::string(reinterpret_cast(rec->data)); geno2.details->query_name = geno1.details->query_name; - geno1.read2 = std::vector(seqan::begin(seq), seqan::end(seq)); geno1.qual2.resize(core.l_qseq); auto it = bam_get_qual(rec); for (int i = 0; i < core.l_qseq; ++it, ++i) geno1.qual2[i] = static_cast(*it + 33); - geno2.read2 = std::vector(seqan::begin(rseq), seqan::end(rseq)); geno2.qual2 = std::vector(geno1.qual2.rbegin(), geno1.qual2.rend()); } #endif // NDEBUG @@ -336,11 +587,7 @@ update_paths(std::pair & geno_paths, bam1_t * rec) void -further_update_paths_for_discovery(std::pair & geno_paths, - seqan::IupacString const & seq, - seqan::IupacString const & rseq, - bam1_t * rec - ) +further_update_paths_for_discovery(std::pair & geno_paths, bam1_t * rec) { assert(rec); @@ -348,21 +595,12 @@ further_update_paths_for_discovery(std::pair & gen GenotypePaths & geno1 = geno_paths.first; GenotypePaths & geno2 = geno_paths.second; - assert(seqan::length(seq) > 0); - assert(static_cast(seqan::length(seq)) == core.l_qseq); - assert(seqan::length(seq) == seqan::length(rseq)); - assert(seqan::length(seq) == seqan::length(rseq)); - assert(seqan::length(seq) == geno1.read_length); - assert(seqan::length(seq) == geno2.read_length); - - geno1.read2 = std::vector(seqan::begin(seq), seqan::end(seq)); geno1.qual2.resize(core.l_qseq); auto it = bam_get_qual(rec); for (int i = 0; i < core.l_qseq; ++it, ++i) geno1.qual2[i] = static_cast(*it + 33); - geno2.read2 = std::vector(seqan::begin(rseq), seqan::end(rseq)); geno2.qual2 = std::vector(geno1.qual2.rbegin(), geno1.qual2.rend()); } diff --git a/src/typer/bucket.cpp b/src/typer/bucket.cpp new file mode 100644 index 
0000000..5851536 --- /dev/null +++ b/src/typer/bucket.cpp @@ -0,0 +1,207 @@ +#include +#include +#include +#include + +#include + +#include + +#include +#include +#include +#include + + +namespace gyper +{ + +std::string +Bucket::to_string() const +{ + std::ostringstream ss; + ss << " indel events " << "," << indel_events.size(); + ss << " n_reads=" << reads.size(); + + for (auto const & read : reads) + ss << "\n" << read.to_string(); + + return ss.str(); +} + + +bool +is_indel_in_bucket(std::vector const & buckets, + IndelEvent const & indel_event, + long const region_begin, + long const BUCKET_SIZE) +{ + long const event_bucket_index = (indel_event.pos - region_begin) / BUCKET_SIZE; + assert(event_bucket_index < static_cast(buckets.size())); + auto & indel_events = buckets[event_bucket_index].indel_events; + auto find_it = indel_events.find(indel_event); + return find_it != indel_events.end(); +} + + +template +Tindel_events::iterator +add_indel_event_to_bucket(std::vector & buckets, + IndelEvent && new_indel_event, + long const region_begin, + long const BUCKET_SIZE, + std::vector const & reference_sequence, + long ref_offset) +{ + long const REF_SIZE = reference_sequence.size(); + + assert(new_indel_event.pos >= region_begin); + long const event_bucket_index = (new_indel_event.pos - region_begin) / BUCKET_SIZE; + assert(event_bucket_index >= 0); + + if (event_bucket_index >= static_cast(buckets.size())) + buckets.resize(event_bucket_index + 1); + + assert(event_bucket_index < static_cast(buckets.size())); + + EventInfo new_info; + std::pair it_pair = + buckets[event_bucket_index].indel_events.insert({std::move(new_indel_event), std::move(new_info)}); + + assert(it_pair.first != buckets[event_bucket_index].indel_events.end()); + + if (it_pair.second) + { + IndelEvent const & new_event = it_pair.first->first; + EventInfo & new_event_info = it_pair.first->second; + + long span{0}; + long const count{static_cast(new_event.sequence.size())}; + + // Calculate span + if (new_event.type == 'I' || new_event.type == 'i') + { + // Check insterted sequence + while (span < count) + { + if ((ref_offset + span) >= REF_SIZE || + new_event.sequence[span] != reference_sequence[ref_offset + span]) + { + break; + } + + ++span; + } + + // Check further if we have matched the entire inserted sequence + if (span == count) + { + while ((ref_offset + span) < REF_SIZE) + { + if (reference_sequence[ref_offset + span - count] != reference_sequence[ref_offset + span]) + break; + + ++span; + } + } + + // Check for overflow + if ((span + 1) >= std::numeric_limits::max()) + span = std::numeric_limits::max() - 1; + + new_event_info.span = span + 1; // to 1-based system + } + else + { + assert(new_event.type == 'D' || new_event.type == 'd'); + + // Check after deleted sequence + while ((ref_offset + span) < REF_SIZE) + { + if (reference_sequence[ref_offset + span] != reference_sequence[ref_offset + span + count]) + break; + + ++span; + } + + // Check for overflow + if ((span + 1) >= std::numeric_limits::max()) + span = std::numeric_limits::max() - 1; + + new_event_info.span = span + 1; // to 1-based system + } + } + + return it_pair.first; +} + + +std::map::iterator +add_snp_event_to_bucket(std::vector & buckets, + SnpEvent && event, + long const region_begin, + long const BUCKET_SIZE) +{ + assert(event.pos >= region_begin); + long const event_bucket_index = (event.pos - region_begin) / BUCKET_SIZE; + assert(event_bucket_index >= 0); + + if (event_bucket_index >= static_cast(buckets.size())) + 
buckets.resize(event_bucket_index + 1); + + assert(event_bucket_index < static_cast(buckets.size())); + + SnpEventInfo new_info; + std::pair::iterator, bool> it_pair = + buckets[event_bucket_index].snps.insert({std::move(event), std::move(new_info)}); + + return it_pair.first; +} + + +/* +void +add_base_to_bucket(std::vector & buckets, + int32_t pos, + char seq, + char qual, + long const region_begin, + long const BUCKET_SIZE) +{ + // Any sensible compiler will optimize this + long const event_bucket_index = (pos - region_begin) / BUCKET_SIZE; + long const local_pos = (pos - region_begin) % BUCKET_SIZE; + + if (event_bucket_index >= static_cast(buckets.size())) + buckets.resize(event_bucket_index + 1); + + auto & bucket = buckets[event_bucket_index]; + + if (bucket.pileup.size() == 0) + bucket.pileup.resize(BUCKET_SIZE); + + assert(local_pos < static_cast(bucket.pileup.size())); + bucket.pileup[local_pos].add_base(seq, qual); +} +*/ + + +// explicit instantiation +template Tindel_events::iterator +add_indel_event_to_bucket(std::vector & buckets, + IndelEvent && new_indel_event, + long const region_begin, + long const BUCKET_SIZE, + std::vector const & reference_sequence, + long ref_offset); + +template Tindel_events::iterator +add_indel_event_to_bucket(std::vector & buckets, + IndelEvent && new_indel_event, + long const region_begin, + long const BUCKET_SIZE, + std::vector const & reference_sequence, + long ref_offset); + + +} // namespace gyper diff --git a/src/typer/caller.cpp b/src/typer/caller.cpp index 8215cfa..7e89ad8 100644 --- a/src/typer/caller.cpp +++ b/src/typer/caller.cpp @@ -1,10 +1,12 @@ #include // assert +#include // std::numeric_limits #include // std::unique_ptr #include // std::ostringstream #include // std::string #include // std::unordered_map #include // std::vector +#include #include #include @@ -14,8 +16,11 @@ #include // BOOST_LOG_TRIVIAL +#include + #include #include // gyper::absolute_pos +#include #include // gyper::load_graph #include #include @@ -23,10 +28,15 @@ #include // gyper::index (global) #include #include +#include #include +#include +#include #include // gyper::Primers +#include #include #include +#include #include // gyper::HtsParallelReader #include // gyper::HtsReader #include @@ -36,6 +46,40 @@ namespace { +std::string const debug_read_name = "simulated.452/2"; + + +bool +is_clipped(bam1_t const & b) +{ + if (b.core.n_cigar == 0) + return false; + + auto it = b.data + b.core.l_qname; + + // Check first + uint32_t opAndCnt; + memcpy(&opAndCnt, it, sizeof(uint32_t)); + + if ((opAndCnt & 15) == 4) + { + //std::cerr << "MIDNSHP=X*******"[opAndCnt & 15] << " first "; + return true; + } + + // Check last + memcpy(&opAndCnt, it + sizeof(uint32_t) * (b.core.n_cigar - 1), sizeof(uint32_t)); + + if ((opAndCnt & 15) == 4) + { + //std::cerr << "MIDNSHP=X*******"[opAndCnt & 15] << " last "; + return true; + } + + return false; +} + + void _read_rg_and_samples(std::vector & samples, std::vector > & vec_rg2sample_i, @@ -75,11 +119,12 @@ _determine_num_jobs_and_num_parts(long & jobs, { using namespace gyper; - jobs = Options::const_instance()->threads; + gyper::Options const & copts = *(Options::const_instance()); + jobs = copts.threads; num_parts = jobs; - long const MAX_FILES_OPEN = Options::const_instance()->max_files_open; + long const MAX_FILES_OPEN = copts.max_files_open > jobs ? 
copts.max_files_open : jobs; - if (jobs >= NUM_SAMPLES || jobs >= MAX_FILES_OPEN) + if (jobs >= NUM_SAMPLES) { // Special case where there are more threads than samples OR more threads than max files open. POOL_SIZE is 1 num_parts = std::min(NUM_SAMPLES, MAX_FILES_OPEN); @@ -105,6 +150,7 @@ namespace gyper std::vector call(std::vector const & hts_paths, + std::vector const & avg_cov_by_readlen, std::string const & graph_path, PHIndex const & ph_index, std::string const & output_dir, @@ -130,27 +176,35 @@ call(std::vector const & hts_paths, // Split hts_paths std::vector > > spl_hts_paths; + std::vector > > spl_avg_cov; assert(Options::const_instance()->max_files_open > 0); - long jobs = 1; // Running jobs - long num_parts = 1; // Number of parts to split the work into - long const NUM_SAMPLES = hts_paths.size(); + long jobs{1}; // Running jobs + long num_parts{1}; // Number of parts to split the work into + long const NUM_SAMPLES{static_cast(hts_paths.size())}; _determine_num_jobs_and_num_parts(jobs, num_parts, NUM_SAMPLES); { auto it = hts_paths.begin(); + auto cov_it = avg_cov_by_readlen.begin(); for (long i = 0; i < num_parts; ++i) { - auto end_it = it + NUM_SAMPLES / num_parts + (i < (NUM_SAMPLES % num_parts)); + long const advance = NUM_SAMPLES / num_parts + (i < (NUM_SAMPLES % num_parts)); + + auto end_it = it + advance; assert(std::distance(hts_paths.begin(), end_it) <= NUM_SAMPLES); spl_hts_paths.emplace_back(new std::vector(it, end_it)); it = end_it; + + auto cov_end_it = cov_it + advance; + spl_avg_cov.emplace_back(new std::vector(cov_it, cov_end_it)); + cov_it = cov_end_it; } } - BOOST_LOG_TRIVIAL(debug) << "[" << __HERE__ << "] Number of pools = " << spl_hts_paths.size(); + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Number of pools = " << spl_hts_paths.size(); long const NUM_POOLS = spl_hts_paths.size(); paths.resize(NUM_POOLS); @@ -166,6 +220,7 @@ call(std::vector const & hts_paths, call_station.add_work(parallel_reader_genotype_only, &paths[i], spl_hts_paths[i].get(), + spl_avg_cov[i].get(), &output_dir, &reference_fn, ®ion, @@ -180,6 +235,7 @@ call(std::vector const & hts_paths, parallel_reader_genotype_only, &paths[NUM_POOLS - 1], spl_hts_paths[NUM_POOLS - 1].get(), + spl_avg_cov[NUM_POOLS - 1].get(), &output_dir, &reference_fn, ®ion, @@ -243,10 +299,10 @@ find_variants_in_cigar(seqan::BamAlignmentRecord const & record, graph.ref_nodes[0].get_label().order : 0; - long read_pos = 0; + long read_pos{0}; std::string read(seqan::begin(record.seq), seqan::end(record.seq)); - long begin_clipping_size = 0; - bool is_clipped = false; + long begin_clipping_size{0}; + bool is_clipped{false}; std::string reference_seq; std::string read_seq; @@ -473,7 +529,7 @@ parallel_discover_from_cigar(std::string * output_ptr, if (!rg_tag) { - BOOST_LOG_TRIVIAL(error) << "[graphtyper::typer::caller] Unable to find RG tag in read."; + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Unable to find RG tag in read."; std::exit(1); } @@ -483,7 +539,7 @@ parallel_discover_from_cigar(std::string * output_ptr, if (find_rg_it == rg2sample_i.end()) { - BOOST_LOG_TRIVIAL(error) << "[graphtyper::typer::caller] Unable to find read group. " << read_group; + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Unable to find read group. 
" << read_group; std::exit(1); } @@ -514,9 +570,7 @@ parallel_discover_from_cigar(std::string * output_ptr, std::ostringstream variant_map_path; variant_map_path << output_dir << "/" << samples[0] << "_variant_map"; - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::caller] Writing variant map to '" - << variant_map_path.str(); - + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Writing variant map to '" << variant_map_path.str(); varmap.create_varmap_for_all(reference_depth); #ifndef NDEBUG @@ -570,7 +624,7 @@ discover_directly_from_bam(std::string const & graph_path, } long const NUM_POOLS = spl_hts_paths.size(); - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::caller] Number of pools = " << NUM_POOLS; + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Number of pools = " << NUM_POOLS; output_file_paths.resize(NUM_POOLS); // Run in parallel @@ -609,4 +663,1573 @@ discover_directly_from_bam(std::string const & graph_path, } +void +run_first_pass(bam1_t * hts_rec, + HtsReader & hts_reader, + long file_i, + std::vector & buckets, + std::map > & pool_snp_haplotypes, + long const BUCKET_SIZE, + long const region_begin, + std::vector const & reference_sequence) +{ + long const REF_SIZE{static_cast(reference_sequence.size())}; + int32_t global_max_pos_end{0}; + std::vector cov_up(REF_SIZE); + std::vector cov_down(REF_SIZE); + + while (true) + { + assert(hts_rec); + auto const & core = hts_rec->core; + + if (core.n_cigar == 0 || core.pos < region_begin) + { + hts_rec = hts_reader.get_next_read(hts_rec); + + if (!hts_rec) + break; + } + + std::array static constexpr CIGAR_MAP = {{ + 'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', 'B', '*', '*', '*', '*', '*', '*' + }}; + + auto it = hts_rec->data; + auto cigar_it = it + core.l_qname; + auto seq_it = cigar_it + (core.n_cigar << 2); + + // Index of first aligned read base in read sequence + long read_offset{0}; + + // Index of first aligned read base in reference_sequence + long ref_offset{static_cast(core.pos) - region_begin}; + assert(ref_offset >= 0); + + // Get reference to bucket + long const bucket_id{ref_offset / BUCKET_SIZE}; + + // While experimenting say that id is also index, then go back to: + // long const bucket_index = bucket_id % NUM_BUCKETS; + long const bucket_index{bucket_id}; + + if (bucket_index >= static_cast(buckets.size())) + buckets.resize(bucket_index + 1); + + assert(bucket_index >= 0); + assert(bucket_index < static_cast(buckets.size())); + long const N_CIGAR = core.n_cigar; + + if (ref_offset >= REF_SIZE) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Unexpected ref_offset = " << ref_offset; + std::exit(1); + } + + Read read; + read.name = reinterpret_cast(it); + //read.mate_pos = static_cast(core.mpos); + + if ((core.flag & IS_FIRST_IN_PAIR) != 0u) + read.name.append("/1"); + else + read.name.append("/2"); + + read.flags = core.flag; + read.sequence.resize(core.l_qseq); + + for (int i = 0; i < core.l_qseq; ++i) + read.sequence[i] = seq_nt16_str[bam_seqi(seq_it, i)]; + + + auto const qual_it = bam_get_qual(hts_rec); + std::vector::iterator> snp_events; + bool const is_read_clipped = is_clipped(*hts_rec); + + for (long i{0}; i < N_CIGAR; ++i) + { + uint32_t opAndCnt; + memcpy(&opAndCnt, cigar_it, sizeof(uint32_t)); + cigar_it += sizeof(uint32_t); + + char const cigar_operation = CIGAR_MAP[opAndCnt & 15]; + uint32_t const cigar_count = opAndCnt >> 4; + assert(cigar_count > 0); + + switch (cigar_operation) + { + case 'M': + case '=': // '=' and 'X' are typically not used by aligners (at least not by default), + case 'X': // but we keep it here just 
in case + { + assert((ref_offset + cigar_count - 1l) < REF_SIZE); + //auto ref_it = reference_sequence.begin() + ref_offset; + + for (long r{0}; r < cigar_count; ++r) + { + long const ref_pos = ref_offset + r; + char const ref = reference_sequence[ref_pos]; + long const read_pos = read_offset + r; + char const read_base = read.sequence[read_pos]; + + if (read_base == ref || + (ref != 'A' && ref != 'C' && ref != 'G' && ref != 'T') || + (read_base != 'A' && read_base != 'C' && read_base != 'G' && read_base != 'T')) + { + continue; + } + + SnpEvent new_snp_event(ref_pos, read_base); + + std::map::iterator snp_it = + add_snp_event_to_bucket(buckets, std::move(new_snp_event), region_begin, BUCKET_SIZE); + + SnpEventInfo & info = snp_it->second; + uint8_t const qual = *(qual_it + read_pos); + + if (qual >= 25) + ++info.hq_count; + else + ++info.lq_count; + + //if (qual > info.max_qual) + // info.max_qual = qual; + + if (core.qual != 255 && core.qual > info.max_mapq) + info.max_mapq = core.qual; + + info.proper_pairs += ((read.flags & IS_PROPER_PAIR) != 0); + info.first_in_pairs += ((read.flags & IS_FIRST_IN_PAIR) != 0); + info.sequence_reversed += ((read.flags & IS_SEQ_REVERSED) != 0); + info.clipped += is_read_clipped; + + if (info.uniq_pos1 == -1) + { + info.uniq_pos1 = core.pos; + } + else if (info.uniq_pos2 == -1) + { + if (info.uniq_pos1 != core.pos) + { + info.uniq_pos2 = core.pos; + } + } + else if (info.uniq_pos3 == -1 && info.uniq_pos2 != core.pos) + { + assert(info.uniq_pos1 != core.pos); // should not happend if input is sorted + info.uniq_pos3 = core.pos; + } + + long const max_distance = std::min(read_pos, core.l_qseq - 1 - read_pos); + assert(max_distance >= 0); + + if (max_distance > info.max_distance) + info.max_distance = max_distance; + + snp_events.push_back(snp_it); + } + + read_offset += cigar_count; + ref_offset += cigar_count; + break; + } + + case 'I': + { + assert(cigar_count > 0); + + auto end_it = (read_offset + cigar_count) < static_cast(read.sequence.size()) ? 
+ read.sequence.begin() + read_offset + cigar_count : + read.sequence.end(); + + std::vector event_sequence(read.sequence.begin() + read_offset, end_it); + IndelEvent new_event = make_insertion_event(region_begin + ref_offset, + std::move(event_sequence)); + + auto indel_event_it = add_indel_event_to_bucket(buckets, + std::move(new_event), + region_begin, + BUCKET_SIZE, + reference_sequence, + ref_offset); + + if (read.name == debug_read_name) + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " new ins=" << indel_event_it->first.to_string(); + + ++indel_event_it->second.count; + read_offset += cigar_count; + break; + } + + case 'D': + { + assert(cigar_count > 0); + + IndelEvent new_event = + make_deletion_event(reference_sequence, ref_offset, region_begin + ref_offset, cigar_count); + + auto indel_event_it = add_indel_event_to_bucket(buckets, + std::move(new_event), + region_begin, + BUCKET_SIZE, + reference_sequence, + ref_offset); + + if (read.name == debug_read_name) + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " new del=" << indel_event_it->first.to_string(); + + ++indel_event_it->second.count; + ref_offset += cigar_count; + break; + } + + case 'S': + { + read_offset += cigar_count; + break; + } // else 'H' or 'P' which move neither reference nor read + } + } + + for (long e{1}; e < static_cast(snp_events.size()); ++e) + { + auto const & event = snp_events[e]->first; + + // Add event to previous snp events + for (long prev{0}; prev < e; ++prev) + { + auto & prev_info = snp_events[prev]->second; + auto insert_it = prev_info.phase.insert({event, 0}); + ++insert_it.first->second; + } + } + + read.alignment.pos = static_cast(core.pos); + read.alignment.pos_end = region_begin + std::min(ref_offset, REF_SIZE - 1); + + assert((read.alignment.pos - region_begin) < static_cast(cov_up.size())); + ++cov_up[read.alignment.pos - region_begin]; + + assert((read.alignment.pos_end - region_begin) < static_cast(cov_down.size())); + ++cov_down[read.alignment.pos_end - region_begin]; + + auto & bucket = buckets[bucket_index]; + int32_t const end_with_clip = read.alignment.pos_end + read.alignment.num_clipped_end; + + if (end_with_clip > bucket.max_pos_end) + { + bucket.max_pos_end = end_with_clip; + global_max_pos_end = std::max(global_max_pos_end, end_with_clip); + } + + bucket.global_max_pos_end = global_max_pos_end; + hts_rec = hts_reader.get_next_read(hts_rec); + + if (!hts_rec) + break; + } + + long const NUM_BUCKETS{static_cast(buckets.size())}; + + auto update_coverage = + [&cov_up, &cov_down](long & cov, long const pos, long const b, long const BUCKET_SIZE) -> void + { + long offset{pos + 1}; + + if (offset > b * BUCKET_SIZE) + { + offset = b * BUCKET_SIZE; + + // Add coverage until naive begin + while (offset <= pos) + { + cov += (static_cast(cov_up[offset]) - static_cast(cov_down[offset])); + ++offset; + } + } + }; + + // Remove SNPs with low support + { + long depth{0}; + + for (long b{0}; b < NUM_BUCKETS; ++b) + { + auto & bucket = buckets[b]; + + for (auto snp_it = bucket.snps.begin(); snp_it != bucket.snps.end(); /*++it*/) + { + SnpEvent const & snp = snp_it->first; + SnpEventInfo const & info = snp_it->second; + + long cov{depth}; + long const begin = std::max(0l, snp.pos - region_begin); + update_coverage(cov, begin, b, BUCKET_SIZE); + + if (info.has_good_support(cov)) + ++snp_it; + else + snp_it = bucket.snps.erase(snp_it); + } + + // Add coverage from bucket b + assert(b * BUCKET_SIZE < REF_SIZE); + long offset{b * BUCKET_SIZE}; + long const end_offset = std::min(REF_SIZE, (b + 1) * BUCKET_SIZE); 
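The cov_up/cov_down pair used throughout run_first_pass is a start/end counter sweep: each read increments cov_up at its start offset and cov_down at its end offset, and the per-bucket loops recover depth by accumulating the difference from the left (the update_coverage lambda does the same thing incrementally up to a single position). A minimal standalone sketch of that idea, with hypothetical names not taken from the patch:

#include <cstddef>
#include <vector>

// Recover per-position depth from start/end counters, assuming each read has
// incremented starts[read_begin] and ends[read_end] exactly once.
std::vector<long> depth_from_counters(std::vector<int> const & starts,
                                      std::vector<int> const & ends)
{
  std::vector<long> depth(starts.size(), 0);
  long running{0};

  for (std::size_t pos = 0; pos < starts.size(); ++pos)
  {
    running += static_cast<long>(starts[pos]) - static_cast<long>(ends[pos]);
    depth[pos] = running; // reads that begin at or before pos and end after pos
  }

  return depth;
}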
+ + while (offset < end_offset) + { + depth += static_cast(cov_up[offset]) - static_cast(cov_down[offset]); + ++offset; + } + } + } + + long depth{0}; + + // Find HQ SNPs and indels with good enough support to allow realignments + for (long b{0}; b < NUM_BUCKETS; ++b) + { + auto & bucket = buckets[b]; + + // Check SNP phase + for (auto snp_it = bucket.snps.begin(); snp_it != bucket.snps.end(); ++snp_it) + { + SnpEvent const & snp = snp_it->first; + SnpEventInfo const & info = snp_it->second; + + long const begin = std::max(0l, snp.pos - region_begin); + long cov{depth}; // starting coverage depth + + auto it_bool = pool_snp_haplotypes.insert({snp, std::set()}); + std::set & snp_haplotypes = it_bool.first->second; + + assert(begin >= b * BUCKET_SIZE); + update_coverage(cov, begin, b, BUCKET_SIZE); + + //BOOST_LOG_TRIVIAL(info) << __HERE__ + // << " snp has good support @ " << snp.pos + // << " base=" << snp.base + // << " cov=" << cov + // << " INFO=" << info.to_string(); + + auto is_good_support = + [&cov_down, ®ion_begin](long local_cov, + long local_offset, + std::map::const_iterator find_it) -> bool + { + long const end = std::max(0l, static_cast(find_it->first.pos) - region_begin); + + // reduce coverage by reads who do not overlap the whole interval + while (local_offset <= end) + { + local_cov -= static_cast(cov_down[local_offset]); + ++local_offset; + } + + //BOOST_LOG_TRIVIAL(info) << __HERE__ << " pos=" + // << find_it->first.pos << " base=" + // << find_it->first.base << " support=" + // << find_it->second << "/" << local_cov; + + // Make sure there is some coverage + if (local_cov <= 0) + local_cov = 1; + + if ((static_cast(find_it->second) / static_cast(local_cov)) < 0.22) + { + return false; + } + + return true; + }; + + // check this bucket + for (auto it2 = std::next(snp_it); it2 != bucket.snps.end(); ++it2) + { + auto find_it = info.phase.find(it2->first); + + if (find_it != info.phase.end()) + { + assert(find_it->first.pos > snp.pos); + bool const is_good = is_good_support(cov, begin + 1, find_it); + + if (is_good) + { + //BOOST_LOG_TRIVIAL(info) << __HERE__ << " GOOD SUPPORT"; + snp_haplotypes.insert(find_it->first); + } + //else + //{ + // BOOST_LOG_TRIVIAL(info) << __HERE__ << " BAD SUPPORT"; + //} + } + } + + // check next bucket + if (b + 1 < NUM_BUCKETS) + { + auto const & next_bucket = buckets[b + 1]; + + for (auto it2 = next_bucket.snps.begin(); it2 != next_bucket.snps.end(); ++it2) + { + auto find_it = info.phase.find(it2->first); + + if (find_it == info.phase.end()) + continue; + + assert(find_it->first.pos > snp.pos); + long const MAX_PHASE_DISTANCE{BUCKET_SIZE}; + + if (find_it->first.pos > (snp.pos + MAX_PHASE_DISTANCE)) + { + //BOOST_LOG_TRIVIAL(info) << __HERE__ << " Too far away " + // << snp.pos << " + " << BUCKET_SIZE << " < " << find_it->first.pos; + continue; // I can break here? 
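The phase check above reduces to a support-ratio test: a later SNP is kept as phased with the current one only if the reads carrying both alleles make up at least roughly 22% of the coverage spanning the whole interval between them, and only within one bucket's distance (MAX_PHASE_DISTANCE). A compact restatement of that test as a free function; the name is hypothetical and the 0.22 threshold is copied from the is_good_support lambda above:

// pair_support: reads observed carrying both SNP alleles (the phase map count).
// spanning_cov: reads overlapping the whole interval between the two SNPs.
bool is_phase_supported(long pair_support, long spanning_cov)
{
  if (spanning_cov <= 0)
    spanning_cov = 1; // make sure there is some coverage, as in the lambda above

  return (static_cast<double>(pair_support) / static_cast<double>(spanning_cov)) >= 0.22;
}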
+ } + + bool const is_good = is_good_support(cov, begin + 1, find_it); + + if (is_good) + { + //BOOST_LOG_TRIVIAL(info) << __HERE__ << " GOOD SUPPORT"; + snp_haplotypes.insert(find_it->first); + //snp_haplotypes.push_back(find_it); + } + //else + //{ + // BOOST_LOG_TRIVIAL(info) << __HERE__ << " BAD SUPPORT"; + //} + } + } + } + + // INDELS + for (Tindel_events::iterator it = bucket.indel_events.begin(); it != bucket.indel_events.end(); /*++it*/) + { + assert(!it->second.has_realignment_support); + + IndelEvent const & indel = it->first; + EventInfo const & indel_info = it->second; + + long const naive_pad = 4.0 + static_cast(indel.sequence.size()) / 3.0; + long const naive_begin = std::max(0l, indel.pos - naive_pad - region_begin); + long const naive_end = std::min(REF_SIZE, indel.pos + indel_info.span + naive_pad - region_begin); + + double const correction = indel.type == 'I' ? + static_cast(indel.sequence.size() / 2.0 + 8.0) / 8.0 : + static_cast(indel.sequence.size() / 3.0 + 10.0) / 10.0; + + double const count = correction * indel_info.count; + + //BOOST_LOG_TRIVIAL(info) << "Indel [pos,pos+span], size [begin,end]: [" << region_begin << " " + // << indel.pos << "," << (indel.pos + indel_info.span) + // << "] " << indel.sequence.size() << " " << correction + // << " [" << naive_begin << "," << naive_end << "]" + // << " count=" << count << " depth=" << depth; + + long cov{depth}; // starting coverage depth + long offset{naive_begin}; + + // Fix coverage starting before here + if (offset <= b * BUCKET_SIZE) + { + while (offset < b * BUCKET_SIZE) + { + cov -= (static_cast(cov_up[offset]) - static_cast(cov_down[offset])); + ++offset; + } + } + else + { + offset = b * BUCKET_SIZE; + + // Add coverage until naive begin + while (offset < naive_begin) + { + cov += (static_cast(cov_up[offset]) - static_cast(cov_down[offset])); + ++offset; + } + } + + // reduce coverage by reads who do not overlap the whole naive interval + while (offset <= naive_end) + { + cov -= static_cast(cov_down[offset]); + ++offset; + } + + uint32_t log_qual{0}; + + { + double const corrected_cov{std::max(static_cast(cov), count)}; + double const anti_count_d{corrected_cov - count}; + log_qual = get_log_qual_double(count, anti_count_d, 10.0); + } + + if (indel_info.count >= 2 && count >= 4.0 && log_qual >= 30) + { + //BOOST_LOG_TRIVIAL(info) << __HERE__ << " Indel has good support."; + it->second.has_good_support = true; + it->second.has_realignment_support = true; + it->second.max_log_qual = log_qual; + it->second.max_log_qual_file_i = file_i; + + ++it; + } + else if (log_qual > 0) + { + // Realignment support, meaning low support but needs realignment to confirm/deny + //BOOST_LOG_TRIVIAL(info) << __HERE__ << " Indel has realignment support."; + assert(it->second.has_good_support == false); + it->second.has_realignment_support = true; + it->second.max_log_qual = log_qual; + it->second.max_log_qual_file_i = file_i; + ++it; + } + else + { + // erase indel if it is not good enough + //BOOST_LOG_TRIVIAL(info) << __HERE__ << " erasing indel=" << indel.to_string(); + it = bucket.indel_events.erase(it); + } + } + + // Add coverage from bucket b + assert(b * BUCKET_SIZE < REF_SIZE); + long offset{b * BUCKET_SIZE}; + long const end_offset = std::min(REF_SIZE, (b + 1) * BUCKET_SIZE); + + while (offset < end_offset) + { + depth += static_cast(cov_up[offset]) - static_cast(cov_down[offset]); + ++offset; + } + } // for (long b{0}; b < static_cast(buckets.size()); ++b) +} + + +void +realign_to_indels(std::vector const & 
realignment_indels, + std::vector & buckets, + long const BUCKET_SIZE, + long const region_begin, + std::vector const & reference_sequence) +{ + long const REF_SIZE = reference_sequence.size(); + long const PAD{50}; // TODO make an option + long const MAX_READ_SIZE{151}; + using Tuint = uint16_t; + paw::AlignmentOptions opts; + opts.set_match(SCORE_MATCH).set_mismatch(SCORE_MISMATCH); + opts.set_gap_open(SCORE_GAP_OPEN).set_gap_extend(SCORE_GAP_EXTEND); + opts.left_column_free = true; + opts.right_column_free = true; + long _alignment_counter{0}; + + // Realign reads and add reference support + for (Tindel_events::iterator indel_it : realignment_indels) + { + IndelEvent const & indel = indel_it->first; + EventInfo const & indel_info = indel_it->second; + long const indel_span = indel.pos + indel_info.span; + + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Realignment to indel=" << indel.to_string() << " span=" + << indel_info.span; + + //create reference sequence with the indel + long const begin_padded = std::max(0l, indel.pos - MAX_READ_SIZE - 2 * PAD - region_begin); + assert(begin_padded < REF_SIZE); + long const end_padded = indel.pos + MAX_READ_SIZE + 2 * PAD - region_begin; + + auto end_it = end_padded >= REF_SIZE ? + reference_sequence.end() : + reference_sequence.begin() + end_padded; + + std::vector new_ref(reference_sequence.begin() + begin_padded, end_it); + std::vector ref_pos(new_ref.size()); + std::iota(ref_pos.begin(), ref_pos.end(), 0); + //BOOST_LOG_TRIVIAL(info) << "New ref before:\n" << std::string(new_ref.begin(), new_ref.end()); + + { + bool const is_applied = apply_indel_event(new_ref, ref_pos, indel, begin_padded + region_begin); + assert(is_applied); + + if (!is_applied) + continue; + } + + //BOOST_LOG_TRIVIAL(info) << "New ref after:\n" << std::string(new_ref.begin(), new_ref.end()); + assert(buckets.size() > 0); + long b = begin_padded / BUCKET_SIZE; + long b_end = std::min(static_cast(buckets.size()) - 1, end_padded / BUCKET_SIZE); + assert(b < static_cast(buckets.size())); + + std::vector const new_ref_stable(new_ref); + std::vector const ref_pos_stable(ref_pos); + + // Find the smallest bucket that might contain reads that are overlapping the indel + while (b > 0 && buckets[b].global_max_pos_end > (indel.pos - PAD)) + --b; + + // Check any reads that might overlap the indel + for ( ; b <= b_end; ++b) + { + assert(b >= 0); + assert(b < static_cast(buckets.size())); + auto & bucket = buckets[b]; + + if (bucket.max_pos_end <= (indel.pos - PAD)) + continue; + + for (auto & read : bucket.reads) + { + if (read.alignment.pos < 0 || read.sequence.size() == 0) + continue; + + // If the variant already has the indel then we can just update the score and skip the realignment + if (read.alignment.has_indel_event(indel_it)) + continue; + + if ((read.alignment.num_clipped_end == 0 && read.alignment.pos_end < static_cast(indel.pos)) || + (read.alignment.pos_end + read.alignment.num_clipped_end + + std::min(static_cast(read.alignment.num_clipped_end), PAD) < indel.pos) || + (read.alignment.num_clipped_begin == 0 && read.alignment.pos > indel_span) || + (read.alignment.pos - read.alignment.num_clipped_begin - + std::min(static_cast(read.alignment.num_clipped_begin), PAD) > indel_span)) + { + continue; + } + + //if (read.alignment.score == static_cast(read.sequence.size())) // Cannot improve score + // continue; // I cannot make this optimization unless I also add anti-support + assert(new_ref == new_ref_stable); + assert(ref_pos == ref_pos_stable); + bool reset_new_ref{false}; + 
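The decision logic a little further down compares the read's previous alignment score with its score against the indel-modified reference: a strictly better score re-assigns the read to the indel (support), a strictly worse score is recorded as anti-support, and an equal score is recorded as multi-support only when the alignment still spans the indel position. A minimal restatement of that classification; the enum and function name are hypothetical stand-ins for the patch's READ_ANTI_SUPPORT/READ_MULTI_SUPPORT bookkeeping:

enum class RealignOutcome { support, anti_support, multi_support, unchanged };

RealignOutcome classify_realignment(long old_score, long new_score, bool alignment_spans_indel)
{
  if (new_score > old_score)
    return RealignOutcome::support;      // read is moved onto the indel haplotype

  if (new_score < old_score)
    return RealignOutcome::anti_support; // read prefers the haplotype without this indel

  return alignment_spans_indel ? RealignOutcome::multi_support // same score either way
                               : RealignOutcome::unchanged;
}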
std::vector applied_events{}; + applied_events.push_back({0, indel_it}); + + for (auto it = read.alignment.indel_events.cbegin(); it != read.alignment.indel_events.cend(); ++it) + { + IndelEvent const & indel_event = it->event_it->first; + EventInfo const & event_info = it->event_it->second; + + if (event_info.has_realignment_support) + { + bool ret = apply_indel_event(new_ref, ref_pos, indel_event, begin_padded + region_begin, false); + + if (ret) + { + if (read.name == debug_read_name) + { + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Applied event " << it->event_it->first.to_string(); + } + + applied_events.push_back({0, it->event_it}); + reset_new_ref = true; + } + else + { + if (read.name == debug_read_name) + { + apply_indel_event(new_ref, ref_pos, indel_event, begin_padded + region_begin, true); + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Could not apply event " << it->event_it->first.to_string(); + } + + applied_events.push_back({READ_ANTI_SUPPORT, it->event_it}); + } + } + } + + long const old_score = read.alignment.score; + + // TODO write greedy alignment + paw::global_alignment(read.sequence, new_ref, opts); + ++_alignment_counter; + assert(opts.get_alignment_results()); + auto const clip = opts.get_alignment_results()->apply_clipping(read.sequence, + new_ref, + opts.get_match(), + opts.get_mismatch(), + opts.get_gap_open(), + opts.get_gap_extend(), + opts.get_clip()); + + paw::AlignmentResults const & ar = *opts.get_alignment_results(); + auto p = ar.get_database_begin_end(read.sequence, new_ref); + + if (p.first == 0 || p.second == ar.database_end) + { + auto aligned_strings = ar.get_aligned_strings(read.sequence, new_ref); + + BOOST_LOG_TRIVIAL(warning) << __HERE__ << " Insufficient padding in read alignment, skipping read. " + << read.to_string() << "\n" + << aligned_strings.first << '\n' + << aligned_strings.second; + + if (reset_new_ref) + { + new_ref = new_ref_stable; + ref_pos = ref_pos_stable; + } + + continue; + } + + if (read.name == debug_read_name) + { + //if (clip.first > 0 || clip.second < static_cast(read.sequence.size())) + { + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " name=" << read.name << " pos=" << read.alignment.pos + << " begin,end_clip=" << clip.first << "," << clip.second + << " old_score=" << old_score << " new_score=" << ar.score; + } + } + + if (ar.score <= old_score) + { + // If the score is worse and the sequence is not clipped it is anti-support + if (ar.score < old_score) + { + if (read.name == debug_read_name) + { + auto aligned_strings = ar.get_aligned_strings(read.sequence, new_ref); + + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " ANTI old_score, new_score, read = " << old_score << ", " + << ar.score << ", " + << read.name << '\n' << aligned_strings.first << '\n' + << aligned_strings.second; + } + + read.alignment.add_indel_event(READ_ANTI_SUPPORT, indel_it); + } + else if (indel_it->first.pos >= (ref_pos[p.first] + begin_padded) && + indel_it->first.pos <= (ref_pos[p.second] + begin_padded)) + { + assert(ar.score == old_score); + + if (read.name == debug_read_name) + { + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " SAME SCORE " << old_score + << " read=" << read.to_string(); + } + + read.alignment.add_indel_event(READ_MULTI_SUPPORT, indel_it); + } + + if (reset_new_ref) + { + new_ref = new_ref_stable; + ref_pos = ref_pos_stable; + } + + continue; + } + + // We got a better score + read.alignment.replace_indel_events(std::move(applied_events)); + + // Update alignment + read.alignment.pos = ref_pos[p.first] + begin_padded; + read.alignment.pos_end = 
ref_pos[p.second] + begin_padded; + read.alignment.score = ar.score; + read.alignment.num_clipped_begin = clip.first; + read.alignment.num_clipped_end = static_cast(read.sequence.size()) - clip.second; + + // count number of inserted bases at the beginning of the read + { + long num_ins{0}; + + while (p.first + num_ins + 1 < static_cast(ref_pos.size()) && + ref_pos[p.first + num_ins] == ref_pos[p.first + num_ins + 1]) + { + ++num_ins; + } + + read.alignment.num_ins_begin = num_ins; + } + + if (read.name == debug_read_name) + { + auto aligned_strings = ar.get_aligned_strings(read.sequence, new_ref); + BOOST_LOG_TRIVIAL(info) << __HERE__ << " score=" << ar.score << "\n" + << aligned_strings.first << "\n" << aligned_strings.second; + } + + // Generate and edit script. No normalize and ignore indels at ends + /* + std::set edits = paw::get_edit_script(aligned_strings, false, true); + long const num_edits = edits.size(); + + auto it = edits.begin(); + long gaps{0}; + + for (long e{0}; e < num_edits; ++e, ++it) + { + // Skip first and last edit (they are padding events) + assert(it != edits.end()); + paw::Event2 const & edit = *it; + + if (read.name == debug_read_name) + { + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " e=" << e << " pos=" << edit.pos << " pos_q=" + << edit.pos_q << " ref=" + << edit.ref << " alt=" << edit.alt << " ref_pos=" + << (begin_padded + gaps + edit.pos); + } + + // Skip SNPs + if (edit.alt.size() == edit.ref.size()) + continue; + + IndelEvent new_indel; + assert((gaps + edit.pos) < static_cast(ref_pos.size())); + + if (edit.alt.size() > edit.ref.size()) + { + assert(edit.ref.size() == 0); + + new_indel.pos = (begin_padded + ref_pos[edit.pos]); + new_indel.type = 'i'; // second level insertion + new_indel.sequence = std::vector(edit.alt.begin(), edit.alt.end()); + } + else + { + assert(edit.alt.size() == 0); + + new_indel.pos = (begin_padded + ref_pos[edit.pos]); + new_indel.type = 'd'; + new_indel.sequence = std::vector(edit.ref.begin(), edit.ref.end()); + } + + // the new indel needs to be in the region + if (new_indel.pos >= region_begin) + { + long const read_offset{edit.pos}; + long const ref_offset{new_indel.pos - region_begin}; + + auto indel_event_it = + add_indel_event_to_bucket(buckets, + std::move(new_indel), + region_begin, + BUCKET_SIZE, + reference_sequence, + ref_offset); + + read.alignment.add_indel_event(read_offset, indel_event_it); + } + + gaps += edit.alt.size() - edit.ref.size(); + } + //*/ + + if (reset_new_ref) + { + new_ref = new_ref_stable; + ref_pos = ref_pos_stable; + } + } + } + } + + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " alignment counter = " << _alignment_counter; + + // Print counts etc + for (Tindel_events::iterator indel_it : realignment_indels) + { + IndelEvent const & indel = indel_it->first; + EventInfo & indel_info = indel_it->second; + + if (indel_info.has_good_support) + continue; + + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Realignment results for indel=" << indel.to_string() << " " + << indel_info.count << "," << indel_info.anti_count << " MD=" << indel_info.multi_count + << " log_qual=" << indel_info.log_qual(10); + + // Check if it is good after realignment + if (indel_info.log_qual(10) > 30) + indel_info.has_good_support = true; + } +} + + +void +read_hts_and_return_realignment_indels(bam1_t * hts_rec, + HtsReader & hts_reader, + std::vector & buckets, + long const BUCKET_SIZE, + long const region_begin, + std::vector const & reference_sequence) +{ + long const REF_SIZE{static_cast(reference_sequence.size())}; + int32_t 
global_max_pos_end{0}; + + while (true) + { + assert(hts_rec); + auto const & core = hts_rec->core; + + if (core.n_cigar == 0 || core.pos < region_begin) + { + hts_rec = hts_reader.get_next_read(hts_rec); + + if (!hts_rec) + break; + } + + std::array static constexpr CIGAR_MAP = {{ + 'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', 'B', '*', '*', '*', '*', '*', '*' + }}; + + auto it = hts_rec->data; + auto cigar_it = it + core.l_qname; + auto seq_it = cigar_it + (core.n_cigar << 2); + + // Index of first aligned read base in read sequence + long read_offset{0}; + + // Index of first aligned read base in reference_sequence + long ref_offset{static_cast(core.pos) - region_begin}; + + // Get reference to bucket + long const bucket_id{ref_offset / BUCKET_SIZE}; + + // While experimenting say that id is also index, then go back to: + // long const bucket_index = bucket_id % NUM_BUCKETS; + long const bucket_index{bucket_id}; + + assert(bucket_index < static_cast(buckets.size())); + //if (bucket_index >= static_cast(buckets.size())) + // buckets.resize(bucket_index + 1); + + long const N_CIGAR = core.n_cigar; + + if (ref_offset < 0 || ref_offset >= REF_SIZE) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Unexpected ref_offset = " << ref_offset; + std::exit(1); + } + + Read read; + read.name = reinterpret_cast(it); + + if ((core.flag & IS_FIRST_IN_PAIR) != 0u) + read.name.append("/1"); + else + read.name.append("/2"); + + read.mate_pos = static_cast(core.mpos); + read.flags = core.flag; + read.mapq = core.qual; + + read.sequence.resize(core.l_qseq); + + for (int i = 0; i < core.l_qseq; ++i) + read.sequence[i] = seq_nt16_str[bam_seqi(seq_it, i)]; + + read.qual.resize(core.l_qseq); + + { + auto qual_it = bam_get_qual(hts_rec); + + for (int i = 0; i < core.l_qseq; ++qual_it, ++i) + read.qual[i] = static_cast(*qual_it); + } + + read.alignment.score = 0; + + for (long i{0}; i < N_CIGAR; ++i) + { + uint32_t opAndCnt; + memcpy(&opAndCnt, cigar_it, sizeof(uint32_t)); + cigar_it += sizeof(uint32_t); + + char const cigar_operation = CIGAR_MAP[opAndCnt & 15]; + uint32_t const count = opAndCnt >> 4; + assert(count > 0); + + switch (cigar_operation) + { + case 'M': + case '=': // '=' and 'X' are typically not used by aligners (at least not by default), + case 'X': // but we keep it here just in case + { + assert(ref_offset < REF_SIZE); + auto ref_it = reference_sequence.begin() + ref_offset; + + for (long r{0}; r < count; ++r, ++ref_it) + { + char const alt = read.sequence[r + read_offset]; + + if (alt != *ref_it && alt != 'N' && *ref_it != 'N') + { + read.alignment.score -= SCORE_MISMATCH; + } + else + { + read.alignment.score += SCORE_MATCH; + } + } + + read_offset += count; + ref_offset += count; + break; + } + + case 'I': + { + assert(count > 0); + + auto end_it = (read_offset + count) < static_cast(read.sequence.size()) ? 
+ read.sequence.begin() + read_offset + count : + read.sequence.end(); + + std::vector event_sequence(read.sequence.begin() + read_offset, end_it); + IndelEvent new_event = make_insertion_event(region_begin + ref_offset, + std::move(event_sequence)); + + auto indel_event_it = + add_indel_event_to_bucket(buckets, + std::move(new_event), + region_begin, + BUCKET_SIZE, + reference_sequence, + ref_offset); + + //indel_event_it->second.has_support_in_file = true; + + if (!indel_event_it->second.has_realignment_support) + read.alignment.score -= (SCORE_GAP_OPEN + (count - 1) * SCORE_GAP_EXTEND); + else + read.alignment.score += SCORE_MATCH * count; + + read.alignment.add_indel_event(read_offset, indel_event_it); + read_offset += count; + break; + } + + case 'D': + { + assert(count > 0); + + IndelEvent new_event = + make_deletion_event(reference_sequence, ref_offset, region_begin + ref_offset, count); + + auto indel_event_it = + add_indel_event_to_bucket(buckets, + std::move(new_event), + region_begin, + BUCKET_SIZE, + reference_sequence, + ref_offset); + + //indel_event_it->second.has_support_in_file = true; + + if (!indel_event_it->second.has_realignment_support) + read.alignment.score -= (SCORE_GAP_OPEN + (count - 1) * SCORE_GAP_EXTEND); + + read.alignment.add_indel_event(read_offset, indel_event_it); + ref_offset += count; + break; + } + + case 'S': + { + read_offset += count; + set_bit(read.flags, IS_CLIPPED); + read.alignment.score -= SCORE_CLIP; + + if (i == 0) + { + read.alignment.num_clipped_begin = count; + } + else + { + assert(i + 1 == N_CIGAR); + read.alignment.num_clipped_end = count; + } + + break; + } // else 'H' or 'P' which move neither reference nor read + } + } + + read.alignment.pos = static_cast(core.pos); + read.alignment.pos_end = region_begin + ref_offset; + auto & bucket = buckets[bucket_index]; + int32_t const end_with_clip = read.alignment.pos_end + read.alignment.num_clipped_end; + + if (end_with_clip > bucket.max_pos_end) + { + bucket.max_pos_end = end_with_clip; + global_max_pos_end = std::max(global_max_pos_end, end_with_clip); + } + + bucket.global_max_pos_end = global_max_pos_end; + bucket.reads.push_back(std::move(read)); + + hts_rec = hts_reader.get_next_read(hts_rec); + + if (!hts_rec) + break; + } + + // realign +} + + +void +parallel_first_pass(std::vector * hts_paths_ptr, + std::map > * pool_snp_haplotypes_ptr, + std::vector > * file_buckets_first_pass_ptr, + std::vector * sample_names_ptr, + std::vector * reference_sequence_ptr, + long const BUCKET_SIZE, + long const region_begin, + long const lowest_file_i) +{ + assert(hts_paths_ptr); + assert(pool_snp_haplotypes_ptr); + + std::vector const & hts_paths = *hts_paths_ptr; + std::map > & pool_snp_haplotypes = *pool_snp_haplotypes_ptr; + std::vector const & reference_sequence = *reference_sequence_ptr; + + for (long i{0}; i < static_cast(hts_paths.size()); ++i) + { + // BAM/CRAM record storage + HtsStore store; + + // Open BAM/CRAM file + HtsReader hts_reader(store); + hts_reader.open(hts_paths[i], std::string("."), ""); + + // Get the first read + bam1_t * hts_rec = hts_reader.get_next_read(); + + // Check if there were any reads + if (!hts_rec) + { + hts_reader.close(); + continue; + } + + run_first_pass(hts_rec, + hts_reader, + lowest_file_i + i, + (*file_buckets_first_pass_ptr)[lowest_file_i + i], + pool_snp_haplotypes, + BUCKET_SIZE, + region_begin, + reference_sequence); + + // Add sample + if (hts_reader.samples.size() > 1) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " We found file with multiple 
samples, sorry, " + << "this is currently not supported."; + std::exit(1); + } + + assert(hts_reader.samples.size() == 1); + (*sample_names_ptr)[lowest_file_i + i] = hts_reader.samples[0]; + + // Close BAM/CRAM file + hts_reader.close(); + } // for (long file_i{0}; file_i < static_cast(hts_paths.size()); ++file_i) +} + + +void +parallel_second_pass(std::string const * hts_path_ptr, + std::vector const * reference_sequence_ptr, + Tindel_events * indel_events_ptr, + std::vector * indel_to_realign_ptr, + long const BUCKET_SIZE, + long const NUM_BUCKETS, + long const region_begin, + long const file_i) +{ + if (indel_to_realign_ptr->size() == 0) + return; + + std::string const & hts_path = *hts_path_ptr; + std::vector const & reference_sequence = *reference_sequence_ptr; + Tindel_events const & indel_events = *indel_events_ptr; + std::vector & indel_to_realign = *indel_to_realign_ptr; + + std::vector buckets; + buckets.resize(NUM_BUCKETS); + + // I/O + { + // BAM/CRAM record storage + HtsStore store; + + // Open BAM/CRAM file + HtsReader hts_reader(store); + hts_reader.open(hts_path, std::string("."), ""); + + // Get the first read + bam1_t * hts_rec = hts_reader.get_next_read(); + + // Check if there were any reads + if (!hts_rec) + { + hts_reader.close(); + return; + } + + read_hts_and_return_realignment_indels(hts_rec, + hts_reader, + buckets, + BUCKET_SIZE, + region_begin, + reference_sequence); + + // Close BAM/CRAM file + hts_reader.close(); + } // I/O ends + + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " I/O DONE."; + std::vector nearby_good_events; + long constexpr NEARBY_BP = 20; + + for (Tindel_events::iterator indel : indel_to_realign) + { + Tindel_events::iterator indel_it = indel; + + // Check before the indel + while (indel_it != indel_events.begin()) + { + --indel_it; + + if (indel_it->second.has_good_support && indel_it->first.pos + NEARBY_BP >= indel->first.pos) + { + // It also needs to be found in the file + if (is_indel_in_bucket(buckets, indel_it->first, region_begin, BUCKET_SIZE)) + nearby_good_events.push_back(indel_it); + } + } + + // Check after the indel + indel_it = std::next(indel); + + while (indel_it != indel_events.end()) + { + if (indel_it->second.has_good_support && indel_it->first.pos - NEARBY_BP <= indel->first.pos) + { + // It also needs to be found in the file + if (is_indel_in_bucket(buckets, indel_it->first, region_begin, BUCKET_SIZE)) + nearby_good_events.push_back(indel_it); + } + + ++indel_it; + } + } + + std::sort(nearby_good_events.begin(), nearby_good_events.end(), + [](Tindel_events::iterator const & a, + Tindel_events::iterator const & b) -> bool + { + return a->first.pos < b->first.pos; + }); + + auto end_unique_it = std::unique(nearby_good_events.begin(), nearby_good_events.end()); + std::move(nearby_good_events.begin(), end_unique_it, std::back_inserter(indel_to_realign)); + + std::sort(indel_to_realign.begin(), indel_to_realign.end(), + [](Tindel_events::iterator const & a, + Tindel_events::iterator const & b) -> bool + { + //return a->second.max_log_qual > b->second.max_log_qual || + // (a->second.max_log_qual == b->second.max_log_qual && a->first.pos < b->first.pos); + return a->second.has_good_support > b->second.has_good_support || + (a->second.has_good_support == b->second.has_good_support && a->first.pos < b->first.pos); + }); + + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Number of realignment indels,file_i=" + << indel_to_realign.size() << "," << file_i; + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Nearby realignment indels=" + << 
nearby_good_events.size(); + + realign_to_indels(indel_to_realign, //realignment_indels, + buckets, + BUCKET_SIZE, + region_begin, + reference_sequence); + + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Realignment DONE."; +} + + +void +streamlined_discovery(std::vector const & hts_paths, + std::string const & ref_path, + std::string const & region_str, + std::string const & output_dir, + gyper::Vcf & vcf) +{ + BOOST_LOG_TRIVIAL(info) << "Start WIP on region " << region_str; + + long const NUM_FILES = hts_paths.size(); + GenomicRegion genomic_region(region_str); + long const region_begin{genomic_region.begin}; + std::vector reference_sequence; + open_and_read_reference_genome(reference_sequence, ref_path, genomic_region); + long constexpr BUCKET_SIZE{50}; // TODO make an option + std::vector sample_names; + sample_names.resize(NUM_FILES); + + std::vector > > spl_hts_paths{}; + std::vector > > > spl_snp_hq_haplotypes{}; + long jobs{1}; + + { + long num_parts{1}; + _determine_num_jobs_and_num_parts(jobs, num_parts, NUM_FILES); + + { + auto it = hts_paths.cbegin(); + spl_hts_paths.reserve(num_parts); + spl_snp_hq_haplotypes.reserve(num_parts); + + for (long i = 0; i < num_parts; ++i) + { + auto end_it = it + NUM_FILES / num_parts + (i < (NUM_FILES % num_parts)); + assert(std::distance(hts_paths.cbegin(), end_it) <= NUM_FILES); + spl_hts_paths.emplace_back(new std::vector(it, end_it)); + spl_snp_hq_haplotypes.emplace_back(new std::map >()); + it = end_it; + } + } + } + + // New pipeline which splits into two passes: (1) discover potential indels (2) discover SNPs+genotypes all variants + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " First pass starting."; + long const NUM_POOLS{static_cast(spl_hts_paths.size())}; + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Number of pools = " << NUM_POOLS; + Tindel_events indel_events; + long NUM_BUCKETS{0}; + + // FIRST PASS + { + std::vector > file_buckets_first_pass(NUM_FILES); + + // Parallel first pass + { + paw::Station first_station(jobs); + long file_i{0}; + + for (long i{0}; i < NUM_POOLS - 1; ++i) + { + first_station.add_work(parallel_first_pass, + &(*spl_hts_paths[i]), + &(*spl_snp_hq_haplotypes[i]), + &file_buckets_first_pass, + &sample_names, + &reference_sequence, + BUCKET_SIZE, + region_begin, + file_i); + + file_i += spl_hts_paths[i]->size(); + } + + // Do the last pool on the current thread + first_station.add_to_thread(jobs - 1, + parallel_first_pass, + &(*spl_hts_paths[NUM_POOLS - 1]), + &(*spl_snp_hq_haplotypes[NUM_POOLS - 1]), + &file_buckets_first_pass, + &sample_names, + &reference_sequence, + BUCKET_SIZE, + region_begin, + file_i); + + first_station.join(); + } + + // Find SNPs and their SNP haplotypes + std::map > snp_haplotypes = std::move(*spl_snp_hq_haplotypes[0]); + + for (long i{1}; i < NUM_POOLS; ++i) + { + std::map > const & pool_snp_haplotypes = *spl_snp_hq_haplotypes[i]; + + for (auto it = pool_snp_haplotypes.begin(); it != pool_snp_haplotypes.end(); ++it) + { + auto insert_it = snp_haplotypes.insert(*it); + + if (!insert_it.second) + { + std::copy(it->second.begin(), + it->second.end(), + std::inserter(insert_it.first->second, insert_it.first->second.begin())); + } + } + } + + for (auto const & snp_haplotype : snp_haplotypes) + { + BOOST_LOG_TRIVIAL(info) << __HERE__ << " SNP=" << snp_haplotype.first.to_string(); + + for (auto const & hap : snp_haplotype.second) + { + BOOST_LOG_TRIVIAL(info) << __HERE__ << " phased with=" << hap.to_string(); + } + } + + // Create buckets + for (auto && buckets_first : file_buckets_first_pass) + { + 
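// Merge each file's per-bucket indel events into the global indel_events map:
// events not seen before are inserted as-is, while for events that are already
// present only the highest max_log_qual (and the index of the file it came
// from) is kept, so the second pass can realign reads from the single
// best-supporting file.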
if (static_cast(buckets_first.size()) > NUM_BUCKETS) + NUM_BUCKETS = buckets_first.size(); + + for (long b{0}; b < static_cast(buckets_first.size()); ++b) + { + auto && bucket_first = buckets_first[b]; + + for (auto && indel_event : bucket_first.indel_events) + { + auto find_it = indel_events.find(indel_event.first); + + if (find_it == indel_events.end()) + { + // Not found - Insert it + indel_events.insert(std::move(indel_event)); + } + else + { + // Found - Check if support is better + if (indel_event.second.max_log_qual > find_it->second.max_log_qual) + { + find_it->second.max_log_qual = indel_event.second.max_log_qual; + find_it->second.max_log_qual_file_i = indel_event.second.max_log_qual_file_i; + } + } + } + } + } + } + + // FIRST PASS ENDS + BOOST_LOG_TRIVIAL(info) << __HERE__ << " First pass DONE. Starting second pass."; + + + // SECOND PASS BEGINS + { + std::vector > indel_to_realign; + indel_to_realign.resize(NUM_FILES); + + // Add indels to events + for (auto it = indel_events.begin(); it != indel_events.end(); ++it) + { + it->second.count = 0; + assert(it->second.anti_count == 0); + assert(it->second.multi_count == 0); + assert(it->second.has_realignment_support); + + if (!it->second.has_good_support) + { + BOOST_LOG_TRIVIAL(info) << __HERE__ << " realignment indel=" << it->first.to_string() << " qual,file_i=" + << it->second.max_log_qual << " " << it->second.max_log_qual_file_i; + + assert(it->second.max_log_qual_file_i < static_cast(indel_to_realign.size())); + indel_to_realign[it->second.max_log_qual_file_i].push_back(it); + } + } + + { + paw::Station second_station(jobs); + + for (long file_i{0}; file_i < (NUM_FILES - 1); ++file_i) + { + if (indel_to_realign[file_i].size() > 0) + { + second_station.add_work(parallel_second_pass, + &(hts_paths[file_i]), + &reference_sequence, + &indel_events, + &(indel_to_realign[file_i]), + BUCKET_SIZE, + NUM_BUCKETS, + region_begin, + file_i); + } + } + + // Last file is run on current thread + long const file_i{(NUM_FILES - 1)}; + + if (indel_to_realign[file_i].size() > 0) + { + second_station.add_to_thread(jobs - 1, + parallel_second_pass, + &(hts_paths[file_i]), + &reference_sequence, + &indel_events, + &(indel_to_realign[file_i]), + BUCKET_SIZE, + NUM_BUCKETS, + region_begin, + file_i); + } + + second_station.join(); + } + + vcf.open(WRITE_BGZF_MODE, output_dir + "/indel_sites.vcf.gz"); + uint64_t const chromosome_offset = genomic_region.get_absolute_position(1); + + for (auto it = indel_events.begin(); it != indel_events.end();) // No increment + { + if (it->second.has_good_support) + { + IndelEvent const & indel_event = it->first; + + Variant variant{}; + uint32_t const abs_pos = indel_event.pos + chromosome_offset; + + if (indel_event.type == 'D') + { + variant.abs_pos = abs_pos; + variant.seqs.push_back({indel_event.sequence}); + variant.seqs.push_back({}); + variant.add_base_in_front(true); + variant.type = 'D'; + } + else if (indel_event.type == 'I') + { + variant.abs_pos = abs_pos; + variant.seqs.push_back({}); + variant.seqs.push_back({indel_event.sequence}); + variant.add_base_in_front(true); + variant.type = 'I'; + } + + vcf.variants.push_back(std::move(variant)); + + // clean indel info + it->second.clean_counts(); + ++it; + } + else + { + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Removing indel=" << it->first.to_string(); + it = indel_events.erase(it); + } + } + +#ifndef NDEBUG + vcf.write(); +#endif // NDEBUG + } + + BOOST_LOG_TRIVIAL(info) << __HERE__ << " WIP done"; +} + + } // namespace gyper diff --git 
a/src/typer/event.cpp b/src/typer/event.cpp new file mode 100644 index 0000000..6a80902 --- /dev/null +++ b/src/typer/event.cpp @@ -0,0 +1,410 @@ +#include +#include + +#include + +#include +#include +#include + + +namespace gyper +{ + +long +base2index(char const base) +{ + switch (base) + { + case 'A': + { + return 0; + } + + case 'C': + { + return 1; + } + + case 'G': + { + return 2; + } + + case 'T': + { + return 3; + } + + case 'N': + { + return -1; + } + + case '*': + { + return -2; + } + + default: + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Unexpected base '" << base << "'"; + std::exit(1); + } + } // switch ends +} + + +long +BaseCount::get_depth_without_deleted() const +{ + return acgt[0] + acgt[1] + acgt[2] + acgt[3]; +} + + +long +BaseCount::get_depth_with_deleted() const +{ + return acgt[0] + acgt[1] + acgt[2] + acgt[3] + deleted; +} + + +std::string +BaseCount::to_string() const +{ + std::ostringstream ss; + ss << acgt[0] << "," << acgt[1] << "," << acgt[2] << "," << acgt[3] << ":" + << acgt_qualsum[0] << "," << acgt_qualsum[1] << "," << acgt_qualsum[2] + << "," << acgt_qualsum[3] << " " << deleted; + return ss.str(); +} + + +void +BaseCount::add_base(char seq, char qual) +{ + long i = base2index(seq); + + if (i < 0) + { + if (i == -1) + { + ++unknown; + return; + } + else + { + assert(i == -2); + // -2 is deleted base + ++deleted; + return; + } + } + + assert(i < 4); + ++(acgt[i]); + acgt_qualsum[i] += static_cast(qual); +} + + +bool +IndelEvent::operator==(IndelEvent const & b) const +{ + return (pos == b.pos) && + (type == b.type) && + (sequence == b.sequence); +} + + +bool +IndelEvent::operator!=(IndelEvent const & b) const +{ + return !(*this == b); +} + + +bool +IndelEvent::operator<(IndelEvent const & b) const +{ + return pos < b.pos || (pos == b.pos && type < b.type) || (pos == b.pos && type == b.type && sequence < b.sequence); +} + + +bool +SnpEvent::operator==(SnpEvent const & b) const +{ + return (pos == b.pos) && (base == b.base); +} + + +bool +SnpEvent::operator!=(SnpEvent const & b) const +{ + return !(*this == b); +} + + +bool +SnpEvent::operator<(SnpEvent const & b) const +{ + return pos < b.pos || (pos == b.pos && base < b.base); +} + + +double +SnpEventInfo::corrected_support() const +{ + return static_cast(hq_count) + static_cast(lq_count) / 2.0; +} + + +bool +SnpEventInfo::has_good_support(long const cov) const +{ + int const _depth = hq_count + lq_count; + bool const is_promising = uniq_pos3 != -1 && hq_count >= 4 && proper_pairs >= 3; + gyper::Options const & copts = *(Options::const_instance()); + + return (copts.no_filter_on_begin_pos || uniq_pos2 != -1) + && + (!copts.filter_on_proper_pairs || (proper_pairs >= 2)) + && + (hq_count >= 3) + && + (!copts.filter_on_read_bias || + is_promising || + (first_in_pairs > 0 && first_in_pairs < _depth)) + && + (!copts.filter_on_strand_bias || + (is_promising && sequence_reversed > 0 && sequence_reversed < _depth) || + (sequence_reversed > 1 && sequence_reversed < (_depth - 1))) + && + ((clipped + 3) <= _depth) + && + ((is_promising && hq_count >= 7) || max_distance > 10) + && + (corrected_support() >= 3.9) + && + (cov <= 0 || (static_cast(_depth) / static_cast(cov) > 0.26)); +} + + +std::string +SnpEventInfo::to_string() const +{ + std::ostringstream ss; + + ss << "hq=" << hq_count + << ",lq=" << lq_count + << ",pp=" << proper_pairs + << ",first=" << first_in_pairs + << ",reversed=" << sequence_reversed + << ",clipped=" << clipped + << ",max_mapq=" << static_cast(max_mapq) + << ",max_distance=" << 
static_cast(max_distance) + << ",uniq_pos1=" << uniq_pos1 + << ",uniq_pos2=" << uniq_pos2 + << ",uniq_pos3=" << uniq_pos3 + << ",phase_num=" << phase.size(); + + return ss.str(); +} + + +std::vector +get_phred_biallelic(uint32_t count, uint32_t anti_count, uint32_t eps) +{ + std::vector phred(3); + + uint64_t gt00 = count * eps; + uint64_t gt01 = count + anti_count; + uint64_t gt11 = anti_count * eps; + uint64_t const min_gt = std::min({gt00, gt01, gt11}); + gt00 = (gt00 - min_gt) * 3ul; + gt01 = (gt01 - min_gt) * 3ul; + gt11 = (gt11 - min_gt) * 3ul; + + uint64_t constexpr MAX = std::numeric_limits::max(); + phred[0] = std::min(gt00, MAX); + phred[1] = std::min(gt01, MAX); + phred[2] = std::min(gt11, MAX); + return phred; +} + + +uint32_t +get_log_qual(uint32_t count, uint32_t anti_count, uint32_t eps) +{ + uint32_t const gt00 = count * eps; + uint32_t const gt01 = count + anti_count; + uint32_t const gt11 = anti_count * eps; + uint32_t const gt_alt = std::min(gt01, gt11); + + if (gt00 > gt_alt) + return gt00 - gt_alt; + else + return 0u; +} + + +uint32_t +get_log_qual_double(double count, double anti_count, double eps) +{ + double const gt00 = count * eps; + double const gt01 = count + anti_count; + double const gt11 = anti_count * eps; + double const gt_alt = std::min(gt01, gt11); + + if (gt00 > gt_alt) + return static_cast(gt00 - gt_alt + 0.5); + else + return 0u; +} + + +uint32_t +EventInfo::log_qual(uint32_t eps) const +{ + return get_log_qual(count, anti_count, eps); +} + + +void +EventInfo::clean_counts() +{ + count = 0; + anti_count = 0; + multi_count = 0; +} + + +bool +apply_indel_event(std::vector & sequence, + std::vector & ref_positions, + IndelEvent const & indel_event, + long const offset, + bool const is_debug) +{ + assert(sequence.size() == ref_positions.size()); + long const ref_pos{indel_event.pos - offset}; + + if (ref_pos <= 0) + return false; + + long pos{ref_pos}; // Start the search for the reference position here + long const event_size{static_cast(indel_event.sequence.size())}; + long const seq_size{static_cast(sequence.size())}; + assert(pos < seq_size); + + if (ref_positions[pos] != ref_pos) + { + while ((pos + 1) < seq_size && ref_positions[pos] < ref_pos) + ++pos; + + while (pos > 0 && ref_positions[pos] > ref_pos) + --pos; + + // Check again + if (ref_positions[pos] != ref_pos) + { + if (is_debug) + BOOST_LOG_TRIVIAL(warning) << __HERE__ << " Couldn't find pos for event " << indel_event.to_string(); + + return false; + } + } + + // Check purity + { + long constexpr PURITY_PAD{3}; + long const begin = std::max(0l, pos - PURITY_PAD); + long const end = std::min(static_cast(ref_positions.size()), pos + PURITY_PAD); + + assert(begin >= 0); + assert(begin <= end); + assert(end <= static_cast(ref_positions.size())); + + long prev_ref_pos = ref_positions[begin]; + + for (long p{begin + 1}; p < end; ++p) + { + if (ref_positions[p] == prev_ref_pos + 1) + { + ++prev_ref_pos; + } + else + { + if (is_debug) + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Not pure " << indel_event.to_string(); + + return false; // Not pure, skip indel + } + } + } + + if (indel_event.type == 'D') + { + if ((pos + event_size) >= static_cast(ref_positions.size()) || + ref_positions[pos + event_size] != (ref_pos + event_size)) + { + if (is_debug) + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Cannot apply deletion"; + + return false; + } + + std::vector::iterator end_it = sequence.end(); + std::vector::iterator end_pos_it = ref_positions.end(); + + if ((pos + event_size) < seq_size) + { + end_it = 
sequence.begin() + pos + event_size; + end_pos_it = ref_positions.begin() + pos + event_size; + } + + sequence.erase(sequence.begin() + pos, end_it); + ref_positions.erase(ref_positions.begin() + pos, end_pos_it); + } + else if (indel_event.type == 'I') + { + sequence.insert(sequence.begin() + pos, indel_event.sequence.begin(), indel_event.sequence.end()); + + std::vector new_pos(event_size, pos + 1); + ref_positions.insert(ref_positions.begin() + pos + 1, new_pos.begin(), new_pos.end()); + } + else + { + BOOST_LOG_TRIVIAL(error) << "Unknown type: " << indel_event.type; + return false; + } + + assert(sequence.size() == ref_positions.size()); + return true; +} + + +IndelEvent +make_deletion_event(std::vector const & reference_sequence, long ref_offset, int32_t pos, long count) +{ + IndelEvent new_event(pos, 'D'); + + assert(ref_offset >= 0); + assert(ref_offset + count < static_cast(reference_sequence.size())); + + new_event.sequence = std::vector(reference_sequence.begin() + ref_offset, + reference_sequence.begin() + ref_offset + count); + + return new_event; +} + + +IndelEvent +make_insertion_event(int32_t pos, std::vector && event_sequence) +{ + IndelEvent new_event(pos, 'I'); + new_event.sequence = std::move(event_sequence); + return new_event; +} + + +} // namespace gyper diff --git a/src/typer/genotype_paths.cpp b/src/typer/genotype_paths.cpp index f434555..d882134 100644 --- a/src/typer/genotype_paths.cpp +++ b/src/typer/genotype_paths.cpp @@ -88,6 +88,8 @@ GenotypePaths::GenotypePaths() , flags(0) , longest_path_length(0) , original_pos(0) + , score_diff(0) +// , original_clipped_bp(0) , mapq(255) , ml_insert_size(INSERT_SIZE_WHEN_NOT_PROPER_PAIR) { @@ -108,6 +110,8 @@ GenotypePaths::GenotypePaths(GenotypePaths const & b) , flags(b.flags) , longest_path_length(b.longest_path_length) , original_pos(b.original_pos) + , score_diff(b.score_diff) +// , original_clipped_bp(b.original_clipped_bp) , mapq(b.mapq) , ml_insert_size(b.ml_insert_size) { @@ -130,6 +134,8 @@ GenotypePaths::GenotypePaths(GenotypePaths && b) , flags(b.flags) , longest_path_length(b.longest_path_length) , original_pos(b.original_pos) + , score_diff(b.score_diff) +// , original_clipped_bp(b.original_clipped_bp) , mapq(b.mapq) , ml_insert_size(b.ml_insert_size) { @@ -152,6 +158,8 @@ GenotypePaths::operator=(GenotypePaths const & b) flags = b.flags; longest_path_length = b.longest_path_length; original_pos = b.original_pos; + score_diff = b.score_diff; +// original_clipped_bp = b.original_clipped_bp; mapq = b.mapq; ml_insert_size = b.ml_insert_size; @@ -185,6 +193,8 @@ GenotypePaths::operator=(GenotypePaths && b) flags = b.flags; longest_path_length = b.longest_path_length; original_pos = b.original_pos; + score_diff = b.score_diff; +// original_clipped_bp = b.original_clipped_bp; mapq = b.mapq; ml_insert_size = b.ml_insert_size; @@ -207,6 +217,8 @@ GenotypePaths::GenotypePaths(int16_t _flags, std::size_t _read_length) , flags(_flags) , longest_path_length(0) , original_pos(0) + , score_diff(0) +// , original_clipped_bp(0) , mapq(255) , ml_insert_size(INSERT_SIZE_WHEN_NOT_PROPER_PAIR) { @@ -700,13 +712,13 @@ GenotypePaths::find_new_variants() const assert(reference.size() == read2.size()); { - int matches = -1; // -1 means we have not found a mismatch yet + int matches{-1}; // -1 means we have not found a mismatch yet std::vector ref; std::vector alt; - int64_t const MIN_VAR_THRESHOLD = 5; - uint32_t var_pos = 0; + int64_t constexpr MIN_VAR_THRESHOLD{5}; + uint32_t var_pos{0}; - for (unsigned i = 0; i < 
reference.size(); ++i) + for (unsigned i{0}; i < reference.size(); ++i) { assert(i < read2.size()); assert(i < qual2.size()); diff --git a/src/typer/read.cpp b/src/typer/read.cpp new file mode 100644 index 0000000..c119e91 --- /dev/null +++ b/src/typer/read.cpp @@ -0,0 +1,103 @@ +#include +#include +#include + +#include + +#include + +#include +#include + + +namespace gyper +{ + +bool +Alignment::has_indel_event(Tindel_events::iterator indel_event) const +{ + for (auto const & e : indel_events) + { + if (e.event_it == indel_event) + return e.read_pos != READ_ANTI_SUPPORT; + } + + return false; +} + + +bool +Alignment::is_clipped() const +{ + return num_clipped_begin != 0 || num_clipped_end != 0; +} + + +void +Alignment::add_indel_event(int32_t pos, Tindel_events::iterator indel_it) +{ + if (pos == READ_ANTI_SUPPORT) + ++indel_it->second.anti_count; + else if (pos == READ_MULTI_SUPPORT) + ++indel_it->second.multi_count; + else + ++indel_it->second.count; + + indel_events.push_back({pos, indel_it}); +} + + +void +Alignment::replace_indel_events(std::vector && new_indel_events) +{ + // Lower count on old events + for (auto & event_and_info : indel_events) + { + if (event_and_info.read_pos == READ_ANTI_SUPPORT) + --event_and_info.event_it->second.anti_count; + else if (event_and_info.read_pos == READ_MULTI_SUPPORT) + --event_and_info.event_it->second.multi_count; + else + --event_and_info.event_it->second.count; + } + + // Increase count on new events + for (auto & event_and_info : new_indel_events) + { + if (event_and_info.read_pos == READ_ANTI_SUPPORT) + ++event_and_info.event_it->second.anti_count; + else if (event_and_info.read_pos == READ_MULTI_SUPPORT) + ++event_and_info.event_it->second.multi_count; + else + ++event_and_info.event_it->second.count; + } + + indel_events = std::move(new_indel_events); +} + + +std::string +Read::to_string() const +{ + std::ostringstream ss; + ss << name << " " << alignment.pos << " " << flags << " " << static_cast(mapq) << "\n"; + ss << std::string(sequence.begin(), sequence.end()); + + //for (auto const & q : qual) + // ss << static_cast(q + 33); + + { + ss << "\nAlignment: score=" << alignment.score << " num_clipped_begin,_end=" << alignment.num_clipped_begin + << "," << alignment.num_clipped_end << " num_ins_begin=" << alignment.num_ins_begin; + + for (auto const & support_event : alignment.indel_events) + ss << " pos=" << static_cast(support_event.read_pos) << " " << &(*(support_event.event_it)); + + ss << " "; + } + + return ss.str(); +} + + +} // namespace gyper diff --git a/src/typer/sample_call.cpp b/src/typer/sample_call.cpp index 80b78e9..6cffe45 100644 --- a/src/typer/sample_call.cpp +++ b/src/typer/sample_call.cpp @@ -77,8 +77,7 @@ SampleCall::SampleCall(std::vector && _phred, alt_total_depth = static_cast( std::min(static_cast(0xFFFFu), - static_cast(alt_depth) - ) + static_cast(alt_depth)) ); assert(alt_total_depth >= ambiguous_depth); @@ -100,6 +99,13 @@ SampleCall::get_unique_depth() const } +uint32_t +SampleCall::get_alt_depth() const +{ + return std::accumulate(coverage.begin() + 1, coverage.end(), static_cast(ambiguous_depth)); +} + + std::pair SampleCall::get_gt_call() const { @@ -148,6 +154,34 @@ SampleCall::get_gq() const } +uint8_t +SampleCall::get_lowest_phred_not_with(uint16_t allele) const +{ + long i{0}; + uint8_t min_phred{255}; + + for (long y{0}; y < static_cast(coverage.size()); ++y) + { + if (y == allele) + { + i += y + 1; + continue; + } + + for (long x = 0; x <= y; ++x, ++i) + { + if (x == allele) + continue; + + if 
(phred[i] < min_phred) + min_phred = phred[i]; + } + } + + return min_phred; +} + + int8_t SampleCall::check_filter(long gq) const { diff --git a/src/typer/var_stats.cpp b/src/typer/var_stats.cpp index 1d90b87..a2b6ed7 100644 --- a/src/typer/var_stats.cpp +++ b/src/typer/var_stats.cpp @@ -9,6 +9,9 @@ #include // std::vector #include // boost::split +#include +#include +#include #include // gyper::absolute_pos #include // gyper::VarStats @@ -19,14 +22,11 @@ namespace gyper { -VarStats::VarStats(uint16_t allele_count) noexcept +VarStats::VarStats(std::size_t const allele_count) noexcept // : realignment_distance(allele_count) // , realignment_count(allele_count) - : read_strand(allele_count) -// , r1_strand_forward(allele_count) -// , r1_strand_reverse(allele_count) -// , r2_strand_forward(allele_count) -// , r2_strand_reverse(allele_count) + : per_allele(allele_count) + , read_strand(allele_count) {} @@ -53,122 +53,126 @@ VarStats::add_realignment_distance(uint8_t const allele_id, * CLASS INFORMATION */ -//std::string -//VarStats::get_realignment_count() const -//{ -// return join_strand_bias(realignment_count); // TODO: Rename join_strand_bias() -//} -// -// -//std::string -//VarStats::get_realignment_distance() const -//{ -// return join_strand_bias(realignment_distance); // TODO: Rename join_strand_bias() -//} - - -std::string -VarStats::get_forward_strand_bias() const +void +VarStats::write_stats(std::map & infos) const { - if (read_strand.size() == 0) - return "."; + assert(per_allele.size() == read_strand.size()); + long const num_allele = per_allele.size(); - std::stringstream ss; - ss << (read_strand[0].r1_forward + read_strand[0].r2_forward); - - for (long i = 1; i < static_cast(read_strand.size()); ++i) - ss << ',' << (read_strand[i].r1_forward + read_strand[i].r2_forward); + if (num_allele <= 1) + return; - return ss.str(); + infos["CR"] = std::to_string(clipped_reads); + infos["MQsquared"] = std::to_string(mapq_squared); + write_read_strand_stats(infos); + write_per_allele_stats(infos); } -std::string -VarStats::get_reverse_strand_bias() const +void +VarStats::write_per_allele_stats(std::map & infos) const { - if (read_strand.size() == 0) - return "."; - - std::stringstream ss; - ss << (read_strand[0].r1_reverse + read_strand[0].r2_reverse); - - for (long i = 1; i < static_cast(read_strand.size()); ++i) - ss << ',' << (read_strand[i].r1_reverse + read_strand[i].r2_reverse); - - return ss.str(); -} + long const num_allele = per_allele.size(); + assert(num_allele > 1); + std::ostringstream cr_ss; + std::ostringstream mq_ss; + std::ostringstream sd_ss; + std::ostringstream mm_ss; -std::string -VarStats::get_r1_forward_strand_bias() const -{ - if (read_strand.size() == 0) - return "."; - - std::stringstream ss; - ss << read_strand[0].r1_forward; + { + VarStatsPerAllele const & stats_per_al = per_allele[0]; + cr_ss << stats_per_al.clipped_bp; + mq_ss << stats_per_al.mapq_squared; + sd_ss << stats_per_al.score_diff; + mm_ss << stats_per_al.mismatches; + } - for (long i = 1; i < static_cast(read_strand.size()); ++i) - ss << ',' << read_strand[i].r1_forward; + for (long i{1}; i < num_allele; ++i) + { + VarStatsPerAllele const & stats_per_al = per_allele[i]; + cr_ss << ',' << stats_per_al.clipped_bp; + mq_ss << ',' << stats_per_al.mapq_squared; + sd_ss << ',' << stats_per_al.score_diff; + mm_ss << ',' << stats_per_al.mismatches; + } - return ss.str(); + infos["CRal"] = cr_ss.str(); + infos["MQSal"] = mq_ss.str(); + infos["SDal"] = sd_ss.str(); + infos["MMal"] = mm_ss.str(); } 
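// The per-allele INFO fields written above (CRal, MQSal, SDal, MMal) and the
// read-strand fields written below (SBF, SBR, SBF1, ...) share one layout: a
// single value per allele, comma-joined into the INFO string. A minimal,
// self-contained sketch of that layout follows; join_per_allele is a
// hypothetical helper used only for illustration and is not part of this patch.

#include <sstream>
#include <string>
#include <vector>

// Illustrative only: comma-join one value per allele, no trailing separator.
template <typename T>
std::string
join_per_allele(std::vector<T> const & values)
{
  std::ostringstream ss;

  for (std::size_t i = 0; i < values.size(); ++i)
  {
    if (i > 0)
      ss << ',';

    ss << values[i];
  }

  return ss.str();
}

// Usage sketch (field name and values are illustrative):
//   infos["CRal"] = join_per_allele(std::vector<long>{12, 3, 0});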
-std::string -VarStats::get_r2_forward_strand_bias() const +void +VarStats::write_read_strand_stats(std::map & infos) const { - if (read_strand.size() == 0) - return "."; - - std::stringstream ss; - ss << read_strand[0].r2_forward; - - for (long i = 1; i < static_cast(read_strand.size()); ++i) - ss << ',' << read_strand[i].r2_forward; - - return ss.str(); -} + long const num_allele = read_strand.size(); + assert(num_allele > 1); + std::ostringstream f_ss; + std::ostringstream r_ss; + std::ostringstream f1_ss; + std::ostringstream f2_ss; + std::ostringstream r1_ss; + std::ostringstream r2_ss; -std::string -VarStats::get_r1_reverse_strand_bias() const -{ - if (read_strand.size() == 0) - return "."; - - std::stringstream ss; - ss << read_strand[0].r1_reverse; + { + auto const & stats = read_strand[0]; + f_ss << (stats.r1_forward + stats.r2_forward); + r_ss << (stats.r1_reverse + stats.r2_reverse); + f1_ss << stats.r1_forward; + f2_ss << stats.r2_forward; + r1_ss << stats.r1_reverse; + r2_ss << stats.r2_reverse; + } - for (long i = 1; i < static_cast(read_strand.size()); ++i) - ss << ',' << read_strand[i].r1_reverse; + for (long i{1}; i < num_allele; ++i) + { + auto const & stats = read_strand[i]; + f_ss << ',' << (stats.r1_forward + stats.r2_forward); + r_ss << ',' << (stats.r1_reverse + stats.r2_reverse); + f1_ss << ',' << stats.r1_forward; + f2_ss << ',' << stats.r2_forward; + r1_ss << ',' << stats.r1_reverse; + r2_ss << ',' << stats.r2_reverse; + } - return ss.str(); + infos["SBF"] = f_ss.str(); + infos["SBR"] = r_ss.str(); + infos["SBF1"] = f1_ss.str(); + infos["SBF2"] = f2_ss.str(); + infos["SBR1"] = r1_ss.str(); + infos["SBR2"] = r2_ss.str(); } -std::string -VarStats::get_r2_reverse_strand_bias() const +void +VarStats::add_stats(VarStats const & stats) { - if (read_strand.size() == 0) - return "."; - - std::stringstream ss; - ss << read_strand[0].r2_reverse; + assert(per_allele.size() == stats.per_allele.size()); + assert(read_strand.size() == stats.read_strand.size()); - for (long i = 1; i < static_cast(read_strand.size()); ++i) - ss << ',' << read_strand[i].r2_reverse; + clipped_reads += stats.clipped_reads; + mapq_squared += stats.mapq_squared; - return ss.str(); -} - - -void -VarStats::add_mapq(uint8_t const new_mapq) -{ - // Ignore MapQ == 255, that means MapQ is unavailable - if (new_mapq < 255u) - mapq_squared += static_cast(new_mapq) * static_cast(new_mapq); + for (long i{0}; i < static_cast(per_allele.size()); ++i) + { + auto & new_a = per_allele[i]; + auto & old_a = stats.per_allele[i]; + auto & new_rs = read_strand[i]; + auto & old_rs = stats.read_strand[i]; + + new_a.clipped_bp += old_a.clipped_bp; + new_a.mapq_squared += old_a.mapq_squared; + new_a.score_diff += old_a.score_diff; + new_a.mismatches += old_a.mismatches; + + new_rs.r1_forward += old_rs.r1_forward; + new_rs.r1_reverse += old_rs.r1_reverse; + new_rs.r2_forward += old_rs.r2_forward; + new_rs.r2_reverse += old_rs.r2_reverse; + } } @@ -183,7 +187,7 @@ join_strand_bias(std::vector const & bias) std::stringstream ss; ss << bias[0]; - for (std::size_t i = 1; i < bias.size(); ++i) + for (long i = 1; i < static_cast(bias.size()); ++i) ss << ',' << bias[i]; return ss.str(); @@ -296,4 +300,21 @@ get_list_of_uncalled_alleles(std::string const & ac) } +template +void +VarStats::serialize(Archive & ar, unsigned const int /*version*/) +{ + ar & per_allele; + ar & read_strand; + ar & clipped_reads; + ar & mapq_squared; +} + + +template void VarStats::serialize(boost::archive::binary_iarchive &, + const unsigned int); 
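// Explicit instantiations let the serialize() template definition stay in this
// .cpp file: the binary input archive is instantiated above, and the matching
// output archive instantiation follows below.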
+template void VarStats::serialize(boost::archive::binary_oarchive &, + const unsigned int); + + } // namespace gyper diff --git a/src/typer/variant.cpp b/src/typer/variant.cpp index f0c3d59..01b01e9 100644 --- a/src/typer/variant.cpp +++ b/src/typer/variant.cpp @@ -4,6 +4,8 @@ #include // std::stringstream #include // std::vector +#include +#include #include #include // boost::hash_range #include @@ -30,82 +32,53 @@ namespace { void -update_strand_bias(std::size_t const num_seqs, - std::size_t new_num_seqs, - std::vector const & old_phred_to_new_phred, - gyper::Variant const & var, - gyper::Variant & new_var) +update_per_allele_stats(std::size_t const num_seqs, + std::size_t new_num_seqs, + std::vector const & old_phred_to_new_phred, + gyper::Variant const & var, + gyper::Variant & new_var) { - using gyper::get_strand_bias; - using gyper::join_strand_bias; - - std::vector seq_depths = var.get_seq_depth_of_all_alleles(); + //assert(var.infos.count("SBF1") > 0); - std::vector sbf = get_strand_bias(var.infos, "SBF"); - std::vector sbr = get_strand_bias(var.infos, "SBR"); + // Check if any infos to update + if (var.stats.per_allele.size() == 0) + return; - std::vector sbf1 = get_strand_bias(var.infos, "SBF1"); - std::vector sbf2 = get_strand_bias(var.infos, "SBF2"); - std::vector sbr1 = get_strand_bias(var.infos, "SBR1"); - std::vector sbr2 = get_strand_bias(var.infos, "SBR2"); + gyper::VarStats const & old_stats = var.stats;//(num_seqs); + //old_stats.read_stats(var.infos); - assert(sbf.size() > 0); - assert(sbf.size() == num_seqs); - assert(sbr.size() == num_seqs); - assert(sbf1.size() == num_seqs); - assert(sbf2.size() == num_seqs); - assert(sbr1.size() == num_seqs); - assert(sbr2.size() == num_seqs); + new_var.stats = gyper::VarStats(new_num_seqs); + gyper::VarStats & new_stats = new_var.stats; + new_stats.clipped_reads = old_stats.clipped_reads; + new_stats.mapq_squared = old_stats.mapq_squared; - if (sbf.size() > 0 && sbr.size() > 0) + for (uint32_t y = 0; y < num_seqs; ++y) { - std::vector new_sbf(new_num_seqs, 0u); - std::vector new_sbr(new_num_seqs, 0u); - - std::vector new_sbf1(new_num_seqs, 0u); - std::vector new_sbf2(new_num_seqs, 0u); - std::vector new_sbr1(new_num_seqs, 0u); - std::vector new_sbr2(new_num_seqs, 0u); - -// std::vector new_ra_count(new_num_seqs, 0u); -// std::vector new_ra_dist(new_num_seqs, 0u); - - for (uint32_t y = 0; y < num_seqs; ++y) - { - uint32_t new_y = old_phred_to_new_phred[y]; - - assert(new_y < new_num_seqs); - assert(y < seq_depths.size()); - assert(y < sbf.size()); - assert(y < sbr.size()); - assert(y < sbf1.size()); - assert(y < sbf2.size()); - assert(y < sbr1.size()); - assert(y < sbr2.size()); - - new_sbf[new_y] += sbf[y]; - new_sbr[new_y] += sbr[y]; - - new_sbf1[new_y] += sbf1[y]; - new_sbf2[new_y] += sbf2[y]; - new_sbr1[new_y] += sbr1[y]; - new_sbr2[new_y] += sbr2[y]; - -// new_ra_count[new_y] += ra_count[y]; -// new_ra_dist[new_y] += ra_dist[y]; - } - - new_var.infos["SBF"] = join_strand_bias(new_sbf); - new_var.infos["SBR"] = join_strand_bias(new_sbr); - - new_var.infos["SBF1"] = join_strand_bias(new_sbf1); - new_var.infos["SBF2"] = join_strand_bias(new_sbf2); - new_var.infos["SBR1"] = join_strand_bias(new_sbr1); - new_var.infos["SBR2"] = join_strand_bias(new_sbr2); - -// new_var.infos["RACount"] = join_strand_bias(new_ra_count); -// new_var.infos["RADist"] = join_strand_bias(new_ra_dist); + uint32_t new_y = old_phred_to_new_phred[y]; + + assert(new_y < new_num_seqs); + assert(new_y < new_stats.per_allele.size()); + assert(new_y < 
new_stats.read_strand.size()); + assert(y < old_stats.per_allele.size()); + assert(y < old_stats.read_strand.size()); + + auto const & old_a = old_stats.per_allele[y]; + auto & new_a = new_stats.per_allele[new_y]; + auto const & old_rs = old_stats.read_strand[y]; + auto & new_rs = new_stats.read_strand[new_y]; + + new_a.clipped_bp += old_a.clipped_bp; + new_a.mapq_squared += old_a.mapq_squared; + new_a.score_diff += old_a.score_diff; + new_a.mismatches += old_a.mismatches; + + new_rs.r1_forward += old_rs.r1_forward; + new_rs.r2_forward += old_rs.r2_forward; + new_rs.r1_reverse += old_rs.r1_reverse; + new_rs.r2_reverse += old_rs.r2_reverse; } + + //new_stats.add_stats(new_var.infos); } @@ -117,24 +90,29 @@ namespace gyper Variant::Variant() noexcept : abs_pos(0) + , stats(0) {} Variant::Variant(Variant const & var) noexcept : abs_pos(var.abs_pos) , seqs(var.seqs) , calls(var.calls) + , stats(var.stats) , infos(var.infos) , suffix_id(var.suffix_id) , is_info_generated(var.is_info_generated) + , type(var.type) {} Variant::Variant(Variant && var) noexcept : abs_pos(std::forward(var.abs_pos)) , seqs(std::forward > >(var.seqs)) , calls(std::forward >(var.calls)) + , stats(std::forward(var.stats)) , infos(std::forward >(var.infos)) , suffix_id(std::forward(var.suffix_id)) , is_info_generated(var.is_info_generated) + , type(var.type) {} @@ -144,9 +122,11 @@ Variant::operator=(Variant const & o) noexcept abs_pos = o.abs_pos; seqs = o.seqs; calls = o.calls; + stats = o.stats; infos = o.infos; suffix_id = o.suffix_id; is_info_generated = o.is_info_generated; + type = o.type; return *this; } @@ -157,27 +137,33 @@ Variant::operator=(Variant && o) noexcept abs_pos = o.abs_pos; seqs = std::move(o.seqs); calls = std::move(o.calls); + stats = std::move(o.stats); infos = std::move(o.infos); suffix_id = std::move(o.suffix_id); is_info_generated = o.is_info_generated; + type = o.type; return *this; } Variant::Variant(Genotype const & gt) + : stats(0) { seqs = graph.get_all_sequences_of_a_genotype(gt); + stats = VarStats(seqs.size()); abs_pos = gt.id - 1; // -1 cause we always fetch one position back as well } Variant::Variant(std::vector const & gts, std::vector const & hap_calls) + : stats(0) { assert(gts.size() > 0); for (long i = 0; i < static_cast(hap_calls.size()); ++i) seqs.push_back(graph.get_sequence_of_a_haplotype_call(gts, hap_calls[i])); + stats = VarStats(seqs.size()); abs_pos = gts[0].id - 1; // -1 cause we always fetch one position back as well } @@ -185,6 +171,7 @@ Variant::Variant(std::vector const & gts, std::vector const Variant::Variant(VariantCandidate const & var_candidate) noexcept : abs_pos(var_candidate.abs_pos) , seqs(var_candidate.seqs) + , stats(0) {} @@ -226,7 +213,7 @@ Variant::update_camou_phred(long const ploidy) { assert(y < static_cast(normalized_cov.size())); - long constexpr ERROR = 4; + long constexpr ERROR{4}; long const norm_cov = normalized_cov[y]; long phred00 = norm_cov * ERROR; long phred01_or_11 = cov[0]; @@ -262,12 +249,15 @@ void Variant::generate_infos() { // First check if INFO had already been generated - if (is_info_generated) - return; + //if (is_info_generated) + // return; assert(seqs.size() >= 2); infos["RefLen"] = std::to_string(seqs[0].size()); + // Write stats from VarStat object + stats.write_stats(infos); + // Calculate AC, AN, SeqDepth, MaxVS, ABHom, ABHet, ABHomMulti, ABHetMulti std::vector ac(seqs.size() - 1, 0u); std::vector pass_ac(seqs.size() - 1, 0u); @@ -276,10 +266,11 @@ Variant::generate_infos() long pass_an{0}; long genotyped{0}; uint64_t 
seqdepth{0}; + std::vector seqdepth_alt(seqs.size() - 1, 0u); std::pair het_allele_depth = {0ul, 0ul}; // First is the first call, second is the second call std::pair hom_allele_depth = {0ul, 0ul}; // First is the called allele, second is not the called one - std::vector > het_multi_allele_depth(seqs.size() - 1, {0u, 0u}); + std::vector > het_multi_allele_depth(seqs.size(), {0u, 0u}); std::vector > hom_multi_allele_depth(seqs.size(), {0u, 0u}); // Max variant support and variant support ratio @@ -292,10 +283,10 @@ Variant::generate_infos() uint64_t n_alt_alt{0u}; // Maximum number of alt proper pairs (MaxAltPP) - uint8_t n_max_alt_proper_pairs = 0u; + uint8_t n_max_alt_proper_pairs{0u}; // For calculation how many calls passed - long n_passed_calls = 0; + long n_passed_calls{0}; for (auto const & sample_call : calls) { @@ -312,15 +303,13 @@ Variant::generate_infos() n_max_alt_proper_pairs = std::max(n_max_alt_proper_pairs, sample_call.alt_proper_pair_depth); uint32_t const total_depth = std::accumulate(sample_call.coverage.cbegin(), sample_call.coverage.cend(), - 0u - ); + 0u); // Skip zero (the reference) for (uint16_t c = 1; c < sample_call.coverage.size(); ++c) { maximum_variant_support[c - 1] = std::max(maximum_variant_support[c - 1], - sample_call.coverage[c] - ); + sample_call.coverage[c]); if (total_depth > 0) { @@ -328,8 +317,7 @@ Variant::generate_infos() static_cast(total_depth); maximum_alternative_support_ratio[c - 1] = std::max(maximum_alternative_support_ratio[c - 1], - current_ratio - ); + current_ratio); } } @@ -369,36 +357,47 @@ Variant::generate_infos() } } - // ABHetMulti and ABHomMulti is calculated on multiallelic sites only - if (seqs.size() > 2) + // ABHetMulti and ABHomMulti is calculated on multiallelic sites only - NO LONGER + //if (seqs.size() > 2) { + uint32_t const call_depth = sample_call.get_unique_depth(); + // Check if heterozygous if (call.first != call.second) { // Only consider heterozygous reference calls - if (call.first == 0u) + //if (call.first == 0u) { - auto const & c = call.second; - assert((c - 1) < static_cast(het_multi_allele_depth.size())); - het_multi_allele_depth[c - 1].first += sample_call.coverage[0]; - het_multi_allele_depth[c - 1].second += sample_call.coverage[c]; + auto const & c1 = call.first; + auto const & c2 = call.second; + assert(c1 < static_cast(het_multi_allele_depth.size())); + assert(c2 < static_cast(het_multi_allele_depth.size())); + + het_multi_allele_depth[c1].first += sample_call.coverage[c1]; + het_multi_allele_depth[c1].second += call_depth - sample_call.coverage[c1]; + + het_multi_allele_depth[c2].first += sample_call.coverage[c2]; + het_multi_allele_depth[c2].second += call_depth - sample_call.coverage[c2]; } } else { auto const & c = call.first; - assert(c < hom_multi_allele_depth.size()); + assert(static_cast(c) < static_cast(hom_multi_allele_depth.size())); hom_multi_allele_depth[c].first += sample_call.coverage[c]; - hom_multi_allele_depth[c].second += std::accumulate(sample_call.coverage.cbegin(), - sample_call.coverage.cend(), - 0u - ) - sample_call.coverage[c]; + hom_multi_allele_depth[c].second += call_depth - sample_call.coverage[c]; } } // SeqDepth if (sample_call.coverage.size() > 0) + { seqdepth += sample_call.get_depth(); + assert(sample_call.coverage.size() == seqdepth_alt.size() + 1); + + for (long c{1}; c < static_cast(sample_call.coverage.size()); ++c) + seqdepth_alt[c - 1] += sample_call.coverage[c]; + } // AN { @@ -435,7 +434,7 @@ Variant::generate_infos() else ++n_alt_alt; } - } // for sample_call 
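// The per-sample loop above only accumulates raw counters: allele counts for
// AC/AN, total and per-alt sequencing depth, allele-balance depth pairs,
// proper-pair support and pass/fail tallies. The blocks below derive the
// aggregate INFO fields (MaxAAS, AF, SeqDepth, ABHet/ABHom, QD/QDalt, ...)
// from those counters.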
+ } // LOOP over sample_call // Write MaxAAS { @@ -486,7 +485,6 @@ Variant::generate_infos() // Write AN infos["AN"] = std::to_string(an); - // Write AF { std::ostringstream af_ss; @@ -573,7 +571,7 @@ Variant::generate_infos() infos["ABHet"] = ss_abhet.str(); } - double info_abhom{0.0}; + double info_abhom{0.985}; // Write ABHom { @@ -633,78 +631,62 @@ Variant::generate_infos() infos["SBAlt"] = ss_sb.str(); } - if (seqs.size() > 2) + // Write ABHetMulti { - // Write ABHetMulti - { - assert(het_multi_allele_depth.size() > 0); - assert(het_multi_allele_depth.size() == seqs.size() - 1); - std::stringstream ss_abhet; - ss_abhet.precision(4); + assert(het_multi_allele_depth.size() > 0); + assert(het_multi_allele_depth.size() == seqs.size()); + std::stringstream ss_abhet; + ss_abhet.precision(4); - for (unsigned i = 0; i < het_multi_allele_depth.size(); ++i) - { - if (i > 0) - ss_abhet << ","; + for (unsigned i = 0; i < het_multi_allele_depth.size(); ++i) + { + if (i > 0) + ss_abhet << ","; - auto const & het_depth_pair = het_multi_allele_depth[i]; - uint32_t const total_het_depth = het_depth_pair.first + het_depth_pair.second; + auto const & het_depth_pair = het_multi_allele_depth[i]; + uint32_t const total_het_depth = het_depth_pair.first + het_depth_pair.second; - if (total_het_depth > 0) - { - ss_abhet << (static_cast(het_depth_pair.second) / - static_cast(total_het_depth)); - } - else - { - ss_abhet << "-1"; - } + if (total_het_depth > 0) + { + ss_abhet << (static_cast(het_depth_pair.second) / + static_cast(total_het_depth)); } - - infos["ABHetMulti"] = ss_abhet.str(); - } - - // Write ABHomMulti - { - assert(hom_multi_allele_depth.size() > 0); - assert(hom_multi_allele_depth.size() == seqs.size()); - std::stringstream ss_abhom; - ss_abhom.precision(4); - - for (unsigned i = 0; i < hom_multi_allele_depth.size(); ++i) + else { - if (i > 0) - ss_abhom << ","; - - auto const & hom_depth_pair = hom_multi_allele_depth[i]; - uint32_t const total_hom_depth = hom_depth_pair.first + hom_depth_pair.second; - - if (total_hom_depth > 0) - { - ss_abhom << (static_cast(hom_depth_pair.first) / - static_cast(total_hom_depth)); - } - else - { - ss_abhom << "-1"; - } + ss_abhet << "-1"; } - - infos["ABHomMulti"] = ss_abhom.str(); } + + infos["ABHetMulti"] = ss_abhet.str(); } - else + + // Write ABHomMulti { - // Erase ABH[et|om]Multi if it is there - auto find_it = infos.find("ABHetMulti"); + assert(hom_multi_allele_depth.size() > 0); + assert(hom_multi_allele_depth.size() == seqs.size()); + std::stringstream ss_abhom; + ss_abhom.precision(4); + + for (unsigned i = 0; i < hom_multi_allele_depth.size(); ++i) + { + if (i > 0) + ss_abhom << ","; - if (find_it != infos.end()) - infos.erase(find_it); + auto const & hom_depth_pair = hom_multi_allele_depth[i]; + uint32_t const total_hom_depth = hom_depth_pair.first + hom_depth_pair.second; - find_it = infos.find("ABHomMulti"); + if (total_hom_depth > 0) + { + ss_abhom << (static_cast(hom_depth_pair.first) / + static_cast(total_hom_depth)); + } + else + { + ss_abhom << "-1"; + } + } - if (find_it != infos.end()) - infos.erase(find_it); + infos["ABHomMulti"] = ss_abhom.str(); } // Write VarType @@ -723,6 +705,23 @@ Variant::generate_infos() infos["QD"] = ss.str(); } + std::vector qd_alt = get_qual_by_depth_per_alt_allele(); + + // Calculate QDalt + { + std::stringstream ss; + ss.precision(4); + ss << qd_alt[0]; + + for (long q{1}; q < static_cast(qd_alt.size()); ++q) + { + ss.precision(4); + ss << ',' << qd_alt[q]; + } + + infos["QDalt"] = ss.str(); + } + long 
info_mq{60}; // Calculate MQ @@ -736,43 +735,140 @@ Variant::generate_infos() if (seqdepth > 0) { double mapq = std::sqrt(mapq_squared / static_cast(seqdepth)); - info_mq = static_cast(mapq + 0.499999); + info_mq = static_cast(mapq + 0.5); infos["MQ"] = std::to_string(info_mq); } else { - infos["MQ"] = "255"; + infos["MQ"] = "0"; } } } - // Logistic regression quality filter (LOGF). Work in progress + assert(stats.read_strand.size() == seqs.size()); + assert(stats.per_allele.size() == seqs.size()); + assert(seqdepth_alt.size() + 1 == seqs.size()); + + // Calculate SDalt, MMalt, CRalt and MQalt { - long const info_cr = infos.count("CR") ? std::stol(infos.at("CR")) : 0; - long const ab_het_bin = static_cast(info_ab_het * 10.0 + 0.00001); - long const sbalt_bin = static_cast(info_sbalt * 10.0 + 0.00001); - double const cr_by_seqdepth = static_cast(info_cr) / static_cast(seqdepth); - double const gt_yield = static_cast(genotyped) / static_cast(an); - - // variables: - // crBySeqdepth=info_cr/seqdepth - // info_qd - // info_abhom - // info_mq - // info_pass_ratio - // gtYield=genotyped/an - assert(ab_het_bin >= 0l); - assert(ab_het_bin <= 10l); - assert(sbalt_bin >= 0l); - assert(sbalt_bin <= 10l); - - double const logf = get_logf(info_abhom, cr_by_seqdepth, info_mq, - info_pass_ratio, gt_yield, info_qd, ab_het_bin, sbalt_bin); - - std::ostringstream ss; - ss.precision(4); - ss << logf; - infos["LOGF"] = ss.str(); + std::ostringstream sd_ss; + std::ostringstream mm_ss; + std::ostringstream cr_ss; + std::ostringstream mq_ss; + + for (long s{0}; s < static_cast(seqdepth_alt.size()); ++s) + { + uint64_t const alt_depth{seqdepth_alt[s]}; + + if (s > 0) + { + sd_ss << ','; + mm_ss << ','; + cr_ss << ','; + mq_ss << ','; + } + + if (alt_depth > 0) + { + auto const & per_al = stats.per_allele[s + 1]; + + sd_ss << (static_cast(per_al.score_diff) / static_cast(alt_depth)); + mm_ss << (static_cast(per_al.mismatches) / static_cast(alt_depth) / 10.0); + cr_ss << (static_cast(per_al.clipped_bp) / static_cast(alt_depth) / 10.0); + mq_ss << static_cast(std::sqrt(static_cast(per_al.mapq_squared) / + static_cast(alt_depth)) + 0.5); + } + else + { + sd_ss << "0.0"; + mm_ss << "0.0"; + cr_ss << "0.0"; + mq_ss << "0"; + } + } + + infos["SDalt"] = sd_ss.str(); + infos["MMalt"] = mm_ss.str(); + infos["CRalt"] = cr_ss.str(); + infos["MQalt"] = mq_ss.str(); + } + + std::vector sb_alt(seqs.size() - 1, 0); + + for (long s{0}; s < static_cast(sb_alt.size()); ++s) + sb_alt[s] = stats.read_strand[s + 1].get_reverse_count(); + + // Alternative allele score (AAScore) + if (!graph.is_sv_graph) + { + { + std::vector aa_score(seqdepth_alt.size(), 0); + + for (long s{0}; s < static_cast(seqdepth_alt.size()); ++s) + { + if (seqdepth_alt[s] > 0) + { + auto const & per_al = stats.per_allele[s + 1]; + + assert(s < static_cast(qd_alt.size())); + double const alt_seq_depth = seqdepth_alt[s]; + double const _sb = 2.0 * ((static_cast(sb_alt[s]) / alt_seq_depth) - 0.5); + double const sb = _sb >= 0.0 ? 
_sb : -_sb; + assert(sb >= 0.0); + double const mm = static_cast(per_al.mismatches) / alt_seq_depth / 10.0; + long const sd = static_cast(per_al.score_diff) / alt_seq_depth + 0.5; + double const cr = static_cast(per_al.clipped_bp) / alt_seq_depth / 10.0; + long const mq = std::sqrt(static_cast(per_al.mapq_squared) / alt_seq_depth) + 0.5; + + aa_score[s] = get_aa_score(info_abhom, sb, mm, sd, qd_alt[s], cr, mq); + } + else + { + aa_score[s] = 0.0; + } + } + + std::ostringstream ss; + ss.precision(4); + ss << aa_score[0]; + + for (long s{1}; s < static_cast(aa_score.size()); ++s) + { + ss.precision(4); + ss << "," << aa_score[s]; + } + + infos["AAScore"] = ss.str(); + } + + // Logistic regression quality filter (LOGF). + { + long const info_cr = infos.count("CR") ? std::stol(infos.at("CR")) : 0; + long const ab_het_bin = static_cast(info_ab_het * 10.0 + 0.00001); + long const sbalt_bin = static_cast(info_sbalt * 10.0 + 0.00001); + double const cr_by_seqdepth = static_cast(info_cr) / static_cast(seqdepth); + double const gt_yield = static_cast(genotyped) / static_cast(an); + + // variables: + // crBySeqdepth=info_cr/seqdepth + // info_qd + // info_abhom + // info_mq + // info_pass_ratio + // gtYield=genotyped/an + assert(ab_het_bin >= 0l); + assert(ab_het_bin <= 10l); + assert(sbalt_bin >= 0l); + assert(sbalt_bin <= 10l); + + double const logf = get_logf(info_abhom, cr_by_seqdepth, info_mq, + info_pass_ratio, gt_yield, info_qd, ab_het_bin, sbalt_bin); + + std::ostringstream ss; + ss.precision(4); + ss << logf; + infos["LOGF"] = ss.str(); + } } is_info_generated = true; @@ -1220,18 +1316,21 @@ Variant::get_qual() const double Variant::get_qual_by_depth() const { - long total_qual = 0; - long total_depth = 0; + long total_qual{0}; + long total_depth{0}; for (auto const & sample_call : calls) { // Skip homozygous ref calls if (sample_call.phred[0] > 0) { - auto const & cov = sample_call.coverage; - total_qual += static_cast(sample_call.phred[0]); - long const depth = std::min(10l, std::accumulate(cov.begin() + 1, cov.end(), 0l)); - total_depth += depth; + long const depth = std::min(10l, static_cast(sample_call.get_alt_depth())); + + if (depth > 0) + { + total_qual += std::min(25l * depth, static_cast(sample_call.phred[0])); + total_depth += depth; + } } } @@ -1242,6 +1341,64 @@ Variant::get_qual_by_depth() const } +std::vector +Variant::get_qual_by_depth_per_alt_allele() const +{ + assert(seqs.size() > 0); + long const num_alts = seqs.size() - 1; + std::vector qd(num_alts, 0.0); + std::vector total_qual(num_alts, 0); + std::vector total_depth(num_alts, 0); + + for (auto const & sample_call : calls) + { + // Skip homozygous ref calls + if (sample_call.phred[0] > 0) + { + std::pair gt_call = sample_call.get_gt_call(); + auto const & cov = sample_call.coverage; + assert(gt_call.first < cov.size()); + assert(gt_call.second < cov.size()); + + if (gt_call.first > 0) + { + long const depth = std::min(10l, static_cast(cov[gt_call.first] + sample_call.ambiguous_depth)); + + if (depth > 0) + { + total_qual[gt_call.first - 1] += + std::min(25l * depth, static_cast(sample_call.get_lowest_phred_not_with(gt_call.first))); + + total_depth[gt_call.first - 1] += depth; + } + } + + if (gt_call.first != gt_call.second) + { + assert(gt_call.second > 0); + long const depth = std::min(10l, static_cast(cov[gt_call.second] + sample_call.ambiguous_depth)); + + if (depth > 0) + { + total_qual[gt_call.second - 1] += + std::min(25l * depth, static_cast(sample_call.get_lowest_phred_not_with(gt_call.second))); + + 
total_depth[gt_call.second - 1] += depth; + } + } + } + } + + for (long s{0}; s < num_alts; ++s) + { + if (total_depth[s] > 0) + qd[s] = static_cast(total_qual[s]) / static_cast(total_depth[s]); + } + + return qd; +} + + std::vector make_biallelic(Variant && var) { @@ -1310,7 +1467,7 @@ make_biallelic(Variant && var) new_var.calls.push_back(std::move(new_sample_call)); } - update_strand_bias(var.seqs.size(), new_var.seqs.size(), old_phred_to_new_phred, var, new_var); + update_per_allele_stats(var.seqs.size(), new_var.seqs.size(), old_phred_to_new_phred, var, new_var); out.push_back(std::move(new_var)); } @@ -1326,6 +1483,7 @@ break_down_variant(Variant && var, bool const is_all_biallelic) { std::vector broken_down_vars; + //assert(var.stats.per_allele.size() == var.seqs.size()); if (Options::const_instance()->no_decompose || (var.seqs.size() == 2 && @@ -1624,11 +1782,11 @@ find_variant_sequences(gyper::Variant & new_var, gyper::Variant const & old_var) } // Update strand bias - update_strand_bias(old_var.seqs.size(), - new_seqs.size(), - old_phred_to_new_phred, - old_var, - new_var); + update_per_allele_stats(old_var.seqs.size(), + new_seqs.size(), + old_phred_to_new_phred, + old_var, + new_var); new_var.calls.push_back(std::move(new_call)); } @@ -1643,6 +1801,7 @@ break_multi_snps(Variant const && var) uint32_t const pos = var.abs_pos; std::vector > const & seqs = var.seqs; std::vector new_vars; + //assert(var.infos.count("SBF1") == 1); for (long j = 0; j < static_cast(seqs[0].size()); ++j) { @@ -1726,8 +1885,9 @@ break_multi_snps(Variant const && var) new_var.calls.push_back(std::move(new_sample_call)); } - if (var.infos.count("SBF1") == 1) - update_strand_bias(seqs.size(), new_var.seqs.size(), old_phred_to_new_phred, var, new_var); + //if (var.infos.count("SBF1") == 1) + //assert(var.infos.count("SBF1") == 1); + update_per_allele_stats(seqs.size(), new_var.seqs.size(), old_phred_to_new_phred, var, new_var); new_vars.push_back(std::move(new_var)); } @@ -1748,7 +1908,7 @@ break_down_skyr(Variant && var, long const reach) std::min(OPTIMAL_EXTRA, static_cast(var.abs_pos) - reach - 1l) : OPTIMAL_EXTRA; - long constexpr extra_bases_after = OPTIMAL_EXTRA; + long constexpr extra_bases_after{OPTIMAL_EXTRA}; for (long i = 0; i < extra_bases_before && var.add_base_in_front(false); ++i) {} @@ -1825,7 +1985,7 @@ break_down_skyr(Variant && var, long const reach) new_var.calls.push_back(std::move(new_sample_call)); } - update_strand_bias(seqs.size(), new_var.seqs.size(), old_phred_to_new_phred, var, new_var); + update_per_allele_stats(seqs.size(), new_var.seqs.size(), old_phred_to_new_phred, var, new_var); new_vars.push_back(std::move(new_var)); } @@ -1866,6 +2026,7 @@ Variant::serialize(Archive & ar, unsigned const int /*version*/) ar & abs_pos; ar & seqs; ar & calls; + ar & stats; ar & infos; ar & suffix_id; } diff --git a/src/typer/variant_map.cpp b/src/typer/variant_map.cpp index 7bfdbd4..afe03c2 100644 --- a/src/typer/variant_map.cpp +++ b/src/typer/variant_map.cpp @@ -415,9 +415,11 @@ VariantMap::filter_varmap_for_all() long const reach{-1}; bool const is_no_variant_overlapping{true}; // Force it to be true so we wont use skyr bool const is_all_biallelic{false}; + Variant new_var(var_cp); + new_var.infos["SBF1"] = "."; std::vector new_broken_down_vars = - break_down_variant(Variant(var_cp), reach, is_no_variant_overlapping, is_all_biallelic); + break_down_variant(std::move(new_var), reach, is_no_variant_overlapping, is_all_biallelic); assert(new_broken_down_vars.size() != 0); diff --git 
a/src/typer/vcf.cpp b/src/typer/vcf.cpp index d6a5e5f..503a79f 100644 --- a/src/typer/vcf.cpp +++ b/src/typer/vcf.cpp @@ -25,6 +25,7 @@ #include "tbx.h" + namespace { @@ -293,19 +294,17 @@ Vcf::read_record(bool const SITES_ONLY) std::unordered_set const keys_to_parse( { -// "AC", - "CR", + "CR", "CRal", "CRalt", "END", "HOMSEQ" // "INV3", "INV5", "LEFT_SVINSSEQ", "NCLUSTERS", "NUM_MERGED_SVS", - "MQsquared", + "MMal", "MMalt", "MQ", "MQSal", "MQsquared", "MQal", "OLD_VARIANT_ID", "OREND", "ORSTART", - "PS", "RELATED_SV_ID", "RIGHT_SVINSSEQ", "SBF", "SBF1", "SBF2", - "SBR", "SBR1", "SBR2", + "SBR", "SBR1", "SBR2", "SDal", "SDalt", "SVLEN", "SVTYPE", "SVSIZE", "SVMODEL", "SEQ", "SVINSSEQ", "SV_ID" } ); @@ -589,23 +588,26 @@ Vcf::write_header() // INFO definitions { bgzf_stream.ss + << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" - << "##INFO=\n" + << "##INFO=\n" << "##INFO=\n" + " homozygous calls (A/(A+0)) where A is the called allele and O is anything " + "else. -1 if not available.\">\n" << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" - << "##INFO=\n" + << "##INFO=\n" + << "##INFO=\n" + << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" @@ -620,11 +622,16 @@ Vcf::write_header() "ratio per alt. allele.\">\n" << "##INFO=\n" + << "##INFO=\n" + << "##INFO=\n" << "##INFO=\n" - << "##INFO=\n" + << "##INFO=\n" + + << "##INFO=\n" << "##INFO=\n" + "cluster.\">\n" << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" + << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" - << "##INFO=\n" +// << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" + << "##INFO=\n" + << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" - << "##INFO=\n" - << "##INFO=\n" + << "##INFO=\n" + << "##INFO=\n" << "##INFO=\n" << "##INFO=\n" @@ -1185,7 +1197,11 @@ Vcf::close_vcf_file() void Vcf::write_tbi_index() const { - int ret = tbx_index_build(filename.c_str(), 0, &tbx_conf_vcf); + bool const is_csi = Options::const_instance()->is_csi; + + int ret = is_csi ? 
+ tbx_index_build(filename.c_str(), 14, &tbx_conf_vcf) : + tbx_index_build(filename.c_str(), 0, &tbx_conf_vcf); if (ret < 0) BOOST_LOG_TRIVIAL(warning) << __HERE__ << " Could not build VCF index"; @@ -1219,7 +1235,7 @@ Vcf::add_segment(Segment && segment) void -Vcf::add_haplotype(Haplotype & haplotype, bool const clear_haplotypes, uint32_t const phase_set) +Vcf::add_haplotype(Haplotype & haplotype, bool const clear_haplotypes, uint32_t const /*phase_set*/) { assert(haplotype.gts.size() > 0); @@ -1236,18 +1252,12 @@ Vcf::add_haplotype(Haplotype & haplotype, bool const clear_haplotypes, uint32_t // Add variant stats for (long i = 0; i < static_cast(new_vars.size()); ++i) { - auto const & stat = haplotype.var_stats[i]; + VarStats const & stats = haplotype.var_stats[i]; auto & new_var = new_vars[i]; - new_var.infos["CR"] = std::to_string(stat.clipped_reads); - new_var.infos["PS"] = std::to_string(phase_set); - new_var.infos["MQsquared"] = std::to_string(stat.mapq_squared); - new_var.infos["SBF"] = stat.get_forward_strand_bias(); - new_var.infos["SBR"] = stat.get_reverse_strand_bias(); - new_var.infos["SBF1"] = stat.get_r1_forward_strand_bias(); - new_var.infos["SBF2"] = stat.get_r2_forward_strand_bias(); - new_var.infos["SBR1"] = stat.get_r1_reverse_strand_bias(); - new_var.infos["SBR2"] = stat.get_r2_reverse_strand_bias(); + //new_var.infos["PS"] = std::to_string(phase_set); + //stats.add_stats(new_var.infos); + new_var.stats = stats; } //long const ploidy = Options::const_instance()->ploidy; @@ -1653,8 +1663,7 @@ save_vcf(Vcf const & vcf, std::string const & filename) if (!ofs.is_open()) { BOOST_LOG_TRIVIAL(error) << __HERE__ << " Could not save VCF to location '" - << filename - << "'"; + << filename << "'"; std::exit(1); } diff --git a/src/typer/vcf_operations.cpp b/src/typer/vcf_operations.cpp index b101fc1..b76fd2f 100644 --- a/src/typer/vcf_operations.cpp +++ b/src/typer/vcf_operations.cpp @@ -73,82 +73,8 @@ vcf_merge(std::vector & vcfs, std::string const & output) // For each variant in the first VCF, add the calls from the other VCFs for (auto & var : vcf.variants) { - // Only output variant if it is in the genotyping region - // CR - uint32_t number_of_clipped_reads = 0; - - { - auto find_it = var.infos.find("CR"); - - if (find_it != var.infos.end()) - number_of_clipped_reads = std::strtoull(find_it->second.c_str(), NULL, 10); - } - - // MQ. 
Keep track of the total mapping quality rooted/squared - uint64_t mapq_squared = 0; - - { - auto find_it = var.infos.find("MQsquared"); - - if (find_it != var.infos.end()) - mapq_squared = std::strtoull(find_it->second.c_str(), NULL, 10); - } - - // SBF and SBR - std::vector strand_forward; - std::vector strand_reverse; - - std::vector r1_strand_forward; - std::vector r1_strand_reverse; - std::vector r2_strand_forward; - std::vector r2_strand_reverse; - -// std::vector realignment_distance; -// std::vector realignment_count; - - { - auto find_it = var.infos.find("SBF"); - - if (find_it != var.infos.end()) - strand_forward = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBR"); - - if (find_it != var.infos.end()) - strand_reverse = split_bias_to_numbers(find_it->second); - - // Read specific strand bias - find_it = var.infos.find("SBF1"); - - if (find_it != var.infos.end()) - r1_strand_forward = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBF2"); - - if (find_it != var.infos.end()) - r2_strand_forward = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBR1"); - - if (find_it != var.infos.end()) - r1_strand_reverse = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBR2"); - - if (find_it != var.infos.end()) - r2_strand_reverse = split_bias_to_numbers(find_it->second); - -// // Realignment count and distance -// find_it = var.infos.find("RACount"); -// -// if (find_it != var.infos.end()) -// realignment_count = split_bias_to_numbers(find_it->second); -// -// find_it = var.infos.find("RADist"); -// -// if (find_it != var.infos.end()) -// realignment_distance = split_bias_to_numbers(find_it->second); - } + //VarStats stats(var.seqs.size()); + //stats.read_stats(var.infos); for (auto & next_vcf : next_vcfs) { @@ -166,54 +92,7 @@ vcf_merge(std::vector & vcfs, std::string const & output) } assert(next_vcf.variants.size() == 1); - - // Get CR - { - auto find_it = next_vcf.variants[0].infos.find("CR"); - - if (find_it != next_vcf.variants[0].infos.end()) - number_of_clipped_reads += std::strtoull(find_it->second.c_str(), NULL, 10); - } - - // Get MQ - { - auto find_it = next_vcf.variants[0].infos.find("MQsquared"); - - if (find_it != next_vcf.variants[0].infos.end()) - mapq_squared += std::strtoull(find_it->second.c_str(), NULL, 10); - } - - - // Get SBF and SBR - { - auto add_to_bias_lambda = [&](std::string id, std::vector & bias) - { - auto find_it = next_vcf.variants[0].infos.find(id); - - if (find_it != next_vcf.variants[0].infos.end()) - { - std::vector split_nums = - split_bias_to_numbers(find_it->second); - - for (std::size_t i = 0; i < split_nums.size(); ++i) - { - if (i < strand_forward.size()) - bias[i] += split_nums[i]; - else - bias.push_back(split_nums[i]); - } - } - }; - - add_to_bias_lambda("SBF", strand_forward); - add_to_bias_lambda("SBR", strand_reverse); - add_to_bias_lambda("SBF1", r1_strand_forward); - add_to_bias_lambda("SBF2", r2_strand_forward); - add_to_bias_lambda("SBR1", r1_strand_reverse); - add_to_bias_lambda("SBR2", r2_strand_reverse); -// add_to_bias_lambda("RACount", realignment_count); -// add_to_bias_lambda("RADist", realignment_distance); - } + var.stats.add_stats(next_vcf.variants[0].stats); std::move(next_vcf.variants[0].calls.begin(), next_vcf.variants[0].calls.end(), @@ -222,23 +101,7 @@ vcf_merge(std::vector & vcfs, std::string const & output) next_vcf.variants.clear(); } - // Add strand bias, this must happend before the INFO is generated - var.infos["SBF"] = 
join_strand_bias(strand_forward); - var.infos["SBR"] = join_strand_bias(strand_reverse); - - var.infos["SBF1"] = join_strand_bias(r1_strand_forward); - var.infos["SBF2"] = join_strand_bias(r2_strand_forward); - var.infos["SBR1"] = join_strand_bias(r1_strand_reverse); - var.infos["SBR2"] = join_strand_bias(r2_strand_reverse); - -// var.infos["RACount"] = join_strand_bias(realignment_count); -// var.infos["RADist"] = join_strand_bias(realignment_distance); - - // Add MQsquared - var.infos["MQsquared"] = std::to_string(mapq_squared); - - // Add CR - var.infos["CR"] = std::to_string(number_of_clipped_reads); + //stats.add_stats(var.infos); // generate the rest of the INFOs var.generate_infos(); @@ -298,8 +161,7 @@ vcf_merge_and_break(std::vector const & vcfs, long const ploidy = copts.ploidy; GenomicRegion genomic_region(region); uint32_t const region_begin = 1 + absolute_pos.get_absolute_position(genomic_region.chr, - genomic_region.begin - ); + genomic_region.begin); uint32_t const region_end = absolute_pos.get_absolute_position(genomic_region.chr, genomic_region.end); @@ -334,8 +196,19 @@ vcf_merge_and_break(std::vector const & vcfs, vcf.sample_names.insert(vcf.sample_names.end(), next_vcf.sample_names.begin(), next_vcf.sample_names.end()); + } + // Check whether the sample names are unique + { + std::vector sample_names_cp(vcf.sample_names); + std::sort(sample_names_cp.begin(), sample_names_cp.end()); + auto uniq_it = std::unique(sample_names_cp.begin(), sample_names_cp.end()); + if (uniq_it != sample_names_cp.end()) + { + BOOST_LOG_TRIVIAL(warning) << __HERE__ << " Sample names are not unique. " + << "The output VCF file will contain duplicated sample names."; + } } // We have all the samples, write the header now @@ -361,68 +234,8 @@ vcf_merge_and_break(std::vector const & vcfs, for (long v = 0; v < static_cast(vcf.variants.size()); ++v) { auto & var = vcf.variants[v]; - - // CR - uint32_t number_of_clipped_reads = 0; - - { - auto find_it = var.infos.find("CR"); - - if (find_it != var.infos.end()) - number_of_clipped_reads = std::strtoull(find_it->second.c_str(), NULL, 10); - } - - // MQ. 
Keep track of the total mapping quality rooted/squared - uint64_t mapq_squared = 0; - - { - auto find_it = var.infos.find("MQsquared"); - - if (find_it != var.infos.end()) - mapq_squared = std::strtoull(find_it->second.c_str(), NULL, 10); - } - - // SBF and SBR - std::vector strand_forward; - std::vector strand_reverse; - - std::vector r1_strand_forward; - std::vector r1_strand_reverse; - std::vector r2_strand_forward; - std::vector r2_strand_reverse; - - { - auto find_it = var.infos.find("SBF"); - - if (find_it != var.infos.end()) - strand_forward = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBR"); - - if (find_it != var.infos.end()) - strand_reverse = split_bias_to_numbers(find_it->second); - - // Read specific strand bias - find_it = var.infos.find("SBF1"); - - if (find_it != var.infos.end()) - r1_strand_forward = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBF2"); - - if (find_it != var.infos.end()) - r2_strand_forward = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBR1"); - - if (find_it != var.infos.end()) - r1_strand_reverse = split_bias_to_numbers(find_it->second); - - find_it = var.infos.find("SBR2"); - - if (find_it != var.infos.end()) - r2_strand_reverse = split_bias_to_numbers(find_it->second); - } + var.is_info_generated = false; + //VarStats & stats = var.stats; // Trigger read next batch if (next_vcfs.size() > 0 && (v - v_next) == static_cast(next_vcfs[0].variants.size())) @@ -453,52 +266,7 @@ vcf_merge_and_break(std::vector const & vcfs, assert((v - v_next) >= 0); assert((v - v_next) < static_cast(next_vcf.variants.size())); auto & next_vcf_var = next_vcf.variants[v - v_next]; - - // Get CR - { - auto find_it = next_vcf_var.infos.find("CR"); - - if (find_it != next_vcf_var.infos.end()) - number_of_clipped_reads += std::strtoull(find_it->second.c_str(), NULL, 10); - } - - // Get MQ - { - auto find_it = next_vcf_var.infos.find("MQsquared"); - - if (find_it != next_vcf_var.infos.end()) - mapq_squared += std::strtoull(find_it->second.c_str(), NULL, 10); - } - - - // Get SBF and SBR - { - auto add_to_bias_lambda = - [&](std::string id, std::vector & bias) - { - auto find_it = next_vcf_var.infos.find(id); - - if (find_it != next_vcf_var.infos.end()) - { - std::vector split_nums = split_bias_to_numbers(find_it->second); - - for (long i = 0; i < static_cast(split_nums.size()); ++i) - { - if (i < static_cast(strand_forward.size())) - bias[i] += split_nums[i]; - else - bias.push_back(split_nums[i]); - } - } - }; - - add_to_bias_lambda("SBF", strand_forward); - add_to_bias_lambda("SBR", strand_reverse); - add_to_bias_lambda("SBF1", r1_strand_forward); - add_to_bias_lambda("SBF2", r2_strand_forward); - add_to_bias_lambda("SBR1", r1_strand_reverse); - add_to_bias_lambda("SBR2", r2_strand_reverse); - } + var.stats.add_stats(next_vcf_var.stats); std::move(next_vcf_var.calls.begin(), next_vcf_var.calls.end(), @@ -506,19 +274,7 @@ vcf_merge_and_break(std::vector const & vcfs, } // Add strand bias, this must happend before the INFO is generated - var.infos["SBF"] = join_strand_bias(strand_forward); - var.infos["SBR"] = join_strand_bias(strand_reverse); - - var.infos["SBF1"] = join_strand_bias(r1_strand_forward); - var.infos["SBF2"] = join_strand_bias(r2_strand_forward); - var.infos["SBR1"] = join_strand_bias(r1_strand_reverse); - var.infos["SBR2"] = join_strand_bias(r2_strand_reverse); - - // Add MQsquared - var.infos["MQsquared"] = std::to_string(mapq_squared); - - // Add CR - var.infos["CR"] = 
std::to_string(number_of_clipped_reads); + //stats.add_stats(var.infos); if (var.calls.size() != vcf.sample_names.size()) { @@ -527,6 +283,9 @@ vcf_merge_and_break(std::vector const & vcfs, std::exit(1); } + //assert(var.infos.count("SBF1") == 1); + assert(var.stats.per_allele.size() == var.seqs.size()); + // break down the merged variants bool const is_no_variant_overlapping{copts.no_variant_overlapping || force_no_variant_overlapping}; @@ -546,7 +305,7 @@ vcf_merge_and_break(std::vector const & vcfs, for (auto & new_var : new_variants) { // If we have not processed this variant before, do so now - if (!new_var.is_info_generated) + //if (!new_var.is_info_generated) { new_var.normalize(); @@ -555,38 +314,58 @@ vcf_merge_and_break(std::vector const & vcfs, new_var.generate_infos(); - // Remove MQsquared - new_var.infos.erase("MQsquared"); - new_var.infos.erase("PS"); + // Remove uneeded infos + //new_var.infos.erase("CRal"); + //new_var.infos.erase("MMal"); + //new_var.infos.erase("MQal"); + //new_var.infos.erase("MQsquared"); + //new_var.infos.erase("MQSal"); + //new_var.infos.erase("SDal"); } } update_reach(new_variants); std::move(new_variants.begin(), new_variants.end(), std::back_inserter(broken_vars)); - long constexpr W = 500; // Print variants that are more than W bp before the newest one + long constexpr W{700}; // Print variants that are more than W bp before the newest one - if ((broken_vars[0].abs_pos + 2 * W) < broken_vars[broken_vars.size() - 1].abs_pos) + auto min_max_it_pair = + std::minmax_element(broken_vars.begin(), + broken_vars.end(), + [](Variant const & a, Variant const & b) + { + return a.abs_pos < b.abs_pos; + }); + + assert(min_max_it_pair.first != broken_vars.end()); + assert(min_max_it_pair.second != broken_vars.end()); + long const min_abs_pos = min_max_it_pair.first->abs_pos; + long const max_abs_pos = min_max_it_pair.second->abs_pos; + + if ((min_abs_pos + 2l * W) < max_abs_pos) { // Make sure we do no print outside of the region - uint32_t const reg_end = - std::min(region_end, static_cast(broken_vars[broken_vars.size() - 1].abs_pos - W)); - - vcf.write_records(region_begin, - reg_end, - FILTER_ZERO_QUAL, - broken_vars); + long const reg_end = + std::min(static_cast(region_end), max_abs_pos - static_cast(W)); - // Remove variants that were written (or before the region, since they will never be printed) - broken_vars.erase( - std::remove_if(broken_vars.begin(), - broken_vars.end(), - [&](Variant const & v) - { - return v.abs_pos <= reg_end || v.abs_pos < region_begin; - }), - broken_vars.end() - ); + if (reg_end >= static_cast(region_begin)) + { + vcf.write_records(region_begin, + static_cast(reg_end), + FILTER_ZERO_QUAL, + broken_vars); + + // Remove variants that were written (or before the region, since they will never be printed) + broken_vars.erase( + std::remove_if(broken_vars.begin(), + broken_vars.end(), + [&](Variant const & v) + { + return static_cast(v.abs_pos) <= reg_end; + }), + broken_vars.end() + ); + } } var = Variant(); // Free memory @@ -608,6 +387,7 @@ vcf_concatenate(std::vector const & vcfs, std::string const & output, bool const SKIP_SORT, bool const SITES_ONLY, + bool const WRITE_TBI, std::string const & region) { gyper::Vcf vcf; @@ -701,10 +481,9 @@ vcf_concatenate(std::vector const & vcfs, if (SITES_ONLY) vcf.sample_names.clear(); - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::vcf_operations] Total number of samples read is " - << vcf.sample_names.size(); + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Total number of samples read is " << 
vcf.sample_names.size(); - for (std::size_t i = 0; i < vcfs.size(); ++i) + for (long i{0}; i < static_cast(vcfs.size()); ++i) { // Skip if the filename contains '*' if (std::count(vcfs[i].begin(), vcfs[i].end(), '*') > 0) @@ -730,9 +509,9 @@ vcf_concatenate(std::vector const & vcfs, // Merge with the other VCF if (next_vcf.sample_names.size() != vcf.sample_names.size()) { - BOOST_LOG_TRIVIAL(error) << "[graphtyper::vcf_operations] The VCF file " + BOOST_LOG_TRIVIAL(error) << __HERE__ << " The VCF file " << vcfs[i] - << " has different amount of samples! (" + << " has unexpected number of sample names! (" << next_vcf.sample_names.size() << " but not " << vcf.sample_names.size() << ")"; std::exit(1); @@ -753,6 +532,9 @@ vcf_concatenate(std::vector const & vcfs, vcf.write(region); } + + if (WRITE_TBI) + vcf.write_tbi_index(); } @@ -771,33 +553,27 @@ vcf_break_down(std::string const & vcf, std::string const & output, std::string // Read the sample names and add them vcf_in.read_samples(); - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::vcf_break_down] " - << "Total number of samples read is " + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Total number of samples read is " << vcf_in.sample_names.size(); // Copy sample names std::copy(vcf_in.sample_names.begin(), vcf_in.sample_names.end(), - std::back_inserter(vcf_out.sample_names) - ); + std::back_inserter(vcf_out.sample_names)); GenomicRegion genomic_region(region); uint32_t const region_begin = 1 + absolute_pos.get_absolute_position(genomic_region.chr, - genomic_region.begin - ); + genomic_region.begin); uint32_t const region_end = absolute_pos.get_absolute_position(genomic_region.chr, - genomic_region.end - ); + genomic_region.end); // Read first record bool not_at_end = vcf_in.read_record(); // Write header to output vcf_out.write_header(); - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::vcf_break_down] VCF header written"; - - long reach = -1; // Indicates how long the previous variants reached + long reach{-1}; // Indicates how long the previous variants reached auto update_reach = [&reach](std::vector const & new_variants) @@ -820,8 +596,8 @@ vcf_break_down(std::string const & vcf, std::string const & output, std::string // Make sure the number of calls matches the number of samples if (vcf_in.variants[0].calls.size() != vcf_in.sample_names.size()) { - BOOST_LOG_TRIVIAL(error) << "[graphtyper::vcf_operations::vcf_break_down]" - << "The number of calls, " + BOOST_LOG_TRIVIAL(error) << __HERE__ + << " The number of calls, " << vcf_in.variants[0].calls.size() << " did not match the number of samples, " << vcf_in.sample_names.size(); @@ -831,6 +607,7 @@ vcf_break_down(std::string const & vcf, std::string const & output, std::string bool const is_no_variant_overlapping{Options::const_instance()->no_variant_overlapping}; bool const is_all_biallelic{Options::const_instance()->is_all_biallelic}; + assert(vcf_in.variants[0].infos.count("SBF1") == 1); std::vector new_variants = break_down_variant(std::move(vcf_in.variants[0]), reach, is_no_variant_overlapping, diff --git a/src/typer/vcf_writer.cpp b/src/typer/vcf_writer.cpp index 55c56f6..521c733 100644 --- a/src/typer/vcf_writer.cpp +++ b/src/typer/vcf_writer.cpp @@ -73,11 +73,8 @@ namespace gyper VcfWriter::VcfWriter(uint32_t variant_distance) { haplotypes = gyper::graph.get_all_haplotypes(variant_distance); - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::vcf_writer] Number of variant nodes in graph " - << graph.var_nodes.size(); - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::vcf_writer] Got " - << 
haplotypes.size() - << " haplotypes."; + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Number of variant nodes in graph " << graph.var_nodes.size(); + BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Got " << haplotypes.size() << " haplotypes."; } @@ -107,20 +104,25 @@ VcfWriter::set_samples(std::vector const & samples) void VcfWriter::update_haplotype_scores_geno(GenotypePaths & geno, long const pn_index, Primers const * primers) { +#ifndef NDEBUG + if (Options::instance()->stats.size() > 0) + update_statistics(geno, pn_index); +#endif + if (are_genotype_paths_good(geno)) { if (primers) primers->check(geno); push_to_haplotype_scores(geno, pn_index); - -#ifndef NDEBUG - if (Options::instance()->stats.size() > 0) - update_statistics(geno, pn_index); - } -#else } -#endif //NDEBUG +//#ifndef NDEBUG +// if (Options::instance()->stats.size() > 0) +// update_statistics(geno, pn_index); +// } +//#else +// } +//#endif //NDEBUG } @@ -156,7 +158,6 @@ VcfWriter::print_variant_group_details() const } } - //std::lock_guard lock(io_mutex); write_gzipped_to_file(hap_file, hap_stats_fn); } @@ -209,7 +210,6 @@ VcfWriter::print_variant_details() const variant_file << '\n'; } - //std::lock_guard lock(io_mutex); write_gzipped_to_file(variant_file, variant_details_fn); } @@ -411,10 +411,14 @@ VcfWriter::push_to_haplotype_scores(GenotypePaths & geno, long const pn_index) assert(are_genotype_paths_good(geno)); // Quality metrics - bool const fully_aligned = geno.all_paths_fully_aligned(); + assert(geno.longest_path_length <= geno.read_length); + int const clipped_bp = geno.read_length - geno.longest_path_length; + bool const fully_aligned = clipped_bp == 0; + assert(fully_aligned == geno.all_paths_fully_aligned()); + bool const non_unique_paths = !geno.all_paths_unique(); std::size_t const mismatches = geno.paths[0].mismatches; - bool has_low_quality_snp = false; + bool has_low_quality_snp{false}; std::unordered_map recent_ids; // maps to is_overlapping @@ -434,7 +438,7 @@ VcfWriter::push_to_haplotype_scores(GenotypePaths & geno, long const pn_index) auto & hap = haplotypes[type_ids.first]; auto & num = p_it->nums[i]; - long constexpr MIN_OFFSET = 3; + long constexpr MIN_OFFSET{3}; bool is_overlapping = (p_it->start_ref_reach_pos() + MIN_OFFSET) <= p_it->var_order[i] && (p_it->end_ref_reach_pos() - MIN_OFFSET) > p_it->var_order[i]; recent_ids[type_ids.first] |= is_overlapping; @@ -487,9 +491,12 @@ VcfWriter::push_to_haplotype_scores(GenotypePaths & geno, long const pn_index) // Move INFO to variant statistics. // This needs to called before 'coverage_to_gts', because it uses the coverage of each variant. 
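// [editor's sketch] The calls just below replace the old boolean fully_aligned flag with
// explicit per-read metrics (clipped bases, mismatch count, alignment score difference)
// that are folded into the haplotype's variant statistics. The struct here is a small,
// hypothetical illustration of that accumulation pattern only; the names do not claim to
// match graphtyper's actual Haplotype/VarStats layout. Assumes nothing beyond standard C++.
struct AlleleQCSketch
{
  long clipped_bp{0};  // sum of soft-clipped bases over reads supporting the allele
  long mismatches{0};  // sum of mismatching bases over those reads
  long total_bp{0};    // sum of their read lengths

  void add_read(long clipped, long mism, long read_length)
  {
    clipped_bp += clipped;
    mismatches += mism;
    total_bp += read_length;
  }

  // e.g. a per-allele mismatch rate; -1 when no read supports the allele
  double mismatch_rate() const
  {
    return total_bp == 0 ? -1.0 : static_cast<double>(mismatches) / static_cast<double>(total_bp);
  }
};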
- haplotype.clipped_reads_to_stats(fully_aligned); + haplotype.clipped_reads_to_stats(clipped_bp, geno.read_length); haplotype.mapq_to_stats(geno.mapq); haplotype.strand_to_stats(geno.flags); + haplotype.mismatches_to_stats(mismatches, geno.read_length); + haplotype.score_diff_to_stats(geno.score_diff); + // TODO display these new stats // Update the likelihood scores haplotype.explain_to_score(pn_index, diff --git a/src/utilities/bamshrink.cpp b/src/utilities/bamshrink.cpp index ceffec8..16f758d 100644 --- a/src/utilities/bamshrink.cpp +++ b/src/utilities/bamshrink.cpp @@ -83,14 +83,17 @@ is_one_end_clipped(seqan::String > const & cigar, long con bool process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, bamshrink::Options const & opts) { - unsigned i = 0; - int64_t as = -1; - int64_t xs = -1; + unsigned i{0}; + int64_t as{-1}; + int64_t xs{-1}; + auto const tags_length = seqan::length(record.tags); - while (i < seqan::length(record.tags)) + while (i < tags_length) { //std::string curr_tag(2, record.tags[i]); //curr_tag[1] = record.tags[i+1]; + auto begin_it = begin(record.tags) + i; + auto end_it = begin_it; i += 3; char type = record.tags[i - 1]; bool is_as = false; @@ -120,16 +123,14 @@ process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, b // Check for RG if (record.tags[i - 3] == 'R' && record.tags[i - 2] == 'G') { - auto begin_it = begin(record.tags) + i - 3; - while (record.tags[i] != '\0' && record.tags[i] != '\n') ++i; ++i; - auto end_it = begin(record.tags) + i; - //unsigned const old_length = length(new_tags); - resize(new_tags, std::distance(begin_it, end_it), seqan::Exact()); - seqan::arrayCopyForward(begin_it, end_it, begin(new_tags)); + end_it = begin(record.tags) + i; + unsigned const old_length = length(new_tags); + resize(new_tags, old_length + std::distance(begin_it, end_it), seqan::Exact()); + seqan::arrayCopyForward(begin_it, end_it, begin(new_tags) + old_length); } else { @@ -158,6 +159,7 @@ process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, b } i += sizeof(int8_t); + end_it = begin(record.tags) + i; break; } @@ -177,6 +179,7 @@ process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, b } i += sizeof(uint8_t); + end_it = begin(record.tags) + i; break; } @@ -196,6 +199,7 @@ process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, b } i += sizeof(int16_t); + end_it = begin(record.tags) + i; break; } @@ -215,6 +219,7 @@ process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, b } i += sizeof(uint16_t); + end_it = begin(record.tags) + i; break; } @@ -234,6 +239,7 @@ process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, b } i += sizeof(int32_t); + end_it = begin(record.tags) + i; break; } @@ -253,21 +259,30 @@ process_tags(seqan::BamAlignmentRecord & record, seqan::CharString & new_tags, b } i += sizeof(uint32_t); + end_it = begin(record.tags) + i; break; } case 'f': { i += sizeof(float); + end_it = begin(record.tags) + i; break; } default: { - i = length(record.tags); // Unkown tag, stop + i = tags_length; // Unkown tag, stop break; } } + + if (is_as || is_xs) + { + unsigned const old_length = length(new_tags); + resize(new_tags, old_length + std::distance(begin_it, end_it), seqan::Exact()); + seqan::arrayCopyForward(begin_it, end_it, begin(new_tags) + old_length); + } } if (as != -1 && xs != -1 && (!seqan::hasFlagMultiple(record) || seqan::hasFlagNextUnmapped(record))) @@ -818,7 +833,9 @@ 
qualityFilterSlice2(Options const & opts, if (!removeNsAtEnds(rec, opts)) return; + // replace tags rec.tags = std::move(new_tags); + long const bin = (rec.beginPos - first_pos) / 50l; if (bin >= static_cast(bin_counts.size())) diff --git a/src/utilities/genotype.cpp b/src/utilities/genotype.cpp index fc8c0dc..834887c 100644 --- a/src/utilities/genotype.cpp +++ b/src/utilities/genotype.cpp @@ -28,6 +28,7 @@ #include #include +//#define GT_DEV 1 namespace { @@ -276,7 +277,9 @@ genotype_only_with_a_vcf(std::string const & ref_path, BOOST_LOG_TRIVIAL(info) << "Genotyping using an input VCF."; std::string const output_vcf = tmp + "/it1/final.vcf.gz"; std::string const out_dir = tmp + "/it1"; + gyper::Options const & copts = *(Options::const_instance()); mkdir(out_dir.c_str(), 0755); + bool const is_sv_graph{false}; bool const use_absolute_positions{true}; bool const check_index{true}; @@ -285,7 +288,7 @@ genotype_only_with_a_vcf(std::string const & ref_path, bool const is_writing_hap{false}; gyper::construct_graph(ref_path, - Options::const_instance()->vcf, + copts.vcf, padded_region.to_string(), is_sv_graph, use_absolute_positions, @@ -302,16 +305,18 @@ genotype_only_with_a_vcf(std::string const & ref_path, { PHIndex ph_index = index_graph(gyper::graph); + std::vector avg_cov_by_readlen(shrinked_sams.size(), -1.0); paths = gyper::call(shrinked_sams, - "", // graph_path + avg_cov_by_readlen, + "", // graph_path ph_index, out_dir, - "", // reference - ".", // region + "", // reference + ".", // region primers, - 5, //minimum_variant_support, - 0.25, //minimum_variant_support_ratio, + 5, // minimum_variant_support, + 0.25, // minimum_variant_support_ratio, is_writing_calls_vcf, is_discovery, is_writing_hap); @@ -323,9 +328,21 @@ genotype_only_with_a_vcf(std::string const & ref_path, // Append _calls.vcf.gz //for (auto & path : paths) // path += "_calls.vcf.gz"; + bool constexpr is_filter_zero_qual{false}; //> FILTER_ZERO_QUAL, force_no_variant_overlapping, force_no_break_down - vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), true, false, false); + vcf_merge_and_break(paths, tmp + "/graphtyper.vcf.gz", region.to_string(), is_filter_zero_qual, false, false); + + if (copts.normal_and_no_variant_overlapping) + { + //> FILTER_ZERO_QUAL, force_no_variant_overlapping + vcf_merge_and_break(paths, + tmp + "/graphtyper.no_variant_overlapping.vcf.gz", + region.to_string(), + is_filter_zero_qual, + true, + false); + } // free memory graph = Graph(); @@ -442,6 +459,7 @@ genotype(std::string ref_path, minimum_variant_support = copts.genotype_aln_min_support; minimum_variant_support_ratio = copts.genotype_aln_min_support_ratio; is_writing_calls_vcf = false; // Skip writing calls vcf in release mode in all iterations except the last one + std::vector avg_cov_by_readlen(shrinked_sams.size(), -1.0); // Iteration 1 { @@ -449,11 +467,88 @@ genotype(std::string ref_path, std::string const output_vcf = tmp + "/it1/final.vcf.gz"; std::string const out_dir = tmp + "/it1"; mkdir(out_dir.c_str(), 0755); + +#ifdef GT_DEV gyper::construct_graph(ref_path, "", padded_region.to_string(), false, true, false); + + + Vcf indel_sites; + gyper::streamlined_discovery(shrinked_sams, + ref_path, + padded_region.to_string(), + tmp, + indel_sites); +#endif // GT_DEV + +#ifndef NDEBUG + /* + BOOST_LOG_TRIVIAL(info) << __HERE__ << " WIP Third pass starting."; + + // THIRD PASS + { + bool const is_sv_graph{false}; + bool const use_absolute_positions{true}; + bool const check_index{false}; + bool const 
is_writing_calls_vcf{true}; + bool const is_discovery{false}; + bool const is_writing_hap{false}; + + gyper::construct_graph(ref_path, + tmp + "/indel_sites.vcf.gz", + padded_region.to_string(), + is_sv_graph, + use_absolute_positions, + check_index); + + absolute_pos.calculate_offsets(gyper::graph.contigs); + +//#ifndef NDEBUG + // Save graph in debug mode + save_graph(tmp + "/graph_indels"); +//#endif // NDEBUG + + std::vector paths; + + { + PHIndex ph_index = index_graph(gyper::graph); + std::vector avg_cov_by_readlen(shrinked_sams.size(), -1.0); + + paths = gyper::call(shrinked_sams, + avg_cov_by_readlen, + "", // graph_path + ph_index, + tmp, + "", // reference + ".", // region + primers.get(), + 5, // minimum_variant_support, + 0.25, // minimum_variant_support_ratio, + is_writing_calls_vcf, + is_discovery, + is_writing_hap); + } + + vcf_merge_and_break(paths, + tmp + "/graphtyper.test.vcf.gz", + region.to_string(), + true, + false, + false); + + graph = Graph(); // free memory reserved for graph + Options::instance()->stats.clear(); + } + //*/ + ///// WIP ends +#endif // NDEBUG + + gyper::construct_graph(ref_path, "", padded_region.to_string(), false, true, false); + #ifndef NDEBUG // Save graph in debug mode save_graph(out_dir + "/graph"); -#endif // NDEBUG +#endif + absolute_pos.calculate_offsets(gyper::graph.contigs); auto output_paths = gyper::discover_directly_from_bam("", shrinked_sams, @@ -467,6 +562,11 @@ genotype(std::string ref_path, Vcf final_vcf; varmap.get_vcf(final_vcf, output_vcf); +#ifdef GT_DEV + // Add indel sites + std::move(indel_sites.variants.begin(), indel_sites.variants.end(), std::back_inserter(final_vcf.variants)); +#endif // GT_DEV + if (copts.prior_vcf.size() > 0) { BOOST_LOG_TRIVIAL(info) << "Inserting prior variant sites."; @@ -495,8 +595,11 @@ genotype(std::string ref_path, } #endif // NDEBUG - long FIRST_CALLONLY_ITERATION = 3; - long LAST_ITERATION = 4; + long FIRST_CALLONLY_ITERATION{3}; + long LAST_ITERATION{4}; + + if (copts.is_extra_call_only_iteration) + ++LAST_ITERATION; // Iteration 2 if (copts.is_only_cigar_discovery) @@ -528,6 +631,7 @@ genotype(std::string ref_path, minimum_variant_support_ratio = copts.genotype_dis_min_support_ratio; paths = gyper::call(shrinked_sams, + avg_cov_by_readlen, "", // graph_path ph_index, out_dir, @@ -608,6 +712,7 @@ genotype(std::string ref_path, PHIndex ph_index = index_graph(gyper::graph); paths = gyper::call(shrinked_sams, + avg_cov_by_readlen, "", // graph_path ph_index, out_dir, @@ -672,7 +777,7 @@ genotype(std::string ref_path, // Copy sites to system { std::ostringstream ss_cmd; - ss_cmd << "cp -p " << tmp << "/it" << (LAST_ITERATION - 1) << "/final.vcf.gz" << " " // Change to (LAST_ITERATION - 1) + ss_cmd << "cp -p " << tmp << "/it" << (LAST_ITERATION - 1) << "/final.vcf.gz" << " " << output_path << "/input_sites/" << region.chr << "/" << std::setw(9) << std::setfill('0') << (region.begin + 1) << '-' @@ -689,6 +794,8 @@ genotype(std::string ref_path, } } + std::string const index_ext = copts.is_csi ? 
".vcf.gz.csi" : ".vcf.gz.tbi"; + // Copy final VCFs auto copy_to_results = [&](std::string const & basename_no_ext, std::string const & extension, std::string const & id) @@ -704,7 +811,7 @@ genotype(std::string ref_path, int ret = system(ss_cmd.str().c_str()); - if (extension != ".vcf.gz.tbi" && ret != 0) + if (extension != index_ext && ret != 0) { BOOST_LOG_TRIVIAL(error) << __HERE__ << " This command failed '" << ss_cmd.str() << "'"; std::exit(ret); @@ -716,15 +823,46 @@ genotype(std::string ref_path, } }; - copy_to_results("graphtyper", ".vcf.gz", ""); // Copy final VCF - copy_to_results("graphtyper", ".vcf.gz.tbi", ""); // Copy tabix index for final VCF + std::string basename_no_ext{"graphtyper"}; + + // Check if tabix file exists + if (!is_file(tmp + "/graphtyper.vcf.gz.tbi") && !is_file(tmp + "/graphtyper.vcf.gz.csi")) + { + BOOST_LOG_TRIVIAL(warning) << "Tabix creation appears to have failed, " + << "I will retry sorting the VCF by reading it in whole."; + + bool const no_sort{false}; + bool const sites_only{false}; + bool const write_tbi{true}; + + std::string region{"."}; // already extracted region + + vcf_concatenate({tmp + "/graphtyper.vcf.gz"}, + tmp + "/graphtyper.sorted.vcf.gz", + no_sort, + sites_only, + write_tbi, + region); + + basename_no_ext = "graphtyper.sorted"; + } + + copy_to_results(basename_no_ext, ".vcf.gz", ""); // Copy final VCF + copy_to_results(basename_no_ext, index_ext, ""); // Copy tabix index for final VCF if (copts.normal_and_no_variant_overlapping) { copy_to_results("graphtyper.no_variant_overlapping", ".vcf.gz", ".no_variant_overlapping"); - copy_to_results("graphtyper.no_variant_overlapping", ".vcf.gz.tbi", ".no_variant_overlapping"); + copy_to_results("graphtyper.no_variant_overlapping", index_ext, ".no_variant_overlapping"); } +#ifndef NDEBUG + /*copy_to_results("indel_sites", ".vcf.gz", ".indel_sites");*/ + //copy_to_results("indel_sites", index_ext, ".indel_sites"); + //copy_to_results("graphtyper.test", ".vcf.gz", ".test"); + //copy_to_results("graphtyper.test", index_ext, ".test"); +#endif // NDEBUG + if (!copts.no_cleanup) { BOOST_LOG_TRIVIAL(info) << "Cleaning up temporary files."; diff --git a/src/utilities/genotype_camou.cpp b/src/utilities/genotype_camou.cpp index 289eac3..dedd79c 100644 --- a/src/utilities/genotype_camou.cpp +++ b/src/utilities/genotype_camou.cpp @@ -129,18 +129,12 @@ genotype_camou(std::string const & interval_fn, run_samtools_merge(shrinked_sams, tmp_bams); } - bool constexpr is_writing_hap { - false - }; + bool constexpr is_writing_hap{false}; for (auto const & interval : intervals) { - bool is_writing_calls_vcf { - false - }; - bool is_discovery { - true - }; + bool is_writing_calls_vcf{false}; + bool is_discovery{true}; BOOST_LOG_TRIVIAL(info) << "Genotyping interval " << interval; GenomicRegion genomic_region(interval); @@ -180,6 +174,7 @@ genotype_camou(std::string const & interval_fn, double minimum_variant_support_ratio = 0.35 / static_cast(num_intervals); paths = gyper::call(shrinked_sams, + avg_cov_by_readlen, "", // graph_path ph_index, out_dir, @@ -239,6 +234,7 @@ genotype_camou(std::string const & interval_fn, PHIndex ph_index = index_graph(gyper::graph); paths = gyper::call(shrinked_sams, + avg_cov_by_readlen, "", // graph_path ph_index, out_dir, diff --git a/src/utilities/genotype_sv.cpp b/src/utilities/genotype_sv.cpp index 78d8b57..e1c0b1c 100644 --- a/src/utilities/genotype_sv.cpp +++ b/src/utilities/genotype_sv.cpp @@ -28,27 +28,23 @@ void genotype_sv(std::string ref_path, std::string const & sv_vcf, 
std::vector const & sams, + std::vector const & avg_cov_by_readlen, GenomicRegion const & genomic_region, std::string const & output_path, bool const is_copy_reference) { // TODO: If the reference is only Ns then output an empty vcf with the sample names // TODO: Extract the reference sequence and use that to discover directly from BAM - bool constexpr is_writing_calls_vcf { - true - }; - bool constexpr is_writing_hap { - false - }; - bool constexpr is_discovery { - false - }; + gyper::Options const & copts = *(Options::const_instance()); + bool constexpr is_writing_calls_vcf{true}; + bool constexpr is_writing_hap{false}; + bool constexpr is_discovery{false}; long const NUM_SAMPLES = sams.size(); BOOST_LOG_TRIVIAL(info) << "SV genotyping region " << genomic_region.to_string(); BOOST_LOG_TRIVIAL(info) << "Path to genome is '" << ref_path << "'"; - BOOST_LOG_TRIVIAL(info) << "Running with up to " << Options::const_instance()->threads << " threads."; + BOOST_LOG_TRIVIAL(info) << "Running with up to " << copts.threads << " threads."; BOOST_LOG_TRIVIAL(info) << "Copying data from " << NUM_SAMPLES << " input SAM/BAM/CRAMs to local disk."; std::string tmp = create_temp_dir(genomic_region); @@ -132,13 +128,18 @@ genotype_sv(std::string ref_path, #endif // NDEBUG PHIndex ph_index = index_graph(gyper::graph); + std::string reference_fn{}; // empty by default + + if (Options::const_instance()->force_use_input_ref_for_cram_reading) + reference_fn = ref_path; std::vector paths = gyper::call(sams, + avg_cov_by_readlen, "", // graph_path ph_index, out_dir, - "", // reference + reference_fn, // reference unpadded_region.to_string(), // region nullptr, 5,//minimum_variant_support, @@ -190,10 +191,12 @@ genotype_sv(std::string ref_path, } }; + std::string const index_ext = copts.is_csi ? ".csi" : ".tbi"; + copy_vcf_to_system(""); // Copy final VCF - copy_vcf_to_system(".tbi"); // Copy tabix index for final VCF + copy_vcf_to_system(index_ext); // Copy tabix index for final VCF - if (!Options::instance()->no_cleanup) + if (!copts.no_cleanup) { BOOST_LOG_TRIVIAL(info) << "Cleaning up temporary files."; remove_file_tree(tmp.c_str()); @@ -224,13 +227,14 @@ void genotype_sv_regions(std::string ref_path, std::string const & sv_vcf, std::vector const & sams, + std::vector const & avg_cov_by_readlen, std::vector const & regions, std::string const & output_path, bool const is_copy_reference) { for (auto const & region : regions) { - genotype_sv(ref_path, sv_vcf, sams, region, output_path, is_copy_reference); + genotype_sv(ref_path, sv_vcf, sams, avg_cov_by_readlen, region, output_path, is_copy_reference); } } diff --git a/src/utilities/hts_parallel_reader.cpp b/src/utilities/hts_parallel_reader.cpp index e45115e..c584c42 100644 --- a/src/utilities/hts_parallel_reader.cpp +++ b/src/utilities/hts_parallel_reader.cpp @@ -42,13 +42,13 @@ check_if_maps_are_empty(std::vector const & maps) if (map.size() > 0) { - BOOST_LOG_TRIVIAL(warning) + BOOST_LOG_TRIVIAL(info) << __HERE__ << " Found a non-empty read map for with size " << map.size() << "! 
This likely means these reads have the BAM_FPAIRED flag set but have no mate read."; - for (auto it = map.begin(); it != map.end(); ++it) - BOOST_LOG_TRIVIAL(debug) << "Leftover read name: " << it->first; + //for (auto it = map.begin(); it != map.end(); ++it) + // BOOST_LOG_TRIVIAL(debug) << "Leftover read name: " << it->first; } } } @@ -76,11 +76,7 @@ HtsParallelReader::open(std::vector const & hts_file_paths, for (auto const & bam : hts_file_paths) { HtsReader f(store); - f.open(bam, region); - - if (!reference.empty()) - f.set_reference(reference); - + f.open(bam, region, reference); f.set_sample_index_offset(samples.size()); std::copy(f.samples.begin(), f.samples.end(), std::back_inserter(samples)); f.set_rg_index_offset(num_rg); @@ -94,7 +90,7 @@ HtsParallelReader::open(std::vector const & hts_file_paths, // Read the first record of each hts file for (long i = 0; i < n; ++i) { - bam1_t * hts_rec = hts_files[i].get_next_read(); + bam1_t * hts_rec = hts_files[i].get_next_read_in_order(); if (hts_rec) { @@ -127,9 +123,9 @@ HtsParallelReader::read_record(HtsRecord & hts_record) // reuse an old bam1_t record if it is available if (old_record) - hts_rec = hts_files[i].get_next_read(old_record); + hts_rec = hts_files[i].get_next_read_in_order(old_record); else - hts_rec = hts_files[i].get_next_read(); + hts_rec = hts_files[i].get_next_read_in_order(); std::pop_heap(heap.begin(), heap.end(), Cmp_gt_pair_bam1_t_fun); assert(hts_record.record == heap.back().record); @@ -283,8 +279,8 @@ genotype_only(HtsParallelReader const & hts_preader, assert(hts_rec.file_index >= 0); assert(hts_rec.file_index < static_cast(maps.size())); - long rg_i = 0; - long sample_i = 0; + long rg_i{0}; + long sample_i{0}; hts_preader.get_sample_and_rg_index(sample_i, rg_i, hts_rec); assert(maps.size() > 0); @@ -301,7 +297,6 @@ genotype_only(HtsParallelReader const & hts_preader, } std::pair geno_paths(prev_paths); - auto find_it = map_gpaths.find(read_name); if (find_it == map_gpaths.end()) @@ -309,17 +304,21 @@ genotype_only(HtsParallelReader const & hts_preader, if (hts_rec.record->core.flag & IS_PAIRED) { // Read is in a pair and we need to wait for its mate -#ifndef NDEBUG - update_paths(geno_paths, seq, rseq, hts_rec.record); -#else +//#ifndef NDEBUG +// update_paths(geno_paths, seq, rseq, hts_rec.record); +//#else update_paths(geno_paths, hts_rec.record); -#endif +//#endif map_gpaths[read_name] = std::move(geno_paths); // Did not find the read name } else { +//#ifndef NDEBUG +// GenotypePaths * selected = update_unpaired_read_paths(geno_paths, seq, rseq, hts_rec.record); +//#else GenotypePaths * selected = update_unpaired_read_paths(geno_paths, hts_rec.record); +//#endif if (selected) { @@ -336,11 +335,11 @@ genotype_only(HtsParallelReader const & hts_preader, else { // Do stuff with paired reads.... 
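// [editor's sketch] genotype_only() parks the first mate of a read pair in map_gpaths,
// keyed on the read name, and only processes the pair once the second mate arrives, as in
// the branches around this point. The helper below is a self-contained illustration of that
// buffering pattern with hypothetical names; it assumes <string>, <unordered_map> and
// <utility> are included and is not part of graphtyper's API.
template <typename TPaths, typename OnPair>
void process_mate_sketch(std::unordered_map<std::string, TPaths> & pending,
                         std::string const & read_name,
                         TPaths paths,
                         OnPair on_pair)
{
  auto find_it = pending.find(read_name);

  if (find_it == pending.end())
  {
    pending.emplace(read_name, std::move(paths)); // first mate seen: park it and wait
  }
  else
  {
    on_pair(find_it->second, paths); // both mates available: handle the completed pair
    pending.erase(find_it);          // only reads still missing a mate stay buffered
  }
}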
-#ifndef NDEBUG - update_paths(geno_paths, seq, rseq, hts_rec.record); -#else +//#ifndef NDEBUG +// update_paths(geno_paths, seq, rseq, hts_rec.record); +//#else update_paths(geno_paths, hts_rec.record); -#endif +//#endif if ((geno_paths.first.flags & IS_FIRST_IN_PAIR) == (find_it->second.first.flags & IS_FIRST_IN_PAIR)) { @@ -355,6 +354,8 @@ genotype_only(HtsParallelReader const & hts_preader, if (better_paths.first) { + assert(better_paths.second); + if (graph.is_sv_graph) { // Add reference depth @@ -413,13 +414,9 @@ genotype_and_discover(HtsParallelReader const & hts_preader, if (hts_rec.record->core.flag & IS_PAIRED) { // Read is in a pair and we need to wait for its mate -#ifndef NDEBUG - update_paths(geno_paths, seq, rseq, hts_rec.record); -#else update_paths(geno_paths, hts_rec.record); -#endif - further_update_paths_for_discovery(geno_paths, seq, rseq, hts_rec.record); + further_update_paths_for_discovery(geno_paths, hts_rec.record); map_gpaths[read_name] = std::move(geno_paths); // Did not find the read name } else @@ -428,7 +425,7 @@ genotype_and_discover(HtsParallelReader const & hts_preader, if (selected) { - further_update_unpaired_read_paths_for_discovery(*selected, seq, rseq, hts_rec.record); + further_update_unpaired_read_paths_for_discovery(*selected, hts_rec.record); // Discover new variants { @@ -449,11 +446,7 @@ genotype_and_discover(HtsParallelReader const & hts_preader, else { // Do stuff with paired reads.... -#ifndef NDEBUG - update_paths(geno_paths, seq, rseq, hts_rec.record); -#else update_paths(geno_paths, hts_rec.record); -#endif if ((geno_paths.first.flags & IS_FIRST_IN_PAIR) == (find_it->second.first.flags & IS_FIRST_IN_PAIR)) { @@ -462,7 +455,7 @@ genotype_and_discover(HtsParallelReader const & hts_preader, std::exit(1); } - further_update_paths_for_discovery(geno_paths, seq, rseq, hts_rec.record); + further_update_paths_for_discovery(geno_paths, hts_rec.record); // Find the better pair of geno paths std::pair better_paths = @@ -499,6 +492,7 @@ genotype_and_discover(HtsParallelReader const & hts_preader, void parallel_reader_genotype_only(std::string * out_path, std::vector const * hts_paths_ptr, + std::vector const * avg_cov_ptr, std::string const * output_dir_ptr, std::string const * reference_fn_ptr, std::string const * region_ptr, @@ -508,12 +502,14 @@ parallel_reader_genotype_only(std::string * out_path, bool const is_writing_hap) { assert(hts_paths_ptr); + assert(avg_cov_ptr); assert(ph_index_ptr); assert(output_dir_ptr); assert(reference_fn_ptr); assert(region_ptr); auto const & hts_paths = *hts_paths_ptr; + auto const & avg_cov_by_readlen = *avg_cov_ptr; auto const & ph_index = *ph_index_ptr; auto const & output_dir = *output_dir_ptr; auto const & reference = *reference_fn_ptr; @@ -537,13 +533,9 @@ parallel_reader_genotype_only(std::string * out_path, // Set up reference depth tracks and bin counts if we are SV calling ReferenceDepth reference_depth; - //std::vector > bin_counts; if (graph.is_sv_graph) - { reference_depth.set_depth_sizes(writer.pns.size()); - // bin_counts.resize(writer.pns.size()); - } std::vector maps; // One map for each file. 
Each map relates read names to their graph alignments maps.resize(hts_preader.get_num_rg()); @@ -562,11 +554,58 @@ parallel_reader_genotype_only(std::string * out_path, std::pair prev_paths; HtsRecord prev; + auto is_good_read = + [](bam1_t * record) -> bool + { + auto const & core = record->core; + + if ((core.flag & IS_UNMAPPED) != 0u) + return false; + + std::array static constexpr CIGAR_MAP = {{ + 'M', 'I', 'D', 'N', 'S', 'H', 'P', '=', 'X', 'B', '*', '*', '*', '*', '*', '*' + }}; + + auto cigar_it = record->data + core.l_qname; + long const N_CIGAR = core.n_cigar; + + // Check if both ends are clipped + if (N_CIGAR >= 2) + { + uint32_t opAndCnt; + memcpy(&opAndCnt, cigar_it, sizeof(uint32_t)); + char cigar_operation_front = CIGAR_MAP[opAndCnt & 15]; + uint32_t count_front = opAndCnt >> 4; + + cigar_it += sizeof(uint32_t) * (N_CIGAR - 1); + memcpy(&opAndCnt, cigar_it, sizeof(uint32_t)); + char cigar_operation_back = CIGAR_MAP[opAndCnt & 15]; + uint32_t count_back = opAndCnt >> 4; + + bool const is_one_clipped = (cigar_operation_front == 'S' && count_front >= 12) || + (cigar_operation_back == 'S' && count_back >= 12); + + bool const is_mate_far_away = core.tid != core.mtid || std::abs(core.pos - core.mpos) > 200000; + + if ((cigar_operation_front == 'S' && cigar_operation_back == 'S') || + (is_mate_far_away && is_one_clipped) || + (core.qual <= 15 && is_one_clipped) || + (core.qual <= 15 && is_mate_far_away)) + { + return false; + } + } + + return true; + }; + // Read the first record bool is_done = !hts_preader.read_record(prev); assert(is_done || prev.record); + bool const IS_SV_CALLING{graph.is_sv_graph}; - while (!is_done && (prev.record->core.flag & Options::const_instance()->sam_flag_filter) != 0u) + while (!is_done && ((prev.record->core.flag & Options::const_instance()->sam_flag_filter) != 0u || + (IS_SV_CALLING && !is_good_read(prev.record)))) { BOOST_LOG_TRIVIAL(debug) << __HERE__ << " Reread first record"; is_done = !hts_preader.read_record(prev); @@ -578,6 +617,54 @@ parallel_reader_genotype_only(std::string * out_path, } else { + // bin counts for the (extreme) coverage filter + long const first_pos{prev.record->core.pos}; + + std::vector > bin_counts; + bool const IS_COVERAGE_FILTER{IS_SV_CALLING && !Options::const_instance()->no_filter_on_coverage}; + + if (IS_COVERAGE_FILTER) + bin_counts.resize(hts_preader.get_samples().size()); + + // Lambda function to update bin counts for the first record. 
Returns false when there are too many reads in bin + auto update_bin_count = + [&](HtsRecord const & hts_rec) -> bool + { + if (!IS_COVERAGE_FILTER) + return true; + + long sample_i{0}; + long rg_i{0}; + hts_preader.get_sample_and_rg_index(sample_i, rg_i, hts_rec); + + if (sample_i >= static_cast(avg_cov_by_readlen.size()) || avg_cov_by_readlen[sample_i] <= 0.0) + return true; + + uint16_t max_bin_count = std::min(static_cast(std::numeric_limits::max()), + static_cast(avg_cov_by_readlen[sample_i] * 50.0 * 3.0 + 0.5)); + + assert(sample_i < static_cast(bin_counts.size())); + auto & sample_bin_counts = bin_counts[sample_i]; + long const bin = (hts_rec.record->core.pos - first_pos) / 50l; + + if (bin >= static_cast(sample_bin_counts.size())) + { + sample_bin_counts.resize(bin + 1, 0u); + ++sample_bin_counts[bin]; // increment read count in new bin + return true; + } + else if (sample_bin_counts[bin] > max_bin_count) + { + return false; + } + else + { + ++sample_bin_counts[bin]; // increment read count in bin + return true; + } + }; + + update_bin_count(prev); ++num_records; seqan::IupacString seq; seqan::IupacString rseq; @@ -589,20 +676,30 @@ parallel_reader_genotype_only(std::string * out_path, while (hts_preader.read_record(curr)) { // Ignore reads with the following bits set - if ((curr.record->core.flag & Options::const_instance()->sam_flag_filter) != 0u) + if ((curr.record->core.flag & Options::const_instance()->sam_flag_filter) != 0u || + (IS_SV_CALLING && !is_good_read(curr.record))) + { continue; + } ++num_records; if (equal_pos_seq(prev.record, curr.record)) { // The two records are equal + update_bin_count(curr); ++num_duplicated_records; genotype_only(hts_preader, writer, reference_depth, maps, prev_paths, ph_index, primers, curr, seq, rseq, false /*update prev_geno_paths*/); } else { + if (!update_bin_count(curr)) + { + --num_records; // Skip record + continue; + } + genotype_only(hts_preader, writer, reference_depth, maps, prev_paths, ph_index, primers, curr, seq, rseq, true /*update prev_geno_paths*/); hts_preader.move_record(prev, curr); // move curr to prev diff --git a/src/utilities/hts_reader.cpp b/src/utilities/hts_reader.cpp index 01f98be..9badcd3 100644 --- a/src/utilities/hts_reader.cpp +++ b/src/utilities/hts_reader.cpp @@ -19,20 +19,21 @@ HtsReader::HtsReader(HtsStore & _store) void -HtsReader::open(std::string const & path, std::string const & region) +HtsReader::open(std::string const & path, std::string const & region, std::string const & reference) { fp = hts_open(path.c_str(), "r"); if (!fp) { - std::cerr << "ERROR: Could not open BAM file " << path << std::endl; + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Could not open BAM/CRAM file " << path; std::exit(1); } + set_reference(reference); fp->bam_header = sam_hdr_read(fp); // Read sample from header - if (!Options::instance()->get_sample_names_from_filename) + if (!Options::const_instance()->get_sample_names_from_filename) { std::string const header_text(fp->bam_header->text, fp->bam_header->l_text); std::vector header_lines; @@ -49,8 +50,7 @@ HtsReader::open(std::string const & path, std::string const & region) if (pos_samp == std::string::npos || pos_id == std::string::npos) { - std::cerr << "[graphtyper::utilities::hts_reader] ERROR: Could not parse RG and sample from header line:" - << line_it << std::endl; + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Could not parse RG and sample from header line: " << line_it; std::exit(1); } @@ -68,10 +68,7 @@ HtsReader::open(std::string const & path, std::string const & 
region) std::string new_id = line_it.substr(pos_id + 4, pos_id_ends - pos_id - 4); std::string new_sample = line_it.substr(pos_samp + 4, pos_samp_ends - pos_samp - 4); - - BOOST_LOG_TRIVIAL(debug) << "[graphtyper::utilities::hts_reader] Added RG: '" - << new_id << "' => '" << new_sample << "'"; - + BOOST_LOG_TRIVIAL(debug) << __HERE__ << "Added RG: '" << new_id << "' => '" << new_sample << "'"; rg2index[new_id] = rg2sample_i.size(); auto find_it = std::find(samples.begin(), samples.end(), new_sample); @@ -98,15 +95,11 @@ HtsReader::open(std::string const & path, std::string const & region) sample = sample.substr(0, sample.find('.')); samples.push_back(std::move(sample)); - - // No read group - //rg2sample_i.push_back(0); - //rg2index[""] = 0; } rec = store.get(); - if (region == ".") + if (region.size() <= 1) { ret = sam_read1(fp, fp->bam_header, rec); } @@ -142,19 +135,19 @@ HtsReader::close() } -int +void HtsReader::set_reference(std::string const & reference_path) { + if ((reference_path.size() == 0) || (reference_path.size() == 1 && reference_path[0] == '.')) + return; + int ret2 = hts_set_fai_filename(fp, reference_path.c_str()); - if (ret2 < 0) + if (ret2 != 0) { - BOOST_LOG_TRIVIAL(error) << "[graphtyper::utilities::hts_reader] ERROR: " - << "Could not open reference FASTA file with filename " << reference_path; + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Could not open reference FASTA file with filename " << reference_path; std::exit(1); } - - return ret2; } @@ -173,7 +166,7 @@ HtsReader::set_rg_index_offset(int new_rg_index_offset) bam1_t * -HtsReader::get_next_read(bam1_t * old_record) +HtsReader::get_next_read_in_order(bam1_t * old_record) { assert(old_record); @@ -189,6 +182,12 @@ HtsReader::get_next_read(bam1_t * old_record) // We do not have any records ready and we have reached the end of the file, stop now if (ret < 0) { + if (ret < -1) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " htslib failed BAM/CRAM reading and returned " << ret; + std::exit(1); + } + // Deallocate memory when we reach the end of the file if (rec) store.push(rec); @@ -240,7 +239,7 @@ HtsReader::get_next_read(bam1_t * old_record) bam1_t * -HtsReader::get_next_read() +HtsReader::get_next_read_in_order() { // If we have some records ready in the vector, return those first if (records.size() > 0) @@ -253,6 +252,12 @@ HtsReader::get_next_read() // We do not have any records ready and we have reached the end of the file, stop now if (ret < 0) { + if (ret < -1) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " htslib failed BAM/CRAM reading and returned " << ret; + std::exit(1); + } + // Deallocate memory when we reach the end of the file if (rec) store.push(rec); @@ -304,12 +309,20 @@ HtsReader::get_next_read() bam1_t * -HtsReader::get_next_pos() +HtsReader::get_next_read() { if (ret < 0) { + if (ret < -1) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " htslib failed BAM/CRAM reading and returned " << ret; + std::exit(1); + } + // Deallocate memory when we reach the end of the file - bam_destroy1(rec); + if (rec) + store.push(rec); + return nullptr; } @@ -320,6 +333,34 @@ HtsReader::get_next_pos() } +bam1_t * +HtsReader::get_next_read(bam1_t * old_record) +{ + assert(old_record); + + if (ret < 0) + { + if (ret < -1) + { + BOOST_LOG_TRIVIAL(error) << __HERE__ << " htslib failed BAM/CRAM reading and returned " << ret; + std::exit(1); + } + + // Deallocate memory when we reach the end of the file + if (rec) + store.push(rec); + + store.push(old_record); + return nullptr; + } + + bam1_t * record = rec; + rec = 
old_record; + ret = hts_iter ? sam_itr_next(fp, hts_iter, rec) : sam_read1(fp, fp->bam_header, rec); + return record; +} + + void HtsReader::get_sample_and_rg_index(long & sample_i, long & rg_i, bam1_t * rec) const { @@ -337,7 +378,7 @@ HtsReader::get_sample_and_rg_index(long & sample_i, long & rg_i, bam1_t * rec) c if (!rg_tag) { - std::cerr << "[graphtyper::utilities::hts_reader] ERROR: Unable to find RG tag in read." << std::endl; + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Unable to find RG tag in read."; std::exit(1); } @@ -346,7 +387,7 @@ HtsReader::get_sample_and_rg_index(long & sample_i, long & rg_i, bam1_t * rec) c if (find_rg_it == rg2index.end()) { - std::cerr << "[graphtyper::utilities::hts_reader] ERROR: Unable to find read group. " << read_group << std::endl; + BOOST_LOG_TRIVIAL(error) << __HERE__ << " Unable to find read group. " << read_group; std::exit(1); } diff --git a/src/utilities/type_conversions.cpp b/src/utilities/type_conversions.cpp index 81de8e3..69d4147 100644 --- a/src/utilities/type_conversions.cpp +++ b/src/utilities/type_conversions.cpp @@ -91,22 +91,31 @@ to_uint64(std::string const & s, int i) } +/** WIP uint16_t to_uint16(char const c) { switch (c) { + case 'a': case 'A': return 0u; + case 'c': case 'C': return 1u; + case 'g': case 'G': return 2u; + case 't': + case 'T': + return 3u; + default: - return 3u; // 'T' or anything else + BOOST_LOG_TRIVIAL(warning) << "[graphtyper::type_conversions] Invalid character " << c; + return 4ul; } } @@ -127,6 +136,83 @@ to_uint16(std::vector const & s, std::size_t i) } + +int32_t +to_uint16_with_check(std::vector const & s, long i) +{ + assert(static_cast(s.size()) >= 8 + i); + int32_t d{0}; + + for (long const j = i + 8; i < j; ++i) + { + d <<= 2; + uint16_t ret = to_uint16(s[i]); + + if (ret == 4) + return -1; + else + d += ret; + } + + return d; +} + + +std::vector +to_uint16_vec(std::string const & s) +{ + assert(s.size() >= 8); // Otherwise its impossible to read 8 bases from read! + uint16_t current{0u} + std::vector uints; + long const sequence_size = s.size(); + long const max_8mers{sequence_size - 7}; + uints.reserve(max_8mers); + int bp_since_N{0}; + + for (long i = 0; i < max_8mers; ++i) + { + char const c = s[i]; + ++bp_since_N; + + switch (c) + { + case 'a': + case 'A': + current *= 4; + //current += 0; + break; + + case 'c': + case 'C': + current *= 4; + current += 1; + + case 'g': + case 'G': + current *= 4; + current += 2; + break; + + case 't': + case 'T': + current *= 4; + current += 3; + break; + + default: + bp_since_N = 0; + break; + } + + if (bp_since_N >= 8) + uints.push_back(current); + } + + return uints; +} +*/ + + template std::vector to_uint64_vec(TSeq const & s, std::size_t i) @@ -190,60 +276,9 @@ to_uint64_vec(TSeq const & s, std::size_t i) } -template <> -std::vector -to_uint64_vec(std::string const & s, std::size_t i) -{ - assert(s.size() >= 32 + i); // Otherwise its impossible to read 32 bases from read! 
-  std::vector<uint64_t> uints(1, 0u);
-
-  for (unsigned const j = i + 32; i < j; ++i)
-  {
-    long const origin_size = uints.size();
-
-    if (origin_size > 97)
-      return std::vector<uint64_t>();
-
-    char const c = s[i];
-
-    for (long u = 0; u < origin_size; ++u)
-    {
-      switch (c)
-      {
-      case 'A':
-        uints[u] += 4 + 0;
-        break;
-
-      case 'C':
-        uints[u] += 4 + 1;
-        break;
-
-      case 'G':
-        uints[u] += 4 + 2;
-        break;
-
-      case 'T':
-        uints[u] += 4 + 3;
-        break;
-
-      default:
-        uints.push_back(uints[u] * 4 + 0); // A
-        uints.push_back(uints[u] * 4 + 1); // C
-        uints.push_back(uints[u] * 4 + 2); // G
-        uints[u] <<= 2; uints[u] += 3; // T
-        break;
-      }
-    }
-  }
-
-  return uints;
-}
-
-
 // Explicit instantiation
 template std::vector<uint64_t> to_uint64_vec(seqan::Dna5String const & s, std::size_t i);
 template std::vector<uint64_t> to_uint64_vec(seqan::IupacString const & s, std::size_t i);
-template std::vector<uint64_t> to_uint64_vec(std::string const & s, std::size_t i);
 
 std::array
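For reference, the commented-out to_uint16_vec() above is aiming at a rolling 2-bit 8-mer encoding of the read sequence, but it is left disabled in this patch (as committed it still has a missing semicolon after uint16_t current{0u} and a fall-through after the 'C' case). Below is a minimal, self-contained sketch of the same idea; the function name and interface are the editor's own and not part of graphtyper.

#include <cstdint>
#include <string>
#include <vector>

// Pack every window of 8 consecutive A/C/G/T bases into a 16-bit integer (2 bits per base);
// restart the window after any ambiguous base such as 'N'.
std::vector<uint16_t> encode_8mers(std::string const & s)
{
  std::vector<uint16_t> kmers;

  if (s.size() < 8)
    return kmers; // too short to contain a full 8-mer

  kmers.reserve(s.size() - 7);
  uint16_t current{0};
  int bases_since_reset{0};

  for (char const c : s)
  {
    int code;

    switch (c)
    {
    case 'a': case 'A': code = 0; break;
    case 'c': case 'C': code = 1; break;
    case 'g': case 'G': code = 2; break;
    case 't': case 'T': code = 3; break;
    default: code = -1; break; // ambiguous base
    }

    if (code < 0)
    {
      bases_since_reset = 0; // restart the rolling window after an ambiguous base
      continue;
    }

    // shift the window left by one base; the oldest base falls off the top two bits
    current = static_cast<uint16_t>((current << 2) | code);

    if (++bases_since_reset >= 8)
      kmers.push_back(current);
  }

  return kmers;
}

Packing 8 bases into 16 bits keeps any downstream lookup table small (65536 entries), and the reset counter guarantees that no emitted 8-mer spans an ambiguous base.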