Skip to content

Commit

Permalink
v2.6 (#6)
Browse files Browse the repository at this point in the history
* Added option for extra call_only iteration. Added check for duplicated sample names.
* Added coverage filter in the 'genotype_sv' subcommand. It is possible to turn the filter off with a new '--no_filter_on_coverage' command.
* Another fix for a very rare bug in graph construction. Only affects very large-scale genotyping runs. WIP streamlined discovery.
* Write CSI instead of TBI with the --csi option.
* Added several per alt QC metrics to VCF output when calling genotype
* optimization for vcf_merge
* Added no_variant_overlapping option in force calling mode
* updated how QD and QDalt are calculated
* AAScore added in SNP/indel genotyping
* bugfix: In very large genotyping runs some INFO fields can get larger than INT_MAX so I needed to change them to be strings instead, otherwise bcftools is not happy with me.
  • Loading branch information
hannespetur authored Oct 19, 2020
1 parent d8d0167 commit d68b484
Show file tree
Hide file tree
Showing 50 changed files with 4,686 additions and 1,110 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@ include(ExternalProject)

# The version number
set (graphtyper_VERSION_MAJOR 2)
set (graphtyper_VERSION_MINOR 5)
set (graphtyper_VERSION_PATCH 1)
set (graphtyper_VERSION_MINOR 6)
set (graphtyper_VERSION_PATCH 0)
set(STATIC_DIR "" CACHE STRING "If set, GraphTyper will be built as a static binary using libraries from the given STATIC_DIR.")

# Get the current working branch
Expand Down
7 changes: 7 additions & 0 deletions include/graphtyper/constants.hpp.in
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,13 @@ uint32_t constexpr MAX_NUM_LOCATIONS_PER_PATH = 256;
uint16_t constexpr EPSILON_0_EXPONENT = 12;
int32_t constexpr INSERT_SIZE_WHEN_NOT_PROPER_PAIR = 0x7FFFFFFFl;

// alignments scores and penalties
uint8_t constexpr SCORE_MATCH = 1;
uint8_t constexpr SCORE_MISMATCH = 4;
uint8_t constexpr SCORE_GAP_OPEN = 7;
uint8_t constexpr SCORE_GAP_EXTEND = 1;
uint8_t constexpr SCORE_CLIP = 5;


/**
* Flags
Expand Down
2 changes: 1 addition & 1 deletion include/graphtyper/graph/absolute_position.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
namespace gyper
{

class Contig;
struct Contig;

class AbsolutePosition
{
Expand Down
5 changes: 5 additions & 0 deletions include/graphtyper/graph/constructor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@ void construct_graph(std::string const & reference_filename,
bool use_absolute_positions = true,
bool check_index = true);

void
open_and_read_reference_genome(std::vector<char> & reference_sequence,
std::string const & reference_fn,
GenomicRegion const & genomic_region);

std::vector<Variant>
get_variants_using_tabix(std::string const & vcf, GenomicRegion const & genomic_region);

Expand Down
4 changes: 3 additions & 1 deletion include/graphtyper/graph/haplotype.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,13 @@ class Haplotype
bool is_low_qual,
std::size_t mismatches);

void clipped_reads_to_stats(bool fully_aligned);
void clipped_reads_to_stats(int clipped_bp, int read_length);
void mapq_to_stats(uint8_t mapq);
// void realignment_to_stats(uint32_t original_pos, uint32_t new_pos);

void strand_to_stats(uint16_t const flags);
void mismatches_to_stats(uint8_t mismatches, int read_length);
void score_diff_to_stats(uint8_t score_diff);
void coverage_to_gts(std::size_t pn_index, bool is_proper_pair);
std::bitset<MAX_NUMBER_OF_HAPLOTYPES> explain_to_path_explain();

Expand Down
39 changes: 20 additions & 19 deletions include/graphtyper/typer/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,36 +20,37 @@ align_read(bam1_t * rec,
seqan::IupacString const & rseq,
gyper::PHIndex const & ph_index);

//#ifndef NDEBUG
//GenotypePaths *
//update_unpaired_read_paths(std::pair<GenotypePaths, GenotypePaths> & geno_paths,
// seqan::IupacString const & seq,
// seqan::IupacString const & rseq,
// bam1_t * rec);
//#else
GenotypePaths *
update_unpaired_read_paths(std::pair<GenotypePaths, GenotypePaths> & geno_paths, bam1_t * rec);

void
further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno,
seqan::IupacString const & seq,
seqan::IupacString const & rseq,
bam1_t * rec
);
//#endif

#ifndef NDEBUG
void
update_paths(std::pair<GenotypePaths, GenotypePaths> & geno_paths,
seqan::IupacString const & seq,
seqan::IupacString const & rseq,
bam1_t * rec);
#endif // NDEBUG
further_update_unpaired_read_paths_for_discovery(GenotypePaths & geno, bam1_t * rec);

//#ifndef NDEBUG
//void
//update_paths(std::pair<GenotypePaths, GenotypePaths> & geno_paths,
// seqan::IupacString const & seq,
// seqan::IupacString const & rseq,
// bam1_t * rec);
//#endif // NDEBUG

#ifdef NDEBUG
//#ifdef NDEBUG
void
update_paths(std::pair<GenotypePaths, GenotypePaths> & geno_paths, bam1_t * rec);
#endif // DEBUG
//#endif // DEBUG


void
further_update_paths_for_discovery(std::pair<GenotypePaths, GenotypePaths> & geno_paths,
seqan::IupacString const & seq,
seqan::IupacString const & rseq,
bam1_t * rec
);
further_update_paths_for_discovery(std::pair<GenotypePaths, GenotypePaths> & geno_paths, bam1_t * rec);

std::pair<GenotypePaths *, GenotypePaths *>
get_better_paths(std::pair<GenotypePaths, GenotypePaths> & geno_paths1,
Expand Down
67 changes: 67 additions & 0 deletions include/graphtyper/typer/bucket.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
#pragma once

#include <parallel_hashmap/phmap_fwd_decl.h>

#include <graphtyper/typer/event.hpp>
#include <graphtyper/typer/read.hpp>


namespace gyper
{

class BucketFirstPass
{
public:
int32_t global_max_pos_end{-1};// Max pos end of alignments in this bucket and all previous buckets
int32_t max_pos_end{-1}; // Max pos end of alignments in this bucket

std::map<SnpEvent, SnpEventInfo> snps;
Tindel_events indel_events; // type is std::map<IndelEvent, EventInfo>
};


class Bucket
{
public:
int32_t global_max_pos_end{-1};// Max pos end of alignments in this bucket and all previous buckets
int32_t max_pos_end{-1}; // Max pos end of alignments in this bucket

Tindel_events indel_events; // type is std::map<IndelEvent, EventInfo>
std::vector<Read> reads;

std::string to_string() const;
};


bool
is_indel_in_bucket(std::vector<Bucket> const & buckets,
IndelEvent const & indel_event,
long const region_begin,
long const BUCKET_SIZE);


std::map<SnpEvent, SnpEventInfo>::iterator
add_snp_event_to_bucket(std::vector<BucketFirstPass> & buckets,
SnpEvent && event,
long const region_begin,
long const BUCKET_SIZE);


template <typename TBucket>
Tindel_events::iterator
add_indel_event_to_bucket(std::vector<TBucket> & buckets,
IndelEvent && event,
long const region_begin,
long const BUCKET_SIZE,
std::vector<char> const & reference_sequence,
long ref_offset);

//void
//add_base_to_bucket(std::vector<Bucket> & buckets,
// int32_t pos,
// char seq,
// char qual,
// long const region_begin,
// long const BUCKET_SIZE);

} // namespace gyper
9 changes: 9 additions & 0 deletions include/graphtyper/typer/caller.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
namespace gyper
{

class Vcf;
class Primers;

// returns the prefix to the output files
std::vector<std::string>
call(std::vector<std::string> const & hts_path,
std::vector<double> const & avg_cov_by_readlen,
std::string const & graph_path,
PHIndex const & ph_index,
std::string const & output_dir,
Expand All @@ -34,4 +36,11 @@ discover_directly_from_bam(std::string const & graph_path,
long minimum_variant_support,
double minimum_variant_support_ratio);

void
streamlined_discovery(std::vector<std::string> const & hts_paths,
std::string const & reference_fn,
std::string const & region_str,
std::string const & output_dir,
gyper::Vcf & vcf);

} // namespace gyper
Loading

0 comments on commit d68b484

Please sign in to comment.