Skip to content

Commit

Permalink
Merge pull request #4451 from vgteam/fastq-comments
Browse files Browse the repository at this point in the history
Allow SAM-style flags to be passed through to output from FASTQ comments
  • Loading branch information
jeizenga authored Nov 25, 2024
2 parents c9e7e2b + 1d74ef4 commit 0b43653
Show file tree
Hide file tree
Showing 17 changed files with 516 additions and 115 deletions.
2 changes: 1 addition & 1 deletion deps/libvgio
Submodule libvgio updated 1 files
+32 −0 src/alignment_io.cpp
403 changes: 356 additions & 47 deletions src/alignment.cpp

Large diffs are not rendered by default.

21 changes: 13 additions & 8 deletions src/alignment.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,37 +27,42 @@ int hts_for_each(string& filename, function<void(Alignment&)> lambda,
const PathPositionHandleGraph* graph);
int hts_for_each_parallel(string& filename, function<void(Alignment&)> lambda,
const PathPositionHandleGraph* graph);
int fastq_for_each(string& filename, function<void(Alignment&)> lambda);

// fastq
bool get_next_alignment_from_fastq(gzFile fp, char* buffer, size_t len, Alignment& alignment);
bool get_next_interleaved_alignment_pair_from_fastq(gzFile fp, char* buffer, size_t len, Alignment& mate1, Alignment& mate2);
bool get_next_alignment_pair_from_fastqs(gzFile fp1, gzFile fp2, char* buffer, size_t len, Alignment& mate1, Alignment& mate2);
// parsing a FASTQ record, optionally intepreting the comment as SAM-style tags
bool get_next_alignment_from_fastq(gzFile fp, char* buffer, size_t len, Alignment& alignment, bool comment_as_tags);
bool get_next_interleaved_alignment_pair_from_fastq(gzFile fp, char* buffer, size_t len, Alignment& mate1, Alignment& mate2, bool comment_as_tags);
bool get_next_alignment_pair_from_fastqs(gzFile fp1, gzFile fp2, char* buffer, size_t len, Alignment& mate1, Alignment& mate2, bool comment_as_tags);

size_t fastq_unpaired_for_each(const string& filename, function<void(Alignment&)> lambda);
size_t fastq_paired_interleaved_for_each(const string& filename, function<void(Alignment&, Alignment&)> lambda);
size_t fastq_paired_two_files_for_each(const string& file1, const string& file2, function<void(Alignment&, Alignment&)> lambda);
// parsing a FASTQ or FASTA file, optionally interpreting comments as SAM-style tags
size_t fastq_unpaired_for_each(const string& filename, function<void(Alignment&)> lambda, bool comment_as_tags = false);
size_t fastq_paired_interleaved_for_each(const string& filename, function<void(Alignment&, Alignment&)> lambda, bool comment_as_tags = false);
size_t fastq_paired_two_files_for_each(const string& file1, const string& file2, function<void(Alignment&, Alignment&)> lambda, bool comment_as_tags = false);
// parallel versions of above
size_t fastq_unpaired_for_each_parallel(const string& filename,
function<void(Alignment&)> lambda,
bool comment_as_tags = false,
uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE);

size_t fastq_paired_interleaved_for_each_parallel(const string& filename,
function<void(Alignment&, Alignment&)> lambda,
bool comment_as_tags = false,
uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE);

size_t fastq_paired_interleaved_for_each_parallel_after_wait(const string& filename,
function<void(Alignment&, Alignment&)> lambda,
function<bool(void)> single_threaded_until_true,
bool comment_as_tags = false,
uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE);

size_t fastq_paired_two_files_for_each_parallel(const string& file1, const string& file2,
function<void(Alignment&, Alignment&)> lambda,
bool comment_as_tags = false,
uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE);

size_t fastq_paired_two_files_for_each_parallel_after_wait(const string& file1, const string& file2,
function<void(Alignment&, Alignment&)> lambda,
function<bool(void)> single_threaded_until_true,
bool comment_as_tags = false,
uint64_t batch_size = vg::io::DEFAULT_PARALLEL_BATCHSIZE);

bam_hdr_t* hts_file_header(string& filename, string& header);
Expand Down
10 changes: 9 additions & 1 deletion src/mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3407,6 +3407,9 @@ pair<vector<Alignment>, vector<Alignment>> Mapper::align_paired_multi(
read2_max_score = max(p->second.score(), read2_max_score);
results.first.push_back(p->first);
results.second.push_back(p->second);
// FIXME: this can clobber the annotations about the haplotype scoring, but I think we've abandoned that
*results.first.back().mutable_annotation() = first_mate.annotation();
*results.second.back().mutable_annotation() = second_mate.annotation();
possible_pairs += p->first.score() > 0 && p->second.score() > 0;
}
bool max_first = results.first.size() && (read1_max_score == results.first.front().score() && read2_max_score == results.second.front().score());
Expand Down Expand Up @@ -4486,7 +4489,12 @@ vector<Alignment> Mapper::align_multi(const Alignment& aln, int kmer_size, int s
clean_aln.set_sequence(aln.sequence());
clean_aln.set_quality(aln.quality());
clean_aln.clear_refpos();
return align_multi_internal(true, clean_aln, kmer_size, stride, max_mem_length, band_width, band_overlap, cluster_mq, max_multimaps, extra_multimaps, nullptr, xdrop_alignment);
auto alns = align_multi_internal(true, clean_aln, kmer_size, stride, max_mem_length, band_width, band_overlap, cluster_mq, max_multimaps, extra_multimaps, nullptr, xdrop_alignment);
for (auto& result : alns) {
// FIXME: this can clobber the haplotype score annotations, but i think we've abandoned them
*result.mutable_annotation() = aln.annotation();
}
return alns;
}

vector<Alignment> Mapper::align_multi_internal(bool compute_unpaired_quality,
Expand Down
12 changes: 5 additions & 7 deletions src/minimizer_mapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -781,13 +781,11 @@ vector<Alignment> MinimizerMapper::map_from_extensions(Alignment& aln) {
alignments_to_source.reserve(cluster_extensions.size());

// Create a new alignment object to get rid of old annotations.
{
Alignment temp;
temp.set_sequence(aln.sequence());
temp.set_name(aln.name());
temp.set_quality(aln.quality());
aln = std::move(temp);
}
aln.clear_refpos();
aln.clear_path();
aln.set_score(0);
aln.set_identity(0);
aln.set_mapping_quality(0);

// Annotate the read with metadata
if (!sample_name.empty()) {
Expand Down
2 changes: 1 addition & 1 deletion src/minimizer_mapper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ class MinimizerMapper : public AlignerClient {
size_t min_lookback_items = default_min_lookback_items;
/// How many chaining sources should we allow ourselves to consider ever?
static constexpr size_t default_lookback_item_hard_cap = 15;
size_t lookback_item_hard_cap = lookback_item_hard_cap;
size_t lookback_item_hard_cap = default_lookback_item_hard_cap;
/// How many bases should we try to look back initially when chaining?
static constexpr size_t default_initial_lookback_threshold = 10;
size_t initial_lookback_threshold = default_initial_lookback_threshold;
Expand Down
1 change: 1 addition & 0 deletions src/multipath_alignment.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2320,6 +2320,7 @@ namespace vg {
auto annotation = from.get_annotation("secondary");
assert(annotation.first == multipath_alignment_t::Bool);
to.set_is_secondary(*((bool*) annotation.second));
clear_annotation(to, "secondary");
}
}

Expand Down
9 changes: 1 addition & 8 deletions src/multipath_alignment_emitter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -313,8 +313,7 @@ void MultipathAlignmentEmitter::convert_to_alignment(const multipath_alignment_t
void MultipathAlignmentEmitter::create_alignment_shim(const string& name, const multipath_alignment_t& mp_aln,
Alignment& shim, const string* prev_name, const string* next_name) const {

shim.set_sequence(mp_aln.sequence());
shim.set_quality(mp_aln.quality());
transfer_read_metadata(mp_aln, shim);
shim.set_name(name);
if (prev_name) {
shim.mutable_fragment_prev()->set_name(*prev_name);
Expand Down Expand Up @@ -344,12 +343,6 @@ void MultipathAlignmentEmitter::create_alignment_shim(const string& name, const
shim.set_score(optimal_alignment_score(mp_aln, true));
}

// this tag comes from surject and is used in both
if (mp_aln.has_annotation("all_scores")) {
auto anno = mp_aln.get_annotation("all_scores");
assert(anno.first == multipath_alignment_t::String);
set_annotation(&shim, "all_scores", *((const string*) anno.second));
}
}

void MultipathAlignmentEmitter::convert_to_hts_unpaired(const string& name, const multipath_alignment_t& mp_aln,
Expand Down
19 changes: 14 additions & 5 deletions src/subcommand/giraffe_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,7 +390,8 @@ void help_giraffe(char** argv, const BaseOptionGroup& parser, bool full_help) {
<< "input options:" << endl
<< " -G, --gam-in FILE read and realign GAM-format reads from FILE" << endl
<< " -f, --fastq-in FILE read and align FASTQ-format reads from FILE (two are allowed, one for each mate)" << endl
<< " -i, --interleaved GAM/FASTQ input is interleaved pairs, for paired-end alignment" << endl;
<< " -i, --interleaved GAM/FASTQ input is interleaved pairs, for paired-end alignment" << endl
<< " --comments-as-tags intepret comments in name lines as SAM-style tags and annotate alignments with them" << endl;

cerr
<< "haplotype sampling:" << endl
Expand Down Expand Up @@ -464,6 +465,7 @@ int main_giraffe(int argc, char** argv) {
constexpr int OPT_HAPLOTYPE_NAME = 1100;
constexpr int OPT_KFF_NAME = 1101;
constexpr int OPT_INDEX_BASENAME = 1102;
constexpr int OPT_COMMENTS_AS_TAGS = 1103;

// initialize parameters with their default options

Expand Down Expand Up @@ -497,8 +499,10 @@ int main_giraffe(int argc, char** argv) {
bool interleaved = false;
// True if fastq_filename_2 or interleaved is set.
bool paired = false;
// True if the FASTQ's name line comments are SAM-style tags that we want to preserve
bool comments_as_tags = false;
string param_preset = "default";
//Attempt up to this many rescues of reads with no pairs
// Attempt up to this many rescues of reads with no pairs
bool forced_rescue_attempts = false;
// Which rescue algorithm do we use?
MinimizerMapper::RescueAlgorithm rescue_algorithm = MinimizerMapper::rescue_dozeu;
Expand Down Expand Up @@ -592,6 +596,7 @@ int main_giraffe(int argc, char** argv) {
{"gam-in", required_argument, 0, 'G'},
{"fastq-in", required_argument, 0, 'f'},
{"interleaved", no_argument, 0, 'i'},
{"comments-as-tags", no_argument, 0, OPT_COMMENTS_AS_TAGS},
{"max-multimaps", required_argument, 0, 'M'},
{"sample", required_argument, 0, 'N'},
{"read-group", required_argument, 0, 'R'},
Expand Down Expand Up @@ -775,6 +780,10 @@ int main_giraffe(int argc, char** argv) {
paired = true;
break;

case OPT_COMMENTS_AS_TAGS:
comments_as_tags = true;
break;

case 'N':
sample_name = optarg;
break;
Expand Down Expand Up @@ -1462,12 +1471,12 @@ int main_giraffe(int argc, char** argv) {
});
} else if (!fastq_filename_2.empty()) {
//A pair of FASTQ files to map
fastq_paired_two_files_for_each_parallel_after_wait(fastq_filename_1, fastq_filename_2, map_read_pair, distribution_is_ready, batch_size);
fastq_paired_two_files_for_each_parallel_after_wait(fastq_filename_1, fastq_filename_2, map_read_pair, distribution_is_ready, comments_as_tags, batch_size);


} else if ( !fastq_filename_1.empty()) {
// An interleaved FASTQ file to map, map all its pairs in parallel.
fastq_paired_interleaved_for_each_parallel_after_wait(fastq_filename_1, map_read_pair, distribution_is_ready, batch_size);
fastq_paired_interleaved_for_each_parallel_after_wait(fastq_filename_1, map_read_pair, distribution_is_ready, comments_as_tags, batch_size);
}

// Now map all the ambiguous pairs
Expand Down Expand Up @@ -1535,7 +1544,7 @@ int main_giraffe(int argc, char** argv) {

if (!fastq_filename_1.empty()) {
// FASTQ file to map, map all its reads in parallel.
fastq_unpaired_for_each_parallel(fastq_filename_1, map_read, batch_size);
fastq_unpaired_for_each_parallel(fastq_filename_1, map_read, comments_as_tags, batch_size);
}
}

Expand Down
18 changes: 13 additions & 5 deletions src/subcommand/map_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ void help_map(char** argv) {
<< " -G, --gam-input FILE realign GAM input" << endl
<< " -f, --fastq FILE input fastq or (2-line format) fasta, possibly compressed, two are allowed, one for each mate" << endl
<< " -F, --fasta FILE align the sequences in a FASTA file that may have multiple lines per reference sequence" << endl
<< " --comments-as-tags intepret comments in name lines as SAM-style tags and annotate alignments with them" << endl
<< " -i, --interleaved fastq or GAM is interleaved paired-ended" << endl
<< " -N, --sample NAME for --reads input, add this sample" << endl
<< " -R, --read-group NAME for --reads input, add this read group" << endl
Expand Down Expand Up @@ -111,6 +112,7 @@ int main_map(int argc, char** argv) {
#define OPT_RECOMBINATION_PENALTY 1001
#define OPT_EXCLUDE_UNALIGNED 1002
#define OPT_REF_PATHS 1003
#define OPT_COMMENTS_AS_TAGS 1004
string matrix_file_name;
string seq;
string qual;
Expand Down Expand Up @@ -189,6 +191,7 @@ int main_map(int argc, char** argv) {
bool xdrop_alignment = false;
uint32_t max_gap_length = 40;
bool log_time = false;
bool comments_as_tags = false;

int c;
optind = 2; // force optind past command positional argument
Expand Down Expand Up @@ -267,6 +270,7 @@ int main_map(int argc, char** argv) {
{"xdrop-alignment", no_argument, 0, 2},
{"gaf", no_argument, 0, '%'},
{"log-time", no_argument, 0, '^'},
{"comments-as-tags", no_argument, 0, OPT_COMMENTS_AS_TAGS},
{0, 0, 0, 0}
};

Expand Down Expand Up @@ -589,6 +593,10 @@ int main_map(int argc, char** argv) {
case '^':
log_time = true;
break;

case OPT_COMMENTS_AS_TAGS:
comments_as_tags = true;
break;

case 'h':
case '?':
Expand Down Expand Up @@ -1038,7 +1046,7 @@ int main_map(int argc, char** argv) {

reads_mapped_by_thread[omp_get_thread_num()] += 2;
};
fastq_paired_interleaved_for_each_parallel(fastq1, lambda);
fastq_paired_interleaved_for_each_parallel(fastq1, lambda, comments_as_tags);
#pragma omp parallel
{ // clean up buffered alignments that weren't perfect
auto our_mapper = mapper[omp_get_thread_num()];
Expand Down Expand Up @@ -1071,7 +1079,7 @@ int main_map(int argc, char** argv) {
output_alignments(alignments, empty_alns);
reads_mapped_by_thread[tid] += 1;
};
fastq_unpaired_for_each_parallel(fastq1, lambda);
fastq_unpaired_for_each_parallel(fastq1, lambda, comments_as_tags);
} else {
// paired two-file
auto output_func = [&](Alignment& aln1,
Expand Down Expand Up @@ -1114,7 +1122,7 @@ int main_map(int argc, char** argv) {

reads_mapped_by_thread[omp_get_thread_num()] += 2;
};
fastq_paired_two_files_for_each_parallel(fastq1, fastq2, lambda);
fastq_paired_two_files_for_each_parallel(fastq1, fastq2, lambda, comments_as_tags);
#pragma omp parallel
{
auto our_mapper = mapper[omp_get_thread_num()];
Expand Down Expand Up @@ -1186,7 +1194,7 @@ int main_map(int argc, char** argv) {
}
reads_mapped_by_thread[omp_get_thread_num()] += 2;
};
vg::io::for_each_interleaved_pair_parallel(gam_in, lambda);
vg::io::for_each_interleaved_pair_parallel(gam_in, lambda, comments_as_tags);
#pragma omp parallel
{
auto our_mapper = mapper[omp_get_thread_num()];
Expand Down Expand Up @@ -1222,7 +1230,7 @@ int main_map(int argc, char** argv) {
output_alignments(alignments, empty_alns);
reads_mapped_by_thread[tid] += 1;
};
vg::io::for_each_parallel(gam_in, lambda);
vg::io::for_each_parallel(gam_in, lambda, comments_as_tags);
}
gam_in.close();
}
Expand Down
Loading

1 comment on commit 0b43653

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17395 seconds

Please sign in to comment.