diff --git a/data/md5sum.txt b/data/md5sum.txt index f21617e..1be10d3 100644 --- a/data/md5sum.txt +++ b/data/md5sum.txt @@ -1,16 +1,16 @@ -ae8e662c3ab23ba742a45a9784390e63 tests/reads_1.fq -ddeea92ba05c5c6a909c136d5059140d tests/reads.mstats -7c43e9b1409c6725c70aeb18d8db3413 tests/reads_pbat_pe_1.fq -21c3b088e956de62e10a6d190ffb8744 tests/reads_pbat_pe_2.fq -8c170b022512012f078396cc19c41600 tests/reads_pbat_pe.mstats -ae8e662c3ab23ba742a45a9784390e63 tests/reads_pe_1.fq -d2723d2e99d2a3492af508633d61b30a tests/reads_pe_2.fq -6e2ba01e98f0effc005641ba6a7eb8b6 tests/reads_pe.mstats -2e033e6c0f2e93000877d6f8f40bcc99 tests/reads_rpbat_pe_1.fq -a28bc47b567130bbcdc45b1daa54ae32 tests/reads_rpbat_pe_2.fq -ecf7559bdeecb3f5d23e2c85e1eb22ff tests/reads_rpbat_pe.mstats +8dc13cfc9c7af9c5ed368d5e2462275e tests/reads_1.fq +04f73fa36d90fbe9e157e5c3214e585a tests/reads.mstats +da3003dc18e6ecfcf0586705444ca6f8 tests/reads_pbat_pe_1.fq +bbfd18b0cc7cf93ad4ed317ef996f3ee tests/reads_pbat_pe_2.fq +7d2e72d75fe2e55f978dffa76a492c61 tests/reads_pbat_pe.mstats +2415d44925cc23fa60a0d78c9af04509 tests/reads_pbat_pe.sam +d49efb9f420b4edeb4647217fa247e6d tests/reads_pe_1.fq +3627c807f259109ea0c4c2b7bcd12320 tests/reads_pe_2.fq +0efe8fb80106f2edb370e8f7e7c1bbb6 tests/reads_pe.mstats +3c10bd0ce3f7d458a05a334a351d96ff tests/reads_pe.sam +3dd98274d8b707878aaad8246c6df9f6 tests/reads_rpbat_pe_1.fq +fcf5e4d93ac62dafcfb279e95103ad0e tests/reads_rpbat_pe_2.fq +19329a8ec659a82dc82e56a0ca6999b0 tests/reads_rpbat_pe.mstats +c6980b863b86d7e491e3f9358943bcd3 tests/reads_rpbat_pe.sam +0f2ad3720fd50961494222a3cf1dbef1 tests/reads.sam bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx -411a91bbd43a429ca665e350ddb96c4b tests/reads_pbat_pe.sam -28312ec7f684e8552be4a842e451e9be tests/reads_rpbat_pe.sam -7a1dd56c17759819bdc6cca3cf39ef1d tests/reads.sam -a067f22b83e6a7cd1246cdc0353b7ce9 tests/reads_pe.sam diff --git a/src/abismal.cpp b/src/abismal.cpp index 5c9fef1..2a3ac3a 100644 --- a/src/abismal.cpp +++ b/src/abismal.cpp @@ -96,7 +96,7 @@ get_strand_code(const char strand, const conversion_type conv) { } struct ReadLoader { - ReadLoader(const string &fn): cur_line{0}, filename{fn}, in{fn, "r"} {} + ReadLoader(const string &fn) : cur_line{0}, filename{fn}, in{fn, "r"} {} bool good() const { return in; } @@ -132,7 +132,8 @@ struct ReadLoader { [](const char c) { return c != 'N'; }) < min_read_length) line.clear(); else { - while (line.back() == 'N') line.pop_back(); // remove Ns from 3' + while (line.back() == 'N') + line.pop_back(); // remove Ns from 3' line = line.substr(line.find_first_of("ACGT")); // removes Ns from 5' } reads.emplace_back(line); @@ -161,7 +162,8 @@ const uint32_t ReadLoader::min_read_length = // alignment matrix for a batch of reads static inline void update_max_read_length(size_t &max_length, const vector &reads) { - for (auto &i : reads) max_length = max(max_length, i.size()); + for (auto &i : reads) + max_length = max(max_length, i.size()); } struct se_element { // size = 8 @@ -169,10 +171,10 @@ struct se_element { // size = 8 flags_t flags; // 2 bytes uint32_t pos; // 4 bytes - se_element(): diffs(MAX_DIFFS), flags(0), pos(0) {} + se_element() : diffs(MAX_DIFFS), flags(0), pos(0) {} - se_element(const score_t d, const flags_t f, const uint32_t p) - : diffs(d), flags(f), pos(p) {} + se_element(const score_t d, const flags_t f, const uint32_t p) : + diffs(d), flags(f), pos(p) {} bool operator==(const se_element &rhs) const { return pos == rhs.pos && flags == rhs.flags; @@ -248,13 +250,15 @@ valid_hit(const se_element s, const uint32_t readlen) { return s.diffs < static_cast(se_element::invalid_hit_frac * readlen); } -template static inline T +template +static inline T max16(const T x, const T y) { return (x > y) ? x : y; } struct se_candidates { - se_candidates(): sz(1), best(se_element()), v(vector(max_size)) {} + se_candidates() : + sz(1), best(se_element()), v(vector(max_size)) {} inline bool full() const { return sz == max_size; }; @@ -284,7 +288,9 @@ struct se_candidates { pop_heap(begin(v), begin(v) + sz); v[sz - 1] = se_element(d, s, p); } - else { v[sz++] = se_element(d, s, p); } + else { + v[sz++] = se_element(d, s, p); + } push_heap(begin(v), begin(v) + sz); } @@ -357,7 +363,8 @@ static inline bool chrom_and_posn(const ChromLookup &cl, const bam_cigar_t &cig, const uint32_t p, uint32_t &r_p, uint32_t &r_e, uint32_t &r_chr) { const uint32_t ref_ops = cigar_rseq_ops(cig); - if (!cl.get_chrom_idx_and_offset(p, ref_ops, r_chr, r_p)) return false; + if (!cl.get_chrom_idx_and_offset(p, ref_ops, r_chr, r_p)) + return false; r_e = r_p + ref_ops; return true; } @@ -370,7 +377,8 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, bam_rec &sr) { const bool ambig = res.ambig(); const bool valid = !res.empty(); - if (!allow_ambig && ambig) return map_ambig; + if (!allow_ambig && ambig) + return map_ambig; uint32_t ref_s = 0, ref_e = 0, chrom_idx = 0; if (!valid || !chrom_and_posn(cl, cigar, res.pos, ref_s, ref_e, chrom_idx)) @@ -378,9 +386,11 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, // ADS: we might be doing format_se for a mate in paried reads uint16_t flag = 0; - if (res.rc()) flag |= BAM_FREVERSE; + if (res.rc()) + flag |= BAM_FREVERSE; - if (allow_ambig && ambig) flag |= BAM_FSECONDARY; + if (allow_ambig && ambig) + flag |= BAM_FSECONDARY; // flag |= BAM_FREAD1; // ADS: this might be wrong... @@ -401,20 +411,23 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, read.data(), // const char *seq, nullptr, // const char *qual, 16); // size_t l_aux); - if (ret < 0) throw runtime_error("failed to format bam"); + if (ret < 0) + throw runtime_error("failed to format bam"); ret = bam_aux_update_int(sr.b, "NM", res.diffs); - if (ret < 0) throw runtime_error("bam_aux_update_int"); + if (ret < 0) + throw runtime_error("bam_aux_update_int"); ret = bam_aux_append(sr.b, "CV", 'A', 1, (uint8_t *)(res.elem_is_a_rich() ? "A" : "T")); - if (ret < 0) throw runtime_error("bam_aux_append"); + if (ret < 0) + throw runtime_error("bam_aux_append"); return ambig ? map_ambig : map_unique; } struct pe_element { - pe_element(): aln_score(0), r1(se_element()), r2(se_element()) {} + pe_element() : aln_score(0), r1(se_element()), r2(se_element()) {} score_t diffs() const { return r1.diffs + r2.diffs; } @@ -503,10 +516,12 @@ format_pe(const bool allow_ambig, const pe_element &p, const ChromLookup &cl, bam_rec &sr1, bam_rec &sr2) { static const uint8_t cv[2] = {'T', 'A'}; - if (p.empty()) return map_unmapped; + if (p.empty()) + return map_unmapped; const bool ambig = p.ambig(); - if (!allow_ambig && ambig) return map_ambig; + if (!allow_ambig && ambig) + return map_ambig; uint32_t r_s1 = 0, r_e1 = 0, chr1 = 0; // positions in chroms (0-based) uint32_t r_s2 = 0, r_e2 = 0, chr2 = 0; @@ -559,13 +574,16 @@ format_pe(const bool allow_ambig, const pe_element &p, const ChromLookup &cl, read1.data(), // const char *seq, nullptr, // const char *qual, 16); // size_t l_aux); - if (ret < 0) throw runtime_error("error formatting bam"); + if (ret < 0) + throw runtime_error("error formatting bam"); ret = bam_aux_update_int(sr1.b, "NM", p.r1.diffs); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); ret = bam_aux_append(sr1.b, "CV", 'A', 1, cv + p.r1.elem_is_a_rich()); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); sr2.b = bam_init1(); ret = bam_set1(sr2.b, @@ -584,19 +602,22 @@ format_pe(const bool allow_ambig, const pe_element &p, const ChromLookup &cl, read2.data(), // const char *seq, nullptr, // const char *qual, 16); // size_t l_aux); - if (ret < 0) throw runtime_error("failed to format bam"); + if (ret < 0) + throw runtime_error("failed to format bam"); ret = bam_aux_update_int(sr2.b, "NM", p.r2.diffs); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); ret = bam_aux_append(sr2.b, "CV", 'A', 1, cv + p.r2.elem_is_a_rich()); - if (ret < 0) throw runtime_error("error adding aux field"); + if (ret < 0) + throw runtime_error("error adding aux field"); return ambig ? map_ambig : map_unique; } struct pe_candidates { - pe_candidates(): v(vector(max_size_large)) {} + pe_candidates() : v(vector(max_size_large)) {} inline void reset(const uint32_t readlen) { v.front().reset(readlen); @@ -735,7 +756,8 @@ struct se_map_stats { reads_unmapped += !valid; skipped_reads += read.empty(); - if (valid && (allow_ambig || !ambig)) update_error_rate(s.diffs, cigar); + if (valid && (allow_ambig || !ambig)) + update_error_rate(s.diffs, cigar); } void update_error_rate(const score_t diffs, const bam_cigar_t &cigar) { @@ -765,7 +787,8 @@ struct se_map_stats { assign_values(); string t; - for (size_t i = 0; i < n_tabs; ++i) t += tab; + for (size_t i = 0; i < n_tabs; ++i) + t += tab; ostringstream oss; // clang-format off oss << t << "total_reads: " << total_reads << endl @@ -951,7 +974,8 @@ select_output(const bool allow_ambig, const ChromLookup &cl, name1, name2, cig1, cig2, sr1, sr2); if (!best.should_report(allow_ambig) || pe_map_type == map_unmapped) { - if (pe_map_type == map_unmapped) best.reset(); + if (pe_map_type == map_unmapped) + best.reset(); if (format_se(allow_ambig, se1, cl, read1, name1, cig1, sr1) == map_unmapped) se1.reset(); @@ -993,7 +1017,7 @@ full_compare(const score_t cutoff, const PackedRead::const_iterator read_end, return d; } -template +template static inline void check_hits(const uint32_t offset, const PackedRead::const_iterator read_st, const PackedRead::const_iterator read_end, @@ -1016,12 +1040,13 @@ check_hits(const uint32_t offset, const PackedRead::const_iterator read_st, full_compare(res.cutoff, read_end, ((the_pos & 15u) << 2), read_st, genome_st + (the_pos >> 4)); - if (diffs <= res.cutoff) res.update(specific, diffs, strand_code, the_pos); + if (diffs <= res.cutoff) + res.update(specific, diffs, strand_code, the_pos); } } struct compare_bases { - compare_bases(const genome_iterator g_): g(g_) {} + compare_bases(const genome_iterator g_) : g(g_) {} bool operator()(const uint32_t mid, const two_letter_t chr) const { return get_bit(*(g + mid)) < chr; @@ -1030,7 +1055,8 @@ struct compare_bases { const genome_iterator g; }; -template static uint32_t +template +static uint32_t find_candidates(const uint32_t max_candidates, const Read::const_iterator read_start, const genome_iterator gi, const uint32_t read_lim, vector::const_iterator &low, @@ -1062,14 +1088,15 @@ find_candidates(const uint32_t max_candidates, return p; } -template static inline three_letter_t +template +static inline three_letter_t get_three_letter_num_fast(const uint8_t nt) { return (the_conv == c_to_t) ? nt & 5 : // C=T=0, A=1, G=4 nt & 10; // A=G=0, C=2, T=8 } -template struct compare_bases_three { - compare_bases_three(const genome_iterator g_): g(g_) {} +template struct compare_bases_three { + compare_bases_three(const genome_iterator g_) : g(g_) {} bool operator()(const uint32_t mid, const three_letter_t chr) const { return get_three_letter_num_fast(*(g + mid)) < chr; @@ -1078,7 +1105,7 @@ template struct compare_bases_three { const genome_iterator g; }; -template +template static uint32_t find_candidates_three(const uint32_t max_candidates, const Read::const_iterator read_start, @@ -1131,7 +1158,8 @@ get_conv_type(const uint16_t strand_code) { : (c_to_t)); } -template static void +template +static void process_seeds(const uint32_t max_candidates, const vector::const_iterator counter_st, const vector::const_iterator counter_three_st, @@ -1198,7 +1226,8 @@ process_seeds(const uint32_t max_candidates, shift_three_key(*(read_idx + seed::key_weight_three), k_three); } - if (!res.should_do_sensitive()) return; + if (!res.should_do_sensitive()) + return; read_idx = begin(read_seed); get_1bit_hash(read_idx, k); @@ -1237,7 +1266,8 @@ process_seeds(const uint32_t max_candidates, } } -template static void +template +static void prep_read(const string &r, Read &pread) { pread.resize(r.size()); for (size_t i = 0; i != r.size(); ++i) @@ -1273,7 +1303,8 @@ pack_read(const Read &pread, PackedRead &packed_pread) { } // do not fill the flanking position - if (pread_ind == sz) return; + if (pread_ind == sz) + return; // now put only the remaining bases in the last pos. The rest // should match any base in the reference @@ -1282,7 +1313,8 @@ pack_read(const Read &pread, PackedRead &packed_pread) { while (pread_ind < sz) *it |= (static_cast(pread[pread_ind++]) << ((j++) << 2)); - while (j < NUM_BASES_PER_ELEMENT) *it |= base_match_any << ((j++) << 2); + while (j < NUM_BASES_PER_ELEMENT) + *it |= base_match_any << ((j++) << 2); } static inline bool @@ -1350,7 +1382,8 @@ align_se_candidates(const Read &pread_t, const Read &pread_t_rc, best.diffs = simple_aln::edit_distance(best_scr, len, cigar); // do not report and count it as unmapped if not valid - if (!valid(best, len, readlen, cutoff)) best.reset(); + if (!valid(best, len, readlen, cutoff)) + best.reset(); } else best.reset(); @@ -1363,11 +1396,13 @@ valid_bam_rec(const bam_rec &b) { static inline void reset_bam_rec(bam_rec &b) { - if (b.b) bam_destroy1(b.b); + if (b.b) + bam_destroy1(b.b); b.b = nullptr; } -template static void +template +static void map_single_ended(const bool show_progress, const bool allow_ambig, const AbismalIndex &abismal_index, ReadLoader &rl, se_map_stats &se_stats, bamxx::bam_header &hdr, @@ -1458,14 +1493,16 @@ map_single_ended(const bool show_progress, const bool allow_ambig, } } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr[i])) reset_bam_rec(mr[i]); + if (valid_bam_rec(mr[i])) + reset_bam_rec(mr[i]); se_stats.update(allow_ambig, reads[i], cigar[i], bests[i]); cigar[i].clear(); } if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } @@ -1569,14 +1606,16 @@ map_single_ended_rand(const bool show_progress, const bool allow_ambig, throw runtime_error("failed to write bam"); } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr[i])) reset_bam_rec(mr[i]); + if (valid_bam_rec(mr[i])) + reset_bam_rec(mr[i]); se_stats.update(allow_ambig, reads[i], cigar[i], bests[i]); cigar[i].clear(); } if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } @@ -1589,7 +1628,8 @@ format_time_in_sec(const double t) { return oss.str(); } -template static void +template +static void run_single_ended(const bool show_progress, const bool allow_ambig, const string &reads_file, const AbismalIndex &abismal_index, se_map_stats &se_stats, bamxx::bam_header &hdr, @@ -1622,7 +1662,8 @@ best_single(const pe_candidates &pres, se_candidates &res) { res.update(false, i->diffs, i->flags, i->pos); } -template static void +template +static void best_pair(const pe_candidates &res1, const pe_candidates &res2, const Read &pread1, const Read &pread2, bam_cigar_t &cigar1, bam_cigar_t &cigar2, vector &mem_scr1, @@ -1736,7 +1777,8 @@ best_pair(const pe_candidates &res1, const pe_candidates &res2, } } -template static bool +template +static bool select_maps(const Read &pread1, const Read &pread2, bam_cigar_t &cig1, bam_cigar_t &cig2, pe_candidates &res1, pe_candidates &res2, vector &mem_scr1, se_candidates &res_se1, @@ -1752,8 +1794,8 @@ select_maps(const Read &pread1, const Read &pread2, bam_cigar_t &cig1, return true; } -template +template static inline bool map_fragments(const uint32_t max_candidates, const string &read1, const string &read2, @@ -1770,7 +1812,8 @@ map_fragments(const uint32_t max_candidates, const string &read1, res1.reset(read1.size()); res2.reset(read2.size()); - if (read1.empty() && read2.empty()) return false; + if (read1.empty() && read2.empty()) + return false; if (!read1.empty()) { prep_read(read1, pread1); @@ -1793,7 +1836,8 @@ map_fragments(const uint32_t max_candidates, const string &read1, mem_scr1, res_se1, res_se2, aln, best); } -template static void +template +static void map_paired_ended(const bool show_progress, const bool allow_ambig, const AbismalIndex &abismal_index, ReadLoader &rl1, ReadLoader &rl2, pe_map_stats &pe_stats, @@ -1944,8 +1988,10 @@ map_paired_ended(const bool show_progress, const bool allow_ambig, } } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr1[i])) reset_bam_rec(mr1[i]); - if (valid_bam_rec(mr2[i])) reset_bam_rec(mr2[i]); + if (valid_bam_rec(mr1[i])) + reset_bam_rec(mr1[i]); + if (valid_bam_rec(mr2[i])) + reset_bam_rec(mr2[i]); pe_stats.update(allow_ambig, reads1[i], reads2[i], cigar1[i], cigar2[i], bests[i], bests_se1[i], bests_se2[i]); cigar1[i].clear(); @@ -1954,7 +2000,8 @@ map_paired_ended(const bool show_progress, const bool allow_ambig, if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } @@ -2119,8 +2166,10 @@ map_paired_ended_rand(const bool show_progress, const bool allow_ambig, } } for (size_t i = 0; i < n_reads; ++i) { - if (valid_bam_rec(mr1[i])) reset_bam_rec(mr1[i]); - if (valid_bam_rec(mr2[i])) reset_bam_rec(mr2[i]); + if (valid_bam_rec(mr1[i])) + reset_bam_rec(mr1[i]); + if (valid_bam_rec(mr2[i])) + reset_bam_rec(mr2[i]); pe_stats.update(allow_ambig, reads1[i], reads2[i], cigar1[i], cigar2[i], bests[i], bests_se1[i], bests_se2[i]); cigar1[i].clear(); @@ -2129,12 +2178,14 @@ map_paired_ended_rand(const bool show_progress, const bool allow_ambig, if (show_progress) #pragma omp critical { - if (progress.time_to_report(the_byte)) progress.report(cerr, the_byte); + if (progress.time_to_report(the_byte)) + progress.report(cerr, the_byte); } } } -template static void +template +static void run_paired_ended(const bool show_progress, const bool allow_ambig, const string &reads_file1, const string &reads_file2, const AbismalIndex &abismal_index, pe_map_stats &pe_stats, @@ -2209,6 +2260,10 @@ abismal_make_sam_header(const ChromLookup &cl, const int argc, int abismal(int argc, const char **argv) { try { + + const string version_str = string("(v") + VERSION + string(")"); + const string description = "map bisulfite converted reads " + version_str; + bool VERBOSE = false; bool GA_conversion = false; bool allow_ambig = false; @@ -2223,7 +2278,7 @@ abismal(int argc, const char **argv) { string stats_outfile = ""; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "map bisulfite converted reads", + OptionParser opt_parse(strip_path(argv[0]), description, " []"); opt_parse.set_show_defaults(); opt_parse.add_opt("index", 'i', "index file", false, index_file); @@ -2255,30 +2310,30 @@ abismal(int argc, const char **argv) { vector leftover_args; opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { - cerr << "help requested" << endl; cerr << opt_parse.help_message() << endl; + cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { - cerr << "about requested" << endl; cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (opt_parse.option_missing()) { - cerr << "missing required option" << endl; + cerr << "Missing required option." << endl; cerr << opt_parse.option_missing_message() << endl; return EXIT_SUCCESS; } if (leftover_args.size() != 1 && leftover_args.size() != 2) { cerr << opt_parse.help_message() << endl; + cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (n_threads <= 0) { - cerr << "please choose a positive number of threads" << endl; + cerr << "Please choose a positive number of threads." << endl; return EXIT_SUCCESS; } if (index_file.empty() == genome_file.empty()) { - cerr << "please select either an index file (-i) or a genome file (-g)" + cerr << "Please select either an index file (-i) or a genome file (-g)." << endl; return EXIT_SUCCESS; } @@ -2307,15 +2362,16 @@ abismal(int argc, const char **argv) { const int n_procs = omp_get_num_procs(); int num_threads_fulfilled = 1; #pragma omp parallel - { num_threads_fulfilled = omp_get_num_threads(); } + { + num_threads_fulfilled = omp_get_num_threads(); + } if (VERBOSE && n_threads > n_procs) - print_with_time( - "[WARNING] requesting more threads than the " - "maximum of " + - to_string(n_procs) + - " processors available in " - "this device"); + print_with_time("[WARNING] requesting more threads than the " + "maximum of " + + to_string(n_procs) + + " processors available in " + "this device"); if (VERBOSE) print_with_time("using " + to_string(num_threads_fulfilled) + @@ -2345,7 +2401,8 @@ abismal(int argc, const char **argv) { const double start_time = omp_get_wtime(); if (!index_file.empty()) { - if (VERBOSE) print_with_time("loading index " + index_file); + if (VERBOSE) + print_with_time("loading index " + index_file); abismal_index.read(index_file); if (VERBOSE) @@ -2353,7 +2410,8 @@ abismal(int argc, const char **argv) { format_time_in_sec(omp_get_wtime() - start_time)); } else { - if (VERBOSE) print_with_time("indexing genome " + genome_file); + if (VERBOSE) + print_with_time("indexing genome " + genome_file); abismal_index.create_index(genome_file); if (VERBOSE) print_with_time("indexing time: " + @@ -2371,14 +2429,17 @@ abismal(int argc, const char **argv) { pe_map_stats pe_stats; bamxx::bam_out out(outfile, write_bam_fmt); - if (!out) throw runtime_error("failed to open output file: " + outfile); + if (!out) + throw runtime_error("failed to open output file: " + outfile); bamxx::bam_header hdr; int ret = abismal_make_sam_header(abismal_index.cl, argc, argv, hdr); - if (ret < 0) throw runtime_error("error formatting header"); + if (ret < 0) + throw runtime_error("error formatting header"); - if (!out.write(hdr)) throw runtime_error("error writing header"); + if (!out.write(hdr)) + throw runtime_error("error writing header"); if (reads_file2.empty()) { if (GA_conversion || pbat_mode) diff --git a/src/abismalidx.cpp b/src/abismalidx.cpp index 7574025..0f69d1e 100644 --- a/src/abismalidx.cpp +++ b/src/abismalidx.cpp @@ -15,45 +15,51 @@ * General Public License for more details. */ -#include "smithlab_os.hpp" -#include "smithlab_utils.hpp" +#include "abismalidx.hpp" +#include "AbismalIndex.hpp" + #include "OptionParser.hpp" #include "dna_four_bit.hpp" +#include "smithlab_os.hpp" +#include "smithlab_utils.hpp" -#include "abismalidx.hpp" -#include "AbismalIndex.hpp" +#include #include -#include +#include +#include #include -#include -#include +#include #include -#include +#include #include -#include +#include -using std::vector; -using std::runtime_error; -using std::string; using std::cerr; using std::endl; +using std::runtime_error; +using std::string; using std::unordered_set; +using std::vector; int abismalidx(int argc, const char **argv) { try { + + const string version_str = string("(v") + VERSION + string(")"); + const string description = "build abismal index " + version_str; + string target_regions_file; bool VERBOSE = false; size_t n_threads = 1; /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "build abismal index", + OptionParser opt_parse(strip_path(argv[0]), description, " ", 2); opt_parse.set_show_defaults(); - opt_parse.add_opt("targets", 'A', "target regions", - false, target_regions_file); + opt_parse.add_opt("targets", 'A', "target regions", false, + target_regions_file); opt_parse.add_opt("threads", 't', "number of threads", false, n_threads); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); @@ -61,6 +67,7 @@ abismalidx(int argc, const char **argv) { opt_parse.parse(argc, argv, leftover_args); if (argc == 1 || opt_parse.help_requested()) { cerr << opt_parse.help_message() << endl; + cerr << opt_parse.about_message() << endl; return EXIT_SUCCESS; } if (opt_parse.about_requested()) { @@ -101,9 +108,9 @@ abismalidx(int argc, const char **argv) { abismal_index.write(outfile); if (VERBOSE) - cerr << "[total indexing time: " << omp_get_wtime() - start_time << "]" << endl; + cerr << "[total indexing time: " << omp_get_wtime() - start_time << "]" + << endl; /****************** END BUILDING INDEX *************/ - } catch (const std::runtime_error &e) { cerr << e.what() << endl; diff --git a/src/simreads.cpp b/src/simreads.cpp index e5c72d6..712d35f 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018 Andrew D. Smith + * Copyright (C) 2018-2024 Andrew D. Smith * * Author: Andrew D. Smith * @@ -16,106 +16,129 @@ * General Public License for more details. */ +#include "OptionParser.hpp" #include "smithlab_os.hpp" #include "smithlab_utils.hpp" -#include "OptionParser.hpp" -#include "simreads.hpp" #include "AbismalIndex.hpp" #include "cigar_utils.hpp" #include "sam_record.hpp" +#include "simreads.hpp" -#include +#include +#include // for the int8_t and friends #include +#include +#include +#include +#include #include #include -#include -#include -#include -#include -#include // for the int8_t and friends -#include +#include // getpid() -using std::vector; -using std::runtime_error; -using std::string; +using std::begin; +using std::cbegin; +using std::cend; using std::cerr; +using std::end; using std::endl; using std::function; -using std::istream; -using std::ostream; using std::ifstream; +using std::istream; +using std::istringstream; using std::ofstream; +using std::ostream; using std::ostringstream; -using std::istringstream; +using std::runtime_error; +using std::size; +using std::size_t; +using std::string; using std::to_string; +using std::transform; +using std::uint64_t; +using std::vector; +template using num_lim = std::numeric_limits; namespace simreads_random { - // ADS: I made this namespace and functions because different - // implementations of rand() on different OS meant that even with - // the same seed, the results could be different. This meant testing - // didn't work. - bool initialized = false; - std::default_random_engine e; - std::uniform_real_distribution dr; - std::uniform_int_distribution di; - void initialize(const size_t the_seed) { - e = std::default_random_engine(the_seed); - initialized = true; - } - int rand() { - assert(initialized); - // ADS: should have same range as ordinary rand() by properties of - // std::uniform_int_distribution default constructor. - return di(e); - } - double rand_double() { // ADS: in the interval [0, 1] - assert(initialized); - // ADS: default constructor for std::uniform_real_distribution - // sets a range of [0,1) - return dr(e); - } +// ADS: I made this namespace and functions because different +// implementations of rand() on different OS meant that even with +// the same seed, the results could be different. This meant testing +// didn't work. +bool initialized = false; +std::default_random_engine e; +std::uniform_real_distribution dr; +std::uniform_int_distribution di; +void +initialize(const size_t the_seed) { + e = std::default_random_engine(the_seed); + initialized = true; } +uint64_t +rand() { + assert(initialized); + // ADS: should have same range as ordinary rand() by properties of + // std::uniform_int_distribution default constructor. + return di(e); +} +double +rand_double() { // ADS: in the interval [0, 1] + assert(initialized); + // ADS: default constructor for std::uniform_real_distribution + // sets a range of [0,1) + return dr(e); +} +} // namespace simreads_random -static string +static inline string format_fastq_record(const string &name, const string &read) { assert(!name.empty()); - ostringstream oss; - oss << '@' << name << endl << read << endl - << '+' << endl << string(read.length(), 'B'); - return oss.str(); + string s; + s += '@'; + s += name; + s += '\n'; + s += read; + s += "\n+\n"; + s += string(size(read), 'B'); + return s; } +static inline string +format_fasta_record(const string &name, const string &read) { + assert(!name.empty()); + string s; + s += '>'; + s += name; + s += '\n'; + s += read; + return s; +} struct FragInfo { - void set_sequential_name() { - name = "read" + to_string(frag_count++); - } - string - read1() const { + void set_sequential_name() { name = "read" + to_string(frag_count++); } + string read1() const { assert(!name.empty()); string read = seq.substr(0, read_length); - for (size_t i = 0; i < read_length - read.length(); ++i) + for (size_t i = 0; i < read_length - size(read); ++i) read += random_base(); - return format_fastq_record(name + ".1", read); + return fasta_format ? format_fasta_record(name + ".1", read) + : format_fastq_record(name + ".1", read); } - string - read2() const { + string read2() const { assert(!name.empty()); string read(seq); revcomp_inplace(read); read = read.substr(0, read_length); - for (size_t i = 0; i < read_length - read.length(); ++i) + for (size_t i = 0; i < read_length - size(read); ++i) read += random_base(); - return format_fastq_record(name + ".2", read); + return fasta_format ? format_fasta_record(name + ".2", read) + : format_fastq_record(name + ".2", read); } - void - erase_info_through_insert() { + void erase_info_through_insert() { const size_t orig_ref_len = end_pos - start_pos; - if (2*read_length < seq.length()) { + if (2 * read_length < size(seq)) { string cigar2(cigar); truncate_cigar_q(cigar, read_length); reverse_cigar(begin(cigar2), end(cigar2)); @@ -123,20 +146,15 @@ struct FragInfo { reverse_cigar(begin(cigar2), end(cigar2)); const size_t rseq_ops = cigar_rseq_ops(cigar) + cigar_rseq_ops(cigar2); cigar = cigar + to_string(orig_ref_len - rseq_ops) + "N" + cigar2; - seq = seq.substr(0, read_length) + - string(orig_ref_len - rseq_ops, 'N') + - seq.substr(seq.length() - read_length, read_length); - + seq = seq.substr(0, read_length) + string(orig_ref_len - rseq_ops, 'N') + + seq.substr(size(seq) - read_length, read_length); } } - void - remove_cigar_match_symbols() { + void remove_cigar_match_symbols() { replace(begin(cigar), end(cigar), '=', 'M'); merge_equal_neighbor_cigar_ops(cigar); } - void - bisulfite_conversion(const bool random_pbat, - const double bs_conv) { + void bisulfite_conversion(const bool random_pbat, const double bs_conv) { if (pbat || (random_pbat && simreads_random::rand_double() < 0.5)) { for (auto it(begin(seq)); it != end(seq); ++it) { if (*it == 'G' && (simreads_random::rand_double() < bs_conv)) @@ -154,24 +172,25 @@ struct FragInfo { bool rc() const { return strand == '-'; } string chrom; - size_t start_pos; - size_t end_pos; + size_t start_pos{}; + size_t end_pos{}; string name; - double score; - char strand; + double score{}; + char strand{}; string seq; string cigar; static bool pbat; + static bool fasta_format; static size_t frag_count; static size_t read_length; }; bool FragInfo::pbat = false; +bool FragInfo::fasta_format = false; size_t FragInfo::frag_count = 0; size_t FragInfo::read_length = 100; - static ostream & operator<<(ostream &out, FragInfo &the_info) { const bool rc = the_info.rc(); @@ -181,16 +200,18 @@ operator<<(ostream &out, FragInfo &the_info) { samflags::set(flags_read, samflags::read_paired); samflags::set(flags_read, samflags::read_pair_mapped); samflags::set(flags_read, samflags::template_first); - samflags::set(flags_read, the_info.rc() ? samflags::read_rc : samflags::mate_rc); + samflags::set(flags_read, + the_info.rc() ? samflags::read_rc : samflags::mate_rc); samflags::set(flags_mate, samflags::read_paired); samflags::set(flags_mate, samflags::read_pair_mapped); samflags::set(flags_mate, samflags::template_last); - samflags::set(flags_mate, the_info.rc() ? samflags::mate_rc : samflags::read_rc); + samflags::set(flags_mate, + the_info.rc() ? samflags::mate_rc : samflags::read_rc); const size_t read_pos = the_info.start_pos + 1; const size_t mate_pos = the_info.end_pos - FragInfo::read_length + 1; - const int tlen = rc ? (-the_info.seq.size()) : (the_info.seq.size()); + const int tlen = rc ? -size(the_info.seq) : size(the_info.seq); string cigar1 = the_info.cigar; string cigar2 = the_info.cigar; @@ -210,99 +231,104 @@ operator<<(ostream &out, FragInfo &the_info) { const size_t pos1 = rc ? mate_pos : read_pos; const size_t pos2 = rc ? read_pos : mate_pos; - return out << the_info.name << ".1\t" + // clang-format off + return out << the_info.name << ".1" << '\t' << flags_read << '\t' << the_info.chrom << '\t' << pos1 << '\t' - << "255\t" + << "255" << '\t' << cigar1 << '\t' - << "=\t" - << pos2 << "\t" + << "=" << '\t' + << pos2 << '\t' << tlen << '\t' - << seq1 << "\t" - << "*" << endl - - << the_info.name << ".2\t" + << seq1 << '\t' + << "*" << '\n' + << the_info.name << ".2" << '\t' << flags_mate << '\t' << the_info.chrom << '\t' << pos2 << '\t' - << "255\t" + << "255" << '\t' << cigar2 << '\t' - << "=\t" - << pos1 << "\t" + << "=" << '\t' + << pos1 << '\t' << -tlen << '\t' - << seq2 << "\t" + << seq2 << '\t' << "*"; + // clang-format on } - // extract the position of the fragment checking all bases are valid static void -sim_frag_position(const string &genome, const size_t frag_len, - string &the_frag, size_t &the_position) { - static auto is_invalid = [](const char c) {return !valid_base(c);}; +sim_frag_position(const string &genome, const size_t frag_len, string &the_frag, + size_t &the_position, const bool require_valid) { + static auto is_invalid = [](const char c) { return !valid_base(c); }; - const size_t lim = genome.length() - frag_len + 1; + const size_t lim = size(genome) - frag_len + 1; do { the_position = simreads_random::rand() % lim; - the_frag = string(begin(genome) + the_position, - begin(genome) + the_position + frag_len); - } - while (find_if(begin(the_frag), end(the_frag), is_invalid) != end(the_frag)); + the_frag = string(cbegin(genome) + the_position, + cbegin(genome) + the_position + frag_len); + } while (require_valid && find_if(cbegin(the_frag), cend(the_frag), + is_invalid) != cend(the_frag)); } - // simulate from a uniform distribution in a range static size_t sim_frag_length(const size_t min_length, const size_t max_length) { assert(max_length >= min_length); - if (min_length == max_length) return min_length; + if (min_length == max_length) + return min_length; const size_t diff = max_length - min_length; return min_length + (simreads_random::rand() % diff); } - struct FragSampler { FragSampler(const string &g, const ChromLookup c, const char sc, - const size_t milen, const size_t malen) : - genome(g), cl(c), strand_code(sc), min_length(milen), max_length(malen) {} - void - sample_fragment(FragInfo &the_info) const { + const size_t milen, const size_t malen, + const bool require_valid) : + genome(g), cl(c), strand_code(sc), min_length(milen), max_length(malen), + require_valid(require_valid) {} + void sample_fragment(FragInfo &the_info) const { const size_t frag_len = sim_frag_length(min_length, max_length); - sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos); + sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos, + require_valid); - uint32_t offset = 0, chrom_idx = 0; + uint32_t offset = 0; + uint32_t chrom_idx = 0; cl.get_chrom_idx_and_offset(the_info.start_pos, chrom_idx, offset); the_info.chrom = cl.names[chrom_idx]; the_info.start_pos = offset; the_info.end_pos = the_info.start_pos + frag_len; - the_info.set_sequential_name(); // default - the_info.strand = sim_strand(); // based on frag code + the_info.set_sequential_name(); // default + the_info.strand = sim_strand(); // based on frag code if (the_info.strand == '-') revcomp_inplace(the_info.seq); - the_info.cigar = to_string(frag_len) + "M"; // default, no muts + the_info.cigar = to_string(frag_len) + "M"; // default, no muts } char sim_strand() const { - if (strand_code == 'f') return '+'; - else if (strand_code == 'r') return '-'; - else if (strand_code == 'b') return (simreads_random::rand() & 1) ? '+' : '-'; - else throw runtime_error("bad strand code: " + to_string(strand_code)); - return '\0'; + switch (strand_code) { + case 'f': return '+'; + case 'r': return '-'; + case 'b': return (simreads_random::rand() & 1) ? '+' : '-'; + default: std::abort(); + } } const string &genome; ChromLookup cl; - char strand_code; - size_t min_length; - size_t max_length; + char strand_code{}; + size_t min_length{}; + size_t max_length{}; + bool require_valid{}; }; - struct FragMutator { FragMutator(const double m, const double s, const double i, const double d) : - mutation_rate(m), substitution_rate(s), - insertion_rate(i), deletion_rate(d) { - const double total = substitution_rate + insertion_rate + deletion_rate; + mutation_rate(m), substitution_rate(s), insertion_rate(i), + deletion_rate(d) { + const double total = + std::max(substitution_rate + insertion_rate + deletion_rate, + num_lim::min()); substitution_rate /= total; insertion_rate /= total; deletion_rate /= total; @@ -313,7 +339,7 @@ struct FragMutator { string seq, cigar; size_t i = 0; the_info.score = 0; - while (i < the_info.seq.length()) { + while (i < size(the_info.seq)) { // select a mutation or not const char mut = sample_mutation(); if (mut == 'I') { @@ -332,24 +358,28 @@ struct FragMutator { ++the_info.score; ++i; } - else { //if (mut == '=') { + else { // if (mut == '=') { cigar += "="; seq += the_info.seq[i]; ++i; } } - the_info.cigar.resize(2*cigar.size()); - compress_cigar(begin(cigar), end(cigar), the_info.cigar); + the_info.cigar.resize(2 * size(cigar)); + compress_cigar(cbegin(cigar), cend(cigar), the_info.cigar); swap(seq, the_info.seq); } char sample_mutation() const { - const double x = simreads_random::rand_double(); - if (x > mutation_rate) return '='; + const double x = simreads_random::rand_double(); + if (x > mutation_rate) + return '='; else { - const double y = simreads_random::rand_double(); - if (y < substitution_rate) return 'M'; - else if (y < insertion_rate) return 'I'; - else return 'D'; + const double y = simreads_random::rand_double(); + if (y < substitution_rate) + return 'M'; + else if (y < insertion_rate) + return 'I'; + else + return 'D'; } } string tostring() const { @@ -360,17 +390,15 @@ struct FragMutator { << "deletion_rate=" << deletion_rate; return oss.str(); } - double mutation_rate; - double substitution_rate; - double insertion_rate; - double deletion_rate; + double mutation_rate{}; + double substitution_rate{}; + double insertion_rate{}; + double deletion_rate{}; }; - static void extract_change_type_vals(const string &change_type_vals, - double &substitution_rate, - double &insertion_rate, + double &substitution_rate, double &insertion_rate, double &deletion_rate) { if (!change_type_vals.empty()) { istringstream iss(change_type_vals); @@ -383,7 +411,6 @@ extract_change_type_vals(const string &change_type_vals, } } - int simreads(int argc, const char **argv) { @@ -393,10 +420,10 @@ simreads(int argc, const char **argv) { string locations_file; bool VERBOSE = false; - bool write_locations = false; bool single_end = false; bool show_cigar_matches = true; bool random_pbat = false; + bool require_valid = false; size_t n_reads = 100; size_t min_frag_len = 100; @@ -404,7 +431,7 @@ simreads(int argc, const char **argv) { char strand_arg = 'b'; - size_t rng_seed = std::numeric_limits::max(); + size_t rng_seed = num_lim::max(); double mutation_rate = 0.0; string change_type_vals; @@ -414,35 +441,41 @@ simreads(int argc, const char **argv) { double bs_conv = 1.0; - size_t max_mutations = std::numeric_limits::max(); + size_t max_mutations = num_lim::max(); /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(strip_path(argv[0]), "simulate reads for " - "testing walt2", "", 1); + OptionParser opt_parse(strip_path(argv[0]), + "simulate reads for " + "testing walt2", + "", 1); opt_parse.set_show_defaults(); opt_parse.add_opt("out", 'o', "output file prefix", true, output_prefix); opt_parse.add_opt("single", '\0', "output single end", false, single_end); - opt_parse.add_opt("loc", '\0', "write locations", false, write_locations); - opt_parse.add_opt("read-len", 'l', "read length", false, FragInfo::read_length); - opt_parse.add_opt("min-fraglen", '\0', "min fragment length", - false, min_frag_len); - opt_parse.add_opt("max-fraglen", '\0', "max fragment length", - false, max_frag_len); + opt_parse.add_opt("loc", '\0', "write locations here", false, + locations_file); + opt_parse.add_opt("read-len", 'l', "read length", false, + FragInfo::read_length); + opt_parse.add_opt("min-fraglen", '\0', "min fragment length", false, + min_frag_len); + opt_parse.add_opt("max-fraglen", '\0', "max fragment length", false, + max_frag_len); opt_parse.add_opt("n-reads", 'n', "number of reads", false, n_reads); opt_parse.add_opt("mut", 'm', "mutation rate", false, mutation_rate); - opt_parse.add_opt("bis", 'b', "bisulfite conversion rate", false,\ - bs_conv); + opt_parse.add_opt("bis", 'b', "bisulfite conversion rate", false, bs_conv); opt_parse.add_opt("show-matches", '\0', "show match symbols in cigar", false, show_cigar_matches); - opt_parse.add_opt("changes", 'c', - "change types (comma sep relative vals)", + opt_parse.add_opt("changes", 'c', "change types (comma sep relative vals)", false, change_type_vals); opt_parse.add_opt("max-mut", 'M', "max mutations", false, max_mutations); opt_parse.add_opt("pbat", 'a', "pbat", false, FragInfo::pbat); opt_parse.add_opt("random-pbat", 'R', "random pbat", false, random_pbat); opt_parse.add_opt("strand", 's', "strand {f, r, b}", false, strand_arg); - opt_parse.add_opt("seed", '\0', "rng seed (default: from system)", - false, rng_seed); + opt_parse.add_opt("fasta", '\0', "output fasta format (no quality scores)", + false, FragInfo::fasta_format); + opt_parse.add_opt("seed", '\0', "rng seed (default: from system)", false, + rng_seed); + opt_parse.add_opt("require-valid", '\0', "require valid bases in fragments", + false, require_valid); opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE); vector leftover_args; opt_parse.parse(argc, argv, leftover_args); @@ -458,19 +491,19 @@ simreads(int argc, const char **argv) { cerr << opt_parse.option_missing_message() << endl; return EXIT_SUCCESS; } - if (leftover_args.size() != 1) { + if (size(leftover_args) != 1) { cerr << opt_parse.help_message() << endl; return EXIT_SUCCESS; } const string genome_file(leftover_args.front()); /****************** END COMMAND LINE OPTIONS *****************/ - extract_change_type_vals(change_type_vals, - substitution_rate, insertion_rate, deletion_rate); + extract_change_type_vals(change_type_vals, substitution_rate, + insertion_rate, deletion_rate); - if (rng_seed == std::numeric_limits::max()) { + if (rng_seed == num_lim::max()) rng_seed = time(0) + getpid(); - } + if (VERBOSE) cerr << "rng seed: " << rng_seed << endl; simreads_random::initialize(rng_seed); @@ -483,75 +516,76 @@ simreads(int argc, const char **argv) { string genome; ChromLookup cl; load_genome(genome_file, genome, cl); - transform(begin(genome), end(genome), begin(genome), - [](unsigned char c){return toupper(c);}); - - - FragSampler frag_samp(genome, cl, strand_arg, min_frag_len, max_frag_len); - if (VERBOSE) - cerr << "[constructed fragment sampler]" << endl; - if (VERBOSE) - cerr << "[simulating clean frags]" << endl; - vector the_info(n_reads); - for (size_t i = 0; i < n_reads; ++i) - frag_samp.sample_fragment(the_info[i]); - - if (VERBOSE) - cerr << "[mutating the frags]" << endl; - FragMutator frag_mut(mutation_rate, substitution_rate, - insertion_rate, deletion_rate); - - for (size_t i = 0; i < the_info.size(); ++i) - frag_mut.mutate(the_info[i]); + transform(cbegin(genome), cend(genome), begin(genome), + [](unsigned char c) { return toupper(c); }); - for (size_t i = 0; i < the_info.size(); ++i) - the_info[i].bisulfite_conversion(random_pbat, bs_conv); - - if (!show_cigar_matches) - for (size_t i = 0; i < the_info.size(); ++i) - the_info[i].remove_cigar_match_symbols(); - - - if (write_locations) { - const string locations_file = output_prefix + ".sam"; + ofstream loc_out; + if (!locations_file.empty()) { if (VERBOSE) - cerr << "[writing frag locations: " << locations_file << "]" << endl; - ofstream loc_out(locations_file); + cerr << "[opening frag locations file: " << locations_file << "]" + << endl; + loc_out.open(locations_file); if (!loc_out) - throw runtime_error("bad locations file: " + locations_file); - for (size_t i = 0; i < the_info.size(); ++i) { - loc_out << the_info[i] << endl; - } + throw runtime_error("bad locations output file: " + locations_file); } - const string read1_outfile = output_prefix + "_1.fq"; - if (VERBOSE) - cerr << "[writing read1 fastq: " << read1_outfile << "]" << endl; + const string read1_outfile = + output_prefix + (FragInfo::fasta_format ? "_1.fa" : "_1.fq"); + if (VERBOSE) { + if (FragInfo::fasta_format) + cerr << "[opening read1 fastq: " << read1_outfile << "]" << endl; + else + cerr << "[opening read1 fasta: " << read1_outfile << "]" << endl; + } ofstream read1_out(read1_outfile); if (!read1_out) throw runtime_error("bad output file: " + read1_outfile); - for (size_t i = 0; i < the_info.size(); ++i) - read1_out << the_info[i].read1() << endl; - + ofstream read2_out; if (!single_end) { - const string read2_outfile = output_prefix + "_2.fq"; - if (VERBOSE) - cerr << "[writing read2 fastq: " << read2_outfile << "]" << endl; - ofstream read2_out(read2_outfile); + const string read2_outfile = + output_prefix + (FragInfo::fasta_format ? "_2.fa" : "_2.fq"); + if (VERBOSE) { + if (FragInfo::fasta_format) + cerr << "[opening read2 fastq: " << read2_outfile << "]" << endl; + else + cerr << "[opening read2 fasta: " << read2_outfile << "]" << endl; + } + read2_out.open(read2_outfile); if (!read2_out) throw runtime_error("bad output file: " + read2_outfile); - for (size_t i = 0; i < the_info.size(); ++i) - read2_out << the_info[i].read2() << endl; + } + + FragSampler frag_samp(genome, cl, strand_arg, min_frag_len, max_frag_len, + require_valid); + if (VERBOSE) + cerr << "[constructed fragment sampler]" << endl; + + FragMutator frag_mut(mutation_rate, substitution_rate, insertion_rate, + deletion_rate); + if (VERBOSE) + cerr << "[constructed mutator]" << endl; + + if (VERBOSE) + cerr << "[simulating frags]" << endl; + + for (size_t i = 0; i < n_reads; ++i) { + FragInfo info; + frag_samp.sample_fragment(info); + frag_mut.mutate(info); + info.bisulfite_conversion(random_pbat, bs_conv); + if (!show_cigar_matches) + info.remove_cigar_match_symbols(); + if (!locations_file.empty()) + loc_out << info << '\n'; + read1_out << info.read1() << '\n'; + if (!single_end) + read2_out << info.read2() << '\n'; } } - catch (const runtime_error &e) { + catch (const std::exception &e) { cerr << e.what() << endl; return EXIT_FAILURE; } - catch (std::bad_alloc &ba) { - cerr << "ERROR: could not allocate memory" << endl; - return EXIT_FAILURE; - } return EXIT_SUCCESS; }