Skip to content

Commit

Permalink
many bug fixes in BWAWrapper
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeremiah Wala committed May 22, 2015
1 parent bdffb57 commit 1e08c0d
Show file tree
Hide file tree
Showing 12 changed files with 468 additions and 367 deletions.
284 changes: 38 additions & 246 deletions src/AlignedContig.cpp

Large diffs are not rendered by default.

104 changes: 79 additions & 25 deletions src/BWAWrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ extern "C" {
#include <string.h>
}

#define DEBUG_BWATOOLS 1
//#define DEBUG_BWATOOLS 1

#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1))
#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3)
Expand Down Expand Up @@ -36,7 +36,7 @@ namespace SnowTools {
tlen += i.seq.length();

#ifdef DEBUG_BWATOOLS
//std::cerr << "ref seq length: " << tlen << std::endl;
std::cerr << "ref seq length: " << tlen << std::endl;
#endif

// make the bwt
Expand Down Expand Up @@ -76,35 +76,88 @@ namespace SnowTools {

}

void BWAWrapper::alignSingleSequence(const std::string& seq) {

void BWAWrapper::alignSingleSequence(const std::string& seq, const std::string& name, BamReadVector& vec,
bool keep_secondary) {

mem_alnreg_v ar;
ar = mem_align1(memopt, idx->bwt, idx->bns, idx->pac, seq.length(), seq.c_str()); // get all the hits

#ifdef DEBUG_BWATOOLS
std::cout << "num hits: " << ar.n << std::endl;
//std::cout << __print_bns() << std::endl;
#endif

// loop through the hits
for (size_t i = 0; i < ar.n; ++i) {

mem_aln_t a;
if (ar.a[i].secondary >= 0)
continue; // skip secondary alignments
a = mem_reg2aln(memopt, idx->bns, idx->pac, seq.length(), seq.c_str(), &ar.a[i]); // get forward-strand position and CIGAR
if (ar.a[i].secondary >= 0 && !keep_secondary)
continue; // skip secondary alignments

// get forward-strand position and CIGAR
a = mem_reg2aln(memopt, idx->bns, idx->pac, seq.length(), seq.c_str(), &ar.a[i]);

// instantiate the read
BamRead b;
b.SetQname("testing123");
b.SetSequence(seq);
b.SetMapQuality(a.mapq);
//std::cout << b.toSam(h) << std::endl;
#ifdef DEBUG_BWATOOLS
b.init();

b.b->core.tid = a.rid;
b.b->core.pos = a.pos;
b.b->core.qual = a.mapq;
b.b->core.flag = a.flag;
b.b->core.n_cigar = a.n_cigar;

// allocate all the data
b.b->core.l_qname = name.length() + 1;
b.b->core.l_qseq = seq.length(); //(seq.length()>>1) + seq.length() % 2; // 4-bit encoding
b.b->l_data = b.b->core.l_qname + (a.n_cigar<<2) + ((b.b->core.l_qseq+1)>>1) + (b.b->core.l_qseq);
b.b.get()->data = (uint8_t*)malloc(b.b.get()->l_data);

// allocate the qname
memcpy(b.b->data, name.c_str(), name.length() + 1);

// allocate the cigar
memcpy(b.b->data + b.b->core.l_qname, (uint8_t*)a.cigar, a.n_cigar<<2);

// allocate the sequence
uint8_t* m_bases = b.b->data + b.b->core.l_qname + (b.b->core.n_cigar<<2);

// move this out of bigger loop
for (int i = 0; i < seq.length(); ++i) {

// bad idea but works for now
uint8_t base = 15;
if (seq.at(i) == 'A')
base = 1;
else if (seq.at(i) == 'C')
base = 2;
else if (seq.at(i) == 'G')
base = 4;
else if (seq.at(i) == 'T')
base = 8;

m_bases[i >> 1] &= ~(0xF << ((~i & 1) << 2)); ///< zero out previous 4-bit base encoding
m_bases[i >> 1] |= base << ((~i & 1) << 2); ///< insert new 4-bit base encoding
}

// allocate the quality


//b.SetQname(name);
//b.SetSequence(seq);

b.AddIntTag("NA", ar.n); // number of matches
b.AddIntTag("NM", a.NM);

vec.push_back(b);

//#ifdef DEBUG_BWATOOLS
// print alignment
printf("\t%c\t%s\t%ld\t%d\t", /*ks->name.s,*/ "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
for (int k = 0; k < a.n_cigar; ++k) // print CIGAR
printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
printf("\t%d\n", a.NM); // print edit distance
#endif
//printf("\t%c\t%s\t%ld\t%d\t", /*ks->name.s,*/ "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq);
//for (int k = 0; k < a.n_cigar; ++k) // print CIGAR
// printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]);
// printf("\t%d\n", a.NM); // print edit distance
//#endif
free(a.cigar); // don't forget to deallocate CIGAR
}
free (ar.a); // dealloc the hit list
Expand Down Expand Up @@ -182,18 +235,19 @@ uint8_t* BWAWrapper::__make_pac(const USeqVector& v, bool for_only, bool write_f

// make the ref name kstring
kstring_t * name = (kstring_t*)malloc(1 * sizeof(kstring_t));
name->l = v[k].name.length();
name->m = v[k].name.length() + 2;
name->s = (char*)calloc(v[k].name.length(), sizeof(char));
strncpy(name->s, v[k].name.c_str(), v[k].name.length());
name->l = v[k].name.length() + 1;
name->m = v[k].name.length() + 3;
name->s = (char*)calloc(name->m, sizeof(char));
memcpy(name->s, v[k].name.c_str(), v[k].name.length()+1);

// make the sequence kstring
kstring_t * t = (kstring_t*)malloc(sizeof(kstring_t));
t->l = v[k].seq.length();
t->m = v[k].seq.length() + 2;
t->s = (char*)calloc(v[k].seq.length(), sizeof(char));
strncpy(t->s, v[k].seq.c_str(), v[k].seq.length());

//t->s = (char*)calloc(v[k].seq.length(), sizeof(char));
t->s = (char*)malloc(t->m);
memcpy(t->s, v[k].seq.c_str(), v[k].seq.length());

// put into a kstring
kseq_t *ks = (kseq_t*)calloc(1, sizeof(kseq_t));
ks->seq = *t;
Expand Down
31 changes: 16 additions & 15 deletions src/BamRead.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,15 @@

namespace SnowTools {

BamRead::BamRead() {
b = bam_init1();
}

BamRead::~BamRead() {
bam_destroy1(b);
struct free_delete {
void operator()(void* x) { bam_destroy1((bam1_t*)x); }
};

void BamRead::init() {
bam1_t* f = bam_init1();
b = std::shared_ptr<bam1_t>(f, free_delete());
}

void BamRead::SetQname(std::string n)
{
// copy out the non-qname data
Expand All @@ -33,17 +34,17 @@ namespace SnowTools {
free(nonq);
}

void BamRead::SetSequence(std::string s)
/*void BamRead::SetSequence(std::string s)
{
// change the size to accomodate new sequence. Clear the quality string
std::cout << "osize " << b->l_data << " calcsize " << (b->core.l_qseq + b->core.l_qname + (b->core.n_cigar<<2)) << std::endl;
//std::cout << "osize " << b->l_data << " calcsize " << (b->core.l_qseq + b->core.l_qname + (b->core.n_cigar<<2)) << std::endl;
b->data = (uint8_t*)realloc(b->data, b->core.l_qname + s.length() + (b->core.n_cigar<<2));
// copy in the new sequence
memcpy(b->data + b->core.l_qname + (b->core.n_cigar<<2), (uint8_t*)s.c_str(), s.length());
}
}*/

/* std::string BamRead::toSam(bam_hdr_t* h) const
{
Expand All @@ -57,12 +58,12 @@ namespace SnowTools {
//kstring_t *str;
//sam_format1(hdr, b, str);
//out << str->s;
return out;

out << r.QnameChar() << "\t" << r.AlignmentFlag() << "\t"
<< r.ChrID() << "\t"
<< r.InsertSize() << "\t" << r.AlignmentFlag() << "\t"
<< r.MapQuality() << "\t" << r.Sequence();
out << bam_get_qname(r.b) << "\t" << r.b->core.flag
<< "\t" << r.b->core.tid << "\t" << r.b->core.pos
<< "\t" << r.b->core.qual << "\t" << r.CigarString()
<< "\t" << "=" << "\t" << 0
<< "\t" << r.Sequence() << "\t" << "*";
return out;


Expand Down
2 changes: 1 addition & 1 deletion src/GenomicRegion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,7 @@ void GenomicRegion::random() {
// TODO slow.
bool found = false;
for (int i = 0; i < h->n_targets; ++i)
if (strcmp(tchr.data(), h->target_name[i]))
if (strcmp(tchr.c_str(), h->target_name[i]) == 0)
{
chr = i;
found = true;
Expand Down
3 changes: 2 additions & 1 deletion src/GenomicRegionCollection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ void GenomicRegionCollection<T>::readBEDfile(const std::string & file, int pad,

// construct the GenomicRegion
T gr(chr, pos1, pos2, h);

if (gr.valid()) {
gr.pad(pad);
m_grv.push_back(gr);
Expand Down Expand Up @@ -387,7 +388,7 @@ GenomicRegionCollection<GenomicRegion> GenomicRegionCollection<T>::findOverlaps(
#ifdef DEBUG_OVERLAPS
std::cerr << "find overlaps hit " << j.start << " " << j.stop << " -- " << j.value << std::endl;
#endif
output.add(T(m_grv[i].chr, std::max((uint32_t)j.start, m_grv[i].pos1), std::min((uint32_t)j.stop, m_grv[i].pos2)));
output.add(T(m_grv[i].chr, std::max(j.start, m_grv[i].pos1), std::min(j.stop, m_grv[i].pos2)));
}
}
}
Expand Down
Loading

0 comments on commit 1e08c0d

Please sign in to comment.