Skip to content

Commit

Permalink
Add support for interleaved fastq
Browse files: browse the repository at this point in the history
  • Loading branch information
ch4rr0 committed Jun 5, 2017
1 parent fcb1579 commit 97dd710
Show file tree
Hide file tree
Showing 7 changed files with 8,053 additions and 10 deletions.
14 changes: 12 additions & 2 deletions ebwt_search.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -338,7 +338,8 @@ enum {
ARG_QUALS2,
ARG_ALLOW_CONTAIN,
ARG_COLOR_PRIMER,
ARG_WRAPPER
ARG_WRAPPER,
ARG_INTERLEAVED_FASTQ,
};

static struct option long_options[] = {
Expand Down Expand Up @@ -444,6 +445,7 @@ static struct option long_options[] = {
{(char*)"allow-contain",no_argument, 0, ARG_ALLOW_CONTAIN},
{(char*)"col-primer", no_argument, 0, ARG_COLOR_PRIMER},
{(char*)"wrapper", required_argument, 0, ARG_WRAPPER},
{(char*)"interleaved", required_argument, 0, ARG_INTERLEAVED_FASTQ},
{(char*)0, 0, 0, 0} // terminator
};

Expand All @@ -461,14 +463,15 @@ static void printUsage(ostream& out) {
}

out << "Usage: " << endl
<< tool_name << " [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | <s>} [<hit>]" << endl
<< tool_name << " [options]* <ebwt> {-1 <m1> -2 <m2> | --12 <r> | --interleaved <i> | <s>} [<hit>]" << endl
<< endl
<< " <m1> Comma-separated list of files containing upstream mates (or the" << endl
<< " sequences themselves, if -c is set) paired with mates in <m2>" << endl
<< " <m2> Comma-separated list of files containing downstream mates (or the" << endl
<< " sequences themselves if -c is set) paired with mates in <m1>" << endl
<< " <r> Comma-separated list of files containing Crossbow-style reads. Can be" << endl
<< " a mixture of paired and unpaired. Specify \"-\" for stdin." << endl
<< " <i> Files with interleaved paired-end FASTQ reads." << endl
<< " <s> Comma-separated list of files containing unpaired reads, or the" << endl
<< " sequences themselves, if -c is set. Specify \"-\" for stdin." << endl
<< " <hit> File to write hits to (default: stdout)" << endl
Expand Down Expand Up @@ -647,6 +650,7 @@ static void parseOptions(int argc, const char **argv) {
case '1': tokenize(optarg, ",", mates1); break;
case '2': tokenize(optarg, ",", mates2); break;
case ARG_ONETWO: tokenize(optarg, ",", mates12); format = TAB_MATE; break;
case ARG_INTERLEAVED_FASTQ: tokenize(optarg, ",", mates12); format = INTERLEAVED; break;
case 'f': format = FASTA; break;
case 'F': {
format = FASTA_CONT;
Expand Down Expand Up @@ -2592,6 +2596,12 @@ patsrcFromStrings(int format,
trim3, trim5,
solexaQuals, phred64Quals,
integerQuals);
case INTERLEAVED:
return new FastqPatternSource (reads, color,
patDumpfile,
trim3, trim5,
solexaQuals, phred64Quals,
integerQuals, true /* is interleaved */);
case TAB_MATE:
return new TabbedPatternSource(reads, false, color,
patDumpfile,
Expand Down
1 change: 1 addition & 0 deletions formats.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ enum file_format {
FASTA = 1,
FASTA_CONT,
FASTQ,
INTERLEAVED,
TAB_MATE,
RAW,
CMDLINE,
Expand Down
1 change: 1 addition & 0 deletions hit.h
Original file line number Diff line number Diff line change
Expand Up @@ -882,6 +882,7 @@ class HitSinkPerThread {
hitsForThisRead_(),
_max(max),
_n(n),
defaultMapq_(defaultMapq),
threadId_(threadId)
{
sink.addWrapper();
Expand Down
32 changes: 24 additions & 8 deletions pat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -906,7 +906,7 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
bool batch_a)
{
int c = 0;
vector<Read>& readBuf = batch_a ? pt.bufa_ : pt.bufb_;
vector<Read>* readBuf = batch_a ? &pt.bufa_ : &pt.bufb_;
if(first_) {
c = getc_wrapper();
while(c == '\r' || c == '\n') {
Expand All @@ -917,15 +917,15 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
throw 1;
}
first_ = false;
readBuf[0].readOrigBuf[0] = c;
readBuf[0].readOrigBufLen = 1;
(*readBuf)[0].readOrigBuf[0] = c;
(*readBuf)[0].readOrigBufLen = 1;
}
bool done = false, aborted = false;
size_t readi = 0;
// Read until we run out of input or until we've filled the buffer
for(; readi < pt.max_buf_ && !done; readi++) {
char* buf = readBuf[readi].readOrigBuf;
assert(readi == 0 || readBuf[readi].readOrigBufLen == 0);
while (readi < pt.max_buf_ && !done) {
char* buf = (*readBuf)[readi].readOrigBuf;
assert(readi == 0 || (*readBuf)[readi].readOrigBufLen == 0);
int newlines = 4;
while(newlines) {
c = getc_wrapper();
Expand All @@ -936,10 +936,26 @@ pair<bool, int> FastqPatternSource::nextBatchFromFile(
newlines--;
c = '\n';
} else if(done) {
aborted = true; // Unexpected EOF
if (newlines == 4) {
newlines = 0;
} else {
aborted = true; // Unexpected EOF
}
break;
}
buf[readBuf[readi].readOrigBufLen++] = c;
buf[(*readBuf)[readi].readOrigBufLen++] = c;
}
if (c > 0) {
if (interleaved_) {
// alternate between read buffers
batch_a = !batch_a;
readBuf = batch_a ? &pt.bufa_ : &pt.bufb_;
// increment read counter after each pair gets read
readi = batch_a ? readi + 1 : readi;
}
else {
readi++;
}
}
}
if(aborted) {
Expand Down
3 changes: 3 additions & 0 deletions pat.h
Original file line number Diff line number Diff line change
Expand Up @@ -934,6 +934,7 @@ class FastqPatternSource : public CFilePatternSource {
bool solexa_quals = false,
bool phred64Quals = false,
bool integer_quals = false,
bool interleaved = false,
uint32_t skip = 0) :
CFilePatternSource(
infiles,
Expand All @@ -945,6 +946,7 @@ class FastqPatternSource : public CFilePatternSource {
solQuals_(solexa_quals),
phred64Quals_(phred64Quals),
intQuals_(integer_quals),
interleaved_(interleaved),
color_(color) { }

virtual void reset() {
Expand Down Expand Up @@ -988,6 +990,7 @@ class FastqPatternSource : public CFilePatternSource {
bool solQuals_;
bool phred64Quals_;
bool intQuals_;
bool interleaved_;
bool color_;
};

Expand Down
Loading

0 comments on commit 97dd710

Please sign in to comment.