Skip to content

Commit

Permalink
*** empty log message ***
Browse files Browse the repository at this point in the history
  • Loading branch information
langmead committed May 5, 2009
1 parent 8419e77 commit 2d5153c
Show file tree
Hide file tree
Showing 6 changed files with 356 additions and 25 deletions.
16 changes: 3 additions & 13 deletions pat.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,24 +6,14 @@
using namespace std;
using namespace seqan;

/**
 * Print a diagnostic to stderr explaining that a negative quality value
 * was seen, which is impossible on the Phred scale, and suggesting the
 * --solexa-quals option.  This function only prints; it does not
 * terminate the program.
 */
void wrongQualityScale() {
	// The final line is newline-terminated so the message does not run
	// into whatever is written to stderr next.
	std::cerr << "Encountered negative quality value, but Phred qualities can't be negative." << std::endl
	          << "These qualities appear to use the Solexa scale." << std::endl
	          << "Please re-run Bowtie with the --solexa-quals option." << std::endl;
}

void wrongQualityFormat() {
cerr << "Encounterd space-separated qualities"<<endl
<< "This appears to be an FASTQ-int file" << endl
<< "Please re-run Bowtie with the --integer-quals option.";
<< "This appears to be an FASTQ-int file" << endl
<< "Please re-run Bowtie with the --integer-quals option.";
}

void tooFewQualities(const String<char>& read_name) {
string s;
for(size_t i = 0; i < seqan::length(read_name); i++) {
s.push_back(read_name[i]);
}
cerr << "Too few quality values for read: " << s << endl
cerr << "Too few quality values for read: " << read_name << endl
<< "\tare you sure this is a FASTQ-int file?" << endl;
}

Expand Down
10 changes: 1 addition & 9 deletions pat.h
Original file line number Diff line number Diff line change
Expand Up @@ -1582,7 +1582,6 @@ class FastaPatternSource : public BufferedFilePatternSource {
int policy_;
};

extern void wrongQualityScale();
extern void wrongQualityFormat();
extern void tooFewQualities(const String<char>& read_name);

Expand Down Expand Up @@ -2031,13 +2030,7 @@ class FastqPatternSource : public BufferedFilePatternSource {
solQuals_(solexa_quals),
phred64Quals_(phred64Quals),
intQuals_(integer_quals)
{
for (int l = 0; l != 128; ++l) {
table_[l] = (int)(10.0 * log(1.0 + pow(10.0, (l - 64) / 10.0)) / log(10.0) + .499);
if (table_[l] >= 63) table_[l] = 63;
if (table_[l] == 0) table_[l] = 1;
}
}
{ }
virtual void reset() {
first_ = true;
BufferedFilePatternSource::reset();
Expand Down Expand Up @@ -2302,7 +2295,6 @@ class FastqPatternSource : public BufferedFilePatternSource {
bool phred64Quals_;
bool intQuals_;
int policy_;
int table_[128];
};

/**
Expand Down
34 changes: 34 additions & 0 deletions qual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,3 +30,37 @@ unsigned char qualRounds[] = {
30, 30, 30, 30, 30, 30, 30, 30, 30, 30, // 245 - 254
30 // 255
};

/**
 * Lookup table for converting from Solexa-scaled (log-odds) quality
 * values to Phred-scaled quality values.
 *
 * Entry i holds the Phred value for Solexa quality (i - 10), so the
 * table spans Solexa qualities -10 through 255 (266 entries); the
 * leading comment on each row gives the Solexa quality of that row's
 * first entry.  From Solexa quality 10 upward the table maps each value
 * to itself.
 */
unsigned char solToPhred[] = {
/* -10 */ 0, 1, 1, 1, 1, 1, 1, 2, 2, 3,
/* 0 */ 3, 4, 4, 5, 5, 6, 7, 8, 9, 10,
/* 10 */ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
/* 20 */ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
/* 30 */ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
/* 40 */ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
/* 50 */ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
/* 60 */ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
/* 70 */ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
/* 80 */ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
/* 90 */ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
/* 100 */ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
/* 110 */ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
/* 120 */ 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
/* 130 */ 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
/* 140 */ 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
/* 150 */ 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
/* 160 */ 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
/* 170 */ 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
/* 180 */ 180, 181, 182, 183, 184, 185, 186, 187, 188, 189,
/* 190 */ 190, 191, 192, 193, 194, 195, 196, 197, 198, 199,
/* 200 */ 200, 201, 202, 203, 204, 205, 206, 207, 208, 209,
/* 210 */ 210, 211, 212, 213, 214, 215, 216, 217, 218, 219,
/* 220 */ 220, 221, 222, 223, 224, 225, 226, 227, 228, 229,
/* 230 */ 230, 231, 232, 233, 234, 235, 236, 237, 238, 239,
/* 240 */ 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
/* 250 */ 250, 251, 252, 253, 254, 255
};
22 changes: 19 additions & 3 deletions qual.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,29 @@
#define QUAL_H_

extern unsigned char qualRounds[];
extern unsigned char solToPhred[];

/// Decode a Phred+33 ASCII quality character into its numeric Phred
/// quality.  The character is interpreted as an unsigned byte; bytes
/// below 33 decode to 0.
static inline uint8_t phredCharToPhredQual(char c) {
	const uint8_t raw = static_cast<uint8_t>(c);
	if(raw < 33) {
		return 0;
	}
	return static_cast<uint8_t>(raw - 33);
}

/**
 * Convert a Solexa-scaled quality value into a Phred-scale quality
 * value via the solToPhred lookup table.
 *
 * p = probability that base is miscalled
 * Qphred = -10 * log10 (p)
 * Qsolexa = -10 * log10 (p / (1 - p))
 * See: http://en.wikipedia.org/wiki/FASTQ_format
 *
 * @param sol Solexa-scaled quality; any int is accepted.
 * @return    0 for sol < -10; otherwise solToPhred[sol + 10], with sol
 *            clamped to 255 (the table's last entry) so that the lookup
 *            never reads past the end of the table.
 */
static inline uint8_t solexaToPhred(int sol) {
	// Flags out-of-range callers; assert_lt is a project macro that is
	// presumably compiled out of release builds (TODO confirm), so the
	// clamp below is what actually guarantees memory safety.
	assert_lt(sol, 256);
	if(sol < -10) return 0;
	// solToPhred is indexed by (sol + 10) and ends at sol == 255.
	if(sol > 255) sol = 255;
	return solToPhred[sol+10];
}

class SimplePhredPenalty {
public:
static uint8_t mmPenalty (uint8_t qual) {
Expand Down Expand Up @@ -71,8 +88,7 @@ inline static char charToPhred33(char c, bool solQuals, bool phred64Quals) {
if (solQuals) {
// Convert solexa-scaled chars to phred
// http://maq.sourceforge.net/fastq.shtml
int pQ = (int)(10.0 * log(1.0 + pow(10.0, ((int)c - 64) / 10.0)) / log(10.0) + .499) + 33;
char cc = (char)(pQ);
char cc = solexaToPhred((int)c - 64) + 33;
if (cc < 33) {
cerr << "Saw ASCII character "
<< ((int)c)
Expand Down Expand Up @@ -119,7 +135,7 @@ inline static char intToPhred33(int iQ, bool solQuals) {
// Convert from solexa quality to phred
// quality and translate to ASCII
// http://maq.sourceforge.net/qual.shtml
pQ = (int)(10.0 * log(1.0 + pow(10.0, (iQ) / 10.0)) / log(10.0) + .499) + 33;
pQ = solexaToPhred((int)iQ) + 33;
} else {
// Keep the phred quality and translate
// to ASCII
Expand Down
238 changes: 238 additions & 0 deletions scripts/gen_occ_lookup.pl
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
#!/usr/bin/perl -w

#
# Generate lookup table that, given a packed DNA byte (four bases) and
# a character (A, C, G or T), returns how many times that character
# occurs in that packed byte. Useful for quickly counting character
# occurrences in long strings. The LUT is indexed first by character
# (0-3) then by byte (0-255).
#
# Larger lookup tables are also possible, though they seem
# counterproductive. E.g., looking up eight bases at a time yields a
# 256K LUT, which doesn't fit in L1. A four-base LUT is 1KB, easily
# fitting in L1.
#
# See ebwt.h.
#

# Catch undeclared variables at compile time; the rest of the script
# declares everything it uses with "my".
use strict;

# Per-byte occurrence counts, one array per (base, number of low bit pairs
# inspected).  "my" binds tighter than the comma operator, so each list must
# be parenthesised for every array in it to be lexical; the arrays start
# out empty without explicit assignment.
my (@as4, @as3, @as2, @as1);
my (@cs4, @cs3, @cs2, @cs1);
my (@gs4, @gs3, @gs2, @gs1);
my (@ts4, @ts3, @ts2, @ts1);

# Compile character arrays: for every possible packed byte, record how many
# times each base code (0=A, 1=C, 2=G, 3=T) occurs among its lowest 4, 3, 2
# and 1 bit pairs.
my $i;	# declared here because the printing loops further down reuse it
foreach my $byte (0 .. 255) {
	# Split the byte into its four 2-bit codes, least significant pair first
	my @codes = map { ($byte >> (2 * $_)) & 3 } 0 .. 3;

	# Each row: number of low bit pairs inspected, followed by the A, C, G
	# and T arrays that receive the counts for that width
	foreach my $row ([4, \@as4, \@cs4, \@gs4, \@ts4],
	                 [3, \@as3, \@cs3, \@gs3, \@ts3],
	                 [2, \@as2, \@cs2, \@gs2, \@ts2],
	                 [1, \@as1, \@cs1, \@gs1, \@ts1]) {
		my ($width, @dest) = @$row;
		foreach my $base (0 .. 3) {
			# grep in scalar context yields the number of matching pairs
			my $hits = grep { $_ == $base } @codes[0 .. $width - 1];
			push @{$dest[$base]}, $hits;
		}
	}
}

# Number of table entries printed per output line
my $entsPerLine = 16;

# Preamble of the generated C source.  The banner names this script
# (gen_occ_lookup.pl) so the generated file can be traced to its generator.
print "#include <stdint.h>\n\n";
print "/* Generated by gen_occ_lookup.pl */\n\n";

# Count occurrences in all 4 bit pairs

print "uint8_t cCntLUT_4[4][4][256] = {\n";
print "\t/* All 4 bit pairs */ {\n";

# One sub-array per base.  Each tuple is (label, counts, closing text); the
# T row's closing text also closes the enclosing "all 4 bit pairs" group.
foreach my $row (["As", \@as4, "\t\t},\n"],
                 ["Cs", \@cs4, "\t\t},\n"],
                 ["Gs", \@gs4, "\t\t},\n"],
                 ["Ts", \@ts4, "\t\t}\n\t},\n"]) {
	my ($label, $counts, $closer) = @$row;
	print "\t\t/* $label */ {\n";
	foreach my $idx (0 .. 255) {
		my $col = $idx % $entsPerLine;
		print "\t\t\t" if $col == 0;                  # indent at start of line
		print "$counts->[$idx], ";
		print "\n" if $col == $entsPerLine - 1;       # wrap after a full line
	}
	print $closer;
}

# Count occurrences in low 1 bit pair

print "\t/* Least significant 1 bit pair */ {\n";

# One sub-array per base.  Each tuple is (label, counts, closing text); the
# T row's closing text also closes the enclosing "1 bit pair" group.
foreach my $row (["As", \@as1, "\t\t},\n"],
                 ["Cs", \@cs1, "\t\t},\n"],
                 ["Gs", \@gs1, "\t\t},\n"],
                 ["Ts", \@ts1, "\t\t}\n\t},\n"]) {
	my ($label, $counts, $closer) = @$row;
	print "\t\t/* $label */ {\n";
	foreach my $idx (0 .. 255) {
		my $col = $idx % $entsPerLine;
		print "\t\t\t" if $col == 0;                  # indent at start of line
		print "$counts->[$idx], ";
		print "\n" if $col == $entsPerLine - 1;       # wrap after a full line
	}
	print $closer;
}

# Count occurrences in low 2 bit pairs

print "\t/* Least significant 2 bit pairs */ {\n";

# One sub-array per base.  Each tuple is (label, counts, closing text); the
# T row's closing text also closes the enclosing "2 bit pairs" group.
foreach my $row (["As", \@as2, "\t\t},\n"],
                 ["Cs", \@cs2, "\t\t},\n"],
                 ["Gs", \@gs2, "\t\t},\n"],
                 ["Ts", \@ts2, "\t\t}\n\t},\n"]) {
	my ($label, $counts, $closer) = @$row;
	print "\t\t/* $label */ {\n";
	foreach my $idx (0 .. 255) {
		my $col = $idx % $entsPerLine;
		print "\t\t\t" if $col == 0;                  # indent at start of line
		print "$counts->[$idx], ";
		print "\n" if $col == $entsPerLine - 1;       # wrap after a full line
	}
	print $closer;
}

# Count occurrences in low 3 bit pairs

print "\t/* Least significant 3 bit pairs */ {\n";

# One sub-array per base.  Each tuple is (label, counts, closing text); this
# is the last group, so the T row's closing text ends it without a trailing
# comma.
foreach my $row (["As", \@as3, "\t\t},\n"],
                 ["Cs", \@cs3, "\t\t},\n"],
                 ["Gs", \@gs3, "\t\t},\n"],
                 ["Ts", \@ts3, "\t\t}\n\t}\n"]) {
	my ($label, $counts, $closer) = @$row;
	print "\t\t/* $label */ {\n";
	foreach my $idx (0 .. 255) {
		my $col = $idx % $entsPerLine;
		print "\t\t\t" if $col == 0;                  # indent at start of line
		print "$counts->[$idx], ";
		print "\n" if $col == $entsPerLine - 1;       # wrap after a full line
	}
	print $closer;
}

# Close the outermost cCntLUT_4 initializer
print "};\n";
Loading

0 comments on commit 2d5153c

Please sign in to comment.