From 1d4c9c3c5484e6d012c15e6317daab3f53ac679d Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sun, 2 Nov 2014 01:35:53 -0700 Subject: [PATCH 01/14] got most of the code for BGZF compliance entered, not working yet --- pigz.c | 262 +++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 178 insertions(+), 84 deletions(-) diff --git a/pigz.c b/pigz.c index df25cf3..7fae6fc 100644 --- a/pigz.c +++ b/pigz.c @@ -294,6 +294,12 @@ the first set of compressions are being performed. The number of output buffers is not directly limited, but is indirectly limited by the release of input buffers to about the same number. + + TODO add BGZF Block Compress description and settings. + FNAME, FCOMMENT only in first block? + checksum may need to be tied to block, not continuous... + blocksize may need to be decreased somewhat accounting + for BGZF_BLOCK_SIZE is not a multiple of 128k... */ /* use large file functions if available */ @@ -450,6 +456,23 @@ /* input buffer size */ #define BUF 32768U +/* BGZF constants */ +#define BGZF_MAX_BLOCK_SIZE 0x10000 +#define BGZF_BLOCK_SIZE 0x0ff00 // make sure compressed_bytes(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE +#define BGZF_HEADER_SIZE 18 +#define BGZF_FOOTER_SIZE 8 +/* BGZF/GZIP header (speciallized from RFC 1952; little endian): + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + | 31|139| 8| 4| 0| 0|255|XLEN=6 | 66| 67| 2|BLK_LEN| + +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+ + ... compressed_data[ BLK_LEN - XLEN - 19 ] + +---+---+---+---+---+---+---+---+ + | CRC32 | ISIZE | + +---+---+---+---+---+---+---+---+ + */ +/* bgzf_eof is the last block signifying EOF */ +local const uint8_t bgzf_eof[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; + /* globals (modified by main thread only when it's the only thread) */ local struct { char *prog; /* name by which pigz was invoked */ @@ -463,6 +486,7 @@ local struct { int keep; /* true to prevent deletion of input file */ int force; /* true to overwrite, compress links, cat */ int form; /* gzip = 0, zlib = 1, zip = 2 or 3 */ + int bgzf; /* true to include BGZF BlockCompress compliance */ unsigned char magic1; /* first byte of possible header when decoding */ int recurse; /* true to dive down into directory structure */ char *sufx; /* suffix to use (".gz" or user supplied) */ @@ -863,6 +887,81 @@ local unsigned long time2dos(time_t t) #define PUT4L(a,b) (PUT2L(a,(b)&0xffff),PUT2L((a)+2,(b)>>16)) #define PUT4M(a,b) (*(a)=(b)>>24,(a)[1]=(b)>>16,(a)[2]=(b)>>8,(a)[3]=(b)) +/* write a gzip, zlib, or zip trailer */ +local void put_trailer(unsigned long ulen, unsigned long clen, + unsigned long check, unsigned long head) +{ + unsigned char tail[46]; + + if (g.form > 1) { /* zip */ + unsigned long cent; + + /* write data descriptor (as promised in local header) */ + PUT4L(tail, 0x08074b50UL); + PUT4L(tail + 4, check); + PUT4L(tail + 8, clen); + PUT4L(tail + 12, ulen); + writen(g.outd, tail, 16); + + /* write central file header */ + PUT4L(tail, 0x02014b50UL); /* central header signature */ + tail[4] = 63; /* obeyed version 6.3 of the zip spec */ + tail[5] = 255; /* ignore external attributes */ + PUT2L(tail + 6, 20); /* version needed to extract (2.0) */ + PUT2L(tail + 8, 8); /* data descriptor is present */ + PUT2L(tail + 10, 8); /* deflate */ + PUT4L(tail + 12, time2dos(g.mtime)); + PUT4L(tail + 16, check); /* crc */ + PUT4L(tail + 20, clen); /* compressed length */ + PUT4L(tail + 24, ulen); /* uncompressed length */ + PUT2L(tail + 28, g.name == NULL ? 1 : /* length of name */ + strlen(g.name)); + PUT2L(tail + 30, 9); /* length of extra field (see below) */ + PUT2L(tail + 32, 0); /* no file comment */ + PUT2L(tail + 34, 0); /* disk number 0 */ + PUT2L(tail + 36, 0); /* internal file attributes */ + PUT4L(tail + 38, 0); /* external file attributes (ignored) */ + PUT4L(tail + 42, 0); /* offset of local header */ + writen(g.outd, tail, 46); /* write central file header */ + cent = 46; + + /* write file name (use "-" for stdin) */ + if (g.name == NULL) + writen(g.outd, (unsigned char *)"-", 1); + else + writen(g.outd, (unsigned char *)g.name, strlen(g.name)); + cent += g.name == NULL ? 1 : strlen(g.name); + + /* write extended timestamp extra field block (9 bytes) */ + PUT2L(tail, 0x5455); /* extended timestamp signature */ + PUT2L(tail + 2, 5); /* number of data bytes in this block */ + tail[4] = 1; /* flag presence of mod time */ + PUT4L(tail + 5, g.mtime); /* mod time */ + writen(g.outd, tail, 9); /* write extra field block */ + cent += 9; + + /* write end of central directory record */ + PUT4L(tail, 0x06054b50UL); /* end of central directory signature */ + PUT2L(tail + 4, 0); /* number of this disk */ + PUT2L(tail + 6, 0); /* disk with start of central directory */ + PUT2L(tail + 8, 1); /* number of entries on this disk */ + PUT2L(tail + 10, 1); /* total number of entries */ + PUT4L(tail + 12, cent); /* size of central directory */ + PUT4L(tail + 16, head + clen + 16); /* offset of central directory */ + PUT2L(tail + 20, 0); /* no zip file comment */ + writen(g.outd, tail, 22); /* write end of central directory record */ + } + else if (g.form) { /* zlib */ + PUT4M(tail, check); + writen(g.outd, tail, 4); + } + else { /* gzip */ + PUT4L(tail, check); + PUT4L(tail + 4, ulen); + writen(g.outd, tail, 8); + } +} + /* write a gzip, zlib, or zip header using the information in the globals */ local unsigned long put_header(void) { @@ -917,91 +1016,47 @@ local unsigned long put_header(void) head[3] = g.name != NULL ? 8 : 0; PUT4L(head + 4, g.mtime); head[8] = g.level >= 9 ? 2 : (g.level == 1 ? 4 : 0); - head[9] = 3; /* unix */ - writen(g.outd, head, 10); - len = 10; - if (g.name != NULL) - writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); - if (g.name != NULL) - len += strlen(g.name) + 1; + head[9] = 3; /* unix */ + if (g.bgzf) { + head[3] |= 4; /* include FEXTRA */ + head[9] = 255; /* unknown */ + PUT2L(head+10, 6); /* 6 byte XLEN */ + head[12] = 'B'; /* BC tag */ + head[13] = 'C'; + PUT2L(head+14, 2); /* 2 byte BSIZE field */ + len = 16; + writen(g.outd, head, len); + /* Leave 2 bytes (uint16_t) BSIZE unwritten for now */ + if (g.name) { + + /* write a special no contents header to include the name */ + PUT2L(head+len, len+2+strlen(g.name)+1 + BGZF_FOOTER_SIZE - 1); + writen(g.outd, head+len, 2); + len += 2; + + writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); + len += strlen(g.name) + 1; + g.name = NULL; /* but do not write the name again */ + + /* write a footer */ + put_trailer(0,0,0,0); + + /* now put another header, but now g.name is NULL */ + put_header(); + + } + } else { + writen(g.outd, head, 10); + len = 10; + } + if (g.name != NULL) { + writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); + len += strlen(g.name) + 1; + } } return len; } -/* write a gzip, zlib, or zip trailer */ -local void put_trailer(unsigned long ulen, unsigned long clen, - unsigned long check, unsigned long head) -{ - unsigned char tail[46]; - - if (g.form > 1) { /* zip */ - unsigned long cent; - - /* write data descriptor (as promised in local header) */ - PUT4L(tail, 0x08074b50UL); - PUT4L(tail + 4, check); - PUT4L(tail + 8, clen); - PUT4L(tail + 12, ulen); - writen(g.outd, tail, 16); - - /* write central file header */ - PUT4L(tail, 0x02014b50UL); /* central header signature */ - tail[4] = 63; /* obeyed version 6.3 of the zip spec */ - tail[5] = 255; /* ignore external attributes */ - PUT2L(tail + 6, 20); /* version needed to extract (2.0) */ - PUT2L(tail + 8, 8); /* data descriptor is present */ - PUT2L(tail + 10, 8); /* deflate */ - PUT4L(tail + 12, time2dos(g.mtime)); - PUT4L(tail + 16, check); /* crc */ - PUT4L(tail + 20, clen); /* compressed length */ - PUT4L(tail + 24, ulen); /* uncompressed length */ - PUT2L(tail + 28, g.name == NULL ? 1 : /* length of name */ - strlen(g.name)); - PUT2L(tail + 30, 9); /* length of extra field (see below) */ - PUT2L(tail + 32, 0); /* no file comment */ - PUT2L(tail + 34, 0); /* disk number 0 */ - PUT2L(tail + 36, 0); /* internal file attributes */ - PUT4L(tail + 38, 0); /* external file attributes (ignored) */ - PUT4L(tail + 42, 0); /* offset of local header */ - writen(g.outd, tail, 46); /* write central file header */ - cent = 46; - - /* write file name (use "-" for stdin) */ - if (g.name == NULL) - writen(g.outd, (unsigned char *)"-", 1); - else - writen(g.outd, (unsigned char *)g.name, strlen(g.name)); - cent += g.name == NULL ? 1 : strlen(g.name); - - /* write extended timestamp extra field block (9 bytes) */ - PUT2L(tail, 0x5455); /* extended timestamp signature */ - PUT2L(tail + 2, 5); /* number of data bytes in this block */ - tail[4] = 1; /* flag presence of mod time */ - PUT4L(tail + 5, g.mtime); /* mod time */ - writen(g.outd, tail, 9); /* write extra field block */ - cent += 9; - - /* write end of central directory record */ - PUT4L(tail, 0x06054b50UL); /* end of central directory signature */ - PUT2L(tail + 4, 0); /* number of this disk */ - PUT2L(tail + 6, 0); /* disk with start of central directory */ - PUT2L(tail + 8, 1); /* number of entries on this disk */ - PUT2L(tail + 10, 1); /* total number of entries */ - PUT4L(tail + 12, cent); /* size of central directory */ - PUT4L(tail + 16, head + clen + 16); /* offset of central directory */ - PUT2L(tail + 20, 0); /* no zip file comment */ - writen(g.outd, tail, 22); /* write end of central directory record */ - } - else if (g.form) { /* zlib */ - PUT4M(tail, check); - writen(g.outd, tail, 4); - } - else { /* gzip */ - PUT4L(tail, check); - PUT4L(tail + 4, ulen); - writen(g.outd, tail, 8); - } -} /* compute check value depending on format */ #define CHECK(a,b,c) (g.form == 1 ? adler32(a,b,c) : crc32(a,b,c)) @@ -1642,6 +1697,7 @@ local void write_thread(void *dummy) unsigned long ulen; /* total uncompressed size (overflow ok) */ unsigned long clen; /* total compressed size (overflow ok) */ unsigned long check; /* check value of uncompressed data */ + unsigned char bsize_buf[2]; /* for bgzf compliance */ (void)dummy; @@ -1654,7 +1710,7 @@ local void write_thread(void *dummy) check = CHECK(0L, Z_NULL, 0); seq = 0; do { - /* get next write job in order */ + /* get next write job in order */ possess(write_first); wait_for(write_first, TO_BE, seq); job = write_head; @@ -1670,6 +1726,27 @@ local void write_thread(void *dummy) /* write the compressed data and drop the output buffer */ Trace(("-- writing #%ld", seq)); + if (g.bgzf) { + if (seq > 0) { + /* write old footer */ + put_trailer(ulen, clen, check, head); + /* reset counting */ + ulen = clen = 0; + check = CHECK(0L, Z_NULL, 0); + /* Do not need to repeat name in the header after the first one. */ + g.name = NULL; + /* write new header */ + head = put_header(); + } + /* write 2-byte BSIZE field */ + head += 2; + assert( head >= BGZF_HEADER_SIZE); + assert(job->out->len + head + BGZF_FOOTER_SIZE <= BGZF_BLOCK_SIZE); + PUT2L(bsize_buf, (uint16_t) (job->out->len + head + BGZF_FOOTER_SIZE - 1)); + writen(g.outd, bsize_buf, 4); + fprintf(stderr, "Wrote BGZF entry for block %lld: %d ulen %d. %d %d\n", seq, (job->out->len + head + BGZF_FOOTER_SIZE - 1), len, bsize_buf[0], bsize_buf[1]); + + } writen(g.outd, job->out->buf, job->out->len); drop_space(job->out); Trace(("-- wrote #%ld%s", seq, more ? "" : " (last)")); @@ -1691,6 +1768,10 @@ local void write_thread(void *dummy) /* write trailer */ put_trailer(ulen, clen, check, head); + if (g.bgzf) { + /* write the final trailing EOF block */ + writen(g.outd, (unsigned char*) bgzf_eof, 28); + } /* verify no more jobs, prepare for next use */ possess(compress_have); @@ -2459,7 +2540,7 @@ local int get_header(int save) g.in_next--; return -2; } - +/* TODO */ /* it's gzip -- get method and flags */ method = GET(); flags = GET(); @@ -3617,6 +3698,7 @@ local char *helptext[] = { " -F --first Do iterations first, before block split for -11", " -h, --help Display a help screen and quit", " -i, --independent Compress blocks independently for damage recovery", +" -B, --bgzf Create BGZF compliant gzip, includes -b 64 -i", " -I, --iterations n Number of iterations for -11 optimization", " -k, --keep Do not delete original file after processing", " -K, --zip Compress to PKWare zip (.zip) single entry format", @@ -3713,6 +3795,7 @@ local void defaults(void) g.force = 0; /* don't overwrite, don't compress links */ g.recurse = 0; /* don't go into directories */ g.form = 0; /* use gzip format */ + g.bgzf = 0; /* do not create BGZF gzip */ } /* long options conversion to short options */ @@ -3725,7 +3808,7 @@ local char *longopts[][2] = { {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, {"rsyncable", "R"}, {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, {"test", "t"}, {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"}, - {"version", "V"}, {"zip", "K"}, {"zlib", "z"}}; + {"version", "V"}, {"zip", "K"}, {"zlib", "z"}, {"bgzf", "B"}}; #define NLOPTS (sizeof(longopts) / (sizeof(char *) << 1)) /* either new buffer size, new compression level, or new number of processes -- @@ -3844,6 +3927,7 @@ local int option(char *arg) case 'f': g.force = 1; break; case 'h': help(); break; case 'i': g.setdict = 0; break; + case 'B': g.bgzf = 1; break; case 'k': g.keep = 1; break; case 'l': g.list = 1; break; case 'n': g.headis &= ~1; break; @@ -4007,6 +4091,16 @@ int main(int argc, char **argv) if (done == 1 && g.pipeout && !g.decode && !g.list && g.form > 1) complain("warning: output will be concatenated zip files -- " "will not be able to extract"); + if (g.bgzf) { + if (g.decode) { + /* decompressing. make g.bgzf autodetected from input file */ + g.bgzf = 0; + } else { + /* compressing. Override settings on block size and force independent checksums */ + g.block = BGZF_BLOCK_SIZE; + g.setdict = 0; + } + } process(strcmp(argv[n], "-") ? argv[n] : NULL); done++; } From d4591b8080f062bef3feaad187f4dabae772f8fe Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sun, 2 Nov 2014 15:39:28 -0800 Subject: [PATCH 02/14] now --bgzf compresses gzip compliant files --- pigz.c | 80 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/pigz.c b/pigz.c index 7fae6fc..e3ca4cb 100644 --- a/pigz.c +++ b/pigz.c @@ -967,6 +967,7 @@ local unsigned long put_header(void) { unsigned long len; unsigned char head[30]; + unsigned int bsize; /* for bgzf */ if (g.form > 1) { /* zip */ /* write local header */ @@ -1027,22 +1028,28 @@ local unsigned long put_header(void) len = 16; writen(g.outd, head, len); /* Leave 2 bytes (uint16_t) BSIZE unwritten for now */ + if (g.name) { - /* write a special no contents header to include the name */ - PUT2L(head+len, len+2+strlen(g.name)+1 + BGZF_FOOTER_SIZE - 1); + /* write a special no contents block to include the FNAME field */ + + /* write 2-byte BSIZE entry now */ + bsize = len+2 + strlen(g.name)+1 + 2 + BGZF_FOOTER_SIZE - 1; + PUT2L(head+len, bsize); writen(g.outd, head+len, 2); + fprintf(stderr, "first bgzf block with name bsize: %d %d %d\n", bsize, head[len], head[len+1]); len += 2; + /* write name */ writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); - len += strlen(g.name) + 1; - g.name = NULL; /* but do not write the name again */ - - /* write a footer */ - put_trailer(0,0,0,0); - /* now put another header, but now g.name is NULL */ - put_header(); + /* empty zero-length Z_FINISH-ed compressed stream */ + head[len] = 3; head[len+1] = 0; + write(g.outd, head+len, 2); + + /* add the name size+1 */ + len += strlen(g.name) + 1; + g.name = NULL; /* but do not write the name ever again! */ } } else { @@ -1574,9 +1581,9 @@ local void compress_thread(void *dummy) /* run the last piece through deflate -- end on a byte boundary, using a sync marker if necessary, or finish the - deflate stream if this is the last block */ + deflate stream if this is the last block or a BGZF compliant gzip */ strm.avail_in = (unsigned)len; - if (left || job->more) { + if (g.bgzf == 0 && (left || job->more)) { #if ZLIB_VERNUM >= 0x1260 deflate_engine(&strm, job->out, Z_BLOCK); @@ -1596,8 +1603,10 @@ local void compress_thread(void *dummy) deflate_engine(&strm, job->out, Z_SYNC_FLUSH); #endif } - else + else { deflate_engine(&strm, job->out, Z_FINISH); + fprintf(stderr, "finishing block uc: %d c: %d\n", len, strm.total_out); + } } else { /* compress len bytes using zopfli, bring to byte boundary */ @@ -1721,32 +1730,34 @@ local void write_thread(void *dummy) more = job->more; len = job->in->len; drop_space(job->in); - ulen += (unsigned long)len; - clen += (unsigned long)(job->out->len); /* write the compressed data and drop the output buffer */ Trace(("-- writing #%ld", seq)); + + /* special logic for BGZF compliant gzip... finish the block and start a new one */ if (g.bgzf) { - if (seq > 0) { - /* write old footer */ - put_trailer(ulen, clen, check, head); - /* reset counting */ - ulen = clen = 0; - check = CHECK(0L, Z_NULL, 0); - /* Do not need to repeat name in the header after the first one. */ - g.name = NULL; - /* write new header */ - head = put_header(); - } + put_trailer(ulen, clen, check, head); + + /* reset counting */ + ulen = clen = 0; + check = CHECK(0L, Z_NULL, 0); + + /* write new header */ + head = put_header(); + /* write 2-byte BSIZE field */ head += 2; assert( head >= BGZF_HEADER_SIZE); assert(job->out->len + head + BGZF_FOOTER_SIZE <= BGZF_BLOCK_SIZE); PUT2L(bsize_buf, (uint16_t) (job->out->len + head + BGZF_FOOTER_SIZE - 1)); - writen(g.outd, bsize_buf, 4); - fprintf(stderr, "Wrote BGZF entry for block %lld: %d ulen %d. %d %d\n", seq, (job->out->len + head + BGZF_FOOTER_SIZE - 1), len, bsize_buf[0], bsize_buf[1]); + writen(g.outd, bsize_buf, 2); + fprintf(stderr, "Wrote BGZF entry for block %lld: %d ulen %lld. %d %d\n", seq, (job->out->len + head + BGZF_FOOTER_SIZE - 1), len, bsize_buf[0], bsize_buf[1]); } + + ulen += (unsigned long)len; + clen += (unsigned long)(job->out->len); + writen(g.outd, job->out->buf, job->out->len); drop_space(job->out); Trace(("-- wrote #%ld%s", seq, more ? "" : " (last)")); @@ -1768,6 +1779,7 @@ local void write_thread(void *dummy) /* write trailer */ put_trailer(ulen, clen, check, head); + if (g.bgzf) { /* write the final trailing EOF block */ writen(g.outd, (unsigned char*) bgzf_eof, 28); @@ -2160,11 +2172,12 @@ local void single_compress(int reset) got -= MAXP2; } - /* compress the remainder, emit a block, finish if end of input */ + /* compress the remainder, emit a block, finish if end of input + of if this is a BGZF compliant gzip */ strm->avail_in = (unsigned)got; got = left; check = CHECK(check, strm->next_in, strm->avail_in); - if (more || got) { + if (g.bgzf == 0 && (more || got)) { #if ZLIB_VERNUM >= 0x1260 int bits; @@ -2241,6 +2254,12 @@ local void single_compress(int reset) /* write trailer */ put_trailer(ulen, clen, check, head); + + if (g.bgzf) { + /* write the final trailing EOF block */ + writen(g.outd, (unsigned char*) bgzf_eof, 28); + } + } /* --- decompression --- */ @@ -3019,6 +3038,7 @@ local void infchk(void) /* decompress, compute lengths and check value */ strm.avail_in = g.in_left; strm.next_in = g.in_next; + fprintf(stderr, "checking complen %d\n", strm.avail_in); ret = inflateBack(&strm, inb, NULL, outb, NULL); if (ret != Z_STREAM_END) bail("corrupted input -- invalid deflate data: ", g.inf); @@ -3029,7 +3049,7 @@ local void infchk(void) /* compute compressed data length */ clen = g.in_tot - g.in_left; - + fprintf(stderr, "checked complen %lld, uncomlen %lld\n", clen, g.out_tot); /* read and check trailer */ if (g.form > 1) { /* zip local trailer (if any) */ if (g.form == 3) { /* data descriptor follows */ From 42b433536afbbcea6d1025839063f40f28888d81 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sun, 2 Nov 2014 16:18:57 -0800 Subject: [PATCH 03/14] reworked indent to be 4 spaces, not tab removed print statements put put_trailer back in original spot --- pigz.c | 348 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 169 insertions(+), 179 deletions(-) diff --git a/pigz.c b/pigz.c index e3ca4cb..e1f658f 100644 --- a/pigz.c +++ b/pigz.c @@ -296,10 +296,9 @@ input buffers to about the same number. TODO add BGZF Block Compress description and settings. - FNAME, FCOMMENT only in first block? - checksum may need to be tied to block, not continuous... - blocksize may need to be decreased somewhat accounting - for BGZF_BLOCK_SIZE is not a multiple of 128k... + FNAME only in first + checksum may needs to be tied to block, not rolling + blocksize needs to be 65280 to ensure header + compressed + trailer < 65k */ /* use large file functions if available */ @@ -486,7 +485,7 @@ local struct { int keep; /* true to prevent deletion of input file */ int force; /* true to overwrite, compress links, cat */ int form; /* gzip = 0, zlib = 1, zip = 2 or 3 */ - int bgzf; /* true to include BGZF BlockCompress compliance */ + int bgzf; /* true to include BGZF BlockCompress compliance */ unsigned char magic1; /* first byte of possible header when decoding */ int recurse; /* true to dive down into directory structure */ char *sufx; /* suffix to use (".gz" or user supplied) */ @@ -887,87 +886,12 @@ local unsigned long time2dos(time_t t) #define PUT4L(a,b) (PUT2L(a,(b)&0xffff),PUT2L((a)+2,(b)>>16)) #define PUT4M(a,b) (*(a)=(b)>>24,(a)[1]=(b)>>16,(a)[2]=(b)>>8,(a)[3]=(b)) -/* write a gzip, zlib, or zip trailer */ -local void put_trailer(unsigned long ulen, unsigned long clen, - unsigned long check, unsigned long head) -{ - unsigned char tail[46]; - - if (g.form > 1) { /* zip */ - unsigned long cent; - - /* write data descriptor (as promised in local header) */ - PUT4L(tail, 0x08074b50UL); - PUT4L(tail + 4, check); - PUT4L(tail + 8, clen); - PUT4L(tail + 12, ulen); - writen(g.outd, tail, 16); - - /* write central file header */ - PUT4L(tail, 0x02014b50UL); /* central header signature */ - tail[4] = 63; /* obeyed version 6.3 of the zip spec */ - tail[5] = 255; /* ignore external attributes */ - PUT2L(tail + 6, 20); /* version needed to extract (2.0) */ - PUT2L(tail + 8, 8); /* data descriptor is present */ - PUT2L(tail + 10, 8); /* deflate */ - PUT4L(tail + 12, time2dos(g.mtime)); - PUT4L(tail + 16, check); /* crc */ - PUT4L(tail + 20, clen); /* compressed length */ - PUT4L(tail + 24, ulen); /* uncompressed length */ - PUT2L(tail + 28, g.name == NULL ? 1 : /* length of name */ - strlen(g.name)); - PUT2L(tail + 30, 9); /* length of extra field (see below) */ - PUT2L(tail + 32, 0); /* no file comment */ - PUT2L(tail + 34, 0); /* disk number 0 */ - PUT2L(tail + 36, 0); /* internal file attributes */ - PUT4L(tail + 38, 0); /* external file attributes (ignored) */ - PUT4L(tail + 42, 0); /* offset of local header */ - writen(g.outd, tail, 46); /* write central file header */ - cent = 46; - - /* write file name (use "-" for stdin) */ - if (g.name == NULL) - writen(g.outd, (unsigned char *)"-", 1); - else - writen(g.outd, (unsigned char *)g.name, strlen(g.name)); - cent += g.name == NULL ? 1 : strlen(g.name); - - /* write extended timestamp extra field block (9 bytes) */ - PUT2L(tail, 0x5455); /* extended timestamp signature */ - PUT2L(tail + 2, 5); /* number of data bytes in this block */ - tail[4] = 1; /* flag presence of mod time */ - PUT4L(tail + 5, g.mtime); /* mod time */ - writen(g.outd, tail, 9); /* write extra field block */ - cent += 9; - - /* write end of central directory record */ - PUT4L(tail, 0x06054b50UL); /* end of central directory signature */ - PUT2L(tail + 4, 0); /* number of this disk */ - PUT2L(tail + 6, 0); /* disk with start of central directory */ - PUT2L(tail + 8, 1); /* number of entries on this disk */ - PUT2L(tail + 10, 1); /* total number of entries */ - PUT4L(tail + 12, cent); /* size of central directory */ - PUT4L(tail + 16, head + clen + 16); /* offset of central directory */ - PUT2L(tail + 20, 0); /* no zip file comment */ - writen(g.outd, tail, 22); /* write end of central directory record */ - } - else if (g.form) { /* zlib */ - PUT4M(tail, check); - writen(g.outd, tail, 4); - } - else { /* gzip */ - PUT4L(tail, check); - PUT4L(tail + 4, ulen); - writen(g.outd, tail, 8); - } -} - /* write a gzip, zlib, or zip header using the information in the globals */ local unsigned long put_header(void) { unsigned long len; unsigned char head[30]; - unsigned int bsize; /* for bgzf */ + unsigned int bsize; /* for bgzf */ if (g.form > 1) { /* zip */ /* write local header */ @@ -1017,53 +941,123 @@ local unsigned long put_header(void) head[3] = g.name != NULL ? 8 : 0; PUT4L(head + 4, g.mtime); head[8] = g.level >= 9 ? 2 : (g.level == 1 ? 4 : 0); - head[9] = 3; /* unix */ - if (g.bgzf) { - head[3] |= 4; /* include FEXTRA */ - head[9] = 255; /* unknown */ - PUT2L(head+10, 6); /* 6 byte XLEN */ - head[12] = 'B'; /* BC tag */ - head[13] = 'C'; - PUT2L(head+14, 2); /* 2 byte BSIZE field */ - len = 16; - writen(g.outd, head, len); - /* Leave 2 bytes (uint16_t) BSIZE unwritten for now */ - - if (g.name) { - - /* write a special no contents block to include the FNAME field */ - - /* write 2-byte BSIZE entry now */ - bsize = len+2 + strlen(g.name)+1 + 2 + BGZF_FOOTER_SIZE - 1; - PUT2L(head+len, bsize); - writen(g.outd, head+len, 2); - fprintf(stderr, "first bgzf block with name bsize: %d %d %d\n", bsize, head[len], head[len+1]); - len += 2; - - /* write name */ - writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); - - /* empty zero-length Z_FINISH-ed compressed stream */ - head[len] = 3; head[len+1] = 0; - write(g.outd, head+len, 2); - - /* add the name size+1 */ - len += strlen(g.name) + 1; - g.name = NULL; /* but do not write the name ever again! */ - - } - } else { - writen(g.outd, head, 10); - len = 10; - } - if (g.name != NULL) { - writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); - len += strlen(g.name) + 1; - } + head[9] = 3; /* unix */ + if (g.bgzf) { + head[3] |= 4; /* include FEXTRA */ + head[9] = 255; /* unknown */ + PUT2L(head+10, 6); /* 6 byte XLEN */ + head[12] = 'B'; /* BC tag */ + head[13] = 'C'; + PUT2L(head+14, 2); /* 2 byte BSIZE field */ + len = 16; + writen(g.outd, head, len); + /* Leave 2 bytes (uint16_t) BSIZE unwritten for now */ + + if (g.name) { /* write a special no contents block to include the FNAME field */ + + /* write 2-byte BSIZE entry now */ + bsize = len+2 + strlen(g.name)+1 + 2 + BGZF_FOOTER_SIZE - 1; + PUT2L(head+len, bsize); + writen(g.outd, head+len, 2); + len += 2; + + /* write name */ + writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); + + /* empty zero-length Z_FINISH-ed compressed stream */ + head[len] = 3; head[len+1] = 0; + write(g.outd, head+len, 2); + + /* add the name size+1 */ + len += strlen(g.name) + 1; + g.name = NULL; /* but do not write the name ever again! */ + } + } else { + writen(g.outd, head, 10); + len = 10; + } + if (g.name != NULL) { + writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); + len += strlen(g.name) + 1; + } } return len; } +/* write a gzip, zlib, or zip trailer */ +local void put_trailer(unsigned long ulen, unsigned long clen, + unsigned long check, unsigned long head) +{ + unsigned char tail[46]; + + if (g.form > 1) { /* zip */ + unsigned long cent; + + /* write data descriptor (as promised in local header) */ + PUT4L(tail, 0x08074b50UL); + PUT4L(tail + 4, check); + PUT4L(tail + 8, clen); + PUT4L(tail + 12, ulen); + writen(g.outd, tail, 16); + + /* write central file header */ + PUT4L(tail, 0x02014b50UL); /* central header signature */ + tail[4] = 63; /* obeyed version 6.3 of the zip spec */ + tail[5] = 255; /* ignore external attributes */ + PUT2L(tail + 6, 20); /* version needed to extract (2.0) */ + PUT2L(tail + 8, 8); /* data descriptor is present */ + PUT2L(tail + 10, 8); /* deflate */ + PUT4L(tail + 12, time2dos(g.mtime)); + PUT4L(tail + 16, check); /* crc */ + PUT4L(tail + 20, clen); /* compressed length */ + PUT4L(tail + 24, ulen); /* uncompressed length */ + PUT2L(tail + 28, g.name == NULL ? 1 : /* length of name */ + strlen(g.name)); + PUT2L(tail + 30, 9); /* length of extra field (see below) */ + PUT2L(tail + 32, 0); /* no file comment */ + PUT2L(tail + 34, 0); /* disk number 0 */ + PUT2L(tail + 36, 0); /* internal file attributes */ + PUT4L(tail + 38, 0); /* external file attributes (ignored) */ + PUT4L(tail + 42, 0); /* offset of local header */ + writen(g.outd, tail, 46); /* write central file header */ + cent = 46; + + /* write file name (use "-" for stdin) */ + if (g.name == NULL) + writen(g.outd, (unsigned char *)"-", 1); + else + writen(g.outd, (unsigned char *)g.name, strlen(g.name)); + cent += g.name == NULL ? 1 : strlen(g.name); + + /* write extended timestamp extra field block (9 bytes) */ + PUT2L(tail, 0x5455); /* extended timestamp signature */ + PUT2L(tail + 2, 5); /* number of data bytes in this block */ + tail[4] = 1; /* flag presence of mod time */ + PUT4L(tail + 5, g.mtime); /* mod time */ + writen(g.outd, tail, 9); /* write extra field block */ + cent += 9; + + /* write end of central directory record */ + PUT4L(tail, 0x06054b50UL); /* end of central directory signature */ + PUT2L(tail + 4, 0); /* number of this disk */ + PUT2L(tail + 6, 0); /* disk with start of central directory */ + PUT2L(tail + 8, 1); /* number of entries on this disk */ + PUT2L(tail + 10, 1); /* total number of entries */ + PUT4L(tail + 12, cent); /* size of central directory */ + PUT4L(tail + 16, head + clen + 16); /* offset of central directory */ + PUT2L(tail + 20, 0); /* no zip file comment */ + writen(g.outd, tail, 22); /* write end of central directory record */ + } + else if (g.form) { /* zlib */ + PUT4M(tail, check); + writen(g.outd, tail, 4); + } + else { /* gzip */ + PUT4L(tail, check); + PUT4L(tail + 4, ulen); + writen(g.outd, tail, 8); + } +} /* compute check value depending on format */ #define CHECK(a,b,c) (g.form == 1 ? adler32(a,b,c) : crc32(a,b,c)) @@ -1603,10 +1597,8 @@ local void compress_thread(void *dummy) deflate_engine(&strm, job->out, Z_SYNC_FLUSH); #endif } - else { + else deflate_engine(&strm, job->out, Z_FINISH); - fprintf(stderr, "finishing block uc: %d c: %d\n", len, strm.total_out); - } } else { /* compress len bytes using zopfli, bring to byte boundary */ @@ -1706,7 +1698,7 @@ local void write_thread(void *dummy) unsigned long ulen; /* total uncompressed size (overflow ok) */ unsigned long clen; /* total compressed size (overflow ok) */ unsigned long check; /* check value of uncompressed data */ - unsigned char bsize_buf[2]; /* for bgzf compliance */ + unsigned char bsize_buf[2]; /* for bgzf compliance */ (void)dummy; @@ -1719,7 +1711,7 @@ local void write_thread(void *dummy) check = CHECK(0L, Z_NULL, 0); seq = 0; do { - /* get next write job in order */ + /* get next write job in order */ possess(write_first); wait_for(write_first, TO_BE, seq); job = write_head; @@ -1733,30 +1725,29 @@ local void write_thread(void *dummy) /* write the compressed data and drop the output buffer */ Trace(("-- writing #%ld", seq)); - - /* special logic for BGZF compliant gzip... finish the block and start a new one */ - if (g.bgzf) { - put_trailer(ulen, clen, check, head); - - /* reset counting */ - ulen = clen = 0; - check = CHECK(0L, Z_NULL, 0); - - /* write new header */ - head = put_header(); - - /* write 2-byte BSIZE field */ - head += 2; - assert( head >= BGZF_HEADER_SIZE); - assert(job->out->len + head + BGZF_FOOTER_SIZE <= BGZF_BLOCK_SIZE); - PUT2L(bsize_buf, (uint16_t) (job->out->len + head + BGZF_FOOTER_SIZE - 1)); - writen(g.outd, bsize_buf, 2); - fprintf(stderr, "Wrote BGZF entry for block %lld: %d ulen %lld. %d %d\n", seq, (job->out->len + head + BGZF_FOOTER_SIZE - 1), len, bsize_buf[0], bsize_buf[1]); - - } - - ulen += (unsigned long)len; - clen += (unsigned long)(job->out->len); + + /* special logic for BGZF compliant gzip... finish the block and start a new one */ + if (g.bgzf) { + /* finish this block */ + put_trailer(ulen, clen, check, head); + + /* reset counting */ + ulen = clen = 0; + check = CHECK(0L, Z_NULL, 0); + + /* write new header for new block */ + head = put_header(); + + /* write 2-byte BSIZE field */ + head += 2; + assert( head >= BGZF_HEADER_SIZE); + assert(job->out->len + head + BGZF_FOOTER_SIZE <= BGZF_BLOCK_SIZE); + PUT2L(bsize_buf, (uint16_t) (job->out->len + head + BGZF_FOOTER_SIZE - 1)); + writen(g.outd, bsize_buf, 2); + } + + ulen += (unsigned long)len; + clen += (unsigned long)(job->out->len); writen(g.outd, job->out->buf, job->out->len); drop_space(job->out); @@ -1779,11 +1770,11 @@ local void write_thread(void *dummy) /* write trailer */ put_trailer(ulen, clen, check, head); - - if (g.bgzf) { - /* write the final trailing EOF block */ - writen(g.outd, (unsigned char*) bgzf_eof, 28); - } + + if (g.bgzf) { + /* write the final trailing EOF block */ + writen(g.outd, (unsigned char*) bgzf_eof, 28); + } /* verify no more jobs, prepare for next use */ possess(compress_have); @@ -2173,7 +2164,7 @@ local void single_compress(int reset) } /* compress the remainder, emit a block, finish if end of input - of if this is a BGZF compliant gzip */ + of if this is a BGZF compliant gzip */ strm->avail_in = (unsigned)got; got = left; check = CHECK(check, strm->next_in, strm->avail_in); @@ -2254,11 +2245,11 @@ local void single_compress(int reset) /* write trailer */ put_trailer(ulen, clen, check, head); - - if (g.bgzf) { - /* write the final trailing EOF block */ - writen(g.outd, (unsigned char*) bgzf_eof, 28); - } + + if (g.bgzf) { + /* write the final trailing EOF block */ + writen(g.outd, (unsigned char*) bgzf_eof, 28); + } } @@ -3038,7 +3029,6 @@ local void infchk(void) /* decompress, compute lengths and check value */ strm.avail_in = g.in_left; strm.next_in = g.in_next; - fprintf(stderr, "checking complen %d\n", strm.avail_in); ret = inflateBack(&strm, inb, NULL, outb, NULL); if (ret != Z_STREAM_END) bail("corrupted input -- invalid deflate data: ", g.inf); @@ -3049,7 +3039,7 @@ local void infchk(void) /* compute compressed data length */ clen = g.in_tot - g.in_left; - fprintf(stderr, "checked complen %lld, uncomlen %lld\n", clen, g.out_tot); + /* read and check trailer */ if (g.form > 1) { /* zip local trailer (if any) */ if (g.form == 3) { /* data descriptor follows */ @@ -3815,7 +3805,7 @@ local void defaults(void) g.force = 0; /* don't overwrite, don't compress links */ g.recurse = 0; /* don't go into directories */ g.form = 0; /* use gzip format */ - g.bgzf = 0; /* do not create BGZF gzip */ + g.bgzf = 0; /* do not create BGZF gzip */ } /* long options conversion to short options */ @@ -3828,7 +3818,7 @@ local char *longopts[][2] = { {"processes", "p"}, {"quiet", "q"}, {"recursive", "r"}, {"rsyncable", "R"}, {"silent", "q"}, {"stdout", "c"}, {"suffix", "S"}, {"test", "t"}, {"to-stdout", "c"}, {"uncompress", "d"}, {"verbose", "v"}, - {"version", "V"}, {"zip", "K"}, {"zlib", "z"}, {"bgzf", "B"}}; + {"version", "V"}, {"zip", "K"}, {"zlib", "z"}, {"bgzf", "B"}}; #define NLOPTS (sizeof(longopts) / (sizeof(char *) << 1)) /* either new buffer size, new compression level, or new number of processes -- @@ -3947,7 +3937,7 @@ local int option(char *arg) case 'f': g.force = 1; break; case 'h': help(); break; case 'i': g.setdict = 0; break; - case 'B': g.bgzf = 1; break; + case 'B': g.bgzf = 1; break; case 'k': g.keep = 1; break; case 'l': g.list = 1; break; case 'n': g.headis &= ~1; break; @@ -4111,16 +4101,16 @@ int main(int argc, char **argv) if (done == 1 && g.pipeout && !g.decode && !g.list && g.form > 1) complain("warning: output will be concatenated zip files -- " "will not be able to extract"); - if (g.bgzf) { - if (g.decode) { - /* decompressing. make g.bgzf autodetected from input file */ - g.bgzf = 0; - } else { - /* compressing. Override settings on block size and force independent checksums */ - g.block = BGZF_BLOCK_SIZE; - g.setdict = 0; - } - } + if (g.bgzf) { + if (g.decode) { + /* decompressing. make g.bgzf autodetected from input file */ + g.bgzf = 0; + } else { + /* compressing. Override settings on block size and force independent checksums */ + g.block = BGZF_BLOCK_SIZE; + g.setdict = 0; + } + } process(strcmp(argv[n], "-") ? argv[n] : NULL); done++; } From c65384fd897c7b5935a11e60c92bb969b0c4b3f8 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sun, 2 Nov 2014 16:41:03 -0800 Subject: [PATCH 04/14] refactor put_bgzf_trailer_and_header --- pigz.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/pigz.c b/pigz.c index e1f658f..b967f92 100644 --- a/pigz.c +++ b/pigz.c @@ -1062,6 +1062,27 @@ local void put_trailer(unsigned long ulen, unsigned long clen, /* compute check value depending on format */ #define CHECK(a,b,c) (g.form == 1 ? adler32(a,b,c) : crc32(a,b,c)) +local void put_bgzf_trailer_and_header(unsigned long *ulen, unsigned long *clen, + unsigned long *check, unsigned long *head) { + unsigned char bsize_buf[2]; + /* finish this block */ + put_trailer(*ulen, *clen, *check, *head); + + /* write new header for new block */ + *head = put_header(); + + /* write 2-byte BSIZE field */ + *head += 2; + assert( *head >= BGZF_HEADER_SIZE ); + assert( *clen + *head + BGZF_FOOTER_SIZE <= BGZF_BLOCK_SIZE ); + PUT2L(bsize_buf, (uint16_t) (*clen + *head + BGZF_FOOTER_SIZE - 1)); + writen(g.outd, bsize_buf, 2); + + /* reset counting */ + *ulen = *clen = 0; + *check = CHECK(0L, Z_NULL, 0); +} + #ifndef NOTHREAD /* -- threaded portions of pigz -- */ @@ -1698,7 +1719,6 @@ local void write_thread(void *dummy) unsigned long ulen; /* total uncompressed size (overflow ok) */ unsigned long clen; /* total compressed size (overflow ok) */ unsigned long check; /* check value of uncompressed data */ - unsigned char bsize_buf[2]; /* for bgzf compliance */ (void)dummy; @@ -1723,32 +1743,17 @@ local void write_thread(void *dummy) len = job->in->len; drop_space(job->in); - /* write the compressed data and drop the output buffer */ - Trace(("-- writing #%ld", seq)); - - /* special logic for BGZF compliant gzip... finish the block and start a new one */ + if (g.bgzf) { - /* finish this block */ - put_trailer(ulen, clen, check, head); - - /* reset counting */ - ulen = clen = 0; - check = CHECK(0L, Z_NULL, 0); - - /* write new header for new block */ - head = put_header(); - - /* write 2-byte BSIZE field */ - head += 2; - assert( head >= BGZF_HEADER_SIZE); - assert(job->out->len + head + BGZF_FOOTER_SIZE <= BGZF_BLOCK_SIZE); - PUT2L(bsize_buf, (uint16_t) (job->out->len + head + BGZF_FOOTER_SIZE - 1)); - writen(g.outd, bsize_buf, 2); + /* special logic for BGZF compliant gzip... finish the block and start a new one */ + put_bgzf_trailer_and_header(&ulen, &clen, &check, &head); } ulen += (unsigned long)len; clen += (unsigned long)(job->out->len); + /* write the compressed data and drop the output buffer */ + Trace(("-- writing #%ld", seq)); writen(g.outd, job->out->buf, job->out->len); drop_space(job->out); Trace(("-- wrote #%ld%s", seq, more ? "" : " (last)")); @@ -2139,6 +2144,10 @@ local void single_compress(int reset) } while (hash != RSYNCHIT); got -= left; } + + if (g.bgzf) { + put_bgzf_trailer_and_header(&ulen, &clen, &check, &head); + } /* clear history for --independent option */ fresh = 0; From 2f6b003c41bf80fe8868ed58c8a0ab728b5aa2ea Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Mon, 3 Nov 2014 00:41:42 -0800 Subject: [PATCH 05/14] added support for --bgzf in single threaded mode --- pigz.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/pigz.c b/pigz.c index b967f92..57a9322 100644 --- a/pigz.c +++ b/pigz.c @@ -1743,14 +1743,14 @@ local void write_thread(void *dummy) len = job->in->len; drop_space(job->in); - + clen += (unsigned long)(job->out->len); + if (g.bgzf) { /* special logic for BGZF compliant gzip... finish the block and start a new one */ put_bgzf_trailer_and_header(&ulen, &clen, &check, &head); } ulen += (unsigned long)len; - clen += (unsigned long)(job->out->len); /* write the compressed data and drop the output buffer */ Trace(("-- writing #%ld", seq)); @@ -2006,9 +2006,20 @@ local void parallel_compress(void) do { \ strm->avail_out = out_size; \ strm->next_out = out; \ + tmp_ulen = strm->avail_in; \ + check = CHECK(check, strm->next_in, strm->avail_in); \ (void)deflate(strm, flush); \ - writen(g.outd, out, out_size - strm->avail_out); \ clen += out_size - strm->avail_out; \ + if (g.bgzf) { \ + /*tmp_ulen = ulen;*/ \ + tmp_check = check; \ + put_bgzf_trailer_and_header(&last_ulen, &clen, &last_check, &head); \ + ulen = last_ulen; \ + check = last_check; \ + last_ulen = tmp_ulen; \ + last_check = tmp_check; \ + } \ + writen(g.outd, out, out_size - strm->avail_out); \ } while (strm->avail_out == 0); \ assert(strm->avail_in == 0); \ } while (0) @@ -2031,6 +2042,9 @@ local void single_compress(int reset) unsigned long ulen; /* total uncompressed size (overflow ok) */ unsigned long clen; /* total compressed size (overflow ok) */ unsigned long check; /* check value of uncompressed data */ + unsigned long last_ulen; /* previous block uncompressed size */ + unsigned long last_check; /* previous block checksum */ + unsigned long tmp_ulen, tmp_check; static unsigned out_size; /* size of output buffer */ static unsigned char *in, *next, *out; /* reused i/o buffers */ static z_stream *strm = NULL; /* reused deflate structure */ @@ -2082,6 +2096,8 @@ local void single_compress(int reset) clen = 0; have = 0; check = CHECK(0L, Z_NULL, 0); + last_check = check; + last_ulen = 0; hash = RSYNCHIT; do { /* get data to compress, see if there is any more input */ @@ -2145,10 +2161,6 @@ local void single_compress(int reset) got -= left; } - if (g.bgzf) { - put_bgzf_trailer_and_header(&ulen, &clen, &check, &head); - } - /* clear history for --independent option */ fresh = 0; if (!g.setdict) { @@ -2167,16 +2179,15 @@ local void single_compress(int reset) /* compress MAXP2-size chunks in case unsigned type is small */ while (got > MAXP2) { strm->avail_in = MAXP2; - check = CHECK(check, strm->next_in, strm->avail_in); DEFLATE_WRITE(Z_NO_FLUSH); got -= MAXP2; } /* compress the remainder, emit a block, finish if end of input - of if this is a BGZF compliant gzip */ + or if this is a BGZF compliant gzip + */ strm->avail_in = (unsigned)got; got = left; - check = CHECK(check, strm->next_in, strm->avail_in); if (g.bgzf == 0 && (more || got)) { #if ZLIB_VERNUM >= 0x1260 int bits; @@ -2253,11 +2264,12 @@ local void single_compress(int reset) } while (more || got); /* write trailer */ - put_trailer(ulen, clen, check, head); - if (g.bgzf) { /* write the final trailing EOF block */ + put_trailer(last_ulen, clen, last_check, head); writen(g.outd, (unsigned char*) bgzf_eof, 28); + } else { + put_trailer(ulen, clen, check, head); } } From 35727f21b8bfb69d8ed960db5ac4f12ce127b831 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sat, 15 Nov 2014 14:20:06 -0800 Subject: [PATCH 06/14] added support for parallel decompress of --bgzf input files --- pigz.c | 364 ++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 312 insertions(+), 52 deletions(-) diff --git a/pigz.c b/pigz.c index 57a9322..2e33a49 100644 --- a/pigz.c +++ b/pigz.c @@ -452,9 +452,6 @@ #define INBUFS(p) (((p)<<1)+3) #define OUTPOOL(s) ((s)+((s)>>4)+DICT) -/* input buffer size */ -#define BUF 32768U - /* BGZF constants */ #define BGZF_MAX_BLOCK_SIZE 0x10000 #define BGZF_BLOCK_SIZE 0x0ff00 // make sure compressed_bytes(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE @@ -472,6 +469,9 @@ /* bgzf_eof is the last block signifying EOF */ local const uint8_t bgzf_eof[28] = "\037\213\010\4\0\0\0\0\0\377\6\0\102\103\2\0\033\0\3\0\0\0\0\0\0\0\0\0"; +/* input buffer size */ +#define BUF BGZF_MAX_BLOCK_SIZE + /* globals (modified by main thread only when it's the only thread) */ local struct { char *prog; /* name by which pigz was invoked */ @@ -486,6 +486,7 @@ local struct { int force; /* true to overwrite, compress links, cat */ int form; /* gzip = 0, zlib = 1, zip = 2 or 3 */ int bgzf; /* true to include BGZF BlockCompress compliance */ + int bgzf_bsize; /* the remainder size of the next block to decompress */ unsigned char magic1; /* first byte of possible header when decoding */ int recurse; /* true to dive down into directory structure */ char *sufx; /* suffix to use (".gz" or user supplied) */ @@ -1063,7 +1064,8 @@ local void put_trailer(unsigned long ulen, unsigned long clen, #define CHECK(a,b,c) (g.form == 1 ? adler32(a,b,c) : crc32(a,b,c)) local void put_bgzf_trailer_and_header(unsigned long *ulen, unsigned long *clen, - unsigned long *check, unsigned long *head) { + unsigned long *check, unsigned long *head) +{ unsigned char bsize_buf[2]; /* finish this block */ put_trailer(*ulen, *clen, *check, *head); @@ -1263,17 +1265,34 @@ local struct space *get_space(struct pool *pool) pool->limit--; pool->made++; release(pool->have); + space = NULL; space = MALLOC(sizeof(struct space)); - if (space == NULL) - bail("not enough memory", ""); - space->use = new_lock(1); /* initially one user */ - space->buf = MALLOC(pool->size); - if (space->buf == NULL) - bail("not enough memory", ""); - space->size = pool->size; - space->len = 0; - space->pool = pool; /* remember the pool this belongs to */ - return space; + if (space != NULL) { + space->buf = NULL; + space->buf = MALLOC(pool->size); + if (space->buf != NULL) { + space->use = new_lock(1); /* initially one user */ + space->size = pool->size; + space->len = 0; + space->pool = pool; /* remember the pool this belongs to */ + return space; + } + } + assert( space == NULL || space->buf == NULL ); + if (pool->limit < 0 && pool->made > 1) { + possess(pool->have); + if (space != NULL) { + FREE(space); + } + pool->limit = 0; /* do not make any more buffers */ + pool->made--; + complain("Running out of memory with %d buffers of %d bytes, reducing buffer count\n", pool->made, pool->size); + twist(pool->have, TO, 0); + /* try again this time waiting for an existing buffer */ + return get_space(pool); + } + bail("not enough memory", ""); + return NULL; } /* compute next size up by multiplying by about 2**(1/3) and round to the next @@ -1484,6 +1503,128 @@ local void deflate_engine(z_stream *strm, struct space *out, int flush) assert(strm->avail_in == 0); } +local void uncompress_thread(void *dummy) +{ + int ret, i; + struct job *job; /* job pulled and working on */ + struct job *here, **prior; /* pointers for inserting in write list */ + struct space *out; /* pointer to output buffer */ + unsigned long check, incheck; /* check value of uncompressed output */ + unsigned long len, inlen; /* length of uncompressed output */ + z_stream strm; /* inflate stream */ + + (void)dummy; + + // prepare strm for inflate + strm.zalloc = ZALLOC; + strm.zfree = ZFREE; + strm.opaque = OPAQUE; + + ret = inflateInit2(&strm, -15); + if (ret != Z_OK) { + bail("not enough memory", ""); + } + + /* keep looking for work */ + for (;;) { + assert(g.form == 0 && g.bgzf); /* this must be BGZF */ + + /* acquire and allocate output buffer */ + out = get_space(&out_pool); + while( out->size < BGZF_FOOTER_SIZE) { + grow_space(out); + } + out->len = 0; + + /* get a job (like I tell my son) */ + possess(compress_have); + wait_for(compress_have, NOT_TO_BE, 0); + job = compress_head; + assert(job != NULL); + if (job->seq == -1) + break; + compress_head = job->next; + if (job->next == NULL) + compress_tail = &compress_head; + twist(compress_have, BY, -1); + + /* get output buffer space */ + if (job->out != NULL) { + drop_space(job->out); + } + job->out = out; + + assert(job->in != NULL && job->in->len > BGZF_FOOTER_SIZE); + + /* decompress, compute lengths and check value */ + strm.total_in = strm.total_out = 0; + + strm.next_in = job->in->buf; + strm.avail_in = job->in->len; + + strm.next_out = out->buf; + strm.avail_out = out->size; + + fprintf(stderr, "initialzed new bgzf block\n"); + + ret = inflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END) { + bail("corrupted input -- invalid BGZF deflate data: ", g.inf); + } + + if (strm.avail_in != BGZF_FOOTER_SIZE) { + bail("corrupted input -- invalid byte count in BGZF block", g.inf); + } + + if (inflateReset(&strm) != Z_OK) { + bail("corrupted input -- inflateReset failed in BGZF block", g.inf); + } + + /* compute and record uncompressed data length */ + len = strm.total_out; + out->len = len; + incheck = 0; + for(i=0; i<3;i++) { + incheck += *(out->buf + len + i) << 8*i; + } + inlen = 0; + for(i=0; i<3; i++) { + inlen += *(out->buf + len + 4 + i) << 8*i; + } + + /* compute uncompressed data check */ + check = CHECK(0L, Z_NULL, 0); + check = CHECK(check, out->buf, out->len); + + /* read and check trailer */ + /* gzip trailer */ + if (check != incheck || len != inlen) { + complain("Calculated %d check and %d length, but got %d and %d in the footer\n", check, len, incheck, inlen); + bail("corrupted input checksum or length in BGZF block", g.inf); + } + + /* insert write job in list in sorted order, alert write thread */ + possess(write_first); + prior = &write_head; + while ((here = *prior) != NULL) { + if (here->seq > job->seq) + break; + prior = &(here->next); + } + job->next = here; + *prior = job; + twist(write_first, TO, write_head->seq); + + } + if (inflateEnd(&strm) != Z_OK) { + bail("corrupted input -- inflateEnd failed in BGZF block", g.inf); + } + + + /* found job with seq == -1 -- free deflate memory and return to join */ + release(compress_have); +} + /* get the next compression job from the head of the list, compress and compute the check value on the input, and put a job in the write list with the results -- keep looking for more jobs, returning when a job is found with a @@ -1707,8 +1848,9 @@ local void compress_thread(void *dummy) } /* collect the write jobs off of the list in sequence order and write out the - compressed data until the last chunk is written -- also write the header and - trailer and combine the individual check values of the input buffers */ + output data until the last chunk is written. + if decoding, also write the header and trailer and combine the individual + check values of the input buffers */ local void write_thread(void *dummy) { long seq; /* next sequence number looking for */ @@ -1724,8 +1866,9 @@ local void write_thread(void *dummy) /* build and write header */ Trace(("-- write thread running")); - head = put_header(); - + if (!g.decode) { + head = put_header(); + } /* process output of compress threads until end of input */ ulen = clen = 0; check = CHECK(0L, Z_NULL, 0); @@ -1745,7 +1888,7 @@ local void write_thread(void *dummy) clen += (unsigned long)(job->out->len); - if (g.bgzf) { + if (g.bgzf && !g.decode) { /* special logic for BGZF compliant gzip... finish the block and start a new one */ put_bgzf_trailer_and_header(&ulen, &clen, &check, &head); } @@ -1758,27 +1901,30 @@ local void write_thread(void *dummy) drop_space(job->out); Trace(("-- wrote #%ld%s", seq, more ? "" : " (last)")); - /* wait for check calculation to complete, then combine, once - the compress thread is done with the input, release it */ - possess(job->calc); - wait_for(job->calc, TO_BE, 1); - release(job->calc); - check = COMB(check, job->check, len); - - /* free the job */ - free_lock(job->calc); + if (!g.decode) { + /* wait for check calculation to complete, then combine, once + the compress thread is done with the input, release it */ + possess(job->calc); + wait_for(job->calc, TO_BE, 1); + release(job->calc); + check = COMB(check, job->check, len); + + /* free the job */ + free_lock(job->calc); + } FREE(job); /* get the next buffer in sequence */ seq++; } while (more); - /* write trailer */ - put_trailer(ulen, clen, check, head); - - if (g.bgzf) { - /* write the final trailing EOF block */ - writen(g.outd, (unsigned char*) bgzf_eof, 28); + if (!g.decode) { + /* write trailer */ + put_trailer(ulen, clen, check, head); + if (g.bgzf) { + /* write the final trailing EOF block */ + writen(g.outd, (unsigned char*) bgzf_eof, 28); + } } /* verify no more jobs, prepare for next use */ @@ -2011,7 +2157,6 @@ local void parallel_compress(void) (void)deflate(strm, flush); \ clen += out_size - strm->avail_out; \ if (g.bgzf) { \ - /*tmp_ulen = ulen;*/ \ tmp_check = check; \ put_bgzf_trailer_and_header(&last_ulen, &clen, &last_check, &head); \ ulen = last_ulen; \ @@ -2044,7 +2189,7 @@ local void single_compress(int reset) unsigned long check; /* check value of uncompressed data */ unsigned long last_ulen; /* previous block uncompressed size */ unsigned long last_check; /* previous block checksum */ - unsigned long tmp_ulen, tmp_check; + unsigned long tmp_check, tmp_ulen; static unsigned out_size; /* size of output buffer */ static unsigned char *in, *next, *out; /* reused i/o buffers */ static z_stream *strm = NULL; /* reused deflate structure */ @@ -2289,6 +2434,7 @@ local void load_read(void *dummy) possess(g.load_state); wait_for(g.load_state, TO_BE, 1); g.in_len = len = readn(g.ind, g.in_which ? g.in_buf : g.in_buf2, BUF); + fprintf(stderr, "load_read read %zu using %d\n", g.in_len, g.in_which); Trace(("-- decompress read thread read %lu bytes", len)); twist(g.load_state, TO, 0); } while (len == BUF); @@ -2309,7 +2455,7 @@ local size_t load(void) g.in_left = 0; return 0; } - + #ifndef NOTHREAD /* if first time in or procs == 1, read a buffer to have something to return, otherwise wait for the previous read job to complete */ @@ -2320,7 +2466,7 @@ local size_t load(void) g.load_state = new_lock(1); g.load_thread = launch(load_read, NULL); } - + fprintf(stderr, "about to read from buffer %d\n", BUF); /* wait for the previously requested read to complete */ possess(g.load_state); wait_for(g.load_state, TO_BE, 0); @@ -2329,6 +2475,7 @@ local size_t load(void) /* set up input buffer with the data just read */ g.in_next = g.in_which ? g.in_buf : g.in_buf2; g.in_left = g.in_len; + fprintf(stderr, "read %zu from thread using %d\n", g.in_left, g.in_which); /* if not at end of file, alert read thread to load next buffer, alternate between g.in_buf and g.in_buf2 */ @@ -2350,6 +2497,7 @@ local size_t load(void) { /* don't use threads -- simply read a buffer into g.in_buf */ g.in_left = readn(g.ind, g.in_next = g.in_buf, BUF); + fprintf(stderr, "read no thread %zu of %d\n", g.in_left, BUF); } /* note end of file */ @@ -2490,7 +2638,10 @@ local int get_header(int save) unsigned magic; /* magic header */ int method; /* compression method */ int flags; /* header flags */ + int count; /* number of bytes read */ unsigned fname, extra; /* name and extra field lengths */ + char c1, c2; /* for parsing extra field type */ + unsigned field_len; /* for parsing extra field */ unsigned tmp2; /* for macro */ unsigned long tmp4; /* for macro */ @@ -2571,7 +2722,7 @@ local int get_header(int save) g.in_next--; return -2; } -/* TODO */ + /* it's gzip -- get method and flags */ method = GET(); flags = GET(); @@ -2589,13 +2740,29 @@ local int get_header(int save) /* skip extra field and OS */ SKIP(2); - /* skip extra field, if present */ + /* parse extra field, if present */ + g.bgzf = g.bgzf_bsize = extra = 0; /* assume it is NOT BGZF */ if (flags & 4) { extra = GET2(); if (g.in_eof) return -3; - SKIP(extra); + // parse extra field(s) look for BC && set g.bgzf to bsize, if present + count = extra; + g.bgzf_bsize = 0; + while (count > 0) { + c1 = GET(); c2 = GET(); field_len = GET2(); + if (c1 == 'B' && c2 == 'C' && field_len == 2) { + /* read the BGZF block size value */ + g.bgzf = 1; + g.bgzf_bsize = GET2(); + fprintf(stderr, "Found bgzf size %d\n", g.bgzf_bsize); + } else { + SKIP(field_len); + } + count -= 4 + field_len; + } } + count = extra + 12; /* read file name, if present, into allocated memory */ if ((flags & 8) && save) { @@ -2621,23 +2788,38 @@ local int get_header(int save) have += copy; g.in_left -= copy; g.in_next += copy; + count+=copy; } while (end == NULL); } - else if (flags & 8) - while (GET() != 0) + else if (flags & 8) { + while (GET() != 0) { if (g.in_eof) return -3; + count++; + } + } /* skip comment */ if (flags & 16) - while (GET() != 0) + while (GET() != 0) { if (g.in_eof) return -3; + count++; + } /* skip header crc */ - if (flags & 2) + if (flags & 2) { SKIP(2); + count += 2; + } + if (g.bgzf && g.bgzf_bsize) { + // subtract for header bytes already read, but include trailer bytes at the end... + assert(g.bgzf_bsize > count + BGZF_FOOTER_SIZE - 1); + assert(count >= BGZF_HEADER_SIZE); + g.bgzf_bsize = g.bgzf_bsize - count + 1; + } + fprintf(stderr, "get_header: count: %d bgzf: %d\n", count, g.bgzf_bsize); /* return gzip compression method */ g.form = 0; return method; @@ -3028,36 +3210,110 @@ local int outb(void *desc, unsigned char *buf, unsigned len) local void infchk(void) { int ret, cont, was; - unsigned long check, len; + unsigned long check, len, seq, block_len; z_stream strm; unsigned tmp2; unsigned long tmp4; off_t clen; + struct job *job; cont = 0; + assert(g.decode); + +#ifndef NOTHREAD + if (g.form == 0 && g.bgzf && g.procs > 1) { + /* This is a BGZF gzip file, decompress all blocks in parallel until the form changes */ + setup_jobs(); + /* start write thread */ + writeth = launch(write_thread, NULL); + + seq = 0; + job = NULL; + do { + + /* create a new job */ + job = MALLOC(sizeof(struct job)); + if (job == NULL) { + bail("not enough memory", ""); + } + job->calc = NULL; // no calc lock in decode + + /* update input spaces */ + block_len = g.bgzf_bsize; + job->in = get_space(&in_pool); + job->in->len = 0; + while (job->in->size < block_len) { + grow_space(job->in); + } + job->in->len += readn(g.ind, job->in->buf, block_len); + if (job->in->len != (size_t) g.bgzf_bsize) { + bail("incomplete BGZF gzip -- reached eof", ""); + } + job->more = 1; + + /* preparation of job is complete */ + job->seq = seq; + Trace(("-- read #%ld%s", "")); + if (++seq < 1) + bail("input too long: ", g.inf); + + /* start another uncompress thread if needed */ + if ((unsigned long) cthreads < seq && cthreads < g.procs) { + (void)launch(uncompress_thread, NULL); + cthreads++; + } + + was = g.form; + + ret = get_header(0); + if (ret != 8) { + job->more = 0; + } + + /* put job at end of compress list, let all the compressors know */ + possess(compress_have); + job->next = NULL; + *compress_tail = job; + compress_tail = &(job->next); + twist(compress_have, BY, +1); + + } while (was == 0 && ret == 8 && g.form == 0 && g.bgzf && g.bgzf_bsize); + + /* wait for the write thread to complete (we leave the compress threads out + there and waiting in case there is another stream to compress) */ + join(writeth); + writeth = NULL; + if ( ret != 8 ) { + return; + } + } +#endif + do { /* header already read -- set up for decompression */ g.in_tot = g.in_left; /* track compressed data length */ g.out_tot = 0; g.out_check = CHECK(0L, Z_NULL, 0); + strm.zalloc = ZALLOC; strm.zfree = ZFREE; strm.opaque = OPAQUE; ret = inflateBackInit(&strm, 15, out_buf); if (ret != Z_OK) - bail("not enough memory", ""); - + bail("not enough memory", ""); + fprintf(stderr, "initialzed new block\n"); + /* decompress, compute lengths and check value */ strm.avail_in = g.in_left; strm.next_in = g.in_next; ret = inflateBack(&strm, inb, NULL, outb, NULL); if (ret != Z_STREAM_END) - bail("corrupted input -- invalid deflate data: ", g.inf); + bail("corrupted input -- invalid deflate data: ", g.inf); g.in_left = strm.avail_in; g.in_next = strm.next_in; inflateBackEnd(&strm); outb(NULL, NULL, 0); /* finish off final write and check */ - + /* compute compressed data length */ clen = g.in_tot - g.in_left; @@ -3122,6 +3378,7 @@ local void infchk(void) bail("corrupted gzip stream -- crc32 mismatch: ", g.inf); if (len != (g.out_tot & LOW32)) bail("corrupted gzip stream -- length mismatch: ", g.inf); + fprintf(stderr, "got trailer check %lu, ulen: %lu\n", check, len); } /* show file information if requested */ @@ -3729,7 +3986,7 @@ local char *helptext[] = { " -F --first Do iterations first, before block split for -11", " -h, --help Display a help screen and quit", " -i, --independent Compress blocks independently for damage recovery", -" -B, --bgzf Create BGZF compliant gzip, includes -b 64 -i", +" -B, --bgzf Create BGZF compliant gzip. (overrides -b and -i)", " -I, --iterations n Number of iterations for -11 optimization", " -k, --keep Do not delete original file after processing", " -K, --zip Compress to PKWare zip (.zip) single entry format", @@ -3827,6 +4084,7 @@ local void defaults(void) g.recurse = 0; /* don't go into directories */ g.form = 0; /* use gzip format */ g.bgzf = 0; /* do not create BGZF gzip */ + g.bgzf_bsize = 0; /* size of bgzf block if this is a BGZF decompress */ } /* long options conversion to short options */ @@ -4124,8 +4382,10 @@ int main(int argc, char **argv) "will not be able to extract"); if (g.bgzf) { if (g.decode) { - /* decompressing. make g.bgzf autodetected from input file */ + /* decompressing. Ignore this hint as BGZF autodetected from gzip header, + and use g.bgzf_bsize as next block size when reading */ g.bgzf = 0; + g.block = BGZF_MAX_BLOCK_SIZE; } else { /* compressing. Override settings on block size and force independent checksums */ g.block = BGZF_BLOCK_SIZE; From c0885f848e62c27aa6df5ebceef03a6dc3299d20 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sat, 15 Nov 2014 16:20:07 -0800 Subject: [PATCH 07/14] got decompress on --bgzf working and tested --- Makefile | 3 +++ pigz.c | 67 +++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 47 insertions(+), 23 deletions(-) diff --git a/Makefile b/Makefile index a9421aa..7af523b 100644 --- a/Makefile +++ b/Makefile @@ -62,6 +62,9 @@ test: pigz echo 'compress -f < pigz.c | ./unpigz | cmp - pigz.c' ;\ compress -f < pigz.c | ./unpigz | cmp - pigz.c ;\ fi + ./pigz -kfB pigz.c \ + && ./pigz -d -c -p 1 pigz.c.gz | diff -q - pigz.c \ + && ./pigz -d -c -p 2 pigz.c.gz | diff -q - pigz.c @rm -f pigz.c.gz pigz.c.zz pigz.c.zip tests: dev test diff --git a/pigz.c b/pigz.c index 2e33a49..174f32d 100644 --- a/pigz.c +++ b/pigz.c @@ -1527,11 +1527,10 @@ local void uncompress_thread(void *dummy) /* keep looking for work */ for (;;) { - assert(g.form == 0 && g.bgzf); /* this must be BGZF */ /* acquire and allocate output buffer */ out = get_space(&out_pool); - while( out->size < BGZF_FOOTER_SIZE) { + while( out->size < BGZF_MAX_BLOCK_SIZE) { grow_space(out); } out->len = 0; @@ -1543,15 +1542,14 @@ local void uncompress_thread(void *dummy) assert(job != NULL); if (job->seq == -1) break; + compress_head = job->next; if (job->next == NULL) compress_tail = &compress_head; twist(compress_have, BY, -1); /* get output buffer space */ - if (job->out != NULL) { - drop_space(job->out); - } + assert(job->out == NULL); job->out = out; assert(job->in != NULL && job->in->len > BGZF_FOOTER_SIZE); @@ -1581,16 +1579,16 @@ local void uncompress_thread(void *dummy) } /* compute and record uncompressed data length */ - len = strm.total_out; + len = strm.next_out - out->buf; out->len = len; incheck = 0; - for(i=0; i<3;i++) { - incheck += *(out->buf + len + i) << 8*i; - } + for(i=0; i<4;i++) { + incheck += (unsigned) *(strm.next_in++) << (8*i); } inlen = 0; - for(i=0; i<3; i++) { - inlen += *(out->buf + len + 4 + i) << 8*i; - } + for(i=0; i<4; i++) { + inlen += (unsigned) *(strm.next_in++) << (8*i); } + strm.avail_in -= 8; + assert(strm.avail_in == 0); /* compute uncompressed data check */ check = CHECK(0L, Z_NULL, 0); @@ -1599,7 +1597,7 @@ local void uncompress_thread(void *dummy) /* read and check trailer */ /* gzip trailer */ if (check != incheck || len != inlen) { - complain("Calculated %d check and %d length, but got %d and %d in the footer\n", check, len, incheck, inlen); + complain("Calculated %lu check and %lu length, but got %lu and %lu in the footer\n", check, len, incheck, inlen); bail("corrupted input checksum or length in BGZF block", g.inf); } @@ -1620,9 +1618,9 @@ local void uncompress_thread(void *dummy) bail("corrupted input -- inflateEnd failed in BGZF block", g.inf); } - /* found job with seq == -1 -- free deflate memory and return to join */ release(compress_have); + drop_space(out); } /* get the next compression job from the head of the list, compress and compute @@ -3209,13 +3207,14 @@ local int outb(void *desc, unsigned char *buf, unsigned len) read and check the gzip, zlib, or zip trailer */ local void infchk(void) { - int ret, cont, was; + int ret, cont, was, count; unsigned long check, len, seq, block_len; z_stream strm; unsigned tmp2; unsigned long tmp4; off_t clen; struct job *job; + struct space *input; cont = 0; assert(g.decode); @@ -3237,16 +3236,36 @@ local void infchk(void) bail("not enough memory", ""); } job->calc = NULL; // no calc lock in decode + job->lens = NULL; + job->next = NULL; /* update input spaces */ block_len = g.bgzf_bsize; - job->in = get_space(&in_pool); - job->in->len = 0; - while (job->in->size < block_len) { - grow_space(job->in); + input = get_space(&in_pool); + while (input->size < block_len) { + grow_space(input); + } + assert(input->len == 0); + job->in = input; + job->out = NULL; + + /* read block_len from g.in_next, modify g.in_left */ + /* instead of reading directly: job->in->len += readn(g.ind, job->in->buf, block_len); */ + while (block_len) { + count = block_len > g.in_left ? g.in_left : block_len; + memcpy(input->buf + input->len, g.in_next, count); + g.in_next += count; + g.in_left -= count; + input->len += count; + block_len -= count; + if (g.in_left == 0) { + load(); + if (g.in_left == 0) { + break; } + } } - job->in->len += readn(g.ind, job->in->buf, block_len); - if (job->in->len != (size_t) g.bgzf_bsize) { + + if (input->len != (size_t) g.bgzf_bsize) { bail("incomplete BGZF gzip -- reached eof", ""); } job->more = 1; @@ -3254,8 +3273,9 @@ local void infchk(void) /* preparation of job is complete */ job->seq = seq; Trace(("-- read #%ld%s", "")); - if (++seq < 1) - bail("input too long: ", g.inf); + if (++seq < 1) { + bail("input too long: ", g.inf); + } /* start another uncompress thread if needed */ if ((unsigned long) cthreads < seq && cthreads < g.procs) { @@ -3283,6 +3303,7 @@ local void infchk(void) there and waiting in case there is another stream to compress) */ join(writeth); writeth = NULL; + if ( ret != 8 ) { return; } From ea56a0ab71158714d031e5e67d9e9c3e71ab6468 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sat, 15 Nov 2014 19:15:39 -0800 Subject: [PATCH 08/14] added comments and removed debugging printf statements --- pigz.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/pigz.c b/pigz.c index 174f32d..e8421a5 100644 --- a/pigz.c +++ b/pigz.c @@ -295,10 +295,14 @@ buffers is not directly limited, but is indirectly limited by the release of input buffers to about the same number. - TODO add BGZF Block Compress description and settings. - FNAME only in first - checksum may needs to be tied to block, not rolling - blocksize needs to be 65280 to ensure header + compressed + trailer < 65k + Added support for the BGZF Blocked GNU Zip Format extension to gzip that enables + parallel decompression as well as random access to the uncompressed contents + of a gzip archive. The BGZF specification is described in the SAM format + specification: http://samtools.github.io/hts-specs/SAMv1.pdf + Where the uncompressed data is compressed in blocks of up to 65280 bytes with a + gzip header and trailer. Each block can be decompressed independently and + if the file is indexed by block boundaries, randomly accessed. + */ /* use large file functions if available */ @@ -1563,8 +1567,6 @@ local void uncompress_thread(void *dummy) strm.next_out = out->buf; strm.avail_out = out->size; - fprintf(stderr, "initialzed new bgzf block\n"); - ret = inflate(&strm, Z_FINISH); if (ret != Z_STREAM_END) { bail("corrupted input -- invalid BGZF deflate data: ", g.inf); @@ -2432,7 +2434,6 @@ local void load_read(void *dummy) possess(g.load_state); wait_for(g.load_state, TO_BE, 1); g.in_len = len = readn(g.ind, g.in_which ? g.in_buf : g.in_buf2, BUF); - fprintf(stderr, "load_read read %zu using %d\n", g.in_len, g.in_which); Trace(("-- decompress read thread read %lu bytes", len)); twist(g.load_state, TO, 0); } while (len == BUF); @@ -2464,7 +2465,6 @@ local size_t load(void) g.load_state = new_lock(1); g.load_thread = launch(load_read, NULL); } - fprintf(stderr, "about to read from buffer %d\n", BUF); /* wait for the previously requested read to complete */ possess(g.load_state); wait_for(g.load_state, TO_BE, 0); @@ -2473,7 +2473,6 @@ local size_t load(void) /* set up input buffer with the data just read */ g.in_next = g.in_which ? g.in_buf : g.in_buf2; g.in_left = g.in_len; - fprintf(stderr, "read %zu from thread using %d\n", g.in_left, g.in_which); /* if not at end of file, alert read thread to load next buffer, alternate between g.in_buf and g.in_buf2 */ @@ -2495,7 +2494,6 @@ local size_t load(void) { /* don't use threads -- simply read a buffer into g.in_buf */ g.in_left = readn(g.ind, g.in_next = g.in_buf, BUF); - fprintf(stderr, "read no thread %zu of %d\n", g.in_left, BUF); } /* note end of file */ @@ -2753,7 +2751,6 @@ local int get_header(int save) /* read the BGZF block size value */ g.bgzf = 1; g.bgzf_bsize = GET2(); - fprintf(stderr, "Found bgzf size %d\n", g.bgzf_bsize); } else { SKIP(field_len); } @@ -2817,7 +2814,6 @@ local int get_header(int save) assert(count >= BGZF_HEADER_SIZE); g.bgzf_bsize = g.bgzf_bsize - count + 1; } - fprintf(stderr, "get_header: count: %d bgzf: %d\n", count, g.bgzf_bsize); /* return gzip compression method */ g.form = 0; return method; @@ -3321,8 +3317,7 @@ local void infchk(void) strm.opaque = OPAQUE; ret = inflateBackInit(&strm, 15, out_buf); if (ret != Z_OK) - bail("not enough memory", ""); - fprintf(stderr, "initialzed new block\n"); + bail("not enough memory", ""); /* decompress, compute lengths and check value */ strm.avail_in = g.in_left; @@ -3399,7 +3394,6 @@ local void infchk(void) bail("corrupted gzip stream -- crc32 mismatch: ", g.inf); if (len != (g.out_tot & LOW32)) bail("corrupted gzip stream -- length mismatch: ", g.inf); - fprintf(stderr, "got trailer check %lu, ulen: %lu\n", check, len); } /* show file information if requested */ From 6623284beaebb28e79724ea258082d94dd0b9e34 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sat, 15 Nov 2014 19:33:38 -0800 Subject: [PATCH 09/14] fixed some comments --- pigz.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pigz.c b/pigz.c index e8421a5..8b39c9b 100644 --- a/pigz.c +++ b/pigz.c @@ -2742,7 +2742,7 @@ local int get_header(int save) extra = GET2(); if (g.in_eof) return -3; - // parse extra field(s) look for BC && set g.bgzf to bsize, if present + // parse extra field(s) look for BC && set g.bgzf_bsize, if present count = extra; g.bgzf_bsize = 0; while (count > 0) { @@ -2809,10 +2809,10 @@ local int get_header(int save) } if (g.bgzf && g.bgzf_bsize) { - // subtract for header bytes already read, but include trailer bytes at the end... + // subtract header bytes already read, but include trailer bytes at the end... assert(g.bgzf_bsize > count + BGZF_FOOTER_SIZE - 1); assert(count >= BGZF_HEADER_SIZE); - g.bgzf_bsize = g.bgzf_bsize - count + 1; + g.bgzf_bsize = g.bgzf_bsize - count + 1; // add 1 (see BGZF specification) } /* return gzip compression method */ g.form = 0; From 9278884b39411162a9728dda431111b0e90e0c91 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sat, 15 Nov 2014 21:03:05 -0800 Subject: [PATCH 10/14] fixed -t when archive is a BGZF --- pigz.c | 70 ++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 27 deletions(-) diff --git a/pigz.c b/pigz.c index 8b39c9b..423a3d9 100644 --- a/pigz.c +++ b/pigz.c @@ -1461,6 +1461,8 @@ local void finish_jobs(void) possess(compress_have); job.seq = -1; job.next = NULL; + job.in = NULL; + job.out = NULL; compress_head = &job; compress_tail = &(job.next); twist(compress_have, BY, +1); /* will wake them all up */ @@ -1532,6 +1534,7 @@ local void uncompress_thread(void *dummy) /* keep looking for work */ for (;;) { + job = NULL; /* acquire and allocate output buffer */ out = get_space(&out_pool); while( out->size < BGZF_MAX_BLOCK_SIZE) { @@ -1544,8 +1547,10 @@ local void uncompress_thread(void *dummy) wait_for(compress_have, NOT_TO_BE, 0); job = compress_head; assert(job != NULL); - if (job->seq == -1) + if (job->seq == -1) { + job = NULL; break; + } compress_head = job->next; if (job->next == NULL) @@ -1596,24 +1601,31 @@ local void uncompress_thread(void *dummy) check = CHECK(0L, Z_NULL, 0); check = CHECK(check, out->buf, out->len); - /* read and check trailer */ - /* gzip trailer */ + /* check gzip trailer */ if (check != incheck || len != inlen) { complain("Calculated %lu check and %lu length, but got %lu and %lu in the footer\n", check, len, incheck, inlen); bail("corrupted input checksum or length in BGZF block", g.inf); } - - /* insert write job in list in sorted order, alert write thread */ - possess(write_first); - prior = &write_head; - while ((here = *prior) != NULL) { - if (here->seq > job->seq) - break; - prior = &(here->next); + + if (g.decode == 1) { + + /* insert write job in list in sorted order, alert write thread */ + possess(write_first); + prior = &write_head; + while ((here = *prior) != NULL) { + if (here->seq > job->seq) + break; + prior = &(here->next); + } + job->next = here; + *prior = job; + twist(write_first, TO, write_head->seq); + + } else { + drop_space(job->in); + drop_space(job->out); + FREE(job); } - job->next = here; - *prior = job; - twist(write_first, TO, write_head->seq); } if (inflateEnd(&strm) != Z_OK) { @@ -3219,8 +3231,10 @@ local void infchk(void) if (g.form == 0 && g.bgzf && g.procs > 1) { /* This is a BGZF gzip file, decompress all blocks in parallel until the form changes */ setup_jobs(); - /* start write thread */ - writeth = launch(write_thread, NULL); + if (g.decode == 1) { + /* start write thread */ + writeth = launch(write_thread, NULL); + } seq = 0; job = NULL; @@ -3244,6 +3258,12 @@ local void infchk(void) assert(input->len == 0); job->in = input; job->out = NULL; + job->more = 1; + job->seq = seq; + Trace(("-- read #%ld%s", "")); + if (++seq < 1) { + bail("input too long: ", g.inf); + } /* read block_len from g.in_next, modify g.in_left */ /* instead of reading directly: job->in->len += readn(g.ind, job->in->buf, block_len); */ @@ -3264,14 +3284,8 @@ local void infchk(void) if (input->len != (size_t) g.bgzf_bsize) { bail("incomplete BGZF gzip -- reached eof", ""); } - job->more = 1; - + /* preparation of job is complete */ - job->seq = seq; - Trace(("-- read #%ld%s", "")); - if (++seq < 1) { - bail("input too long: ", g.inf); - } /* start another uncompress thread if needed */ if ((unsigned long) cthreads < seq && cthreads < g.procs) { @@ -3295,10 +3309,12 @@ local void infchk(void) } while (was == 0 && ret == 8 && g.form == 0 && g.bgzf && g.bgzf_bsize); - /* wait for the write thread to complete (we leave the compress threads out - there and waiting in case there is another stream to compress) */ - join(writeth); - writeth = NULL; + if (g.decode == 1) { + /* wait for the write thread to complete (we leave the compress threads out + there and waiting in case there is another stream to compress) */ + join(writeth); + writeth = NULL; + } if ( ret != 8 ) { return; From a9c6e3a25b5f1de32fc2ee7994b11dcb8768acaf Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sat, 15 Nov 2014 21:44:09 -0800 Subject: [PATCH 11/14] fixed spacing --- pigz.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/pigz.c b/pigz.c index 423a3d9..09b1758 100644 --- a/pigz.c +++ b/pigz.c @@ -1551,10 +1551,10 @@ local void uncompress_thread(void *dummy) job = NULL; break; } - compress_head = job->next; - if (job->next == NULL) - compress_tail = &compress_head; + if (job->next == NULL) { + compress_tail = &compress_head; + } twist(compress_have, BY, -1); /* get output buffer space */ @@ -1590,10 +1590,12 @@ local void uncompress_thread(void *dummy) out->len = len; incheck = 0; for(i=0; i<4;i++) { - incheck += (unsigned) *(strm.next_in++) << (8*i); } + incheck += (unsigned) *(strm.next_in++) << (8*i); + } inlen = 0; for(i=0; i<4; i++) { - inlen += (unsigned) *(strm.next_in++) << (8*i); } + inlen += (unsigned) *(strm.next_in++) << (8*i); + } strm.avail_in -= 8; assert(strm.avail_in == 0); From 82cc340dfd2cc3180cbe0128d56c4d939a7f8928 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Sat, 15 Nov 2014 22:34:45 -0800 Subject: [PATCH 12/14] fixed -t to wait for last BGZF block to be processed --- pigz.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pigz.c b/pigz.c index 09b1758..c3cb919 100644 --- a/pigz.c +++ b/pigz.c @@ -3254,6 +3254,7 @@ local void infchk(void) /* update input spaces */ block_len = g.bgzf_bsize; input = get_space(&in_pool); + while (input->size < block_len) { grow_space(input); } @@ -3316,6 +3317,11 @@ local void infchk(void) there and waiting in case there is another stream to compress) */ join(writeth); writeth = NULL; + } else { + /* wait for last block to be processed */ + possess(compress_have); + wait_for(compress_have, TO_BE, 0); + release(compress_have); } if ( ret != 8 ) { From 83cb87c836c6ea97ba4b27bd5ec858a01eab7171 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Mon, 17 Nov 2014 15:47:44 -0800 Subject: [PATCH 13/14] added extra checks for eof while reading and parsing gzip header structure --- pigz.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pigz.c b/pigz.c index c3cb919..c489862 100644 --- a/pigz.c +++ b/pigz.c @@ -2747,9 +2747,12 @@ local int get_header(int save) else SKIP(4); - /* skip extra field and OS */ + /* skip extra flags and OS */ SKIP(2); + if (g.in_eof) + return -3; + /* parse extra field, if present */ g.bgzf = g.bgzf_bsize = extra = 0; /* assume it is NOT BGZF */ if (flags & 4) { @@ -2761,6 +2764,8 @@ local int get_header(int save) g.bgzf_bsize = 0; while (count > 0) { c1 = GET(); c2 = GET(); field_len = GET2(); + if (g.in_eof) + return -3; if (c1 == 'B' && c2 == 'C' && field_len == 2) { /* read the BGZF block size value */ g.bgzf = 1; @@ -2768,6 +2773,8 @@ local int get_header(int save) } else { SKIP(field_len); } + if (g.in_eof) + return -3; count -= 4 + field_len; } } @@ -2822,6 +2829,9 @@ local int get_header(int save) count += 2; } + if (g.in_eof) + return -3; + if (g.bgzf && g.bgzf_bsize) { // subtract header bytes already read, but include trailer bytes at the end... assert(g.bgzf_bsize > count + BGZF_FOOTER_SIZE - 1); From 8c92aa359ab6827ecd314dd3cfbeba5d8bcd38a4 Mon Sep 17 00:00:00 2001 From: Rob Egan Date: Mon, 17 Nov 2014 22:40:19 -0800 Subject: [PATCH 14/14] fixed 0 byte input file and -p 1 compress added more tests for edge cases and gzip compatibility --- Makefile | 72 +++++++++++++++++++++++++++++++++++-- pigz.c | 106 +++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 136 insertions(+), 42 deletions(-) diff --git a/Makefile b/Makefile index 7af523b..49c899b 100644 --- a/Makefile +++ b/Makefile @@ -45,7 +45,68 @@ pigzn: pigzn.o zopfli/deflate.o zopfli/blocksplitter.o zopfli/tree.o zopfli/lz77 pigzn.o: pigz.c $(CC) $(CFLAGS) -DDEBUG -DNOTHREAD -g -c -o pigzn.o pigz.c -test: pigz +# +# set up pattern rules for tests +# + +LONG_NAME = VeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongNameVeryLongName + +TESTFILES = $(addprefix .testfile-, empty pigz.c largefile $(LONG_NAME) ) + +.testfile-empty: + cat /dev/null > $@ + +.testfile-pigz.c: pigz.c + cp pigz.c $@ + +.testfile-largefile: pigz.c + number=1 ; while [[ $$number -le 100 ]] ; do cat pigz.c >> $@ ; ((number = number + 1)) ; done + +.testfile-$(LONG_NAME): .testfile-largefile + cp $< $@ + +TEST_OPTIONS = .gz .b32.gz .1.gz .B.gz .B1.gz .gz2.gz .B-gz2.gz + +.testfile-%.gz : .testfile-% pigz + ./pigz -kf $< && touch $@ + +.testfile-%.b32.gz : .testfile-% pigz + ./pigz -kfb 32 $< && mv $<.gz $@ && touch $@ + +.testfile-%.1.gz : .testfile-% pigz + ./pigz -kfp 1 $< && mv $<.gz $@ && touch $@ + +.testfile-%.B.gz : .testfile-% pigz + ./pigz -kfB $< && mv $<.gz $@ && touch $@ + +.testfile-%.B1.gz : .testfile-% pigz + ./pigz -kfBp 1 $< && mv $<.gz $@ && touch $@ + +.testfile-%.gz2.gz : .testfile-% pigz + ./pigz -kf $< && ./pigz -kf $<.gz && mv $<.gz.gz $@ && touch $@ + +.testfile-%.B-gz2.gz : .testfile-% pigz + ./pigz -kfB $< && ./pigz -kfB $<.gz && mv $<.gz.gz $@ && touch $@ + +.test% : %.gz pigz + ./pigz -t $< + ./pigz -tp 1 $< + gzip -t $< + ./pigz -dc $< > $@.out && diff -q $@.out $* + ./pigz -dcp 1 $< > $@.out && diff -q $@.out $* + gzip -dc $< > $@.out && diff -q $@.out $* + +TESTFILES_GZ = $(foreach option, $(TEST_OPTIONS), $(addsuffix $(option), $(TESTFILES)) ) + +testfiles : $(TESTFILES) $(TESTFILES_GZ) + +TESTS = $(addprefix .test, $(TESTFILES_GZ)) + +.SECONDARY : $(TESTFILES) $(TESTFILES_GZ) $(TESTS) + +moretests : pigz $(TESTS) + +test: pigz moretests ./pigz -kf pigz.c ; ./pigz -t pigz.c.gz ./pigz -kfb 32 pigz.c ; ./pigz -t pigz.c.gz ./pigz -kfp 1 pigz.c ; ./pigz -t pigz.c.gz @@ -62,10 +123,15 @@ test: pigz echo 'compress -f < pigz.c | ./unpigz | cmp - pigz.c' ;\ compress -f < pigz.c | ./unpigz | cmp - pigz.c ;\ fi + ./pigz -cp 1 < /dev/null | ./pigz -t - + ./pigz -c < /dev/null | ./pigz -t - + ./pigz -kfB -p 1 pigz.c \ + && ./pigz -d -c -p 1 pigz.c.gz | diff -q - pigz.c \ + && ./pigz -d -c -p 2 pigz.c.gz | diff -q - pigz.c ./pigz -kfB pigz.c \ && ./pigz -d -c -p 1 pigz.c.gz | diff -q - pigz.c \ && ./pigz -d -c -p 2 pigz.c.gz | diff -q - pigz.c - @rm -f pigz.c.gz pigz.c.zz pigz.c.zip + @rm -f pigz.c.gz pigz.c.zz pigz.c.zip .test* tests: dev test ./pigzn -kf pigz.c ; ./pigz -t pigz.c.gz @@ -77,4 +143,4 @@ pigz.pdf: pigz.1 groff -mandoc -f H -T ps pigz.1 | ps2pdf - pigz.pdf clean: - @rm -f *.o zopfli/*.o pigz unpigz pigzn pigzt pigz.c.gz pigz.c.zz pigz.c.zip + @rm -f *.o zopfli/*.o pigz unpigz pigzn pigzt pigz.c.gz pigz.c.zz pigz.c.zip .test* diff --git a/pigz.c b/pigz.c index c489862..fbedeab 100644 --- a/pigz.c +++ b/pigz.c @@ -892,12 +892,12 @@ local unsigned long time2dos(time_t t) #define PUT4M(a,b) (*(a)=(b)>>24,(a)[1]=(b)>>16,(a)[2]=(b)>>8,(a)[3]=(b)) /* write a gzip, zlib, or zip header using the information in the globals */ -local unsigned long put_header(void) +/* compressedLen is ignored unless this is a BGZF formated gzip */ +local unsigned long put_header(long compressedLen) { unsigned long len; unsigned char head[30]; unsigned int bsize; /* for bgzf */ - if (g.form > 1) { /* zip */ /* write local header */ PUT4L(head, 0x04034b50UL); /* local header signature */ @@ -947,36 +947,48 @@ local unsigned long put_header(void) PUT4L(head + 4, g.mtime); head[8] = g.level >= 9 ? 2 : (g.level == 1 ? 4 : 0); head[9] = 3; /* unix */ + if (g.bgzf) { + assert(compressedLen > 0); + head[3] |= 4; /* include FEXTRA */ - head[9] = 255; /* unknown */ PUT2L(head+10, 6); /* 6 byte XLEN */ head[12] = 'B'; /* BC tag */ head[13] = 'C'; PUT2L(head+14, 2); /* 2 byte BSIZE field */ - len = 16; - writen(g.outd, head, len); - /* Leave 2 bytes (uint16_t) BSIZE unwritten for now */ - - if (g.name) { /* write a special no contents block to include the FNAME field */ - - /* write 2-byte BSIZE entry now */ - bsize = len+2 + strlen(g.name)+1 + 2 + BGZF_FOOTER_SIZE - 1; - PUT2L(head+len, bsize); - writen(g.outd, head+len, 2); - len += 2; - - /* write name */ + + /* calculate BGZF bsize. Account for FNAME field, but FCOMMENT and FHCRC are never included */ + bsize = BGZF_HEADER_SIZE + (g.name == NULL ? 0 : strlen(g.name) + 1) + compressedLen + BGZF_FOOTER_SIZE - 1; + + if (bsize >= BGZF_MAX_BLOCK_SIZE - 1) { + assert(g.name != NULL); + /* Including name field will exceed the capacity of the BGZF block. + * write name now, then empty deflate stream then trailer followed by a new header + */ + bsize = BGZF_HEADER_SIZE + (g.name == NULL ? 0 : strlen(g.name) + 1) + 2 + BGZF_FOOTER_SIZE - 1; + PUT2L(head+16, (uint16_t) (bsize)); + writen(g.outd, head, BGZF_HEADER_SIZE); writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); - - /* empty zero-length Z_FINISH-ed compressed stream */ - head[len] = 3; head[len+1] = 0; - write(g.outd, head+len, 2); - - /* add the name size+1 */ - len += strlen(g.name) + 1; - g.name = NULL; /* but do not write the name ever again! */ + g.name = NULL; /* do not include name again */ + /* empty deflate Z_FINISH-ed deflate stream (2 bytes 3, 0 */ + head[16] = 3; + head[17] = 0; + /* 0 check and 0 uncompressed length */ + head[18] = head[19] = head[20] = head[21] = 0; /* trailer check */ + head[22] = head[23] = head[24] = head[25] = 0; /* trailer ulen */ + writen(g.outd, head+16, 2+4+4); + return put_header(compressedLen); } + + assert( bsize > BGZF_HEADER_SIZE + BGZF_FOOTER_SIZE && bsize < BGZF_MAX_BLOCK_SIZE ); + assert( (head[3] & 2) == 0 && (head[3] & 16) == 0); + + /* write 2-byte BSIZE field */ + PUT2L(head+16, (uint16_t) (bsize)); + + len = 18; + writen(g.outd, head, len); + } else { writen(g.outd, head, 10); len = 10; @@ -984,6 +996,9 @@ local unsigned long put_header(void) if (g.name != NULL) { writen(g.outd, (unsigned char *)g.name, strlen(g.name) + 1); len += strlen(g.name) + 1; + if (g.bgzf) { + g.name = NULL; /* do not write this name again for BGZF */ + } } } return len; @@ -1070,19 +1085,16 @@ local void put_trailer(unsigned long ulen, unsigned long clen, local void put_bgzf_trailer_and_header(unsigned long *ulen, unsigned long *clen, unsigned long *check, unsigned long *head) { - unsigned char bsize_buf[2]; - /* finish this block */ - put_trailer(*ulen, *clen, *check, *head); - + assert( g.bgzf ); + if (*head > 0) { + /* finish this block */ + put_trailer(*ulen, *clen, *check, *head); + } // else this is the first block header... + /* write new header for new block */ - *head = put_header(); + *head = put_header(*clen); - /* write 2-byte BSIZE field */ - *head += 2; - assert( *head >= BGZF_HEADER_SIZE ); - assert( *clen + *head + BGZF_FOOTER_SIZE <= BGZF_BLOCK_SIZE ); - PUT2L(bsize_buf, (uint16_t) (*clen + *head + BGZF_FOOTER_SIZE - 1)); - writen(g.outd, bsize_buf, 2); + assert( *clen + *head + BGZF_FOOTER_SIZE <= BGZF_MAX_BLOCK_SIZE ); /* reset counting */ *ulen = *clen = 0; @@ -1881,7 +1893,12 @@ local void write_thread(void *dummy) /* build and write header */ Trace(("-- write thread running")); if (!g.decode) { - head = put_header(); + if (g.bgzf) { + // BGZF will write header just in time. + head = 0; + } else { + head = put_header(0); + } } /* process output of compress threads until end of input */ ulen = clen = 0; @@ -1934,6 +1951,9 @@ local void write_thread(void *dummy) if (!g.decode) { /* write trailer */ + if (head == 0) { + head = put_header(0); + } put_trailer(ulen, clen, check, head); if (g.bgzf) { /* write the final trailing EOF block */ @@ -2238,7 +2258,12 @@ local void single_compress(int reset) } /* write header */ - head = put_header(); + if (g.bgzf) { + // BGZF will write header just in time + head = 0; + } else { + head = put_header(0); + } /* set compression level in case it changed */ if (g.level <= 9) { @@ -2423,14 +2448,17 @@ local void single_compress(int reset) } while (more || got); /* write trailer */ + if (head == 0) { + head = put_header(0); + } + if (g.bgzf) { - /* write the final trailing EOF block */ + /* write the final BGZF EOF block */ put_trailer(last_ulen, clen, last_check, head); writen(g.outd, (unsigned char*) bgzf_eof, 28); } else { put_trailer(ulen, clen, check, head); } - } /* --- decompression --- */