diff --git a/Package.swift b/Package.swift index 2cfe1e2f..9893420c 100644 --- a/Package.swift +++ b/Package.swift @@ -223,6 +223,16 @@ let package = Package( .target( name: "DEFLATE", + exclude: [ + // better cross-platform compatibility if we remove gzip + // support for now, gzip builds fine for macOS and even + // on iOS -- it is only when a user chooses to build an + // archive using one of the generic devices, that makes + // building gzip more complicated. + "crc32.c", + "gzip_compress.c", + "gzip_decompress.c", + ], publicHeadersPath: "include", cxxSettings: [ .headerSearchPath("."), diff --git a/Sources/DEFLATE/adler32.c b/Sources/DEFLATE/adler32.c index 3aaa7efb..c6085c49 100644 --- a/Sources/DEFLATE/adler32.c +++ b/Sources/DEFLATE/adler32.c @@ -35,55 +35,87 @@ * of s2 overflowing when it is represented as an unsigned 32-bit integer. This * value was computed using the following Python script: * - * divisor = 65521 - * count = 0 - * s1 = divisor - 1 - * s2 = divisor - 1 - * while True: - * s1 += 0xFF - * s2 += s1 - * if s2 > 0xFFFFFFFF: - * break - * count += 1 - * print(count) + * divisor = 65521 + * count = 0 + * s1 = divisor - 1 + * s2 = divisor - 1 + * while True: + * s1 += 0xFF + * s2 += s1 + * if s2 > 0xFFFFFFFF: + * break + * count += 1 + * print(count) * * Note that to get the correct worst-case value, we must assume that every byte * has value 0xFF and that s1 and s2 started with the highest possible values * modulo the divisor. */ -#define MAX_CHUNK_LEN 5552 +#define MAX_CHUNK_LEN 5552 + +/* + * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n, + * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither + * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes + * already processed after the last reduction must not exceed MAX_CHUNK_LEN. + * + * This uses only portable C code. This is used as a fallback when a vectorized + * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform. + * + * Some of the vectorized implementations also use this to handle the end of the + * data when the data isn't evenly divisible by the length the vectorized code + * works on. To avoid compiler errors about target-specific option mismatches + * when this is used in that way, this is a macro rather than a function. + * + * Although this is unvectorized, this does include an optimization where the + * main loop processes four bytes at a time using a strategy similar to that + * used by vectorized implementations. This provides increased instruction- + * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'. 
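+ *
+ * As a sketch of why the closing multiply-accumulate is equivalent to the
+ * traditional byte-at-a-time recurrence: one unrolled iteration that starts
+ * from (s1, s2) and reads bytes b0..b3 computes
+ *
+ *     s1' = s1 + b0 + b1 + b2 + b3
+ *     s2' = s2 + 4*s1 + 4*b0 + 3*b1 + 2*b2 + 1*b3
+ *
+ * so summing the contributions of all iterations gives
+ * s2 += 4*(s1_sum + byte_0_sum) + 3*byte_1_sum + 2*byte_2_sum + byte_3_sum,
+ * which is exactly the update the macro performs after its main loop.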
+ */ +#define ADLER32_CHUNK(s1, s2, p, n) \ +do { \ +if (n >= 4) { \ +u32 s1_sum = 0; \ +u32 byte_0_sum = 0; \ +u32 byte_1_sum = 0; \ +u32 byte_2_sum = 0; \ +u32 byte_3_sum = 0; \ +\ +do { \ +s1_sum += s1; \ +s1 += p[0] + p[1] + p[2] + p[3]; \ +byte_0_sum += p[0]; \ +byte_1_sum += p[1]; \ +byte_2_sum += p[2]; \ +byte_3_sum += p[3]; \ +p += 4; \ +n -= 4; \ +} while (n >= 4); \ +s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \ +(2 * byte_2_sum) + byte_3_sum; \ +} \ +for (; n; n--, p++) { \ +s1 += *p; \ +s2 += s1; \ +} \ +s1 %= DIVISOR; \ +s2 %= DIVISOR; \ +} while (0) static u32 MAYBE_UNUSED adler32_generic(u32 adler, const u8 *p, size_t len) { - u32 s1 = adler & 0xFFFF; - u32 s2 = adler >> 16; - const u8 * const end = p + len; - - while (p != end) { - size_t chunk_len = MIN(end - p, MAX_CHUNK_LEN); - const u8 *chunk_end = p + chunk_len; - size_t num_unrolled_iterations = chunk_len / 4; - - while (num_unrolled_iterations--) { - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - } - while (p != chunk_end) { - s1 += *p++; - s2 += s1; - } - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - return (s2 << 16) | s1; + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + while (len) { + size_t n = MIN(len, MAX_CHUNK_LEN & ~3); + + len -= n; + ADLER32_CHUNK(s1, s2, p, n); + } + + return (s2 << 16) | s1; } /* Include architecture-specific implementation(s) if available. */ @@ -91,7 +123,7 @@ adler32_generic(u32 adler, const u8 *p, size_t len) #undef arch_select_adler32_func typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len); #if defined(ARCH_ARM32) || defined(ARCH_ARM64) -# include "adler32_impl.h" +# include "arm/adler32_impl.h" #elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/adler32_impl.h" #endif @@ -108,13 +140,13 @@ static volatile adler32_func_t adler32_impl = dispatch_adler32; /* Choose the best implementation at runtime. */ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) { - adler32_func_t f = arch_select_adler32_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - adler32_impl = f; - return f(adler, p, len); + adler32_func_t f = arch_select_adler32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + adler32_impl = f; + return f(adler, p, len); } #else /* The best implementation is statically known, so call it directly. */ @@ -124,7 +156,7 @@ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) LIBDEFLATEAPI u32 libdeflate_adler32(u32 adler, const void *buffer, size_t len) { - if (buffer == NULL) /* Return initial value. */ - return 1; - return adler32_impl(adler, buffer, len); + if (buffer == NULL) /* Return initial value. 
*/ + return 1; + return adler32_impl(adler, buffer, len); } diff --git a/Sources/DEFLATE/adler32_impl.h b/Sources/DEFLATE/adler32_impl.h deleted file mode 100644 index 865547b8..00000000 --- a/Sources/DEFLATE/adler32_impl.h +++ /dev/null @@ -1,272 +0,0 @@ -/* - * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef LIB_ARM_ADLER32_IMPL_H -#define LIB_ARM_ADLER32_IMPL_H - -#include "cpu_features.h" - -/* Regular NEON implementation */ -#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() -# define adler32_neon adler32_neon -# define FUNCNAME adler32_neon -# define FUNCNAME_CHUNK adler32_neon_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_LEN 64 -/* Prevent unsigned overflow of the 16-bit precision byte counters */ -# define IMPL_MAX_CHUNK_LEN (64 * (0xFFFF / 0xFF)) -# if HAVE_NEON_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("fpu=neon") -# else -# define ATTRIBUTES _target_attribute("+simd") -# endif -# endif -# include -static forceinline ATTRIBUTES void -adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, - u32 *s1, u32 *s2) -{ - static const u16 _aligned_attribute(16) mults[64] = { - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const uint16x8_t mults_a = vld1q_u16(&mults[0]); - const uint16x8_t mults_b = vld1q_u16(&mults[8]); - const uint16x8_t mults_c = vld1q_u16(&mults[16]); - const uint16x8_t mults_d = vld1q_u16(&mults[24]); - const uint16x8_t mults_e = vld1q_u16(&mults[32]); - const uint16x8_t mults_f = vld1q_u16(&mults[40]); - const uint16x8_t mults_g = vld1q_u16(&mults[48]); - const uint16x8_t mults_h = vld1q_u16(&mults[56]); - - uint32x4_t v_s1 = vdupq_n_u32(0); - uint32x4_t v_s2 = vdupq_n_u32(0); - /* - * v_byte_sums_* contain the sum of the bytes at index i across all - * 64-byte segments, for each index 0..63. 
- */ - uint16x8_t v_byte_sums_a = vdupq_n_u16(0); - uint16x8_t v_byte_sums_b = vdupq_n_u16(0); - uint16x8_t v_byte_sums_c = vdupq_n_u16(0); - uint16x8_t v_byte_sums_d = vdupq_n_u16(0); - uint16x8_t v_byte_sums_e = vdupq_n_u16(0); - uint16x8_t v_byte_sums_f = vdupq_n_u16(0); - uint16x8_t v_byte_sums_g = vdupq_n_u16(0); - uint16x8_t v_byte_sums_h = vdupq_n_u16(0); - - do { - /* Load the next 64 bytes. */ - const uint8x16_t bytes1 = *p++; - const uint8x16_t bytes2 = *p++; - const uint8x16_t bytes3 = *p++; - const uint8x16_t bytes4 = *p++; - uint16x8_t tmp; - - /* - * Accumulate the previous s1 counters into the s2 counters. - * The needed multiplication by 64 is delayed to later. - */ - v_s2 = vaddq_u32(v_s2, v_s1); - - /* - * Add the 64 bytes to their corresponding v_byte_sums counters, - * while also accumulating the sums of each adjacent set of 4 - * bytes into v_s1. - */ - tmp = vpaddlq_u8(bytes1); - v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1)); - v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1)); - tmp = vpadalq_u8(tmp, bytes2); - v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2)); - v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2)); - tmp = vpadalq_u8(tmp, bytes3); - v_byte_sums_e = vaddw_u8(v_byte_sums_e, vget_low_u8(bytes3)); - v_byte_sums_f = vaddw_u8(v_byte_sums_f, vget_high_u8(bytes3)); - tmp = vpadalq_u8(tmp, bytes4); - v_byte_sums_g = vaddw_u8(v_byte_sums_g, vget_low_u8(bytes4)); - v_byte_sums_h = vaddw_u8(v_byte_sums_h, vget_high_u8(bytes4)); - v_s1 = vpadalq_u16(v_s1, tmp); - - } while (p != end); - - /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */ -#ifdef ARCH_ARM32 -# define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) -#else -# define umlal2 vmlal_high_u16 -#endif - v_s2 = vqshlq_n_u32(v_s2, 6); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), vget_low_u16(mults_a)); - v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), vget_low_u16(mults_b)); - v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), vget_low_u16(mults_c)); - v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), vget_low_u16(mults_d)); - v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), vget_low_u16(mults_e)); - v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), vget_low_u16(mults_f)); - v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), vget_low_u16(mults_g)); - v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), vget_low_u16(mults_h)); - v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h); -#undef umlal2 - - /* Horizontal sum to finish up */ -#ifdef ARCH_ARM32 - *s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + - vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); - *s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + - vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); -#else - *s1 += vaddvq_u32(v_s1); - *s2 += vaddvq_u32(v_s2); -#endif -} -# include "adler32_vec_template.h" -#endif /* Regular NEON implementation */ - -/* NEON+dotprod implementation */ -#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() -# define adler32_neon_dotprod adler32_neon_dotprod -# define FUNCNAME adler32_neon_dotprod -# define FUNCNAME_CHUNK adler32_neon_dotprod_chunk -# define IMPL_ALIGNMENT 16 -# define 
IMPL_SEGMENT_LEN 64 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN -# if HAVE_DOTPROD_NATIVE -# define ATTRIBUTES -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("dotprod") - /* - * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the - * default target is armv8.3-a or later in which case it must be omitted. - * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. - */ -# elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES _target_attribute("+dotprod") -# else -# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") -# endif -# endif -# include -static forceinline ATTRIBUTES void -adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(16) mults[64] = { - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const uint8x16_t mults_a = vld1q_u8(&mults[0]); - const uint8x16_t mults_b = vld1q_u8(&mults[16]); - const uint8x16_t mults_c = vld1q_u8(&mults[32]); - const uint8x16_t mults_d = vld1q_u8(&mults[48]); - const uint8x16_t ones = vdupq_n_u8(1); - uint32x4_t v_s1_a = vdupq_n_u32(0); - uint32x4_t v_s1_b = vdupq_n_u32(0); - uint32x4_t v_s1_c = vdupq_n_u32(0); - uint32x4_t v_s1_d = vdupq_n_u32(0); - uint32x4_t v_s2_a = vdupq_n_u32(0); - uint32x4_t v_s2_b = vdupq_n_u32(0); - uint32x4_t v_s2_c = vdupq_n_u32(0); - uint32x4_t v_s2_d = vdupq_n_u32(0); - uint32x4_t v_s1_sums_a = vdupq_n_u32(0); - uint32x4_t v_s1_sums_b = vdupq_n_u32(0); - uint32x4_t v_s1_sums_c = vdupq_n_u32(0); - uint32x4_t v_s1_sums_d = vdupq_n_u32(0); - uint32x4_t v_s1; - uint32x4_t v_s2; - uint32x4_t v_s1_sums; - - do { - uint8x16_t bytes_a = *p++; - uint8x16_t bytes_b = *p++; - uint8x16_t bytes_c = *p++; - uint8x16_t bytes_d = *p++; - - v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); - v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones); - v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a); - - v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); - v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones); - v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b); - - v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); - v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones); - v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c); - - v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); - v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones); - v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d); - } while (p != end); - - v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), vaddq_u32(v_s1_c, v_s1_d)); - v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), vaddq_u32(v_s2_c, v_s2_d)); - v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, v_s1_sums_b), - vaddq_u32(v_s1_sums_c, v_s1_sums_d)); - v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); - - *s1 += vaddvq_u32(v_s1); - *s2 += vaddvq_u32(v_s2); -} -# include "adler32_vec_template.h" -#endif /* NEON+dotprod implementation */ - -#if defined(adler32_neon_dotprod) && HAVE_DOTPROD_NATIVE -#define DEFAULT_IMPL adler32_neon_dotprod -#else -static inline adler32_func_t -arch_select_adler32_func(void) -{ - const u32 features MAYBE_UNUSED = get_arm_cpu_features(); - -#ifdef adler32_neon_dotprod - if (HAVE_NEON(features) && HAVE_DOTPROD(features)) - return adler32_neon_dotprod; -#endif -#ifdef adler32_neon - if (HAVE_NEON(features)) - return adler32_neon; -#endif - return NULL; -} -#define arch_select_adler32_func arch_select_adler32_func -#endif - -#endif /* 
LIB_ARM_ADLER32_IMPL_H */ diff --git a/Sources/DEFLATE/adler32_vec_template.h b/Sources/DEFLATE/adler32_vec_template.h deleted file mode 100644 index 98c086bb..00000000 --- a/Sources/DEFLATE/adler32_vec_template.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * adler32_vec_template.h - template for vectorized Adler-32 implementations - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This file contains a template for vectorized Adler-32 implementations. - * - * The inner loop between reductions modulo 65521 of an unvectorized Adler-32 - * implementation looks something like this: - * - * do { - * s1 += *p; - * s2 += s1; - * } while (++p != chunk_end); - * - * For vectorized calculation of s1, we only need to sum the input bytes. They - * can be accumulated into multiple counters which are eventually summed - * together. - * - * For vectorized calculation of s2, the basic idea is that for each iteration - * that processes N bytes, we can perform the following vectorizable - * calculation: - * - * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N - * - * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N - * separate counters, then do the multiplications by N...1 just once at the end - * rather than once per iteration. - * - * Also, we must account for how previous bytes will affect s2 by doing the - * following at beginning of each iteration: - * - * s2 += s1 * N - * - * Furthermore, like s1, "s2" can actually be multiple counters which are - * eventually summed together. - */ - -static u32 ATTRIBUTES MAYBE_UNUSED -FUNCNAME(u32 adler, const u8 *p, size_t len) -{ - const size_t max_chunk_len = - MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) - - (MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) % IMPL_SEGMENT_LEN); - u32 s1 = adler & 0xFFFF; - u32 s2 = adler >> 16; - const u8 * const end = p + len; - const u8 *vend; - - /* Process a byte at a time until the needed alignment is reached. */ - if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) { - do { - s1 += *p++; - s2 += s1; - } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* - * Process "chunks" of bytes using vector instructions. Chunk lengths - * are limited to MAX_CHUNK_LEN, which guarantees that s1 and s2 never - * overflow before being reduced modulo DIVISOR. 
For vector processing, - * chunk lengths are also made evenly divisible by IMPL_SEGMENT_LEN and - * may be further limited to IMPL_MAX_CHUNK_LEN. - */ - STATIC_ASSERT(IMPL_SEGMENT_LEN % IMPL_ALIGNMENT == 0); - vend = end - ((size_t)(end - p) % IMPL_SEGMENT_LEN); - while (p != vend) { - size_t chunk_len = MIN((size_t)(vend - p), max_chunk_len); - - s2 += s1 * chunk_len; - - FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_len), - &s1, &s2); - - p += chunk_len; - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* Process any remaining bytes. */ - if (p != end) { - do { - s1 += *p++; - s2 += s1; - } while (p != end); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - return (s2 << 16) | s1; -} - -#undef FUNCNAME -#undef FUNCNAME_CHUNK -#undef ATTRIBUTES -#undef IMPL_ALIGNMENT -#undef IMPL_SEGMENT_LEN -#undef IMPL_MAX_CHUNK_LEN diff --git a/Sources/DEFLATE/arm/adler32_impl.h b/Sources/DEFLATE/arm/adler32_impl.h new file mode 100644 index 00000000..c8892d47 --- /dev/null +++ b/Sources/DEFLATE/arm/adler32_impl.h @@ -0,0 +1,358 @@ +/* + * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIB_ARM_ADLER32_IMPL_H +#define LIB_ARM_ADLER32_IMPL_H + +#include "cpu_features.h" + +/* Regular NEON implementation */ +#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() +# define adler32_arm_neon adler32_arm_neon +# if HAVE_NEON_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=neon") +# else +# define ATTRIBUTES _target_attribute("+simd") +# endif +# endif +# include +static u32 ATTRIBUTES MAYBE_UNUSED +adler32_arm_neon(u32 adler, const u8 *p, size_t len) +{ + static const u16 _aligned_attribute(16) mults[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const uint16x8_t mults_a = vld1q_u16(&mults[0]); + const uint16x8_t mults_b = vld1q_u16(&mults[8]); + const uint16x8_t mults_c = vld1q_u16(&mults[16]); + const uint16x8_t mults_d = vld1q_u16(&mults[24]); + const uint16x8_t mults_e = vld1q_u16(&mults[32]); + const uint16x8_t mults_f = vld1q_u16(&mults[40]); + const uint16x8_t mults_g = vld1q_u16(&mults[48]); + const uint16x8_t mults_h = vld1q_u16(&mults[56]); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the misaligned load penalty. + */ + if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & 15); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~63); + + len -= n; + + if (n >= 64) { + uint32x4_t v_s1 = vdupq_n_u32(0); + uint32x4_t v_s2 = vdupq_n_u32(0); + /* + * v_byte_sums_* contain the sum of the bytes at index i + * across all 64-byte segments, for each index 0..63. + */ + uint16x8_t v_byte_sums_a = vdupq_n_u16(0); + uint16x8_t v_byte_sums_b = vdupq_n_u16(0); + uint16x8_t v_byte_sums_c = vdupq_n_u16(0); + uint16x8_t v_byte_sums_d = vdupq_n_u16(0); + uint16x8_t v_byte_sums_e = vdupq_n_u16(0); + uint16x8_t v_byte_sums_f = vdupq_n_u16(0); + uint16x8_t v_byte_sums_g = vdupq_n_u16(0); + uint16x8_t v_byte_sums_h = vdupq_n_u16(0); + + s2 += s1 * (n & ~63); + + do { + /* Load the next 64 data bytes. */ + const uint8x16_t data_a = vld1q_u8(p + 0); + const uint8x16_t data_b = vld1q_u8(p + 16); + const uint8x16_t data_c = vld1q_u8(p + 32); + const uint8x16_t data_d = vld1q_u8(p + 48); + uint16x8_t tmp; + + /* + * Accumulate the previous s1 counters into the + * s2 counters. The needed multiplication by 64 + * is delayed to later. + */ + v_s2 = vaddq_u32(v_s2, v_s1); + + /* + * Add the 64 data bytes to their v_byte_sums + * counters, while also accumulating the sums of + * each adjacent set of 4 bytes into v_s1. 
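+ *
+ * (vpaddlq_u8 pairwise-adds adjacent u8 lanes into u16 lanes;
+ * vpadalq_u8 does the same but also accumulates into its first
+ * operand; the final vpadalq_u16 then folds those u16 pair sums
+ * into the u32 lanes of v_s1.)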
+ */ + tmp = vpaddlq_u8(data_a); + v_byte_sums_a = vaddw_u8(v_byte_sums_a, + vget_low_u8(data_a)); + v_byte_sums_b = vaddw_u8(v_byte_sums_b, + vget_high_u8(data_a)); + tmp = vpadalq_u8(tmp, data_b); + v_byte_sums_c = vaddw_u8(v_byte_sums_c, + vget_low_u8(data_b)); + v_byte_sums_d = vaddw_u8(v_byte_sums_d, + vget_high_u8(data_b)); + tmp = vpadalq_u8(tmp, data_c); + v_byte_sums_e = vaddw_u8(v_byte_sums_e, + vget_low_u8(data_c)); + v_byte_sums_f = vaddw_u8(v_byte_sums_f, + vget_high_u8(data_c)); + tmp = vpadalq_u8(tmp, data_d); + v_byte_sums_g = vaddw_u8(v_byte_sums_g, + vget_low_u8(data_d)); + v_byte_sums_h = vaddw_u8(v_byte_sums_h, + vget_high_u8(data_d)); + v_s1 = vpadalq_u16(v_s1, tmp); + + p += 64; + n -= 64; + } while (n >= 64); + + /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */ +#ifdef ARCH_ARM32 +# define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) +#else +# define umlal2 vmlal_high_u16 +#endif + v_s2 = vqshlq_n_u32(v_s2, 6); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), + vget_low_u16(mults_a)); + v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), + vget_low_u16(mults_b)); + v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), + vget_low_u16(mults_c)); + v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), + vget_low_u16(mults_d)); + v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), + vget_low_u16(mults_e)); + v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), + vget_low_u16(mults_f)); + v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), + vget_low_u16(mults_g)); + v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), + vget_low_u16(mults_h)); + v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h); +#undef umlal2 + + /* Horizontal sum to finish up */ +#ifdef ARCH_ARM32 + s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + + vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); + s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + + vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); +#else + s1 += vaddvq_u32(v_s1); + s2 += vaddvq_u32(v_s2); +#endif + } + /* + * Process the last 0 <= n < 64 bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. + */ + ADLER32_CHUNK(s1, s2, p, n); + } + return (s2 << 16) | s1; +} +#undef ATTRIBUTES +#endif /* Regular NEON implementation */ + +/* NEON+dotprod implementation */ +#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() +# define adler32_arm_neon_dotprod adler32_arm_neon_dotprod +# if HAVE_DOTPROD_NATIVE +# define ATTRIBUTES +# else +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("dotprod") +/* + * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the + * default target is armv8.3-a or later in which case it must be omitted. + * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. 
+ */ +# elif defined(__ARM_FEATURE_JCVT) +# define ATTRIBUTES _target_attribute("+dotprod") +# else +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") +# endif +# endif +# include +static u32 ATTRIBUTES +adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) +{ + static const u8 _aligned_attribute(16) mults[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const uint8x16_t mults_a = vld1q_u8(&mults[0]); + const uint8x16_t mults_b = vld1q_u8(&mults[16]); + const uint8x16_t mults_c = vld1q_u8(&mults[32]); + const uint8x16_t mults_d = vld1q_u8(&mults[48]); + const uint8x16_t ones = vdupq_n_u8(1); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the misaligned load penalty. + */ + if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & 15); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~63); + + len -= n; + + if (n >= 64) { + uint32x4_t v_s1_a = vdupq_n_u32(0); + uint32x4_t v_s1_b = vdupq_n_u32(0); + uint32x4_t v_s1_c = vdupq_n_u32(0); + uint32x4_t v_s1_d = vdupq_n_u32(0); + uint32x4_t v_s2_a = vdupq_n_u32(0); + uint32x4_t v_s2_b = vdupq_n_u32(0); + uint32x4_t v_s2_c = vdupq_n_u32(0); + uint32x4_t v_s2_d = vdupq_n_u32(0); + uint32x4_t v_s1_sums_a = vdupq_n_u32(0); + uint32x4_t v_s1_sums_b = vdupq_n_u32(0); + uint32x4_t v_s1_sums_c = vdupq_n_u32(0); + uint32x4_t v_s1_sums_d = vdupq_n_u32(0); + uint32x4_t v_s1; + uint32x4_t v_s2; + uint32x4_t v_s1_sums; + + s2 += s1 * (n & ~63); + + do { + uint8x16_t data_a = vld1q_u8(p + 0); + uint8x16_t data_b = vld1q_u8(p + 16); + uint8x16_t data_c = vld1q_u8(p + 32); + uint8x16_t data_d = vld1q_u8(p + 48); + + v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); + v_s1_a = vdotq_u32(v_s1_a, data_a, ones); + v_s2_a = vdotq_u32(v_s2_a, data_a, mults_a); + + v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); + v_s1_b = vdotq_u32(v_s1_b, data_b, ones); + v_s2_b = vdotq_u32(v_s2_b, data_b, mults_b); + + v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); + v_s1_c = vdotq_u32(v_s1_c, data_c, ones); + v_s2_c = vdotq_u32(v_s2_c, data_c, mults_c); + + v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); + v_s1_d = vdotq_u32(v_s1_d, data_d, ones); + v_s2_d = vdotq_u32(v_s2_d, data_d, mults_d); + + p += 64; + n -= 64; + } while (n >= 64); + + v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), + vaddq_u32(v_s1_c, v_s1_d)); + v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), + vaddq_u32(v_s2_c, v_s2_d)); + v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, + v_s1_sums_b), + vaddq_u32(v_s1_sums_c, + v_s1_sums_d)); + v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); + + s1 += vaddvq_u32(v_s1); + s2 += vaddvq_u32(v_s2); + } + /* + * Process the last 0 <= n < 64 bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. 
+ */ + ADLER32_CHUNK(s1, s2, p, n); + } + return (s2 << 16) | s1; +} +#undef ATTRIBUTES +#endif /* NEON+dotprod implementation */ + +#if defined(adler32_arm_neon_dotprod) && HAVE_DOTPROD_NATIVE +#define DEFAULT_IMPL adler32_arm_neon_dotprod +#else +static inline adler32_func_t +arch_select_adler32_func(void) +{ + const u32 features MAYBE_UNUSED = get_arm_cpu_features(); + +#ifdef adler32_arm_neon_dotprod + if (HAVE_NEON(features) && HAVE_DOTPROD(features)) + return adler32_arm_neon_dotprod; +#endif +#ifdef adler32_arm_neon + if (HAVE_NEON(features)) + return adler32_arm_neon; +#endif + return NULL; +} +#define arch_select_adler32_func arch_select_adler32_func +#endif + +#endif /* LIB_ARM_ADLER32_IMPL_H */ diff --git a/Sources/DEFLATE/arm/cpu_features.c b/Sources/DEFLATE/arm/cpu_features.c new file mode 100644 index 00000000..fdb2d7c4 --- /dev/null +++ b/Sources/DEFLATE/arm/cpu_features.c @@ -0,0 +1,212 @@ +/* + * arm/cpu_features.c - feature detection for ARM CPUs + * + * Copyright 2018 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * ARM CPUs don't have a standard way for unprivileged programs to detect CPU + * features. But an OS-specific way can be used when available. + */ + +#ifdef __APPLE__ +# undef _ANSI_SOURCE +# undef _DARWIN_C_SOURCE +# define _DARWIN_C_SOURCE /* for sysctlbyname() */ +#endif + +#include "../cpu_features_common.h" /* must be included first */ +#include "cpu_features.h" + +#if HAVE_DYNAMIC_ARM_CPU_FEATURES + +#ifdef __linux__ +/* + * On Linux, arm32 and arm64 CPU features can be detected by reading the + * AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv. + * + * Ideally we'd use the C library function getauxval(), but it's not guaranteed + * to be available: it was only added to glibc in 2.16, and in Android it was + * added to API level 18 for arm32 and level 21 for arm64. 
+ */ + +#include +#include +#include +#include + +#define AT_HWCAP 16 +#define AT_HWCAP2 26 + +static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) +{ + int fd; + unsigned long auxbuf[32]; + int filled = 0; + int i; + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) + return; + + for (;;) { + do { + int ret = read(fd, &((char *)auxbuf)[filled], + sizeof(auxbuf) - filled); + if (ret <= 0) { + if (ret < 0 && errno == EINTR) + continue; + goto out; + } + filled += ret; + } while (filled < 2 * sizeof(long)); + + i = 0; + do { + unsigned long type = auxbuf[i]; + unsigned long value = auxbuf[i + 1]; + + if (type == AT_HWCAP) + *hwcap = value; + else if (type == AT_HWCAP2) + *hwcap2 = value; + i += 2; + filled -= 2 * sizeof(long); + } while (filled >= 2 * sizeof(long)); + + memmove(auxbuf, &auxbuf[i], filled); + } +out: + close(fd); +} + +static u32 query_arm_cpu_features(void) +{ + u32 features = 0; + unsigned long hwcap = 0; + unsigned long hwcap2 = 0; + + scan_auxv(&hwcap, &hwcap2); + +#ifdef ARCH_ARM32 + STATIC_ASSERT(sizeof(long) == 4); + if (hwcap & (1 << 12)) /* HWCAP_NEON */ + features |= ARM_CPU_FEATURE_NEON; + if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */ + features |= ARM_CPU_FEATURE_PMULL; + if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */ + features |= ARM_CPU_FEATURE_CRC32; +#else + STATIC_ASSERT(sizeof(long) == 8); + if (hwcap & (1 << 1)) /* HWCAP_ASIMD */ + features |= ARM_CPU_FEATURE_NEON; + if (hwcap & (1 << 4)) /* HWCAP_PMULL */ + features |= ARM_CPU_FEATURE_PMULL; + if (hwcap & (1 << 7)) /* HWCAP_CRC32 */ + features |= ARM_CPU_FEATURE_CRC32; + if (hwcap & (1 << 17)) /* HWCAP_SHA3 */ + features |= ARM_CPU_FEATURE_SHA3; + if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */ + features |= ARM_CPU_FEATURE_DOTPROD; +#endif + return features; +} + +#elif defined(__APPLE__) +/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */ + +#include +#include + +static const struct { + const char *name; + u32 feature; +} feature_sysctls[] = { + { "hw.optional.neon", ARM_CPU_FEATURE_NEON }, + { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON }, + { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL }, + { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 }, + { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 }, + { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 }, + { "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD }, +}; + +static u32 query_arm_cpu_features(void) +{ + u32 features = 0; + size_t i; + + for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) { + const char *name = feature_sysctls[i].name; + u32 val = 0; + size_t valsize = sizeof(val); + + if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 && + valsize == sizeof(val) && val == 1) + features |= feature_sysctls[i].feature; + } + return features; +} +#elif defined(_WIN32) + +#include + +static u32 query_arm_cpu_features(void) +{ + u32 features = ARM_CPU_FEATURE_NEON; + + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_PMULL; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_CRC32; + + /* FIXME: detect SHA3 and DOTPROD support too. 
*/ + + return features; +} +#else +#error "unhandled case" +#endif + +static const struct cpu_feature arm_cpu_feature_table[] = { + {ARM_CPU_FEATURE_NEON, "neon"}, + {ARM_CPU_FEATURE_PMULL, "pmull"}, + {ARM_CPU_FEATURE_CRC32, "crc32"}, + {ARM_CPU_FEATURE_SHA3, "sha3"}, + {ARM_CPU_FEATURE_DOTPROD, "dotprod"}, +}; + +volatile u32 libdeflate_arm_cpu_features = 0; + +void libdeflate_init_arm_cpu_features(void) +{ + u32 features = query_arm_cpu_features(); + + disable_cpu_features_for_testing(&features, arm_cpu_feature_table, + ARRAY_LEN(arm_cpu_feature_table)); + + libdeflate_arm_cpu_features = features | ARM_CPU_FEATURES_KNOWN; +} + +#endif /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ diff --git a/Sources/DEFLATE/arm/cpu_features.h b/Sources/DEFLATE/arm/cpu_features.h new file mode 100644 index 00000000..39fdb40f --- /dev/null +++ b/Sources/DEFLATE/arm/cpu_features.h @@ -0,0 +1,262 @@ +/* + * arm/cpu_features.h - feature detection for ARM CPUs + * + * Copyright 2018 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIB_ARM_CPU_FEATURES_H +#define LIB_ARM_CPU_FEATURES_H + +#include "../lib_common.h" + +#define HAVE_DYNAMIC_ARM_CPU_FEATURES 0 + +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) + +#if !defined(FREESTANDING) && \ +(defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ +(defined(__linux__) || \ +(defined(__APPLE__) && defined(ARCH_ARM64)) || \ +(defined(_WIN32) && defined(ARCH_ARM64))) +# undef HAVE_DYNAMIC_ARM_CPU_FEATURES +# define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 +#endif + +#define ARM_CPU_FEATURE_NEON (1 << 0) +#define ARM_CPU_FEATURE_PMULL (1 << 1) +#define ARM_CPU_FEATURE_CRC32 (1 << 2) +#define ARM_CPU_FEATURE_SHA3 (1 << 3) +#define ARM_CPU_FEATURE_DOTPROD (1 << 4) + +#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) +#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) +#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32)) +#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3)) +#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD)) + +#if HAVE_DYNAMIC_ARM_CPU_FEATURES +#define ARM_CPU_FEATURES_KNOWN (1U << 31) +extern volatile u32 libdeflate_arm_cpu_features; + +void libdeflate_init_arm_cpu_features(void); + +static inline u32 get_arm_cpu_features(void) +{ + if (libdeflate_arm_cpu_features == 0) + libdeflate_init_arm_cpu_features(); + return libdeflate_arm_cpu_features; +} +#else /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ +static inline u32 get_arm_cpu_features(void) { return 0; } +#endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */ + +/* NEON */ +#if defined(__ARM_NEON) || defined(ARCH_ARM64) +# define HAVE_NEON_NATIVE 1 +#else +# define HAVE_NEON_NATIVE 0 +#endif +/* + * With both gcc and clang, NEON intrinsics require that the main target has + * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, + * r226563 for arm64), hardware floating point support is sufficient. + */ +#if HAVE_NEON_NATIVE || \ +(HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP)) +# define HAVE_NEON_INTRIN 1 +#else +# define HAVE_NEON_INTRIN 0 +#endif + +/* PMULL */ +#ifdef __ARM_FEATURE_CRYPTO +# define HAVE_PMULL_NATIVE 1 +#else +# define HAVE_PMULL_NATIVE 0 +#endif +#if HAVE_PMULL_NATIVE || \ +(HAVE_DYNAMIC_ARM_CPU_FEATURES && \ +HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ +(GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \ +/* +* On arm32 with clang, the crypto intrinsics (which include pmull) +* are not defined, even when using -mfpu=crypto-neon-fp-armv8, +* because clang's puts their definitions behind +* __aarch64__. +*/ \ +!(defined(ARCH_ARM32) && defined(__clang__))) +# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ +/* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ +# ifdef _MSC_VER +# define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) +# else +# define compat_vmull_p64(a, b) vmull_p64((a), (b)) +# endif +#else +# define HAVE_PMULL_INTRIN 0 +#endif +/* + * Set USE_PMULL_TARGET_EVEN_IF_NATIVE if a workaround for a gcc bug that was + * fixed by commit 11a113d501ff ("aarch64: Simplify feature definitions") in gcc + * 13 is needed. 
A minimal program that fails to build due to this bug when + * compiled with -mcpu=emag, at least with gcc 10 through 12, is: + * + * static inline __attribute__((always_inline,target("+crypto"))) void f() {} + * void g() { f(); } + * + * The error is: + * + * error: inlining failed in call to ‘always_inline’ ‘f’: target specific option mismatch + * + * The workaround is to explicitly add the crypto target to the non-inline + * function g(), even though this should not be required due to -mcpu=emag + * enabling 'crypto' natively and causing __ARM_FEATURE_CRYPTO to be defined. + */ +#if HAVE_PMULL_NATIVE && defined(ARCH_ARM64) && \ +GCC_PREREQ(6, 1) && !GCC_PREREQ(13, 1) +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 1 +#else +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 0 +#endif + +/* CRC32 */ +#ifdef __ARM_FEATURE_CRC32 +# define HAVE_CRC32_NATIVE 1 +#else +# define HAVE_CRC32_NATIVE 0 +#endif +#undef HAVE_CRC32_INTRIN +#if HAVE_CRC32_NATIVE +# define HAVE_CRC32_INTRIN 1 +#elif HAVE_DYNAMIC_ARM_CPU_FEATURES +# if GCC_PREREQ(1, 0) +/* + * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled + * in the main target has been affected by two gcc bugs, which we must avoid + * by only allowing gcc versions that have the corresponding fixes. First, + * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a + * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, + * gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture directives + * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when + * binutils is 2.34 or later, due to + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second + * set of prerequisites, as they are stricter and we have no way to detect + * the binutils version directly from a C source file. + * + * Also exclude the cases where the main target arch is armv6kz or armv7e-m. + * In those cases, gcc doesn't let functions that use the main arch be + * inlined into functions that are targeted to armv8-a+crc. (armv8-a is + * necessary for crc to be accepted at all.) That causes build errors. + * This issue happens for these specific sub-archs because they are not a + * subset of armv8-a. Note: clang does not have this limitation. 
+ */ +# if (GCC_PREREQ(11, 3) || \ +(GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ +(GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))) && \ +!defined(__ARM_ARCH_6KZ__) && \ +!defined(__ARM_ARCH_7EM__) +# define HAVE_CRC32_INTRIN 1 +# endif +# elif defined(__clang__) || defined(_MSC_VER) +# define HAVE_CRC32_INTRIN 1 +# endif +#endif +#ifndef HAVE_CRC32_INTRIN +# define HAVE_CRC32_INTRIN 0 +#endif + +/* SHA3 (needed for the eor3 instruction) */ +#if defined(ARCH_ARM64) && !defined(_MSC_VER) +# ifdef __ARM_FEATURE_SHA3 +# define HAVE_SHA3_NATIVE 1 +# else +# define HAVE_SHA3_NATIVE 0 +# endif +# define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ +(GCC_PREREQ(8, 1) /* r256478 */ || \ +CLANG_PREREQ(7, 0, 10010463) /* r338010 */)) +# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \ +(HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ +(GCC_PREREQ(9, 1) /* r268049 */ || \ +CLANG_PREREQ(13, 0, 13160000))) +#else +# define HAVE_SHA3_NATIVE 0 +# define HAVE_SHA3_TARGET 0 +# define HAVE_SHA3_INTRIN 0 +#endif + +/* dotprod */ +#ifdef ARCH_ARM64 +# ifdef __ARM_FEATURE_DOTPROD +# define HAVE_DOTPROD_NATIVE 1 +# else +# define HAVE_DOTPROD_NATIVE 0 +# endif +# if HAVE_DOTPROD_NATIVE || \ +(HAVE_DYNAMIC_ARM_CPU_FEATURES && \ +(GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || \ +defined(_MSC_VER))) +# define HAVE_DOTPROD_INTRIN 1 +# else +# define HAVE_DOTPROD_INTRIN 0 +# endif +#else +# define HAVE_DOTPROD_NATIVE 0 +# define HAVE_DOTPROD_INTRIN 0 +#endif + +/* + * Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are + * only defined when the corresponding __ARM_FEATURE_* macro is defined. The + * intrinsics actually work in target attribute functions too if they are + * defined, though, so work around this by temporarily defining the + * corresponding __ARM_FEATURE_* macros while including the headers. 
+ */ +#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ +(defined(__clang__) || defined(ARCH_ARM32)) +# define __ARM_FEATURE_CRC32 1 +#endif +#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) +# define __ARM_FEATURE_SHA3 1 +#endif +#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) +# define __ARM_FEATURE_DOTPROD 1 +#endif +#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ +(defined(__clang__) || defined(ARCH_ARM32)) +# include +# undef __ARM_FEATURE_CRC32 +#endif +#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) +# include +# undef __ARM_FEATURE_SHA3 +#endif +#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) +# include +# undef __ARM_FEATURE_DOTPROD +#endif + +#endif /* ARCH_ARM32 || ARCH_ARM64 */ + +#endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/Sources/DEFLATE/arm/crc32_impl.h b/Sources/DEFLATE/arm/crc32_impl.h new file mode 100644 index 00000000..472bc00f --- /dev/null +++ b/Sources/DEFLATE/arm/crc32_impl.h @@ -0,0 +1,682 @@ +/* + * arm/crc32_impl.h - ARM implementations of the gzip CRC-32 algorithm + * + * Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_ARM_CRC32_IMPL_H +#define LIB_ARM_CRC32_IMPL_H + +#include "cpu_features.h" + +/* + * crc32_arm_crc() - implementation using crc32 instructions (only) + * + * In general this implementation is straightforward. However, naive use of the + * crc32 instructions is serial: one of the two inputs to each crc32 instruction + * is the output of the previous one. To take advantage of CPUs that can + * execute multiple crc32 instructions in parallel, when possible we interleave + * the checksumming of several adjacent chunks, then combine their CRCs. + * + * However, without pmull, combining CRCs is fairly slow. So in this pmull-less + * version, we only use a large chunk length, and thus we only do chunked + * processing if there is a lot of data to checksum. This also means that a + * variable chunk length wouldn't help much, so we just support a fixed length. + */ +#if HAVE_CRC32_INTRIN +# if HAVE_CRC32_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("armv8-a,crc") +# elif defined(__ARM_PCS_VFP) +/* + * +simd is needed to avoid a "selected architecture lacks an FPU" + * error with Debian arm-linux-gnueabihf-gcc when -mfpu is not + * explicitly specified on the command line. 
+ */ +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc+simd") +# else +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc") +# endif +# else +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("crc") +# else +# define ATTRIBUTES _target_attribute("+crc") +# endif +# endif +# endif + +#ifndef _MSC_VER +# include +#endif + +/* + * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN + * bytes each by computing: + * + * [ crc0*x^(3*8*L) + crc1*x^(2*8*L) + crc2*x^(1*8*L) + crc3 ] mod G(x) + * + * This has been optimized in several ways: + * + * - The needed multipliers (x to some power, reduced mod G(x)) were + * precomputed. + * + * - The 3 multiplications are interleaved. + * + * - The reduction mod G(x) is delayed to the end and done using __crc32d. + * Note that the use of __crc32d introduces an extra factor of x^32. To + * cancel that out along with the extra factor of x^1 that gets introduced + * because of how the 63-bit products are aligned in their 64-bit integers, + * the multipliers are actually x^(j*8*L - 33) instead of x^(j*8*L). + */ +static forceinline ATTRIBUTES u32 +combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3) +{ + u64 res0 = 0, res1 = 0, res2 = 0; + int i; + + /* Multiply crc{0,1,2} by CRC32_FIXED_CHUNK_MULT_{3,2,1}. */ + for (i = 0; i < 32; i++) { + if (CRC32_FIXED_CHUNK_MULT_3 & (1U << i)) + res0 ^= (u64)crc0 << i; + if (CRC32_FIXED_CHUNK_MULT_2 & (1U << i)) + res1 ^= (u64)crc1 << i; + if (CRC32_FIXED_CHUNK_MULT_1 & (1U << i)) + res2 ^= (u64)crc2 << i; + } + /* Add the different parts and reduce mod G(x). */ + return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; +} + +#define crc32_arm_crc crc32_arm_crc +static u32 ATTRIBUTES MAYBE_UNUSED +crc32_arm_crc(u32 crc, const u8 *p, size_t len) +{ + if (len >= 64) { + const size_t align = -(uintptr_t)p & 7; + + /* Align p to the next 8-byte boundary. */ + if (align) { + if (align & 1) + crc = __crc32b(crc, *p++); + if (align & 2) { + crc = __crc32h(crc, le16_bswap(*(u16 *)p)); + p += 2; + } + if (align & 4) { + crc = __crc32w(crc, le32_bswap(*(u32 *)p)); + p += 4; + } + len -= align; + } + /* + * Interleave the processing of multiple adjacent data chunks to + * take advantage of instruction-level parallelism. + * + * Some CPUs don't prefetch the data if it's being fetched in + * multiple interleaved streams, so do explicit prefetching. 
+ */ + while (len >= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN) { + const u64 *wp0 = (const u64 *)p; + const u64 * const wp0_end = + (const u64 *)(p + CRC32_FIXED_CHUNK_LEN); + u32 crc1 = 0, crc2 = 0, crc3 = 0; + + STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); + STATIC_ASSERT(CRC32_FIXED_CHUNK_LEN % (4 * 8) == 0); + do { + prefetchr(&wp0[64 + 0*CRC32_FIXED_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 1*CRC32_FIXED_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 2*CRC32_FIXED_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 3*CRC32_FIXED_CHUNK_LEN/8]); + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + } while (wp0 != wp0_end); + crc = combine_crcs_slow(crc, crc1, crc2, crc3); + p += CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; + len -= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; + } + /* + * Due to the large fixed chunk length used above, there might + * still be a lot of data left. So use a 64-byte loop here, + * instead of a loop that is less unrolled. + */ + while (len >= 64) { + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 32))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 40))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 48))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 56))); + p += 64; + len -= 64; + } + } + if (len & 32) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + crc = __crc32d(crc, get_unaligned_le64(p + 16)); + crc = __crc32d(crc, get_unaligned_le64(p + 24)); + p += 32; + } + if (len & 16) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + p += 16; + } + if (len & 8) { + crc = __crc32d(crc, get_unaligned_le64(p)); + p += 8; + } + if (len & 4) { + crc = __crc32w(crc, get_unaligned_le32(p)); + p += 4; + } + if (len & 2) { + crc = __crc32h(crc, get_unaligned_le16(p)); + p += 2; + } + if (len & 1) + crc = __crc32b(crc, *p); + return crc; +} +#undef ATTRIBUTES +#endif /* crc32_arm_crc() */ + +/* + * crc32_arm_crc_pmullcombine() - implementation using crc32 instructions, plus + * pmull instructions for CRC combining + * + * This is similar to crc32_arm_crc(), but it enables the use of pmull + * (carryless multiplication) instructions for the steps where the CRCs of + * adjacent data chunks are combined. 
As this greatly speeds up CRC + * combination, this implementation also differs from crc32_arm_crc() in that it + * uses a variable chunk length which can get fairly small. The precomputed + * multipliers needed for the selected chunk length are loaded from a table. + * + * Note that pmull is used here only for combining the CRCs of separately + * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*() + * for implementations that use pmull for folding the data itself. + */ +#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN +# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8") +# else +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("crc,aes") +# else +# define ATTRIBUTES _target_attribute("+crc,+crypto") +# endif +# endif +# endif + +#ifndef _MSC_VER +# include +#endif +#include + +/* Do carryless multiplication of two 32-bit values. */ +static forceinline ATTRIBUTES u64 +clmul_u32(u32 a, u32 b) +{ + uint64x2_t res = vreinterpretq_u64_p128( + compat_vmull_p64((poly64_t)a, (poly64_t)b)); + + return vgetq_lane_u64(res, 0); +} + +/* + * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more + * quickly, and supports a variable chunk length. The chunk length is + * 'i * CRC32_MIN_VARIABLE_CHUNK_LEN' + * where 1 <= i < ARRAY_LEN(crc32_mults_for_chunklen). + */ +static forceinline ATTRIBUTES u32 +combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) +{ + u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]); + u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]); + u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]); + + return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; +} + +#define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine +static u32 ATTRIBUTES MAYBE_UNUSED +crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) +{ + const size_t align = -(uintptr_t)p & 7; + + if (len >= align + CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { + /* Align p to the next 8-byte boundary. */ + if (align) { + if (align & 1) + crc = __crc32b(crc, *p++); + if (align & 2) { + crc = __crc32h(crc, le16_bswap(*(u16 *)p)); + p += 2; + } + if (align & 4) { + crc = __crc32w(crc, le32_bswap(*(u32 *)p)); + p += 4; + } + len -= align; + } + /* + * Handle CRC32_MAX_VARIABLE_CHUNK_LEN specially, so that better + * code is generated for it. 
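+ * (Presumably this helps because the chunk length, and hence the wp0[]
+ * offsets in the unrolled loop below, are compile-time constants here,
+ * whereas the variable-length case further down has to maintain four
+ * separate chunk pointers.)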
+ */ + while (len >= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN) { + const u64 *wp0 = (const u64 *)p; + const u64 * const wp0_end = + (const u64 *)(p + CRC32_MAX_VARIABLE_CHUNK_LEN); + u32 crc1 = 0, crc2 = 0, crc3 = 0; + + STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); + STATIC_ASSERT(CRC32_MAX_VARIABLE_CHUNK_LEN % (4 * 8) == 0); + do { + prefetchr(&wp0[64 + 0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + } while (wp0 != wp0_end); + crc = combine_crcs_fast(crc, crc1, crc2, crc3, + ARRAY_LEN(crc32_mults_for_chunklen) - 1); + p += CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; + len -= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; + } + /* Handle up to one variable-length chunk. 
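+ * (Worked example with made-up numbers: if CRC32_MIN_VARIABLE_CHUNK_LEN were
+ * 2048 (the real value is a precomputed constant defined alongside
+ * crc32_mults_for_chunklen), then len = 100000 would give
+ * i = 100000 / (4 * 2048) = 12 and chunk_len = 12 * 2048 = 24576, so the four
+ * interleaved streams below would cover 4 * 24576 = 98304 bytes and leave
+ * 1696 bytes for the scalar loops that follow.)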
*/ + if (len >= CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { + const size_t i = len / (CRC32_NUM_CHUNKS * + CRC32_MIN_VARIABLE_CHUNK_LEN); + const size_t chunk_len = + i * CRC32_MIN_VARIABLE_CHUNK_LEN; + const u64 *wp0 = (const u64 *)(p + 0*chunk_len); + const u64 *wp1 = (const u64 *)(p + 1*chunk_len); + const u64 *wp2 = (const u64 *)(p + 2*chunk_len); + const u64 *wp3 = (const u64 *)(p + 3*chunk_len); + const u64 * const wp0_end = wp1; + u32 crc1 = 0, crc2 = 0, crc3 = 0; + + STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); + STATIC_ASSERT(CRC32_MIN_VARIABLE_CHUNK_LEN % (4 * 8) == 0); + do { + prefetchr(wp0 + 64); + prefetchr(wp1 + 64); + prefetchr(wp2 + 64); + prefetchr(wp3 + 64); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + } while (wp0 != wp0_end); + crc = combine_crcs_fast(crc, crc1, crc2, crc3, i); + p += CRC32_NUM_CHUNKS * chunk_len; + len -= CRC32_NUM_CHUNKS * chunk_len; + } + + while (len >= 32) { + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); + p += 32; + len -= 32; + } + } else { + while (len >= 32) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + crc = __crc32d(crc, get_unaligned_le64(p + 16)); + crc = __crc32d(crc, get_unaligned_le64(p + 24)); + p += 32; + len -= 32; + } + } + if (len & 16) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + p += 16; + } + if (len & 8) { + crc = __crc32d(crc, get_unaligned_le64(p)); + p += 8; + } + if (len & 4) { + crc = __crc32w(crc, get_unaligned_le32(p)); + p += 4; + } + if (len & 2) { + crc = __crc32h(crc, get_unaligned_le16(p)); + p += 2; + } + if (len & 1) + crc = __crc32b(crc, *p); + return crc; +} +#undef ATTRIBUTES +#endif /* crc32_arm_crc_pmullcombine() */ + +/* + * crc32_arm_pmullx4() - implementation using "folding" with pmull instructions + * + * This implementation is intended for CPUs that support pmull instructions but + * not crc32 instructions. + */ +#if HAVE_PMULL_INTRIN +# define crc32_arm_pmullx4 crc32_arm_pmullx4 +# define SUFFIX _pmullx4 +# if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8") +# else +# ifdef __clang__ +/* + * This used to use "crypto", but that stopped working with clang 16. + * Now only "aes" works. "aes" works with older versions too, so use + * that. No "+" prefix; clang 15 and earlier doesn't accept that. + */ +# define ATTRIBUTES _target_attribute("aes") +# else +/* + * With gcc, only "+crypto" works. Both the "+" prefix and the + * "crypto" (not "aes") are essential... 
+ */ +# define ATTRIBUTES _target_attribute("+crypto") +# endif +# endif +# endif +# define ENABLE_EOR3 0 +# include "crc32_pmull_helpers.h" + +static u32 ATTRIBUTES MAYBE_UNUSED +crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) +{ + static const u64 _aligned_attribute(16) mults[3][2] = { + { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ + { CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */ + { CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */ + }; + static const u64 _aligned_attribute(16) final_mults[3][2] = { + { CRC32_X63_MODG, 0 }, + { CRC32_BARRETT_CONSTANT_1, 0 }, + { CRC32_BARRETT_CONSTANT_2, 0 }, + }; + const uint8x16_t zeroes = vdupq_n_u8(0); + const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF)); + const poly64x2_t multipliers_1 = load_multipliers(mults[0]); + uint8x16_t v0, v1, v2, v3; + + if (len < 64 + 15) { + if (len < 16) + return crc32_slice1(crc, p, len); + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); + p += 16; + len -= 16; + while (len >= 16) { + v0 = fold_vec(v0, vld1q_u8(p), multipliers_1); + p += 16; + len -= 16; + } + } else { + const poly64x2_t multipliers_4 = load_multipliers(mults[1]); + const poly64x2_t multipliers_2 = load_multipliers(mults[2]); + const size_t align = -(uintptr_t)p & 15; + const uint8x16_t *vp; + + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); + p += 16; + /* Align p to the next 16-byte boundary. */ + if (align) { + v0 = fold_partial_vec(v0, p, align, multipliers_1); + p += align; + len -= align; + } + vp = (const uint8x16_t *)p; + v1 = *vp++; + v2 = *vp++; + v3 = *vp++; + while (len >= 64 + 64) { + v0 = fold_vec(v0, *vp++, multipliers_4); + v1 = fold_vec(v1, *vp++, multipliers_4); + v2 = fold_vec(v2, *vp++, multipliers_4); + v3 = fold_vec(v3, *vp++, multipliers_4); + len -= 64; + } + v0 = fold_vec(v0, v2, multipliers_2); + v1 = fold_vec(v1, v3, multipliers_2); + if (len & 32) { + v0 = fold_vec(v0, *vp++, multipliers_2); + v1 = fold_vec(v1, *vp++, multipliers_2); + } + v0 = fold_vec(v0, v1, multipliers_1); + if (len & 16) + v0 = fold_vec(v0, *vp++, multipliers_1); + p = (const u8 *)vp; + len &= 15; + } + + /* Handle any remaining partial block now before reducing to 32 bits. */ + if (len) + v0 = fold_partial_vec(v0, p, len, multipliers_1); + + /* + * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, + * which is equivalent to multiplying by x^32. This is needed because + * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). + */ + + v0 = veorq_u8(vextq_u8(v0, zeroes, 8), + clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1)); + + /* Fold 96 => 64 bits. */ + v0 = veorq_u8(vextq_u8(v0, zeroes, 4), + clmul_low(vandq_u8(v0, mask32), + load_multipliers(final_mults[0]))); + + /* Reduce 64 => 32 bits using Barrett reduction. */ + v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1])); + v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2])); + return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1); +} +#undef SUFFIX +#undef ATTRIBUTES +#undef ENABLE_EOR3 +#endif /* crc32_arm_pmullx4() */ + +/* + * crc32_arm_pmullx12_crc() - large-stride implementation using "folding" with + * pmull instructions, where crc32 instructions are also available + * + * See crc32_pmull_wide.h for explanation. 
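+ * As a rough sketch of one folding step (the exponents here are inferred from
+ * the mults[] tables above, so treat this as an illustration rather than a
+ * new definition): folding an accumulator acc across a stride of d 16-byte
+ * vectors onto fresh data computes
+ *
+ *        acc' = clmul(acc.lo64, x^(128*d + 31) mod G) ^
+ *               clmul(acc.hi64, x^(128*d - 33) mod G) ^ data
+ *
+ * e.g. d = 1 uses CRC32_X159_MODG and CRC32_X95_MODG, and d = 4 uses
+ * CRC32_X543_MODG and CRC32_X479_MODG, matching fold_vec() above.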
+ */
+#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN
+# define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc
+# define SUFFIX _pmullx12_crc
+# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("aes,crc")
+# else
+# define ATTRIBUTES _target_attribute("+crypto,+crc")
+# endif
+# endif
+# define ENABLE_EOR3 0
+# include "crc32_pmull_wide.h"
+#endif
+
+/*
+ * crc32_arm_pmullx12_crc_eor3()
+ *
+ * This is like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
+ * the sha3 extension) for even better performance.
+ *
+ * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than
+ * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3.
+ */
+#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \
+(HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE)
+# define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
+# define SUFFIX _pmullx12_crc_eor3
+# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE && \
+!USE_PMULL_TARGET_EVEN_IF_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("aes,crc,sha3")
+/*
+ * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the
+ * default target is armv8.3-a or later in which case it must be omitted.
+ * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
+ */
+# elif defined(__ARM_FEATURE_JCVT)
+# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3")
+# else
+# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3")
+# endif
+# endif
+# define ENABLE_EOR3 1
+# include "crc32_pmull_wide.h"
+#endif
+
+/*
+ * On the Apple M1 processor, crc32 instructions max out at about 25.5 GB/s in
+ * the best case of using a 3-way or greater interleaved chunked implementation,
+ * whereas a pmull-based implementation achieves 68 GB/s provided that the
+ * stride length is large enough (about 10+ vectors with eor3, or 12+ without).
+ *
+ * For now we assume that crc32 instructions are preferable in other cases.
+ */
+#define PREFER_PMULL_TO_CRC 0
+#ifdef __APPLE__
+# include <TargetConditionals.h>
+# if TARGET_OS_OSX
+# undef PREFER_PMULL_TO_CRC
+# define PREFER_PMULL_TO_CRC 1
+# endif
+#endif
+
+/*
+ * If the best implementation is statically available, use it unconditionally.
+ * Otherwise choose the best implementation at runtime.
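+ * (Illustrative scenario rather than an exhaustive description: an arm64
+ * build with -march=armv8-a+crc+crypto on a non-Apple target makes
+ * HAVE_CRC32_NATIVE and HAVE_PMULL_NATIVE true while PREFER_PMULL_TO_CRC
+ * stays 0, so DEFAULT_IMPL below resolves to crc32_arm_crc_pmullcombine and
+ * arch_select_crc32_func() is compiled out entirely. Without such flags, the
+ * first libdeflate_crc32() call goes through the runtime dispatcher, which
+ * probes get_arm_cpu_features() and caches the chosen function pointer.)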
+ */ +#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) && \ +HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE +# define DEFAULT_IMPL crc32_arm_pmullx12_crc_eor3 +#elif !PREFER_PMULL_TO_CRC && defined(crc32_arm_crc_pmullcombine) && \ +HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE +# define DEFAULT_IMPL crc32_arm_crc_pmullcombine +#else +static inline crc32_func_t +arch_select_crc32_func(void) +{ + const u32 features MAYBE_UNUSED = get_arm_cpu_features(); + +#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) + if (HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features)) + return crc32_arm_pmullx12_crc_eor3; +#endif +#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc) + if (HAVE_PMULL(features) && HAVE_CRC32(features)) + return crc32_arm_pmullx12_crc; +#endif +#ifdef crc32_arm_crc_pmullcombine + if (HAVE_CRC32(features) && HAVE_PMULL(features)) + return crc32_arm_crc_pmullcombine; +#endif +#ifdef crc32_arm_crc + if (HAVE_CRC32(features)) + return crc32_arm_crc; +#endif +#ifdef crc32_arm_pmullx4 + if (HAVE_PMULL(features)) + return crc32_arm_pmullx4; +#endif + return NULL; +} +#define arch_select_crc32_func arch_select_crc32_func +#endif + +#endif /* LIB_ARM_CRC32_IMPL_H */ diff --git a/Sources/DEFLATE/crc32_pmull_helpers.h b/Sources/DEFLATE/arm/crc32_pmull_helpers.h similarity index 51% rename from Sources/DEFLATE/crc32_pmull_helpers.h rename to Sources/DEFLATE/arm/crc32_pmull_helpers.h index 1cd1cc18..2c2172e2 100644 --- a/Sources/DEFLATE/crc32_pmull_helpers.h +++ b/Sources/DEFLATE/arm/crc32_pmull_helpers.h @@ -30,11 +30,11 @@ * with pmull instructions. It accepts the following parameters: * * SUFFIX: - * Name suffix to append to all instantiated functions. + * Name suffix to append to all instantiated functions. * ATTRIBUTES: - * Target function attributes to use. + * Target function attributes to use. * ENABLE_EOR3: - * Use the eor3 instruction (from the sha3 extension). + * Use the eor3 instruction (from the sha3 extension). */ #include @@ -44,29 +44,29 @@ static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(u32_to_bytevec)(u32 a) { - return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0)); + return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0)); } -#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec) +#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec) /* Load two 64-bit values into a vector. */ #undef load_multipliers static forceinline ATTRIBUTES poly64x2_t ADD_SUFFIX(load_multipliers)(const u64 p[2]) { - return vreinterpretq_p64_u64(vld1q_u64(p)); + return vreinterpretq_p64_u64(vld1q_u64(p)); } -#define load_multipliers ADD_SUFFIX(load_multipliers) +#define load_multipliers ADD_SUFFIX(load_multipliers) /* Do carryless multiplication of the low halves of two vectors. */ #undef clmul_low static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b) { - return vreinterpretq_u8_p128( - compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), - vgetq_lane_p64(b, 0))); + return vreinterpretq_u8_p128( + compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), + vgetq_lane_p64(b, 0))); } -#define clmul_low ADD_SUFFIX(clmul_low) +#define clmul_low ADD_SUFFIX(clmul_low) /* Do carryless multiplication of the high halves of two vectors. */ #undef clmul_high @@ -74,19 +74,19 @@ static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b) { #if defined(__clang__) && defined(ARCH_ARM64) - /* - * Use inline asm to ensure that pmull2 is really used. 
This works - * around clang bug https://github.com/llvm/llvm-project/issues/52868. - */ - uint8x16_t res; - - __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b)); - return res; + /* + * Use inline asm to ensure that pmull2 is really used. This works + * around clang bug https://github.com/llvm/llvm-project/issues/52868. + */ + uint8x16_t res; + + __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b)); + return res; #else - return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b)); + return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b)); #endif } -#define clmul_high ADD_SUFFIX(clmul_high) +#define clmul_high ADD_SUFFIX(clmul_high) #undef eor3 static forceinline ATTRIBUTES uint8x16_t @@ -94,48 +94,48 @@ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) { #if ENABLE_EOR3 #if HAVE_SHA3_INTRIN - return veor3q_u8(a, b, c); + return veor3q_u8(a, b, c); #else - uint8x16_t res; - - __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" - : "=w" (res) : "w" (a), "w" (b), "w" (c)); - return res; + uint8x16_t res; + + __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" + : "=w" (res) : "w" (a), "w" (b), "w" (c)); + return res; #endif #else /* ENABLE_EOR3 */ - return veorq_u8(veorq_u8(a, b), c); + return veorq_u8(veorq_u8(a, b), c); #endif /* !ENABLE_EOR3 */ } -#define eor3 ADD_SUFFIX(eor3) +#define eor3 ADD_SUFFIX(eor3) #undef fold_vec static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) { - uint8x16_t a = clmul_low(src, multipliers); - uint8x16_t b = clmul_high(src, multipliers); - - return eor3(a, b, dst); + uint8x16_t a = clmul_low(src, multipliers); + uint8x16_t b = clmul_high(src, multipliers); + + return eor3(a, b, dst); } -#define fold_vec ADD_SUFFIX(fold_vec) +#define fold_vec ADD_SUFFIX(fold_vec) #undef vtbl static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices) { #ifdef ARCH_ARM64 - return vqtbl1q_u8(table, indices); + return vqtbl1q_u8(table, indices); #else - uint8x8x2_t tab2; - - tab2.val[0] = vget_low_u8(table); - tab2.val[1] = vget_high_u8(table); - - return vcombine_u8(vtbl2_u8(tab2, vget_low_u8(indices)), - vtbl2_u8(tab2, vget_high_u8(indices))); + uint8x8x2_t tab2; + + tab2.val[0] = vget_low_u8(table); + tab2.val[1] = vget_high_u8(table); + + return vcombine_u8(vtbl2_u8(tab2, vget_low_u8(indices)), + vtbl2_u8(tab2, vget_high_u8(indices))); #endif } -#define vtbl ADD_SUFFIX(vtbl) +#define vtbl ADD_SUFFIX(vtbl) /* * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the @@ -147,38 +147,38 @@ ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices) #undef fold_partial_vec static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, - poly64x2_t multipliers_1) + poly64x2_t multipliers_1) { - /* - * vtbl(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. - * vtbl(v, shift_tab[len+16..len+31]) right shifts v by len bytes. 
- */ - static const u8 shift_tab[48] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; - const uint8x16_t lshift = vld1q_u8(&shift_tab[len]); - const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]); - uint8x16_t x0, x1, bsl_mask; - - /* x0 = v left-shifted by '16 - len' bytes */ - x0 = vtbl(v, lshift); - - /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ - bsl_mask = vreinterpretq_u8_s8( - vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7)); - - /* - * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' - * bytes) followed by the remaining data. - */ - x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */, - vld1q_u8(p + len - 16), vtbl(v, rshift)); - - return fold_vec(x0, x1, multipliers_1); + /* + * vtbl(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. + * vtbl(v, shift_tab[len+16..len+31]) right shifts v by len bytes. + */ + static const u8 shift_tab[48] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + const uint8x16_t lshift = vld1q_u8(&shift_tab[len]); + const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]); + uint8x16_t x0, x1, bsl_mask; + + /* x0 = v left-shifted by '16 - len' bytes */ + x0 = vtbl(v, lshift); + + /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ + bsl_mask = vreinterpretq_u8_s8( + vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7)); + + /* + * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' + * bytes) followed by the remaining data. + */ + x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */, + vld1q_u8(p + len - 16), vtbl(v, rshift)); + + return fold_vec(x0, x1, multipliers_1); } -#define fold_partial_vec ADD_SUFFIX(fold_partial_vec) +#define fold_partial_vec ADD_SUFFIX(fold_partial_vec) diff --git a/Sources/DEFLATE/arm/crc32_pmull_wide.h b/Sources/DEFLATE/arm/crc32_pmull_wide.h new file mode 100644 index 00000000..67453f63 --- /dev/null +++ b/Sources/DEFLATE/arm/crc32_pmull_wide.h @@ -0,0 +1,231 @@ +/* + * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version) + * + * Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating PMULL-based crc32_arm functions. + * The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. + * ENABLE_EOR3: + * Use the eor3 instruction (from the sha3 extension). + * + * This is the extra-wide version; it uses an unusually large stride length of + * 12, and it assumes that crc32 instructions are available too. It's intended + * for powerful CPUs that support both pmull and crc32 instructions, but where + * throughput of pmull and xor (given enough instructions issued in parallel) is + * significantly higher than that of crc32, thus making the crc32 instructions + * (counterintuitively) not actually the fastest way to compute the CRC-32. The + * Apple M1 processor is an example of such a CPU. + */ + +#ifndef _MSC_VER +# include +#endif +#include + +#include "crc32_pmull_helpers.h" + +static u32 ATTRIBUTES MAYBE_UNUSED +ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) +{ + uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; + + if (len < 3 * 192) { + static const u64 _aligned_attribute(16) mults[3][2] = { + { CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */ + { CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */ + { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ + }; + poly64x2_t multipliers_4, multipliers_2, multipliers_1; + + if (len < 64) + goto tail; + multipliers_4 = load_multipliers(mults[0]); + multipliers_2 = load_multipliers(mults[1]); + multipliers_1 = load_multipliers(mults[2]); + /* + * Short length; don't bother aligning the pointer, and fold + * 64 bytes (4 vectors) at a time, at most. + */ + v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc)); + v1 = vld1q_u8(p + 16); + v2 = vld1q_u8(p + 32); + v3 = vld1q_u8(p + 48); + p += 64; + len -= 64; + while (len >= 64) { + v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4); + v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4); + v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4); + v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4); + p += 64; + len -= 64; + } + v0 = fold_vec(v0, v2, multipliers_2); + v1 = fold_vec(v1, v3, multipliers_2); + if (len >= 32) { + v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2); + v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2); + p += 32; + len -= 32; + } + v0 = fold_vec(v0, v1, multipliers_1); + } else { + static const u64 _aligned_attribute(16) mults[4][2] = { + { CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */ + { CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */ + { CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */ + { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ + }; + const poly64x2_t multipliers_12 = load_multipliers(mults[0]); + const poly64x2_t multipliers_6 = load_multipliers(mults[1]); + const poly64x2_t multipliers_3 = load_multipliers(mults[2]); + const poly64x2_t multipliers_1 = load_multipliers(mults[3]); + const size_t align = -(uintptr_t)p & 15; + const uint8x16_t *vp; + + /* Align p to the next 16-byte boundary. 
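+ * ('align' is -(uintptr_t)p & 15, i.e. at most 15, so each of the
+ * byte/half/word/doubleword steps below runs at most once; e.g. an address
+ * ending in 0x9 needs 16 - 9 = 7 = 1 + 2 + 4 bytes of scalar crc32 work
+ * before the 16-byte vector loads start.)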
*/ + if (align) { + if (align & 1) + crc = __crc32b(crc, *p++); + if (align & 2) { + crc = __crc32h(crc, le16_bswap(*(u16 *)p)); + p += 2; + } + if (align & 4) { + crc = __crc32w(crc, le32_bswap(*(u32 *)p)); + p += 4; + } + if (align & 8) { + crc = __crc32d(crc, le64_bswap(*(u64 *)p)); + p += 8; + } + len -= align; + } + vp = (const uint8x16_t *)p; + v0 = veorq_u8(*vp++, u32_to_bytevec(crc)); + v1 = *vp++; + v2 = *vp++; + v3 = *vp++; + v4 = *vp++; + v5 = *vp++; + v6 = *vp++; + v7 = *vp++; + v8 = *vp++; + v9 = *vp++; + v10 = *vp++; + v11 = *vp++; + len -= 192; + /* Fold 192 bytes (12 vectors) at a time. */ + do { + v0 = fold_vec(v0, *vp++, multipliers_12); + v1 = fold_vec(v1, *vp++, multipliers_12); + v2 = fold_vec(v2, *vp++, multipliers_12); + v3 = fold_vec(v3, *vp++, multipliers_12); + v4 = fold_vec(v4, *vp++, multipliers_12); + v5 = fold_vec(v5, *vp++, multipliers_12); + v6 = fold_vec(v6, *vp++, multipliers_12); + v7 = fold_vec(v7, *vp++, multipliers_12); + v8 = fold_vec(v8, *vp++, multipliers_12); + v9 = fold_vec(v9, *vp++, multipliers_12); + v10 = fold_vec(v10, *vp++, multipliers_12); + v11 = fold_vec(v11, *vp++, multipliers_12); + len -= 192; + } while (len >= 192); + + /* + * Fewer than 192 bytes left. Fold v0-v11 down to just v0, + * while processing up to 144 more bytes. + */ + v0 = fold_vec(v0, v6, multipliers_6); + v1 = fold_vec(v1, v7, multipliers_6); + v2 = fold_vec(v2, v8, multipliers_6); + v3 = fold_vec(v3, v9, multipliers_6); + v4 = fold_vec(v4, v10, multipliers_6); + v5 = fold_vec(v5, v11, multipliers_6); + if (len >= 96) { + v0 = fold_vec(v0, *vp++, multipliers_6); + v1 = fold_vec(v1, *vp++, multipliers_6); + v2 = fold_vec(v2, *vp++, multipliers_6); + v3 = fold_vec(v3, *vp++, multipliers_6); + v4 = fold_vec(v4, *vp++, multipliers_6); + v5 = fold_vec(v5, *vp++, multipliers_6); + len -= 96; + } + v0 = fold_vec(v0, v3, multipliers_3); + v1 = fold_vec(v1, v4, multipliers_3); + v2 = fold_vec(v2, v5, multipliers_3); + if (len >= 48) { + v0 = fold_vec(v0, *vp++, multipliers_3); + v1 = fold_vec(v1, *vp++, multipliers_3); + v2 = fold_vec(v2, *vp++, multipliers_3); + len -= 48; + } + v0 = fold_vec(v0, v1, multipliers_1); + v0 = fold_vec(v0, v2, multipliers_1); + p = (const u8 *)vp; + } + /* Reduce 128 to 32 bits using crc32 instructions. */ + crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0)); + crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1)); +tail: + /* Finish up the remainder using crc32 instructions. 
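+ * (The bit tests below decompose the leftover byte count; e.g. 45 remaining
+ * bytes take the 32-, 8-, 4- and 1-byte branches, since 45 = 32 + 8 + 4 + 1,
+ * so each power of two is handled at most once.)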
*/ + if (len & 32) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + crc = __crc32d(crc, get_unaligned_le64(p + 16)); + crc = __crc32d(crc, get_unaligned_le64(p + 24)); + p += 32; + } + if (len & 16) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + p += 16; + } + if (len & 8) { + crc = __crc32d(crc, get_unaligned_le64(p)); + p += 8; + } + if (len & 4) { + crc = __crc32w(crc, get_unaligned_le32(p)); + p += 4; + } + if (len & 2) { + crc = __crc32h(crc, get_unaligned_le16(p)); + p += 2; + } + if (len & 1) + crc = __crc32b(crc, *p); + return crc; +} + +#undef SUFFIX +#undef ATTRIBUTES +#undef ENABLE_EOR3 diff --git a/Sources/DEFLATE/matchfinder_impl.h b/Sources/DEFLATE/arm/matchfinder_impl.h similarity index 66% rename from Sources/DEFLATE/matchfinder_impl.h rename to Sources/DEFLATE/arm/matchfinder_impl.h index b20f56a3..9917da4a 100644 --- a/Sources/DEFLATE/matchfinder_impl.h +++ b/Sources/DEFLATE/arm/matchfinder_impl.h @@ -35,42 +35,42 @@ static forceinline void matchfinder_init_neon(mf_pos_t *data, size_t size) { - int16x8_t *p = (int16x8_t *)data; - int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL); - - STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); - STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - - do { - p[0] = v; - p[1] = v; - p[2] = v; - p[3] = v; - p += 4; - size -= 4 * sizeof(*p); - } while (size != 0); + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); } #define matchfinder_init matchfinder_init_neon static forceinline void matchfinder_rebase_neon(mf_pos_t *data, size_t size) { - int16x8_t *p = (int16x8_t *)data; - int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE); - - STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); - STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - - do { - p[0] = vqaddq_s16(p[0], v); - p[1] = vqaddq_s16(p[1], v); - p[2] = vqaddq_s16(p[2], v); - p[3] = vqaddq_s16(p[3], v); - p += 4; - size -= 4 * sizeof(*p); - } while (size != 0); + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = vqaddq_s16(p[0], v); + p[1] = vqaddq_s16(p[1], v); + p[2] = vqaddq_s16(p[2], v); + p[3] = vqaddq_s16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); } #define matchfinder_rebase matchfinder_rebase_neon diff --git a/Sources/DEFLATE/cpu_features.c b/Sources/DEFLATE/cpu_features.c deleted file mode 100644 index 72ab03da..00000000 --- a/Sources/DEFLATE/cpu_features.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * arm/cpu_features.c - feature detection for ARM CPUs - * - * Copyright 2018 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, 
merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * ARM CPUs don't have a standard way for unprivileged programs to detect CPU - * features. But an OS-specific way can be used when available. - */ - -#ifdef __APPLE__ -# undef _ANSI_SOURCE -# undef _DARWIN_C_SOURCE -# define _DARWIN_C_SOURCE /* for sysctlbyname() */ -#endif - -#include "../cpu_features_common.h" /* must be included first */ -#include "cpu_features.h" - -#if HAVE_DYNAMIC_ARM_CPU_FEATURES - -#ifdef __linux__ -/* - * On Linux, arm32 and arm64 CPU features can be detected by reading the - * AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv. - * - * Ideally we'd use the C library function getauxval(), but it's not guaranteed - * to be available: it was only added to glibc in 2.16, and in Android it was - * added to API level 18 for arm32 and level 21 for arm64. - */ - -#include -#include -#include -#include - -#define AT_HWCAP 16 -#define AT_HWCAP2 26 - -static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) -{ - int fd; - unsigned long auxbuf[32]; - int filled = 0; - int i; - - fd = open("/proc/self/auxv", O_RDONLY); - if (fd < 0) - return; - - for (;;) { - do { - int ret = read(fd, &((char *)auxbuf)[filled], - sizeof(auxbuf) - filled); - if (ret <= 0) { - if (ret < 0 && errno == EINTR) - continue; - goto out; - } - filled += ret; - } while (filled < 2 * sizeof(long)); - - i = 0; - do { - unsigned long type = auxbuf[i]; - unsigned long value = auxbuf[i + 1]; - - if (type == AT_HWCAP) - *hwcap = value; - else if (type == AT_HWCAP2) - *hwcap2 = value; - i += 2; - filled -= 2 * sizeof(long); - } while (filled >= 2 * sizeof(long)); - - memmove(auxbuf, &auxbuf[i], filled); - } -out: - close(fd); -} - -static u32 query_arm_cpu_features(void) -{ - u32 features = 0; - unsigned long hwcap = 0; - unsigned long hwcap2 = 0; - - scan_auxv(&hwcap, &hwcap2); - -#ifdef ARCH_ARM32 - STATIC_ASSERT(sizeof(long) == 4); - if (hwcap & (1 << 12)) /* HWCAP_NEON */ - features |= ARM_CPU_FEATURE_NEON; - if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */ - features |= ARM_CPU_FEATURE_PMULL; - if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */ - features |= ARM_CPU_FEATURE_CRC32; -#else - STATIC_ASSERT(sizeof(long) == 8); - if (hwcap & (1 << 1)) /* HWCAP_ASIMD */ - features |= ARM_CPU_FEATURE_NEON; - if (hwcap & (1 << 4)) /* HWCAP_PMULL */ - features |= ARM_CPU_FEATURE_PMULL; - if (hwcap & (1 << 7)) /* HWCAP_CRC32 */ - features |= ARM_CPU_FEATURE_CRC32; - if (hwcap & (1 << 17)) /* HWCAP_SHA3 */ - features |= ARM_CPU_FEATURE_SHA3; - if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */ - features |= ARM_CPU_FEATURE_DOTPROD; -#endif - return features; -} - -#elif defined(__APPLE__) -/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). 
*/ - -#include -#include - -static const struct { - const char *name; - u32 feature; -} feature_sysctls[] = { - { "hw.optional.neon", ARM_CPU_FEATURE_NEON }, - { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON }, - { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL }, - { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 }, - { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 }, - { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 }, - { "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD }, -}; - -static u32 query_arm_cpu_features(void) -{ - u32 features = 0; - size_t i; - - for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) { - const char *name = feature_sysctls[i].name; - u32 val = 0; - size_t valsize = sizeof(val); - - if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 && - valsize == sizeof(val) && val == 1) - features |= feature_sysctls[i].feature; - } - return features; -} -#elif defined(_WIN32) - -#include - -static u32 query_arm_cpu_features(void) -{ - u32 features = ARM_CPU_FEATURE_NEON; - - if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) - features |= ARM_CPU_FEATURE_PMULL; - if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) - features |= ARM_CPU_FEATURE_CRC32; - - /* FIXME: detect SHA3 and DOTPROD support too. */ - - return features; -} -#else -#error "unhandled case" -#endif - -static const struct cpu_feature arm_cpu_feature_table[] = { - {ARM_CPU_FEATURE_NEON, "neon"}, - {ARM_CPU_FEATURE_PMULL, "pmull"}, - {ARM_CPU_FEATURE_CRC32, "crc32"}, - {ARM_CPU_FEATURE_SHA3, "sha3"}, - {ARM_CPU_FEATURE_DOTPROD, "dotprod"}, -}; - -volatile u32 libdeflate_arm_cpu_features = 0; - -void libdeflate_init_arm_cpu_features(void) -{ - u32 features = query_arm_cpu_features(); - - disable_cpu_features_for_testing(&features, arm_cpu_feature_table, - ARRAY_LEN(arm_cpu_feature_table)); - - libdeflate_arm_cpu_features = features | ARM_CPU_FEATURES_KNOWN; -} - -#endif /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ diff --git a/Sources/DEFLATE/cpu_features.h b/Sources/DEFLATE/cpu_features.h deleted file mode 100644 index c55f007c..00000000 --- a/Sources/DEFLATE/cpu_features.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * arm/cpu_features.h - feature detection for ARM CPUs - * - * Copyright 2018 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#ifndef LIB_ARM_CPU_FEATURES_H -#define LIB_ARM_CPU_FEATURES_H - -#include "../lib_common.h" - -#define HAVE_DYNAMIC_ARM_CPU_FEATURES 0 - -#if defined(ARCH_ARM32) || defined(ARCH_ARM64) - -#if !defined(FREESTANDING) && \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \ - (defined(__linux__) || \ - (defined(__APPLE__) && defined(ARCH_ARM64)) || \ - (defined(_WIN32) && defined(ARCH_ARM64))) -# undef HAVE_DYNAMIC_ARM_CPU_FEATURES -# define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 -#endif - -#define ARM_CPU_FEATURE_NEON 0x00000001 -#define ARM_CPU_FEATURE_PMULL 0x00000002 -#define ARM_CPU_FEATURE_CRC32 0x00000004 -#define ARM_CPU_FEATURE_SHA3 0x00000008 -#define ARM_CPU_FEATURE_DOTPROD 0x00000010 - -#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) -#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) -#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32)) -#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3)) -#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD)) - -#if HAVE_DYNAMIC_ARM_CPU_FEATURES -#define ARM_CPU_FEATURES_KNOWN 0x80000000 -extern volatile u32 libdeflate_arm_cpu_features; - -void libdeflate_init_arm_cpu_features(void); - -static inline u32 get_arm_cpu_features(void) -{ - if (libdeflate_arm_cpu_features == 0) - libdeflate_init_arm_cpu_features(); - return libdeflate_arm_cpu_features; -} -#else /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ -static inline u32 get_arm_cpu_features(void) { return 0; } -#endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */ - -/* NEON */ -#if defined(__ARM_NEON) || defined(ARCH_ARM64) -# define HAVE_NEON_NATIVE 1 -#else -# define HAVE_NEON_NATIVE 0 -#endif -/* - * With both gcc and clang, NEON intrinsics require that the main target has - * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, - * r226563 for arm64), hardware floating point support is sufficient. - */ -#if HAVE_NEON_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP)) -# define HAVE_NEON_INTRIN 1 -#else -# define HAVE_NEON_INTRIN 0 -#endif - -/* PMULL */ -#ifdef __ARM_FEATURE_CRYPTO -# define HAVE_PMULL_NATIVE 1 -#else -# define HAVE_PMULL_NATIVE 0 -#endif -#if HAVE_PMULL_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ - (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \ - defined(_MSC_VER)) && \ - /* - * On arm32 with clang, the crypto intrinsics (which include pmull) - * are not defined, even when using -mfpu=crypto-neon-fp-armv8, - * because clang's puts their definitions behind - * __aarch64__. - */ \ - !(defined(ARCH_ARM32) && defined(__clang__))) -# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ - /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ -# ifdef _MSC_VER -# define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) -# else -# define compat_vmull_p64(a, b) vmull_p64((a), (b)) -# endif -#else -# define HAVE_PMULL_INTRIN 0 -#endif -/* - * Set USE_PMULL_TARGET_EVEN_IF_NATIVE if a workaround for a gcc bug that was - * fixed by commit 11a113d501ff ("aarch64: Simplify feature definitions") in gcc - * 13 is needed. 
A minimal program that fails to build due to this bug when - * compiled with -mcpu=emag, at least with gcc 10 through 12, is: - * - * static inline __attribute__((always_inline,target("+crypto"))) void f() {} - * void g() { f(); } - * - * The error is: - * - * error: inlining failed in call to ‘always_inline’ ‘f’: target specific option mismatch - * - * The workaround is to explicitly add the crypto target to the non-inline - * function g(), even though this should not be required due to -mcpu=emag - * enabling 'crypto' natively and causing __ARM_FEATURE_CRYPTO to be defined. - */ -#if HAVE_PMULL_NATIVE && defined(ARCH_ARM64) && \ - GCC_PREREQ(6, 1) && !GCC_PREREQ(13, 1) -# define USE_PMULL_TARGET_EVEN_IF_NATIVE 1 -#else -# define USE_PMULL_TARGET_EVEN_IF_NATIVE 0 -#endif - -/* CRC32 */ -#ifdef __ARM_FEATURE_CRC32 -# define HAVE_CRC32_NATIVE 1 -#else -# define HAVE_CRC32_NATIVE 0 -#endif -#undef HAVE_CRC32_INTRIN -#if HAVE_CRC32_NATIVE -# define HAVE_CRC32_INTRIN 1 -#elif HAVE_DYNAMIC_ARM_CPU_FEATURES -# if GCC_PREREQ(1, 0) - /* - * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled - * in the main target has been affected by two gcc bugs, which we must avoid - * by only allowing gcc versions that have the corresponding fixes. First, - * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a - * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, - * gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture directives - * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when - * binutils is 2.34 or later, due to - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second - * set of prerequisites, as they are stricter and we have no way to detect - * the binutils version directly from a C source file. - * - * Also exclude the cases where the main target arch is armv6kz or armv7e-m. - * In those cases, gcc doesn't let functions that use the main arch be - * inlined into functions that are targeted to armv8-a+crc. (armv8-a is - * necessary for crc to be accepted at all.) That causes build errors. - * This issue happens for these specific sub-archs because they are not a - * subset of armv8-a. Note: clang does not have this limitation. 
- */ -# if (GCC_PREREQ(11, 3) || \ - (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ - (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))) && \ - !defined(__ARM_ARCH_6KZ__) && \ - !defined(__ARM_ARCH_7EM__) -# define HAVE_CRC32_INTRIN 1 -# endif -# elif CLANG_PREREQ(3, 4, 6000000) -# define HAVE_CRC32_INTRIN 1 -# elif defined(_MSC_VER) -# define HAVE_CRC32_INTRIN 1 -# endif -#endif -#ifndef HAVE_CRC32_INTRIN -# define HAVE_CRC32_INTRIN 0 -#endif - -/* SHA3 (needed for the eor3 instruction) */ -#if defined(ARCH_ARM64) && !defined(_MSC_VER) -# ifdef __ARM_FEATURE_SHA3 -# define HAVE_SHA3_NATIVE 1 -# else -# define HAVE_SHA3_NATIVE 0 -# endif -# define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) /* r256478 */ || \ - CLANG_PREREQ(7, 0, 10010463) /* r338010 */)) -# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \ - (HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ - (GCC_PREREQ(9, 1) /* r268049 */ || \ - CLANG_PREREQ(13, 0, 13160000))) -#else -# define HAVE_SHA3_NATIVE 0 -# define HAVE_SHA3_TARGET 0 -# define HAVE_SHA3_INTRIN 0 -#endif - -/* dotprod */ -#ifdef ARCH_ARM64 -# ifdef __ARM_FEATURE_DOTPROD -# define HAVE_DOTPROD_NATIVE 1 -# else -# define HAVE_DOTPROD_NATIVE 0 -# endif -# if HAVE_DOTPROD_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || \ - defined(_MSC_VER))) -# define HAVE_DOTPROD_INTRIN 1 -# else -# define HAVE_DOTPROD_INTRIN 0 -# endif -#else -# define HAVE_DOTPROD_NATIVE 0 -# define HAVE_DOTPROD_INTRIN 0 -#endif - -/* - * Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are - * only defined when the corresponding __ARM_FEATURE_* macro is defined. The - * intrinsics actually work in target attribute functions too if they are - * defined, though, so work around this by temporarily defining the - * corresponding __ARM_FEATURE_* macros while including the headers. - */ -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(ARCH_ARM32)) -# define __ARM_FEATURE_CRC32 1 -#endif -#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) -# define __ARM_FEATURE_SHA3 1 -#endif -#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) -# define __ARM_FEATURE_DOTPROD 1 -#endif -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(ARCH_ARM32)) -# include -# undef __ARM_FEATURE_CRC32 -#endif -#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) -# include -# undef __ARM_FEATURE_SHA3 -#endif -#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) -# include -# undef __ARM_FEATURE_DOTPROD -#endif - -#endif /* ARCH_ARM32 || ARCH_ARM64 */ - -#endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/Sources/DEFLATE/cpu_features_common.h b/Sources/DEFLATE/cpu_features_common.h index d0531d5c..04b9cedf 100644 --- a/Sources/DEFLATE/cpu_features_common.h +++ b/Sources/DEFLATE/cpu_features_common.h @@ -29,7 +29,7 @@ #define LIB_CPU_FEATURES_COMMON_H #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) - /* for strdup() and strtok_r() */ +/* for strdup() and strtok_r() */ # undef _ANSI_SOURCE # ifndef __APPLE__ # undef _GNU_SOURCE @@ -43,49 +43,49 @@ #include "lib_common.h" struct cpu_feature { - u32 bit; - const char *name; + u32 bit; + const char *name; }; #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) /* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. 
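 * (Usage sketch, for test builds only and only when TEST_SUPPORT__DO_NOT_USE
 * is defined: setting e.g. LIBDEFLATE_DISABLE_CPU_FEATURES=crc32,pmull before
 * running forces the portable fallbacks; the accepted names are whatever the
 * per-architecture feature tables pass into this function.)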
*/ static inline void disable_cpu_features_for_testing(u32 *features, - const struct cpu_feature *feature_table, - size_t feature_table_length) + const struct cpu_feature *feature_table, + size_t feature_table_length) { - char *env_value, *strbuf, *p, *saveptr = NULL; - size_t i; - - env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); - if (!env_value) - return; - strbuf = strdup(env_value); - if (!strbuf) - abort(); - p = strtok_r(strbuf, ",", &saveptr); - while (p) { - for (i = 0; i < feature_table_length; i++) { - if (strcmp(p, feature_table[i].name) == 0) { - *features &= ~feature_table[i].bit; - break; - } - } - if (i == feature_table_length) { - fprintf(stderr, - "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", - p); - abort(); - } - p = strtok_r(NULL, ",", &saveptr); - } - free(strbuf); + char *env_value, *strbuf, *p, *saveptr = NULL; + size_t i; + + env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); + if (!env_value) + return; + strbuf = strdup(env_value); + if (!strbuf) + abort(); + p = strtok_r(strbuf, ",", &saveptr); + while (p) { + for (i = 0; i < feature_table_length; i++) { + if (strcmp(p, feature_table[i].name) == 0) { + *features &= ~feature_table[i].bit; + break; + } + } + if (i == feature_table_length) { + fprintf(stderr, + "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", + p); + abort(); + } + p = strtok_r(NULL, ",", &saveptr); + } + free(strbuf); } #else /* TEST_SUPPORT__DO_NOT_USE */ static inline void disable_cpu_features_for_testing(u32 *features, - const struct cpu_feature *feature_table, - size_t feature_table_length) + const struct cpu_feature *feature_table, + size_t feature_table_length) { } #endif /* !TEST_SUPPORT__DO_NOT_USE */ diff --git a/Sources/DEFLATE/crc32.c b/Sources/DEFLATE/crc32.c index 213dd665..24a15418 100644 --- a/Sources/DEFLATE/crc32.c +++ b/Sources/DEFLATE/crc32.c @@ -33,7 +33,7 @@ * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2), * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute: * - * R(x) = M(x)*x^n mod G(x) + * R(x) = M(x)*x^n mod G(x) * * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x) @@ -44,17 +44,17 @@ * * In the gzip format (RFC 1952): * - * - The bitstring to checksum is formed from the bytes of the uncompressed - * data by concatenating the bits from the bytes in order, proceeding - * from the low-order bit to the high-order bit within each byte. + * - The bitstring to checksum is formed from the bytes of the uncompressed + * data by concatenating the bits from the bytes in order, proceeding + * from the low-order bit to the high-order bit within each byte. * - * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 + - * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1. - * Consequently, the CRC length is 32 bits ("CRC-32"). + * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 + + * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1. + * Consequently, the CRC length is 32 bits ("CRC-32"). * - * - The highest order 32 coefficients of M(x)*x^n are inverted. + * - The highest order 32 coefficients of M(x)*x^n are inverted. * - * - All 32 coefficients of R(x) are inverted. + * - All 32 coefficients of R(x) are inverted. 
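 *
 *   (Sanity-check value: with this definition, the CRC-32 of the 9-byte
 *   ASCII string "123456789" is 0xCBF43926, the customary check value for
 *   this polynomial.)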
* * The two inversions cause added leading and trailing zero bits to affect the * resulting CRC, whereas with a regular CRC such bits would have no effect on @@ -70,35 +70,35 @@ * subtraction can be implemented as bitwise exclusive OR (since we are working * in GF(2)). Here is an unoptimized implementation: * - * static u32 crc32_gzip(const u8 *p, size_t len) - * { - * u32 crc = 0; - * const u32 divisor = 0xEDB88320; + * static u32 crc32_gzip(const u8 *p, size_t len) + * { + * u32 crc = 0; + * const u32 divisor = 0xEDB88320; * - * for (size_t i = 0; i < len * 8 + 32; i++) { - * int bit; - * u32 multiple; + * for (size_t i = 0; i < len * 8 + 32; i++) { + * int bit; + * u32 multiple; * - * if (i < len * 8) - * bit = (p[i / 8] >> (i % 8)) & 1; - * else - * bit = 0; // one of the 32 appended 0 bits + * if (i < len * 8) + * bit = (p[i / 8] >> (i % 8)) & 1; + * else + * bit = 0; // one of the 32 appended 0 bits * - * if (i < 32) // the first 32 bits are inverted - * bit ^= 1; + * if (i < 32) // the first 32 bits are inverted + * bit ^= 1; * - * if (crc & 1) - * multiple = divisor; - * else - * multiple = 0; + * if (crc & 1) + * multiple = divisor; + * else + * multiple = 0; * - * crc >>= 1; - * crc |= (u32)bit << 31; - * crc ^= multiple; - * } + * crc >>= 1; + * crc |= (u32)bit << 31; + * crc ^= multiple; + * } * - * return ~crc; - * } + * return ~crc; + * } * * In this implementation, the 32-bit integer 'crc' maintains the remainder of * the currently processed portion of the message (with 32 zero bits appended) @@ -114,27 +114,27 @@ * 'multiple' until 32 bits later, we need not actually add each message bit * until that point: * - * static u32 crc32_gzip(const u8 *p, size_t len) - * { - * u32 crc = ~0; - * const u32 divisor = 0xEDB88320; + * static u32 crc32_gzip(const u8 *p, size_t len) + * { + * u32 crc = ~0; + * const u32 divisor = 0xEDB88320; * - * for (size_t i = 0; i < len * 8; i++) { - * int bit; - * u32 multiple; + * for (size_t i = 0; i < len * 8; i++) { + * int bit; + * u32 multiple; * - * bit = (p[i / 8] >> (i % 8)) & 1; - * crc ^= bit; - * if (crc & 1) - * multiple = divisor; - * else - * multiple = 0; - * crc >>= 1; - * crc ^= multiple; - * } + * bit = (p[i / 8] >> (i % 8)) & 1; + * crc ^= bit; + * if (crc & 1) + * multiple = divisor; + * else + * multiple = 0; + * crc >>= 1; + * crc ^= multiple; + * } * - * return ~crc; - * } + * return ~crc; + * } * * With the above implementation we get the effect of 32 appended 0 bits for * free; they never affect the choice of a divisor, nor would they change the @@ -165,7 +165,7 @@ * intermediate remainder (which we never actually store explicitly) is 96 bits. * * On CPUs that support fast carryless multiplication, CRCs can be computed even - * more quickly via "folding". See e.g. the x86 PCLMUL implementation. + * more quickly via "folding". See e.g. the x86 PCLMUL implementations. 
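 * (For scale: the slice-by-8 fallback below needs 8 tables of 256 u32
 * entries, i.e. 8 * 256 * 4 = 8 KiB of lookup tables, whereas the pmull
 * folding implementations only carry a few 16-byte multiplier constants.)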
*/ #include "lib_common.h" @@ -176,31 +176,31 @@ static u32 MAYBE_UNUSED crc32_slice8(u32 crc, const u8 *p, size_t len) { - const u8 * const end = p + len; - const u8 *end64; - - for (; ((uintptr_t)p & 7) && p != end; p++) - crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; - - end64 = p + ((end - p) & ~7); - for (; p != end64; p += 8) { - u32 v1 = le32_bswap(*(const u32 *)(p + 0)); - u32 v2 = le32_bswap(*(const u32 *)(p + 4)); - - crc = crc32_slice8_table[0x700 + (u8)((crc ^ v1) >> 0)] ^ - crc32_slice8_table[0x600 + (u8)((crc ^ v1) >> 8)] ^ - crc32_slice8_table[0x500 + (u8)((crc ^ v1) >> 16)] ^ - crc32_slice8_table[0x400 + (u8)((crc ^ v1) >> 24)] ^ - crc32_slice8_table[0x300 + (u8)(v2 >> 0)] ^ - crc32_slice8_table[0x200 + (u8)(v2 >> 8)] ^ - crc32_slice8_table[0x100 + (u8)(v2 >> 16)] ^ - crc32_slice8_table[0x000 + (u8)(v2 >> 24)]; - } - - for (; p != end; p++) - crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; - - return crc; + const u8 * const end = p + len; + const u8 *end64; + + for (; ((uintptr_t)p & 7) && p != end; p++) + crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; + + end64 = p + ((end - p) & ~7); + for (; p != end64; p += 8) { + u32 v1 = le32_bswap(*(const u32 *)(p + 0)); + u32 v2 = le32_bswap(*(const u32 *)(p + 4)); + + crc = crc32_slice8_table[0x700 + (u8)((crc ^ v1) >> 0)] ^ + crc32_slice8_table[0x600 + (u8)((crc ^ v1) >> 8)] ^ + crc32_slice8_table[0x500 + (u8)((crc ^ v1) >> 16)] ^ + crc32_slice8_table[0x400 + (u8)((crc ^ v1) >> 24)] ^ + crc32_slice8_table[0x300 + (u8)(v2 >> 0)] ^ + crc32_slice8_table[0x200 + (u8)(v2 >> 8)] ^ + crc32_slice8_table[0x100 + (u8)(v2 >> 16)] ^ + crc32_slice8_table[0x000 + (u8)(v2 >> 24)]; + } + + for (; p != end; p++) + crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; + + return crc; } /* @@ -211,11 +211,11 @@ crc32_slice8(u32 crc, const u8 *p, size_t len) static forceinline u32 MAYBE_UNUSED crc32_slice1(u32 crc, const u8 *p, size_t len) { - size_t i; - - for (i = 0; i < len; i++) - crc = (crc >> 8) ^ crc32_slice1_table[(u8)crc ^ p[i]]; - return crc; + size_t i; + + for (i = 0; i < len; i++) + crc = (crc >> 8) ^ crc32_slice1_table[(u8)crc ^ p[i]]; + return crc; } /* Include architecture-specific implementation(s) if available. */ @@ -223,7 +223,7 @@ crc32_slice1(u32 crc, const u8 *p, size_t len) #undef arch_select_crc32_func typedef u32 (*crc32_func_t)(u32 crc, const u8 *p, size_t len); #if defined(ARCH_ARM32) || defined(ARCH_ARM64) -# include "crc32_impl.h" +# include "arm/crc32_impl.h" #elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/crc32_impl.h" #endif @@ -240,13 +240,13 @@ static volatile crc32_func_t crc32_impl = dispatch_crc32; /* Choose the best implementation at runtime. */ static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len) { - crc32_func_t f = arch_select_crc32_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - crc32_impl = f; - return f(crc, p, len); + crc32_func_t f = arch_select_crc32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + crc32_impl = f; + return f(crc, p, len); } #else /* The best implementation is statically known, so call it directly. */ @@ -256,7 +256,7 @@ static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len) LIBDEFLATEAPI u32 libdeflate_crc32(u32 crc, const void *p, size_t len) { - if (p == NULL) /* Return initial value. */ - return 0; - return ~crc32_impl(~crc, p, len); + if (p == NULL) /* Return initial value. 
*/ + return 0; + return ~crc32_impl(~crc, p, len); } diff --git a/Sources/DEFLATE/crc32_impl.h b/Sources/DEFLATE/crc32_impl.h deleted file mode 100644 index c802cdf0..00000000 --- a/Sources/DEFLATE/crc32_impl.h +++ /dev/null @@ -1,682 +0,0 @@ -/* - * arm/crc32_impl.h - ARM implementations of the gzip CRC-32 algorithm - * - * Copyright 2022 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef LIB_ARM_CRC32_IMPL_H -#define LIB_ARM_CRC32_IMPL_H - -#include "cpu_features.h" - -/* - * crc32_arm_crc() - implementation using crc32 instructions (only) - * - * In general this implementation is straightforward. However, naive use of the - * crc32 instructions is serial: one of the two inputs to each crc32 instruction - * is the output of the previous one. To take advantage of CPUs that can - * execute multiple crc32 instructions in parallel, when possible we interleave - * the checksumming of several adjacent chunks, then combine their CRCs. - * - * However, without pmull, combining CRCs is fairly slow. So in this pmull-less - * version, we only use a large chunk length, and thus we only do chunked - * processing if there is a lot of data to checksum. This also means that a - * variable chunk length wouldn't help much, so we just support a fixed length. - */ -#if HAVE_CRC32_INTRIN -# if HAVE_CRC32_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("armv8-a,crc") -# elif defined(__ARM_PCS_VFP) - /* - * +simd is needed to avoid a "selected architecture lacks an FPU" - * error with Debian arm-linux-gnueabihf-gcc when -mfpu is not - * explicitly specified on the command line. - */ -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc+simd") -# else -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc") -# endif -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("crc") -# else -# define ATTRIBUTES _target_attribute("+crc") -# endif -# endif -# endif - -#ifndef _MSC_VER -# include -#endif - -/* - * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN - * bytes each by computing: - * - * [ crc0*x^(3*8*L) + crc1*x^(2*8*L) + crc2*x^(1*8*L) + crc3 ] mod G(x) - * - * This has been optimized in several ways: - * - * - The needed multipliers (x to some power, reduced mod G(x)) were - * precomputed. - * - * - The 3 multiplications are interleaved. - * - * - The reduction mod G(x) is delayed to the end and done using __crc32d. 
- * Note that the use of __crc32d introduces an extra factor of x^32. To - * cancel that out along with the extra factor of x^1 that gets introduced - * because of how the 63-bit products are aligned in their 64-bit integers, - * the multipliers are actually x^(j*8*L - 33) instead of x^(j*8*L). - */ -static forceinline ATTRIBUTES u32 -combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3) -{ - u64 res0 = 0, res1 = 0, res2 = 0; - int i; - - /* Multiply crc{0,1,2} by CRC32_FIXED_CHUNK_MULT_{3,2,1}. */ - for (i = 0; i < 32; i++) { - if (CRC32_FIXED_CHUNK_MULT_3 & (1U << i)) - res0 ^= (u64)crc0 << i; - if (CRC32_FIXED_CHUNK_MULT_2 & (1U << i)) - res1 ^= (u64)crc1 << i; - if (CRC32_FIXED_CHUNK_MULT_1 & (1U << i)) - res2 ^= (u64)crc2 << i; - } - /* Add the different parts and reduce mod G(x). */ - return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; -} - -#define crc32_arm_crc crc32_arm_crc -static u32 ATTRIBUTES MAYBE_UNUSED -crc32_arm_crc(u32 crc, const u8 *p, size_t len) -{ - if (len >= 64) { - const size_t align = -(uintptr_t)p & 7; - - /* Align p to the next 8-byte boundary. */ - if (align) { - if (align & 1) - crc = __crc32b(crc, *p++); - if (align & 2) { - crc = __crc32h(crc, le16_bswap(*(u16 *)p)); - p += 2; - } - if (align & 4) { - crc = __crc32w(crc, le32_bswap(*(u32 *)p)); - p += 4; - } - len -= align; - } - /* - * Interleave the processing of multiple adjacent data chunks to - * take advantage of instruction-level parallelism. - * - * Some CPUs don't prefetch the data if it's being fetched in - * multiple interleaved streams, so do explicit prefetching. - */ - while (len >= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN) { - const u64 *wp0 = (const u64 *)p; - const u64 * const wp0_end = - (const u64 *)(p + CRC32_FIXED_CHUNK_LEN); - u32 crc1 = 0, crc2 = 0, crc3 = 0; - - STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); - STATIC_ASSERT(CRC32_FIXED_CHUNK_LEN % (4 * 8) == 0); - do { - prefetchr(&wp0[64 + 0*CRC32_FIXED_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 1*CRC32_FIXED_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 2*CRC32_FIXED_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 3*CRC32_FIXED_CHUNK_LEN/8]); - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - } while (wp0 != wp0_end); - crc = combine_crcs_slow(crc, crc1, crc2, crc3); - p += CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; - len -= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; - } - /* - * Due to the large fixed chunk length used above, there might - * still be a lot of data left. 
So use a 64-byte loop here, - * instead of a loop that is less unrolled. - */ - while (len >= 64) { - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 32))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 40))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 48))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 56))); - p += 64; - len -= 64; - } - } - if (len & 32) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - crc = __crc32d(crc, get_unaligned_le64(p + 16)); - crc = __crc32d(crc, get_unaligned_le64(p + 24)); - p += 32; - } - if (len & 16) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - p += 16; - } - if (len & 8) { - crc = __crc32d(crc, get_unaligned_le64(p)); - p += 8; - } - if (len & 4) { - crc = __crc32w(crc, get_unaligned_le32(p)); - p += 4; - } - if (len & 2) { - crc = __crc32h(crc, get_unaligned_le16(p)); - p += 2; - } - if (len & 1) - crc = __crc32b(crc, *p); - return crc; -} -#undef ATTRIBUTES -#endif /* crc32_arm_crc() */ - -/* - * crc32_arm_crc_pmullcombine() - implementation using crc32 instructions, plus - * pmull instructions for CRC combining - * - * This is similar to crc32_arm_crc(), but it enables the use of pmull - * (carryless multiplication) instructions for the steps where the CRCs of - * adjacent data chunks are combined. As this greatly speeds up CRC - * combination, this implementation also differs from crc32_arm_crc() in that it - * uses a variable chunk length which can get fairly small. The precomputed - * multipliers needed for the selected chunk length are loaded from a table. - * - * Note that pmull is used here only for combining the CRCs of separately - * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*() - * for implementations that use pmull for folding the data itself. - */ -#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN -# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8") -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("crc,aes") -# else -# define ATTRIBUTES _target_attribute("+crc,+crypto") -# endif -# endif -# endif - -#ifndef _MSC_VER -# include <arm_acle.h> -#endif -#include <arm_neon.h> - -/* Do carryless multiplication of two 32-bit values. */ -static forceinline ATTRIBUTES u64 -clmul_u32(u32 a, u32 b) -{ - uint64x2_t res = vreinterpretq_u64_p128( - compat_vmull_p64((poly64_t)a, (poly64_t)b)); - - return vgetq_lane_u64(res, 0); -} - -/* - * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more - * quickly, and supports a variable chunk length. The chunk length is - * 'i * CRC32_MIN_VARIABLE_CHUNK_LEN' - * where 1 <= i < ARRAY_LEN(crc32_mults_for_chunklen). 
- */ -static forceinline ATTRIBUTES u32 -combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) -{ - u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]); - u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]); - u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]); - - return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; -} - -#define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine -static u32 ATTRIBUTES MAYBE_UNUSED -crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) -{ - const size_t align = -(uintptr_t)p & 7; - - if (len >= align + CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { - /* Align p to the next 8-byte boundary. */ - if (align) { - if (align & 1) - crc = __crc32b(crc, *p++); - if (align & 2) { - crc = __crc32h(crc, le16_bswap(*(u16 *)p)); - p += 2; - } - if (align & 4) { - crc = __crc32w(crc, le32_bswap(*(u32 *)p)); - p += 4; - } - len -= align; - } - /* - * Handle CRC32_MAX_VARIABLE_CHUNK_LEN specially, so that better - * code is generated for it. - */ - while (len >= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN) { - const u64 *wp0 = (const u64 *)p; - const u64 * const wp0_end = - (const u64 *)(p + CRC32_MAX_VARIABLE_CHUNK_LEN); - u32 crc1 = 0, crc2 = 0, crc3 = 0; - - STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); - STATIC_ASSERT(CRC32_MAX_VARIABLE_CHUNK_LEN % (4 * 8) == 0); - do { - prefetchr(&wp0[64 + 0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - } while (wp0 != wp0_end); - crc = combine_crcs_fast(crc, crc1, crc2, crc3, - ARRAY_LEN(crc32_mults_for_chunklen) - 1); - p += CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; - len -= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; - } - /* Handle up to one variable-length chunk. 
*/ - if (len >= CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { - const size_t i = len / (CRC32_NUM_CHUNKS * - CRC32_MIN_VARIABLE_CHUNK_LEN); - const size_t chunk_len = - i * CRC32_MIN_VARIABLE_CHUNK_LEN; - const u64 *wp0 = (const u64 *)(p + 0*chunk_len); - const u64 *wp1 = (const u64 *)(p + 1*chunk_len); - const u64 *wp2 = (const u64 *)(p + 2*chunk_len); - const u64 *wp3 = (const u64 *)(p + 3*chunk_len); - const u64 * const wp0_end = wp1; - u32 crc1 = 0, crc2 = 0, crc3 = 0; - - STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); - STATIC_ASSERT(CRC32_MIN_VARIABLE_CHUNK_LEN % (4 * 8) == 0); - do { - prefetchr(wp0 + 64); - prefetchr(wp1 + 64); - prefetchr(wp2 + 64); - prefetchr(wp3 + 64); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - } while (wp0 != wp0_end); - crc = combine_crcs_fast(crc, crc1, crc2, crc3, i); - p += CRC32_NUM_CHUNKS * chunk_len; - len -= CRC32_NUM_CHUNKS * chunk_len; - } - - while (len >= 32) { - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); - p += 32; - len -= 32; - } - } else { - while (len >= 32) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - crc = __crc32d(crc, get_unaligned_le64(p + 16)); - crc = __crc32d(crc, get_unaligned_le64(p + 24)); - p += 32; - len -= 32; - } - } - if (len & 16) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - p += 16; - } - if (len & 8) { - crc = __crc32d(crc, get_unaligned_le64(p)); - p += 8; - } - if (len & 4) { - crc = __crc32w(crc, get_unaligned_le32(p)); - p += 4; - } - if (len & 2) { - crc = __crc32h(crc, get_unaligned_le16(p)); - p += 2; - } - if (len & 1) - crc = __crc32b(crc, *p); - return crc; -} -#undef ATTRIBUTES -#endif /* crc32_arm_crc_pmullcombine() */ - -/* - * crc32_arm_pmullx4() - implementation using "folding" with pmull instructions - * - * This implementation is intended for CPUs that support pmull instructions but - * not crc32 instructions. - */ -#if HAVE_PMULL_INTRIN -# define crc32_arm_pmullx4 crc32_arm_pmullx4 -# define SUFFIX _pmullx4 -# if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8") -# else -# ifdef __clang__ - /* - * This used to use "crypto", but that stopped working with clang 16. - * Now only "aes" works. "aes" works with older versions too, so use - * that. No "+" prefix; clang 15 and earlier doesn't accept that. - */ -# define ATTRIBUTES _target_attribute("aes") -# else - /* - * With gcc, only "+crypto" works. Both the "+" prefix and the - * "crypto" (not "aes") are essential... 
- */ -# define ATTRIBUTES _target_attribute("+crypto") -# endif -# endif -# endif -# define ENABLE_EOR3 0 -# include "crc32_pmull_helpers.h" - -static u32 ATTRIBUTES MAYBE_UNUSED -crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) -{ - static const u64 _aligned_attribute(16) mults[3][2] = { - CRC32_1VECS_MULTS, - CRC32_4VECS_MULTS, - CRC32_2VECS_MULTS, - }; - static const u64 _aligned_attribute(16) final_mults[3][2] = { - { CRC32_FINAL_MULT, 0 }, - { CRC32_BARRETT_CONSTANT_1, 0 }, - { CRC32_BARRETT_CONSTANT_2, 0 }, - }; - const uint8x16_t zeroes = vdupq_n_u8(0); - const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF)); - const poly64x2_t multipliers_1 = load_multipliers(mults[0]); - uint8x16_t v0, v1, v2, v3; - - if (len < 64 + 15) { - if (len < 16) - return crc32_slice1(crc, p, len); - v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); - p += 16; - len -= 16; - while (len >= 16) { - v0 = fold_vec(v0, vld1q_u8(p), multipliers_1); - p += 16; - len -= 16; - } - } else { - const poly64x2_t multipliers_4 = load_multipliers(mults[1]); - const poly64x2_t multipliers_2 = load_multipliers(mults[2]); - const size_t align = -(uintptr_t)p & 15; - const uint8x16_t *vp; - - v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); - p += 16; - /* Align p to the next 16-byte boundary. */ - if (align) { - v0 = fold_partial_vec(v0, p, align, multipliers_1); - p += align; - len -= align; - } - vp = (const uint8x16_t *)p; - v1 = *vp++; - v2 = *vp++; - v3 = *vp++; - while (len >= 64 + 64) { - v0 = fold_vec(v0, *vp++, multipliers_4); - v1 = fold_vec(v1, *vp++, multipliers_4); - v2 = fold_vec(v2, *vp++, multipliers_4); - v3 = fold_vec(v3, *vp++, multipliers_4); - len -= 64; - } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); - if (len & 32) { - v0 = fold_vec(v0, *vp++, multipliers_2); - v1 = fold_vec(v1, *vp++, multipliers_2); - } - v0 = fold_vec(v0, v1, multipliers_1); - if (len & 16) - v0 = fold_vec(v0, *vp++, multipliers_1); - p = (const u8 *)vp; - len &= 15; - } - - /* Handle any remaining partial block now before reducing to 32 bits. */ - if (len) - v0 = fold_partial_vec(v0, p, len, multipliers_1); - - /* - * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, - * which is equivalent to multiplying by x^32. This is needed because - * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). - */ - - v0 = veorq_u8(vextq_u8(v0, zeroes, 8), - clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1)); - - /* Fold 96 => 64 bits. */ - v0 = veorq_u8(vextq_u8(v0, zeroes, 4), - clmul_low(vandq_u8(v0, mask32), - load_multipliers(final_mults[0]))); - - /* Reduce 64 => 32 bits using Barrett reduction. */ - v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1])); - v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2])); - return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1); -} -#undef SUFFIX -#undef ATTRIBUTES -#undef ENABLE_EOR3 -#endif /* crc32_arm_pmullx4() */ - -/* - * crc32_arm_pmullx12_crc() - large-stride implementation using "folding" with - * pmull instructions, where crc32 instructions are also available - * - * See crc32_pmull_wide.h for explanation. 
*/ -#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN -# define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc -# define SUFFIX _pmullx12_crc -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("aes,crc") -# else -# define ATTRIBUTES _target_attribute("+crypto,+crc") -# endif -# endif -# define ENABLE_EOR3 0 -# include "crc32_pmull_wide.h" -#endif - -/* - * crc32_arm_pmullx12_crc_eor3() - * - * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from - * the sha3 extension) for even better performance. - * - * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than - * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3. - */ -#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ - (HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE) -# define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 -# define SUFFIX _pmullx12_crc_eor3 -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE && \ - !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("aes,crc,sha3") - /* - * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the - * default target is armv8.3-a or later in which case it must be omitted. - * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. - */ -# elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3") -# else -# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3") -# endif -# endif -# define ENABLE_EOR3 1 -# include "crc32_pmull_wide.h" -#endif - -/* - * On the Apple M1 processor, crc32 instructions max out at about 25.5 GB/s in - * the best case of using a 3-way or greater interleaved chunked implementation, - * whereas a pmull-based implementation achieves 68 GB/s provided that the - * stride length is large enough (about 10+ vectors with eor3, or 12+ without). - * - * For now we assume that crc32 instructions are preferable in other cases. - */ -#define PREFER_PMULL_TO_CRC 0 -#ifdef __APPLE__ -# include <TargetConditionals.h> -# if TARGET_OS_OSX -# undef PREFER_PMULL_TO_CRC -# define PREFER_PMULL_TO_CRC 1 -# endif -#endif - -/* - * If the best implementation is statically available, use it unconditionally. - * Otherwise choose the best implementation at runtime. 
- */ -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) && \ - HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE -# define DEFAULT_IMPL crc32_arm_pmullx12_crc_eor3 -#elif !PREFER_PMULL_TO_CRC && defined(crc32_arm_crc_pmullcombine) && \ - HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE -# define DEFAULT_IMPL crc32_arm_crc_pmullcombine -#else -static inline crc32_func_t -arch_select_crc32_func(void) -{ - const u32 features MAYBE_UNUSED = get_arm_cpu_features(); - -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) - if (HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features)) - return crc32_arm_pmullx12_crc_eor3; -#endif -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc) - if (HAVE_PMULL(features) && HAVE_CRC32(features)) - return crc32_arm_pmullx12_crc; -#endif -#ifdef crc32_arm_crc_pmullcombine - if (HAVE_CRC32(features) && HAVE_PMULL(features)) - return crc32_arm_crc_pmullcombine; -#endif -#ifdef crc32_arm_crc - if (HAVE_CRC32(features)) - return crc32_arm_crc; -#endif -#ifdef crc32_arm_pmullx4 - if (HAVE_PMULL(features)) - return crc32_arm_pmullx4; -#endif - return NULL; -} -#define arch_select_crc32_func arch_select_crc32_func -#endif - -#endif /* LIB_ARM_CRC32_IMPL_H */ diff --git a/Sources/DEFLATE/crc32_multipliers.h b/Sources/DEFLATE/crc32_multipliers.h index 580b775b..d8e92adb 100644 --- a/Sources/DEFLATE/crc32_multipliers.h +++ b/Sources/DEFLATE/crc32_multipliers.h @@ -4,55 +4,103 @@ * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT. */ -#define CRC32_1VECS_MULT_1 0xae689191 /* x^159 mod G(x) */ -#define CRC32_1VECS_MULT_2 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_1VECS_MULTS { CRC32_1VECS_MULT_1, CRC32_1VECS_MULT_2 } +#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */ +#define CRC32_X95_MODG 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_2VECS_MULT_1 0xf1da05aa /* x^287 mod G(x) */ -#define CRC32_2VECS_MULT_2 0x81256527 /* x^223 mod G(x) */ -#define CRC32_2VECS_MULTS { CRC32_2VECS_MULT_1, CRC32_2VECS_MULT_2 } +#define CRC32_X287_MODG 0xf1da05aa /* x^287 mod G(x) */ +#define CRC32_X223_MODG 0x81256527 /* x^223 mod G(x) */ -#define CRC32_3VECS_MULT_1 0x3db1ecdc /* x^415 mod G(x) */ -#define CRC32_3VECS_MULT_2 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_3VECS_MULTS { CRC32_3VECS_MULT_1, CRC32_3VECS_MULT_2 } +#define CRC32_X415_MODG 0x3db1ecdc /* x^415 mod G(x) */ +#define CRC32_X351_MODG 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_4VECS_MULT_1 0x8f352d95 /* x^543 mod G(x) */ -#define CRC32_4VECS_MULT_2 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_4VECS_MULTS { CRC32_4VECS_MULT_1, CRC32_4VECS_MULT_2 } +#define CRC32_X543_MODG 0x8f352d95 /* x^543 mod G(x) */ +#define CRC32_X479_MODG 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_5VECS_MULT_1 0x1c279815 /* x^671 mod G(x) */ -#define CRC32_5VECS_MULT_2 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_5VECS_MULTS { CRC32_5VECS_MULT_1, CRC32_5VECS_MULT_2 } +#define CRC32_X671_MODG 0x1c279815 /* x^671 mod G(x) */ +#define CRC32_X607_MODG 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_6VECS_MULT_1 0xdf068dc2 /* x^799 mod G(x) */ -#define CRC32_6VECS_MULT_2 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_6VECS_MULTS { CRC32_6VECS_MULT_1, CRC32_6VECS_MULT_2 } +#define CRC32_X799_MODG 0xdf068dc2 /* x^799 mod G(x) */ +#define CRC32_X735_MODG 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_7VECS_MULT_1 0x31f8303f /* x^927 mod G(x) */ -#define CRC32_7VECS_MULT_2 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_7VECS_MULTS { CRC32_7VECS_MULT_1, CRC32_7VECS_MULT_2 } +#define 
CRC32_X927_MODG 0x31f8303f /* x^927 mod G(x) */ +#define CRC32_X863_MODG 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_8VECS_MULT_1 0x33fff533 /* x^1055 mod G(x) */ -#define CRC32_8VECS_MULT_2 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_8VECS_MULTS { CRC32_8VECS_MULT_1, CRC32_8VECS_MULT_2 } +#define CRC32_X1055_MODG 0x33fff533 /* x^1055 mod G(x) */ +#define CRC32_X991_MODG 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_9VECS_MULT_1 0x26b70c3d /* x^1183 mod G(x) */ -#define CRC32_9VECS_MULT_2 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_9VECS_MULTS { CRC32_9VECS_MULT_1, CRC32_9VECS_MULT_2 } +#define CRC32_X1183_MODG 0x26b70c3d /* x^1183 mod G(x) */ +#define CRC32_X1119_MODG 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_10VECS_MULT_1 0xe3543be0 /* x^1311 mod G(x) */ -#define CRC32_10VECS_MULT_2 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_10VECS_MULTS { CRC32_10VECS_MULT_1, CRC32_10VECS_MULT_2 } +#define CRC32_X1311_MODG 0xe3543be0 /* x^1311 mod G(x) */ +#define CRC32_X1247_MODG 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_11VECS_MULT_1 0x5a1bb05d /* x^1439 mod G(x) */ -#define CRC32_11VECS_MULT_2 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_11VECS_MULTS { CRC32_11VECS_MULT_1, CRC32_11VECS_MULT_2 } +#define CRC32_X1439_MODG 0x5a1bb05d /* x^1439 mod G(x) */ +#define CRC32_X1375_MODG 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_12VECS_MULT_1 0x596c8d81 /* x^1567 mod G(x) */ -#define CRC32_12VECS_MULT_2 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_12VECS_MULTS { CRC32_12VECS_MULT_1, CRC32_12VECS_MULT_2 } +#define CRC32_X1567_MODG 0x596c8d81 /* x^1567 mod G(x) */ +#define CRC32_X1503_MODG 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_FINAL_MULT 0xb8bc6765 /* x^63 mod G(x) */ +#define CRC32_X1695_MODG 0x682bdd4f /* x^1695 mod G(x) */ +#define CRC32_X1631_MODG 0x3c656ced /* x^1631 mod G(x) */ + +#define CRC32_X1823_MODG 0x4a28bd43 /* x^1823 mod G(x) */ +#define CRC32_X1759_MODG 0xfe807bbd /* x^1759 mod G(x) */ + +#define CRC32_X1951_MODG 0x0077f00d /* x^1951 mod G(x) */ +#define CRC32_X1887_MODG 0x1f0c2cdd /* x^1887 mod G(x) */ + +#define CRC32_X2079_MODG 0xce3371cb /* x^2079 mod G(x) */ +#define CRC32_X2015_MODG 0xe95c1271 /* x^2015 mod G(x) */ + +#define CRC32_X2207_MODG 0xa749e894 /* x^2207 mod G(x) */ +#define CRC32_X2143_MODG 0xb918a347 /* x^2143 mod G(x) */ + +#define CRC32_X2335_MODG 0x2c538639 /* x^2335 mod G(x) */ +#define CRC32_X2271_MODG 0x71d54a59 /* x^2271 mod G(x) */ + +#define CRC32_X2463_MODG 0x32b0733c /* x^2463 mod G(x) */ +#define CRC32_X2399_MODG 0xff6f2fc2 /* x^2399 mod G(x) */ + +#define CRC32_X2591_MODG 0x0e9bd5cc /* x^2591 mod G(x) */ +#define CRC32_X2527_MODG 0xcec97417 /* x^2527 mod G(x) */ + +#define CRC32_X2719_MODG 0x76278617 /* x^2719 mod G(x) */ +#define CRC32_X2655_MODG 0x1c63267b /* x^2655 mod G(x) */ + +#define CRC32_X2847_MODG 0xc51b93e3 /* x^2847 mod G(x) */ +#define CRC32_X2783_MODG 0xf183c71b /* x^2783 mod G(x) */ + +#define CRC32_X2975_MODG 0x7eaed122 /* x^2975 mod G(x) */ +#define CRC32_X2911_MODG 0x9b9bdbd0 /* x^2911 mod G(x) */ + +#define CRC32_X3103_MODG 0x2ce423f1 /* x^3103 mod G(x) */ +#define CRC32_X3039_MODG 0xd31343ea /* x^3039 mod G(x) */ + +#define CRC32_X3231_MODG 0x8b8d8645 /* x^3231 mod G(x) */ +#define CRC32_X3167_MODG 0x4470ac44 /* x^3167 mod G(x) */ + +#define CRC32_X3359_MODG 0x4b700aa8 /* x^3359 mod G(x) */ +#define CRC32_X3295_MODG 0xeea395c4 /* x^3295 mod G(x) */ + +#define CRC32_X3487_MODG 0xeff5e99d /* x^3487 mod G(x) */ +#define CRC32_X3423_MODG 0xf9d9c7ee /* x^3423 mod G(x) */ + +#define CRC32_X3615_MODG 
0xad0d2bb2 /* x^3615 mod G(x) */ +#define CRC32_X3551_MODG 0xcd669a40 /* x^3551 mod G(x) */ + +#define CRC32_X3743_MODG 0x9fb66bd3 /* x^3743 mod G(x) */ +#define CRC32_X3679_MODG 0x6d40f445 /* x^3679 mod G(x) */ + +#define CRC32_X3871_MODG 0xc2dcc467 /* x^3871 mod G(x) */ +#define CRC32_X3807_MODG 0x9ee62949 /* x^3807 mod G(x) */ + +#define CRC32_X3999_MODG 0x398e2ff2 /* x^3999 mod G(x) */ +#define CRC32_X3935_MODG 0x145575d5 /* x^3935 mod G(x) */ + +#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */ +#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */ + +#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */ #define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */ #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */ #define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 } @@ -63,263 +111,263 @@ /* Multipliers for implementations that use a variable chunk length */ static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = { - { 0 /* unused row */ }, - /* chunk_len=128 */ - { 0xd31343ea /* x^3039 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, 0x910eeec1 /* x^991 mod G(x) */, }, - /* chunk_len=256 */ - { 0x1d6708a0 /* x^6111 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, }, - /* chunk_len=384 */ - { 0xdb3839f3 /* x^9183 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, 0xd31343ea /* x^3039 mod G(x) */, }, - /* chunk_len=512 */ - { 0x1753ab84 /* x^12255 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, }, - /* chunk_len=640 */ - { 0x3796455c /* x^15327 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, 0xc352f6de /* x^5087 mod G(x) */, }, - /* chunk_len=768 */ - { 0x3954de39 /* x^18399 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, }, - /* chunk_len=896 */ - { 0x632d78c5 /* x^21471 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, 0x9a1b53c8 /* x^7135 mod G(x) */, }, - /* chunk_len=1024 */ - { 0xa0decef3 /* x^24543 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, }, - /* chunk_len=1152 */ - { 0xe9c09bb0 /* x^27615 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, 0xdb3839f3 /* x^9183 mod G(x) */, }, - /* chunk_len=1280 */ - { 0xd51917a4 /* x^30687 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, }, - /* chunk_len=1408 */ - { 0x154a8a62 /* x^33759 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, 0x3e9a43cd /* x^11231 mod G(x) */, }, - /* chunk_len=1536 */ - { 0xf196555d /* x^36831 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, }, - /* chunk_len=1664 */ - { 0x8eec2999 /* x^39903 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, 0x6044fbb0 /* x^13279 mod G(x) */, }, - /* chunk_len=1792 */ - { 0x27892abf /* x^42975 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, }, - /* chunk_len=1920 */ - { 0x77bc2419 /* x^46047 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, 0x3796455c /* x^15327 mod G(x) */, }, - /* chunk_len=2048 */ - { 0xcea114a5 /* x^49119 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, }, - /* chunk_len=2176 */ - { 0xa1077e85 /* x^52191 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, 0x0c21f835 /* x^17375 mod G(x) */, }, - /* chunk_len=2304 */ - { 0xc5ed75e1 /* x^55263 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, }, - /* chunk_len=2432 */ - { 0xca4fba3f /* x^58335 mod G(x) */, 0x0acfa26f /* x^38879 
mod G(x) */, 0x6cb21510 /* x^19423 mod G(x) */, }, - /* chunk_len=2560 */ - { 0xcf5bcdc4 /* x^61407 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, }, - /* chunk_len=2688 */ - { 0xf36b9d16 /* x^64479 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, 0x632d78c5 /* x^21471 mod G(x) */, }, - /* chunk_len=2816 */ - { 0xf76fd988 /* x^67551 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, }, - /* chunk_len=2944 */ - { 0x6c45d92e /* x^70623 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, 0x0c46baec /* x^23519 mod G(x) */, }, - /* chunk_len=3072 */ - { 0x6116b82b /* x^73695 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, }, - /* chunk_len=3200 */ - { 0x4d9899bb /* x^76767 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, 0x53deb236 /* x^25567 mod G(x) */, }, - /* chunk_len=3328 */ - { 0x3e7c93b9 /* x^79839 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, }, - /* chunk_len=3456 */ - { 0x388b20ac /* x^82911 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, 0xe9c09bb0 /* x^27615 mod G(x) */, }, - /* chunk_len=3584 */ - { 0x0956d953 /* x^85983 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, }, - /* chunk_len=3712 */ - { 0x55cb4dfe /* x^89055 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, 0xc07331b3 /* x^29663 mod G(x) */, }, - /* chunk_len=3840 */ - { 0x52222fea /* x^92127 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, }, - /* chunk_len=3968 */ - { 0x0603989b /* x^95199 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, 0x5e04b9a5 /* x^31711 mod G(x) */, }, - /* chunk_len=4096 */ - { 0x4470c029 /* x^98271 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, }, - /* chunk_len=4224 */ - { 0xb6f35093 /* x^101343 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, 0x154a8a62 /* x^33759 mod G(x) */, }, - /* chunk_len=4352 */ - { 0xc46805ba /* x^104415 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, }, - /* chunk_len=4480 */ - { 0xc3876592 /* x^107487 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, 0xc35cf6e7 /* x^35807 mod G(x) */, }, - /* chunk_len=4608 */ - { 0x5b0c98b9 /* x^110559 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, }, - /* chunk_len=4736 */ - { 0x30d13e5f /* x^113631 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, 0x8c224466 /* x^37855 mod G(x) */, }, - /* chunk_len=4864 */ - { 0x54afca53 /* x^116703 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, }, - /* chunk_len=4992 */ - { 0x93102436 /* x^119775 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, 0x8eec2999 /* x^39903 mod G(x) */, }, - /* chunk_len=5120 */ - { 0xbd2655a8 /* x^122847 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, }, - /* chunk_len=5248 */ - { 0x70cd7f26 /* x^125919 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, 0x1691be45 /* x^41951 mod G(x) */, }, - /* chunk_len=5376 */ - { 0x2d546c53 /* x^128991 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, }, - /* chunk_len=5504 */ - { 0xb53410a8 /* x^132063 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, 0x161f3c12 /* x^43999 mod G(x) */, }, - /* chunk_len=5632 */ - { 0x67a93f75 /* x^135135 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, }, - /* chunk_len=5760 */ - { 0x9830ac33 /* x^138207 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, 0x77bc2419 /* 
x^46047 mod G(x) */, }, - /* chunk_len=5888 */ - { 0xb0b6fc3e /* x^141279 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, }, - /* chunk_len=6016 */ - { 0x84170f16 /* x^144351 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, 0x30de0f98 /* x^48095 mod G(x) */, }, - /* chunk_len=6144 */ - { 0xd7017a0c /* x^147423 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, }, - /* chunk_len=6272 */ - { 0xadb25de6 /* x^150495 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, 0x2b7e0e1b /* x^50143 mod G(x) */, }, - /* chunk_len=6400 */ - { 0x8282fddc /* x^153567 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, }, - /* chunk_len=6528 */ - { 0x46362bee /* x^156639 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, 0xa1077e85 /* x^52191 mod G(x) */, }, - /* chunk_len=6656 */ - { 0xb9077a01 /* x^159711 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, }, - /* chunk_len=6784 */ - { 0xf51d9bc6 /* x^162783 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, 0x7e774cf6 /* x^54239 mod G(x) */, }, - /* chunk_len=6912 */ - { 0x4ca19a29 /* x^165855 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, }, - /* chunk_len=7040 */ - { 0xdc0fc3fc /* x^168927 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, 0x3678fed2 /* x^56287 mod G(x) */, }, - /* chunk_len=7168 */ - { 0x63c3d167 /* x^171999 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, }, - /* chunk_len=7296 */ - { 0x5851d254 /* x^175071 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, 0xca4fba3f /* x^58335 mod G(x) */, }, - /* chunk_len=7424 */ - { 0xfeacf2a1 /* x^178143 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, }, - /* chunk_len=7552 */ - { 0x93b7edc8 /* x^181215 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, 0x58fa96ee /* x^60383 mod G(x) */, }, - /* chunk_len=7680 */ - { 0x5539e44a /* x^184287 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, }, - /* chunk_len=7808 */ - { 0xde32a3d2 /* x^187359 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, 0x6a6a3694 /* x^62431 mod G(x) */, }, - /* chunk_len=7936 */ - { 0xf0baeeb6 /* x^190431 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, }, - /* chunk_len=8064 */ - { 0xbe15887f /* x^193503 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, 0xf36b9d16 /* x^64479 mod G(x) */, }, - /* chunk_len=8192 */ - { 0x64f34a05 /* x^196575 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, }, - /* chunk_len=8320 */ - { 0x1b6d1aea /* x^199647 mod G(x) */, 0xfeafb67c /* x^133087 mod G(x) */, 0x4fb001a8 /* x^66527 mod G(x) */, }, - /* chunk_len=8448 */ - { 0x82adb0b8 /* x^202719 mod G(x) */, 0x67a93f75 /* x^135135 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, }, - /* chunk_len=8576 */ - { 0x694587c7 /* x^205791 mod G(x) */, 0x3b34408b /* x^137183 mod G(x) */, 0xeccb2978 /* x^68575 mod G(x) */, }, - /* chunk_len=8704 */ - { 0xd2fc57c3 /* x^208863 mod G(x) */, 0x07fcf8c6 /* x^139231 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, }, - /* chunk_len=8832 */ - { 0x9dd6837c /* x^211935 mod G(x) */, 0xb0b6fc3e /* x^141279 mod G(x) */, 0x6c45d92e /* x^70623 mod G(x) */, }, - /* chunk_len=8960 */ - { 0x3a9d1f97 /* x^215007 mod G(x) */, 0xefd033b2 /* x^143327 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, }, - /* chunk_len=9088 */ - { 0x1eee1d2a /* x^218079 mod G(x) */, 0xf2a6e46e /* x^145375 mod G(x) */, 
0x55b4c814 /* x^72671 mod G(x) */, }, - /* chunk_len=9216 */ - { 0xb57c7728 /* x^221151 mod G(x) */, 0xd7017a0c /* x^147423 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, }, - /* chunk_len=9344 */ - { 0xf2fc5d61 /* x^224223 mod G(x) */, 0x242aac86 /* x^149471 mod G(x) */, 0x05245cf0 /* x^74719 mod G(x) */, }, - /* chunk_len=9472 */ - { 0x26387824 /* x^227295 mod G(x) */, 0xc15c4ca5 /* x^151519 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, }, - /* chunk_len=9600 */ - { 0x8c151e77 /* x^230367 mod G(x) */, 0x8282fddc /* x^153567 mod G(x) */, 0x4d9899bb /* x^76767 mod G(x) */, }, - /* chunk_len=9728 */ - { 0x8ea1f680 /* x^233439 mod G(x) */, 0xf5ff6cdd /* x^155615 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, }, - /* chunk_len=9856 */ - { 0xe8cf3d2a /* x^236511 mod G(x) */, 0x338b1fb1 /* x^157663 mod G(x) */, 0xeda61f70 /* x^78815 mod G(x) */, }, - /* chunk_len=9984 */ - { 0x21f15b59 /* x^239583 mod G(x) */, 0xb9077a01 /* x^159711 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, }, - /* chunk_len=10112 */ - { 0x6f68d64a /* x^242655 mod G(x) */, 0x901b0161 /* x^161759 mod G(x) */, 0xb9fd3537 /* x^80863 mod G(x) */, }, - /* chunk_len=10240 */ - { 0x71b74d95 /* x^245727 mod G(x) */, 0xf5ddd5ad /* x^163807 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, }, - /* chunk_len=10368 */ - { 0x4c2e7261 /* x^248799 mod G(x) */, 0x4ca19a29 /* x^165855 mod G(x) */, 0x388b20ac /* x^82911 mod G(x) */, }, - /* chunk_len=10496 */ - { 0x8a2d38e8 /* x^251871 mod G(x) */, 0xd27ee0a1 /* x^167903 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, }, - /* chunk_len=10624 */ - { 0x7e58ca17 /* x^254943 mod G(x) */, 0x69dfedd2 /* x^169951 mod G(x) */, 0x3a76805e /* x^84959 mod G(x) */, }, - /* chunk_len=10752 */ - { 0xf997967f /* x^258015 mod G(x) */, 0x63c3d167 /* x^171999 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, }, - /* chunk_len=10880 */ - { 0x48215963 /* x^261087 mod G(x) */, 0x71e1dfe0 /* x^174047 mod G(x) */, 0x42a6d410 /* x^87007 mod G(x) */, }, - /* chunk_len=11008 */ - { 0xa704b94c /* x^264159 mod G(x) */, 0x679f198a /* x^176095 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, }, - /* chunk_len=11136 */ - { 0x1d699056 /* x^267231 mod G(x) */, 0xfeacf2a1 /* x^178143 mod G(x) */, 0x55cb4dfe /* x^89055 mod G(x) */, }, - /* chunk_len=11264 */ - { 0x6800bcc5 /* x^270303 mod G(x) */, 0x16024f15 /* x^180191 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, }, - /* chunk_len=11392 */ - { 0x2d48e4ca /* x^273375 mod G(x) */, 0xbe61582f /* x^182239 mod G(x) */, 0x46026283 /* x^91103 mod G(x) */, }, - /* chunk_len=11520 */ - { 0x4c4c2b55 /* x^276447 mod G(x) */, 0x5539e44a /* x^184287 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, }, - /* chunk_len=11648 */ - { 0xd8ce94cb /* x^279519 mod G(x) */, 0xbc613c26 /* x^186335 mod G(x) */, 0x33776b4b /* x^93151 mod G(x) */, }, - /* chunk_len=11776 */ - { 0xd0b5a02b /* x^282591 mod G(x) */, 0x490d3cc6 /* x^188383 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, }, - /* chunk_len=11904 */ - { 0xa223f7ec /* x^285663 mod G(x) */, 0xf0baeeb6 /* x^190431 mod G(x) */, 0x0603989b /* x^95199 mod G(x) */, }, - /* chunk_len=12032 */ - { 0x58de337a /* x^288735 mod G(x) */, 0x3bf3d597 /* x^192479 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, }, - /* chunk_len=12160 */ - { 0x37f5d8f4 /* x^291807 mod G(x) */, 0x4d5b699b /* x^194527 mod G(x) */, 0xd7262e5f /* x^97247 mod G(x) */, }, - /* chunk_len=12288 */ - { 0xfa8a435d /* x^294879 mod G(x) */, 0x64f34a05 /* x^196575 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, }, - /* chunk_len=12416 */ - { 0x238709fe /* x^297951 mod G(x) */, 
0x52e7458f /* x^198623 mod G(x) */, 0x9a174cd3 /* x^99295 mod G(x) */, }, - /* chunk_len=12544 */ - { 0x9e1ba6f5 /* x^301023 mod G(x) */, 0xef0272f7 /* x^200671 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, }, - /* chunk_len=12672 */ - { 0xcd8b57fa /* x^304095 mod G(x) */, 0x82adb0b8 /* x^202719 mod G(x) */, 0xb6f35093 /* x^101343 mod G(x) */, }, - /* chunk_len=12800 */ - { 0x0aed142f /* x^307167 mod G(x) */, 0xb1650290 /* x^204767 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, }, - /* chunk_len=12928 */ - { 0xd1f064db /* x^310239 mod G(x) */, 0x6e7340d3 /* x^206815 mod G(x) */, 0x5c28cb52 /* x^103391 mod G(x) */, }, - /* chunk_len=13056 */ - { 0x464ac895 /* x^313311 mod G(x) */, 0xd2fc57c3 /* x^208863 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, }, - /* chunk_len=13184 */ - { 0xa0e6beea /* x^316383 mod G(x) */, 0xcfeec3d0 /* x^210911 mod G(x) */, 0x0225d214 /* x^105439 mod G(x) */, }, - /* chunk_len=13312 */ - { 0x78703ce0 /* x^319455 mod G(x) */, 0xc60f6075 /* x^212959 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, }, - /* chunk_len=13440 */ - { 0xfea48165 /* x^322527 mod G(x) */, 0x3a9d1f97 /* x^215007 mod G(x) */, 0xc3876592 /* x^107487 mod G(x) */, }, - /* chunk_len=13568 */ - { 0xdb89b8db /* x^325599 mod G(x) */, 0xa6172211 /* x^217055 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, }, - /* chunk_len=13696 */ - { 0x7ca03731 /* x^328671 mod G(x) */, 0x1db42849 /* x^219103 mod G(x) */, 0xc5df246e /* x^109535 mod G(x) */, }, - /* chunk_len=13824 */ - { 0x8801d0aa /* x^331743 mod G(x) */, 0xb57c7728 /* x^221151 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, }, - /* chunk_len=13952 */ - { 0xf89cd7f0 /* x^334815 mod G(x) */, 0xcc396a0b /* x^223199 mod G(x) */, 0xdb799c51 /* x^111583 mod G(x) */, }, - /* chunk_len=14080 */ - { 0x1611a808 /* x^337887 mod G(x) */, 0xaeae6105 /* x^225247 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, }, - /* chunk_len=14208 */ - { 0xe3cdb888 /* x^340959 mod G(x) */, 0x26387824 /* x^227295 mod G(x) */, 0x30d13e5f /* x^113631 mod G(x) */, }, - /* chunk_len=14336 */ - { 0x552a4cf6 /* x^344031 mod G(x) */, 0xee2d04bb /* x^229343 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, }, - /* chunk_len=14464 */ - { 0x85e248e9 /* x^347103 mod G(x) */, 0x0a79663f /* x^231391 mod G(x) */, 0x53339cf7 /* x^115679 mod G(x) */, }, - /* chunk_len=14592 */ - { 0x1c61c3e9 /* x^350175 mod G(x) */, 0x8ea1f680 /* x^233439 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, }, - /* chunk_len=14720 */ - { 0xb14cfc2b /* x^353247 mod G(x) */, 0x2e073302 /* x^235487 mod G(x) */, 0x10897992 /* x^117727 mod G(x) */, }, - /* chunk_len=14848 */ - { 0x6ec444cc /* x^356319 mod G(x) */, 0x9e819f13 /* x^237535 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, }, - /* chunk_len=14976 */ - { 0xe2fa5f80 /* x^359391 mod G(x) */, 0x21f15b59 /* x^239583 mod G(x) */, 0x93102436 /* x^119775 mod G(x) */, }, - /* chunk_len=15104 */ - { 0x6d33f4c6 /* x^362463 mod G(x) */, 0x31a27455 /* x^241631 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, }, - /* chunk_len=15232 */ - { 0xb6dec609 /* x^365535 mod G(x) */, 0x4d437056 /* x^243679 mod G(x) */, 0x42eb1e2a /* x^121823 mod G(x) */, }, - /* chunk_len=15360 */ - { 0x1846c518 /* x^368607 mod G(x) */, 0x71b74d95 /* x^245727 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, }, - /* chunk_len=15488 */ - { 0x9f947f8a /* x^371679 mod G(x) */, 0x2b501619 /* x^247775 mod G(x) */, 0xa4924b0e /* x^123871 mod G(x) */, }, - /* chunk_len=15616 */ - { 0xb7442f4d /* x^374751 mod G(x) */, 0xba30a5d8 /* x^249823 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, }, - /* 
chunk_len=15744 */ - { 0xe2c93242 /* x^377823 mod G(x) */, 0x8a2d38e8 /* x^251871 mod G(x) */, 0x70cd7f26 /* x^125919 mod G(x) */, }, - /* chunk_len=15872 */ - { 0xcd6863df /* x^380895 mod G(x) */, 0x78fd88dc /* x^253919 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, }, - /* chunk_len=16000 */ - { 0xd512001d /* x^383967 mod G(x) */, 0xe6612dff /* x^255967 mod G(x) */, 0x5c4d0ca9 /* x^127967 mod G(x) */, }, - /* chunk_len=16128 */ - { 0x4e8d6b6c /* x^387039 mod G(x) */, 0xf997967f /* x^258015 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, }, - /* chunk_len=16256 */ - { 0xfa653ba1 /* x^390111 mod G(x) */, 0xc99014d4 /* x^260063 mod G(x) */, 0xa0c9fd27 /* x^130015 mod G(x) */, }, - /* chunk_len=16384 */ - { 0x49893408 /* x^393183 mod G(x) */, 0x29c2448b /* x^262111 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, }, + { 0 /* unused row */ }, + /* chunk_len=128 */ + { 0xd31343ea /* x^3039 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, 0x910eeec1 /* x^991 mod G(x) */, }, + /* chunk_len=256 */ + { 0x1d6708a0 /* x^6111 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, }, + /* chunk_len=384 */ + { 0xdb3839f3 /* x^9183 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, 0xd31343ea /* x^3039 mod G(x) */, }, + /* chunk_len=512 */ + { 0x1753ab84 /* x^12255 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, }, + /* chunk_len=640 */ + { 0x3796455c /* x^15327 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, 0xc352f6de /* x^5087 mod G(x) */, }, + /* chunk_len=768 */ + { 0x3954de39 /* x^18399 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, }, + /* chunk_len=896 */ + { 0x632d78c5 /* x^21471 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, 0x9a1b53c8 /* x^7135 mod G(x) */, }, + /* chunk_len=1024 */ + { 0xa0decef3 /* x^24543 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, }, + /* chunk_len=1152 */ + { 0xe9c09bb0 /* x^27615 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, 0xdb3839f3 /* x^9183 mod G(x) */, }, + /* chunk_len=1280 */ + { 0xd51917a4 /* x^30687 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, }, + /* chunk_len=1408 */ + { 0x154a8a62 /* x^33759 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, 0x3e9a43cd /* x^11231 mod G(x) */, }, + /* chunk_len=1536 */ + { 0xf196555d /* x^36831 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, }, + /* chunk_len=1664 */ + { 0x8eec2999 /* x^39903 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, 0x6044fbb0 /* x^13279 mod G(x) */, }, + /* chunk_len=1792 */ + { 0x27892abf /* x^42975 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, }, + /* chunk_len=1920 */ + { 0x77bc2419 /* x^46047 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, 0x3796455c /* x^15327 mod G(x) */, }, + /* chunk_len=2048 */ + { 0xcea114a5 /* x^49119 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, }, + /* chunk_len=2176 */ + { 0xa1077e85 /* x^52191 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, 0x0c21f835 /* x^17375 mod G(x) */, }, + /* chunk_len=2304 */ + { 0xc5ed75e1 /* x^55263 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, }, + /* chunk_len=2432 */ + { 0xca4fba3f /* x^58335 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, 0x6cb21510 /* x^19423 mod G(x) */, }, + /* chunk_len=2560 */ + { 0xcf5bcdc4 /* x^61407 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, }, + /* chunk_len=2688 */ 
+ { 0xf36b9d16 /* x^64479 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, 0x632d78c5 /* x^21471 mod G(x) */, }, + /* chunk_len=2816 */ + { 0xf76fd988 /* x^67551 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, }, + /* chunk_len=2944 */ + { 0x6c45d92e /* x^70623 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, 0x0c46baec /* x^23519 mod G(x) */, }, + /* chunk_len=3072 */ + { 0x6116b82b /* x^73695 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, }, + /* chunk_len=3200 */ + { 0x4d9899bb /* x^76767 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, 0x53deb236 /* x^25567 mod G(x) */, }, + /* chunk_len=3328 */ + { 0x3e7c93b9 /* x^79839 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, }, + /* chunk_len=3456 */ + { 0x388b20ac /* x^82911 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, 0xe9c09bb0 /* x^27615 mod G(x) */, }, + /* chunk_len=3584 */ + { 0x0956d953 /* x^85983 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, }, + /* chunk_len=3712 */ + { 0x55cb4dfe /* x^89055 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, 0xc07331b3 /* x^29663 mod G(x) */, }, + /* chunk_len=3840 */ + { 0x52222fea /* x^92127 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, }, + /* chunk_len=3968 */ + { 0x0603989b /* x^95199 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, 0x5e04b9a5 /* x^31711 mod G(x) */, }, + /* chunk_len=4096 */ + { 0x4470c029 /* x^98271 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, }, + /* chunk_len=4224 */ + { 0xb6f35093 /* x^101343 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, 0x154a8a62 /* x^33759 mod G(x) */, }, + /* chunk_len=4352 */ + { 0xc46805ba /* x^104415 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, }, + /* chunk_len=4480 */ + { 0xc3876592 /* x^107487 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, 0xc35cf6e7 /* x^35807 mod G(x) */, }, + /* chunk_len=4608 */ + { 0x5b0c98b9 /* x^110559 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, }, + /* chunk_len=4736 */ + { 0x30d13e5f /* x^113631 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, 0x8c224466 /* x^37855 mod G(x) */, }, + /* chunk_len=4864 */ + { 0x54afca53 /* x^116703 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, }, + /* chunk_len=4992 */ + { 0x93102436 /* x^119775 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, 0x8eec2999 /* x^39903 mod G(x) */, }, + /* chunk_len=5120 */ + { 0xbd2655a8 /* x^122847 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, }, + /* chunk_len=5248 */ + { 0x70cd7f26 /* x^125919 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, 0x1691be45 /* x^41951 mod G(x) */, }, + /* chunk_len=5376 */ + { 0x2d546c53 /* x^128991 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, }, + /* chunk_len=5504 */ + { 0xb53410a8 /* x^132063 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, 0x161f3c12 /* x^43999 mod G(x) */, }, + /* chunk_len=5632 */ + { 0x67a93f75 /* x^135135 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, }, + /* chunk_len=5760 */ + { 0x9830ac33 /* x^138207 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, 0x77bc2419 /* x^46047 mod G(x) */, }, + /* chunk_len=5888 */ + { 0xb0b6fc3e /* x^141279 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, }, + /* chunk_len=6016 */ + { 0x84170f16 /* x^144351 mod 
G(x) */, 0xced90d99 /* x^96223 mod G(x) */, 0x30de0f98 /* x^48095 mod G(x) */, }, + /* chunk_len=6144 */ + { 0xd7017a0c /* x^147423 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, }, + /* chunk_len=6272 */ + { 0xadb25de6 /* x^150495 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, 0x2b7e0e1b /* x^50143 mod G(x) */, }, + /* chunk_len=6400 */ + { 0x8282fddc /* x^153567 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, }, + /* chunk_len=6528 */ + { 0x46362bee /* x^156639 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, 0xa1077e85 /* x^52191 mod G(x) */, }, + /* chunk_len=6656 */ + { 0xb9077a01 /* x^159711 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, }, + /* chunk_len=6784 */ + { 0xf51d9bc6 /* x^162783 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, 0x7e774cf6 /* x^54239 mod G(x) */, }, + /* chunk_len=6912 */ + { 0x4ca19a29 /* x^165855 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, }, + /* chunk_len=7040 */ + { 0xdc0fc3fc /* x^168927 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, 0x3678fed2 /* x^56287 mod G(x) */, }, + /* chunk_len=7168 */ + { 0x63c3d167 /* x^171999 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, }, + /* chunk_len=7296 */ + { 0x5851d254 /* x^175071 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, 0xca4fba3f /* x^58335 mod G(x) */, }, + /* chunk_len=7424 */ + { 0xfeacf2a1 /* x^178143 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, }, + /* chunk_len=7552 */ + { 0x93b7edc8 /* x^181215 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, 0x58fa96ee /* x^60383 mod G(x) */, }, + /* chunk_len=7680 */ + { 0x5539e44a /* x^184287 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, }, + /* chunk_len=7808 */ + { 0xde32a3d2 /* x^187359 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, 0x6a6a3694 /* x^62431 mod G(x) */, }, + /* chunk_len=7936 */ + { 0xf0baeeb6 /* x^190431 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, }, + /* chunk_len=8064 */ + { 0xbe15887f /* x^193503 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, 0xf36b9d16 /* x^64479 mod G(x) */, }, + /* chunk_len=8192 */ + { 0x64f34a05 /* x^196575 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, }, + /* chunk_len=8320 */ + { 0x1b6d1aea /* x^199647 mod G(x) */, 0xfeafb67c /* x^133087 mod G(x) */, 0x4fb001a8 /* x^66527 mod G(x) */, }, + /* chunk_len=8448 */ + { 0x82adb0b8 /* x^202719 mod G(x) */, 0x67a93f75 /* x^135135 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, }, + /* chunk_len=8576 */ + { 0x694587c7 /* x^205791 mod G(x) */, 0x3b34408b /* x^137183 mod G(x) */, 0xeccb2978 /* x^68575 mod G(x) */, }, + /* chunk_len=8704 */ + { 0xd2fc57c3 /* x^208863 mod G(x) */, 0x07fcf8c6 /* x^139231 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, }, + /* chunk_len=8832 */ + { 0x9dd6837c /* x^211935 mod G(x) */, 0xb0b6fc3e /* x^141279 mod G(x) */, 0x6c45d92e /* x^70623 mod G(x) */, }, + /* chunk_len=8960 */ + { 0x3a9d1f97 /* x^215007 mod G(x) */, 0xefd033b2 /* x^143327 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, }, + /* chunk_len=9088 */ + { 0x1eee1d2a /* x^218079 mod G(x) */, 0xf2a6e46e /* x^145375 mod G(x) */, 0x55b4c814 /* x^72671 mod G(x) */, }, + /* chunk_len=9216 */ + { 0xb57c7728 /* x^221151 mod G(x) */, 0xd7017a0c /* x^147423 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, }, + /* chunk_len=9344 */ + { 0xf2fc5d61 /* 
x^224223 mod G(x) */, 0x242aac86 /* x^149471 mod G(x) */, 0x05245cf0 /* x^74719 mod G(x) */, }, + /* chunk_len=9472 */ + { 0x26387824 /* x^227295 mod G(x) */, 0xc15c4ca5 /* x^151519 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, }, + /* chunk_len=9600 */ + { 0x8c151e77 /* x^230367 mod G(x) */, 0x8282fddc /* x^153567 mod G(x) */, 0x4d9899bb /* x^76767 mod G(x) */, }, + /* chunk_len=9728 */ + { 0x8ea1f680 /* x^233439 mod G(x) */, 0xf5ff6cdd /* x^155615 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, }, + /* chunk_len=9856 */ + { 0xe8cf3d2a /* x^236511 mod G(x) */, 0x338b1fb1 /* x^157663 mod G(x) */, 0xeda61f70 /* x^78815 mod G(x) */, }, + /* chunk_len=9984 */ + { 0x21f15b59 /* x^239583 mod G(x) */, 0xb9077a01 /* x^159711 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, }, + /* chunk_len=10112 */ + { 0x6f68d64a /* x^242655 mod G(x) */, 0x901b0161 /* x^161759 mod G(x) */, 0xb9fd3537 /* x^80863 mod G(x) */, }, + /* chunk_len=10240 */ + { 0x71b74d95 /* x^245727 mod G(x) */, 0xf5ddd5ad /* x^163807 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, }, + /* chunk_len=10368 */ + { 0x4c2e7261 /* x^248799 mod G(x) */, 0x4ca19a29 /* x^165855 mod G(x) */, 0x388b20ac /* x^82911 mod G(x) */, }, + /* chunk_len=10496 */ + { 0x8a2d38e8 /* x^251871 mod G(x) */, 0xd27ee0a1 /* x^167903 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, }, + /* chunk_len=10624 */ + { 0x7e58ca17 /* x^254943 mod G(x) */, 0x69dfedd2 /* x^169951 mod G(x) */, 0x3a76805e /* x^84959 mod G(x) */, }, + /* chunk_len=10752 */ + { 0xf997967f /* x^258015 mod G(x) */, 0x63c3d167 /* x^171999 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, }, + /* chunk_len=10880 */ + { 0x48215963 /* x^261087 mod G(x) */, 0x71e1dfe0 /* x^174047 mod G(x) */, 0x42a6d410 /* x^87007 mod G(x) */, }, + /* chunk_len=11008 */ + { 0xa704b94c /* x^264159 mod G(x) */, 0x679f198a /* x^176095 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, }, + /* chunk_len=11136 */ + { 0x1d699056 /* x^267231 mod G(x) */, 0xfeacf2a1 /* x^178143 mod G(x) */, 0x55cb4dfe /* x^89055 mod G(x) */, }, + /* chunk_len=11264 */ + { 0x6800bcc5 /* x^270303 mod G(x) */, 0x16024f15 /* x^180191 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, }, + /* chunk_len=11392 */ + { 0x2d48e4ca /* x^273375 mod G(x) */, 0xbe61582f /* x^182239 mod G(x) */, 0x46026283 /* x^91103 mod G(x) */, }, + /* chunk_len=11520 */ + { 0x4c4c2b55 /* x^276447 mod G(x) */, 0x5539e44a /* x^184287 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, }, + /* chunk_len=11648 */ + { 0xd8ce94cb /* x^279519 mod G(x) */, 0xbc613c26 /* x^186335 mod G(x) */, 0x33776b4b /* x^93151 mod G(x) */, }, + /* chunk_len=11776 */ + { 0xd0b5a02b /* x^282591 mod G(x) */, 0x490d3cc6 /* x^188383 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, }, + /* chunk_len=11904 */ + { 0xa223f7ec /* x^285663 mod G(x) */, 0xf0baeeb6 /* x^190431 mod G(x) */, 0x0603989b /* x^95199 mod G(x) */, }, + /* chunk_len=12032 */ + { 0x58de337a /* x^288735 mod G(x) */, 0x3bf3d597 /* x^192479 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, }, + /* chunk_len=12160 */ + { 0x37f5d8f4 /* x^291807 mod G(x) */, 0x4d5b699b /* x^194527 mod G(x) */, 0xd7262e5f /* x^97247 mod G(x) */, }, + /* chunk_len=12288 */ + { 0xfa8a435d /* x^294879 mod G(x) */, 0x64f34a05 /* x^196575 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, }, + /* chunk_len=12416 */ + { 0x238709fe /* x^297951 mod G(x) */, 0x52e7458f /* x^198623 mod G(x) */, 0x9a174cd3 /* x^99295 mod G(x) */, }, + /* chunk_len=12544 */ + { 0x9e1ba6f5 /* x^301023 mod G(x) */, 0xef0272f7 /* x^200671 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, }, + /* 
chunk_len=12672 */ + { 0xcd8b57fa /* x^304095 mod G(x) */, 0x82adb0b8 /* x^202719 mod G(x) */, 0xb6f35093 /* x^101343 mod G(x) */, }, + /* chunk_len=12800 */ + { 0x0aed142f /* x^307167 mod G(x) */, 0xb1650290 /* x^204767 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, }, + /* chunk_len=12928 */ + { 0xd1f064db /* x^310239 mod G(x) */, 0x6e7340d3 /* x^206815 mod G(x) */, 0x5c28cb52 /* x^103391 mod G(x) */, }, + /* chunk_len=13056 */ + { 0x464ac895 /* x^313311 mod G(x) */, 0xd2fc57c3 /* x^208863 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, }, + /* chunk_len=13184 */ + { 0xa0e6beea /* x^316383 mod G(x) */, 0xcfeec3d0 /* x^210911 mod G(x) */, 0x0225d214 /* x^105439 mod G(x) */, }, + /* chunk_len=13312 */ + { 0x78703ce0 /* x^319455 mod G(x) */, 0xc60f6075 /* x^212959 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, }, + /* chunk_len=13440 */ + { 0xfea48165 /* x^322527 mod G(x) */, 0x3a9d1f97 /* x^215007 mod G(x) */, 0xc3876592 /* x^107487 mod G(x) */, }, + /* chunk_len=13568 */ + { 0xdb89b8db /* x^325599 mod G(x) */, 0xa6172211 /* x^217055 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, }, + /* chunk_len=13696 */ + { 0x7ca03731 /* x^328671 mod G(x) */, 0x1db42849 /* x^219103 mod G(x) */, 0xc5df246e /* x^109535 mod G(x) */, }, + /* chunk_len=13824 */ + { 0x8801d0aa /* x^331743 mod G(x) */, 0xb57c7728 /* x^221151 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, }, + /* chunk_len=13952 */ + { 0xf89cd7f0 /* x^334815 mod G(x) */, 0xcc396a0b /* x^223199 mod G(x) */, 0xdb799c51 /* x^111583 mod G(x) */, }, + /* chunk_len=14080 */ + { 0x1611a808 /* x^337887 mod G(x) */, 0xaeae6105 /* x^225247 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, }, + /* chunk_len=14208 */ + { 0xe3cdb888 /* x^340959 mod G(x) */, 0x26387824 /* x^227295 mod G(x) */, 0x30d13e5f /* x^113631 mod G(x) */, }, + /* chunk_len=14336 */ + { 0x552a4cf6 /* x^344031 mod G(x) */, 0xee2d04bb /* x^229343 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, }, + /* chunk_len=14464 */ + { 0x85e248e9 /* x^347103 mod G(x) */, 0x0a79663f /* x^231391 mod G(x) */, 0x53339cf7 /* x^115679 mod G(x) */, }, + /* chunk_len=14592 */ + { 0x1c61c3e9 /* x^350175 mod G(x) */, 0x8ea1f680 /* x^233439 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, }, + /* chunk_len=14720 */ + { 0xb14cfc2b /* x^353247 mod G(x) */, 0x2e073302 /* x^235487 mod G(x) */, 0x10897992 /* x^117727 mod G(x) */, }, + /* chunk_len=14848 */ + { 0x6ec444cc /* x^356319 mod G(x) */, 0x9e819f13 /* x^237535 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, }, + /* chunk_len=14976 */ + { 0xe2fa5f80 /* x^359391 mod G(x) */, 0x21f15b59 /* x^239583 mod G(x) */, 0x93102436 /* x^119775 mod G(x) */, }, + /* chunk_len=15104 */ + { 0x6d33f4c6 /* x^362463 mod G(x) */, 0x31a27455 /* x^241631 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, }, + /* chunk_len=15232 */ + { 0xb6dec609 /* x^365535 mod G(x) */, 0x4d437056 /* x^243679 mod G(x) */, 0x42eb1e2a /* x^121823 mod G(x) */, }, + /* chunk_len=15360 */ + { 0x1846c518 /* x^368607 mod G(x) */, 0x71b74d95 /* x^245727 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, }, + /* chunk_len=15488 */ + { 0x9f947f8a /* x^371679 mod G(x) */, 0x2b501619 /* x^247775 mod G(x) */, 0xa4924b0e /* x^123871 mod G(x) */, }, + /* chunk_len=15616 */ + { 0xb7442f4d /* x^374751 mod G(x) */, 0xba30a5d8 /* x^249823 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, }, + /* chunk_len=15744 */ + { 0xe2c93242 /* x^377823 mod G(x) */, 0x8a2d38e8 /* x^251871 mod G(x) */, 0x70cd7f26 /* x^125919 mod G(x) */, }, + /* chunk_len=15872 */ + { 0xcd6863df /* x^380895 mod G(x) */, 0x78fd88dc /* 
x^253919 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, }, + /* chunk_len=16000 */ + { 0xd512001d /* x^383967 mod G(x) */, 0xe6612dff /* x^255967 mod G(x) */, 0x5c4d0ca9 /* x^127967 mod G(x) */, }, + /* chunk_len=16128 */ + { 0x4e8d6b6c /* x^387039 mod G(x) */, 0xf997967f /* x^258015 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, }, + /* chunk_len=16256 */ + { 0xfa653ba1 /* x^390111 mod G(x) */, 0xc99014d4 /* x^260063 mod G(x) */, 0xa0c9fd27 /* x^130015 mod G(x) */, }, + /* chunk_len=16384 */ + { 0x49893408 /* x^393183 mod G(x) */, 0x29c2448b /* x^262111 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, }, }; /* Multipliers for implementations that use a large fixed chunk length */ diff --git a/Sources/DEFLATE/crc32_pmull_wide.h b/Sources/DEFLATE/crc32_pmull_wide.h deleted file mode 100644 index a72e1d87..00000000 --- a/Sources/DEFLATE/crc32_pmull_wide.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version) - * - * Copyright 2022 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This file is a "template" for instantiating PMULL-based crc32_arm functions. - * The "parameters" are: - * - * SUFFIX: - * Name suffix to append to all instantiated functions. - * ATTRIBUTES: - * Target function attributes to use. - * ENABLE_EOR3: - * Use the eor3 instruction (from the sha3 extension). - * - * This is the extra-wide version; it uses an unusually large stride length of - * 12, and it assumes that crc32 instructions are available too. It's intended - * for powerful CPUs that support both pmull and crc32 instructions, but where - * throughput of pmull and xor (given enough instructions issued in parallel) is - * significantly higher than that of crc32, thus making the crc32 instructions - * (counterintuitively) not actually the fastest way to compute the CRC-32. The - * Apple M1 processor is an example of such a CPU. 
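Each entry in the table above is a folding constant of the form x^k mod G(x), where G(x) is the CRC-32 generator polynomial: carrylessly multiplying a partial CRC state by such a constant advances that state across a fixed number of message bytes, which is what lets the vectorized code fold several 128-bit accumulators in parallel and combine them only at the end. The values themselves are machine-generated. As a minimal sketch of the underlying arithmetic only -- the helper names (gf2_mul_x, xpow_mod_g) are illustrative rather than libdeflate APIs, the plain MSB-first bit order is assumed, and libdeflate's generated constants use its own reflected convention, so the printed value will not match the hex numbers above -- the exponentiation can be done like this:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * CRC-32 generator polynomial G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 +
 * x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1, with the x^32 term left
 * implicit.  Bit 31 holds the coefficient of x^31 (MSB-first order).
 */
#define CRC32_GEN_POLY UINT32_C(0x04C11DB7)

/* Multiply a polynomial of degree < 32 by x, then reduce modulo G(x). */
static uint32_t gf2_mul_x(uint32_t a)
{
    uint32_t carry = a & UINT32_C(0x80000000);  /* coefficient of x^31 */

    a <<= 1;                     /* a(x) * x */
    if (carry)
        a ^= CRC32_GEN_POLY;     /* x^32 == the low 32 bits of G(x), mod G(x) */
    return a;
}

/*
 * Compute x^n mod G(x).  A real generator would use square-and-multiply, but a
 * linear loop illustrates the idea and still runs instantly for exponents in
 * the few-hundred-thousand range annotated in the table above.
 */
static uint32_t xpow_mod_g(unsigned long n)
{
    uint32_t r = 1;              /* x^0 */

    while (n--)
        r = gf2_mul_x(r);
    return r;
}

int main(void)
{
    /* 147423 is one of the exponents annotated above (chunk_len=6144). */
    printf("x^147423 mod G(x) = 0x%08" PRIx32 "\n", xpow_mod_g(147423));
    return 0;
}

The crc32_pmull_wide.h file removed just below consumed constants of exactly this kind through its fold_vec() helper: each 128-bit accumulator is carrylessly multiplied by a precomputed x^k value with PMULL and XORed with the next block of data, so up to twelve vectors' worth of CRC state can be carried in parallel before the final reduction.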
- */ -#ifndef _MSC_VER -# include <arm_acle.h> -#endif -#include <arm_neon.h> - -#include "crc32_pmull_helpers.h" - -static u32 ATTRIBUTES MAYBE_UNUSED -ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) -{ - uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; - - if (len < 3 * 192) { - static const u64 _aligned_attribute(16) mults[3][2] = { - CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS, - }; - poly64x2_t multipliers_4, multipliers_2, multipliers_1; - - if (len < 64) - goto tail; - multipliers_4 = load_multipliers(mults[0]); - multipliers_2 = load_multipliers(mults[1]); - multipliers_1 = load_multipliers(mults[2]); - /* - * Short length; don't bother aligning the pointer, and fold - * 64 bytes (4 vectors) at a time, at most. - */ - v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc)); - v1 = vld1q_u8(p + 16); - v2 = vld1q_u8(p + 32); - v3 = vld1q_u8(p + 48); - p += 64; - len -= 64; - while (len >= 64) { - v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4); - v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4); - v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4); - v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4); - p += 64; - len -= 64; - } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); - if (len >= 32) { - v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2); - v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2); - p += 32; - len -= 32; - } - v0 = fold_vec(v0, v1, multipliers_1); - } else { - static const u64 _aligned_attribute(16) mults[4][2] = { - CRC32_12VECS_MULTS, CRC32_6VECS_MULTS, - CRC32_3VECS_MULTS, CRC32_1VECS_MULTS, - }; - const poly64x2_t multipliers_12 = load_multipliers(mults[0]); - const poly64x2_t multipliers_6 = load_multipliers(mults[1]); - const poly64x2_t multipliers_3 = load_multipliers(mults[2]); - const poly64x2_t multipliers_1 = load_multipliers(mults[3]); - const size_t align = -(uintptr_t)p & 15; - const uint8x16_t *vp; - - /* Align p to the next 16-byte boundary. */ - if (align) { - if (align & 1) - crc = __crc32b(crc, *p++); - if (align & 2) { - crc = __crc32h(crc, le16_bswap(*(u16 *)p)); - p += 2; - } - if (align & 4) { - crc = __crc32w(crc, le32_bswap(*(u32 *)p)); - p += 4; - } - if (align & 8) { - crc = __crc32d(crc, le64_bswap(*(u64 *)p)); - p += 8; - } - len -= align; - } - vp = (const uint8x16_t *)p; - v0 = veorq_u8(*vp++, u32_to_bytevec(crc)); - v1 = *vp++; - v2 = *vp++; - v3 = *vp++; - v4 = *vp++; - v5 = *vp++; - v6 = *vp++; - v7 = *vp++; - v8 = *vp++; - v9 = *vp++; - v10 = *vp++; - v11 = *vp++; - len -= 192; - /* Fold 192 bytes (12 vectors) at a time. */ - do { - v0 = fold_vec(v0, *vp++, multipliers_12); - v1 = fold_vec(v1, *vp++, multipliers_12); - v2 = fold_vec(v2, *vp++, multipliers_12); - v3 = fold_vec(v3, *vp++, multipliers_12); - v4 = fold_vec(v4, *vp++, multipliers_12); - v5 = fold_vec(v5, *vp++, multipliers_12); - v6 = fold_vec(v6, *vp++, multipliers_12); - v7 = fold_vec(v7, *vp++, multipliers_12); - v8 = fold_vec(v8, *vp++, multipliers_12); - v9 = fold_vec(v9, *vp++, multipliers_12); - v10 = fold_vec(v10, *vp++, multipliers_12); - v11 = fold_vec(v11, *vp++, multipliers_12); - len -= 192; - } while (len >= 192); - - /* - * Fewer than 192 bytes left. Fold v0-v11 down to just v0, - * while processing up to 144 more bytes.
- */ - v0 = fold_vec(v0, v6, multipliers_6); - v1 = fold_vec(v1, v7, multipliers_6); - v2 = fold_vec(v2, v8, multipliers_6); - v3 = fold_vec(v3, v9, multipliers_6); - v4 = fold_vec(v4, v10, multipliers_6); - v5 = fold_vec(v5, v11, multipliers_6); - if (len >= 96) { - v0 = fold_vec(v0, *vp++, multipliers_6); - v1 = fold_vec(v1, *vp++, multipliers_6); - v2 = fold_vec(v2, *vp++, multipliers_6); - v3 = fold_vec(v3, *vp++, multipliers_6); - v4 = fold_vec(v4, *vp++, multipliers_6); - v5 = fold_vec(v5, *vp++, multipliers_6); - len -= 96; - } - v0 = fold_vec(v0, v3, multipliers_3); - v1 = fold_vec(v1, v4, multipliers_3); - v2 = fold_vec(v2, v5, multipliers_3); - if (len >= 48) { - v0 = fold_vec(v0, *vp++, multipliers_3); - v1 = fold_vec(v1, *vp++, multipliers_3); - v2 = fold_vec(v2, *vp++, multipliers_3); - len -= 48; - } - v0 = fold_vec(v0, v1, multipliers_1); - v0 = fold_vec(v0, v2, multipliers_1); - p = (const u8 *)vp; - } - /* Reduce 128 to 32 bits using crc32 instructions. */ - crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0)); - crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1)); -tail: - /* Finish up the remainder using crc32 instructions. */ - if (len & 32) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - crc = __crc32d(crc, get_unaligned_le64(p + 16)); - crc = __crc32d(crc, get_unaligned_le64(p + 24)); - p += 32; - } - if (len & 16) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - p += 16; - } - if (len & 8) { - crc = __crc32d(crc, get_unaligned_le64(p)); - p += 8; - } - if (len & 4) { - crc = __crc32w(crc, get_unaligned_le32(p)); - p += 4; - } - if (len & 2) { - crc = __crc32h(crc, get_unaligned_le16(p)); - p += 2; - } - if (len & 1) - crc = __crc32b(crc, *p); - return crc; -} - -#undef SUFFIX -#undef ATTRIBUTES -#undef ENABLE_EOR3 diff --git a/Sources/DEFLATE/crc32_tables.h b/Sources/DEFLATE/crc32_tables.h index 86228c72..5a4c1c96 100644 --- a/Sources/DEFLATE/crc32_tables.h +++ b/Sources/DEFLATE/crc32_tables.h @@ -5,583 +5,583 @@ */ static const u32 crc32_slice1_table[] MAYBE_UNUSED = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 
0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 
0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, }; static const u32 crc32_slice8_table[] MAYBE_UNUSED = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 
0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, - 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, - 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, - 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, - 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, - 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192, - 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, - 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, - 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, - 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, - 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, - 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, - 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, - 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, - 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, - 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, - 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, - 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, - 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, - 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, - 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, - 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, - 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, - 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, - 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, - 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, - 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, - 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, - 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, - 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, - 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, - 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, - 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, - 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88, - 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, - 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, - 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, - 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, - 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, - 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, - 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, - 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, - 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, - 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, - 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, - 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, - 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, - 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, - 0x4870e1b4, 0x516bd0f5, 
0x7a468336, 0x635db277, - 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, - 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, - 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, - 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, - 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc, - 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, - 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, - 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, - 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, - 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, - 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, - 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, - 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, - 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, - 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, - 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, - 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, - 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, - 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, - 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, - 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, - 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, - 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, - 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, - 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, - 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, - 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, - 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd, - 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, - 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, - 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, - 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, - 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, - 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, - 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, - 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, - 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, - 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, - 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, - 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, - 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, - 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, - 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, - 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, - 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, - 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, - 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, - 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, - 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, - 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, - 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, - 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, - 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, - 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, - 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, - 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd, - 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, - 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, - 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, - 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, - 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, - 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, - 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, - 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, - 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, - 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, - 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, - 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, - 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, - 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, - 0x839a6488, 0x82580ebf, 
0x801eb0e6, 0x81dcdad1, - 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, - 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, - 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, - 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, - 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, - 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, - 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, - 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, - 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, - 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, - 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, - 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, - 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, - 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, - 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, - 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, - 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, - 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, - 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, - 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, - 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, - 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, - 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, - 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, - 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, - 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, - 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, - 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c, - 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, - 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, - 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, - 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, - 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, - 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, - 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, - 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, - 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, - 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, - 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, - 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, - 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, - 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, - 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, - 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, - 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, - 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, - 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, - 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, - 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, - 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, - 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, - 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, - 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, - 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, - 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, - 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b, - 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, - 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, - 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, - 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, - 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, - 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, - 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, - 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, - 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, - 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, - 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, - 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, - 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, - 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, - 0x090481f0, 0xb1b8e695, 
0xa30d497b, 0x1bb12e1e, - 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, - 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, - 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, - 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, - 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, - 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, - 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, - 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, - 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, - 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, - 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, - 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, - 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, - 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, - 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, - 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, - 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, - 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, - 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, - 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, - 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, - 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, - 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, - 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, - 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, - 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, - 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d, - 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, - 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, - 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, - 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, - 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, - 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, - 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, - 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, - 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, - 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, - 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, - 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, - 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, - 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, - 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, - 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, - 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, - 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, - 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, - 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, - 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, - 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, - 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, - 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, - 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, - 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, - 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, - 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, - 0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349, - 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, - 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, - 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, - 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, - 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, - 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, - 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, - 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, - 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, - 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, - 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, - 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, - 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, - 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, - 0x03d6029b, 0xc88ad13e, 
0x4e1ea390, 0x85427035, - 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, - 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, - 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e, - 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, - 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, - 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, - 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, - 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, - 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, - 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, - 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, - 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, - 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, - 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, - 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, - 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, - 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, - 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, - 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, - 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, - 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, - 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, - 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, - 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, - 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, - 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, - 0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, - 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, - 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, - 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, - 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, - 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, - 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, - 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, - 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, - 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, - 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, - 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, - 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, - 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, - 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, - 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, - 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, - 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, - 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, - 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, - 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, - 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, - 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, - 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, - 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, - 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, - 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, - 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, - 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, - 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, - 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, - 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, - 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, - 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, - 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, - 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, - 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, - 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, - 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, - 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, - 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, - 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, - 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, - 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, - 0x081d53e8, 0xae6a585c, 
0x9f8242c1, 0x39f54975, - 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, - 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, - 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, - 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, - 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, - 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, - 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, - 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, - 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, - 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, - 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, - 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, - 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, - 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, - 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, - 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, - 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, - 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, - 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, - 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, - 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, - 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, - 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, - 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, - 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, - 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, - 0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6, - 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, - 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, - 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, - 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, - 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, - 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, - 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, - 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, - 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, - 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, - 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, - 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, - 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, - 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, - 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, - 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, - 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, - 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, - 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, - 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, - 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, - 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, - 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, - 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, - 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, - 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, - 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, - 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, - 0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac, - 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, - 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, - 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, - 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, - 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, - 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, - 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, - 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, - 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, - 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, - 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, - 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, - 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, - 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, - 0xfa78d958, 0x36d2d9c6, 
0xb85dde25, 0x74f7debb, - 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, - 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, - 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, - 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, - 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, - 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, - 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, - 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, - 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, - 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, - 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, - 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, - 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, - 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, - 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, - 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, - 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, - 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, - 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, - 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, - 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, - 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, - 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, - 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, - 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, - 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, - 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, - 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, - 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, - 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, - 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, - 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, - 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, - 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, - 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, - 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, - 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, - 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, - 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, - 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, - 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, - 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, - 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, - 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, - 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 
0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, + 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, + 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, + 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, + 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, + 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192, + 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, + 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, + 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, + 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, + 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, + 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, + 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, + 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, + 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, + 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, + 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, + 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, + 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, + 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, + 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, + 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, + 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, + 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, + 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, + 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, + 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, + 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, + 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, + 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, + 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, + 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, + 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, + 0x3b83984b, 0x2298a90a, 
0x09b5fac9, 0x10aecb88, + 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, + 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, + 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, + 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, + 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, + 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, + 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, + 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, + 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, + 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, + 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, + 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, + 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, + 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, + 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277, + 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, + 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, + 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, + 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, + 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc, + 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, + 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, + 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, + 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, + 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, + 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, + 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, + 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, + 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, + 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, + 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, + 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, + 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, + 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, + 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, + 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, + 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, + 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, + 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, + 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, + 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, + 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, + 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd, + 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, + 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, + 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, + 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, + 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, + 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, + 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, + 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, + 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, + 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, + 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, + 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, + 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, + 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, + 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, + 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, + 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, + 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, + 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, + 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, + 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, + 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, + 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, + 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, + 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, + 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, + 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, + 0xf4094194, 0xf5cb2ba3, 
0xf78d95fa, 0xf64fffcd, + 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, + 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, + 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, + 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, + 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, + 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, + 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, + 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, + 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, + 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, + 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, + 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, + 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, + 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, + 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1, + 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, + 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, + 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, + 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, + 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, + 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, + 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, + 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, + 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, + 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, + 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, + 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, + 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, + 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, + 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, + 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, + 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, + 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, + 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, + 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, + 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, + 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, + 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, + 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, + 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, + 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, + 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, + 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c, + 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, + 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, + 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, + 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, + 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, + 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, + 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, + 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, + 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, + 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, + 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, + 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, + 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, + 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, + 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, + 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, + 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, + 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, + 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, + 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, + 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, + 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, + 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, + 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, + 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, + 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, + 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, + 0xd8c66675, 0x607a0110, 
0x72cfaefe, 0xca73c99b, + 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, + 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, + 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, + 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, + 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, + 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, + 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, + 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, + 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, + 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, + 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, + 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, + 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, + 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, + 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e, + 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, + 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, + 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, + 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, + 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, + 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, + 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, + 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, + 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, + 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, + 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, + 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, + 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, + 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, + 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, + 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, + 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, + 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, + 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, + 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, + 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, + 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, + 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, + 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, + 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, + 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, + 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d, + 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, + 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, + 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, + 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, + 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, + 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, + 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, + 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, + 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, + 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, + 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, + 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, + 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, + 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, + 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, + 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, + 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, + 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, + 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, + 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, + 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, + 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, + 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, + 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, + 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, + 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, + 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, + 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, + 0x7c75d999, 0x4115f029, 
0x06b58af9, 0x3bd5a349, + 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, + 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, + 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, + 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, + 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, + 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, + 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, + 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, + 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, + 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, + 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, + 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, + 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, + 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, + 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035, + 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, + 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, + 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e, + 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, + 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, + 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, + 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, + 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, + 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, + 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, + 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, + 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, + 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, + 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, + 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, + 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, + 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, + 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, + 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, + 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, + 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, + 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, + 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, + 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, + 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, + 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, + 0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, + 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, + 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, + 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, + 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, + 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, + 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, + 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, + 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, + 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, + 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, + 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, + 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, + 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, + 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, + 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, + 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, + 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, + 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, + 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, + 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, + 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, + 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, + 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, + 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, + 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, + 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, + 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, + 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, + 0x15921919, 0xdececabc, 
0x585ab812, 0x93066bb7, + 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, + 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, + 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, + 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, + 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, + 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, + 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, + 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, + 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, + 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, + 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, + 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, + 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, + 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, + 0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975, + 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, + 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, + 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, + 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, + 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, + 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, + 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, + 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, + 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, + 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, + 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, + 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, + 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, + 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, + 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, + 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, + 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, + 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, + 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, + 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, + 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, + 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, + 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, + 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, + 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, + 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, + 0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6, + 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, + 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, + 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, + 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, + 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, + 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, + 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, + 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, + 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, + 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, + 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, + 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, + 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, + 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, + 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, + 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, + 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, + 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, + 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, + 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, + 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, + 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, + 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, + 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, + 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, + 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, + 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, + 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, + 0x57af154f, 0x9b0515d1, 
0x158a1232, 0xd92012ac, + 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, + 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, + 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, + 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, + 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, + 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, + 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, + 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, + 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, + 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, + 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, + 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, + 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, + 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, + 0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb, + 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, + 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, + 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, + 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, + 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, + 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, + 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, + 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, + 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, + 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, + 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, + 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, + 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, + 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, + 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, + 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, + 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, + 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, + 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, + 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, + 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, + 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, + 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, + 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, + 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, + 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, + 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, + 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, + 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, + 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, + 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, + 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, + 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, + 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, + 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, + 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, + 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, + 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, + 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, + 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, + 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, + 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, + 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, + 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, + 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, }; diff --git a/Sources/DEFLATE/decompress_template.h b/Sources/DEFLATE/decompress_template.h index 3c1da677..3344323d 100644 --- a/Sources/DEFLATE/decompress_template.h +++ b/Sources/DEFLATE/decompress_template.h @@ -35,740 +35,740 @@ # define ATTRIBUTES #endif #ifndef EXTRACT_VARBITS -# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) #endif #ifndef EXTRACT_VARBITS8 -# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +# define 
EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) #endif static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED FUNCNAME(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) { - u8 *out_next = out; - u8 * const out_end = out_next + out_nbytes_avail; - u8 * const out_fastloop_end = - out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); - - /* Input bitstream state; see deflate_decompress.c for documentation */ - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - const u8 * const in_fastloop_end = - in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); - bitbuf_t bitbuf = 0; - bitbuf_t saved_bitbuf; - u32 bitsleft = 0; - size_t overread_count = 0; - - bool is_final_block; - unsigned block_type; - unsigned num_litlen_syms; - unsigned num_offset_syms; - bitbuf_t litlen_tablemask; - u32 entry; - + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + next_block: - /* Starting to read the next block */ - ; - - STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); - REFILL_BITS(); - - /* BFINAL: 1 bit */ - is_final_block = bitbuf & BITMASK(1); - - /* BTYPE: 2 bits */ - block_type = (bitbuf >> 1) & BITMASK(2); - - if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { - - /* Dynamic Huffman block */ - - /* The order in which precode lengths are stored */ - static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 - }; - - unsigned num_explicit_precode_lens; - unsigned i; - - /* Read the codeword length counts. */ - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); - num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); - - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); - num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); - - STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); - num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); - - d->static_codes_loaded = false; - - /* - * Read the precode codeword lengths. - * - * A 64-bit bitbuffer is just one bit too small to hold the - * maximum number of precode lens, so to minimize branches we - * merge one len with the previous fields. 
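
As an aside on the header fields read above: after BFINAL (1 bit) and BTYPE (2 bits), a dynamic block stores HLIT (5 bits, 257 + value litlen codes), HDIST (5 bits, 1 + value offset codes) and HCLEN (4 bits, 4 + value explicit precode lengths), followed by HCLEN 3-bit precode lengths in the permuted order of deflate_precode_lens_permutation. A minimal, unoptimized sketch of the same parse, assuming a hypothetical LSB-first bit reader (read_bits) rather than the REFILL_BITS()/bitbuf machinery used in this template:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical LSB-first bit reader over a byte buffer. */
struct bitreader {
	const uint8_t *p;
	size_t bitpos;
};

static unsigned read_bits(struct bitreader *br, unsigned n)
{
	unsigned v = 0;

	for (unsigned i = 0; i < n; i++, br->bitpos++)
		v |= ((br->p[br->bitpos >> 3] >> (br->bitpos & 7)) & 1u) << i;
	return v;
}

/* Same storage order as deflate_precode_lens_permutation above. */
static const uint8_t precode_order[19] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};

/*
 * Read the dynamic-block header fields, assuming the reader is already
 * positioned just past BFINAL and BTYPE.
 */
static void read_dynamic_header(struct bitreader *br,
				unsigned *num_litlen, unsigned *num_offset,
				uint8_t precode_lens[19])
{
	unsigned hclen, i;

	*num_litlen = 257 + read_bits(br, 5);	/* HLIT */
	*num_offset = 1 + read_bits(br, 5);	/* HDIST */
	hclen = 4 + read_bits(br, 4);		/* HCLEN */

	for (i = 0; i < 19; i++)
		precode_lens[i] = 0;
	for (i = 0; i < hclen; i++)
		precode_lens[precode_order[i]] = read_bits(br, 3);
}

Note that the 1 + 2 + 5 + 5 + 4 + 3 operand of STATIC_ASSERT(CAN_CONSUME(...)) above is exactly these header bits plus the first 3-bit precode length, which is why the optimized path can shift the bit buffer by 20 in a single step.
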
- */ - STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); - if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { - d->u.precode_lens[deflate_precode_lens_permutation[0]] = - (bitbuf >> 17) & BITMASK(3); - bitbuf >>= 20; - bitsleft -= 20; - REFILL_BITS(); - i = 1; - do { - d->u.precode_lens[deflate_precode_lens_permutation[i]] = - bitbuf & BITMASK(3); - bitbuf >>= 3; - bitsleft -= 3; - } while (++i < num_explicit_precode_lens); - } else { - bitbuf >>= 17; - bitsleft -= 17; - i = 0; - do { - if ((u8)bitsleft < 3) - REFILL_BITS(); - d->u.precode_lens[deflate_precode_lens_permutation[i]] = - bitbuf & BITMASK(3); - bitbuf >>= 3; - bitsleft -= 3; - } while (++i < num_explicit_precode_lens); - } - for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) - d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; - - /* Build the decode table for the precode. */ - SAFETY_CHECK(build_precode_decode_table(d)); - - /* Decode the litlen and offset codeword lengths. */ - i = 0; - do { - unsigned presym; - u8 rep_val; - unsigned rep_count; - - if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) - REFILL_BITS(); - - /* - * The code below assumes that the precode decode table - * doesn't have any subtables. - */ - STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); - - /* Decode the next precode symbol. */ - entry = d->u.l.precode_decode_table[ - bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; - bitbuf >>= (u8)entry; - bitsleft -= entry; /* optimization: subtract full entry */ - presym = entry >> 16; - - if (presym < 16) { - /* Explicit codeword length */ - d->u.l.lens[i++] = presym; - continue; - } - - /* Run-length encoded codeword lengths */ - - /* - * Note: we don't need to immediately verify that the - * repeat count doesn't overflow the number of elements, - * since we've sized the lens array to have enough extra - * space to allow for the worst-case overrun (138 zeroes - * when only 1 length was remaining). - * - * In the case of the small repeat counts (presyms 16 - * and 17), it is fastest to always write the maximum - * number of entries. That gets rid of branches that - * would otherwise be required. - * - * It is not just because of the numerical order that - * our checks go in the order 'presym < 16', 'presym == - * 16', and 'presym == 17'. For typical data this is - * ordered from most frequent to least frequent case. - */ - STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); - - if (presym == 16) { - /* Repeat the previous length 3 - 6 times. */ - SAFETY_CHECK(i != 0); - rep_val = d->u.l.lens[i - 1]; - STATIC_ASSERT(3 + BITMASK(2) == 6); - rep_count = 3 + (bitbuf & BITMASK(2)); - bitbuf >>= 2; - bitsleft -= 2; - d->u.l.lens[i + 0] = rep_val; - d->u.l.lens[i + 1] = rep_val; - d->u.l.lens[i + 2] = rep_val; - d->u.l.lens[i + 3] = rep_val; - d->u.l.lens[i + 4] = rep_val; - d->u.l.lens[i + 5] = rep_val; - i += rep_count; - } else if (presym == 17) { - /* Repeat zero 3 - 10 times. */ - STATIC_ASSERT(3 + BITMASK(3) == 10); - rep_count = 3 + (bitbuf & BITMASK(3)); - bitbuf >>= 3; - bitsleft -= 3; - d->u.l.lens[i + 0] = 0; - d->u.l.lens[i + 1] = 0; - d->u.l.lens[i + 2] = 0; - d->u.l.lens[i + 3] = 0; - d->u.l.lens[i + 4] = 0; - d->u.l.lens[i + 5] = 0; - d->u.l.lens[i + 6] = 0; - d->u.l.lens[i + 7] = 0; - d->u.l.lens[i + 8] = 0; - d->u.l.lens[i + 9] = 0; - i += rep_count; - } else { - /* Repeat zero 11 - 138 times. 
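
The presym handling above is DEFLATE's run-length coding of the litlen/offset code lengths: presyms 0-15 are literal lengths, 16 repeats the previous length 3-6 times (2 extra bits), 17 writes 3-10 zeroes (3 extra bits), and 18 writes 11-138 zeroes (7 extra bits). A plain, branch-per-case sketch of the same expansion, without the always-write-the-maximum optimization used above, reusing the hypothetical read_bits() reader from the previous sketch:

/*
 * Expand one decoded precode symbol into lens[] (the concatenated litlen +
 * offset code lengths, num_lens entries total).  Returns the new index,
 * or -1 on malformed input.
 */
static int expand_presym(unsigned presym, struct bitreader *br,
			 uint8_t *lens, int i, int num_lens)
{
	unsigned count;
	uint8_t val;

	if (presym < 16) {			/* explicit length 0..15 */
		if (i >= num_lens)
			return -1;
		lens[i++] = presym;
		return i;
	}
	if (presym == 16) {			/* repeat previous length 3..6 times */
		if (i == 0)
			return -1;
		val = lens[i - 1];
		count = 3 + read_bits(br, 2);
	} else if (presym == 17) {		/* repeat zero 3..10 times */
		val = 0;
		count = 3 + read_bits(br, 3);
	} else {				/* presym == 18: repeat zero 11..138 times */
		val = 0;
		count = 11 + read_bits(br, 7);
	}
	if (i + (int)count > num_lens)
		return -1;
	while (count--)
		lens[i++] = val;
	return i;
}
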
*/ - STATIC_ASSERT(11 + BITMASK(7) == 138); - rep_count = 11 + (bitbuf & BITMASK(7)); - bitbuf >>= 7; - bitsleft -= 7; - memset(&d->u.l.lens[i], 0, - rep_count * sizeof(d->u.l.lens[i])); - i += rep_count; - } - } while (i < num_litlen_syms + num_offset_syms); - - /* Unnecessary, but check this for consistency with zlib. */ - SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); - - } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { - u16 len, nlen; - - /* - * Uncompressed block: copy 'len' bytes literally from the input - * buffer to the output buffer. - */ - - bitsleft -= 3; /* for BTYPE and BFINAL */ - - /* - * Align the bitstream to the next byte boundary. This means - * the next byte boundary as if we were reading a byte at a - * time. Therefore, we have to rewind 'in_next' by any bytes - * that have been refilled but not actually consumed yet (not - * counting overread bytes, which don't increment 'in_next'). - */ - bitsleft = (u8)bitsleft; - SAFETY_CHECK(overread_count <= (bitsleft >> 3)); - in_next -= (bitsleft >> 3) - overread_count; - overread_count = 0; - bitbuf = 0; - bitsleft = 0; - - SAFETY_CHECK(in_end - in_next >= 4); - len = get_unaligned_le16(in_next); - nlen = get_unaligned_le16(in_next + 2); - in_next += 4; - - SAFETY_CHECK(len == (u16)~nlen); - if (unlikely(len > out_end - out_next)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - SAFETY_CHECK(len <= in_end - in_next); - - memcpy(out_next, in_next, len); - in_next += len; - out_next += len; - - goto block_done; - - } else { - unsigned i; - - SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); - - /* - * Static Huffman block: build the decode tables for the static - * codes. Skip doing so if the tables are already set up from - * an earlier static block; this speeds up decompression of - * degenerate input of many empty or very short static blocks. - * - * Afterwards, the remainder is the same as decompressing a - * dynamic Huffman block. - */ - - bitbuf >>= 3; /* for BTYPE and BFINAL */ - bitsleft -= 3; - - if (d->static_codes_loaded) - goto have_decode_tables; - - d->static_codes_loaded = true; - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); - - for (i = 0; i < 144; i++) - d->u.l.lens[i] = 8; - for (; i < 256; i++) - d->u.l.lens[i] = 9; - for (; i < 280; i++) - d->u.l.lens[i] = 7; - for (; i < 288; i++) - d->u.l.lens[i] = 8; - - for (; i < 288 + 32; i++) - d->u.l.lens[i] = 5; - - num_litlen_syms = 288; - num_offset_syms = 32; - } - - /* Decompressing a Huffman block (either dynamic or static) */ - - SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); - SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block */ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. 
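
On the stored-block path above, LEN/NLEN is the only integrity check: NLEN must be the one's complement of LEN. A small standalone sketch of that header check, with le16() standing in for the template's get_unaligned_le16():

#include <stdbool.h>
#include <stdint.h>

/* Read a little-endian 16-bit value (stand-in for get_unaligned_le16). */
static uint16_t le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | ((uint16_t)p[1] << 8));
}

/*
 * Validate a stored-block header and return the payload length, or return
 * false if NLEN is not the one's complement of LEN.
 */
static bool parse_stored_header(const uint8_t *hdr, uint16_t *len_out)
{
	uint16_t len = le16(hdr);
	uint16_t nlen = le16(hdr + 2);

	if (len != (uint16_t)~nlen)
		return false;
	*len_out = len;
	return true;
}
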
*/ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. */ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. */ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. 
*/ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. */ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). + */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. 
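
A quick sanity check on the fixed code lengths installed for static blocks here (144 litlen symbols of length 8, 112 of length 9, 24 of length 7, 8 of length 8, plus 32 offset symbols of length 5): they are Kraft-complete, i.e. the lengths fill the code space exactly, which is what lets the canonical table construction succeed. A tiny sketch that verifies this with integer arithmetic:

#include <assert.h>

/*
 * Kraft completeness check for the RFC 1951 fixed codes, scaled by 2^9
 * (litlen) and 2^5 (offset) so everything stays in integers.
 */
int main(void)
{
	int litlen = 144 * (1 << (9 - 8)) +	/* symbols   0-143: length 8 */
		     112 * (1 << (9 - 9)) +	/* symbols 144-255: length 9 */
		      24 * (1 << (9 - 7)) +	/* symbols 256-279: length 7 */
		       8 * (1 << (9 - 8));	/* symbols 280-287: length 8 */
	int offset = 32 * (1 << (5 - 5));	/* 32 offset symbols of length 5 */

	assert(litlen == 1 << 9);	/* 288 + 112 + 96 + 16 == 512 */
	assert(offset == 1 << 5);
	return 0;
}
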
+ */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); have_decode_tables: - litlen_tablemask = BITMASK(d->litlen_tablebits); - - /* - * This is the "fastloop" for decoding literals and matches. It does - * bounds checks on in_next and out_next in the loop conditions so that - * additional bounds checks aren't needed inside the loop body. - * - * To reduce latency, the bitbuffer is refilled and the next litlen - * decode table entry is preloaded before each loop iteration. - */ - if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) - goto generic_loop; - REFILL_BITS_IN_FASTLOOP(); - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - do { - u32 length, offset, lit; - const u8 *src; - u8 *dst; - - /* - * Consume the bits for the litlen decode table entry. Save the - * original bitbuf for later, in case the extra match length - * bits need to be extracted from it. - */ - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; /* optimization: subtract full entry */ - - /* - * Begin by checking for a "fast" literal, i.e. a literal that - * doesn't need a subtable. - */ - if (entry & HUFFDEC_LITERAL) { - /* - * On 64-bit platforms, we decode up to 2 extra fast - * literals in addition to the primary item, as this - * increases performance and still leaves enough bits - * remaining for what follows. We could actually do 3, - * assuming LITLEN_TABLEBITS=11, but that actually - * decreases performance slightly (perhaps by messing - * with the branch prediction of the conditional refill - * that happens later while decoding the match offset). - * - * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN - * and FASTLOOP_MAX_BYTES_READ need to be updated if the - * number of extra literals decoded here is changed. - */ - if (/* enough bits for 2 fast literals + length + offset preload? */ - CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + - LENGTH_MAXBITS, - OFFSET_TABLEBITS) && - /* enough bits for 2 fast literals + slow literal + litlen preload? */ - CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + - DEFLATE_MAX_LITLEN_CODEWORD_LEN, - LITLEN_TABLEBITS)) { - /* 1st extra fast literal */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - *out_next++ = lit; - if (entry & HUFFDEC_LITERAL) { - /* 2nd extra fast literal */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - *out_next++ = lit; - if (entry & HUFFDEC_LITERAL) { - /* - * Another fast literal, but - * this one is in lieu of the - * primary item, so it doesn't - * count as one of the extras. 
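
The preloaded table lookups in the fastloop above follow the usual table-driven Huffman pattern: index a table with the low litlen_tablebits bits of the bit buffer and let the entry report both the decoded item and how many bits to consume. libdeflate packs flags (HUFFDEC_LITERAL, HUFFDEC_EXCEPTIONAL, ...) and pre-baked extra-bit information into its entries; the sketch below deliberately uses a much simpler two-field entry (symbol << 16 | bits) just to show the shape of one decode step:

#include <stdint.h>

#define TABLEBITS 11	/* hypothetical main-table size: 2^11 entries */

/*
 * Simplified decoder: each table entry holds the symbol in the high 16 bits
 * and the number of bits to consume in the low 8 bits.  Codewords longer
 * than TABLEBITS would need the subtable indirection handled elsewhere in
 * this template; that case is omitted here.
 */
struct simple_decoder {
	uint32_t table[1u << TABLEBITS];
	uint64_t bitbuf;	/* next input bits, LSB first */
	unsigned bitsleft;
};

/* Decode one symbol, assuming bitsleft covers the longest codeword. */
static unsigned decode_one(struct simple_decoder *d)
{
	uint32_t entry = d->table[d->bitbuf & ((1u << TABLEBITS) - 1)];
	unsigned len = entry & 0xFF;

	d->bitbuf >>= len;
	d->bitsleft -= len;
	return entry >> 16;
}
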
- */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - *out_next++ = lit; - continue; - } - } - } else { - /* - * Decode a literal. While doing so, preload - * the next litlen decode table entry and refill - * the bitbuffer. To reduce latency, we've - * arranged for there to be enough "preloadable" - * bits remaining to do the table preload - * independently of the refill. - */ - STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( - LITLEN_TABLEBITS, LITLEN_TABLEBITS)); - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - *out_next++ = lit; - continue; - } - } - - /* - * It's not a literal entry, so it can be a length entry, a - * subtable pointer entry, or an end-of-block entry. Detect the - * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. - */ - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - /* Subtable pointer or end-of-block entry */ - - if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) - goto block_done; - - /* - * A subtable is required. Load and consume the - * subtable entry. The subtable entry can be of any - * type: literal, length, or end-of-block. - */ - entry = d->u.litlen_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - - /* - * 32-bit platforms that use the byte-at-a-time refill - * method have to do a refill here for there to always - * be enough bits to decode a literal that requires a - * subtable, then preload the next litlen decode table - * entry; or to decode a match length that requires a - * subtable, then preload the offset decode table entry. - */ - if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, - LITLEN_TABLEBITS) || - !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, - OFFSET_TABLEBITS)) - REFILL_BITS_IN_FASTLOOP(); - if (entry & HUFFDEC_LITERAL) { - /* Decode a literal that required a subtable. */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - *out_next++ = lit; - continue; - } - if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) - goto block_done; - /* Else, it's a length that required a subtable. */ - } - - /* - * Decode the match length: the length base value associated - * with the litlen symbol (which we extract from the decode - * table entry), plus the extra length bits. We don't need to - * consume the extra length bits here, as they were included in - * the bits consumed by the entry earlier. We also don't need - * to check for too-long matches here, as this is inside the - * fastloop where it's already been verified that the output - * buffer has enough space remaining to copy a max-length match. - */ - length = entry >> 16; - length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); - - /* - * Decode the match offset. There are enough "preloadable" bits - * remaining to preload the offset decode table entry, but a - * refill might be needed before consuming it. - */ - STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, - OFFSET_TABLEBITS)); - entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; - if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, - LITLEN_TABLEBITS)) { - /* - * Decoding a match offset on a 64-bit platform. We may - * need to refill once, but then we can decode the whole - * offset and preload the next litlen table entry. 
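
The "base value plus extra bits" step above is the standard DEFLATE encoding of match lengths (3-258) and offsets (1-32768); the corresponding base/extra tables appear verbatim further down in this diff in deflate_compress.c. A direct, unoptimized rendering of that step, again on top of the hypothetical read_bits() reader from the earlier sketch:

/* Length slot -> base value and extra-bit count (RFC 1951, lengths 3..258). */
static const unsigned length_base[29] = {
	3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
	35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258
};
static const unsigned length_extra[29] = {
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0
};

/* Offset slot -> base value and extra-bit count (offsets 1..32768). */
static const unsigned offset_base[30] = {
	1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
	257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
	8193, 12289, 16385, 24577
};
static const unsigned offset_extra[30] = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
	7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13
};

/* Given a decoded slot, recover the actual match length or offset. */
static unsigned decode_length(struct bitreader *br, unsigned slot)
{
	return length_base[slot] + read_bits(br, length_extra[slot]);
}

static unsigned decode_offset(struct bitreader *br, unsigned slot)
{
	return offset_base[slot] + read_bits(br, offset_extra[slot]);
}
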
- */ - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - /* Offset codeword requires a subtable */ - if (unlikely((u8)bitsleft < OFFSET_MAXBITS + - LITLEN_TABLEBITS - PRELOAD_SLACK)) - REFILL_BITS_IN_FASTLOOP(); - bitbuf >>= OFFSET_TABLEBITS; - bitsleft -= OFFSET_TABLEBITS; - entry = d->offset_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + - LITLEN_TABLEBITS - PRELOAD_SLACK)) - REFILL_BITS_IN_FASTLOOP(); - } else { - /* Decoding a match offset on a 32-bit platform */ - REFILL_BITS_IN_FASTLOOP(); - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - /* Offset codeword requires a subtable */ - bitbuf >>= OFFSET_TABLEBITS; - bitsleft -= OFFSET_TABLEBITS; - entry = d->offset_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - REFILL_BITS_IN_FASTLOOP(); - /* No further refill needed before extra bits */ - STATIC_ASSERT(CAN_CONSUME( - OFFSET_MAXBITS - OFFSET_TABLEBITS)); - } else { - /* No refill needed before extra bits */ - STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); - } - } - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; /* optimization: subtract full entry */ - offset = entry >> 16; - offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); - - /* Validate the match offset; needed even in the fastloop. */ - SAFETY_CHECK(offset <= out_next - (const u8 *)out); - src = out_next - offset; - dst = out_next; - out_next += length; - - /* - * Before starting to issue the instructions to copy the match, - * refill the bitbuffer and preload the litlen decode table - * entry for the next loop iteration. This can increase - * performance by allowing the latency of the match copy to - * overlap with these other operations. To further reduce - * latency, we've arranged for there to be enough bits remaining - * to do the table preload independently of the refill, except - * on 32-bit platforms using the byte-at-a-time refill method. - */ - if (!CAN_CONSUME_AND_THEN_PRELOAD( - MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, - OFFSET_MAXFASTBITS), - LITLEN_TABLEBITS) && - unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) - REFILL_BITS_IN_FASTLOOP(); - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - - /* - * Copy the match. On most CPUs the fastest method is a - * word-at-a-time copy, unconditionally copying about 5 words - * since this is enough for most matches without being too much. - * - * The normal word-at-a-time copy works for offset >= WORDBYTES, - * which is most cases. The case of offset == 1 is also common - * and is worth optimizing for, since it is just RLE encoding of - * the previous byte, which is the result of compressing long - * runs of the same byte. - * - * Writing past the match 'length' is allowed here, since it's - * been ensured there is enough output space left for a slight - * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if - * the maximum possible overrun here is changed. 
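
One detail worth keeping in mind for the copy code below: an LZ77 match may overlap its own output (offset < length), so a plain memcpy() is not generally correct here. The byte-at-a-time fallback is the semantically simplest form of the operation; copy_match() below is a hypothetical minimal version that captures just that invariant:

#include <stddef.h>
#include <stdint.h>

/*
 * Copy an LZ77 match of 'length' bytes starting 'offset' bytes back in the
 * already-written output.  Copying byte by byte, in order, is what makes the
 * overlapping case work: when offset == 1 this naturally replicates the
 * previous byte (run-length behaviour), and when offset < length the source
 * keeps reading bytes that this same copy has just produced.
 */
static uint8_t *copy_match(uint8_t *out_next, size_t offset, size_t length)
{
	const uint8_t *src = out_next - offset;

	while (length--)
		*out_next++ = *src++;
	return out_next;
}

The word-at-a-time branches that follow get the same result faster by deliberately over-copying in word-sized chunks, which is why FASTLOOP_MAX_BYTES_WRITTEN reserves slack at the end of the output buffer.
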
- */ - if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - while (dst < out_next) { - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - } - } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { - machine_word_t v; - - /* - * This part tends to get auto-vectorized, so keep it - * copying a multiple of 16 bytes at a time. - */ - v = (machine_word_t)0x0101010101010101 * src[0]; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - while (dst < out_next) { - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - } - } else if (UNALIGNED_ACCESS_IS_FAST) { - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - do { - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - } while (dst < out_next); - } else { - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < out_next); - } - } while (in_next < in_fastloop_end && out_next < out_fastloop_end); - - /* - * This is the generic loop for decoding literals and matches. This - * handles cases where in_next and out_next are close to the end of - * their respective buffers. Usually this loop isn't performance- - * critical, as most time is spent in the fastloop above instead. We - * therefore omit some optimizations here in favor of smaller code. - */ + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. 
+ */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. + */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. + */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. 
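
The subtable path above is the usual two-level refinement for codewords longer than the main table covers: instead of a symbol, the first-level entry carries a subtable base index plus a count of further low bits that select the second-level entry. Continuing the deliberately simplified entry layout from the earlier decode_one() sketch (this is not libdeflate's real entry packing):

#define SUBTABLE_FLAG 0x100	/* hypothetical "entry is a subtable pointer" bit */

/*
 * Two-level lookup: a pointer entry's low byte holds the first-level bits
 * to consume, bits 9..15 hold how many following bits index the subtable,
 * and the high 16 bits hold the subtable's base index in table[].
 */
static unsigned decode_one_2level(struct simple_decoder *d)
{
	uint32_t entry = d->table[d->bitbuf & ((1u << TABLEBITS) - 1)];

	/* Consume the bits matched by the first-level lookup. */
	d->bitbuf >>= entry & 0xFF;
	d->bitsleft -= entry & 0xFF;

	if (entry & SUBTABLE_FLAG) {
		unsigned subbits = (entry >> 9) & 0x7F;

		/* Second level: the next 'subbits' bits pick the entry. */
		entry = d->table[(entry >> 16) +
				 (d->bitbuf & ((1u << subbits) - 1))];
		d->bitbuf >>= entry & 0xFF;
		d->bitsleft -= entry & 0xFF;
	}
	return entry >> 16;
}
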
+ */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. 
To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. + */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. 
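
The 0x0101010101010101 multiply in the offset == 1 branch is the classic byte-broadcast trick: multiplying a byte by a word of repeated 0x01 replicates it into every byte lane, so a one-byte RLE match can be emitted with word-sized stores. A tiny standalone illustration:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint8_t b = 0xAB;
	/* Each 0x01 byte of the multiplier picks up one copy of 'b'. */
	uint64_t v = (uint64_t)0x0101010101010101ULL * b;

	assert(v == 0xABABABABABABABABULL);
	return 0;
}
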
+ */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. + */ generic_loop: - for (;;) { - u32 length, offset; - const u8 *src; - u8 *dst; - - REFILL_BITS(); - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { - entry = d->u.litlen_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - } - length = entry >> 16; - if (entry & HUFFDEC_LITERAL) { - if (unlikely(out_next == out_end)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - *out_next++ = length; - continue; - } - if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) - goto block_done; - length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); - if (unlikely(length > out_end - out_next)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - - if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) - REFILL_BITS(); - entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - bitbuf >>= OFFSET_TABLEBITS; - bitsleft -= OFFSET_TABLEBITS; - entry = d->offset_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - if (!CAN_CONSUME(OFFSET_MAXBITS)) - REFILL_BITS(); - } - offset = entry >> 16; - offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); - bitbuf >>= (u8)entry; - bitsleft -= entry; - - SAFETY_CHECK(offset <= out_next - (const u8 *)out); - src = out_next - offset; - dst = out_next; - out_next += length; - - STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < out_next); - } - + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft 
-= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + block_done: - /* Finished decoding a block */ - - if (!is_final_block) - goto next_block; - - /* That was the last block. */ - - bitsleft = (u8)bitsleft; - - /* - * If any of the implicit appended zero bytes were consumed (not just - * refilled) before hitting end of stream, then the data is bad. - */ - SAFETY_CHECK(overread_count <= (bitsleft >> 3)); - - /* Optionally return the actual number of bytes consumed. */ - if (actual_in_nbytes_ret) { - /* Don't count bytes that were refilled but not consumed. */ - in_next -= (bitsleft >> 3) - overread_count; - - *actual_in_nbytes_ret = in_next - (u8 *)in; - } - - /* Optionally return the actual number of bytes written. */ - if (actual_out_nbytes_ret) { - *actual_out_nbytes_ret = out_next - (u8 *)out; - } else { - if (out_next != out_end) - return LIBDEFLATE_SHORT_OUTPUT; - } - return LIBDEFLATE_SUCCESS; + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. + */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; } #undef FUNCNAME diff --git a/Sources/DEFLATE/deflate_compress.c b/Sources/DEFLATE/deflate_compress.c index 32c736d8..fe71dd8d 100644 --- a/Sources/DEFLATE/deflate_compress.c +++ b/Sources/DEFLATE/deflate_compress.c @@ -45,7 +45,7 @@ * algorithms. However, it is slow. If this parameter is defined to 0, then * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm. */ -#define SUPPORT_NEAR_OPTIMAL_PARSING 1 +#define SUPPORT_NEAR_OPTIMAL_PARSING 1 /* * This is the minimum block length that the compressor will use, in @@ -63,7 +63,7 @@ * reasonable upper bound on the compressed size. 
It's also needed because our * block splitting algorithm doesn't work well on very short blocks. */ -#define MIN_BLOCK_LENGTH 5000 +#define MIN_BLOCK_LENGTH 5000 /* * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft @@ -78,7 +78,7 @@ * increasing/decreasing this parameter will increase/decrease per-compressor * memory usage linearly. */ -#define SOFT_MAX_BLOCK_LENGTH 300000 +#define SOFT_MAX_BLOCK_LENGTH 300000 /* * For the greedy, lazy, and lazy2 compressors: this is the length of the @@ -90,7 +90,7 @@ * being ended normally before then. Increasing/decreasing this value will * increase/decrease per-compressor memory usage linearly. */ -#define SEQ_STORE_LENGTH 50000 +#define SEQ_STORE_LENGTH 50000 /* * For deflate_compress_fastest(): This is the soft maximum block length. @@ -99,13 +99,13 @@ * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than * the regular SOFT_MAX_BLOCK_LENGTH. */ -#define FAST_SOFT_MAX_BLOCK_LENGTH 65535 +#define FAST_SOFT_MAX_BLOCK_LENGTH 65535 /* * For deflate_compress_fastest(): this is the length of the sequence store. * This is like SEQ_STORE_LENGTH, but this should be a lower value. */ -#define FAST_SEQ_STORE_LENGTH 8192 +#define FAST_SEQ_STORE_LENGTH 8192 /* * These are the maximum codeword lengths, in bits, the compressor will use for @@ -114,9 +114,9 @@ * negligible effect on compression ratio but allows some optimizations when * outputting bits. (It allows 4 literals to be written at once rather than 3.) */ -#define MAX_LITLEN_CODEWORD_LEN 14 -#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN -#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN +#define MAX_LITLEN_CODEWORD_LEN 14 +#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN +#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -137,7 +137,7 @@ * BIT_COST doesn't apply to deflate_flush_block() and * deflate_compute_true_cost(), which consider whole bits. */ -#define BIT_COST 16 +#define BIT_COST 16 /* * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to @@ -146,23 +146,23 @@ * optimization pass. However, the cost should be relatively high because the * symbol probably won't be used very many times (if at all). */ -#define LITERAL_NOSTAT_BITS 13 -#define LENGTH_NOSTAT_BITS 13 -#define OFFSET_NOSTAT_BITS 10 +#define LITERAL_NOSTAT_BITS 13 +#define LENGTH_NOSTAT_BITS 13 +#define OFFSET_NOSTAT_BITS 10 /* * This is (slightly less than) the maximum number of matches that the * near-optimal compressor will cache per block. This behaves similarly to * SEQ_STORE_LENGTH for the other compressors. */ -#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) +#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ /******************************************************************************/ /* Include the needed matchfinders. */ -#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER +#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER #include "hc_matchfinder.h" #include "ht_matchfinder.h" #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -174,8 +174,8 @@ * an upper bound. (This says nothing about whether it is worthwhile to * consider so many matches; this is just defining the worst case.) 
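
For concreteness, assuming the standard DEFLATE match-length bounds of 3 and 258 (DEFLATE_MIN_MATCH_LEN == 3 is asserted in the decompress template above, and 258 is the last entry of deflate_length_slot_base below), the definition that follows works out to MAX_MATCHES_PER_POS == 258 - 3 + 1 == 256; the MATCH_CACHE_LENGTH assertion further down then amounts to 5000 * 256 == 1,280,000 <= 300,000 * 5 == 1,500,000, which holds with the values chosen above.
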
*/ -#define MAX_MATCHES_PER_POS \ - (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) +#define MAX_MATCHES_PER_POS \ +(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) #endif /* @@ -185,103 +185,103 @@ * occurs when the lazy2 compressor chooses two literals and a maximum-length * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. */ -#define MAX_BLOCK_LENGTH \ - MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ - SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) +#define MAX_BLOCK_LENGTH \ +MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ +SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) static forceinline void check_buildtime_parameters(void) { - /* - * Verify that MIN_BLOCK_LENGTH is being honored, as - * libdeflate_deflate_compress_bound() depends on it. - */ - STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); - STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); - STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= - MIN_BLOCK_LENGTH); - STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= - MIN_BLOCK_LENGTH); + /* + * Verify that MIN_BLOCK_LENGTH is being honored, as + * libdeflate_deflate_compress_bound() depends on it. + */ + STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); #if SUPPORT_NEAR_OPTIMAL_PARSING - STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= - MATCH_CACHE_LENGTH); + STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= + MATCH_CACHE_LENGTH); #endif - - /* The definition of MAX_BLOCK_LENGTH assumes this. */ - STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); - - /* Verify that the sequence stores aren't uselessly large. */ - STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= - SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); - STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= - FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); - - /* Verify that the maximum codeword lengths are valid. */ - STATIC_ASSERT( - MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); - STATIC_ASSERT( - MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); - STATIC_ASSERT( - MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); - STATIC_ASSERT( - (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); - STATIC_ASSERT( - (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); - STATIC_ASSERT( - (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); + + /* The definition of MAX_BLOCK_LENGTH assumes this. */ + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); + + /* Verify that the sequence stores aren't uselessly large. */ + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= + SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= + FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + + /* Verify that the maximum codeword lengths are valid. 
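For the MAX_BLOCK_LENGTH definition above, a quick worked check of which term of the MAX() dominates under the current parameter values (a standalone sketch, not part of the source):

#include <assert.h>

int main(void)
{
    const unsigned soft_max = 300000;             /* SOFT_MAX_BLOCK_LENGTH */
    const unsigned min_len = 5000;                /* MIN_BLOCK_LENGTH */
    const unsigned max_match = 258;               /* DEFLATE_MAX_MATCH_LEN */
    const unsigned a = soft_max + min_len - 1;    /* 304999 */
    const unsigned b = soft_max + 1 + max_match;  /* 300259 */

    assert((a > b ? a : b) == 304999);            /* the first term wins */
    return 0;
}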
*/ + STATIC_ASSERT( + MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); + STATIC_ASSERT( + MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); + STATIC_ASSERT( + MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); + STATIC_ASSERT( + (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); + STATIC_ASSERT( + (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); + STATIC_ASSERT( + (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); } /******************************************************************************/ /* Table: length slot => length slot base value */ static const unsigned deflate_length_slot_base[] = { - 3, 4, 5, 6, 7, 8, 9, 10, - 11, 13, 15, 17, 19, 23, 27, 31, - 35, 43, 51, 59, 67, 83, 99, 115, - 131, 163, 195, 227, 258, + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, + 131, 163, 195, 227, 258, }; /* Table: length slot => number of extra length bits */ static const u8 deflate_extra_length_bits[] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 2, 2, 2, 2, - 3, 3, 3, 3, 4, 4, 4, 4, - 5, 5, 5, 5, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, + 5, 5, 5, 5, 0, }; /* Table: offset slot => offset slot base value */ static const unsigned deflate_offset_slot_base[] = { - 1, 2, 3, 4, 5, 7, 9, 13, - 17, 25, 33, 49, 65, 97, 129, 193, - 257, 385, 513, 769, 1025, 1537, 2049, 3073, - 4097, 6145, 8193, 12289, 16385, 24577, + 1, 2, 3, 4, 5, 7, 9, 13, + 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, + 4097, 6145, 8193, 12289, 16385, 24577, }; /* Table: offset slot => number of extra offset bits */ static const u8 deflate_extra_offset_bits[] = { - 0, 0, 0, 0, 1, 1, 2, 2, - 3, 3, 4, 4, 5, 5, 6, 6, - 7, 7, 8, 8, 9, 9, 10, 10, - 11, 11, 12, 12, 13, 13, + 0, 0, 0, 0, 1, 1, 2, 2, + 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, + 11, 11, 12, 12, 13, 13, }; /* Table: length => length slot */ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { - 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, - 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, - 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, - 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, - 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 28, + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, + 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, + 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 28, }; /* @@ -289,38 +289,38 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { * This was generated by scripts/gen_offset_slot_map.py. */ static const u8 deflate_offset_slot[256] = { - 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }; /* The order in which precode codeword lengths are stored */ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; /* Table: precode symbol => number of extra bits */ static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 }; /* Codewords for the DEFLATE Huffman codes */ struct deflate_codewords { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 
offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* @@ -328,20 +328,20 @@ struct deflate_codewords { * A zero length means the corresponding symbol had zero frequency. */ struct deflate_lens { - u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u8 offset[DEFLATE_NUM_OFFSET_SYMS]; + u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u8 offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* Codewords and lengths for the DEFLATE Huffman codes */ struct deflate_codes { - struct deflate_codewords codewords; - struct deflate_lens lens; + struct deflate_codewords codewords; + struct deflate_lens lens; }; /* Symbol frequency counters for the DEFLATE Huffman codes */ struct deflate_freqs { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* @@ -351,47 +351,47 @@ struct deflate_freqs { * block's Huffman codes have been computed. */ struct deflate_sequence { - - /* - * Bits 0..22: the number of literals in this run. This may be 0 and - * can be at most MAX_BLOCK_LENGTH. The literals are not stored - * explicitly in this structure; instead, they are read directly from - * the uncompressed data. - * - * Bits 23..31: the length of the match which follows the literals, or 0 - * if this literal run was the last in the block, so there is no match - * which follows it. - */ + + /* + * Bits 0..22: the number of literals in this run. This may be 0 and + * can be at most MAX_BLOCK_LENGTH. The literals are not stored + * explicitly in this structure; instead, they are read directly from + * the uncompressed data. + * + * Bits 23..31: the length of the match which follows the literals, or 0 + * if this literal run was the last in the block, so there is no match + * which follows it. + */ #define SEQ_LENGTH_SHIFT 23 #define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1) - u32 litrunlen_and_length; - - /* - * If 'length' doesn't indicate end-of-block, then this is the offset of - * the match which follows the literals. - */ - u16 offset; - - /* - * If 'length' doesn't indicate end-of-block, then this is the offset - * slot of the match which follows the literals. - */ - u16 offset_slot; + u32 litrunlen_and_length; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset of + * the match which follows the literals. + */ + u16 offset; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset + * slot of the match which follows the literals. + */ + u16 offset_slot; }; #if SUPPORT_NEAR_OPTIMAL_PARSING /* Costs for the near-optimal parsing algorithm */ struct deflate_costs { - - /* The cost to output each possible literal */ - u32 literal[DEFLATE_NUM_LITERALS]; - - /* The cost to output each possible match length */ - u32 length[DEFLATE_MAX_MATCH_LEN + 1]; - - /* The cost to output a match offset of each possible offset slot */ - u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; + + /* The cost to output each possible literal */ + u32 literal[DEFLATE_NUM_LITERALS]; + + /* The cost to output each possible match length */ + u32 length[DEFLATE_MAX_MATCH_LEN + 1]; + + /* The cost to output a match offset of each possible offset slot */ + u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; }; /* @@ -406,31 +406,31 @@ struct deflate_costs { * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we * associate with each node just two pieces of information: * - * 'cost_to_end' is the minimum cost to reach the end of the block from - * this position. 
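To illustrate the litrunlen_and_length packing described above, here is a minimal standalone sketch; seq_pack() is a hypothetical helper, and the shift/mask values are copied from the definitions above:

#include <assert.h>
#include <stdint.h>

#define SEQ_LENGTH_SHIFT   23
#define SEQ_LITRUNLEN_MASK (((uint32_t)1 << SEQ_LENGTH_SHIFT) - 1)

/* Hypothetical helper: pack a literal-run length and a match length. */
static uint32_t seq_pack(uint32_t litrunlen, uint32_t length)
{
    assert(litrunlen <= SEQ_LITRUNLEN_MASK);                   /* bits 0..22 */
    assert(length < ((uint32_t)1 << (32 - SEQ_LENGTH_SHIFT))); /* bits 23..31 */
    return litrunlen | (length << SEQ_LENGTH_SHIFT);
}

int main(void)
{
    uint32_t v = seq_pack(300000, 258);  /* long literal run + maximum-length match */

    assert((v & SEQ_LITRUNLEN_MASK) == 300000);
    assert((v >> SEQ_LENGTH_SHIFT) == 258);
    return 0;
}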
+ * 'cost_to_end' is the minimum cost to reach the end of the block from + * this position. * - * 'item' represents the literal or match that must be chosen from here to - * reach the end of the block with the minimum cost. Equivalently, this - * can be interpreted as the label of the outgoing edge on the minimum-cost - * path to the "end of block" node from this node. + * 'item' represents the literal or match that must be chosen from here to + * reach the end of the block with the minimum cost. Equivalently, this + * can be interpreted as the label of the outgoing edge on the minimum-cost + * path to the "end of block" node from this node. */ struct deflate_optimum_node { - - u32 cost_to_end; - - /* - * Notes on the match/literal representation used here: - * - * The low bits of 'item' are the length: 1 if this is a literal, - * or the match length if this is a match. - * - * The high bits of 'item' are the actual literal byte if this is a - * literal, or the match offset if this is a match. - */ + + u32 cost_to_end; + + /* + * Notes on the match/literal representation used here: + * + * The low bits of 'item' are the length: 1 if this is a literal, + * or the match length if this is a match. + * + * The high bits of 'item' are the actual literal byte if this is a + * literal, or the match offset if this is a match. + */ #define OPTIMUM_OFFSET_SHIFT 9 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) - u32 item; - + u32 item; + }; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -439,226 +439,226 @@ struct deflate_optimum_node { #define NUM_LITERAL_OBSERVATION_TYPES 8 #define NUM_MATCH_OBSERVATION_TYPES 2 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ - NUM_MATCH_OBSERVATION_TYPES) +NUM_MATCH_OBSERVATION_TYPES) #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 struct block_split_stats { - u32 new_observations[NUM_OBSERVATION_TYPES]; - u32 observations[NUM_OBSERVATION_TYPES]; - u32 num_new_observations; - u32 num_observations; + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; }; struct deflate_output_bitstream; /* The main DEFLATE compressor structure */ struct libdeflate_compressor { - - /* Pointer to the compress() implementation chosen at allocation time */ - void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, - size_t in_nbytes, struct deflate_output_bitstream *os); - - /* The free() function for this struct, chosen at allocation time */ - free_func_t free_func; - - /* The compression level with which this compressor was created */ - unsigned compression_level; - - /* Anything of this size or less we won't bother trying to compress. 
*/ - size_t max_passthrough_size; - - /* - * The maximum search depth: consider at most this many potential - * matches at each position - */ - unsigned max_search_depth; - - /* - * The "nice" match length: if a match of this length is found, choose - * it immediately without further consideration - */ - unsigned nice_match_length; - - /* Frequency counters for the current block */ - struct deflate_freqs freqs; - - /* Block split statistics for the current block */ - struct block_split_stats split_stats; - - /* Dynamic Huffman codes for the current block */ - struct deflate_codes codes; - - /* The static Huffman codes defined by the DEFLATE format */ - struct deflate_codes static_codes; - - /* Temporary space for block flushing */ - union { - /* Information about the precode */ - struct { - u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; - u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; - u8 lens[DEFLATE_NUM_PRECODE_SYMS]; - unsigned items[DEFLATE_NUM_LITLEN_SYMS + - DEFLATE_NUM_OFFSET_SYMS]; - unsigned num_litlen_syms; - unsigned num_offset_syms; - unsigned num_explicit_lens; - unsigned num_items; - } precode; - /* - * The "full" length codewords. Used only after the information - * in 'precode' is no longer needed. - */ - struct { - u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; - u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; - } length; - } o; - - union { - /* Data for greedy or lazy parsing */ - struct { - /* Hash chains matchfinder */ - struct hc_matchfinder hc_mf; - - /* Matches and literals chosen for the current block */ - struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; - - } g; /* (g)reedy */ - - /* Data for fastest parsing */ - struct { - /* Hash table matchfinder */ - struct ht_matchfinder ht_mf; - - /* Matches and literals chosen for the current block */ - struct deflate_sequence sequences[ - FAST_SEQ_STORE_LENGTH + 1]; - - } f; /* (f)astest */ - - #if SUPPORT_NEAR_OPTIMAL_PARSING - /* Data for near-optimal parsing */ - struct { - - /* Binary tree matchfinder */ - struct bt_matchfinder bt_mf; - - /* - * Cached matches for the current block. This array - * contains the matches that were found at each position - * in the block. Specifically, for each position, there - * is a list of matches found at that position, if any, - * sorted by strictly increasing length. In addition, - * following the matches for each position, there is a - * special 'struct lz_match' whose 'length' member - * contains the number of matches found at that - * position, and whose 'offset' member contains the - * literal at that position. - * - * Note: in rare cases, there will be a very high number - * of matches in the block and this array will overflow. - * If this happens, we force the end of the current - * block. MATCH_CACHE_LENGTH is the length at which we - * actually check for overflow. The extra slots beyond - * this are enough to absorb the worst case overflow, - * which occurs if starting at - * &match_cache[MATCH_CACHE_LENGTH - 1], we write - * MAX_MATCHES_PER_POS matches and a match count header, - * then skip searching for matches at - * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the - * match count header for each. - */ - struct lz_match match_cache[MATCH_CACHE_LENGTH + - MAX_MATCHES_PER_POS + - DEFLATE_MAX_MATCH_LEN - 1]; - - /* - * Array of nodes, one per position, for running the - * minimum-cost path algorithm. - * - * This array must be large enough to accommodate the - * worst-case number of nodes, which is MAX_BLOCK_LENGTH - * plus 1 for the end-of-block node. 
- */ - struct deflate_optimum_node optimum_nodes[ - MAX_BLOCK_LENGTH + 1]; - - /* The current cost model being used */ - struct deflate_costs costs; - - /* Saved cost model */ - struct deflate_costs costs_saved; - - /* - * A table that maps match offset to offset slot. This - * differs from deflate_offset_slot[] in that this is a - * full map, not a condensed one. The full map is more - * appropriate for the near-optimal parser, since the - * near-optimal parser does more offset => offset_slot - * translations, it doesn't intersperse them with - * matchfinding (so cache evictions are less of a - * concern), and it uses more memory anyway. - */ - u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; - - /* Literal/match statistics saved from previous block */ - u32 prev_observations[NUM_OBSERVATION_TYPES]; - u32 prev_num_observations; - - /* - * Approximate match length frequencies based on a - * greedy parse, gathered during matchfinding. This is - * used for setting the initial symbol costs. - */ - u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; - u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; - - /* - * The maximum number of optimization passes - * (min-cost path searches) per block. - * Larger values = more compression. - */ - unsigned max_optim_passes; - - /* - * If an optimization pass improves the cost by fewer - * than this number of bits, then optimization will stop - * early, before max_optim_passes has been reached. - * Smaller values = more compression. - */ - unsigned min_improvement_to_continue; - - /* - * The minimum number of bits that would need to be - * saved for it to be considered worth the time to - * regenerate and use the min-cost path from a previous - * optimization pass, in the case where the final - * optimization pass actually increased the cost. - * Smaller values = more compression. - */ - unsigned min_bits_to_use_nonfinal_path; - - /* - * The maximum block length, in uncompressed bytes, at - * which to find and consider the optimal match/literal - * list for the static Huffman codes. This strategy - * improves the compression ratio produced by static - * Huffman blocks and can discover more cases in which - * static blocks are worthwhile. This helps mostly with - * small blocks, hence why this parameter is a max_len. - * - * Above this block length, static Huffman blocks are - * only used opportunistically. I.e. a static Huffman - * block is only used if a static block using the same - * match/literal list as the optimized dynamic block - * happens to be cheaper than the dynamic block itself. - */ - unsigned max_len_to_optimize_static_block; - - } n; /* (n)ear-optimal */ - #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - - } p; /* (p)arser */ + + /* Pointer to the compress() implementation chosen at allocation time */ + void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, + size_t in_nbytes, struct deflate_output_bitstream *os); + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; + + /* The compression level with which this compressor was created */ + unsigned compression_level; + + /* Anything of this size or less we won't bother trying to compress. 
*/ + size_t max_passthrough_size; + + /* + * The maximum search depth: consider at most this many potential + * matches at each position + */ + unsigned max_search_depth; + + /* + * The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration + */ + unsigned nice_match_length; + + /* Frequency counters for the current block */ + struct deflate_freqs freqs; + + /* Block split statistics for the current block */ + struct block_split_stats split_stats; + + /* Dynamic Huffman codes for the current block */ + struct deflate_codes codes; + + /* The static Huffman codes defined by the DEFLATE format */ + struct deflate_codes static_codes; + + /* Temporary space for block flushing */ + union { + /* Information about the precode */ + struct { + u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; + u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; + u8 lens[DEFLATE_NUM_PRECODE_SYMS]; + unsigned items[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS]; + unsigned num_litlen_syms; + unsigned num_offset_syms; + unsigned num_explicit_lens; + unsigned num_items; + } precode; + /* + * The "full" length codewords. Used only after the information + * in 'precode' is no longer needed. + */ + struct { + u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; + u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; + } length; + } o; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chains matchfinder */ + struct hc_matchfinder hc_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; + + } g; /* (g)reedy */ + + /* Data for fastest parsing */ + struct { + /* Hash table matchfinder */ + struct ht_matchfinder ht_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[ + FAST_SEQ_STORE_LENGTH + 1]; + + } f; /* (f)astest */ + +#if SUPPORT_NEAR_OPTIMAL_PARSING + /* Data for near-optimal parsing */ + struct { + + /* Binary tree matchfinder */ + struct bt_matchfinder bt_mf; + + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a list of matches found at that position, if any, + * sorted by strictly increasing length. In addition, + * following the matches for each position, there is a + * special 'struct lz_match' whose 'length' member + * contains the number of matches found at that + * position, and whose 'offset' member contains the + * literal at that position. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. MATCH_CACHE_LENGTH is the length at which we + * actually check for overflow. The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at + * &match_cache[MATCH_CACHE_LENGTH - 1], we write + * MAX_MATCHES_PER_POS matches and a match count header, + * then skip searching for matches at + * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the + * match count header for each. + */ + struct lz_match match_cache[MATCH_CACHE_LENGTH + + MAX_MATCHES_PER_POS + + DEFLATE_MAX_MATCH_LEN - 1]; + + /* + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. + * + * This array must be large enough to accommodate the + * worst-case number of nodes, which is MAX_BLOCK_LENGTH + * plus 1 for the end-of-block node. 
+ */ + struct deflate_optimum_node optimum_nodes[ + MAX_BLOCK_LENGTH + 1]; + + /* The current cost model being used */ + struct deflate_costs costs; + + /* Saved cost model */ + struct deflate_costs costs_saved; + + /* + * A table that maps match offset to offset slot. This + * differs from deflate_offset_slot[] in that this is a + * full map, not a condensed one. The full map is more + * appropriate for the near-optimal parser, since the + * near-optimal parser does more offset => offset_slot + * translations, it doesn't intersperse them with + * matchfinding (so cache evictions are less of a + * concern), and it uses more memory anyway. + */ + u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; + + /* Literal/match statistics saved from previous block */ + u32 prev_observations[NUM_OBSERVATION_TYPES]; + u32 prev_num_observations; + + /* + * Approximate match length frequencies based on a + * greedy parse, gathered during matchfinding. This is + * used for setting the initial symbol costs. + */ + u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + + /* + * The maximum number of optimization passes + * (min-cost path searches) per block. + * Larger values = more compression. + */ + unsigned max_optim_passes; + + /* + * If an optimization pass improves the cost by fewer + * than this number of bits, then optimization will stop + * early, before max_optim_passes has been reached. + * Smaller values = more compression. + */ + unsigned min_improvement_to_continue; + + /* + * The minimum number of bits that would need to be + * saved for it to be considered worth the time to + * regenerate and use the min-cost path from a previous + * optimization pass, in the case where the final + * optimization pass actually increased the cost. + * Smaller values = more compression. + */ + unsigned min_bits_to_use_nonfinal_path; + + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + + } n; /* (n)ear-optimal */ +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + + } p; /* (p)arser */ }; /* @@ -672,41 +672,41 @@ typedef machine_word_t bitbuf_t; * The capacity of the bitbuffer, in bits. This is 1 less than the real size, * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. */ -#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) +#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) /* * Can the specified number of bits always be added to 'bitbuf' after any * pending bytes have been flushed? There can be up to 7 bits remaining after a * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. 
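A quick worked check of the capacity argument in the comment above, assuming a 64-bit bitbuf_t: after a flush up to 7 bits may remain, leaving room for 56 more, which is exactly four 14-bit literal codewords (standalone sketch, not from the source):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const unsigned bitbuf_nbits = 8 * sizeof(uint64_t) - 1;  /* 63 on a 64-bit build */

    assert(7 + 4 * 14 <= bitbuf_nbits);  /* four 14-bit literals always fit */
    assert(7 + 4 * 15 > bitbuf_nbits);   /* four 15-bit codewords might not */
    return 0;
}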
*/ -#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) +#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) /* * Structure to keep track of the current state of sending bits to the * compressed output buffer */ struct deflate_output_bitstream { - - /* Bits that haven't yet been written to the output buffer */ - bitbuf_t bitbuf; - - /* - * Number of bits currently held in @bitbuf. This can be between 0 and - * BITBUF_NBITS in general, or between 0 and 7 after a flush. - */ - unsigned bitcount; - - /* - * Pointer to the position in the output buffer at which the next byte - * should be written - */ - u8 *next; - - /* Pointer to the end of the output buffer */ - u8 *end; - - /* true if the output buffer ran out of space */ - bool overflow; + + /* Bits that haven't yet been written to the output buffer */ + bitbuf_t bitbuf; + + /* + * Number of bits currently held in @bitbuf. This can be between 0 and + * BITBUF_NBITS in general, or between 0 and 7 after a flush. + */ + unsigned bitcount; + + /* + * Pointer to the position in the output buffer at which the next byte + * should be written + */ + u8 *next; + + /* Pointer to the end of the output buffer */ + u8 *end; + + /* true if the output buffer ran out of space */ + bool overflow; }; /* @@ -714,11 +714,11 @@ struct deflate_output_bitstream { * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() * frequently enough. */ -#define ADD_BITS(bits, n) \ -do { \ - bitbuf |= (bitbuf_t)(bits) << bitcount; \ - bitcount += (n); \ - ASSERT(bitcount <= BITBUF_NBITS); \ +#define ADD_BITS(bits, n) \ +do { \ +bitbuf |= (bitbuf_t)(bits) << bitcount; \ +bitcount += (n); \ +ASSERT(bitcount <= BITBUF_NBITS); \ } while (0) /* @@ -731,23 +731,23 @@ do { \ * flush a whole word, even though that's fastest. Therefore, flush a whole * word if there is space for it, otherwise flush a byte at a time. */ -#define FLUSH_BITS() \ -do { \ - if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ - /* Flush a whole word (branchlessly). */ \ - put_unaligned_leword(bitbuf, out_next); \ - bitbuf >>= bitcount & ~7; \ - out_next += bitcount >> 3; \ - bitcount &= 7; \ - } else { \ - /* Flush a byte at a time. */ \ - while (bitcount >= 8) { \ - ASSERT(out_next < os->end); \ - *out_next++ = bitbuf; \ - bitcount -= 8; \ - bitbuf >>= 8; \ - } \ - } \ +#define FLUSH_BITS() \ +do { \ +if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ +/* Flush a whole word (branchlessly). */ \ +put_unaligned_leword(bitbuf, out_next); \ +bitbuf >>= bitcount & ~7; \ +out_next += bitcount >> 3; \ +bitcount &= 7; \ +} else { \ +/* Flush a byte at a time. 
*/ \ +while (bitcount >= 8) { \ +ASSERT(out_next < os->end); \ +*out_next++ = bitbuf; \ +bitcount -= 8; \ +bitbuf >>= 8; \ +} \ +} \ } while (0) /* @@ -759,21 +759,21 @@ do { \ static void heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) { - unsigned parent_idx; - unsigned child_idx; - u32 v; - - v = A[subtree_idx]; - parent_idx = subtree_idx; - while ((child_idx = parent_idx * 2) <= length) { - if (child_idx < length && A[child_idx + 1] > A[child_idx]) - child_idx++; - if (v >= A[child_idx]) - break; - A[parent_idx] = A[child_idx]; - parent_idx = child_idx; - } - A[parent_idx] = v; + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; } /* @@ -783,10 +783,10 @@ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) static void heapify_array(u32 A[], unsigned length) { - unsigned subtree_idx; - - for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) - heapify_subtree(A, length, subtree_idx); + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); } /* @@ -798,26 +798,26 @@ heapify_array(u32 A[], unsigned length) static void heap_sort(u32 A[], unsigned length) { - A--; /* Use 1-based indices */ - - heapify_array(A, length); - - while (length >= 2) { - u32 tmp = A[length]; - - A[length] = A[1]; - A[1] = tmp; - length--; - heapify_subtree(A, length, 1); - } + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + u32 tmp = A[length]; + + A[length] = A[1]; + A[1] = tmp; + length--; + heapify_subtree(A, length, 1); + } } #define NUM_SYMBOL_BITS 10 -#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) -#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) -#define FREQ_MASK (~SYMBOL_MASK) +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) -#define GET_NUM_COUNTERS(num_syms) (num_syms) +#define GET_NUM_COUNTERS(num_syms) (num_syms) /* * Sort the symbols primarily by frequency and secondarily by symbol value. @@ -827,18 +827,18 @@ heap_sort(u32 A[], unsigned length) * contain the frequency. * * @num_syms - * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. * * @freqs[num_syms] - * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. * * @lens[num_syms] - * An array that eventually will hold the length of each codeword. This - * function only fills in the codeword lengths for symbols that have zero - * frequency, which are not well defined per se but will be set to 0. + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. * * @symout[num_syms] - * The output array, described above. + * The output array, described above. * * Returns the number of entries in 'symout' that were filled. This is the * number of symbols that have nonzero frequency. 
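The sort key layout documented above (symbol in the low NUM_SYMBOL_BITS bits, frequency in the remaining high bits) means that sorting the packed u32 values orders primarily by frequency and secondarily by symbol value. A small standalone sketch; the two constants are copied from the definitions above and the rest is illustrative:

#include <assert.h>
#include <stdint.h>

#define NUM_SYMBOL_BITS 10
#define SYMBOL_MASK     ((1 << NUM_SYMBOL_BITS) - 1)

int main(void)
{
    uint32_t a = 7 | (100u << NUM_SYMBOL_BITS);  /* symbol 7, frequency 100 */
    uint32_t b = 3 | (100u << NUM_SYMBOL_BITS);  /* symbol 3, frequency 100 */
    uint32_t c = 9 | (2u << NUM_SYMBOL_BITS);    /* symbol 9, frequency 2   */

    assert(c < b && b < a);                 /* frequency first, then symbol */
    assert((a & SYMBOL_MASK) == 7);         /* symbol recovered from the low bits */
    assert((a >> NUM_SYMBOL_BITS) == 100);  /* frequency from the high bits */
    return 0;
}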
@@ -846,75 +846,75 @@ heap_sort(u32 A[], unsigned length) static unsigned sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) { - unsigned sym; - unsigned i; - unsigned num_used_syms; - unsigned num_counters; - unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; - - /* - * We use heapsort, but with an added optimization. Since often most - * symbol frequencies are low, we first do a count sort using a limited - * number of counters. High frequencies are counted in the last - * counter, and only they will be sorted with heapsort. - * - * Note: with more symbols, it is generally beneficial to have more - * counters. About 1 counter per symbol seems fastest. - */ - - num_counters = GET_NUM_COUNTERS(num_syms); - - memset(counters, 0, num_counters * sizeof(counters[0])); - - /* Count the frequencies. */ - for (sym = 0; sym < num_syms; sym++) - counters[MIN(freqs[sym], num_counters - 1)]++; - - /* - * Make the counters cumulative, ignoring the zero-th, which counted - * symbols with zero frequency. As a side effect, this calculates the - * number of symbols with nonzero frequency. - */ - num_used_syms = 0; - for (i = 1; i < num_counters; i++) { - unsigned count = counters[i]; - - counters[i] = num_used_syms; - num_used_syms += count; - } - - /* - * Sort nonzero-frequency symbols using the counters. At the same time, - * set the codeword lengths of zero-frequency symbols to 0. - */ - for (sym = 0; sym < num_syms; sym++) { - u32 freq = freqs[sym]; - - if (freq != 0) { - symout[counters[MIN(freq, num_counters - 1)]++] = - sym | (freq << NUM_SYMBOL_BITS); - } else { - lens[sym] = 0; - } - } - - /* Sort the symbols counted in the last counter. */ - heap_sort(symout + counters[num_counters - 2], - counters[num_counters - 1] - counters[num_counters - 2]); - - return num_used_syms; + unsigned sym; + unsigned i; + unsigned num_used_syms; + unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; + + /* + * We use heapsort, but with an added optimization. Since often most + * symbol frequencies are low, we first do a count sort using a limited + * number of counters. High frequencies are counted in the last + * counter, and only they will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per symbol seems fastest. + */ + + num_counters = GET_NUM_COUNTERS(num_syms); + + memset(counters, 0, num_counters * sizeof(counters[0])); + + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[MIN(freqs[sym], num_counters - 1)]++; + + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ + num_used_syms = 0; + for (i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ + for (sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + + if (freq != 0) { + symout[counters[MIN(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; } /* * Build a Huffman tree. 
* * This is an optimized implementation that - * (a) takes advantage of the frequencies being already sorted; - * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman - * tree are sufficient to generate a canonical code; - * (c) Only stores parent pointers, not child pointers; - * (d) Produces the nodes in the same memory used for input frequency - * information. + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input frequency + * information. * * Array 'A', which contains 'sym_count' entries, is used for both input and * output. For this function, 'sym_count' must be at least 2. @@ -939,59 +939,59 @@ sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) static void build_tree(u32 A[], unsigned sym_count) { - const unsigned last_idx = sym_count - 1; - - /* Index of the next lowest frequency leaf that still needs a parent */ - unsigned i = 0; - - /* - * Index of the next lowest frequency non-leaf that still needs a - * parent, or 'e' if there is currently no such node - */ - unsigned b = 0; - - /* Index of the next spot for a non-leaf (will overwrite a leaf) */ - unsigned e = 0; - - do { - u32 new_freq; - - /* - * Select the next two lowest frequency nodes among the leaves - * A[i] and non-leaves A[b], and create a new node A[e] to be - * their parent. Set the new node's frequency to the sum of the - * frequencies of its two children. - * - * Usually the next two lowest frequency nodes are of the same - * type (leaf or non-leaf), so check those cases first. - */ - if (i + 1 <= last_idx && - (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { - /* Two leaves */ - new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); - i += 2; - } else if (b + 2 <= e && - (i > last_idx || - (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { - /* Two non-leaves */ - new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); - A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); - A[b + 1] = (e << NUM_SYMBOL_BITS) | - (A[b + 1] & SYMBOL_MASK); - b += 2; - } else { - /* One leaf and one non-leaf */ - new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); - A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); - i++; - b++; - } - A[e] = new_freq | (A[e] & SYMBOL_MASK); - /* - * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the - * tree is complete once we've created 'n - 1' non-leaves. - */ - } while (++e < last_idx); + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ + unsigned i = 0; + + /* + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node + */ + unsigned b = 0; + + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ + unsigned e = 0; + + do { + u32 new_freq; + + /* + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. + * + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. 
+ */ + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); + /* + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. + */ + } while (++e < last_idx); } /* @@ -1000,94 +1000,94 @@ build_tree(u32 A[], unsigned sym_count) * into account the length-limited constraint. * * @A - * The array produced by build_tree(), containing parent index information - * for the non-leaf nodes of the Huffman tree. Each entry in this array is - * a node; a node's parent always has a greater index than that node - * itself. This function will overwrite the parent index information in - * this array, so essentially it will destroy the tree. However, the data - * in the low NUM_SYMBOL_BITS of each entry will be preserved. + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. * * @root_idx - * The 0-based index of the root node in 'A', and consequently one less - * than the number of tree node entries in 'A'. (Or, really 2 less than - * the actual length of 'A'.) + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) * * @len_counts - * An array of length ('max_codeword_len' + 1) in which the number of - * codewords having each length <= max_codeword_len will be returned. + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be returned. * * @max_codeword_len - * The maximum permissible codeword length. + * The maximum permissible codeword length. */ static void compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], - unsigned max_codeword_len) + unsigned max_codeword_len) { - unsigned len; - int node; - - /* - * The key observations are: - * - * (1) We can traverse the non-leaf nodes of the tree, always visiting a - * parent before its children, by simply iterating through the array - * in reverse order. Consequently, we can compute the depth of each - * node in one pass, overwriting the parent indices with depths. - * - * (2) We can initially assume that in the real Huffman tree, both - * children of the root are leaves. This corresponds to two - * codewords of length 1. Then, whenever we visit a (non-leaf) node - * during the traversal, we modify this assumption to account for - * the current node *not* being a leaf, but rather its two children - * being leaves. 
This causes the loss of one codeword for the - * current depth and the addition of two codewords for the current - * depth plus one. - * - * (3) We can handle the length-limited constraint fairly easily by - * simply using the largest length available when a depth exceeds - * max_codeword_len. - */ - - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - len_counts[1] = 2; - - /* Set the root node's depth to 0. */ - A[root_idx] &= SYMBOL_MASK; - - for (node = root_idx - 1; node >= 0; node--) { - - /* Calculate the depth of this node. */ - - unsigned parent = A[node] >> NUM_SYMBOL_BITS; - unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; - unsigned depth = parent_depth + 1; - - /* - * Set the depth of this node so that it is available when its - * children (if any) are processed. - */ - A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); - - /* - * If needed, decrease the length to meet the length-limited - * constraint. This is not the optimal method for generating - * length-limited Huffman codes! But it should be good enough. - */ - if (depth >= max_codeword_len) { - depth = max_codeword_len; - do { - depth--; - } while (len_counts[depth] == 0); - } - - /* - * Account for the fact that we have a non-leaf node at the - * current depth. - */ - len_counts[depth]--; - len_counts[depth + 1] += 2; - } + unsigned len; + int node; + + /* + * The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. + */ + + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ + if (depth >= max_codeword_len) { + depth = max_codeword_len; + do { + depth--; + } while (len_counts[depth] == 0); + } + + /* + * Account for the fact that we have a non-leaf node at the + * current depth. 
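A worked instance of observation (2) above, for the four-symbol frequency set {4, 2, 1, 1}: the Huffman tree has two non-root non-leaves, at depths 1 and 2, and applying the adjustment rule yields the expected codeword lengths 1, 2, 3, 3 (standalone sketch, not from the source):

#include <assert.h>

int main(void)
{
    unsigned len_counts[4 + 1] = { 0 };
    /* Depths of the non-root non-leaves for frequencies {4, 2, 1, 1}. */
    const unsigned depths[] = { 1, 2 };
    unsigned i;

    len_counts[1] = 2;  /* start by assuming both children of the root are leaves */
    for (i = 0; i < 2; i++) {
        len_counts[depths[i]]--;        /* this node turned out to be a non-leaf... */
        len_counts[depths[i] + 1] += 2; /* ...so its two children are leaves instead */
    }

    assert(len_counts[1] == 1);  /* the frequency-4 symbol gets a 1-bit codeword */
    assert(len_counts[2] == 1);  /* the frequency-2 symbol gets 2 bits */
    assert(len_counts[3] == 2);  /* the two frequency-1 symbols get 3 bits each */
    return 0;
}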
+ */ + len_counts[depth]--; + len_counts[depth + 1] += 2; + } } /* @@ -1103,51 +1103,51 @@ compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], #ifdef rbit32 static forceinline u32 reverse_codeword(u32 codeword, u8 len) { - return rbit32(codeword) >> ((32 - len) & 31); + return rbit32(codeword) >> ((32 - len) & 31); } #else /* Generated by scripts/gen_bitreverse_tab.py */ static const u8 bitreverse_tab[256] = { - 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, - 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, - 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, - 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, - 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, - 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, - 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, - 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, - 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, - 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, - 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, - 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, - 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, - 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, - 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, - 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, - 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, - 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, - 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, - 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, - 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, - 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, - 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, - 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, - 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, - 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, - 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, - 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, - 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, - 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, - 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, - 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, + 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, + 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, + 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, + 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, + 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, + 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, + 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, + 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, + 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, + 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, + 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, + 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, + 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, + 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, + 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, + 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, + 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, + 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, + 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, + 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, + 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, + 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, + 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, + 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, + 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, + 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, + 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, + 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, + 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, + 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, + 0x0f, 0x8f, 0x4f, 
0xcf, 0x2f, 0xaf, 0x6f, 0xef, + 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, }; static forceinline u32 reverse_codeword(u32 codeword, u8 len) { - STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); - codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | - bitreverse_tab[codeword >> 8]; - return codeword >> (16 - len); + STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); + codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | + bitreverse_tab[codeword >> 8]; + return codeword >> (16 - len); } #endif /* !rbit32 */ @@ -1155,98 +1155,98 @@ static forceinline u32 reverse_codeword(u32 codeword, u8 len) * Generate the codewords for a canonical Huffman code. * * @A - * The output array for codewords. In addition, initially this - * array must contain the symbols, sorted primarily by frequency and - * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of - * each entry. + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. * * @len - * Output array for codeword lengths. + * Output array for codeword lengths. * * @len_counts - * An array that provides the number of codewords that will have - * each possible length <= max_codeword_len. + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. * * @max_codeword_len - * Maximum length, in bits, of each codeword. + * Maximum length, in bits, of each codeword. * * @num_syms - * Number of symbols in the alphabet, including symbols with zero - * frequency. This is the length of the 'A' and 'len' arrays. + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. */ static void gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], - unsigned max_codeword_len, unsigned num_syms) + unsigned max_codeword_len, unsigned num_syms) { - u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned i; - unsigned len; - unsigned sym; - - /* - * Given the number of codewords that will have each length, assign - * codeword lengths to symbols. We do this by assigning the lengths in - * decreasing order to the symbols sorted primarily by increasing - * frequency and secondarily by increasing symbol value. - */ - for (i = 0, len = max_codeword_len; len >= 1; len--) { - unsigned count = len_counts[len]; - - while (count--) - lens[A[i++] & SYMBOL_MASK] = len; - } - - /* - * Generate the codewords themselves. We initialize the - * 'next_codewords' array to provide the lexicographically first - * codeword of each length, then assign codewords in symbol order. This - * produces a canonical code. - */ - next_codewords[0] = 0; - next_codewords[1] = 0; - for (len = 2; len <= max_codeword_len; len++) - next_codewords[len] = - (next_codewords[len - 1] + len_counts[len - 1]) << 1; - - for (sym = 0; sym < num_syms; sym++) { - /* DEFLATE requires bit-reversed codewords. */ - A[sym] = reverse_codeword(next_codewords[lens[sym]]++, - lens[sym]); - } + u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. 
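Since DEFLATE writes Huffman codewords LSB-first, the canonical codewords must be bit-reversed before use; the table-based and rbit32 variants above are fast ways of doing what this naive standalone sketch does:

#include <assert.h>
#include <stdint.h>

/* Naive LSB-first reversal of a 'len'-bit codeword (illustrative only). */
static uint32_t reverse_codeword_slow(uint32_t codeword, unsigned len)
{
    uint32_t reversed = 0;
    unsigned i;

    for (i = 0; i < len; i++) {
        reversed = (reversed << 1) | (codeword & 1);
        codeword >>= 1;
    }
    return reversed;
}

int main(void)
{
    assert(reverse_codeword_slow(0x1, 3) == 0x4);  /* 001 -> 100 */
    assert(reverse_codeword_slow(0x6, 4) == 0x6);  /* 0110 is a palindrome */
    return 0;
}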
+ */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* + * Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. + */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (sym = 0; sym < num_syms; sym++) { + /* DEFLATE requires bit-reversed codewords. */ + A[sym] = reverse_codeword(next_codewords[lens[sym]]++, + lens[sym]); + } } /* * --------------------------------------------------------------------- - * deflate_make_huffman_code() + * deflate_make_huffman_code() * --------------------------------------------------------------------- * * Given an alphabet and the frequency of each symbol in it, construct a * length-limited canonical Huffman code. * * @num_syms - * The number of symbols in the alphabet. The symbols are the integers in - * the range [0, num_syms - 1]. This parameter must be at least 2 and - * must not exceed (1 << NUM_SYMBOL_BITS). + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * must not exceed (1 << NUM_SYMBOL_BITS). * * @max_codeword_len - * The maximum permissible codeword length. + * The maximum permissible codeword length. * * @freqs - * An array of length @num_syms that gives the frequency of each symbol. - * It is valid for some, none, or all of the frequencies to be 0. The sum - * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. * * @lens - * An array of @num_syms entries in which this function will return the - * length, in bits, of the codeword assigned to each symbol. Symbols with - * 0 frequency will not have codewords per se, but their entries in this - * array will be set to 0. No lengths greater than @max_codeword_len will - * be assigned. + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. * * @codewords - * An array of @num_syms entries in which this function will return the - * codeword for each symbol, right-justified and padded on the left with - * zeroes. Codewords for symbols with 0 frequency will be undefined. + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. * * --------------------------------------------------------------------- * @@ -1300,13 +1300,13 @@ gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], * with depth information as part of the process of extracting codeword lengths * from the tree. 
So in summary, we do NOT need a big structure like: * - * struct huffman_tree_node { - * unsigned int symbol; - * unsigned int frequency; - * unsigned int depth; - * struct huffman_tree_node *left_child; - * struct huffman_tree_node *right_child; - * }; + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; * * * ... which often gets used in "naive" implementations of Huffman code @@ -1317,82 +1317,82 @@ gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], */ static void deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[], u8 lens[], u32 codewords[]) + const u32 freqs[], u8 lens[], u32 codewords[]) { - u32 *A = codewords; - unsigned num_used_syms; - - STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); - STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); - - /* - * We begin by sorting the symbols primarily by frequency and - * secondarily by symbol value. As an optimization, the array used for - * this purpose ('A') shares storage with the space in which we will - * eventually return the codewords. - */ - num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* - * 'num_used_syms' is the number of symbols with nonzero frequency. - * This may be less than @num_syms. 'num_used_syms' is also the number - * of entries in 'A' that are valid. Each entry consists of a distinct - * symbol and a nonzero frequency packed into a 32-bit integer. - */ - - /* - * A complete Huffman code must contain at least 2 codewords. Yet, it's - * possible that fewer than 2 symbols were used. When this happens, - * it's usually for the offset code (0-1 symbols used). But it's also - * theoretically possible for the litlen and pre codes (1 symbol used). - * - * The DEFLATE RFC explicitly allows the offset code to contain just 1 - * codeword, or even be completely empty. But it's silent about the - * other codes. It also doesn't say whether, in the 1-codeword case, - * the codeword (which it says must be 1 bit) is '0' or '1'. - * - * In any case, some DEFLATE decompressors reject these cases. zlib - * generally allows them, but it does reject precodes that have just 1 - * codeword. More problematically, zlib v1.2.1 and earlier rejected - * empty offset codes, and this behavior can also be seen in Windows - * Explorer's ZIP unpacker (supposedly even still in Windows 11). - * - * Other DEFLATE compressors, including zlib, always send at least 2 - * codewords in order to make a complete Huffman code. Therefore, this - * is a case where practice does not entirely match the specification. - * We follow practice by generating 2 codewords of length 1: codeword - * '0' for symbol 0, and codeword '1' for another symbol -- the used - * symbol if it exists and is not symbol 0, otherwise symbol 1. This - * does worsen the compression ratio by having to send an unnecessary - * offset codeword length. But this only affects rare cases such as - * blocks containing all literals, and it only makes a tiny difference. - */ - if (unlikely(num_used_syms < 2)) { - unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; - unsigned nonzero_idx = sym ? sym : 1; - - codewords[0] = 0; - lens[0] = 1; - codewords[nonzero_idx] = 1; - lens[nonzero_idx] = 1; - return; - } - - /* - * Build a stripped-down version of the Huffman tree, sharing the array - * 'A' with the symbol values. 
Then extract length counts from the tree - * and use them to generate the final codewords. - */ - - build_tree(A, num_used_syms); - - { - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - - compute_length_counts(A, num_used_syms - 2, - len_counts, max_codeword_len); - - gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); - } + u32 *A = codewords; + unsigned num_used_syms; + + STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); + + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ + + /* + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. When this happens, + * it's usually for the offset code (0-1 symbols used). But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. + */ + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. 
+ */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } } /* @@ -1402,7 +1402,7 @@ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, static void deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) { - memset(&c->freqs, 0, sizeof(c->freqs)); + memset(&c->freqs, 0, sizeof(c->freqs)); } /* @@ -1413,145 +1413,145 @@ deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) */ static void deflate_make_huffman_codes(const struct deflate_freqs *freqs, - struct deflate_codes *codes) + struct deflate_codes *codes) { - deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, - MAX_LITLEN_CODEWORD_LEN, - freqs->litlen, - codes->lens.litlen, - codes->codewords.litlen); - - deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, - MAX_OFFSET_CODEWORD_LEN, - freqs->offset, - codes->lens.offset, - codes->codewords.offset); + deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, + MAX_LITLEN_CODEWORD_LEN, + freqs->litlen, + codes->lens.litlen, + codes->codewords.litlen); + + deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, + MAX_OFFSET_CODEWORD_LEN, + freqs->offset, + codes->lens.offset, + codes->codewords.offset); } /* Initialize c->static_codes. */ static void deflate_init_static_codes(struct libdeflate_compressor *c) { - unsigned i; - - for (i = 0; i < 144; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - for (; i < 256; i++) - c->freqs.litlen[i] = 1 << (9 - 9); - for (; i < 280; i++) - c->freqs.litlen[i] = 1 << (9 - 7); - for (; i < 288; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - - for (i = 0; i < 32; i++) - c->freqs.offset[i] = 1 << (5 - 5); - - deflate_make_huffman_codes(&c->freqs, &c->static_codes); + unsigned i; + + for (i = 0; i < 144; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + for (; i < 256; i++) + c->freqs.litlen[i] = 1 << (9 - 9); + for (; i < 280; i++) + c->freqs.litlen[i] = 1 << (9 - 7); + for (; i < 288; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + + for (i = 0; i < 32; i++) + c->freqs.offset[i] = 1 << (5 - 5); + + deflate_make_huffman_codes(&c->freqs, &c->static_codes); } /* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned deflate_get_offset_slot(u32 offset) { - /* - * 1 <= offset <= 32768 here. For 1 <= offset <= 256, - * deflate_offset_slot[offset - 1] gives the slot. - * - * For 257 <= offset <= 32768, we take advantage of the fact that 257 is - * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == - * 128 times larger than each slot [2..16) (since the number of extra - * bits increases by 1 every 2 slots). Thus, the slot is: - * - * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) - * == deflate_offset_slot[((offset - 1) >> 7)] + 14 - * - * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: - * - * deflate_offset_slot[(offset - 1) >> n] + (n << 1) - * - * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with - * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. - */ - unsigned n = (256 - offset) >> 29; - - return deflate_offset_slot[(offset - 1) >> n] + (n << 1); + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. 
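/*
 * Standalone sanity check, for illustration, of the branchless selector
 * described above: with 32-bit unsigned arithmetic, (256 - offset) >> 29 is 0
 * for offset <= 256 and 7 for every DEFLATE offset in 257..32768, so it can
 * stand in for '(offset <= 256) ? 0 : 7'.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t offset;

	for (offset = 1; offset <= 32768; offset++) {
		unsigned branchy = (offset <= 256) ? 0 : 7;
		unsigned branchless = (uint32_t)(256 - offset) >> 29;

		/*
		 * For offset <= 256 the subtraction stays small and
		 * nonnegative, so the shift yields 0.  For larger offsets it
		 * wraps to at least 0xFFFF8100, whose top three bits are all
		 * ones, so the shift yields 7.
		 */
		assert(branchy == branchless);
	}
	return 0;
}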
+ * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. + */ + unsigned n = (256 - offset) >> 29; + + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); } static unsigned deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, - u32 precode_freqs[], unsigned precode_items[]) + u32 precode_freqs[], unsigned precode_items[]) { - unsigned *itemptr; - unsigned run_start; - unsigned run_end; - unsigned extra_bits; - u8 len; - - memset(precode_freqs, 0, - DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); - - itemptr = precode_items; - run_start = 0; - do { - /* Find the next run of codeword lengths. */ - - /* len = the length being repeated */ - len = lens[run_start]; - - /* Extend the run. */ - run_end = run_start; - do { - run_end++; - } while (run_end != num_lens && len == lens[run_end]); - - if (len == 0) { - /* Run of zeroes. */ - - /* Symbol 18: RLE 11 to 138 zeroes at a time. */ - while ((run_end - run_start) >= 11) { - extra_bits = MIN((run_end - run_start) - 11, - 0x7F); - precode_freqs[18]++; - *itemptr++ = 18 | (extra_bits << 5); - run_start += 11 + extra_bits; - } - - /* Symbol 17: RLE 3 to 10 zeroes at a time. */ - if ((run_end - run_start) >= 3) { - extra_bits = MIN((run_end - run_start) - 3, - 0x7); - precode_freqs[17]++; - *itemptr++ = 17 | (extra_bits << 5); - run_start += 3 + extra_bits; - } - } else { - - /* A run of nonzero lengths. */ - - /* Symbol 16: RLE 3 to 6 of the previous length. */ - if ((run_end - run_start) >= 4) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - do { - extra_bits = MIN((run_end - run_start) - - 3, 0x3); - precode_freqs[16]++; - *itemptr++ = 16 | (extra_bits << 5); - run_start += 3 + extra_bits; - } while ((run_end - run_start) >= 3); - } - } - - /* Output any remaining lengths without RLE. */ - while (run_start != run_end) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - } - } while (run_start != num_lens); - - return itemptr - precode_items; + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + u8 len; + + memset(precode_freqs, 0, + DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); + + itemptr = precode_items; + run_start = 0; + do { + /* Find the next run of codeword lengths. */ + + /* len = the length being repeated */ + len = lens[run_start]; + + /* Extend the run. */ + run_end = run_start; + do { + run_end++; + } while (run_end != num_lens && len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + while ((run_end - run_start) >= 11) { + extra_bits = MIN((run_end - run_start) - 11, + 0x7F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 11 + extra_bits; + } + + /* Symbol 17: RLE 3 to 10 zeroes at a time. 
*/ + if ((run_end - run_start) >= 3) { + extra_bits = MIN((run_end - run_start) - 3, + 0x7); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 3 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 16: RLE 3 to 6 of the previous length. */ + if ((run_end - run_start) >= 4) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + do { + extra_bits = MIN((run_end - run_start) - + 3, 0x3); + precode_freqs[16]++; + *itemptr++ = 16 | (extra_bits << 5); + run_start += 3 + extra_bits; + } while ((run_end - run_start) >= 3); + } + } + + /* Output any remaining lengths without RLE. */ + while (run_start != run_end) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + } + } while (run_start != num_lens); + + return itemptr - precode_items; } /* @@ -1568,64 +1568,64 @@ deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, static void deflate_precompute_huffman_header(struct libdeflate_compressor *c) { - /* Compute how many litlen and offset symbols are needed. */ - - for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; - c->o.precode.num_litlen_syms > 257; - c->o.precode.num_litlen_syms--) - if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) - break; - - for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; - c->o.precode.num_offset_syms > 1; - c->o.precode.num_offset_syms--) - if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) - break; - - /* - * If we're not using the full set of literal/length codeword lengths, - * then temporarily move the offset codeword lengths over so that the - * literal/length and offset codeword lengths are contiguous. - */ - STATIC_ASSERT(offsetof(struct deflate_lens, offset) == - DEFLATE_NUM_LITLEN_SYMS); - if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, - (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - c->o.precode.num_offset_syms); - } - - /* - * Compute the "items" (RLE / literal tokens and extra bits) with which - * the codeword lengths in the larger code will be output. - */ - c->o.precode.num_items = - deflate_compute_precode_items((u8 *)&c->codes.lens, - c->o.precode.num_litlen_syms + - c->o.precode.num_offset_syms, - c->o.precode.freqs, - c->o.precode.items); - - /* Build the precode. */ - deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, - MAX_PRE_CODEWORD_LEN, - c->o.precode.freqs, c->o.precode.lens, - c->o.precode.codewords); - - /* Count how many precode lengths we actually need to output. */ - for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; - c->o.precode.num_explicit_lens > 4; - c->o.precode.num_explicit_lens--) - if (c->o.precode.lens[deflate_precode_lens_permutation[ - c->o.precode.num_explicit_lens - 1]] != 0) - break; - - /* Restore the offset codeword lengths if needed. */ - if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, - c->o.precode.num_offset_syms); - } + /* Compute how many litlen and offset symbols are needed. 
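/*
 * Illustrative decoder, not used by the compressor, for the packed precode
 * "items" built above: the low 5 bits hold the precode symbol and the
 * remaining bits hold the extra-bits value.  Per RFC 1951, symbol 16 repeats
 * the previous length 3-6 times (2 extra bits), 17 emits 3-10 zero lengths
 * (3 extra bits), and 18 emits 11-138 zero lengths (7 extra bits).
 */
#include <assert.h>
#include <stdio.h>

/* Expand one packed item; returns how many codeword lengths it represents. */
static unsigned demo_expand_item(unsigned item, unsigned prev_len,
				 unsigned *len_out)
{
	unsigned sym = item & 0x1F;
	unsigned extra = item >> 5;

	if (sym == 16) {	/* repeat the previous length 3-6 times */
		*len_out = prev_len;
		return 3 + extra;
	}
	if (sym == 17) {	/* repeat zero 3-10 times */
		*len_out = 0;
		return 3 + extra;
	}
	if (sym == 18) {	/* repeat zero 11-138 times */
		*len_out = 0;
		return 11 + extra;
	}
	*len_out = sym;		/* an explicit length 0-15 */
	return 1;
}

int main(void)
{
	unsigned len, count;

	/* '18 | (20 << 5)' stands for a run of 11 + 20 = 31 zero lengths. */
	count = demo_expand_item(18 | (20 << 5), 0, &len);
	assert(count == 31 && len == 0);

	/* '16 | (1 << 5)' repeats the previous length 3 + 1 = 4 times. */
	count = demo_expand_item(16 | (1 << 5), 8, &len);
	assert(count == 4 && len == 8);

	printf("precode item checks passed\n");
	return 0;
}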
*/ + + for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + c->o.precode.num_litlen_syms > 257; + c->o.precode.num_litlen_syms--) + if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) + break; + + for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + c->o.precode.num_offset_syms > 1; + c->o.precode.num_offset_syms--) + if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) + break; + + /* + * If we're not using the full set of literal/length codeword lengths, + * then temporarily move the offset codeword lengths over so that the + * literal/length and offset codeword lengths are contiguous. + */ + STATIC_ASSERT(offsetof(struct deflate_lens, offset) == + DEFLATE_NUM_LITLEN_SYMS); + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + c->o.precode.num_offset_syms); + } + + /* + * Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. + */ + c->o.precode.num_items = + deflate_compute_precode_items((u8 *)&c->codes.lens, + c->o.precode.num_litlen_syms + + c->o.precode.num_offset_syms, + c->o.precode.freqs, + c->o.precode.items); + + /* Build the precode. */ + deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, + MAX_PRE_CODEWORD_LEN, + c->o.precode.freqs, c->o.precode.lens, + c->o.precode.codewords); + + /* Count how many precode lengths we actually need to output. */ + for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + c->o.precode.num_explicit_lens > 4; + c->o.precode.num_explicit_lens--) + if (c->o.precode.lens[deflate_precode_lens_permutation[ + c->o.precode.num_explicit_lens - 1]] != 0) + break; + + /* Restore the offset codeword lengths if needed. */ + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + c->o.precode.num_offset_syms); + } } /* @@ -1635,60 +1635,60 @@ deflate_precompute_huffman_header(struct libdeflate_compressor *c) */ static void deflate_compute_full_len_codewords(struct libdeflate_compressor *c, - const struct deflate_codes *codes) + const struct deflate_codes *codes) { - unsigned len; - - STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); - - for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { - unsigned slot = deflate_length_slot[len]; - unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; - u32 extra_bits = len - deflate_length_slot_base[slot]; - - c->o.length.codewords[len] = - codes->codewords.litlen[litlen_sym] | - (extra_bits << codes->lens.litlen[litlen_sym]); - c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + - deflate_extra_length_bits[slot]; - } + unsigned len; + + STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); + + for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { + unsigned slot = deflate_length_slot[len]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; + u32 extra_bits = len - deflate_length_slot_base[slot]; + + c->o.length.codewords[len] = + codes->codewords.litlen[litlen_sym] | + (extra_bits << codes->lens.litlen[litlen_sym]); + c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + + deflate_extra_length_bits[slot]; + } } /* Write a match to the output buffer. 
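/*
 * A minimal sketch, for illustration, of the fixed-width dynamic-block header
 * fields that the counts computed above feed into (RFC 1951): HLIT =
 * num_litlen_syms - 257 in 5 bits, HDIST = num_offset_syms - 1 in 5 bits,
 * HCLEN = num_explicit_lens - 4 in 4 bits, followed by 3 bits per explicit
 * precode length in the order 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3,
 * 13, 2, 14, 1, 15.  The counts below are hypothetical example values.
 */
#include <assert.h>
#include <stdio.h>

/* RFC 1951 transmission order for the precode codeword lengths. */
static const unsigned char demo_precode_order[19] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
};

int main(void)
{
	unsigned num_litlen_syms = 280;		/* 257..286 per RFC 1951 */
	unsigned num_offset_syms = 24;		/* 1..32 */
	unsigned num_explicit_lens = 15;	/* 4..19 */

	unsigned hlit = num_litlen_syms - 257;	/* 5-bit field */
	unsigned hdist = num_offset_syms - 1;	/* 5-bit field */
	unsigned hclen = num_explicit_lens - 4;	/* 4-bit field */
	unsigned header_bits = 5 + 5 + 4 + 3 * num_explicit_lens;

	assert(hlit < 32 && hdist < 32 && hclen < 16);
	printf("HLIT=%u HDIST=%u HCLEN=%u; first precode length sent is for "
	       "symbol %u; %u header bits before the precoded lengths\n",
	       hlit, hdist, hclen, demo_precode_order[0], header_bits);
	return 0;
}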
*/ -#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ -do { \ - const struct libdeflate_compressor *c__ = (c_); \ - const struct deflate_codes *codes__ = (codes_); \ - unsigned length__ = (length_); \ - unsigned offset__ = (offset_); \ - unsigned offset_slot__ = (offset_slot_); \ - \ - /* Litlen symbol and extra length bits */ \ - STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ - ADD_BITS(c__->o.length.codewords[length__], \ - c__->o.length.lens[length__]); \ - \ - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS + \ - MAX_OFFSET_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ - FLUSH_BITS(); \ - \ - /* Offset symbol */ \ - ADD_BITS(codes__->codewords.offset[offset_slot__], \ - codes__->lens.offset[offset_slot__]); \ - \ - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ - FLUSH_BITS(); \ - \ - /* Extra offset bits */ \ - ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ - deflate_extra_offset_bits[offset_slot__]); \ - \ - FLUSH_BITS(); \ +#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ +do { \ +const struct libdeflate_compressor *c__ = (c_); \ +const struct deflate_codes *codes__ = (codes_); \ +unsigned length__ = (length_); \ +unsigned offset__ = (offset_); \ +unsigned offset_slot__ = (offset_slot_); \ +\ +/* Litlen symbol and extra length bits */ \ +STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ +ADD_BITS(c__->o.length.codewords[length__], \ +c__->o.length.lens[length__]); \ +\ +if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS + \ +MAX_OFFSET_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ +FLUSH_BITS(); \ +\ +/* Offset symbol */ \ +ADD_BITS(codes__->codewords.offset[offset_slot__], \ +codes__->lens.offset[offset_slot__]); \ +\ +if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ +FLUSH_BITS(); \ +\ +/* Extra offset bits */ \ +ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ +deflate_extra_offset_bits[offset_slot__]); \ +\ +FLUSH_BITS(); \ } while (0) /* @@ -1703,349 +1703,349 @@ do { \ */ static void deflate_flush_block(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 *block_begin, u32 block_length, - const struct deflate_sequence *sequences, - bool is_final_block) + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) { - /* - * It is hard to get compilers to understand that writes to 'os->next' - * don't alias 'os'. That hurts performance significantly, as - * everything in 'os' would keep getting re-loaded. ('restrict' - * *should* do the trick, but it's unreliable.) Therefore, we keep all - * the output bitstream state in local variables, and output bits using - * macros. This is similar to what the decompressor does. - */ - const u8 *in_next = block_begin; - const u8 * const in_end = block_begin + block_length; - bitbuf_t bitbuf = os->bitbuf; - unsigned bitcount = os->bitcount; - u8 *out_next = os->next; - u8 * const out_fast_end = - os->end - MIN(WORDBYTES - 1, os->end - out_next); - /* - * The cost for each block type, in bits. Start with the cost of the - * block header which is 3 bits. 
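/*
 * A reference mapping, for illustration only, from a match to the symbols and
 * extra bits that WRITE_MATCH emits, using the base/extra-bit tables from
 * RFC 1951 rather than this file's lookup tables: a length in 3..258 selects
 * a litlen symbol in 257..285 plus 0-5 extra bits, and an offset in 1..32768
 * selects one of 30 offset slots plus 0-13 extra bits.
 */
#include <assert.h>
#include <stdio.h>

/* RFC 1951 base values and extra-bit counts for length symbols 257..285. */
static const unsigned demo_len_base[29] = {
	3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51,
	59, 67, 83, 99, 115, 131, 163, 195, 227, 258,
};
static const unsigned demo_len_extra[29] = {
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
	4, 5, 5, 5, 5, 0,
};

/* RFC 1951 base values and extra-bit counts for offset slots 0..29. */
static const unsigned demo_off_base[30] = {
	1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385,
	513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385,
	24577,
};
static const unsigned demo_off_extra[30] = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
	10, 11, 11, 12, 12, 13, 13,
};

/* Return the largest index whose base value does not exceed 'value'. */
static unsigned demo_slot(const unsigned *base, unsigned n, unsigned value)
{
	unsigned i = 0;

	while (i + 1 < n && base[i + 1] <= value)
		i++;
	return i;
}

int main(void)
{
	unsigned length = 18, offset = 100;
	unsigned lslot = demo_slot(demo_len_base, 29, length);
	unsigned oslot = demo_slot(demo_off_base, 30, offset);

	/* length 18 -> litlen symbol 268, 1 extra bit with value 1 */
	assert(257 + lslot == 268);
	assert(demo_len_extra[lslot] == 1 &&
	       length - demo_len_base[lslot] == 1);

	/* offset 100 -> offset slot 13, 5 extra bits with value 3 */
	assert(oslot == 13);
	assert(demo_off_extra[oslot] == 5 &&
	       offset - demo_off_base[oslot] == 3);

	printf("match (len=%u, off=%u): litlen sym %u + %u extra bits, "
	       "offset slot %u + %u extra bits\n",
	       length, offset, 257 + lslot, demo_len_extra[lslot],
	       oslot, demo_off_extra[oslot]);
	return 0;
}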
- */ - u32 dynamic_cost = 3; - u32 static_cost = 3; - u32 uncompressed_cost = 3; - u32 best_cost; - struct deflate_codes *codes; - unsigned sym; - - ASSERT(block_length >= MIN_BLOCK_LENGTH || - (is_final_block && block_length > 0)); - ASSERT(block_length <= MAX_BLOCK_LENGTH); - ASSERT(bitcount <= 7); - ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); - ASSERT(out_next <= os->end); - ASSERT(!os->overflow); - - /* Precompute the precode items and build the precode. */ - deflate_precompute_huffman_header(c); - - /* Account for the cost of encoding dynamic Huffman codes. */ - dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); - for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { - u32 extra = deflate_extra_precode_bits[sym]; - - dynamic_cost += c->o.precode.freqs[sym] * - (extra + c->o.precode.lens[sym]); - } - - /* Account for the cost of encoding literals. */ - for (sym = 0; sym < 144; sym++) { - dynamic_cost += c->freqs.litlen[sym] * - c->codes.lens.litlen[sym]; - static_cost += c->freqs.litlen[sym] * 8; - } - for (; sym < 256; sym++) { - dynamic_cost += c->freqs.litlen[sym] * - c->codes.lens.litlen[sym]; - static_cost += c->freqs.litlen[sym] * 9; - } - - /* Account for the cost of encoding the end-of-block symbol. */ - dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; - static_cost += 7; - - /* Account for the cost of encoding lengths. */ - for (sym = DEFLATE_FIRST_LEN_SYM; - sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); - sym++) { - u32 extra = deflate_extra_length_bits[ - sym - DEFLATE_FIRST_LEN_SYM]; - - dynamic_cost += c->freqs.litlen[sym] * - (extra + c->codes.lens.litlen[sym]); - static_cost += c->freqs.litlen[sym] * - (extra + c->static_codes.lens.litlen[sym]); - } - - /* Account for the cost of encoding offsets. */ - for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { - u32 extra = deflate_extra_offset_bits[sym]; - - dynamic_cost += c->freqs.offset[sym] * - (extra + c->codes.lens.offset[sym]); - static_cost += c->freqs.offset[sym] * (extra + 5); - } - - /* Compute the cost of using uncompressed blocks. */ - uncompressed_cost += (-(bitcount + 3) & 7) + 32 + - (40 * (DIV_ROUND_UP(block_length, - UINT16_MAX) - 1)) + - (8 * block_length); - - /* - * Choose and output the cheapest type of block. If there is a tie, - * prefer uncompressed, then static, then dynamic. - */ - - best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); - - /* If the block isn't going to fit, then stop early. */ - if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { - os->overflow = true; - return; - } - /* - * Else, now we know that the block fits, so no further bounds checks on - * the output buffer are required until the next block. - */ - - if (best_cost == uncompressed_cost) { - /* - * Uncompressed block(s). DEFLATE limits the length of - * uncompressed blocks to UINT16_MAX bytes, so if the length of - * the "block" we're flushing is over UINT16_MAX, we actually - * output multiple blocks. - */ - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = is_final_block; - len = in_end - in_next; - } - /* It was already checked that there is enough space. */ - ASSERT(os->end - out_next >= - DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); - /* - * Output BFINAL (1 bit) and BTYPE (2 bits), then align - * to a byte boundary. 
- */ - STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); - *out_next++ = (bfinal << bitcount) | bitbuf; - if (bitcount > 5) - *out_next++ = 0; - bitbuf = 0; - bitcount = 0; - /* Output LEN and NLEN, then the data itself. */ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - /* Done outputting uncompressed block(s) */ - goto out; - } - - if (best_cost == static_cost) { - /* Static Huffman block */ - codes = &c->static_codes; - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); - FLUSH_BITS(); - } else { - const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; - const unsigned num_precode_items = c->o.precode.num_items; - unsigned precode_sym, precode_item; - unsigned i; - - /* Dynamic Huffman block */ - - codes = &c->codes; - STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); - ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); - ADD_BITS(c->o.precode.num_offset_syms - 1, 5); - ADD_BITS(num_explicit_lens - 4, 4); - - /* Output the lengths of the codewords in the precode. */ - if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { - /* - * A 64-bit bitbuffer is just one bit too small to hold - * the maximum number of precode lens, so to minimize - * flushes we merge one len with the previous fields. - */ - precode_sym = deflate_precode_lens_permutation[0]; - ADD_BITS(c->o.precode.lens[precode_sym], 3); - FLUSH_BITS(); - i = 1; /* num_explicit_lens >= 4 */ - do { - precode_sym = - deflate_precode_lens_permutation[i]; - ADD_BITS(c->o.precode.lens[precode_sym], 3); - } while (++i < num_explicit_lens); - FLUSH_BITS(); - } else { - FLUSH_BITS(); - i = 0; - do { - precode_sym = - deflate_precode_lens_permutation[i]; - ADD_BITS(c->o.precode.lens[precode_sym], 3); - FLUSH_BITS(); - } while (++i < num_explicit_lens); - } - - /* - * Output the lengths of the codewords in the litlen and offset - * codes, encoded by the precode. - */ - i = 0; - do { - precode_item = c->o.precode.items[i]; - precode_sym = precode_item & 0x1F; - STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); - ADD_BITS(c->o.precode.codewords[precode_sym], - c->o.precode.lens[precode_sym]); - ADD_BITS(precode_item >> 5, - deflate_extra_precode_bits[precode_sym]); - FLUSH_BITS(); - } while (++i < num_precode_items); - } - - /* Output the literals and matches for a dynamic or static block. */ - ASSERT(bitcount <= 7); - deflate_compute_full_len_codewords(c, codes); + /* + * It is hard to get compilers to understand that writes to 'os->next' + * don't alias 'os'. That hurts performance significantly, as + * everything in 'os' would keep getting re-loaded. ('restrict' + * *should* do the trick, but it's unreliable.) Therefore, we keep all + * the output bitstream state in local variables, and output bits using + * macros. This is similar to what the decompressor does. + */ + const u8 *in_next = block_begin; + const u8 * const in_end = block_begin + block_length; + bitbuf_t bitbuf = os->bitbuf; + unsigned bitcount = os->bitcount; + u8 *out_next = os->next; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. 
+ */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; + u32 best_cost; + struct deflate_codes *codes; + unsigned sym; + + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); + ASSERT(block_length <= MAX_BLOCK_LENGTH); + ASSERT(bitcount <= 7); + ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); + + /* Precompute the precode items and build the precode. */ + deflate_precompute_huffman_header(c); + + /* Account for the cost of encoding dynamic Huffman codes. */ + dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + u32 extra = deflate_extra_precode_bits[sym]; + + dynamic_cost += c->o.precode.freqs[sym] * + (extra + c->o.precode.lens[sym]); + } + + /* Account for the cost of encoding literals. */ + for (sym = 0; sym < 144; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 8; + } + for (; sym < 256; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 9; + } + + /* Account for the cost of encoding the end-of-block symbol. */ + dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; + static_cost += 7; + + /* Account for the cost of encoding lengths. */ + for (sym = DEFLATE_FIRST_LEN_SYM; + sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); + sym++) { + u32 extra = deflate_extra_length_bits[ + sym - DEFLATE_FIRST_LEN_SYM]; + + dynamic_cost += c->freqs.litlen[sym] * + (extra + c->codes.lens.litlen[sym]); + static_cost += c->freqs.litlen[sym] * + (extra + c->static_codes.lens.litlen[sym]); + } + + /* Account for the cost of encoding offsets. */ + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { + u32 extra = deflate_extra_offset_bits[sym]; + + dynamic_cost += c->freqs.offset[sym] * + (extra + c->codes.lens.offset[sym]); + static_cost += c->freqs.offset[sym] * (extra + 5); + } + + /* Compute the cost of using uncompressed blocks. */ + uncompressed_cost += (-(bitcount + 3) & 7) + 32 + + (40 * (DIV_ROUND_UP(block_length, + UINT16_MAX) - 1)) + + (8 * block_length); + + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. 
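/*
 * Worked example, for illustration, of the uncompressed-cost formula above:
 * 3 header bits, padding up to the next byte boundary, 32 bits of LEN/NLEN
 * per stored block, 40 more bits for every additional stored block once the
 * data exceeds UINT16_MAX bytes, and 8 bits per input byte.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Bit cost of storing 'block_length' bytes starting at bit offset 'bitcount'. */
static uint32_t demo_uncompressed_cost(unsigned bitcount, uint32_t block_length)
{
	return 3 +				/* BFINAL and BTYPE        */
	       (-(bitcount + 3) & 7) +		/* pad to a byte boundary  */
	       32 +				/* LEN and NLEN            */
	       40 * (DEMO_DIV_ROUND_UP(block_length, UINT16_MAX) - 1) +
	       8 * block_length;		/* the stored bytes        */
}

int main(void)
{
	/*
	 * 1000 bytes flushed on a byte boundary: 1 header byte, 4 bytes of
	 * LEN/NLEN and 1000 data bytes = 1005 bytes = 8040 bits.
	 */
	assert(demo_uncompressed_cost(0, 1000) == 8040);
	printf("uncompressed-cost check passed\n");
	return 0;
}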
+ */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { + const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; + const unsigned num_precode_items = c->o.precode.num_items; + unsigned precode_sym, precode_item; + unsigned i; + + /* Dynamic Huffman block */ + + codes = &c->codes; + STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); + ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); + ADD_BITS(c->o.precode.num_offset_syms - 1, 5); + ADD_BITS(num_explicit_lens - 4, 4); + + /* Output the lengths of the codewords in the precode. */ + if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + /* + * A 64-bit bitbuffer is just one bit too small to hold + * the maximum number of precode lens, so to minimize + * flushes we merge one len with the previous fields. + */ + precode_sym = deflate_precode_lens_permutation[0]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + i = 1; /* num_explicit_lens >= 4 */ + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + } while (++i < num_explicit_lens); + FLUSH_BITS(); + } else { + FLUSH_BITS(); + i = 0; + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + } while (++i < num_explicit_lens); + } + + /* + * Output the lengths of the codewords in the litlen and offset + * codes, encoded by the precode. + */ + i = 0; + do { + precode_item = c->o.precode.items[i]; + precode_sym = precode_item & 0x1F; + STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); + ADD_BITS(c->o.precode.codewords[precode_sym], + c->o.precode.lens[precode_sym]); + ADD_BITS(precode_item >> 5, + deflate_extra_precode_bits[precode_sym]); + FLUSH_BITS(); + } while (++i < num_precode_items); + } + + /* Output the literals and matches for a dynamic or static block. 
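/*
 * A minimal, self-contained sketch of the LSB-first bit-accumulation pattern
 * behind the ADD_BITS()/FLUSH_BITS() macros used in this function.  For
 * illustration only: it assumes a 64-bit buffer and an output array known to
 * be large enough, and omits the bounds handling the real code needs.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct demo_bitwriter {
	uint64_t bitbuf;	/* bits not yet written, LSB first */
	unsigned bitcount;	/* number of valid bits in bitbuf  */
	uint8_t *next;		/* next output byte                */
};

static void demo_add_bits(struct demo_bitwriter *w, uint32_t bits, unsigned n)
{
	w->bitbuf |= (uint64_t)bits << w->bitcount;
	w->bitcount += n;
}

static void demo_flush_bits(struct demo_bitwriter *w)
{
	while (w->bitcount >= 8) {
		*w->next++ = (uint8_t)w->bitbuf;
		w->bitbuf >>= 8;
		w->bitcount -= 8;
	}
}

int main(void)
{
	uint8_t out[8] = { 0 };
	struct demo_bitwriter w = { 0, 0, out };

	/* BFINAL=1, BTYPE=01 (static), then a 7-bit codeword of all zeroes. */
	demo_add_bits(&w, 1, 1);
	demo_add_bits(&w, 1, 2);
	demo_add_bits(&w, 0, 7);
	demo_flush_bits(&w);

	assert(out[0] == 0x03 && w.bitcount == 2);
	printf("first output byte: 0x%02x, %u bits still buffered\n",
	       out[0], w.bitcount);
	return 0;
}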
*/ + ASSERT(bitcount <= 7); + deflate_compute_full_len_codewords(c, codes); #if SUPPORT_NEAR_OPTIMAL_PARSING - if (sequences == NULL) { - /* Output the literals and matches from the minimum-cost path */ - struct deflate_optimum_node *cur_node = - &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node * const end_node = - &c->p.n.optimum_nodes[block_length]; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> - OPTIMUM_OFFSET_SHIFT; - if (length == 1) { - /* Literal */ - ADD_BITS(codes->codewords.litlen[offset], - codes->lens.litlen[offset]); - FLUSH_BITS(); - } else { - /* Match */ - WRITE_MATCH(c, codes, length, offset, - c->p.n.offset_slot_full[offset]); - } - cur_node += length; - } while (cur_node != end_node); - } else + if (sequences == NULL) { + /* Output the literals and matches from the minimum-cost path */ + struct deflate_optimum_node *cur_node = + &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node * const end_node = + &c->p.n.optimum_nodes[block_length]; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> + OPTIMUM_OFFSET_SHIFT; + if (length == 1) { + /* Literal */ + ADD_BITS(codes->codewords.litlen[offset], + codes->lens.litlen[offset]); + FLUSH_BITS(); + } else { + /* Match */ + WRITE_MATCH(c, codes, length, offset, + c->p.n.offset_slot_full[offset]); + } + cur_node += length; + } while (cur_node != end_node); + } else #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - { - /* Output the literals and matches from the sequences list. */ - const struct deflate_sequence *seq; - - for (seq = sequences; ; seq++) { - u32 litrunlen = seq->litrunlen_and_length & - SEQ_LITRUNLEN_MASK; - unsigned length = seq->litrunlen_and_length >> - SEQ_LENGTH_SHIFT; - unsigned lit; - - /* Output a run of literals. */ - if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { - for (; litrunlen >= 4; litrunlen -= 4) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - FLUSH_BITS(); - } - if (litrunlen-- != 0) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - if (litrunlen-- != 0) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - if (litrunlen-- != 0) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - } - } - FLUSH_BITS(); - } - } else { - while (litrunlen--) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - FLUSH_BITS(); - } - } - - if (length == 0) { /* Last sequence? */ - ASSERT(in_next == in_end); - break; - } - - /* Output a match. */ - WRITE_MATCH(c, codes, length, seq->offset, - seq->offset_slot); - in_next += length; - } - } - - /* Output the end-of-block symbol. */ - ASSERT(bitcount <= 7); - ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], - codes->lens.litlen[DEFLATE_END_OF_BLOCK]); - FLUSH_BITS(); + { + /* Output the literals and matches from the sequences list. */ + const struct deflate_sequence *seq; + + for (seq = sequences; ; seq++) { + u32 litrunlen = seq->litrunlen_and_length & + SEQ_LITRUNLEN_MASK; + unsigned length = seq->litrunlen_and_length >> + SEQ_LENGTH_SHIFT; + unsigned lit; + + /* Output a run of literals. 
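/*
 * Illustration of how a packed 'litrunlen_and_length' word is consumed above.
 * The actual SEQ_LENGTH_SHIFT and SEQ_LITRUNLEN_MASK are defined elsewhere in
 * this file; the 23-bit split used here is only an assumed layout so that the
 * sketch is self-contained.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed packing for this sketch only: low 23 bits hold the literal run
 * length, the high bits hold the match length (0 marks the last sequence). */
#define DEMO_SEQ_LENGTH_SHIFT	23
#define DEMO_SEQ_LITRUNLEN_MASK	(((uint32_t)1 << DEMO_SEQ_LENGTH_SHIFT) - 1)

int main(void)
{
	/* A run of 5 literals followed by a match of length 30. */
	uint32_t packed = 5 | ((uint32_t)30 << DEMO_SEQ_LENGTH_SHIFT);
	uint32_t litrunlen = packed & DEMO_SEQ_LITRUNLEN_MASK;
	unsigned length = packed >> DEMO_SEQ_LENGTH_SHIFT;

	assert(litrunlen == 5 && length == 30);
	printf("litrunlen=%u, match length=%u\n", (unsigned)litrunlen, length);
	return 0;
}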
*/ + if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { + for (; litrunlen >= 4; litrunlen -= 4) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + } + } + FLUSH_BITS(); + } + } else { + while (litrunlen--) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + } + + if (length == 0) { /* Last sequence? */ + ASSERT(in_next == in_end); + break; + } + + /* Output a match. */ + WRITE_MATCH(c, codes, length, seq->offset, + seq->offset_slot); + in_next += length; + } + } + + /* Output the end-of-block symbol. */ + ASSERT(bitcount <= 7); + ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); + FLUSH_BITS(); out: - ASSERT(bitcount <= 7); - /* - * Assert that the block cost was computed correctly. This is relied on - * above for the bounds check on the output buffer. Also, - * libdeflate_deflate_compress_bound() relies on this via the assumption - * that uncompressed blocks will always be used when cheapest. - */ - ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); - os->bitbuf = bitbuf; - os->bitcount = bitcount; - os->next = out_next; + ASSERT(bitcount <= 7); + /* + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, + * libdeflate_deflate_compress_bound() relies on this via the assumption + * that uncompressed blocks will always be used when cheapest. 
+ */ + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); + os->bitbuf = bitbuf; + os->bitcount = bitcount; + os->next = out_next; } static void deflate_finish_block(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 *block_begin, u32 block_length, - const struct deflate_sequence *sequences, - bool is_final_block) + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) { - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - deflate_make_huffman_codes(&c->freqs, &c->codes); - deflate_flush_block(c, os, block_begin, block_length, sequences, - is_final_block); + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_flush_block(c, os, block_begin, block_length, sequences, + is_final_block); } /******************************************************************************/ @@ -2090,14 +2090,14 @@ deflate_finish_block(struct libdeflate_compressor *c, static void init_block_split_stats(struct block_split_stats *stats) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->new_observations[i] = 0; - stats->observations[i] = 0; - } - stats->num_new_observations = 0; - stats->num_observations = 0; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->new_observations[i] = 0; + stats->observations[i] = 0; + } + stats->num_new_observations = 0; + stats->num_observations = 0; } /* @@ -2107,8 +2107,8 @@ init_block_split_stats(struct block_split_stats *stats) static forceinline void observe_literal(struct block_split_stats *stats, u8 lit) { - stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; - stats->num_new_observations++; + stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; + stats->num_new_observations++; } /* @@ -2118,147 +2118,147 @@ observe_literal(struct block_split_stats *stats, u8 lit) static forceinline void observe_match(struct block_split_stats *stats, unsigned length) { - stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + - (length >= 9)]++; - stats->num_new_observations++; + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + + (length >= 9)]++; + stats->num_new_observations++; } static void merge_new_observations(struct block_split_stats *stats) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->observations[i] += stats->new_observations[i]; - stats->new_observations[i] = 0; - } - stats->num_observations += stats->num_new_observations; - stats->num_new_observations = 0; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->observations[i] += stats->new_observations[i]; + stats->new_observations[i] = 0; + } + stats->num_observations += stats->num_new_observations; + stats->num_new_observations = 0; } static bool do_end_block_check(struct block_split_stats *stats, u32 block_length) { - if (stats->num_observations > 0) { - /* - * Compute the sum of absolute differences of probabilities. To - * avoid needing to use floating point arithmetic or do slow - * divisions, we do all arithmetic with the probabilities - * multiplied by num_observations * num_new_observations. E.g., - * for the "old" observations the probabilities would be - * (double)observations[i] / num_observations, but since we - * multiply by both num_observations and num_new_observations we - * really do observations[i] * num_new_observations. 
- */ - u32 total_delta = 0; - u32 num_items; - u32 cutoff; - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - u32 expected = stats->observations[i] * - stats->num_new_observations; - u32 actual = stats->new_observations[i] * - stats->num_observations; - u32 delta = (actual > expected) ? actual - expected : - expected - actual; - - total_delta += delta; - } - - num_items = stats->num_observations + - stats->num_new_observations; - /* - * Heuristic: the cutoff is when the sum of absolute differences - * of probabilities becomes at least 200/512. As above, the - * probability is multiplied by both num_new_observations and - * num_observations. Be careful to avoid integer overflow. - */ - cutoff = stats->num_new_observations * 200 / 512 * - stats->num_observations; - /* - * Very short blocks have a lot of overhead for the Huffman - * codes, so only use them if it clearly seems worthwhile. - * (This is an additional penalty, which adds to the smaller - * penalty below which scales more slowly.) - */ - if (block_length < 10000 && num_items < 8192) - cutoff += (u64)cutoff * (8192 - num_items) / 8192; - - /* Ready to end the block? */ - if (total_delta + - (block_length / 4096) * stats->num_observations >= cutoff) - return true; - } - merge_new_observations(stats); - return false; + if (stats->num_observations > 0) { + /* + * Compute the sum of absolute differences of probabilities. To + * avoid needing to use floating point arithmetic or do slow + * divisions, we do all arithmetic with the probabilities + * multiplied by num_observations * num_new_observations. E.g., + * for the "old" observations the probabilities would be + * (double)observations[i] / num_observations, but since we + * multiply by both num_observations and num_new_observations we + * really do observations[i] * num_new_observations. + */ + u32 total_delta = 0; + u32 num_items; + u32 cutoff; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u32 expected = stats->observations[i] * + stats->num_new_observations; + u32 actual = stats->new_observations[i] * + stats->num_observations; + u32 delta = (actual > expected) ? actual - expected : + expected - actual; + + total_delta += delta; + } + + num_items = stats->num_observations + + stats->num_new_observations; + /* + * Heuristic: the cutoff is when the sum of absolute differences + * of probabilities becomes at least 200/512. As above, the + * probability is multiplied by both num_new_observations and + * num_observations. Be careful to avoid integer overflow. + */ + cutoff = stats->num_new_observations * 200 / 512 * + stats->num_observations; + /* + * Very short blocks have a lot of overhead for the Huffman + * codes, so only use them if it clearly seems worthwhile. + * (This is an additional penalty, which adds to the smaller + * penalty below which scales more slowly.) + */ + if (block_length < 10000 && num_items < 8192) + cutoff += (u64)cutoff * (8192 - num_items) / 8192; + + /* Ready to end the block? 
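/*
 * Simplified standalone sketch of the scaled absolute-difference test
 * described above, reduced to two observation types for illustration: both
 * sides are cross-multiplied by the opposite observation count so that no
 * division or floating point is needed, and the 200/512 threshold is the one
 * quoted in the comments.  The block-length and short-block adjustments of
 * the real check are omitted here.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_NUM_TYPES 2

/* Return true if the new observations differ enough from the old ones. */
static bool demo_should_split(const uint32_t old_obs[DEMO_NUM_TYPES],
			      uint32_t num_old,
			      const uint32_t new_obs[DEMO_NUM_TYPES],
			      uint32_t num_new)
{
	uint32_t total_delta = 0;
	uint32_t cutoff;
	int i;

	for (i = 0; i < DEMO_NUM_TYPES; i++) {
		uint32_t expected = old_obs[i] * num_new;
		uint32_t actual = new_obs[i] * num_old;

		total_delta += (actual > expected) ? actual - expected
						   : expected - actual;
	}
	/* "Different enough" = summed probability delta of at least 200/512. */
	cutoff = num_new * 200 / 512 * num_old;
	return total_delta >= cutoff;
}

int main(void)
{
	/* Old data: 90% type 0.  New data: 50% type 0 -> end the block. */
	const uint32_t old_obs[DEMO_NUM_TYPES] = { 900, 100 };
	const uint32_t new_obs[DEMO_NUM_TYPES] = { 256, 256 };

	printf("end block? %s\n",
	       demo_should_split(old_obs, 1000, new_obs, 512) ? "yes" : "no");
	return 0;
}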
*/ + if (total_delta + + (block_length / 4096) * stats->num_observations >= cutoff) + return true; + } + merge_new_observations(stats); + return false; } static forceinline bool ready_to_check_block(const struct block_split_stats *stats, - const u8 *in_block_begin, const u8 *in_next, - const u8 *in_end) + const u8 *in_block_begin, const u8 *in_next, + const u8 *in_end) { - return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK - && in_next - in_block_begin >= MIN_BLOCK_LENGTH - && in_end - in_next >= MIN_BLOCK_LENGTH; + return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK + && in_next - in_block_begin >= MIN_BLOCK_LENGTH + && in_end - in_next >= MIN_BLOCK_LENGTH; } static forceinline bool should_end_block(struct block_split_stats *stats, - const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) + const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) { - /* Ready to try to end the block (again)? */ - if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) - return false; - - return do_end_block_check(stats, in_next - in_block_begin); + /* Ready to try to end the block (again)? */ + if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); } /******************************************************************************/ static void deflate_begin_sequences(struct libdeflate_compressor *c, - struct deflate_sequence *first_seq) + struct deflate_sequence *first_seq) { - deflate_reset_symbol_frequencies(c); - first_seq->litrunlen_and_length = 0; + deflate_reset_symbol_frequencies(c); + first_seq->litrunlen_and_length = 0; } static forceinline void deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, - bool gather_split_stats, struct deflate_sequence *seq) + bool gather_split_stats, struct deflate_sequence *seq) { - c->freqs.litlen[literal]++; - - if (gather_split_stats) - observe_literal(&c->split_stats, literal); - - STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); - seq->litrunlen_and_length++; + c->freqs.litlen[literal]++; + + if (gather_split_stats) + observe_literal(&c->split_stats, literal); + + STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); + seq->litrunlen_and_length++; } static forceinline void deflate_choose_match(struct libdeflate_compressor *c, - unsigned length, unsigned offset, bool gather_split_stats, - struct deflate_sequence **seq_p) + unsigned length, unsigned offset, bool gather_split_stats, + struct deflate_sequence **seq_p) { - struct deflate_sequence *seq = *seq_p; - unsigned length_slot = deflate_length_slot[length]; - unsigned offset_slot = deflate_get_offset_slot(offset); - - c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; - c->freqs.offset[offset_slot]++; - if (gather_split_stats) - observe_match(&c->split_stats, length); - - seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; - seq->offset = offset; - seq->offset_slot = offset_slot; - - seq++; - seq->litrunlen_and_length = 0; - *seq_p = seq; + struct deflate_sequence *seq = *seq_p; + unsigned length_slot = deflate_length_slot[length]; + unsigned offset_slot = deflate_get_offset_slot(offset); + + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; + c->freqs.offset[offset_slot]++; + if (gather_split_stats) + observe_match(&c->split_stats, length); + + seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; + seq->offset = offset; + seq->offset_slot = offset_slot; + + seq++; + seq->litrunlen_and_length = 0; + 
*seq_p = seq; } /* @@ -2268,10 +2268,10 @@ deflate_choose_match(struct libdeflate_compressor *c, static forceinline void adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) { - if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { - *max_len = remaining; - *nice_len = MIN(*nice_len, *max_len); - } + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + *max_len = remaining; + *nice_len = MIN(*nice_len, *max_len); + } } /* @@ -2293,62 +2293,62 @@ adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) static unsigned choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth) { - /* map from num_used_literals to min_len */ - static const u8 min_lens[] = { - 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* The rest is implicitly 3. */ - }; - unsigned min_len; - - STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); - STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); - - if (num_used_literals >= ARRAY_LEN(min_lens)) - return 3; - min_len = min_lens[num_used_literals]; - /* - * With a low max_search_depth, it may be too hard to find long matches. - */ - if (max_search_depth < 16) { - if (max_search_depth < 5) - min_len = MIN(min_len, 4); - else if (max_search_depth < 10) - min_len = MIN(min_len, 5); - else - min_len = MIN(min_len, 7); - } - return min_len; + /* map from num_used_literals to min_len */ + static const u8 min_lens[] = { + 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* The rest is implicitly 3. */ + }; + unsigned min_len; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); + STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); + + if (num_used_literals >= ARRAY_LEN(min_lens)) + return 3; + min_len = min_lens[num_used_literals]; + /* + * With a low max_search_depth, it may be too hard to find long matches. + */ + if (max_search_depth < 16) { + if (max_search_depth < 5) + min_len = MIN(min_len, 4); + else if (max_search_depth < 10) + min_len = MIN(min_len, 5); + else + min_len = MIN(min_len, 7); + } + return min_len; } static unsigned calculate_min_match_len(const u8 *data, size_t data_len, - unsigned max_search_depth) + unsigned max_search_depth) { - u8 used[256] = { 0 }; - unsigned num_used_literals = 0; - size_t i; - - /* - * For very short inputs, the static Huffman code has a good chance of - * being best, in which case there is no reason to avoid short matches. - */ - if (data_len < 512) - return DEFLATE_MIN_MATCH_LEN; - - /* - * For an initial approximation, scan the first 4 KiB of data. The - * caller may use recalculate_min_match_len() to update min_len later. - */ - data_len = MIN(data_len, 4096); - for (i = 0; i < data_len; i++) - used[data[i]] = 1; - for (i = 0; i < 256; i++) - num_used_literals += used[i]; - return choose_min_match_len(num_used_literals, max_search_depth); + u8 used[256] = { 0 }; + unsigned num_used_literals = 0; + size_t i; + + /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. 
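/*
 * Standalone sketch, for illustration, of the idea behind
 * calculate_min_match_len() above: count the distinct byte values in a prefix
 * of the data and require longer matches when only a few literals are in use,
 * since those literals then get very short codewords and short matches stop
 * paying off.  The thresholds below are a coarse approximation of the
 * min_lens[] table above, not the table itself.
 */
#include <stddef.h>
#include <stdio.h>

/* Count distinct byte values in the first 'n' bytes (at most 4096 scanned). */
static unsigned demo_count_used_literals(const unsigned char *data, size_t n)
{
	unsigned char used[256] = { 0 };
	unsigned count = 0;
	size_t i;

	if (n > 4096)
		n = 4096;
	for (i = 0; i < n; i++)
		used[data[i]] = 1;
	for (i = 0; i < 256; i++)
		count += used[i];
	return count;
}

/* Coarse thresholds: fewer distinct literals -> demand longer matches. */
static unsigned demo_min_match_len(unsigned num_used_literals)
{
	if (num_used_literals <= 5)
		return 9;
	if (num_used_literals <= 15)
		return 6;
	if (num_used_literals <= 44)
		return 5;
	if (num_used_literals <= 79)
		return 4;
	return 3;
}

int main(void)
{
	const unsigned char sample[] = "abababababababababab";
	unsigned used = demo_count_used_literals(sample, sizeof(sample) - 1);

	printf("distinct literals: %u -> minimum match length: %u\n",
	       used, demo_min_match_len(used));
	return 0;
}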
+ */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + + /* + * For an initial approximation, scan the first 4 KiB of data. The + * caller may use recalculate_min_match_len() to update min_len later. + */ + data_len = MIN(data_len, 4096); + for (i = 0; i < data_len; i++) + used[data[i]] = 1; + for (i = 0; i < 256; i++) + num_used_literals += used[i]; + return choose_min_match_len(num_used_literals, max_search_depth); } /* @@ -2357,32 +2357,32 @@ calculate_min_match_len(const u8 *data, size_t data_len, */ static unsigned recalculate_min_match_len(const struct deflate_freqs *freqs, - unsigned max_search_depth) + unsigned max_search_depth) { - u32 literal_freq = 0; - u32 cutoff; - unsigned num_used_literals = 0; - int i; - - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - literal_freq += freqs->litlen[i]; - - cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */ - - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - if (freqs->litlen[i] > cutoff) - num_used_literals++; - } - return choose_min_match_len(num_used_literals, max_search_depth); + u32 literal_freq = 0; + u32 cutoff; + unsigned num_used_literals = 0; + int i; + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + literal_freq += freqs->litlen[i]; + + cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */ + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (freqs->litlen[i] > cutoff) + num_used_literals++; + } + return choose_min_match_len(num_used_literals, max_search_depth); } static forceinline const u8 * choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, - size_t soft_max_len) + size_t soft_max_len) { - if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) - return in_end; - return in_block_begin + soft_max_len; + if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) + return in_end; + return in_block_begin + soft_max_len; } /* @@ -2390,55 +2390,55 @@ choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, */ static size_t deflate_compress_none(const u8 *in, size_t in_nbytes, - u8 *out, size_t out_nbytes_avail) + u8 *out, size_t out_nbytes_avail) { - const u8 *in_next = in; - const u8 * const in_end = in + in_nbytes; - u8 *out_next = out; - u8 * const out_end = out + out_nbytes_avail; - - /* - * If the input is zero-length, we still must output a block in order - * for the output to be a valid DEFLATE stream. Handle this case - * specially to avoid potentially passing NULL to memcpy() below. - */ - if (unlikely(in_nbytes == 0)) { - if (out_nbytes_avail < 5) - return 0; - /* BFINAL and BTYPE */ - *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); - /* LEN and NLEN */ - put_unaligned_le32(0xFFFF0000, out_next); - return 5; - } - - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = 1; - len = in_end - in_next; - } - if (out_end - out_next < 5 + len) - return 0; - /* - * Output BFINAL and BTYPE. The stream is already byte-aligned - * here, so this step always requires outputting exactly 1 byte. - */ - *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); - - /* Output LEN and NLEN, then the data itself. 
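/*
 * Concrete illustration of the zero-length special case above: a single
 * stored block with BFINAL=1, BTYPE=00, LEN=0x0000 and NLEN=0xFFFF, i.e. the
 * five bytes 01 00 00 ff ff, which is the smallest valid DEFLATE stream.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t out[5];

	out[0] = 1 | (0 << 1);	/* BFINAL=1, BTYPE=00 (stored), zero padding */
	out[1] = 0x00;		/* LEN  = 0x0000, little endian              */
	out[2] = 0x00;
	out[3] = 0xFF;		/* NLEN = 0xFFFF = ~LEN                      */
	out[4] = 0xFF;

	assert(out[0] == 0x01);
	printf("empty DEFLATE stream: %02x %02x %02x %02x %02x\n",
	       out[0], out[1], out[2], out[3], out[4]);
	return 0;
}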
*/ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - - return out_next - out; + const u8 *in_next = in; + const u8 * const in_end = in + in_nbytes; + u8 *out_next = out; + u8 * const out_end = out + out_nbytes_avail; + + /* + * If the input is zero-length, we still must output a block in order + * for the output to be a valid DEFLATE stream. Handle this case + * specially to avoid potentially passing NULL to memcpy() below. + */ + if (unlikely(in_nbytes == 0)) { + if (out_nbytes_avail < 5) + return 0; + /* BFINAL and BTYPE */ + *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + /* LEN and NLEN */ + put_unaligned_le32(0xFFFF0000, out_next); + return 5; + } + + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = 1; + len = in_end - in_next; + } + if (out_end - out_next < 5 + len) + return 0; + /* + * Output BFINAL and BTYPE. The stream is already byte-aligned + * here, so this step always requires outputting exactly 1 byte. + */ + *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + + return out_next - out; } /* @@ -2449,76 +2449,76 @@ deflate_compress_none(const u8 *in, size_t in_nbytes, */ static void deflate_compress_fastest(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hash = 0; - - ht_matchfinder_init(&c->p.f.ht_mf); - - do { - /* Starting a new DEFLATE block */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = choose_max_block_end( - in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); - struct deflate_sequence *seq = c->p.f.sequences; - - deflate_begin_sequences(c, seq); - - do { - u32 length; - u32 offset; - size_t remaining = in_end - in_next; - - if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { - max_len = remaining; - if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { - do { - deflate_choose_literal(c, - *in_next++, false, seq); - } while (--max_len); - break; - } - nice_len = MIN(nice_len, max_len); - } - length = ht_matchfinder_longest_match(&c->p.f.ht_mf, - &in_cur_base, - in_next, - max_len, - nice_len, - &next_hash, - &offset); - if (length) { - /* Match found */ - deflate_choose_match(c, length, offset, false, - &seq); - ht_matchfinder_skip_bytes(&c->p.f.ht_mf, - &in_cur_base, - in_next + 1, - in_end, - length - 1, - &next_hash); - in_next += length; - } else { - /* No match found */ - deflate_choose_literal(c, *in_next++, false, - seq); - } - - /* Check if it's time to output another block. 
*/ - } while (in_next < in_max_block_end && - seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); - - deflate_finish_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.f.sequences, in_next == in_end); - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hash = 0; + + ht_matchfinder_init(&c->p.f.ht_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.f.sequences; + + deflate_begin_sequences(c, seq); + + do { + u32 length; + u32 offset; + size_t remaining = in_end - in_next; + + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + max_len = remaining; + if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { + do { + deflate_choose_literal(c, + *in_next++, false, seq); + } while (--max_len); + break; + } + nice_len = MIN(nice_len, max_len); + } + length = ht_matchfinder_longest_match(&c->p.f.ht_mf, + &in_cur_base, + in_next, + max_len, + nice_len, + &next_hash, + &offset); + if (length) { + /* Match found */ + deflate_choose_match(c, length, offset, false, + &seq); + ht_matchfinder_skip_bytes(&c->p.f.ht_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + &next_hash); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, false, + seq); + } + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.f.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2526,284 +2526,284 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c, */ static void deflate_compress_greedy(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = choose_max_block_end( - in_next, in_end, SOFT_MAX_BLOCK_LENGTH); - struct deflate_sequence *seq = c->p.g.sequences; - unsigned min_len; - - init_block_split_stats(&c->split_stats); - deflate_begin_sequences(c, seq); - min_len = calculate_min_match_len(in_next, - in_max_block_end - in_next, - c->max_search_depth); - do { - u32 length; - u32 offset; - - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - length = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next, - min_len - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &offset); - - if (length >= min_len && - (length > DEFLATE_MIN_MATCH_LEN || - offset <= 4096)) { - /* Match found */ - deflate_choose_match(c, length, offset, true, - &seq); - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next + 1, - in_end, - length - 1, - next_hashes); - in_next += length; - } 
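A back-of-the-envelope way to see why length-3 matches are only kept at small offsets (offset <= 4096 in the greedy parser, and > 8192 rejected in the lazy parser below): a match costs roughly a length symbol plus an offset symbol plus the offset's extra bits, while three literals cost roughly 24 bits. The symbol costs in the sketch below are illustrative assumptions, not libdeflate's cost model, and bsr-style math is done with a GCC/Clang builtin.

#include <stdint.h>
#include <stdio.h>

/* Number of extra offset bits DEFLATE needs for distance d (1..32768). */
static unsigned
offset_extra_bits(uint32_t d)
{
	if (d <= 4)
		return 0;
	return 31 - (unsigned)__builtin_clz(d - 1) - 1;	/* floor(log2(d-1)) - 1 */
}

int main(void)
{
	/* Assumed, purely illustrative costs: ~8 bits per literal,
	 * ~7 bits for a length symbol, ~5 bits for an offset symbol. */
	const unsigned lit_bits = 8, len_sym_bits = 7, off_sym_bits = 5;
	const uint32_t offsets[] = { 64, 4096, 8192, 32768 };
	size_t i;

	for (i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		unsigned match_bits = len_sym_bits + off_sym_bits +
				      offset_extra_bits(offsets[i]);
		printf("len-3 match at offset %5u: ~%2u bits vs 3 literals: ~%u bits\n",
		       offsets[i], match_bits, 3 * lit_bits);
	}
	/* Prints ~16, ~22, ~23 and ~25 bits: a far length-3 match saves little
	 * or nothing over 3 literals, which motivates the offset cutoffs. */
	return 0;
}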
else { - /* No match found */ - deflate_choose_literal(c, *in_next++, true, - seq); - } - - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && - !should_end_block(&c->split_stats, - in_block_begin, in_next, in_end)); - - deflate_finish_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + u32 length; + u32 offset; + + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + length = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &offset); + + if (length >= min_len && + (length > DEFLATE_MIN_MATCH_LEN || + offset <= 4096)) { + /* Match found */ + deflate_choose_match(c, length, offset, true, + &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + next_hashes); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, true, + seq); + } + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } static forceinline void deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os, bool lazy2) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os, bool lazy2) { - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = choose_max_block_end( - in_next, in_end, SOFT_MAX_BLOCK_LENGTH); - const u8 *next_recalc_min_len = - in_next + MIN(in_end - in_next, 10000); - struct deflate_sequence *seq = c->p.g.sequences; - unsigned min_len; - - init_block_split_stats(&c->split_stats); - deflate_begin_sequences(c, seq); - min_len = calculate_min_match_len(in_next, - in_max_block_end - in_next, - c->max_search_depth); - do { - unsigned cur_len; - unsigned cur_offset; - unsigned next_len; - unsigned next_offset; - - /* - * Recalculate the minimum match length if it hasn't - * been done recently. 
- */ - if (in_next >= next_recalc_min_len) { - min_len = recalculate_min_match_len( - &c->freqs, - c->max_search_depth); - next_recalc_min_len += - MIN(in_end - next_recalc_min_len, - in_next - in_block_begin); - } - - /* Find the longest match at the current position. */ - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - cur_len = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next, - min_len - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &cur_offset); - if (cur_len < min_len || - (cur_len == DEFLATE_MIN_MATCH_LEN && - cur_offset > 8192)) { - /* No match found. Choose a literal. */ - deflate_choose_literal(c, *in_next++, true, - seq); - continue; - } - in_next++; - -have_cur_match: - /* - * We have a match at the current position. - * If it's very long, choose it immediately. - */ - if (cur_len >= nice_len) { - deflate_choose_match(c, cur_len, cur_offset, - true, &seq); - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 1, - next_hashes); - in_next += cur_len - 1; - continue; - } - - /* - * Try to find a better match at the next position. - * - * Note: since we already have a match at the *current* - * position, we use only half the 'max_search_depth' - * when checking the *next* position. This is a useful - * trade-off because it's more worthwhile to use a - * greater search depth on the initial match. - * - * Note: it's possible to structure the code such that - * there's only one call to longest_match(), which - * handles both the "find the initial match" and "try to - * find a better match" cases. However, it is faster to - * have two call sites, with longest_match() inlined at - * each. - */ - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - next_len = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next++, - cur_len - 1, - max_len, - nice_len, - c->max_search_depth >> 1, - next_hashes, - &next_offset); - if (next_len >= cur_len && - 4 * (int)(next_len - cur_len) + - ((int)bsr32(cur_offset) - - (int)bsr32(next_offset)) > 2) { - /* - * Found a better match at the next position. - * Output a literal. Then the next match - * becomes the current match. - */ - deflate_choose_literal(c, *(in_next - 2), true, - seq); - cur_len = next_len; - cur_offset = next_offset; - goto have_cur_match; - } - - if (lazy2) { - /* In lazy2 mode, look ahead another position */ - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - next_len = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next++, - cur_len - 1, - max_len, - nice_len, - c->max_search_depth >> 2, - next_hashes, - &next_offset); - if (next_len >= cur_len && - 4 * (int)(next_len - cur_len) + - ((int)bsr32(cur_offset) - - (int)bsr32(next_offset)) > 6) { - /* - * There's a much better match two - * positions ahead, so use two literals. - */ - deflate_choose_literal( - c, *(in_next - 3), true, seq); - deflate_choose_literal( - c, *(in_next - 2), true, seq); - cur_len = next_len; - cur_offset = next_offset; - goto have_cur_match; - } - /* - * No better match at either of the next 2 - * positions. Output the current match. - */ - deflate_choose_match(c, cur_len, cur_offset, - true, &seq); - if (cur_len > 3) { - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 3, - next_hashes); - in_next += cur_len - 3; - } - } else { /* !lazy2 */ - /* - * No better match at the next position. Output - * the current match. 
- */ - deflate_choose_match(c, cur_len, cur_offset, - true, &seq); - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 2, - next_hashes); - in_next += cur_len - 2; - } - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && - !should_end_block(&c->split_stats, - in_block_begin, in_next, in_end)); - - deflate_finish_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *next_recalc_min_len = + in_next + MIN(in_end - in_next, 10000); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + unsigned cur_len; + unsigned cur_offset; + unsigned next_len; + unsigned next_offset; + + /* + * Recalculate the minimum match length if it hasn't + * been done recently. + */ + if (in_next >= next_recalc_min_len) { + min_len = recalculate_min_match_len( + &c->freqs, + c->max_search_depth); + next_recalc_min_len += + MIN(in_end - next_recalc_min_len, + in_next - in_block_begin); + } + + /* Find the longest match at the current position. */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + cur_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); + if (cur_len < min_len || + (cur_len == DEFLATE_MIN_MATCH_LEN && + cur_offset > 8192)) { + /* No match found. Choose a literal. */ + deflate_choose_literal(c, *in_next++, true, + seq); + continue; + } + in_next++; + + have_cur_match: + /* + * We have a match at the current position. + * If it's very long, choose it immediately. + */ + if (cur_len >= nice_len) { + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 1, + next_hashes); + in_next += cur_len - 1; + continue; + } + + /* + * Try to find a better match at the next position. + * + * Note: since we already have a match at the *current* + * position, we use only half the 'max_search_depth' + * when checking the *next* position. This is a useful + * trade-off because it's more worthwhile to use a + * greater search depth on the initial match. + * + * Note: it's possible to structure the code such that + * there's only one call to longest_match(), which + * handles both the "find the initial match" and "try to + * find a better match" cases. However, it is faster to + * have two call sites, with longest_match() inlined at + * each. 
+ */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 1, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 2) { + /* + * Found a better match at the next position. + * Output a literal. Then the next match + * becomes the current match. + */ + deflate_choose_literal(c, *(in_next - 2), true, + seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + + if (lazy2) { + /* In lazy2 mode, look ahead another position */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 2, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 6) { + /* + * There's a much better match two + * positions ahead, so use two literals. + */ + deflate_choose_literal( + c, *(in_next - 3), true, seq); + deflate_choose_literal( + c, *(in_next - 2), true, seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + /* + * No better match at either of the next 2 + * positions. Output the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + if (cur_len > 3) { + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 3, + next_hashes); + in_next += cur_len - 3; + } + } else { /* !lazy2 */ + /* + * No better match at the next position. Output + * the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 2, + next_hashes); + in_next += cur_len - 2; + } + /* Check if it's time to output another block. 
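The '4 * (next_len - cur_len) + (bsr32(cur_offset) - bsr32(next_offset)) > 2' test above weighs each extra byte of match length as about four points against roughly one point per halving of the offset, using the offset's highest set bit as a proxy for its extra-bit cost (the two-ahead check uses the stricter threshold 6). The short sketch below just evaluates that score for two made-up match pairs; bsr32() is implemented here with a GCC/Clang builtin and the sample numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

static unsigned
bsr32(uint32_t x)	/* index of the highest set bit; x must be nonzero */
{
	return 31 - (unsigned)__builtin_clz(x);
}

/* Nonzero if the match found one position ahead looks better. */
static int
prefer_next_match(unsigned cur_len, uint32_t cur_offset,
		  unsigned next_len, uint32_t next_offset)
{
	return next_len >= cur_len &&
	       4 * (int)(next_len - cur_len) +
	       ((int)bsr32(cur_offset) - (int)bsr32(next_offset)) > 2;
}

int main(void)
{
	/* Longer match at a much closer offset: 4*1 + (11 - 6) = 9 > 2 -> prefer it. */
	printf("%d\n", prefer_next_match(5, 4000, 6, 100));
	/* Same length, only slightly closer offset: 4*0 + (11 - 10) = 1 -> keep current. */
	printf("%d\n", prefer_next_match(5, 4000, 5, 2000));
	return 0;
}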
*/ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2813,10 +2813,10 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, */ static void deflate_compress_lazy(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - deflate_compress_lazy_generic(c, in, in_nbytes, os, false); + deflate_compress_lazy_generic(c, in, in_nbytes, os, false); } /* @@ -2826,10 +2826,10 @@ deflate_compress_lazy(struct libdeflate_compressor * restrict c, */ static void deflate_compress_lazy2(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - deflate_compress_lazy_generic(c, in, in_nbytes, os, true); + deflate_compress_lazy_generic(c, in, in_nbytes, os, true); } #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -2842,42 +2842,42 @@ deflate_compress_lazy2(struct libdeflate_compressor * restrict c, static void deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) { - struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node *end_node = - &c->p.n.optimum_nodes[block_length]; - - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - - if (length == 1) { - /* Literal */ - c->freqs.litlen[offset]++; - } else { - /* Match */ - c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + - deflate_length_slot[length]]++; - c->freqs.offset[c->p.n.offset_slot_full[offset]]++; - } - cur_node += length; - } while (cur_node != end_node); - - /* Tally the end-of-block symbol. */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + + if (length == 1) { + /* Literal */ + c->freqs.litlen[offset]++; + } else { + /* Match */ + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + + deflate_length_slot[length]]++; + c->freqs.offset[c->p.n.offset_slot_full[offset]]++; + } + cur_node += length; + } while (cur_node != end_node); + + /* Tally the end-of-block symbol. 
*/ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; } static void deflate_choose_all_literals(struct libdeflate_compressor *c, - const u8 *block, u32 block_length) + const u8 *block, u32 block_length) { - u32 i; - - deflate_reset_symbol_frequencies(c); - for (i = 0; i < block_length; i++) - c->freqs.litlen[block[i]]++; - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - - deflate_make_huffman_codes(&c->freqs, &c->codes); + u32 i; + + deflate_reset_symbol_frequencies(c); + for (i = 0; i < block_length; i++) + c->freqs.litlen[block[i]]++; + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + deflate_make_huffman_codes(&c->freqs, &c->codes); } /* @@ -2888,71 +2888,71 @@ deflate_choose_all_literals(struct libdeflate_compressor *c, static u32 deflate_compute_true_cost(struct libdeflate_compressor *c) { - u32 cost = 0; - unsigned sym; - - deflate_precompute_huffman_header(c); - - memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, - DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); - - cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); - for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { - cost += c->o.precode.freqs[sym] * - (c->o.precode.lens[sym] + - deflate_extra_precode_bits[sym]); - } - - for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) - cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; - - for (; sym < DEFLATE_FIRST_LEN_SYM + - ARRAY_LEN(deflate_extra_length_bits); sym++) - cost += c->freqs.litlen[sym] * - (c->codes.lens.litlen[sym] + - deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); - - for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) - cost += c->freqs.offset[sym] * - (c->codes.lens.offset[sym] + - deflate_extra_offset_bits[sym]); - return cost; + u32 cost = 0; + unsigned sym; + + deflate_precompute_huffman_header(c); + + memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, + DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); + + cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + cost += c->o.precode.freqs[sym] * + (c->o.precode.lens[sym] + + deflate_extra_precode_bits[sym]); + } + + for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) + cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; + + for (; sym < DEFLATE_FIRST_LEN_SYM + + ARRAY_LEN(deflate_extra_length_bits); sym++) + cost += c->freqs.litlen[sym] * + (c->codes.lens.litlen[sym] + + deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); + + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) + cost += c->freqs.offset[sym] * + (c->codes.lens.offset[sym] + + deflate_extra_offset_bits[sym]); + return cost; } /* Set the current cost model from the codeword lengths specified in @lens. */ static void deflate_set_costs_from_codes(struct libdeflate_compressor *c, - const struct deflate_lens *lens) + const struct deflate_lens *lens) { - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - u32 bits = (lens->litlen[i] ? - lens->litlen[i] : LITERAL_NOSTAT_BITS); - - c->p.n.costs.literal[i] = bits * BIT_COST; - } - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { - unsigned length_slot = deflate_length_slot[i]; - unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; - u32 bits = (lens->litlen[litlen_sym] ? 
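The '5 + 5 + 4 + (3 * num_explicit_lens)' term above is the dynamic-Huffman block header from RFC 1951: a 5-bit HLIT field, a 5-bit HDIST field, a 4-bit HCLEN field, plus 3 bits for each precode (code-length code) length that is explicitly transmitted. A trivial worked computation, with a made-up count of explicit precode lengths:

#include <stdio.h>

/*
 * Bits in a dynamic-Huffman block header, not counting the coded code
 * lengths themselves: HLIT(5) + HDIST(5) + HCLEN(4) + 3 per explicit
 * precode length.
 */
static unsigned
dynamic_header_bits(unsigned num_explicit_precode_lens)
{
	return 5 + 5 + 4 + 3 * num_explicit_precode_lens;
}

int main(void)
{
	/* If all 19 precode lengths were transmitted: 5 + 5 + 4 + 57 = 71 bits. */
	printf("%u\n", dynamic_header_bits(19));
	return 0;
}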
- lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); - - bits += deflate_extra_length_bits[length_slot]; - c->p.n.costs.length[i] = bits * BIT_COST; - } - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { - u32 bits = (lens->offset[i] ? - lens->offset[i] : OFFSET_NOSTAT_BITS); - - bits += deflate_extra_offset_bits[i]; - c->p.n.costs.offset_slot[i] = bits * BIT_COST; - } + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + u32 bits = (lens->litlen[i] ? + lens->litlen[i] : LITERAL_NOSTAT_BITS); + + c->p.n.costs.literal[i] = bits * BIT_COST; + } + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { + unsigned length_slot = deflate_length_slot[i]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; + u32 bits = (lens->litlen[litlen_sym] ? + lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); + + bits += deflate_extra_length_bits[length_slot]; + c->p.n.costs.length[i] = bits * BIT_COST; + } + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { + u32 bits = (lens->offset[i] ? + lens->offset[i] : OFFSET_NOSTAT_BITS); + + bits += deflate_extra_offset_bits[i]; + c->p.n.costs.offset_slot[i] = bits * BIT_COST; + } } /* @@ -2962,14 +2962,14 @@ deflate_set_costs_from_codes(struct libdeflate_compressor *c, * * This table is indexed first by the estimated match probability: * - * i=0: data doesn't contain many matches [match_prob=0.25] - * i=1: neutral [match_prob=0.50] - * i=2: data contains lots of matches [match_prob=0.75] + * i=0: data doesn't contain many matches [match_prob=0.25] + * i=1: neutral [match_prob=0.50] + * i=2: data contains lots of matches [match_prob=0.75] * * This lookup produces a subtable which maps the number of distinct used * literals to the default cost of a literal symbol, i.e.: * - * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) + * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) * * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This * accounts for literals usually getting cheaper as the number of distinct @@ -2977,127 +2977,127 @@ deflate_set_costs_from_codes(struct libdeflate_compressor *c, * * The lookup also produces the cost of a length symbol, which is: * - * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) + * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) * * Note: we don't currently assign different costs to different literal symbols, * or to different length symbols, as this is hard to do in a useful way. 
*/ static const struct { - u8 used_lits_to_lit_cost[257]; - u8 len_sym_cost; + u8 used_lits_to_lit_cost[257]; + u8 len_sym_cost; } default_litlen_costs[] = { - { /* match_prob = 0.25 */ - .used_lits_to_lit_cost = { - 6, 6, 22, 32, 38, 43, 48, 51, - 54, 57, 59, 61, 64, 65, 67, 69, - 70, 72, 73, 74, 75, 76, 77, 79, - 80, 80, 81, 82, 83, 84, 85, 85, - 86, 87, 88, 88, 89, 89, 90, 91, - 91, 92, 92, 93, 93, 94, 95, 95, - 96, 96, 96, 97, 97, 98, 98, 99, - 99, 99, 100, 100, 101, 101, 101, 102, - 102, 102, 103, 103, 104, 104, 104, 105, - 105, 105, 105, 106, 106, 106, 107, 107, - 107, 108, 108, 108, 108, 109, 109, 109, - 109, 110, 110, 110, 111, 111, 111, 111, - 112, 112, 112, 112, 112, 113, 113, 113, - 113, 114, 114, 114, 114, 114, 115, 115, - 115, 115, 115, 116, 116, 116, 116, 116, - 117, 117, 117, 117, 117, 118, 118, 118, - 118, 118, 118, 119, 119, 119, 119, 119, - 120, 120, 120, 120, 120, 120, 121, 121, - 121, 121, 121, 121, 121, 122, 122, 122, - 122, 122, 122, 123, 123, 123, 123, 123, - 123, 123, 124, 124, 124, 124, 124, 124, - 124, 125, 125, 125, 125, 125, 125, 125, - 125, 126, 126, 126, 126, 126, 126, 126, - 127, 127, 127, 127, 127, 127, 127, 127, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 129, 129, 129, 129, 129, 129, 129, - 129, 129, 130, 130, 130, 130, 130, 130, - 130, 130, 130, 131, 131, 131, 131, 131, - 131, 131, 131, 131, 131, 132, 132, 132, - 132, 132, 132, 132, 132, 132, 132, 133, - 133, 133, 133, 133, 133, 133, 133, 133, - 133, 134, 134, 134, 134, 134, 134, 134, - 134, - }, - .len_sym_cost = 109, - }, { /* match_prob = 0.5 */ - .used_lits_to_lit_cost = { - 16, 16, 32, 41, 48, 53, 57, 60, - 64, 66, 69, 71, 73, 75, 76, 78, - 80, 81, 82, 83, 85, 86, 87, 88, - 89, 90, 91, 92, 92, 93, 94, 95, - 96, 96, 97, 98, 98, 99, 99, 100, - 101, 101, 102, 102, 103, 103, 104, 104, - 105, 105, 106, 106, 107, 107, 108, 108, - 108, 109, 109, 110, 110, 110, 111, 111, - 112, 112, 112, 113, 113, 113, 114, 114, - 114, 115, 115, 115, 115, 116, 116, 116, - 117, 117, 117, 118, 118, 118, 118, 119, - 119, 119, 119, 120, 120, 120, 120, 121, - 121, 121, 121, 122, 122, 122, 122, 122, - 123, 123, 123, 123, 124, 124, 124, 124, - 124, 125, 125, 125, 125, 125, 126, 126, - 126, 126, 126, 127, 127, 127, 127, 127, - 128, 128, 128, 128, 128, 128, 129, 129, - 129, 129, 129, 129, 130, 130, 130, 130, - 130, 130, 131, 131, 131, 131, 131, 131, - 131, 132, 132, 132, 132, 132, 132, 133, - 133, 133, 133, 133, 133, 133, 134, 134, - 134, 134, 134, 134, 134, 134, 135, 135, - 135, 135, 135, 135, 135, 135, 136, 136, - 136, 136, 136, 136, 136, 136, 137, 137, - 137, 137, 137, 137, 137, 137, 138, 138, - 138, 138, 138, 138, 138, 138, 138, 139, - 139, 139, 139, 139, 139, 139, 139, 139, - 140, 140, 140, 140, 140, 140, 140, 140, - 140, 141, 141, 141, 141, 141, 141, 141, - 141, 141, 141, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 143, 143, - 143, 143, 143, 143, 143, 143, 143, 143, - 144, - }, - .len_sym_cost = 93, - }, { /* match_prob = 0.75 */ - .used_lits_to_lit_cost = { - 32, 32, 48, 57, 64, 69, 73, 76, - 80, 82, 85, 87, 89, 91, 92, 94, - 96, 97, 98, 99, 101, 102, 103, 104, - 105, 106, 107, 108, 108, 109, 110, 111, - 112, 112, 113, 114, 114, 115, 115, 116, - 117, 117, 118, 118, 119, 119, 120, 120, - 121, 121, 122, 122, 123, 123, 124, 124, - 124, 125, 125, 126, 126, 126, 127, 127, - 128, 128, 128, 129, 129, 129, 130, 130, - 130, 131, 131, 131, 131, 132, 132, 132, - 133, 133, 133, 134, 134, 134, 134, 135, - 135, 135, 135, 136, 136, 136, 136, 137, - 137, 137, 137, 138, 138, 138, 138, 138, - 139, 139, 139, 139, 140, 140, 140, 
140, - 140, 141, 141, 141, 141, 141, 142, 142, - 142, 142, 142, 143, 143, 143, 143, 143, - 144, 144, 144, 144, 144, 144, 145, 145, - 145, 145, 145, 145, 146, 146, 146, 146, - 146, 146, 147, 147, 147, 147, 147, 147, - 147, 148, 148, 148, 148, 148, 148, 149, - 149, 149, 149, 149, 149, 149, 150, 150, - 150, 150, 150, 150, 150, 150, 151, 151, - 151, 151, 151, 151, 151, 151, 152, 152, - 152, 152, 152, 152, 152, 152, 153, 153, - 153, 153, 153, 153, 153, 153, 154, 154, - 154, 154, 154, 154, 154, 154, 154, 155, - 155, 155, 155, 155, 155, 155, 155, 155, - 156, 156, 156, 156, 156, 156, 156, 156, - 156, 157, 157, 157, 157, 157, 157, 157, - 157, 157, 157, 158, 158, 158, 158, 158, - 158, 158, 158, 158, 158, 158, 159, 159, - 159, 159, 159, 159, 159, 159, 159, 159, - 160, - }, - .len_sym_cost = 84, - }, + { /* match_prob = 0.25 */ + .used_lits_to_lit_cost = { + 6, 6, 22, 32, 38, 43, 48, 51, + 54, 57, 59, 61, 64, 65, 67, 69, + 70, 72, 73, 74, 75, 76, 77, 79, + 80, 80, 81, 82, 83, 84, 85, 85, + 86, 87, 88, 88, 89, 89, 90, 91, + 91, 92, 92, 93, 93, 94, 95, 95, + 96, 96, 96, 97, 97, 98, 98, 99, + 99, 99, 100, 100, 101, 101, 101, 102, + 102, 102, 103, 103, 104, 104, 104, 105, + 105, 105, 105, 106, 106, 106, 107, 107, + 107, 108, 108, 108, 108, 109, 109, 109, + 109, 110, 110, 110, 111, 111, 111, 111, + 112, 112, 112, 112, 112, 113, 113, 113, + 113, 114, 114, 114, 114, 114, 115, 115, + 115, 115, 115, 116, 116, 116, 116, 116, + 117, 117, 117, 117, 117, 118, 118, 118, + 118, 118, 118, 119, 119, 119, 119, 119, + 120, 120, 120, 120, 120, 120, 121, 121, + 121, 121, 121, 121, 121, 122, 122, 122, + 122, 122, 122, 123, 123, 123, 123, 123, + 123, 123, 124, 124, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 125, 125, + 125, 126, 126, 126, 126, 126, 126, 126, + 127, 127, 127, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 133, 133, + 133, 134, 134, 134, 134, 134, 134, 134, + 134, + }, + .len_sym_cost = 109, + }, { /* match_prob = 0.5 */ + .used_lits_to_lit_cost = { + 16, 16, 32, 41, 48, 53, 57, 60, + 64, 66, 69, 71, 73, 75, 76, 78, + 80, 81, 82, 83, 85, 86, 87, 88, + 89, 90, 91, 92, 92, 93, 94, 95, + 96, 96, 97, 98, 98, 99, 99, 100, + 101, 101, 102, 102, 103, 103, 104, 104, + 105, 105, 106, 106, 107, 107, 108, 108, + 108, 109, 109, 110, 110, 110, 111, 111, + 112, 112, 112, 113, 113, 113, 114, 114, + 114, 115, 115, 115, 115, 116, 116, 116, + 117, 117, 117, 118, 118, 118, 118, 119, + 119, 119, 119, 120, 120, 120, 120, 121, + 121, 121, 121, 122, 122, 122, 122, 122, + 123, 123, 123, 123, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 126, 126, + 126, 126, 126, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 129, 129, + 129, 129, 129, 129, 130, 130, 130, 130, + 130, 130, 131, 131, 131, 131, 131, 131, + 131, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 134, 134, + 134, 134, 134, 134, 134, 134, 135, 135, + 135, 135, 135, 135, 135, 135, 136, 136, + 136, 136, 136, 136, 136, 136, 137, 137, + 137, 137, 137, 137, 137, 137, 138, 138, + 138, 138, 138, 138, 138, 138, 138, 139, + 139, 139, 139, 139, 139, 139, 139, 139, + 140, 140, 140, 140, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 143, 143, + 143, 143, 143, 143, 143, 143, 143, 143, + 144, + }, + 
.len_sym_cost = 93, + }, { /* match_prob = 0.75 */ + .used_lits_to_lit_cost = { + 32, 32, 48, 57, 64, 69, 73, 76, + 80, 82, 85, 87, 89, 91, 92, 94, + 96, 97, 98, 99, 101, 102, 103, 104, + 105, 106, 107, 108, 108, 109, 110, 111, + 112, 112, 113, 114, 114, 115, 115, 116, + 117, 117, 118, 118, 119, 119, 120, 120, + 121, 121, 122, 122, 123, 123, 124, 124, + 124, 125, 125, 126, 126, 126, 127, 127, + 128, 128, 128, 129, 129, 129, 130, 130, + 130, 131, 131, 131, 131, 132, 132, 132, + 133, 133, 133, 134, 134, 134, 134, 135, + 135, 135, 135, 136, 136, 136, 136, 137, + 137, 137, 137, 138, 138, 138, 138, 138, + 139, 139, 139, 139, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 142, 142, + 142, 142, 142, 143, 143, 143, 143, 143, + 144, 144, 144, 144, 144, 144, 145, 145, + 145, 145, 145, 145, 146, 146, 146, 146, + 146, 146, 147, 147, 147, 147, 147, 147, + 147, 148, 148, 148, 148, 148, 148, 149, + 149, 149, 149, 149, 149, 149, 150, 150, + 150, 150, 150, 150, 150, 150, 151, 151, + 151, 151, 151, 151, 151, 151, 152, 152, + 152, 152, 152, 152, 152, 152, 153, 153, + 153, 153, 153, 153, 153, 153, 154, 154, + 154, 154, 154, 154, 154, 154, 154, 155, + 155, 155, 155, 155, 155, 155, 155, 155, + 156, 156, 156, 156, 156, 156, 156, 156, + 156, 157, 157, 157, 157, 157, 157, 157, + 157, 157, 157, 158, 158, 158, 158, 158, + 158, 158, 158, 158, 158, 158, 159, 159, + 159, 159, 159, 159, 159, 159, 159, 159, + 160, + }, + .len_sym_cost = 84, + }, }; /* @@ -3106,141 +3106,141 @@ static const struct { */ static void deflate_choose_default_litlen_costs(struct libdeflate_compressor *c, - const u8 *block_begin, u32 block_length, - u32 *lit_cost, u32 *len_sym_cost) + const u8 *block_begin, u32 block_length, + u32 *lit_cost, u32 *len_sym_cost) { - unsigned num_used_literals = 0; - u32 literal_freq = block_length; - u32 match_freq = 0; - u32 cutoff; - u32 i; - - /* Calculate the number of distinct literals that exist in the data. */ - memset(c->freqs.litlen, 0, - DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); - cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ - for (i = 0; i < block_length; i++) - c->freqs.litlen[block_begin[i]]++; - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - if (c->freqs.litlen[i] > cutoff) - num_used_literals++; - } - if (num_used_literals == 0) - num_used_literals = 1; - - /* - * Estimate the relative frequency of literals and matches in the - * optimal parsing solution. We don't know the optimal solution, so - * this can only be a very rough estimate. Therefore, we basically use - * the match frequency from a greedy parse. We also apply the min_len - * heuristic used by the greedy and lazy parsers, to avoid counting too - * many matches when literals are cheaper than short matches. 
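The tables above can be reproduced from the formulas quoted in the comment. With BIT_COST = 16 (asserted below) and the 29 DEFLATE length symbols, int(-log2(match_prob / 29) * 16) gives 109, 93, and 84 for match_prob = 0.25, 0.5, and 0.75, and int(-log2((1 - match_prob) / num_used_literals) * 16) reproduces the literal costs, e.g. 144 for match_prob = 0.5 with all 256 literals used. A small recomputation sketch (constant names local to the sketch; compile with -lm):

#include <math.h>
#include <stdio.h>

#define BIT_COST	16	/* fixed-point scale used by the cost model */
#define NUM_LEN_SLOTS	29	/* DEFLATE length symbols 257..285 */

static unsigned
lit_cost(double match_prob, unsigned num_used_literals)
{
	return (unsigned)(-log2((1.0 - match_prob) / num_used_literals) * BIT_COST);
}

static unsigned
len_sym_cost(double match_prob)
{
	return (unsigned)(-log2(match_prob / NUM_LEN_SLOTS) * BIT_COST);
}

int main(void)
{
	/* Prints 109 93 84, matching the .len_sym_cost fields above. */
	printf("%u %u %u\n", len_sym_cost(0.25), len_sym_cost(0.5), len_sym_cost(0.75));
	/* Prints 144, the last used_lits_to_lit_cost entry for match_prob = 0.5. */
	printf("%u\n", lit_cost(0.5, 256));
	return 0;
}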
- */ - match_freq = 0; - i = choose_min_match_len(num_used_literals, c->max_search_depth); - for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { - match_freq += c->p.n.match_len_freqs[i]; - literal_freq -= i * c->p.n.match_len_freqs[i]; - } - if ((s32)literal_freq < 0) /* shouldn't happen */ - literal_freq = 0; - - if (match_freq > literal_freq) - i = 2; /* many matches */ - else if (match_freq * 4 > literal_freq) - i = 1; /* neutral */ - else - i = 0; /* few matches */ - - STATIC_ASSERT(BIT_COST == 16); - *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ - num_used_literals]; - *len_sym_cost = default_litlen_costs[i].len_sym_cost; + unsigned num_used_literals = 0; + u32 literal_freq = block_length; + u32 match_freq = 0; + u32 cutoff; + u32 i; + + /* Calculate the number of distinct literals that exist in the data. */ + memset(c->freqs.litlen, 0, + DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); + cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ + for (i = 0; i < block_length; i++) + c->freqs.litlen[block_begin[i]]++; + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (c->freqs.litlen[i] > cutoff) + num_used_literals++; + } + if (num_used_literals == 0) + num_used_literals = 1; + + /* + * Estimate the relative frequency of literals and matches in the + * optimal parsing solution. We don't know the optimal solution, so + * this can only be a very rough estimate. Therefore, we basically use + * the match frequency from a greedy parse. We also apply the min_len + * heuristic used by the greedy and lazy parsers, to avoid counting too + * many matches when literals are cheaper than short matches. + */ + match_freq = 0; + i = choose_min_match_len(num_used_literals, c->max_search_depth); + for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + match_freq += c->p.n.match_len_freqs[i]; + literal_freq -= i * c->p.n.match_len_freqs[i]; + } + if ((s32)literal_freq < 0) /* shouldn't happen */ + literal_freq = 0; + + if (match_freq > literal_freq) + i = 2; /* many matches */ + else if (match_freq * 4 > literal_freq) + i = 1; /* neutral */ + else + i = 0; /* few matches */ + + STATIC_ASSERT(BIT_COST == 16); + *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ + num_used_literals]; + *len_sym_cost = default_litlen_costs[i].len_sym_cost; } static forceinline u32 deflate_default_length_cost(unsigned len, u32 len_sym_cost) { - unsigned slot = deflate_length_slot[len]; - u32 num_extra_bits = deflate_extra_length_bits[slot]; - - return len_sym_cost + (num_extra_bits * BIT_COST); + unsigned slot = deflate_length_slot[len]; + u32 num_extra_bits = deflate_extra_length_bits[slot]; + + return len_sym_cost + (num_extra_bits * BIT_COST); } static forceinline u32 deflate_default_offset_slot_cost(unsigned slot) { - u32 num_extra_bits = deflate_extra_offset_bits[slot]; - /* - * Assume that all offset symbols are equally probable. - * The resulting cost is 'int(-log2(1/30) * BIT_COST)', - * where 30 is the number of potentially-used offset symbols. - */ - u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; - - return offset_sym_cost + (num_extra_bits * BIT_COST); + u32 num_extra_bits = deflate_extra_offset_bits[slot]; + /* + * Assume that all offset symbols are equally probable. + * The resulting cost is 'int(-log2(1/30) * BIT_COST)', + * where 30 is the number of potentially-used offset symbols. 
+ */ + u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; + + return offset_sym_cost + (num_extra_bits * BIT_COST); } /* Set default symbol costs for the first block's first optimization pass. */ static void deflate_set_default_costs(struct libdeflate_compressor *c, - u32 lit_cost, u32 len_sym_cost) + u32 lit_cost, u32 len_sym_cost) { - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - c->p.n.costs.literal[i] = lit_cost; - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - c->p.n.costs.length[i] = - deflate_default_length_cost(i, len_sym_cost); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - c->p.n.costs.offset_slot[i] = - deflate_default_offset_slot_cost(i); + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + c->p.n.costs.literal[i] = lit_cost; + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + c->p.n.costs.length[i] = + deflate_default_length_cost(i, len_sym_cost); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + c->p.n.costs.offset_slot[i] = + deflate_default_offset_slot_cost(i); } static forceinline void deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount) { - if (change_amount == 0) - /* Block is very similar to previous; prefer previous costs. */ - *cost_p = (default_cost + 3 * *cost_p) / 4; - else if (change_amount == 1) - *cost_p = (default_cost + *cost_p) / 2; - else if (change_amount == 2) - *cost_p = (5 * default_cost + 3 * *cost_p) / 8; - else - /* Block differs greatly from previous; prefer default costs. */ - *cost_p = (3 * default_cost + *cost_p) / 4; + if (change_amount == 0) + /* Block is very similar to previous; prefer previous costs. */ + *cost_p = (default_cost + 3 * *cost_p) / 4; + else if (change_amount == 1) + *cost_p = (default_cost + *cost_p) / 2; + else if (change_amount == 2) + *cost_p = (5 * default_cost + 3 * *cost_p) / 8; + else + /* Block differs greatly from previous; prefer default costs. 
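As a quick sanity check of the constant above: log2(30) is about 4.907, so int(-log2(1/30) * 16) = 78, and the integer expression 4*BIT_COST + (907*BIT_COST)/1000 = 64 + 14 = 78 as well (note the integer division). A throwaway verification, compiled with -lm:

#include <math.h>
#include <stdio.h>

#define BIT_COST 16

int main(void)
{
	unsigned from_formula = (unsigned)(-log2(1.0 / 30.0) * BIT_COST);
	unsigned from_code    = 4 * BIT_COST + (907 * BIT_COST) / 1000;

	printf("%u %u\n", from_formula, from_code);	/* both print 78 */
	return 0;
}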
*/ + *cost_p = (3 * default_cost + *cost_p) / 4; } static forceinline void deflate_adjust_costs_impl(struct libdeflate_compressor *c, - u32 lit_cost, u32 len_sym_cost, int change_amount) + u32 lit_cost, u32 len_sym_cost, int change_amount) { - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, - change_amount); - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - deflate_adjust_cost(&c->p.n.costs.length[i], - deflate_default_length_cost(i, - len_sym_cost), - change_amount); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - deflate_adjust_cost(&c->p.n.costs.offset_slot[i], - deflate_default_offset_slot_cost(i), - change_amount); + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, + change_amount); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + deflate_adjust_cost(&c->p.n.costs.length[i], + deflate_default_length_cost(i, + len_sym_cost), + change_amount); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + deflate_adjust_cost(&c->p.n.costs.offset_slot[i], + deflate_default_offset_slot_cost(i), + change_amount); } /* @@ -3254,59 +3254,59 @@ deflate_adjust_costs_impl(struct libdeflate_compressor *c, */ static void deflate_adjust_costs(struct libdeflate_compressor *c, - u32 lit_cost, u32 len_sym_cost) + u32 lit_cost, u32 len_sym_cost) { - u64 total_delta = 0; - u64 cutoff; - int i; - - /* - * Decide how different the current block is from the previous block, - * using the block splitting statistics from the current and previous - * blocks. The more different the current block is, the more we prefer - * the default costs rather than the previous block's costs. - * - * The algorithm here is similar to the end-of-block check one, but here - * we compare two entire blocks rather than a partial block with a small - * extra part, and therefore we need 64-bit numbers in some places. - */ - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - u64 prev = (u64)c->p.n.prev_observations[i] * - c->split_stats.num_observations; - u64 cur = (u64)c->split_stats.observations[i] * - c->p.n.prev_num_observations; - - total_delta += prev > cur ? prev - cur : cur - prev; - } - cutoff = ((u64)c->p.n.prev_num_observations * - c->split_stats.num_observations * 200) / 512; - - if (total_delta > 3 * cutoff) - /* Big change in the data; just use the default costs. */ - deflate_set_default_costs(c, lit_cost, len_sym_cost); - else if (4 * total_delta > 9 * cutoff) - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); - else if (2 * total_delta > 3 * cutoff) - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); - else if (2 * total_delta > cutoff) - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); - else - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); + u64 total_delta = 0; + u64 cutoff; + int i; + + /* + * Decide how different the current block is from the previous block, + * using the block splitting statistics from the current and previous + * blocks. The more different the current block is, the more we prefer + * the default costs rather than the previous block's costs. + * + * The algorithm here is similar to the end-of-block check one, but here + * we compare two entire blocks rather than a partial block with a small + * extra part, and therefore we need 64-bit numbers in some places. 
+ */ + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u64 prev = (u64)c->p.n.prev_observations[i] * + c->split_stats.num_observations; + u64 cur = (u64)c->split_stats.observations[i] * + c->p.n.prev_num_observations; + + total_delta += prev > cur ? prev - cur : cur - prev; + } + cutoff = ((u64)c->p.n.prev_num_observations * + c->split_stats.num_observations * 200) / 512; + + if (total_delta > 3 * cutoff) + /* Big change in the data; just use the default costs. */ + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else if (4 * total_delta > 9 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); + else if (2 * total_delta > 3 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); + else if (2 * total_delta > cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); + else + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); } static void deflate_set_initial_costs(struct libdeflate_compressor *c, - const u8 *block_begin, u32 block_length, - bool is_first_block) + const u8 *block_begin, u32 block_length, + bool is_first_block) { - u32 lit_cost, len_sym_cost; - - deflate_choose_default_litlen_costs(c, block_begin, block_length, - &lit_cost, &len_sym_cost); - if (is_first_block) - deflate_set_default_costs(c, lit_cost, len_sym_cost); - else - deflate_adjust_costs(c, lit_cost, len_sym_cost); + u32 lit_cost, len_sym_cost; + + deflate_choose_default_litlen_costs(c, block_begin, block_length, + &lit_cost, &len_sym_cost); + if (is_first_block) + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else + deflate_adjust_costs(c, lit_cost, len_sym_cost); } /* @@ -3325,76 +3325,76 @@ deflate_set_initial_costs(struct libdeflate_compressor *c, */ static void deflate_find_min_cost_path(struct libdeflate_compressor *c, - const u32 block_length, - const struct lz_match *cache_ptr) + const u32 block_length, + const struct lz_match *cache_ptr) { - struct deflate_optimum_node *end_node = - &c->p.n.optimum_nodes[block_length]; - struct deflate_optimum_node *cur_node = end_node; - - cur_node->cost_to_end = 0; - do { - unsigned num_matches; - unsigned literal; - u32 best_cost_to_end; - - cur_node--; - cache_ptr--; - - num_matches = cache_ptr->length; - literal = cache_ptr->offset; - - /* It's always possible to choose a literal. */ - best_cost_to_end = c->p.n.costs.literal[literal] + - (cur_node + 1)->cost_to_end; - cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; - - /* Also consider matches if there are any. */ - if (num_matches) { - const struct lz_match *match; - unsigned len; - unsigned offset; - unsigned offset_slot; - u32 offset_cost; - u32 cost_to_end; - - /* - * Consider each length from the minimum - * (DEFLATE_MIN_MATCH_LEN) to the length of the longest - * match found at this position. For each length, we - * consider only the smallest offset for which that - * length is available. Although this is not guaranteed - * to be optimal due to the possibility of a larger - * offset costing less than a smaller offset to code, - * this is a very useful heuristic. 
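Because the two histograms being compared above may have been built from different numbers of observations, each side is cross-multiplied by the other side's observation count before the absolute differences are summed, and the cutoff is scaled the same way. The hedged sketch below replays that arithmetic for two made-up 4-bucket histograms; the bucket count and values are invented for illustration and do not reflect libdeflate's NUM_OBSERVATION_TYPES.

#include <stdint.h>
#include <stdio.h>

#define N 4	/* invented bucket count, just for this example */

/*
 * Decide how different the current histogram is from the previous one,
 * mirroring the scaled comparison above: 0 = very similar ... 4 = very
 * different (the caller would then blend toward the default costs).
 */
static int
histogram_change_amount(const uint32_t prev[N], uint32_t prev_total,
			const uint32_t cur[N], uint32_t cur_total)
{
	uint64_t total_delta = 0, cutoff;
	int i;

	for (i = 0; i < N; i++) {
		uint64_t p = (uint64_t)prev[i] * cur_total;
		uint64_t c = (uint64_t)cur[i] * prev_total;

		total_delta += p > c ? p - c : c - p;
	}
	cutoff = ((uint64_t)prev_total * cur_total * 200) / 512;

	if (total_delta > 3 * cutoff)
		return 4;	/* big change: just use the default costs */
	if (4 * total_delta > 9 * cutoff)
		return 3;
	if (2 * total_delta > 3 * cutoff)
		return 2;
	if (2 * total_delta > cutoff)
		return 1;
	return 0;
}

int main(void)
{
	uint32_t prev[N] = { 50, 30, 15, 5 }, cur[N] = { 10, 20, 30, 40 };

	/* total_delta = 10000, cutoff = 3906 -> prints 3 (quite different). */
	printf("%d\n", histogram_change_amount(prev, 100, cur, 100));
	return 0;
}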
- */ - match = cache_ptr - num_matches; - len = DEFLATE_MIN_MATCH_LEN; - do { - offset = match->offset; - offset_slot = c->p.n.offset_slot_full[offset]; - offset_cost = - c->p.n.costs.offset_slot[offset_slot]; - do { - cost_to_end = offset_cost + - c->p.n.costs.length[len] + - (cur_node + len)->cost_to_end; - if (cost_to_end < best_cost_to_end) { - best_cost_to_end = cost_to_end; - cur_node->item = len | - ((u32)offset << - OPTIMUM_OFFSET_SHIFT); - } - } while (++len <= match->length); - } while (++match != cache_ptr); - cache_ptr -= num_matches; - } - cur_node->cost_to_end = best_cost_to_end; - } while (cur_node != &c->p.n.optimum_nodes[0]); - - deflate_reset_symbol_frequencies(c); - deflate_tally_item_list(c, block_length); - deflate_make_huffman_codes(&c->freqs, &c->codes); + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + struct deflate_optimum_node *cur_node = end_node; + + cur_node->cost_to_end = 0; + do { + unsigned num_matches; + unsigned literal; + u32 best_cost_to_end; + + cur_node--; + cache_ptr--; + + num_matches = cache_ptr->length; + literal = cache_ptr->offset; + + /* It's always possible to choose a literal. */ + best_cost_to_end = c->p.n.costs.literal[literal] + + (cur_node + 1)->cost_to_end; + cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; + + /* Also consider matches if there are any. */ + if (num_matches) { + const struct lz_match *match; + unsigned len; + unsigned offset; + unsigned offset_slot; + u32 offset_cost; + u32 cost_to_end; + + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the longest + * match found at this position. For each length, we + * consider only the smallest offset for which that + * length is available. Although this is not guaranteed + * to be optimal due to the possibility of a larger + * offset costing less than a smaller offset to code, + * this is a very useful heuristic. 
+ */ + match = cache_ptr - num_matches; + len = DEFLATE_MIN_MATCH_LEN; + do { + offset = match->offset; + offset_slot = c->p.n.offset_slot_full[offset]; + offset_cost = + c->p.n.costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->p.n.costs.length[len] + + (cur_node + len)->cost_to_end; + if (cost_to_end < best_cost_to_end) { + best_cost_to_end = cost_to_end; + cur_node->item = len | + ((u32)offset << + OPTIMUM_OFFSET_SHIFT); + } + } while (++len <= match->length); + } while (++match != cache_ptr); + cache_ptr -= num_matches; + } + cur_node->cost_to_end = best_cost_to_end; + } while (cur_node != &c->p.n.optimum_nodes[0]); + + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, block_length); + deflate_make_huffman_codes(&c->freqs, &c->codes); } /* @@ -3414,139 +3414,139 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c, */ static void deflate_optimize_and_flush_block(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 *block_begin, u32 block_length, - const struct lz_match *cache_ptr, - bool is_first_block, bool is_final_block, - bool *used_only_literals) + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, bool is_final_block, + bool *used_only_literals) { - unsigned num_passes_remaining = c->p.n.max_optim_passes; - u32 best_true_cost = UINT32_MAX; - u32 true_cost; - u32 only_lits_cost; - u32 static_cost = UINT32_MAX; - struct deflate_sequence seq_; - struct deflate_sequence *seq = NULL; - u32 i; - - /* - * On some data, using only literals (no matches) ends up being better - * than what the iterative optimization algorithm produces. Therefore, - * consider using only literals. - */ - deflate_choose_all_literals(c, block_begin, block_length); - only_lits_cost = deflate_compute_true_cost(c); - - /* - * Force the block to really end at the desired length, even if some - * matches extend beyond it. - */ - for (i = block_length; - i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, - ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) - c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; - - /* - * Sometimes a static Huffman block ends up being cheapest, particularly - * if the block is small. So, if the block is sufficiently small, find - * the optimal static block solution and remember its cost. - */ - if (block_length <= c->p.n.max_len_to_optimize_static_block) { - /* Save c->p.n.costs temporarily. */ - c->p.n.costs_saved = c->p.n.costs; - - deflate_set_costs_from_codes(c, &c->static_codes.lens); - deflate_find_min_cost_path(c, block_length, cache_ptr); - static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; - static_cost += 7; /* for the end-of-block symbol */ - - /* Restore c->p.n.costs. */ - c->p.n.costs = c->p.n.costs_saved; - } - - /* Initialize c->p.n.costs with default costs. */ - deflate_set_initial_costs(c, block_begin, block_length, is_first_block); - - do { - /* - * Find the minimum-cost path for this pass. - * Also set c->freqs and c->codes to match the path. - */ - deflate_find_min_cost_path(c, block_length, cache_ptr); - - /* - * Compute the exact cost of the block if the path were to be - * used. Note that this differs from - * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses - * the actual Huffman codes instead of c->p.n.costs. 
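Stripped of the match cache and the DEFLATE-specific cost model, the backward pass above is a standard dynamic-programming recurrence: cost_to_end[i] = min(literal_cost(i) + cost_to_end[i+1], min over available matches of match_cost + cost_to_end[i+len]). The hedged toy sketch below runs that recurrence over an 8-byte block where every literal costs 8 "bits" and each position carries at most one candidate match, a simplification of the real code, which considers every usable length of every cached match.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_LEN 8

struct toy_match {
	unsigned len;	/* 0 = no match available at this position */
	unsigned cost;	/* total bits to code this match */
};

int main(void)
{
	/* Toy input: a length-4 match costing 20 bits is available at
	 * position 2; literals cost 8 bits each. */
	struct toy_match matches[BLOCK_LEN] = { { 0, 0 } };
	uint32_t cost_to_end[BLOCK_LEN + 1];
	int i;

	matches[2].len = 4;
	matches[2].cost = 20;

	cost_to_end[BLOCK_LEN] = 0;
	for (i = BLOCK_LEN - 1; i >= 0; i--) {
		uint32_t best = 8 + cost_to_end[i + 1];		/* literal choice */

		if (matches[i].len && i + matches[i].len <= BLOCK_LEN) {
			uint32_t c = matches[i].cost +
				     cost_to_end[i + matches[i].len];
			if (c < best)
				best = c;
		}
		cost_to_end[i] = best;
	}
	/* 2 literals + the 20-bit match + 2 literals = 16 + 20 + 16 = 52 bits. */
	printf("%u\n", cost_to_end[0]);
	return 0;
}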
- */ - true_cost = deflate_compute_true_cost(c); - - /* - * If the cost didn't improve much from the previous pass, then - * doing more passes probably won't be helpful, so stop early. - */ - if (true_cost + c->p.n.min_improvement_to_continue > - best_true_cost) - break; - - best_true_cost = true_cost; - - /* Save the cost model that gave 'best_true_cost'. */ - c->p.n.costs_saved = c->p.n.costs; - - /* Update the cost model from the Huffman codes. */ - deflate_set_costs_from_codes(c, &c->codes.lens); - - } while (--num_passes_remaining); - - *used_only_literals = false; - if (MIN(only_lits_cost, static_cost) < best_true_cost) { - if (only_lits_cost < static_cost) { - /* Using only literals ended up being best! */ - deflate_choose_all_literals(c, block_begin, block_length); - deflate_set_costs_from_codes(c, &c->codes.lens); - seq_.litrunlen_and_length = block_length; - seq = &seq_; - *used_only_literals = true; - } else { - /* Static block ended up being best! */ - deflate_set_costs_from_codes(c, &c->static_codes.lens); - deflate_find_min_cost_path(c, block_length, cache_ptr); - } - } else if (true_cost >= - best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { - /* - * The best solution was actually from a non-final optimization - * pass, so recover and use the min-cost path from that pass. - */ - c->p.n.costs = c->p.n.costs_saved; - deflate_find_min_cost_path(c, block_length, cache_ptr); - deflate_set_costs_from_codes(c, &c->codes.lens); - } - deflate_flush_block(c, os, block_begin, block_length, seq, - is_final_block); + unsigned num_passes_remaining = c->p.n.max_optim_passes; + u32 best_true_cost = UINT32_MAX; + u32 true_cost; + u32 only_lits_cost; + u32 static_cost = UINT32_MAX; + struct deflate_sequence seq_; + struct deflate_sequence *seq = NULL; + u32 i; + + /* + * On some data, using only literals (no matches) ends up being better + * than what the iterative optimization algorithm produces. Therefore, + * consider using only literals. + */ + deflate_choose_all_literals(c, block_begin, block_length); + only_lits_cost = deflate_compute_true_cost(c); + + /* + * Force the block to really end at the desired length, even if some + * matches extend beyond it. + */ + for (i = block_length; + i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, + ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) + c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; + + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ + + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + + /* Initialize c->p.n.costs with default costs. */ + deflate_set_initial_costs(c, block_begin, block_length, is_first_block); + + do { + /* + * Find the minimum-cost path for this pass. + * Also set c->freqs and c->codes to match the path. + */ + deflate_find_min_cost_path(c, block_length, cache_ptr); + + /* + * Compute the exact cost of the block if the path were to be + * used. 
Note that this differs from + * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses + * the actual Huffman codes instead of c->p.n.costs. + */ + true_cost = deflate_compute_true_cost(c); + + /* + * If the cost didn't improve much from the previous pass, then + * doing more passes probably won't be helpful, so stop early. + */ + if (true_cost + c->p.n.min_improvement_to_continue > + best_true_cost) + break; + + best_true_cost = true_cost; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; + + /* Update the cost model from the Huffman codes. */ + deflate_set_costs_from_codes(c, &c->codes.lens); + + } while (--num_passes_remaining); + + *used_only_literals = false; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); + deflate_set_costs_from_codes(c, &c->codes.lens); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } + } else if (true_cost >= + best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { + /* + * The best solution was actually from a non-final optimization + * pass, so recover and use the min-cost path from that pass. + */ + c->p.n.costs = c->p.n.costs_saved; + deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_set_costs_from_codes(c, &c->codes.lens); + } + deflate_flush_block(c, os, block_begin, block_length, seq, + is_final_block); } static void deflate_near_optimal_init_stats(struct libdeflate_compressor *c) { - init_block_split_stats(&c->split_stats); - memset(c->p.n.new_match_len_freqs, 0, - sizeof(c->p.n.new_match_len_freqs)); - memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); + init_block_split_stats(&c->split_stats); + memset(c->p.n.new_match_len_freqs, 0, + sizeof(c->p.n.new_match_len_freqs)); + memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); } static void deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) { - unsigned i; - - merge_new_observations(&c->split_stats); - for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { - c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; - c->p.n.new_match_len_freqs[i] = 0; - } + unsigned i; + + merge_new_observations(&c->split_stats); + for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; + c->p.n.new_match_len_freqs[i] = 0; + } } /* @@ -3557,22 +3557,22 @@ deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) static void deflate_near_optimal_save_stats(struct libdeflate_compressor *c) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) - c->p.n.prev_observations[i] = c->split_stats.observations[i]; - c->p.n.prev_num_observations = c->split_stats.num_observations; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) + c->p.n.prev_observations[i] = c->split_stats.observations[i]; + c->p.n.prev_num_observations = c->split_stats.num_observations; } static void deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) - c->split_stats.observations[i] = 0; - c->split_stats.num_observations = 0; - memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); + int i; + + for (i = 0; i < 
NUM_OBSERVATION_TYPES; i++) + c->split_stats.observations[i] = 0; + c->split_stats.num_observations = 0; + memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); } /* @@ -3590,538 +3590,538 @@ deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) */ static void deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - const u8 *in_next = in; - const u8 *in_block_begin = in_next; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - const u8 *in_next_slide = - in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - struct lz_match *cache_ptr = c->p.n.match_cache; - u32 next_hashes[2] = {0, 0}; - bool prev_block_used_only_literals = false; - - bt_matchfinder_init(&c->p.n.bt_mf); - deflate_near_optimal_init_stats(c); - - do { - /* Starting a new DEFLATE block */ - const u8 * const in_max_block_end = choose_max_block_end( - in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); - const u8 *prev_end_block_check = NULL; - bool change_detected = false; - const u8 *next_observation = in_next; - unsigned min_len; - - /* - * Use the minimum match length heuristic to improve the - * literal/match statistics gathered during matchfinding. - * However, the actual near-optimal parse won't respect min_len, - * as it can accurately assess the costs of different matches. - * - * If the "use only literals" strategy happened to be the best - * strategy on the previous block, then probably the - * min_match_len heuristic is still not aggressive enough for - * the data, so force gathering literal stats only. - */ - if (prev_block_used_only_literals) - min_len = DEFLATE_MAX_MATCH_LEN + 1; - else - min_len = calculate_min_match_len( - in_block_begin, - in_max_block_end - in_block_begin, - c->max_search_depth); - - /* - * Find matches until we decide to end the block. We end the - * block if any of the following is true: - * - * (1) Maximum block length has been reached - * (2) Match catch may overflow. - * (3) Block split heuristic says to split now. - */ - for (;;) { - struct lz_match *matches; - unsigned best_len; - size_t remaining = in_end - in_next; - - /* Slide the window forward if needed. */ - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + - MIN(remaining, MATCHFINDER_WINDOW_SIZE); - } - - /* - * Find matches with the current position using the - * binary tree matchfinder and save them in match_cache. - * - * Note: the binary tree matchfinder is more suited for - * optimal parsing than the hash chain matchfinder. The - * reasons for this include: - * - * - The binary tree matchfinder can find more matches - * in the same number of steps. - * - One of the major advantages of hash chains is that - * skipping positions (not searching for matches at - * them) is faster; however, with optimal parsing we - * search for matches at almost all positions, so this - * advantage of hash chains is negated. 
- */ - matches = cache_ptr; - best_len = 0; - adjust_max_and_nice_len(&max_len, &nice_len, remaining); - if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { - cache_ptr = bt_matchfinder_get_matches( - &c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - matches); - if (cache_ptr > matches) - best_len = cache_ptr[-1].length; - } - if (in_next >= next_observation) { - if (best_len >= min_len) { - observe_match(&c->split_stats, - best_len); - next_observation = in_next + best_len; - c->p.n.new_match_len_freqs[best_len]++; - } else { - observe_literal(&c->split_stats, - *in_next); - next_observation = in_next + 1; - } - } - - cache_ptr->length = cache_ptr - matches; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - - /* - * If there was a very long match found, don't cache any - * matches for the bytes covered by that match. This - * avoids degenerate behavior when compressing highly - * redundant data, where the number of matches can be - * very large. - * - * This heuristic doesn't actually hurt the compression - * ratio very much. If there's a long match, then the - * data must be highly compressible, so it doesn't - * matter much what we do. - */ - if (best_len >= DEFLATE_MIN_MATCH_LEN && - best_len >= nice_len) { - --best_len; - do { - remaining = in_end - in_next; - if (in_next == in_next_slide) { - bt_matchfinder_slide_window( - &c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + - MIN(remaining, - MATCHFINDER_WINDOW_SIZE); - } - adjust_max_and_nice_len(&max_len, - &nice_len, - remaining); - if (max_len >= - BT_MATCHFINDER_REQUIRED_NBYTES) { - bt_matchfinder_skip_byte( - &c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - nice_len, - c->max_search_depth, - next_hashes); - } - cache_ptr->length = 0; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - } while (--best_len); - } - /* Maximum block length or end of input reached? */ - if (in_next >= in_max_block_end) - break; - /* Match cache overflowed? */ - if (cache_ptr >= - &c->p.n.match_cache[MATCH_CACHE_LENGTH]) - break; - /* Not ready to try to end the block (again)? */ - if (!ready_to_check_block(&c->split_stats, - in_block_begin, in_next, - in_end)) - continue; - /* Check if it would be worthwhile to end the block. */ - if (do_end_block_check(&c->split_stats, - in_next - in_block_begin)) { - change_detected = true; - break; - } - /* Ending the block doesn't seem worthwhile here. */ - deflate_near_optimal_merge_stats(c); - prev_end_block_check = in_next; - } - /* - * All the matches for this block have been cached. Now choose - * the precise end of the block and the sequence of items to - * output to represent it, then flush the block. - */ - if (change_detected && prev_end_block_check != NULL) { - /* - * The block is being ended because a recent chunk of - * data differs from the rest of the block. We could - * end the block at 'in_next' like the greedy and lazy - * compressors do, but that's not ideal since it would - * include the differing chunk in the block. The - * near-optimal compressor has time to do a better job. - * Therefore, we rewind to just before the chunk, and - * output a block that only goes up to there. - * - * We then set things up to correctly start the next - * block, considering that some work has already been - * done on it (some matches found and stats gathered). 
- */ - struct lz_match *orig_cache_ptr = cache_ptr; - const u8 *in_block_end = prev_end_block_check; - u32 block_length = in_block_end - in_block_begin; - bool is_first = (in_block_begin == in); - bool is_final = false; - u32 num_bytes_to_rewind = in_next - in_block_end; - size_t cache_len_rewound; - - /* Rewind the match cache. */ - do { - cache_ptr--; - cache_ptr -= cache_ptr->length; - } while (--num_bytes_to_rewind); - cache_len_rewound = orig_cache_ptr - cache_ptr; - - deflate_optimize_and_flush_block( - c, os, in_block_begin, - block_length, cache_ptr, - is_first, is_final, - &prev_block_used_only_literals); - memmove(c->p.n.match_cache, cache_ptr, - cache_len_rewound * sizeof(*cache_ptr)); - cache_ptr = &c->p.n.match_cache[cache_len_rewound]; - deflate_near_optimal_save_stats(c); - /* - * Clear the stats for the just-flushed block, leaving - * just the stats for the beginning of the next block. - */ - deflate_near_optimal_clear_old_stats(c); - in_block_begin = in_block_end; - } else { - /* - * The block is being ended for a reason other than a - * differing data chunk being detected. Don't rewind at - * all; just end the block at the current position. - */ - u32 block_length = in_next - in_block_begin; - bool is_first = (in_block_begin == in); - bool is_final = (in_next == in_end); - - deflate_near_optimal_merge_stats(c); - deflate_optimize_and_flush_block( - c, os, in_block_begin, - block_length, cache_ptr, - is_first, is_final, - &prev_block_used_only_literals); - cache_ptr = &c->p.n.match_cache[0]; - deflate_near_optimal_save_stats(c); - deflate_near_optimal_init_stats(c); - in_block_begin = in_next; - } - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_block_begin = in_next; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + const u8 *in_next_slide = + in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + struct lz_match *cache_ptr = c->p.n.match_cache; + u32 next_hashes[2] = {0, 0}; + bool prev_block_used_only_literals = false; + + bt_matchfinder_init(&c->p.n.bt_mf); + deflate_near_optimal_init_stats(c); + + do { + /* Starting a new DEFLATE block */ + const u8 * const in_max_block_end = choose_max_block_end( + in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *prev_end_block_check = NULL; + bool change_detected = false; + const u8 *next_observation = in_next; + unsigned min_len; + + /* + * Use the minimum match length heuristic to improve the + * literal/match statistics gathered during matchfinding. + * However, the actual near-optimal parse won't respect min_len, + * as it can accurately assess the costs of different matches. + * + * If the "use only literals" strategy happened to be the best + * strategy on the previous block, then probably the + * min_match_len heuristic is still not aggressive enough for + * the data, so force gathering literal stats only. + */ + if (prev_block_used_only_literals) + min_len = DEFLATE_MAX_MATCH_LEN + 1; + else + min_len = calculate_min_match_len( + in_block_begin, + in_max_block_end - in_block_begin, + c->max_search_depth); + + /* + * Find matches until we decide to end the block. We end the + * block if any of the following is true: + * + * (1) Maximum block length has been reached + * (2) Match catch may overflow. + * (3) Block split heuristic says to split now. 
+ */ + for (;;) { + struct lz_match *matches; + unsigned best_len; + size_t remaining = in_end - in_next; + + /* Slide the window forward if needed. */ + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, MATCHFINDER_WINDOW_SIZE); + } + + /* + * Find matches with the current position using the + * binary tree matchfinder and save them in match_cache. + * + * Note: the binary tree matchfinder is more suited for + * optimal parsing than the hash chain matchfinder. The + * reasons for this include: + * + * - The binary tree matchfinder can find more matches + * in the same number of steps. + * - One of the major advantages of hash chains is that + * skipping positions (not searching for matches at + * them) is faster; however, with optimal parsing we + * search for matches at almost all positions, so this + * advantage of hash chains is negated. + */ + matches = cache_ptr; + best_len = 0; + adjust_max_and_nice_len(&max_len, &nice_len, remaining); + if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { + cache_ptr = bt_matchfinder_get_matches( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + matches); + if (cache_ptr > matches) + best_len = cache_ptr[-1].length; + } + if (in_next >= next_observation) { + if (best_len >= min_len) { + observe_match(&c->split_stats, + best_len); + next_observation = in_next + best_len; + c->p.n.new_match_len_freqs[best_len]++; + } else { + observe_literal(&c->split_stats, + *in_next); + next_observation = in_next + 1; + } + } + + cache_ptr->length = cache_ptr - matches; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + + /* + * If there was a very long match found, don't cache any + * matches for the bytes covered by that match. This + * avoids degenerate behavior when compressing highly + * redundant data, where the number of matches can be + * very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter much what we do. + */ + if (best_len >= DEFLATE_MIN_MATCH_LEN && + best_len >= nice_len) { + --best_len; + do { + remaining = in_end - in_next; + if (in_next == in_next_slide) { + bt_matchfinder_slide_window( + &c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, + MATCHFINDER_WINDOW_SIZE); + } + adjust_max_and_nice_len(&max_len, + &nice_len, + remaining); + if (max_len >= + BT_MATCHFINDER_REQUIRED_NBYTES) { + bt_matchfinder_skip_byte( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + nice_len, + c->max_search_depth, + next_hashes); + } + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); + } + /* Maximum block length or end of input reached? */ + if (in_next >= in_max_block_end) + break; + /* Match cache overflowed? */ + if (cache_ptr >= + &c->p.n.match_cache[MATCH_CACHE_LENGTH]) + break; + /* Not ready to try to end the block (again)? */ + if (!ready_to_check_block(&c->split_stats, + in_block_begin, in_next, + in_end)) + continue; + /* Check if it would be worthwhile to end the block. */ + if (do_end_block_check(&c->split_stats, + in_next - in_block_begin)) { + change_detected = true; + break; + } + /* Ending the block doesn't seem worthwhile here. 
*/ + deflate_near_optimal_merge_stats(c); + prev_end_block_check = in_next; + } + /* + * All the matches for this block have been cached. Now choose + * the precise end of the block and the sequence of items to + * output to represent it, then flush the block. + */ + if (change_detected && prev_end_block_check != NULL) { + /* + * The block is being ended because a recent chunk of + * data differs from the rest of the block. We could + * end the block at 'in_next' like the greedy and lazy + * compressors do, but that's not ideal since it would + * include the differing chunk in the block. The + * near-optimal compressor has time to do a better job. + * Therefore, we rewind to just before the chunk, and + * output a block that only goes up to there. + * + * We then set things up to correctly start the next + * block, considering that some work has already been + * done on it (some matches found and stats gathered). + */ + struct lz_match *orig_cache_ptr = cache_ptr; + const u8 *in_block_end = prev_end_block_check; + u32 block_length = in_block_end - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = false; + u32 num_bytes_to_rewind = in_next - in_block_end; + size_t cache_len_rewound; + + /* Rewind the match cache. */ + do { + cache_ptr--; + cache_ptr -= cache_ptr->length; + } while (--num_bytes_to_rewind); + cache_len_rewound = orig_cache_ptr - cache_ptr; + + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + memmove(c->p.n.match_cache, cache_ptr, + cache_len_rewound * sizeof(*cache_ptr)); + cache_ptr = &c->p.n.match_cache[cache_len_rewound]; + deflate_near_optimal_save_stats(c); + /* + * Clear the stats for the just-flushed block, leaving + * just the stats for the beginning of the next block. + */ + deflate_near_optimal_clear_old_stats(c); + in_block_begin = in_block_end; + } else { + /* + * The block is being ended for a reason other than a + * differing data chunk being detected. Don't rewind at + * all; just end the block at the current position. + */ + u32 block_length = in_next - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = (in_next == in_end); + + deflate_near_optimal_merge_stats(c); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + cache_ptr = &c->p.n.match_cache[0]; + deflate_near_optimal_save_stats(c); + deflate_near_optimal_init_stats(c); + in_block_begin = in_next; + } + } while (in_next != in_end && !os->overflow); } /* Initialize c->p.n.offset_slot_full. 
*/ static void deflate_init_offset_slot_full(struct libdeflate_compressor *c) { - unsigned offset_slot; - unsigned offset; - unsigned offset_end; - - for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); - offset_slot++) { - offset = deflate_offset_slot_base[offset_slot]; - offset_end = offset + - (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->p.n.offset_slot_full[offset] = offset_slot; - } while (++offset != offset_end); - } + unsigned offset_slot; + unsigned offset; + unsigned offset_end; + + for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); + offset_slot++) { + offset = deflate_offset_slot_base[offset_slot]; + offset_end = offset + + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->p.n.offset_slot_full[offset] = offset_slot; + } while (++offset != offset_end); + } } #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ LIBDEFLATEAPI struct libdeflate_compressor * libdeflate_alloc_compressor_ex(int compression_level, - const struct libdeflate_options *options) + const struct libdeflate_options *options) { - struct libdeflate_compressor *c; - size_t size = offsetof(struct libdeflate_compressor, p); - - check_buildtime_parameters(); - - /* - * Note: if more fields are added to libdeflate_options, this code will - * need to be updated to support both the old and new structs. - */ - if (options->sizeof_options != sizeof(*options)) - return NULL; - - if (compression_level < 0 || compression_level > 12) - return NULL; - + struct libdeflate_compressor *c; + size_t size = offsetof(struct libdeflate_compressor, p); + + check_buildtime_parameters(); + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + if (compression_level < 0 || compression_level > 12) + return NULL; + #if SUPPORT_NEAR_OPTIMAL_PARSING - if (compression_level >= 10) - size += sizeof(c->p.n); - else + if (compression_level >= 10) + size += sizeof(c->p.n); + else #endif - { - if (compression_level >= 2) - size += sizeof(c->p.g); - else if (compression_level == 1) - size += sizeof(c->p.f); - } - - c = libdeflate_aligned_malloc(options->malloc_func ? - options->malloc_func : - libdeflate_default_malloc_func, - MATCHFINDER_MEM_ALIGNMENT, size); - if (!c) - return NULL; - c->free_func = options->free_func ? - options->free_func : libdeflate_default_free_func; - - c->compression_level = compression_level; - - /* - * The higher the compression level, the more we should bother trying to - * compress very small inputs. - */ - c->max_passthrough_size = 55 - (compression_level * 4); - - switch (compression_level) { - case 0: - c->max_passthrough_size = SIZE_MAX; - c->impl = NULL; /* not used */ - break; - case 1: - c->impl = deflate_compress_fastest; - /* max_search_depth is unused. 
*/ - c->nice_match_length = 32; - break; - case 2: - c->impl = deflate_compress_greedy; - c->max_search_depth = 6; - c->nice_match_length = 10; - break; - case 3: - c->impl = deflate_compress_greedy; - c->max_search_depth = 12; - c->nice_match_length = 14; - break; - case 4: - c->impl = deflate_compress_greedy; - c->max_search_depth = 16; - c->nice_match_length = 30; - break; - case 5: - c->impl = deflate_compress_lazy; - c->max_search_depth = 16; - c->nice_match_length = 30; - break; - case 6: - c->impl = deflate_compress_lazy; - c->max_search_depth = 35; - c->nice_match_length = 65; - break; - case 7: - c->impl = deflate_compress_lazy; - c->max_search_depth = 100; - c->nice_match_length = 130; - break; - case 8: - c->impl = deflate_compress_lazy2; - c->max_search_depth = 300; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - break; - case 9: + { + if (compression_level >= 2) + size += sizeof(c->p.g); + else if (compression_level == 1) + size += sizeof(c->p.f); + } + + c = libdeflate_aligned_malloc(options->malloc_func ? + options->malloc_func : + libdeflate_default_malloc_func, + MATCHFINDER_MEM_ALIGNMENT, size); + if (!c) + return NULL; + c->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; + + c->compression_level = compression_level; + + /* + * The higher the compression level, the more we should bother trying to + * compress very small inputs. + */ + c->max_passthrough_size = 55 - (compression_level * 4); + + switch (compression_level) { + case 0: + c->max_passthrough_size = SIZE_MAX; + c->impl = NULL; /* not used */ + break; + case 1: + c->impl = deflate_compress_fastest; + /* max_search_depth is unused. */ + c->nice_match_length = 32; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 6; + c->nice_match_length = 10; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 14; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 35; + c->nice_match_length = 65; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 100; + c->nice_match_length = 130; + break; + case 8: + c->impl = deflate_compress_lazy2; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; + case 9: #if !SUPPORT_NEAR_OPTIMAL_PARSING - default: + default: #endif - c->impl = deflate_compress_lazy2; - c->max_search_depth = 600; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - break; + c->impl = deflate_compress_lazy2; + c->max_search_depth = 600; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; #if SUPPORT_NEAR_OPTIMAL_PARSING - case 10: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 35; - c->nice_match_length = 75; - c->p.n.max_optim_passes = 2; - c->p.n.min_improvement_to_continue = 32; - c->p.n.min_bits_to_use_nonfinal_path = 32; - c->p.n.max_len_to_optimize_static_block = 0; - deflate_init_offset_slot_full(c); - break; - case 11: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 100; - c->nice_match_length = 150; - c->p.n.max_optim_passes = 4; - c->p.n.min_improvement_to_continue = 16; - c->p.n.min_bits_to_use_nonfinal_path = 16; - c->p.n.max_len_to_optimize_static_block = 1000; - deflate_init_offset_slot_full(c); - break; - case 12: - default: - c->impl = 
deflate_compress_near_optimal; - c->max_search_depth = 300; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - c->p.n.max_optim_passes = 10; - c->p.n.min_improvement_to_continue = 1; - c->p.n.min_bits_to_use_nonfinal_path = 1; - c->p.n.max_len_to_optimize_static_block = 10000; - deflate_init_offset_slot_full(c); - break; + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 35; + c->nice_match_length = 75; + c->p.n.max_optim_passes = 2; + c->p.n.min_improvement_to_continue = 32; + c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; + deflate_init_offset_slot_full(c); + break; + case 11: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 150; + c->p.n.max_optim_passes = 4; + c->p.n.min_improvement_to_continue = 16; + c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; + deflate_init_offset_slot_full(c); + break; + case 12: + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + c->p.n.max_optim_passes = 10; + c->p.n.min_improvement_to_continue = 1; + c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; + deflate_init_offset_slot_full(c); + break; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - } - - deflate_init_static_codes(c); - - return c; + } + + deflate_init_static_codes(c); + + return c; } LIBDEFLATEAPI struct libdeflate_compressor * libdeflate_alloc_compressor(int compression_level) { - static const struct libdeflate_options defaults = { - .sizeof_options = sizeof(defaults), - }; - return libdeflate_alloc_compressor_ex(compression_level, &defaults); + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); } LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) { - struct deflate_output_bitstream os; - - /* - * For extremely short inputs, or for compression level 0, just output - * uncompressed blocks. - */ - if (unlikely(in_nbytes <= c->max_passthrough_size)) - return deflate_compress_none(in, in_nbytes, - out, out_nbytes_avail); - - /* Initialize the output bitstream structure. */ - os.bitbuf = 0; - os.bitcount = 0; - os.next = out; - os.end = os.next + out_nbytes_avail; - os.overflow = false; - - /* Call the actual compression function. */ - (*c->impl)(c, in, in_nbytes, &os); - - /* Return 0 if the output buffer is too small. */ - if (os.overflow) - return 0; - - /* - * Write the final byte if needed. This can't overflow the output - * buffer because deflate_flush_block() would have set the overflow flag - * if there wasn't enough space remaining for the full final block. - */ - ASSERT(os.bitcount <= 7); - if (os.bitcount) { - ASSERT(os.next < os.end); - *os.next++ = os.bitbuf; - } - - /* Return the compressed size in bytes. */ - return os.next - (u8 *)out; + struct deflate_output_bitstream os; + + /* + * For extremely short inputs, or for compression level 0, just output + * uncompressed blocks. + */ + if (unlikely(in_nbytes <= c->max_passthrough_size)) + return deflate_compress_none(in, in_nbytes, + out, out_nbytes_avail); + + /* Initialize the output bitstream structure. 
*/ + os.bitbuf = 0; + os.bitcount = 0; + os.next = out; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ + (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. */ + if (os.overflow) + return 0; + + /* + * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. + */ + ASSERT(os.bitcount <= 7); + if (os.bitcount) { + ASSERT(os.next < os.end); + *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ + return os.next - (u8 *)out; } LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *c) { - if (c) - libdeflate_aligned_free(c->free_func, c); + if (c) + libdeflate_aligned_free(c->free_func, c); } unsigned int libdeflate_get_compression_level(struct libdeflate_compressor *c) { - return c->compression_level; + return c->compression_level; } LIBDEFLATEAPI size_t libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) + size_t in_nbytes) { - size_t max_blocks; - - /* - * Since the compressor never uses a compressed block when an - * uncompressed block is cheaper, the worst case can be no worse than - * the case where only uncompressed blocks are used. - * - * This is true even though up to 7 bits are "wasted" to byte-align the - * bitstream when a compressed block is followed by an uncompressed - * block. This is because a compressed block wouldn't have been used if - * it wasn't cheaper than an uncompressed block, and uncompressed blocks - * always end on a byte boundary. So the alignment bits will, at worst, - * go up to the place where the uncompressed block would have ended. - */ - - /* - * Calculate the maximum number of uncompressed blocks that the - * compressor can use for 'in_nbytes' of data. - * - * The minimum length that is passed to deflate_flush_block() is - * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If - * deflate_flush_block() decides to use an uncompressed block, it - * actually will (in general) output a series of uncompressed blocks in - * order to stay within the UINT16_MAX limit of DEFLATE. But this can - * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', - * as in that case this behavior can't result in more blocks than the - * case where deflate_flush_block() is called with min-length inputs. - * - * So the number of uncompressed blocks needed would be bounded by - * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs - * need 1 (empty) block, which gives the final expression below. - */ - STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); - max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); - - /* - * Each uncompressed block has 5 bytes of overhead, for the BFINAL, - * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the - * alignment bits at the very start of the block can be disregarded; - * they would otherwise increase the overhead to 6 bytes per block.) - * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. - * To get the final bound, add the number of uncompressed bytes. - */ - return (5 * max_blocks) + in_nbytes; + size_t max_blocks; + + /* + * Since the compressor never uses a compressed block when an + * uncompressed block is cheaper, the worst case can be no worse than + * the case where only uncompressed blocks are used. 
+ * + * This is true even though up to 7 bits are "wasted" to byte-align the + * bitstream when a compressed block is followed by an uncompressed + * block. This is because a compressed block wouldn't have been used if + * it wasn't cheaper than an uncompressed block, and uncompressed blocks + * always end on a byte boundary. So the alignment bits will, at worst, + * go up to the place where the uncompressed block would have ended. + */ + + /* + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. + * + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it + * actually will (in general) output a series of uncompressed blocks in + * order to stay within the UINT16_MAX limit of DEFLATE. But this can + * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', + * as in that case this behavior can't result in more blocks than the + * case where deflate_flush_block() is called with min-length inputs. + * + * So the number of uncompressed blocks needed would be bounded by + * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs + * need 1 (empty) block, which gives the final expression below. + */ + STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); + max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); + + /* + * Each uncompressed block has 5 bytes of overhead, for the BFINAL, + * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the + * alignment bits at the very start of the block can be disregarded; + * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. + */ + return (5 * max_blocks) + in_nbytes; } diff --git a/Sources/DEFLATE/deflate_constants.h b/Sources/DEFLATE/deflate_constants.h index 95c9e0a5..a2da4baa 100644 --- a/Sources/DEFLATE/deflate_constants.h +++ b/Sources/DEFLATE/deflate_constants.h @@ -6,51 +6,51 @@ #define LIB_DEFLATE_CONSTANTS_H /* Valid block types */ -#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 -#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 -#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 /* Minimum and maximum supported match lengths (in bytes) */ -#define DEFLATE_MIN_MATCH_LEN 3 -#define DEFLATE_MAX_MATCH_LEN 258 +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 /* Maximum supported match offset (in bytes) */ -#define DEFLATE_MAX_MATCH_OFFSET 32768 +#define DEFLATE_MAX_MATCH_OFFSET 32768 /* log2 of DEFLATE_MAX_MATCH_OFFSET */ -#define DEFLATE_WINDOW_ORDER 15 +#define DEFLATE_WINDOW_ORDER 15 /* Number of symbols in each Huffman code. Note: for the literal/length * and offset codes, these are actually the maximum values; a given block * might use fewer symbols. 
*/ -#define DEFLATE_NUM_PRECODE_SYMS 19 -#define DEFLATE_NUM_LITLEN_SYMS 288 -#define DEFLATE_NUM_OFFSET_SYMS 32 +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 /* The maximum number of symbols across all codes */ -#define DEFLATE_MAX_NUM_SYMS 288 +#define DEFLATE_MAX_NUM_SYMS 288 /* Division of symbols in the literal/length code */ -#define DEFLATE_NUM_LITERALS 256 -#define DEFLATE_END_OF_BLOCK 256 -#define DEFLATE_FIRST_LEN_SYM 257 +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_FIRST_LEN_SYM 257 /* Maximum codeword length, in bits, within each Huffman code */ -#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 -#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 -#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 /* The maximum codeword length across all codes */ -#define DEFLATE_MAX_CODEWORD_LEN 15 +#define DEFLATE_MAX_CODEWORD_LEN 15 /* Maximum possible overrun when decoding codeword lengths */ -#define DEFLATE_MAX_LENS_OVERRUN 137 +#define DEFLATE_MAX_LENS_OVERRUN 137 /* * Maximum number of extra bits that may be required to represent a match * length or offset. */ -#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 -#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 #endif /* LIB_DEFLATE_CONSTANTS_H */ diff --git a/Sources/DEFLATE/deflate_decompress.c b/Sources/DEFLATE/deflate_decompress.c index 63726c7a..11a49d69 100644 --- a/Sources/DEFLATE/deflate_decompress.c +++ b/Sources/DEFLATE/deflate_decompress.c @@ -55,44 +55,44 @@ */ #if 0 # pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") -# define SAFETY_CHECK(expr) (void)(expr) +# define SAFETY_CHECK(expr) (void)(expr) #else -# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA +# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA #endif /***************************************************************************** - * Input bitstream * + * Input bitstream * *****************************************************************************/ /* * The state of the "input bitstream" consists of the following variables: * - * - in_next: a pointer to the next unread byte in the input buffer + * - in_next: a pointer to the next unread byte in the input buffer * - * - in_end: a pointer to just past the end of the input buffer + * - in_end: a pointer to just past the end of the input buffer * - * - bitbuf: a word-sized variable containing bits that have been read from - * the input buffer or from the implicit appended zero bytes + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer or from the implicit appended zero bytes * - * - bitsleft: the number of bits in 'bitbuf' available to be consumed. - * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually - * contain more bits than this. However, only the bits counted - * by 'bitsleft' can actually be consumed; the rest can only be - * used for preloading. + * - bitsleft: the number of bits in 'bitbuf' available to be consumed. + * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually + * contain more bits than this. However, only the bits counted + * by 'bitsleft' can actually be consumed; the rest can only be + * used for preloading. 
* - * As a micro-optimization, we allow bits 8 and higher of - * 'bitsleft' to contain garbage. When consuming the bits - * associated with a decode table entry, this allows us to do - * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. - * On some CPUs, this helps reduce instruction dependencies. - * This does have the disadvantage that 'bitsleft' sometimes - * needs to be cast to 'u8', such as when it's used as a shift - * amount in REFILL_BITS_BRANCHLESS(). But that one happens - * for free since most CPUs ignore high bits in shift amounts. + * As a micro-optimization, we allow bits 8 and higher of + * 'bitsleft' to contain garbage. When consuming the bits + * associated with a decode table entry, this allows us to do + * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. + * On some CPUs, this helps reduce instruction dependencies. + * This does have the disadvantage that 'bitsleft' sometimes + * needs to be cast to 'u8', such as when it's used as a shift + * amount in REFILL_BITS_BRANCHLESS(). But that one happens + * for free since most CPUs ignore high bits in shift amounts. * - * - overread_count: the total number of implicit appended zero bytes that - * have been loaded into the bitbuffer, including any - * counted by 'bitsleft' and any already consumed + * - overread_count: the total number of implicit appended zero bytes that + * have been loaded into the bitbuffer, including any + * counted by 'bitsleft' and any already consumed */ /* @@ -103,18 +103,18 @@ * which they don't have to refill as often. */ typedef machine_word_t bitbuf_t; -#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) +#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) /* BITMASK(n) returns a bitmask of length 'n'. */ -#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) +#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) /* * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()). */ -#define MAX_BITSLEFT \ - (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) +#define MAX_BITSLEFT \ +(UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) /* * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be @@ -122,7 +122,7 @@ typedef machine_word_t bitbuf_t; * Since only whole bytes can be added to 'bitsleft', the worst case is * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit. */ -#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) +#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) /* * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed @@ -132,8 +132,8 @@ typedef machine_word_t bitbuf_t; * number of consumable bits (counted by 'bitsleft'). Any bits not counted in * 'bitsleft' can only be used for precomputation and cannot be consumed. */ -#define FASTLOOP_PRELOADABLE_NBITS \ - (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) +#define FASTLOOP_PRELOADABLE_NBITS \ +(UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) /* * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be @@ -141,14 +141,14 @@ typedef machine_word_t bitbuf_t; * subsequent consumptions. This is 1 bit if the branchless refill method is * being used, and 0 bits otherwise. 
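 *
 * As a concrete illustration (assuming a typical 64-bit build with fast
 * unaligned access, so that bitbuf_t is 8 bytes), the constants above work
 * out to: BITBUF_NBITS = 64, MAX_BITSLEFT = 63, CONSUMABLE_NBITS = 56,
 * FASTLOOP_PRELOADABLE_NBITS = 64, and PRELOAD_SLACK = 64 - 63 = 1.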
*/ -#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) +#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) /* * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been * refilled, then it's always possible to consume 'n' bits from it. 'n' should * be a compile-time constant, to enable compile-time evaluation. */ -#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) +#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) /* * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's @@ -156,9 +156,9 @@ typedef machine_word_t bitbuf_t; * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The * arguments should be compile-time constants to enable compile-time evaluation. */ -#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ - (CONSUMABLE_NBITS >= (consume_nbits) && \ - FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) +#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ +(CONSUMABLE_NBITS >= (consume_nbits) && \ +FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) /* * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by @@ -169,13 +169,13 @@ typedef machine_word_t bitbuf_t; * * The simplest way of branchlessly updating 'bitsleft' would be: * - * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; + * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; * * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update * 'bitsleft' by just setting the bits above the low 3 bits: * - * bitsleft |= MAX_BITSLEFT & ~7; + * bitsleft |= MAX_BITSLEFT & ~7; * * That compiles down to a single instruction like 'or $0x38, %rbp'. Using * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be @@ -183,17 +183,17 @@ typedef machine_word_t bitbuf_t; * * The simplest way of branchlessly updating 'in_next' would be: * - * in_next += (MAX_BITSLEFT - bitsleft) >> 3; + * in_next += (MAX_BITSLEFT - bitsleft) >> 3; * * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this * isn't really better: * - * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; + * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; * * An alternative which can be marginally better is the following: * - * in_next += sizeof(bitbuf_t) - 1; - * in_next -= (bitsleft >> 3) & 0x7; + * in_next += sizeof(bitbuf_t) - 1; + * in_next -= (bitsleft >> 3) & 0x7; * * It seems this would increase the number of CPU instructions from 3 (sub, shr, * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield @@ -203,12 +203,12 @@ typedef machine_word_t bitbuf_t; * high bits in 'bitsleft', so it is compatible with the micro-optimization we * use where we let the high bits of 'bitsleft' contain garbage. */ -#define REFILL_BITS_BRANCHLESS() \ -do { \ - bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ - in_next += sizeof(bitbuf_t) - 1; \ - in_next -= (bitsleft >> 3) & 0x7; \ - bitsleft |= MAX_BITSLEFT & ~7; \ +#define REFILL_BITS_BRANCHLESS() \ +do { \ +bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ +in_next += sizeof(bitbuf_t) - 1; \ +in_next -= (bitsleft >> 3) & 0x7; \ +bitsleft |= MAX_BITSLEFT & ~7; \ } while (0) /* @@ -233,42 +233,42 @@ do { \ * or return an error. However, we do it to be slightly more friendly to the * not-recommended use case of decompressing with an unknown output size.) 
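 *
 * As a rough worked example of the branchless fast path (assuming a 64-bit
 * bitbuf_t, so MAX_BITSLEFT == 63): if bitsleft == 5 before a refill, then
 * 7 whole bytes fit above the existing bits, so in_next advances by
 * 7 - ((5 >> 3) & 0x7) == 7 and bitsleft becomes 5 | (63 & ~7) == 61,
 * i.e. the 5 old bits plus 56 newly loaded bits.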
*/ -#define REFILL_BITS() \ -do { \ - if (UNALIGNED_ACCESS_IS_FAST && \ - likely(in_end - in_next >= sizeof(bitbuf_t))) { \ - REFILL_BITS_BRANCHLESS(); \ - } else { \ - while ((u8)bitsleft < CONSUMABLE_NBITS) { \ - if (likely(in_next != in_end)) { \ - bitbuf |= (bitbuf_t)*in_next++ << \ - (u8)bitsleft; \ - } else { \ - overread_count++; \ - SAFETY_CHECK(overread_count <= \ - sizeof(bitbuf_t)); \ - } \ - bitsleft += 8; \ - } \ - } \ +#define REFILL_BITS() \ +do { \ +if (UNALIGNED_ACCESS_IS_FAST && \ +likely(in_end - in_next >= sizeof(bitbuf_t))) { \ +REFILL_BITS_BRANCHLESS(); \ +} else { \ +while ((u8)bitsleft < CONSUMABLE_NBITS) { \ +if (likely(in_next != in_end)) { \ +bitbuf |= (bitbuf_t)*in_next++ << \ +(u8)bitsleft; \ +} else { \ +overread_count++; \ +SAFETY_CHECK(overread_count <= \ +sizeof(bitbuf_t)); \ +} \ +bitsleft += 8; \ +} \ +} \ } while (0) /* * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the * end of the input. It can only be used in the fastloop. */ -#define REFILL_BITS_IN_FASTLOOP() \ -do { \ - STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ - FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ - if (UNALIGNED_ACCESS_IS_FAST) { \ - REFILL_BITS_BRANCHLESS(); \ - } else { \ - while ((u8)bitsleft < CONSUMABLE_NBITS) { \ - bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ - bitsleft += 8; \ - } \ - } \ +#define REFILL_BITS_IN_FASTLOOP() \ +do { \ +STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ +FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ +if (UNALIGNED_ACCESS_IS_FAST) { \ +REFILL_BITS_BRANCHLESS(); \ +} else { \ +while ((u8)bitsleft < CONSUMABLE_NBITS) { \ +bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ +bitsleft += 8; \ +} \ +} \ } while (0) /* @@ -277,8 +277,8 @@ do { \ * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must * be included for the intentional overrun in the match copy implementation. */ -#define FASTLOOP_MAX_BYTES_WRITTEN \ - (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) +#define FASTLOOP_MAX_BYTES_WRITTEN \ +(2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) /* * This is the worst-case maximum number of input bytes that are read during @@ -291,10 +291,10 @@ do { \ * can be advanced. Finally, we add sizeof(bitbuf_t) to account for * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'. */ -#define FASTLOOP_MAX_BYTES_READ \ - (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ - LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ - sizeof(bitbuf_t)) +#define FASTLOOP_MAX_BYTES_READ \ +(DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ +LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ +sizeof(bitbuf_t)) /***************************************************************************** * Huffman decoding * @@ -361,18 +361,18 @@ do { \ * worst-case maximum number of decode table entries, including the main table * and all subtables. The ENOUGH value depends on three parameters: * - * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) - * (2) the maximum number of main table bits (*_TABLEBITS) - * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) + * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) + * (2) the maximum number of main table bits (*_TABLEBITS) + * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) * * The ENOUGH values were computed using the utility program 'enough' from zlib. 
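 *
 * For example, with the values chosen below, the main table always has
 * 1 << TABLEBITS entries, so the worst-case subtable space is ENOUGH minus
 * that: 2342 - 2048 = 294 entries for the litlen code, 402 - 256 = 146 for
 * the offset code, and 128 - 128 = 0 for the precode, which never needs
 * subtables since PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.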
*/ -#define PRECODE_TABLEBITS 7 -#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ -#define LITLEN_TABLEBITS 11 -#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ -#define OFFSET_TABLEBITS 8 -#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ +#define PRECODE_TABLEBITS 7 +#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ +#define LITLEN_TABLEBITS 11 +#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ +#define OFFSET_TABLEBITS 8 +#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ /* * make_decode_table_entry() creates a decode table entry for the given symbol @@ -387,16 +387,16 @@ do { \ static forceinline u32 make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) { - return decode_results[sym] + (len << 8) + len; + return decode_results[sym] + (len << 8) + len; } /* * Here is the format of our precode decode table entries. Bits not explicitly * described contain zeroes: * - * Bit 20-16: presym - * Bit 10-8: codeword length [not used] - * Bit 2-0: codeword length + * Bit 20-16: presym + * Bit 10-8: codeword length [not used] + * Bit 2-0: codeword length * * The precode decode table never has subtables, since we use * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN. @@ -405,226 +405,226 @@ make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) * symbol. make_decode_table_entry() produces the final entries. */ static const u32 precode_decode_results[] = { -#define ENTRY(presym) ((u32)presym << 16) - ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , - ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , - ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , - ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , - ENTRY(16) , ENTRY(17) , ENTRY(18) , +#define ENTRY(presym) ((u32)presym << 16) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , #undef ENTRY }; /* Litlen and offset decode table entry flags */ /* Indicates a literal entry in the litlen decode table */ -#define HUFFDEC_LITERAL 0x80000000 +#define HUFFDEC_LITERAL 0x80000000 /* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */ -#define HUFFDEC_EXCEPTIONAL 0x00008000 +#define HUFFDEC_EXCEPTIONAL 0x00008000 /* Indicates a subtable pointer entry in the litlen or offset decode table */ -#define HUFFDEC_SUBTABLE_POINTER 0x00004000 +#define HUFFDEC_SUBTABLE_POINTER 0x00004000 /* Indicates an end-of-block entry in the litlen decode table */ -#define HUFFDEC_END_OF_BLOCK 0x00002000 +#define HUFFDEC_END_OF_BLOCK 0x00002000 /* Maximum number of bits that can be consumed by decoding a match length */ -#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS) -#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS) /* * Here is the format of our litlen decode table entries. 
Bits not explicitly * described contain zeroes: * - * Literals: - * Bit 31: 1 (HUFFDEC_LITERAL) - * Bit 23-16: literal value - * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) - * Bit 11-8: remaining codeword length [not used] - * Bit 3-0: remaining codeword length - * Lengths: - * Bit 31: 0 (!HUFFDEC_LITERAL) - * Bit 24-16: length base value - * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) - * Bit 11-8: remaining codeword length - * Bit 4-0: remaining codeword length + number of extra bits - * End of block: - * Bit 31: 0 (!HUFFDEC_LITERAL) - * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) - * Bit 11-8: remaining codeword length [not used] - * Bit 3-0: remaining codeword length - * Subtable pointer: - * Bit 31: 0 (!HUFFDEC_LITERAL) - * Bit 30-16: index of start of subtable - * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) - * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) - * Bit 11-8: number of subtable bits - * Bit 3-0: number of main table bits + * Literals: + * Bit 31: 1 (HUFFDEC_LITERAL) + * Bit 23-16: literal value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Lengths: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 24-16: length base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * End of block: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Subtable pointer: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 30-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits * * This format has several desirable properties: * - * - The codeword length, length slot base, and number of extra length bits - * are all built in. This eliminates the need to separately look up this - * information by indexing separate arrays by symbol or length slot. + * - The codeword length, length slot base, and number of extra length bits + * are all built in. This eliminates the need to separately look up this + * information by indexing separate arrays by symbol or length slot. * - * - The HUFFDEC_* flags enable easily distinguishing between the different - * types of entries. The HUFFDEC_LITERAL flag enables a fast path for - * literals; the high bit is used for this, as some CPUs can test the - * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag - * makes it possible to detect the two unlikely cases (subtable pointer - * and end of block) in a single bit flag test. + * - The HUFFDEC_* flags enable easily distinguishing between the different + * types of entries. The HUFFDEC_LITERAL flag enables a fast path for + * literals; the high bit is used for this, as some CPUs can test the + * high bit more easily than other bits. 
The HUFFDEC_EXCEPTIONAL flag + * makes it possible to detect the two unlikely cases (subtable pointer + * and end of block) in a single bit flag test. * - * - The low byte is the number of bits that need to be removed from the - * bitstream; this makes this value easily accessible, and it enables the - * micro-optimization of doing 'bitsleft -= entry' instead of - * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, - * so they don't need to be removed separately. + * - The low byte is the number of bits that need to be removed from the + * bitstream; this makes this value easily accessible, and it enables the + * micro-optimization of doing 'bitsleft -= entry' instead of + * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, + * so they don't need to be removed separately. * - * - The flags in bits 15-13 are arranged to be 0 when the - * "remaining codeword length" in bits 11-8 is needed, making this value - * fairly easily accessible as well via a shift and downcast. + * - The flags in bits 15-13 are arranged to be 0 when the + * "remaining codeword length" in bits 11-8 is needed, making this value + * fairly easily accessible as well via a shift and downcast. * - * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are - * needed, making it possible to extract this value with '& 0x3F' rather - * than '& 0xF'. This value is only used as a shift amount, so this can - * save an 'and' instruction as the masking by 0x3F happens implicitly. + * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are + * needed, making it possible to extract this value with '& 0x3F' rather + * than '& 0xF'. This value is only used as a shift amount, so this can + * save an 'and' instruction as the masking by 0x3F happens implicitly. * * litlen_decode_results[] contains the static part of the entry for each * symbol. make_decode_table_entry() produces the final entries. 
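 *
 * As a small worked example of this layout: the main table entry for literal
 * 0x41 with an 8-bit codeword would be
 *
 *     make_decode_table_entry(litlen_decode_results, 0x41, 8)
 *         == (HUFFDEC_LITERAL | (0x41 << 16)) + (8 << 8) + 8
 *         == 0x80410808
 *
 * i.e. bit 31 set, the literal value in bits 23-16, and the codeword length
 * in bits 11-8 and 3-0. The low byte (0x08) is the number of bits to remove
 * from the bitstream, which is what makes 'bitsleft -= entry' work.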
*/ static const u32 litlen_decode_results[] = { - - /* Literals */ -#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16)) - ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , - ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , - ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , - ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , - ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , - ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , - ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , - ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , - ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , - ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , - ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , - ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , - ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , - ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , - ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , - ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , - ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , - ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , - ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , - ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , - ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , - ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , - ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , - ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , - ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , - ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , - ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , - ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , - ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , - ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , - ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , - ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , - ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , - ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , - ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , - ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , - ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , - ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , - ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , - ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , - ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , - ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , - ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , - ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , - ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , - ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , - ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , - ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , - ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , - ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , - ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , - ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , - ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , - ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , - ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , - ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , - ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , - ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , - ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , - ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , - ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , - ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , - ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , - ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , + + /* Literals */ +#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal 
<< 16)) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , + ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , + ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , + ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , + ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , + ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , + ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , + ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , + ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , + ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , + ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , + ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , + ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , + ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , + ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , + ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , + ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , + ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , + ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , + ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , + ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , + ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , + ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , + ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , + ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , + ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , + ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , + ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , + ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , + ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , + ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , + ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , + ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , + ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , + ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , + ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , + ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , + ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , + ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , + ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , + ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , + ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , + ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , + ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , + ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , + ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , + ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , + ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , + ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , + ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , + ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , + ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , + ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , + ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , + ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , + ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , + ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , + ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , + ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , + ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , #undef ENTRY - - /* End of block */ - HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, - - /* Lengths */ -#define ENTRY(length_base, num_extra_bits) \ - (((u32)(length_base) << 16) | (num_extra_bits)) - 
ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), - ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), - ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), - ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), - ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), - ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), - ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), - ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , + + /* End of block */ + HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, + + /* Lengths */ +#define ENTRY(length_base, num_extra_bits) \ +(((u32)(length_base) << 16) | (num_extra_bits)) + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , #undef ENTRY }; /* Maximum number of bits that can be consumed by decoding a match offset */ -#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS) -#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS) /* * Here is the format of our offset decode table entries. Bits not explicitly * described contain zeroes: * - * Offsets: - * Bit 31-16: offset base value - * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 11-8: remaining codeword length - * Bit 4-0: remaining codeword length + number of extra bits - * Subtable pointer: - * Bit 31-16: index of start of subtable - * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) - * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) - * Bit 11-8: number of subtable bits - * Bit 3-0: number of main table bits + * Offsets: + * Bit 31-16: offset base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * Subtable pointer: + * Bit 31-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits * * These work the same way as the length entries and subtable pointer entries in * the litlen decode table; see litlen_decode_results[] above. 
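
As a quick worked example of consuming one of these entries (a sketch; the ENTRY macro below repeats the static layout with the base in bits 31-16 and the extra-bit count in the low bits, while the surrounding main() is hypothetical): the slot with base 257 carries 7 extra bits, so an extra-bit value of 12 read from the bitstream decodes to offset 257 + 12 = 269.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Static-entry layout described above: offset base in bits 31-16, number of
 * extra bits in the low bits (the codeword length is merged in later by
 * make_decode_table_entry()). */
#define ENTRY(offset_base, num_extra_bits) \
    (((u32)(offset_base) << 16) | (num_extra_bits))

int main(void)
{
    u32 entry = ENTRY(257, 7); /* slot covering offsets 257..384 */
    u32 extra = 12;            /* pretend these 7 bits came from the bitstream */
    u32 offset = (entry >> 16) + extra;

    printf("base=%u extra_bits=%u offset=%u\n",
           (unsigned)(entry >> 16), (unsigned)(entry & 0x1F),
           (unsigned)offset);
    return 0;
}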
*/ static const u32 offset_decode_results[] = { -#define ENTRY(offset_base, num_extra_bits) \ - (((u32)(offset_base) << 16) | (num_extra_bits)) - ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , - ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , - ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , - ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , - ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , - ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , - ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , - ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , +#define ENTRY(offset_base, num_extra_bits) \ +(((u32)(offset_base) << 16) | (num_extra_bits)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , #undef ENTRY }; @@ -640,40 +640,40 @@ static const u32 offset_decode_results[] = { * are decoded without an intervening dynamic block, even across streams. */ struct libdeflate_decompressor { - - /* - * The arrays aren't all needed at the same time. 'precode_lens' and - * 'precode_decode_table' are unneeded after 'lens' has been filled. - * Furthermore, 'lens' need not be retained after building the litlen - * and offset decode tables. In fact, 'lens' can be in union with - * 'litlen_decode_table' provided that 'offset_decode_table' is separate - * and is built first. - */ - - union { - u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; - - struct { - u8 lens[DEFLATE_NUM_LITLEN_SYMS + - DEFLATE_NUM_OFFSET_SYMS + - DEFLATE_MAX_LENS_OVERRUN]; - - u32 precode_decode_table[PRECODE_ENOUGH]; - } l; - - u32 litlen_decode_table[LITLEN_ENOUGH]; - } u; - - u32 offset_decode_table[OFFSET_ENOUGH]; - - /* used only during build_decode_table() */ - u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; - - bool static_codes_loaded; - unsigned litlen_tablebits; - - /* The free() function for this struct, chosen at allocation time */ - free_func_t free_func; + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + u8 lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[PRECODE_ENOUGH]; + } l; + + u32 litlen_decode_table[LITLEN_ENOUGH]; + } u; + + u32 offset_decode_table[OFFSET_ENOUGH]; + + /* used only during build_decode_table() */ + u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; + + bool static_codes_loaded; + unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; }; /* @@ -686,383 +686,383 @@ struct libdeflate_decompressor { * Huffman codes in DEFLATE. 
* * @decode_table - * The array in which the decode table will be generated. This array must - * have sufficient length; see the definition of the ENOUGH numbers. + * The array in which the decode table will be generated. This array must + * have sufficient length; see the definition of the ENOUGH numbers. * @lens - * An array which provides, for each symbol, the length of the - * corresponding codeword in bits, or 0 if the symbol is unused. This may - * alias @decode_table, since nothing is written to @decode_table until all - * @lens have been consumed. All codeword lengths are assumed to be <= - * @max_codeword_len but are otherwise considered untrusted. If they do - * not form a valid Huffman code, then the decode table is not built and - * %false is returned. + * An array which provides, for each symbol, the length of the + * corresponding codeword in bits, or 0 if the symbol is unused. This may + * alias @decode_table, since nothing is written to @decode_table until all + * @lens have been consumed. All codeword lengths are assumed to be <= + * @max_codeword_len but are otherwise considered untrusted. If they do + * not form a valid Huffman code, then the decode table is not built and + * %false is returned. * @num_syms - * The number of symbols in the code, including all unused symbols. + * The number of symbols in the code, including all unused symbols. * @decode_results - * An array which gives the incomplete decode result for each symbol. The - * needed values in this array will be combined with codeword lengths to - * make the final decode table entries using make_decode_table_entry(). + * An array which gives the incomplete decode result for each symbol. The + * needed values in this array will be combined with codeword lengths to + * make the final decode table entries using make_decode_table_entry(). * @table_bits - * The log base-2 of the number of main table entries to use. - * If @table_bits_ret != NULL, then @table_bits is treated as a maximum - * value and it will be decreased if a smaller table would be sufficient. + * The log base-2 of the number of main table entries to use. + * If @table_bits_ret != NULL, then @table_bits is treated as a maximum + * value and it will be decreased if a smaller table would be sufficient. * @max_codeword_len - * The maximum allowed codeword length for this Huffman code. - * Must be <= DEFLATE_MAX_CODEWORD_LEN. + * The maximum allowed codeword length for this Huffman code. + * Must be <= DEFLATE_MAX_CODEWORD_LEN. * @sorted_syms - * A temporary array of length @num_syms. + * A temporary array of length @num_syms. * @table_bits_ret - * If non-NULL, then the dynamic table_bits is enabled, and the actual - * table_bits value will be returned here. + * If non-NULL, then the dynamic table_bits is enabled, and the actual + * table_bits value will be returned here. * * Returns %true if successful; %false if the codeword lengths do not form a * valid Huffman code. 
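
One point worth illustrating before the function body is the validity condition it describes: the codeword lengths must exactly fill the codespace, a Kraft-style check. The standalone sketch below applies the same rule to classify a length array as incomplete, complete, or overfull; check_code() is a hypothetical helper written for clarity, whereas the real function folds this computation into its sorting pass.

#include <stdint.h>
#include <stdio.h>

/*
 * A codeword of length 'len' consumes 2^(max_len - len) units out of a total
 * codespace of 2^max_len.  Returns -1 for an incomplete code, 0 for a
 * complete code, and 1 for an overfull (invalid) code.
 */
static int check_code(const uint8_t lens[], unsigned num_syms, unsigned max_len)
{
    unsigned len_counts[16] = { 0 };  /* max_len is at most 15 in DEFLATE */
    uint32_t codespace_used = 0;

    for (unsigned sym = 0; sym < num_syms; sym++)
        len_counts[lens[sym]]++;
    for (unsigned len = 1; len <= max_len; len++)
        codespace_used = (codespace_used << 1) + len_counts[len];

    if (codespace_used > (1U << max_len))
        return 1;
    if (codespace_used < (1U << max_len))
        return -1;
    return 0;
}

int main(void)
{
    const uint8_t complete[] = { 2, 2, 2, 3, 3 };  /* exactly fills the codespace */
    const uint8_t overfull[] = { 1, 1, 1 };        /* overflows it */

    printf("%d %d\n", check_code(complete, 5, 3), check_code(overfull, 3, 3));
    return 0;
}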
*/ static bool build_decode_table(u32 decode_table[], - const u8 lens[], - const unsigned num_syms, - const u32 decode_results[], - unsigned table_bits, - unsigned max_codeword_len, - u16 *sorted_syms, - unsigned *table_bits_ret) + const u8 lens[], + const unsigned num_syms, + const u32 decode_results[], + unsigned table_bits, + unsigned max_codeword_len, + u16 *sorted_syms, + unsigned *table_bits_ret) { - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned sym; /* current symbol */ - unsigned codeword; /* current codeword, bit-reversed */ - unsigned len; /* current codeword length in bits */ - unsigned count; /* num codewords remaining with this length */ - u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ - unsigned cur_table_end; /* end index of current table */ - unsigned subtable_prefix; /* codeword prefix of current subtable */ - unsigned subtable_start; /* start index of current subtable */ - unsigned subtable_bits; /* log2 of current subtable length */ - - /* Count how many codewords have each length, including 0. */ - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - for (sym = 0; sym < num_syms; sym++) - len_counts[lens[sym]]++; - - /* - * Determine the actual maximum codeword length that was used, and - * decrease table_bits to it if allowed. - */ - while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) - max_codeword_len--; - if (table_bits_ret != NULL) { - table_bits = MIN(table_bits, max_codeword_len); - *table_bits_ret = table_bits; - } - - /* - * Sort the symbols primarily by increasing codeword length and - * secondarily by increasing symbol value; or equivalently by their - * codewords in lexicographic order, since a canonical code is assumed. - * - * For efficiency, also compute 'codespace_used' in the same pass over - * 'len_counts[]' used to build 'offsets[]' for sorting. - */ - - /* Ensure that 'codespace_used' cannot overflow. */ - STATIC_ASSERT(sizeof(codespace_used) == 4); - STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= - DEFLATE_MAX_NUM_SYMS); - - offsets[0] = 0; - offsets[1] = len_counts[0]; - codespace_used = 0; - for (len = 1; len < max_codeword_len; len++) { - offsets[len + 1] = offsets[len] + len_counts[len]; - codespace_used = (codespace_used << 1) + len_counts[len]; - } - codespace_used = (codespace_used << 1) + len_counts[len]; - - for (sym = 0; sym < num_syms; sym++) - sorted_syms[offsets[lens[sym]]++] = sym; - - sorted_syms += offsets[0]; /* Skip unused symbols */ - - /* lens[] is done being used, so we can write to decode_table[] now. */ - - /* - * Check whether the lengths form a complete code (exactly fills the - * codespace), an incomplete code (doesn't fill the codespace), or an - * overfull code (overflows the codespace). A codeword of length 'n' - * uses proportion '1/(2^n)' of the codespace. An overfull code is - * nonsensical, so is considered invalid. An incomplete code is - * considered valid only in two specific cases; see below. - */ - - /* overfull code? */ - if (unlikely(codespace_used > (1U << max_codeword_len))) - return false; - - /* incomplete code? */ - if (unlikely(codespace_used < (1U << max_codeword_len))) { - u32 entry; - unsigned i; - - /* - * The DEFLATE RFC explicitly allows the offset code to be - * incomplete in two cases: a code containing just 1 codeword, - * if that codeword has length 1; and a code containing no - * codewords. 
Note: the list of offset codeword lengths is - * always nonempty, but lengths of 0 don't count as codewords. - * - * The RFC doesn't say whether the same cases are allowed for - * the litlen and pre codes. It's actually impossible for no - * symbols to be used from these codes; however, it's - * technically possible for only one symbol to be used. zlib - * allows 1 codeword for the litlen code, but not the precode. - * The RFC also doesn't say whether, when there is 1 codeword, - * that codeword is '0' or '1'. zlib uses '0'. - * - * We accept what zlib accepts, plus a bit more. First, we - * don't treat the precode more strictly than the litlen and - * offset codes. There's no convincing reason to add a special - * case for the precode here. - * - * Second, we just map each allowed incompete code to a complete - * code with only real symbols. To do this, we choose a symbol, - * either the used symbol (for codes with 1 codeword) or an - * arbitrary symbol (for empty codes), and give it both - * codewords '0' and '1'. zlib instead uses a special ERROR - * symbol in the part of the codespace the code doesn't use. - * However, having an ERROR symbol reduces the performance of - * the Huffman decoder, for no real benefit. Our approach also - * avoids having to decide whether '0' or '1' is correct. - * - * Like zlib, we still reject all incomplete codes that contain - * more than 1 codeword or a codeword length greater than 1. - */ - if (codespace_used == 0) { - sym = 0; /* arbitrary */ - } else { - if (codespace_used != (1U << (max_codeword_len - 1)) || - len_counts[1] != 1) - return false; - sym = sorted_syms[0]; - } - entry = make_decode_table_entry(decode_results, sym, 1); - for (i = 0; i < (1U << table_bits); i++) - decode_table[i] = entry; - return true; - } - - /* - * The lengths form a complete code. Now, enumerate the codewords in - * lexicographic order and fill the decode table entries for each one. - * - * First, process all codewords with len <= table_bits. Each one gets - * '2^(table_bits-len)' direct entries in the table. - * - * Since DEFLATE uses bit-reversed codewords, these entries aren't - * consecutive but rather are spaced '2^len' entries apart. This makes - * filling them naively somewhat awkward and inefficient, since strided - * stores are less cache-friendly and preclude the use of word or - * vector-at-a-time stores to fill multiple entries per instruction. - * - * To optimize this, we incrementally double the table size. When - * processing codewords with length 'len', the table is treated as - * having only '2^len' entries, so each codeword uses just one entry. - * Then, each time 'len' is incremented, the table size is doubled and - * the first half is copied to the second half. This significantly - * improves performance over naively doing strided stores. - * - * Note that some entries copied for each table doubling may not have - * been initialized yet, but it doesn't matter since they're guaranteed - * to be initialized later (because the Huffman code is complete). - */ - codeword = 0; - len = 1; - while ((count = len_counts[len]) == 0) - len++; - cur_table_end = 1U << len; - while (len <= table_bits) { - /* Process all 'count' codewords with length 'len' bits. */ - do { - unsigned bit; - - /* Fill the first entry for the current codeword. 
*/ - decode_table[codeword] = - make_decode_table_entry(decode_results, - *sorted_syms++, len); - - if (codeword == cur_table_end - 1) { - /* Last codeword (all 1's) */ - for (; len < table_bits; len++) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * - sizeof(decode_table[0])); - cur_table_end <<= 1; - } - return true; - } - /* - * To advance to the lexicographically next codeword in - * the canonical code, the codeword must be incremented, - * then 0's must be appended to the codeword as needed - * to match the next codeword's length. - * - * Since the codeword is bit-reversed, appending 0's is - * a no-op. However, incrementing it is nontrivial. To - * do so efficiently, use the 'bsr' instruction to find - * the last (highest order) 0 bit in the codeword, set - * it, and clear any later (higher order) 1 bits. But - * 'bsr' actually finds the highest order 1 bit, so to - * use it first flip all bits in the codeword by XOR'ing - * it with (1U << len) - 1 == cur_table_end - 1. - */ - bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); - codeword &= bit - 1; - codeword |= bit; - } while (--count); - - /* Advance to the next codeword length. */ - do { - if (++len <= table_bits) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * sizeof(decode_table[0])); - cur_table_end <<= 1; - } - } while ((count = len_counts[len]) == 0); - } - - /* Process codewords with len > table_bits. These require subtables. */ - cur_table_end = 1U << table_bits; - subtable_prefix = -1; - subtable_start = 0; - for (;;) { - u32 entry; - unsigned i; - unsigned stride; - unsigned bit; - - /* - * Start a new subtable if the first 'table_bits' bits of the - * codeword don't match the prefix of the current subtable. - */ - if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { - subtable_prefix = (codeword & ((1U << table_bits) - 1)); - subtable_start = cur_table_end; - /* - * Calculate the subtable length. If the codeword has - * length 'table_bits + n', then the subtable needs - * '2^n' entries. But it may need more; if fewer than - * '2^n' codewords of length 'table_bits + n' remain, - * then the length will need to be incremented to bring - * in longer codewords until the subtable can be - * completely filled. Note that because the Huffman - * code is complete, it will always be possible to fill - * the subtable eventually. - */ - subtable_bits = len - table_bits; - codespace_used = count; - while (codespace_used < (1U << subtable_bits)) { - subtable_bits++; - codespace_used = (codespace_used << 1) + - len_counts[table_bits + subtable_bits]; - } - cur_table_end = subtable_start + (1U << subtable_bits); - - /* - * Create the entry that points from the main table to - * the subtable. - */ - decode_table[subtable_prefix] = - ((u32)subtable_start << 16) | - HUFFDEC_EXCEPTIONAL | - HUFFDEC_SUBTABLE_POINTER | - (subtable_bits << 8) | table_bits; - } - - /* Fill the subtable entries for the current codeword. */ - entry = make_decode_table_entry(decode_results, *sorted_syms++, - len - table_bits); - i = subtable_start + (codeword >> table_bits); - stride = 1U << (len - table_bits); - do { - decode_table[i] = entry; - i += stride; - } while (i < cur_table_end); - - /* Advance to the next codeword. */ - if (codeword == (1U << len) - 1) /* last codeword (all 1's)? 
*/ - return true; - bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); - codeword &= bit - 1; - codeword |= bit; - count--; - while (count == 0) - count = len_counts[++len]; - } + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned sym; /* current symbol */ + unsigned codeword; /* current codeword, bit-reversed */ + unsigned len; /* current codeword length in bits */ + unsigned count; /* num codewords remaining with this length */ + u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ + unsigned cur_table_end; /* end index of current table */ + unsigned subtable_prefix; /* codeword prefix of current subtable */ + unsigned subtable_start; /* start index of current subtable */ + unsigned subtable_bits; /* log2 of current subtable length */ + + /* Count how many codewords have each length, including 0. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* + * Determine the actual maximum codeword length that was used, and + * decrease table_bits to it if allowed. + */ + while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) + max_codeword_len--; + if (table_bits_ret != NULL) { + table_bits = MIN(table_bits, max_codeword_len); + *table_bits_ret = table_bits; + } + + /* + * Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value; or equivalently by their + * codewords in lexicographic order, since a canonical code is assumed. + * + * For efficiency, also compute 'codespace_used' in the same pass over + * 'len_counts[]' used to build 'offsets[]' for sorting. + */ + + /* Ensure that 'codespace_used' cannot overflow. */ + STATIC_ASSERT(sizeof(codespace_used) == 4); + STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= + DEFLATE_MAX_NUM_SYMS); + + offsets[0] = 0; + offsets[1] = len_counts[0]; + codespace_used = 0; + for (len = 1; len < max_codeword_len; len++) { + offsets[len + 1] = offsets[len] + len_counts[len]; + codespace_used = (codespace_used << 1) + len_counts[len]; + } + codespace_used = (codespace_used << 1) + len_counts[len]; + + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + sorted_syms += offsets[0]; /* Skip unused symbols */ + + /* lens[] is done being used, so we can write to decode_table[] now. */ + + /* + * Check whether the lengths form a complete code (exactly fills the + * codespace), an incomplete code (doesn't fill the codespace), or an + * overfull code (overflows the codespace). A codeword of length 'n' + * uses proportion '1/(2^n)' of the codespace. An overfull code is + * nonsensical, so is considered invalid. An incomplete code is + * considered valid only in two specific cases; see below. + */ + + /* overfull code? */ + if (unlikely(codespace_used > (1U << max_codeword_len))) + return false; + + /* incomplete code? */ + if (unlikely(codespace_used < (1U << max_codeword_len))) { + u32 entry; + unsigned i; + + /* + * The DEFLATE RFC explicitly allows the offset code to be + * incomplete in two cases: a code containing just 1 codeword, + * if that codeword has length 1; and a code containing no + * codewords. Note: the list of offset codeword lengths is + * always nonempty, but lengths of 0 don't count as codewords. + * + * The RFC doesn't say whether the same cases are allowed for + * the litlen and pre codes. 
It's actually impossible for no + * symbols to be used from these codes; however, it's + * technically possible for only one symbol to be used. zlib + * allows 1 codeword for the litlen code, but not the precode. + * The RFC also doesn't say whether, when there is 1 codeword, + * that codeword is '0' or '1'. zlib uses '0'. + * + * We accept what zlib accepts, plus a bit more. First, we + * don't treat the precode more strictly than the litlen and + * offset codes. There's no convincing reason to add a special + * case for the precode here. + * + * Second, we just map each allowed incompete code to a complete + * code with only real symbols. To do this, we choose a symbol, + * either the used symbol (for codes with 1 codeword) or an + * arbitrary symbol (for empty codes), and give it both + * codewords '0' and '1'. zlib instead uses a special ERROR + * symbol in the part of the codespace the code doesn't use. + * However, having an ERROR symbol reduces the performance of + * the Huffman decoder, for no real benefit. Our approach also + * avoids having to decide whether '0' or '1' is correct. + * + * Like zlib, we still reject all incomplete codes that contain + * more than 1 codeword or a codeword length greater than 1. + */ + if (codespace_used == 0) { + sym = 0; /* arbitrary */ + } else { + if (codespace_used != (1U << (max_codeword_len - 1)) || + len_counts[1] != 1) + return false; + sym = sorted_syms[0]; + } + entry = make_decode_table_entry(decode_results, sym, 1); + for (i = 0; i < (1U << table_bits); i++) + decode_table[i] = entry; + return true; + } + + /* + * The lengths form a complete code. Now, enumerate the codewords in + * lexicographic order and fill the decode table entries for each one. + * + * First, process all codewords with len <= table_bits. Each one gets + * '2^(table_bits-len)' direct entries in the table. + * + * Since DEFLATE uses bit-reversed codewords, these entries aren't + * consecutive but rather are spaced '2^len' entries apart. This makes + * filling them naively somewhat awkward and inefficient, since strided + * stores are less cache-friendly and preclude the use of word or + * vector-at-a-time stores to fill multiple entries per instruction. + * + * To optimize this, we incrementally double the table size. When + * processing codewords with length 'len', the table is treated as + * having only '2^len' entries, so each codeword uses just one entry. + * Then, each time 'len' is incremented, the table size is doubled and + * the first half is copied to the second half. This significantly + * improves performance over naively doing strided stores. + * + * Note that some entries copied for each table doubling may not have + * been initialized yet, but it doesn't matter since they're guaranteed + * to be initialized later (because the Huffman code is complete). + */ + codeword = 0; + len = 1; + while ((count = len_counts[len]) == 0) + len++; + cur_table_end = 1U << len; + while (len <= table_bits) { + /* Process all 'count' codewords with length 'len' bits. */ + do { + unsigned bit; + + /* Fill the first entry for the current codeword. 
*/ + decode_table[codeword] = + make_decode_table_entry(decode_results, + *sorted_syms++, len); + + if (codeword == cur_table_end - 1) { + /* Last codeword (all 1's) */ + for (; len < table_bits; len++) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * + sizeof(decode_table[0])); + cur_table_end <<= 1; + } + return true; + } + /* + * To advance to the lexicographically next codeword in + * the canonical code, the codeword must be incremented, + * then 0's must be appended to the codeword as needed + * to match the next codeword's length. + * + * Since the codeword is bit-reversed, appending 0's is + * a no-op. However, incrementing it is nontrivial. To + * do so efficiently, use the 'bsr' instruction to find + * the last (highest order) 0 bit in the codeword, set + * it, and clear any later (higher order) 1 bits. But + * 'bsr' actually finds the highest order 1 bit, so to + * use it first flip all bits in the codeword by XOR'ing + * it with (1U << len) - 1 == cur_table_end - 1. + */ + bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); + codeword &= bit - 1; + codeword |= bit; + } while (--count); + + /* Advance to the next codeword length. */ + do { + if (++len <= table_bits) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * sizeof(decode_table[0])); + cur_table_end <<= 1; + } + } while ((count = len_counts[len]) == 0); + } + + /* Process codewords with len > table_bits. These require subtables. */ + cur_table_end = 1U << table_bits; + subtable_prefix = -1; + subtable_start = 0; + for (;;) { + u32 entry; + unsigned i; + unsigned stride; + unsigned bit; + + /* + * Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix of the current subtable. + */ + if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { + subtable_prefix = (codeword & ((1U << table_bits) - 1)); + subtable_start = cur_table_end; + /* + * Calculate the subtable length. If the codeword has + * length 'table_bits + n', then the subtable needs + * '2^n' entries. But it may need more; if fewer than + * '2^n' codewords of length 'table_bits + n' remain, + * then the length will need to be incremented to bring + * in longer codewords until the subtable can be + * completely filled. Note that because the Huffman + * code is complete, it will always be possible to fill + * the subtable eventually. + */ + subtable_bits = len - table_bits; + codespace_used = count; + while (codespace_used < (1U << subtable_bits)) { + subtable_bits++; + codespace_used = (codespace_used << 1) + + len_counts[table_bits + subtable_bits]; + } + cur_table_end = subtable_start + (1U << subtable_bits); + + /* + * Create the entry that points from the main table to + * the subtable. + */ + decode_table[subtable_prefix] = + ((u32)subtable_start << 16) | + HUFFDEC_EXCEPTIONAL | + HUFFDEC_SUBTABLE_POINTER | + (subtable_bits << 8) | table_bits; + } + + /* Fill the subtable entries for the current codeword. */ + entry = make_decode_table_entry(decode_results, *sorted_syms++, + len - table_bits); + i = subtable_start + (codeword >> table_bits); + stride = 1U << (len - table_bits); + do { + decode_table[i] = entry; + i += stride; + } while (i < cur_table_end); + + /* Advance to the next codeword. */ + if (codeword == (1U << len) - 1) /* last codeword (all 1's)? 
*/ + return true; + bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); + codeword &= bit - 1; + codeword |= bit; + count--; + while (count == 0) + count = len_counts[++len]; + } } /* Build the decode table for the precode. */ static bool build_precode_decode_table(struct libdeflate_decompressor *d) { - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); - - STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == - DEFLATE_NUM_PRECODE_SYMS); - - return build_decode_table(d->u.l.precode_decode_table, - d->u.precode_lens, - DEFLATE_NUM_PRECODE_SYMS, - precode_decode_results, - PRECODE_TABLEBITS, - DEFLATE_MAX_PRE_CODEWORD_LEN, - d->sorted_syms, - NULL); + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); + + STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == + DEFLATE_NUM_PRECODE_SYMS); + + return build_decode_table(d->u.l.precode_decode_table, + d->u.precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + precode_decode_results, + PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + d->sorted_syms, + NULL); } /* Build the decode table for the literal/length code. */ static bool build_litlen_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) + unsigned num_litlen_syms, unsigned num_offset_syms) { - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); - - STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == - DEFLATE_NUM_LITLEN_SYMS); - - return build_decode_table(d->u.litlen_decode_table, - d->u.l.lens, - num_litlen_syms, - litlen_decode_results, - LITLEN_TABLEBITS, - DEFLATE_MAX_LITLEN_CODEWORD_LEN, - d->sorted_syms, - &d->litlen_tablebits); + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); + + STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == + DEFLATE_NUM_LITLEN_SYMS); + + return build_decode_table(d->u.litlen_decode_table, + d->u.l.lens, + num_litlen_syms, + litlen_decode_results, + LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + d->sorted_syms, + &d->litlen_tablebits); } /* Build the decode table for the offset code. */ static bool build_offset_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) + unsigned num_litlen_syms, unsigned num_offset_syms) { - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); - - STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == - DEFLATE_NUM_OFFSET_SYMS); - - return build_decode_table(d->offset_decode_table, - d->u.l.lens + num_litlen_syms, - num_offset_syms, - offset_decode_results, - OFFSET_TABLEBITS, - DEFLATE_MAX_OFFSET_CODEWORD_LEN, - d->sorted_syms, - NULL); + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! 
*/ + STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); + + STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == + DEFLATE_NUM_OFFSET_SYMS); + + return build_decode_table(d->offset_decode_table, + d->u.l.lens + num_litlen_syms, + num_offset_syms, + offset_decode_results, + OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + d->sorted_syms, + NULL); } /***************************************************************************** @@ -1070,10 +1070,10 @@ build_offset_decode_table(struct libdeflate_decompressor *d, *****************************************************************************/ typedef enum libdeflate_result (*decompress_func_t) - (struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); +(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); #define FUNCNAME deflate_decompress_default #undef ATTRIBUTES @@ -1095,27 +1095,27 @@ typedef enum libdeflate_result (*decompress_func_t) #ifdef arch_select_decompress_func static enum libdeflate_result dispatch_decomp(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); static volatile decompress_func_t decompress_impl = dispatch_decomp; /* Choose the best implementation at runtime. */ static enum libdeflate_result dispatch_decomp(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) { - decompress_func_t f = arch_select_decompress_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - decompress_impl = f; - return f(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); + decompress_func_t f = arch_select_decompress_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + decompress_impl = f; + return f(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); } #else /* The best implementation is statically known, so call it directly. 
*/ @@ -1132,77 +1132,77 @@ dispatch_decomp(struct libdeflate_decompressor *d, */ LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) { - return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); + return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); } LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) { - return libdeflate_deflate_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); } LIBDEFLATEAPI struct libdeflate_decompressor * libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) { - struct libdeflate_decompressor *d; - - /* - * Note: if more fields are added to libdeflate_options, this code will - * need to be updated to support both the old and new structs. - */ - if (options->sizeof_options != sizeof(*options)) - return NULL; - - d = (options->malloc_func ? options->malloc_func : - libdeflate_default_malloc_func)(sizeof(*d)); - if (d == NULL) - return NULL; - /* - * Note that only certain parts of the decompressor actually must be - * initialized here: - * - * - 'static_codes_loaded' must be initialized to false. - * - * - The first half of the main portion of each decode table must be - * initialized to any value, to avoid reading from uninitialized - * memory during table expansion in build_decode_table(). (Although, - * this is really just to avoid warnings with dynamic tools like - * valgrind, since build_decode_table() is guaranteed to initialize - * all entries eventually anyway.) - * - * - 'free_func' must be set. - * - * But for simplicity, we currently just zero the whole decompressor. - */ - memset(d, 0, sizeof(*d)); - d->free_func = options->free_func ? - options->free_func : libdeflate_default_free_func; - return d; + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ? options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * - 'free_func' must be set. 
+ * + * But for simplicity, we currently just zero the whole decompressor. + */ + memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; + return d; } LIBDEFLATEAPI struct libdeflate_decompressor * libdeflate_alloc_decompressor(void) { - static const struct libdeflate_options defaults = { - .sizeof_options = sizeof(defaults), - }; - return libdeflate_alloc_decompressor_ex(&defaults); + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); } LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *d) { - if (d) - d->free_func(d); + if (d) + d->free_func(d); } diff --git a/Sources/DEFLATE/gzip_compress.c b/Sources/DEFLATE/gzip_compress.c index b7d5076e..016e638d 100644 --- a/Sources/DEFLATE/gzip_compress.c +++ b/Sources/DEFLATE/gzip_compress.c @@ -30,61 +30,61 @@ LIBDEFLATEAPI size_t libdeflate_gzip_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) { - u8 *out_next = out; - unsigned compression_level; - u8 xfl; - size_t deflate_size; - - if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) - return 0; - - /* ID1 */ - *out_next++ = GZIP_ID1; - /* ID2 */ - *out_next++ = GZIP_ID2; - /* CM */ - *out_next++ = GZIP_CM_DEFLATE; - /* FLG */ - *out_next++ = 0; - /* MTIME */ - put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next); - out_next += 4; - /* XFL */ - xfl = 0; - compression_level = libdeflate_get_compression_level(c); - if (compression_level < 2) - xfl |= GZIP_XFL_FASTEST_COMPRESSION; - else if (compression_level >= 8) - xfl |= GZIP_XFL_SLOWEST_COMPRESSION; - *out_next++ = xfl; - /* OS */ - *out_next++ = GZIP_OS_UNKNOWN; /* OS */ - - /* Compressed data */ - deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, - out_nbytes_avail - GZIP_MIN_OVERHEAD); - if (deflate_size == 0) - return 0; - out_next += deflate_size; - - /* CRC32 */ - put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next); - out_next += 4; - - /* ISIZE */ - put_unaligned_le32((u32)in_nbytes, out_next); - out_next += 4; - - return out_next - (u8 *)out; + u8 *out_next = out; + unsigned compression_level; + u8 xfl; + size_t deflate_size; + + if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) + return 0; + + /* ID1 */ + *out_next++ = GZIP_ID1; + /* ID2 */ + *out_next++ = GZIP_ID2; + /* CM */ + *out_next++ = GZIP_CM_DEFLATE; + /* FLG */ + *out_next++ = 0; + /* MTIME */ + put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next); + out_next += 4; + /* XFL */ + xfl = 0; + compression_level = libdeflate_get_compression_level(c); + if (compression_level < 2) + xfl |= GZIP_XFL_FASTEST_COMPRESSION; + else if (compression_level >= 8) + xfl |= GZIP_XFL_SLOWEST_COMPRESSION; + *out_next++ = xfl; + /* OS */ + *out_next++ = GZIP_OS_UNKNOWN; /* OS */ + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - GZIP_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* CRC32 */ + put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next); + out_next += 4; + + /* ISIZE */ + put_unaligned_le32((u32)in_nbytes, out_next); + out_next += 4; + + return out_next - (u8 *)out; } LIBDEFLATEAPI size_t libdeflate_gzip_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) + size_t in_nbytes) { - return GZIP_MIN_OVERHEAD + - 
libdeflate_deflate_compress_bound(c, in_nbytes); + return GZIP_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); } diff --git a/Sources/DEFLATE/gzip_constants.h b/Sources/DEFLATE/gzip_constants.h index 35e4728d..24b100c7 100644 --- a/Sources/DEFLATE/gzip_constants.h +++ b/Sources/DEFLATE/gzip_constants.h @@ -5,41 +5,41 @@ #ifndef LIB_GZIP_CONSTANTS_H #define LIB_GZIP_CONSTANTS_H -#define GZIP_MIN_HEADER_SIZE 10 -#define GZIP_FOOTER_SIZE 8 -#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) - -#define GZIP_ID1 0x1F -#define GZIP_ID2 0x8B - -#define GZIP_CM_DEFLATE 8 - -#define GZIP_FTEXT 0x01 -#define GZIP_FHCRC 0x02 -#define GZIP_FEXTRA 0x04 -#define GZIP_FNAME 0x08 -#define GZIP_FCOMMENT 0x10 -#define GZIP_FRESERVED 0xE0 - -#define GZIP_MTIME_UNAVAILABLE 0 - -#define GZIP_XFL_SLOWEST_COMPRESSION 0x02 -#define GZIP_XFL_FASTEST_COMPRESSION 0x04 - -#define GZIP_OS_FAT 0 -#define GZIP_OS_AMIGA 1 -#define GZIP_OS_VMS 2 -#define GZIP_OS_UNIX 3 -#define GZIP_OS_VM_CMS 4 -#define GZIP_OS_ATARI_TOS 5 -#define GZIP_OS_HPFS 6 -#define GZIP_OS_MACINTOSH 7 -#define GZIP_OS_Z_SYSTEM 8 -#define GZIP_OS_CP_M 9 -#define GZIP_OS_TOPS_20 10 -#define GZIP_OS_NTFS 11 -#define GZIP_OS_QDOS 12 -#define GZIP_OS_RISCOS 13 -#define GZIP_OS_UNKNOWN 255 +#define GZIP_MIN_HEADER_SIZE 10 +#define GZIP_FOOTER_SIZE 8 +#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) + +#define GZIP_ID1 0x1F +#define GZIP_ID2 0x8B + +#define GZIP_CM_DEFLATE 8 + +#define GZIP_FTEXT 0x01 +#define GZIP_FHCRC 0x02 +#define GZIP_FEXTRA 0x04 +#define GZIP_FNAME 0x08 +#define GZIP_FCOMMENT 0x10 +#define GZIP_FRESERVED 0xE0 + +#define GZIP_MTIME_UNAVAILABLE 0 + +#define GZIP_XFL_SLOWEST_COMPRESSION 0x02 +#define GZIP_XFL_FASTEST_COMPRESSION 0x04 + +#define GZIP_OS_FAT 0 +#define GZIP_OS_AMIGA 1 +#define GZIP_OS_VMS 2 +#define GZIP_OS_UNIX 3 +#define GZIP_OS_VM_CMS 4 +#define GZIP_OS_ATARI_TOS 5 +#define GZIP_OS_HPFS 6 +#define GZIP_OS_MACINTOSH 7 +#define GZIP_OS_Z_SYSTEM 8 +#define GZIP_OS_CP_M 9 +#define GZIP_OS_TOPS_20 10 +#define GZIP_OS_NTFS 11 +#define GZIP_OS_QDOS 12 +#define GZIP_OS_RISCOS 13 +#define GZIP_OS_UNKNOWN 255 #endif /* LIB_GZIP_CONSTANTS_H */ diff --git a/Sources/DEFLATE/gzip_decompress.c b/Sources/DEFLATE/gzip_decompress.c index 76b74f69..53aa3979 100644 --- a/Sources/DEFLATE/gzip_decompress.c +++ b/Sources/DEFLATE/gzip_decompress.c @@ -30,115 +30,115 @@ LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) { - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - u8 flg; - size_t actual_in_nbytes; - size_t actual_out_nbytes; - enum libdeflate_result result; - - if (in_nbytes < GZIP_MIN_OVERHEAD) - return LIBDEFLATE_BAD_DATA; - - /* ID1 */ - if (*in_next++ != GZIP_ID1) - return LIBDEFLATE_BAD_DATA; - /* ID2 */ - if (*in_next++ != GZIP_ID2) - return LIBDEFLATE_BAD_DATA; - /* CM */ - if (*in_next++ != GZIP_CM_DEFLATE) - return LIBDEFLATE_BAD_DATA; - flg = *in_next++; - /* MTIME */ - in_next += 4; - /* XFL */ - in_next += 1; - /* OS */ - in_next += 1; - - if (flg & GZIP_FRESERVED) - return LIBDEFLATE_BAD_DATA; - - /* Extra field */ - if (flg & GZIP_FEXTRA) { - u16 xlen = get_unaligned_le16(in_next); - in_next += 2; - - if (in_end - 
in_next < (u32)xlen + GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - - in_next += xlen; - } - - /* Original file name (zero terminated) */ - if (flg & GZIP_FNAME) { - while (*in_next++ != 0 && in_next != in_end) - ; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* File comment (zero terminated) */ - if (flg & GZIP_FCOMMENT) { - while (*in_next++ != 0 && in_next != in_end) - ; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* CRC16 for gzip header */ - if (flg & GZIP_FHCRC) { - in_next += 2; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* Compressed data */ - result = libdeflate_deflate_decompress_ex(d, in_next, - in_end - GZIP_FOOTER_SIZE - in_next, - out, out_nbytes_avail, - &actual_in_nbytes, - actual_out_nbytes_ret); - if (result != LIBDEFLATE_SUCCESS) - return result; - - if (actual_out_nbytes_ret) - actual_out_nbytes = *actual_out_nbytes_ret; - else - actual_out_nbytes = out_nbytes_avail; - - in_next += actual_in_nbytes; - - /* CRC32 */ - if (libdeflate_crc32(0, out, actual_out_nbytes) != - get_unaligned_le32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - /* ISIZE */ - if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - if (actual_in_nbytes_ret) - *actual_in_nbytes_ret = in_next - (u8 *)in; - - return LIBDEFLATE_SUCCESS; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u8 flg; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < GZIP_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* ID1 */ + if (*in_next++ != GZIP_ID1) + return LIBDEFLATE_BAD_DATA; + /* ID2 */ + if (*in_next++ != GZIP_ID2) + return LIBDEFLATE_BAD_DATA; + /* CM */ + if (*in_next++ != GZIP_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + flg = *in_next++; + /* MTIME */ + in_next += 4; + /* XFL */ + in_next += 1; + /* OS */ + in_next += 1; + + if (flg & GZIP_FRESERVED) + return LIBDEFLATE_BAD_DATA; + + /* Extra field */ + if (flg & GZIP_FEXTRA) { + u16 xlen = get_unaligned_le16(in_next); + in_next += 2; + + if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + + in_next += xlen; + } + + /* Original file name (zero terminated) */ + if (flg & GZIP_FNAME) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* File comment (zero terminated) */ + if (flg & GZIP_FCOMMENT) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* CRC16 for gzip header */ + if (flg & GZIP_FHCRC) { + in_next += 2; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - GZIP_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, + actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* CRC32 */ + if (libdeflate_crc32(0, out, actual_out_nbytes) != + get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + /* ISIZE */ + if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = 
in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; } LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) { - return libdeflate_gzip_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); + return libdeflate_gzip_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); } diff --git a/Sources/DEFLATE/lib_common.h b/Sources/DEFLATE/lib_common.h index 8c9ff5fe..68bf734b 100644 --- a/Sources/DEFLATE/lib_common.h +++ b/Sources/DEFLATE/lib_common.h @@ -6,10 +6,10 @@ #define LIB_LIB_COMMON_H #ifdef LIBDEFLATE_H - /* - * When building the library, LIBDEFLATEAPI needs to be defined properly before - * including libdeflate.h. - */ +/* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ # error "lib_common.h must always be included before libdeflate.h" #endif @@ -35,7 +35,7 @@ # define LIBDEFLATE_ALIGN_STACK #endif -#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK #include "../common_defs.h" @@ -46,7 +46,7 @@ extern malloc_func_t libdeflate_default_malloc_func; extern free_func_t libdeflate_default_free_func; void *libdeflate_aligned_malloc(malloc_func_t malloc_func, - size_t alignment, size_t size); + size_t alignment, size_t size); void libdeflate_aligned_free(free_func_t free_func, void *ptr); #ifdef FREESTANDING @@ -63,16 +63,16 @@ void libdeflate_aligned_free(free_func_t free_func, void *ptr); * We still need the actual function definitions in case gcc calls them. 
*/ void *memset(void *s, int c, size_t n); -#define memset(s, c, n) __builtin_memset((s), (c), (n)) +#define memset(s, c, n) __builtin_memset((s), (c), (n)) void *memcpy(void *dest, const void *src, size_t n); -#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) +#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) void *memmove(void *dest, const void *src, size_t n); -#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) +#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) int memcmp(const void *s1, const void *s2, size_t n); -#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) +#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) #undef LIBDEFLATE_ENABLE_ASSERTIONS #else @@ -86,13 +86,13 @@ int memcmp(const void *s1, const void *s2, size_t n); #ifdef LIBDEFLATE_ENABLE_ASSERTIONS void libdeflate_assertion_failed(const char *expr, const char *file, int line); #define ASSERT(expr) { if (unlikely(!(expr))) \ - libdeflate_assertion_failed(#expr, __FILE__, __LINE__); } +libdeflate_assertion_failed(#expr, __FILE__, __LINE__); } #else #define ASSERT(expr) (void)(expr) #endif -#define CONCAT_IMPL(a, b) a##b -#define CONCAT(a, b) CONCAT_IMPL(a, b) -#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) +#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) #endif /* LIB_LIB_COMMON_H */ diff --git a/Sources/DEFLATE/matchfinder_common.h b/Sources/DEFLATE/matchfinder_common.h index a47d1070..8094c2b2 100644 --- a/Sources/DEFLATE/matchfinder_common.h +++ b/Sources/DEFLATE/matchfinder_common.h @@ -20,10 +20,10 @@ static forceinline u32 loaded_u32_to_u24(u32 v) { - if (CPU_IS_LITTLE_ENDIAN()) - return v & 0xFFFFFF; - else - return v >> 8; + if (CPU_IS_LITTLE_ENDIAN()) + return v & 0xFFFFFF; + else + return v >> 8; } /* @@ -35,12 +35,12 @@ static forceinline u32 load_u24_unaligned(const u8 *p) { #if UNALIGNED_ACCESS_IS_FAST - return loaded_u32_to_u24(load_u32_unaligned(p)); + return loaded_u32_to_u24(load_u32_unaligned(p)); #else - if (CPU_IS_LITTLE_ENDIAN()) - return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); - else - return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); + if (CPU_IS_LITTLE_ENDIAN()) + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); + else + return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); #endif } @@ -51,18 +51,43 @@ typedef s16 mf_pos_t; #define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) /* - * Required alignment of the matchfinder buffer pointer and size. The values - * here come from the AVX-2 implementation, which is the worst case. + * This is the memory address alignment, in bytes, required for the matchfinder + * buffers by the architecture-specific implementations of matchfinder_init() + * and matchfinder_rebase(). "Matchfinder buffer" means an entire struct + * hc_matchfinder, bt_matchfinder, or ht_matchfinder; the next_tab field of + * struct hc_matchfinder; or the child_tab field of struct bt_matchfinder. + * + * This affects how the entire 'struct deflate_compressor' is allocated, since + * the matchfinder structures are embedded inside it. + * + * Currently the maximum memory address alignment required is 32 bytes, needed + * by the AVX-2 matchfinder functions. 
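+ *
+ * In practice this is satisfied by allocating the compressor through
+ * libdeflate_aligned_malloc() (see utils.c in this patch), which
+ * over-allocates by 'alignment - 1' bytes plus one pointer, rounds the
+ * address up to the requested boundary, and stores the original pointer
+ * just below the aligned block so libdeflate_aligned_free() can recover it.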
+ */ +#define MATCHFINDER_MEM_ALIGNMENT 32 + +/* + * This declares a size, in bytes, that is guaranteed to divide the sizes of the + * matchfinder buffers (where "matchfinder buffers" is as defined for + * MATCHFINDER_MEM_ALIGNMENT). The architecture-specific implementations of + * matchfinder_init() and matchfinder_rebase() take advantage of this value. + * + * Currently the maximum size alignment required is 128 bytes, needed by + * the AVX-2 matchfinder functions. However, the RISC-V Vector Extension + * matchfinder functions can, in principle, take advantage of a larger size + * alignment. Therefore, we set this to 1024, which still easily divides the + * actual sizes that result from the current matchfinder struct definitions. + * This value can safely be changed to any power of two that is >= 128. */ -#define MATCHFINDER_MEM_ALIGNMENT 32 -#define MATCHFINDER_SIZE_ALIGNMENT 128 +#define MATCHFINDER_SIZE_ALIGNMENT 1024 #undef matchfinder_init #undef matchfinder_rebase #ifdef _aligned_attribute # define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) # if defined(ARCH_ARM32) || defined(ARCH_ARM64) -# include "matchfinder_impl.h" +# include "arm/matchfinder_impl.h" +# elif defined(ARCH_RISCV) +# include "riscv/matchfinder_impl.h" # elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/matchfinder_impl.h" # endif @@ -82,11 +107,11 @@ typedef s16 mf_pos_t; static forceinline void matchfinder_init(mf_pos_t *data, size_t size) { - size_t num_entries = size / sizeof(*data); - size_t i; - - for (i = 0; i < num_entries; i++) - data[i] = MATCHFINDER_INITVAL; + size_t num_entries = size / sizeof(*data); + size_t i; + + for (i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; } #endif @@ -111,25 +136,25 @@ matchfinder_init(mf_pos_t *data, size_t size) static forceinline void matchfinder_rebase(mf_pos_t *data, size_t size) { - size_t num_entries = size / sizeof(*data); - size_t i; - - if (MATCHFINDER_WINDOW_SIZE == 32768) { - /* - * Branchless version for 32768-byte windows. Clear all bits if - * the value was already negative, then set the sign bit. This - * is equivalent to subtracting 32768 with signed saturation. - */ - for (i = 0; i < num_entries; i++) - data[i] = 0x8000 | (data[i] & ~(data[i] >> 15)); - } else { - for (i = 0; i < num_entries; i++) { - if (data[i] >= 0) - data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - else - data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - } - } + size_t num_entries = size / sizeof(*data); + size_t i; + + if (MATCHFINDER_WINDOW_SIZE == 32768) { + /* + * Branchless version for 32768-byte windows. Clear all bits if + * the value was already negative, then set the sign bit. This + * is equivalent to subtracting 32768 with signed saturation. 
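+ *
+ * Worked example (assuming the usual arithmetic right shift of signed
+ * values): an entry of 100 (0x0064) has 100 >> 15 == 0, so the mask keeps
+ * every bit and OR-ing in 0x8000 gives 0x8064 == -32668 == 100 - 32768.
+ * An already-negative entry such as -5 (0xFFFB) has -5 >> 15 == -1, so the
+ * mask clears all bits and the result is 0x8000 == -32768, i.e. saturated.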
+ */ + for (i = 0; i < num_entries; i++) + data[i] = 0x8000 | (data[i] & ~(data[i] >> 15)); + } else { + for (i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + else + data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + } + } } #endif @@ -143,7 +168,7 @@ matchfinder_rebase(mf_pos_t *data, size_t size) static forceinline u32 lz_hash(u32 seq, unsigned num_bits) { - return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); + return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); } /* @@ -152,48 +177,48 @@ lz_hash(u32 seq, unsigned num_bits) */ static forceinline unsigned lz_extend(const u8 * const strptr, const u8 * const matchptr, - const unsigned start_len, const unsigned max_len) + const unsigned start_len, const unsigned max_len) { - unsigned len = start_len; - machine_word_t v_word; - - if (UNALIGNED_ACCESS_IS_FAST) { - - if (likely(max_len - len >= 4 * WORDBYTES)) { - - #define COMPARE_WORD_STEP \ - v_word = load_word_unaligned(&matchptr[len]) ^ \ - load_word_unaligned(&strptr[len]); \ - if (v_word != 0) \ - goto word_differs; \ - len += WORDBYTES; \ - - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - #undef COMPARE_WORD_STEP - } - - while (len + WORDBYTES <= max_len) { - v_word = load_word_unaligned(&matchptr[len]) ^ - load_word_unaligned(&strptr[len]); - if (v_word != 0) - goto word_differs; - len += WORDBYTES; - } - } - - while (len < max_len && matchptr[len] == strptr[len]) - len++; - return len; - + unsigned len = start_len; + machine_word_t v_word; + + if (UNALIGNED_ACCESS_IS_FAST) { + + if (likely(max_len - len >= 4 * WORDBYTES)) { + +#define COMPARE_WORD_STEP \ +v_word = load_word_unaligned(&matchptr[len]) ^ \ +load_word_unaligned(&strptr[len]); \ +if (v_word != 0) \ +goto word_differs; \ +len += WORDBYTES; \ + + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP +#undef COMPARE_WORD_STEP + } + + while (len + WORDBYTES <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDBYTES; + } + } + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + word_differs: - if (CPU_IS_LITTLE_ENDIAN()) - len += (bsfw(v_word) >> 3); - else - len += (WORDBITS - 1 - bsrw(v_word)) >> 3; - return len; + if (CPU_IS_LITTLE_ENDIAN()) + len += (bsfw(v_word) >> 3); + else + len += (WORDBITS - 1 - bsrw(v_word)) >> 3; + return len; } #endif /* LIB_MATCHFINDER_COMMON_H */ diff --git a/Sources/DEFLATE/riscv/matchfinder_impl.h b/Sources/DEFLATE/riscv/matchfinder_impl.h new file mode 100644 index 00000000..76081fa7 --- /dev/null +++ b/Sources/DEFLATE/riscv/matchfinder_impl.h @@ -0,0 +1,97 @@ +/* + * riscv/matchfinder_impl.h - RISC-V implementations of matchfinder functions + * + * Copyright 2024 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_RISCV_MATCHFINDER_IMPL_H +#define LIB_RISCV_MATCHFINDER_IMPL_H + +#if defined(ARCH_RISCV) && defined(__riscv_vector) +#include + +/* + * Return the maximum number of 16-bit (mf_pos_t) elements that fit in 8 RISC-V + * vector registers and also evenly divide the sizes of the matchfinder buffers. + */ +static forceinline size_t +riscv_matchfinder_vl(void) +{ + const size_t vl = __riscv_vsetvlmax_e16m8(); + + STATIC_ASSERT(sizeof(mf_pos_t) == sizeof(s16)); + /* + * MATCHFINDER_SIZE_ALIGNMENT is a power of 2, as is 'vl' because the + * RISC-V Vector Extension requires that the vector register length + * (VLEN) be a power of 2. Thus, a simple MIN() gives the correct + * answer here; rounding to a power of 2 is not required. + */ + STATIC_ASSERT((MATCHFINDER_SIZE_ALIGNMENT & + (MATCHFINDER_SIZE_ALIGNMENT - 1)) == 0); + ASSERT((vl & (vl - 1)) == 0); + return MIN(vl, MATCHFINDER_SIZE_ALIGNMENT / sizeof(mf_pos_t)); +} + +/* matchfinder_init() optimized using the RISC-V Vector Extension */ +static forceinline void +matchfinder_init_rvv(mf_pos_t *p, size_t size) +{ + const size_t vl = riscv_matchfinder_vl(); + const vint16m8_t v = __riscv_vmv_v_x_i16m8(MATCHFINDER_INITVAL, vl); + + ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); + do { + __riscv_vse16_v_i16m8(p, v, vl); + p += vl; + size -= vl * sizeof(p[0]); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_rvv + +/* matchfinder_rebase() optimized using the RISC-V Vector Extension */ +static forceinline void +matchfinder_rebase_rvv(mf_pos_t *p, size_t size) +{ + const size_t vl = riscv_matchfinder_vl(); + + ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); + do { + vint16m8_t v = __riscv_vle16_v_i16m8(p, vl); + + /* + * This should generate the vsadd.vx instruction + * (Vector Saturating Add, integer vector-scalar) + */ + v = __riscv_vsadd_vx_i16m8(v, (s16)-MATCHFINDER_WINDOW_SIZE, + vl); + __riscv_vse16_v_i16m8(p, v, vl); + p += vl; + size -= vl * sizeof(p[0]); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_rvv + +#endif /* ARCH_RISCV && __riscv_vector */ + +#endif /* LIB_RISCV_MATCHFINDER_IMPL_H */ \ No newline at end of file diff --git a/Sources/DEFLATE/utils.c b/Sources/DEFLATE/utils.c index c1e4cc26..3eea24c8 100644 --- a/Sources/DEFLATE/utils.c +++ b/Sources/DEFLATE/utils.c @@ -39,31 +39,31 @@ free_func_t libdeflate_default_free_func = free; void * libdeflate_aligned_malloc(malloc_func_t malloc_func, - size_t alignment, size_t size) + size_t alignment, size_t size) { - void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); - - if (ptr) { - void *orig_ptr = ptr; - - ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); - ((void **)ptr)[-1] = orig_ptr; - } - return ptr; + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); + + if (ptr) { + void *orig_ptr = ptr; + + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); + ((void **)ptr)[-1] = orig_ptr; + } + return ptr; } void libdeflate_aligned_free(free_func_t free_func, void *ptr) { - 
(*free_func)(((void **)ptr)[-1]); + (*free_func)(((void **)ptr)[-1]); } LIBDEFLATEAPI void libdeflate_set_memory_allocator(malloc_func_t malloc_func, - free_func_t free_func) + free_func_t free_func) { - libdeflate_default_malloc_func = malloc_func; - libdeflate_default_free_func = free_func; + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; } /* @@ -76,56 +76,56 @@ libdeflate_set_memory_allocator(malloc_func_t malloc_func, void * __attribute__((weak)) memset(void *s, int c, size_t n) { - u8 *p = s; - size_t i; - - for (i = 0; i < n; i++) - p[i] = c; - return s; + u8 *p = s; + size_t i; + + for (i = 0; i < n; i++) + p[i] = c; + return s; } #undef memcpy void * __attribute__((weak)) memcpy(void *dest, const void *src, size_t n) { - u8 *d = dest; - const u8 *s = src; - size_t i; - - for (i = 0; i < n; i++) - d[i] = s[i]; - return dest; + u8 *d = dest; + const u8 *s = src; + size_t i; + + for (i = 0; i < n; i++) + d[i] = s[i]; + return dest; } #undef memmove void * __attribute__((weak)) memmove(void *dest, const void *src, size_t n) { - u8 *d = dest; - const u8 *s = src; - size_t i; - - if (d <= s) - return memcpy(d, s, n); - - for (i = n; i > 0; i--) - d[i - 1] = s[i - 1]; - return dest; + u8 *d = dest; + const u8 *s = src; + size_t i; + + if (d <= s) + return memcpy(d, s, n); + + for (i = n; i > 0; i--) + d[i - 1] = s[i - 1]; + return dest; } #undef memcmp int __attribute__((weak)) memcmp(const void *s1, const void *s2, size_t n) { - const u8 *p1 = s1; - const u8 *p2 = s2; - size_t i; - - for (i = 0; i < n; i++) { - if (p1[i] != p2[i]) - return (int)p1[i] - (int)p2[i]; - } - return 0; + const u8 *p1 = s1; + const u8 *p2 = s2; + size_t i; + + for (i = 0; i < n; i++) { + if (p1[i] != p2[i]) + return (int)p1[i] - (int)p2[i]; + } + return 0; } #endif /* FREESTANDING */ @@ -135,7 +135,7 @@ memcmp(const void *s1, const void *s2, size_t n) void libdeflate_assertion_failed(const char *expr, const char *file, int line) { - fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line); - abort(); + fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line); + abort(); } #endif /* LIBDEFLATE_ENABLE_ASSERTIONS */ diff --git a/Sources/DEFLATE/x86/adler32_impl.h b/Sources/DEFLATE/x86/adler32_impl.h new file mode 100644 index 00000000..0aacdda3 --- /dev/null +++ b/Sources/DEFLATE/x86/adler32_impl.h @@ -0,0 +1,127 @@ +/* + * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_ADLER32_IMPL_H +#define LIB_X86_ADLER32_IMPL_H + +#include "x86/cpu_features.h" + +/* SSE2 and AVX2 implementations. Used on older CPUs. */ +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +# define adler32_x86_sse2 adler32_x86_sse2 +# define SUFFIX _x86_sse2 +# define ATTRIBUTES _target_attribute("sse2") +# define VL 16 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" + +# define adler32_x86_avx2 adler32_x86_avx2 +# define SUFFIX _x86_avx2 +# define ATTRIBUTES _target_attribute("avx2") +# define VL 32 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" +#endif + +/* + * AVX-VNNI implementation. This is used on CPUs that have AVX2 and AVX-VNNI + * but don't have AVX-512, for example Intel Alder Lake. + */ +#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) +# define adler32_x86_avx2_vnni adler32_x86_avx2_vnni +# define SUFFIX _x86_avx2_vnni +# define ATTRIBUTES _target_attribute("avx2,avxvnni") +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 0 +# include "adler32_template.h" +#endif + +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +/* + * AVX512VNNI implementation using 256-bit vectors. This is very similar to the + * AVX-VNNI implementation but takes advantage of masking and more registers. + * This is used on CPUs that support AVX-512 but where using 512-bit vectors + * causes downclocking. This should also be the optimal implementation on CPUs + * that support AVX10/256 but not AVX10/512. + */ +# define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni +# define SUFFIX _x86_avx512_vl256_vnni +# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" + +/* + * AVX512VNNI implementation using 512-bit vectors. This is used on CPUs that + * have a good AVX-512 implementation including AVX512VNNI. This should also be + * the optimal implementation on CPUs that support AVX10/512. 
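+ *
+ * None of these variants is called by name; arch_select_adler32_func()
+ * below checks the CPU feature bits at runtime and returns the most capable
+ * variant that was compiled in (the 512-bit AVX512VNNI one first, then the
+ * 256-bit AVX512VNNI, AVX-VNNI, AVX2 and SSE2 ones), or NULL if none apply.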
+ */ +# define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni +# define SUFFIX _x86_avx512_vl512_vnni +# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") +# define VL 64 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" +#endif + +static inline adler32_func_t +arch_select_adler32_func(void) +{ + const u32 features MAYBE_UNUSED = get_x86_cpu_features(); + +#ifdef adler32_x86_avx512_vl512_vnni + if ((features & X86_CPU_FEATURE_ZMM) && + HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features)) + return adler32_x86_avx512_vl512_vnni; +#endif +#ifdef adler32_x86_avx512_vl256_vnni + if (HAVE_AVX512BW(features) && HAVE_AVX512VL(features) && + HAVE_AVX512VNNI(features)) + return adler32_x86_avx512_vl256_vnni; +#endif +#ifdef adler32_x86_avx2_vnni + if (HAVE_AVX2(features) && HAVE_AVXVNNI(features)) + return adler32_x86_avx2_vnni; +#endif +#ifdef adler32_x86_avx2 + if (HAVE_AVX2(features)) + return adler32_x86_avx2; +#endif +#ifdef adler32_x86_sse2 + if (HAVE_SSE2(features)) + return adler32_x86_sse2; +#endif + return NULL; +} +#define arch_select_adler32_func arch_select_adler32_func + +#endif /* LIB_X86_ADLER32_IMPL_H */ diff --git a/Sources/DEFLATE/x86/adler32_template.h b/Sources/DEFLATE/x86/adler32_template.h new file mode 100644 index 00000000..77087741 --- /dev/null +++ b/Sources/DEFLATE/x86/adler32_template.h @@ -0,0 +1,512 @@ +/* + * x86/adler32_template.h - template for vectorized Adler-32 implementations + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating Adler-32 functions for x86. + * The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. Must satisfy the dependencies of the + * other parameters as follows: + * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2 + * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2 + * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni + * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni + * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni + * (Other combinations are not useful and have not been tested.) + * VL: + * Vector length in bytes. Must be 16, 32, and 64. + * USE_VNNI: + * If 1, use the VNNI dot product based algorithm. + * If 0, use the legacy SSE2 and AVX2 compatible algorithm. 
+ * USE_MASKING: + * If 1, use AVX-512 features such as masking. + * If 0, assume that the CPU might not support AVX-512. + */ + +#if VL == 16 +# define vec_t __m128i +# define mask_t u16 +# define LOG2_VL 4 +# define VADD8(a, b) _mm_add_epi8((a), (b)) +# define VADD16(a, b) _mm_add_epi16((a), (b)) +# define VADD32(a, b) _mm_add_epi32((a), (b)) +# if USE_MASKING +# define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c)) +# else +# define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c)) +# endif +# define VLOAD(p) _mm_load_si128((const void *)(p)) +# define VLOADU(p) _mm_loadu_si128((const void *)(p)) +# define VMADD16(a, b) _mm_madd_epi16((a), (b)) +# define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm_mullo_epi32((a), (b)) +# define VSAD8(a, b) _mm_sad_epu8((a), (b)) +# define VSET1_32(a) _mm_set1_epi32(a) +# define VSET1_8(a) _mm_set1_epi8(a) +# define VSETZERO() _mm_setzero_si128() +# define VSLL32(a, b) _mm_slli_epi32((a), (b)) +# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) +# define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b)) +#elif VL == 32 +# define vec_t __m256i +# define mask_t u32 +# define LOG2_VL 5 +# define VADD8(a, b) _mm256_add_epi8((a), (b)) +# define VADD16(a, b) _mm256_add_epi16((a), (b)) +# define VADD32(a, b) _mm256_add_epi32((a), (b)) +# if USE_MASKING +# define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c)) +# else +# define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c)) +# endif +# define VLOAD(p) _mm256_load_si256((const void *)(p)) +# define VLOADU(p) _mm256_loadu_si256((const void *)(p)) +# define VMADD16(a, b) _mm256_madd_epi16((a), (b)) +# define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm256_mullo_epi32((a), (b)) +# define VSAD8(a, b) _mm256_sad_epu8((a), (b)) +# define VSET1_32(a) _mm256_set1_epi32(a) +# define VSET1_8(a) _mm256_set1_epi8(a) +# define VSETZERO() _mm256_setzero_si256() +# define VSLL32(a, b) _mm256_slli_epi32((a), (b)) +# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) +# define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b)) +#elif VL == 64 +# define vec_t __m512i +# define mask_t u64 +# define LOG2_VL 6 +# define VADD8(a, b) _mm512_add_epi8((a), (b)) +# define VADD32(a, b) _mm512_add_epi32((a), (b)) +# define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c)) +# define VLOAD(p) _mm512_load_si512((const void *)(p)) +# define VLOADU(p) _mm512_loadu_si512((const void *)(p)) +# define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm512_mullo_epi32((a), (b)) +# define VSET1_32(a) _mm512_set1_epi32(a) +# define VSET1_8(a) _mm512_set1_epi8(a) +# define VSETZERO() _mm512_setzero_si512() +# define VSLL32(a, b) _mm512_slli_epi32((a), (b)) +#else +# error "unsupported vector length" +#endif + +#define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c)) +#define VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d))) +#define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e))) +#define VADD32_7X(a, b, c, d, e, f, g) \ +VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g))) + +/* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. 
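+ *
+ * The reduction proceeds by halving: a 512-bit vector is split into two
+ * 256-bit halves and added, 256 bits into two 128-bit halves, and the
+ * 128-bit remainder is folded lane-wise with _mm_shuffle_epi32() plus
+ * _mm_add_epi32() until the total sits in lane 0, where
+ * _mm_cvtsi128_si32() reads it out.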
*/ +#undef reduce_to_32bits +static forceinline ATTRIBUTES void +ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) +{ + __m128i v_s1_128, v_s2_128; +#if VL == 16 + { + v_s1_128 = v_s1; + v_s2_128 = v_s2; + } +#else + { + __m256i v_s1_256, v_s2_256; +#if VL == 32 + v_s1_256 = v_s1; + v_s2_256 = v_s2; +#else + /* Reduce 512 bits to 256 bits. */ + v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0), + _mm512_extracti64x4_epi64(v_s1, 1)); + v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0), + _mm512_extracti64x4_epi64(v_s2, 1)); +#endif + /* Reduce 256 bits to 128 bits. */ + v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0), + _mm256_extracti128_si256(v_s1_256, 1)); + v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0), + _mm256_extracti128_si256(v_s2_256, 1)); + } +#endif + + /* + * Reduce 128 bits to 32 bits. + * + * If the bytes were summed into v_s1 using psadbw + paddd, then ignore + * the odd-indexed elements of v_s1_128 since they are zero. + */ +#if USE_VNNI + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31)); +#endif + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31)); + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02)); + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02)); + + *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128); + *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128); +} +#define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) + +static u32 ATTRIBUTES +ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) +{ +#if USE_VNNI + /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ + static const u8 _aligned_attribute(VL) raw_mults[VL] = { +#if VL == 64 + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, +#endif +#if VL >= 32 + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, +#endif + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const vec_t ones = VSET1_8(1); +#else + /* + * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1]. + * For VL==32 the ordering is weird because it has to match the way that + * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately. + */ + static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = { +#if VL == 16 + { 32, 31, 30, 29, 28, 27, 26, 25 }, + { 24, 23, 22, 21, 20, 19, 18, 17 }, + { 16, 15, 14, 13, 12, 11, 10, 9 }, + { 8, 7, 6, 5, 4, 3, 2, 1 }, +#elif VL == 32 + { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, + { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, + { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, + { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, +#else +# error "unsupported parameters" +#endif + }; + const vec_t mults_a = VLOAD(raw_mults[0]); + const vec_t mults_b = VLOAD(raw_mults[1]); + const vec_t mults_c = VLOAD(raw_mults[2]); + const vec_t mults_d = VLOAD(raw_mults[3]); +#endif + const vec_t zeroes = VSETZERO(); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the misaligned load penalty. + */ + if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & (VL-1)); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + +#if USE_VNNI + /* + * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or + * AVX-VNNI. 
vpdpbusd multiplies the unsigned bytes of one vector by + * the signed bytes of another vector and adds the sums in groups of 4 + * to the 32-bit elements of a third vector. We use it in two ways: + * multiplying the data bytes by a sequence like 64,63,62,...,1 for + * calculating part of s2, and multiplying the data bytes by an all-ones + * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones + * trick seems to be faster than the alternative of vpsadbw + vpaddd. + */ + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1)); + vec_t mults = VLOAD(raw_mults); + vec_t v_s1 = zeroes; + vec_t v_s2 = zeroes; + + s2 += s1 * n; + len -= n; + + if (n >= 4*VL) { + vec_t v_s1_b = zeroes; + vec_t v_s1_c = zeroes; + vec_t v_s1_d = zeroes; + vec_t v_s2_b = zeroes; + vec_t v_s2_c = zeroes; + vec_t v_s2_d = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_s1_sums_b = zeroes; + vec_t v_s1_sums_c = zeroes; + vec_t v_s1_sums_d = zeroes; + vec_t tmp0, tmp1; + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + vec_t data_c = VLOADU(p + 2*VL); + vec_t data_d = VLOADU(p + 3*VL); + + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ +#if GCC_PREREQ(1, 0) + __asm__("" : "+v" (data_a), "+v" (data_b), + "+v" (data_c), "+v" (data_d)); +#endif + + v_s2 = VDPBUSD(v_s2, data_a, mults); + v_s2_b = VDPBUSD(v_s2_b, data_b, mults); + v_s2_c = VDPBUSD(v_s2_c, data_c, mults); + v_s2_d = VDPBUSD(v_s2_d, data_d, mults); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b); + v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c); + v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d); + + v_s1 = VDPBUSD(v_s1, data_a, ones); + v_s1_b = VDPBUSD(v_s1_b, data_b, ones); + v_s1_c = VDPBUSD(v_s1_c, data_c, ones); + v_s1_d = VDPBUSD(v_s1_d, data_d, ones); + + /* Same gcc bug workaround. See above */ +#if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32) + __asm__("" : "+v" (v_s2), "+v" (v_s2_b), + "+v" (v_s2_c), "+v" (v_s2_d), + "+v" (v_s1_sums), + "+v" (v_s1_sums_b), + "+v" (v_s1_sums_c), + "+v" (v_s1_sums_d), + "+v" (v_s1), "+v" (v_s1_b), + "+v" (v_s1_c), "+v" (v_s1_d)); +#endif + p += 4*VL; + n -= 4*VL; + } while (n >= 4*VL); + + /* + * Reduce into v_s1 and v_s2 as follows: + * + * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d + + * (4*VL)*(v_s1_sums + v_s1_sums_b + + * v_s1_sums_c + v_s1_sums_d) + + * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c + * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d + */ + tmp0 = VADD32(v_s1, v_s1_b); + tmp1 = VADD32(v_s1, v_s1_c); + v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b, + v_s1_sums_c, v_s1_sums_d); + v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d); + v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2), + VSLL32(tmp0, LOG2_VL + 1), + VSLL32(tmp1, LOG2_VL), + v_s2, v_s2_b, v_s2_c, v_s2_d); + } + + /* Process the last 0 <= n < 4*VL bytes of the chunk. */ + if (n >= 2*VL) { + const vec_t data_a = VLOADU(p + 0*VL); + const vec_t data_b = VLOADU(p + 1*VL); + + v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1)); + v_s1 = VDPBUSD(v_s1, data_a, ones); + v_s1 = VDPBUSD(v_s1, data_b, ones); + v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL)); + v_s2 = VDPBUSD(v_s2, data_a, mults); + v_s2 = VDPBUSD(v_s2, data_b, mults); + p += 2*VL; + n -= 2*VL; + } + if (n) { + /* Process the last 0 < n < 2*VL bytes of the chunk. 
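+ * The multipliers are rebased with VADD8 so that the last byte of the
+ * chunk still gets weight 1: e.g. with VL == 16 and n == 5 they become
+ * [5, 4, 3, 2, 1, 0, -1, ...], and since the bytes past index 4 are
+ * loaded as zeroes (masked load or zero-padded copy) the meaningless
+ * trailing multipliers contribute nothing.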
*/ + vec_t data; + + v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n))); + + mults = VADD8(mults, VSET1_8((int)n - VL)); + if (n > VL) { + data = VLOADU(p); + v_s1 = VDPBUSD(v_s1, data, ones); + v_s2 = VDPBUSD(v_s2, data, mults); + p += VL; + n -= VL; + mults = VADD8(mults, VSET1_8(-VL)); + } + /* + * Process the last 0 < n <= VL bytes of the chunk. + * Utilize a masked load if it's available. + */ +#if USE_MASKING + data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p); +#else + data = zeroes; + memcpy(&data, p, n); +#endif + v_s1 = VDPBUSD(v_s1, data, ones); + v_s2 = VDPBUSD(v_s2, data, mults); + p += n; + } + + reduce_to_32bits(v_s1, v_s2, &s1, &s2); + s1 %= DIVISOR; + s2 %= DIVISOR; + } +#else /* USE_VNNI */ + /* + * This is Adler-32 for SSE2 and AVX2. + * + * To horizontally sum bytes, use psadbw + paddd, where one of the + * arguments to psadbw is all-zeroes. + * + * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL + * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw + * to sum, for each i across iterations, byte i into a corresponding + * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd + * to multiply each counter by (2*VL - i), then add the products to s2. + * + * An alternative implementation would use pmaddubsw and pmaddwd in the + * inner loop to do (2*VL - i)*data[i] directly and add the products in + * groups of 4 to 32-bit counters. However, on average that approach + * seems to be slower than the current approach which delays the + * multiplications. Also, pmaddubsw requires SSSE3; the current + * approach keeps the implementation aligned between SSE2 and AVX2. + * + * The inner loop processes 2*VL bytes per iteration. Increasing this + * to 4*VL doesn't seem to be helpful here. + */ + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX, and every + * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX. + * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are + * used with pmaddwd which does signed multiplication. In the + * SSE2 case this limits chunks to 4096 bytes instead of 5504. + */ + size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), + MAX_CHUNK_LEN) & ~(2*VL - 1)); + len -= n; + + if (n >= 2*VL) { + vec_t v_s1 = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_byte_sums_a = zeroes; + vec_t v_byte_sums_b = zeroes; + vec_t v_byte_sums_c = zeroes; + vec_t v_byte_sums_d = zeroes; + vec_t v_s2; + + s2 += s1 * (n & ~(2*VL - 1)); + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_byte_sums_a = VADD16(v_byte_sums_a, + VUNPACKLO8(data_a, zeroes)); + v_byte_sums_b = VADD16(v_byte_sums_b, + VUNPACKHI8(data_a, zeroes)); + v_byte_sums_c = VADD16(v_byte_sums_c, + VUNPACKLO8(data_b, zeroes)); + v_byte_sums_d = VADD16(v_byte_sums_d, + VUNPACKHI8(data_b, zeroes)); + v_s1 = VADD32(v_s1, + VADD32(VSAD8(data_a, zeroes), + VSAD8(data_b, zeroes))); + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ +#if GCC_PREREQ(1, 0) + __asm__("" : "+x" (v_s1), "+x" (v_s1_sums), + "+x" (v_byte_sums_a), + "+x" (v_byte_sums_b), + "+x" (v_byte_sums_c), + "+x" (v_byte_sums_d)); +#endif + p += 2*VL; + n -= 2*VL; + } while (n >= 2*VL); + + /* + * Calculate v_s2 as (2*VL)*v_s1_sums + + * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums. + * Then update s1 and s2 from v_s1 and v_s2. 
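+ *
+ * This is equivalent because a byte read at offset i of some iteration
+ * needs a weight of (2*VL - i) within its own iteration plus an extra
+ * 2*VL for every later iteration; the latter is exactly what v_s1_sums
+ * accumulates, since v_s1 is added into it once per iteration before
+ * being updated.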
+ */ + v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1), + VMADD16(v_byte_sums_a, mults_a), + VMADD16(v_byte_sums_b, mults_b), + VMADD16(v_byte_sums_c, mults_c), + VMADD16(v_byte_sums_d, mults_d)); + reduce_to_32bits(v_s1, v_s2, &s1, &s2); + } + /* + * Process the last 0 <= n < 2*VL bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. + */ + ADLER32_CHUNK(s1, s2, p, n); + } +#endif /* !USE_VNNI */ + return (s2 << 16) | s1; +} + +#undef vec_t +#undef mask_t +#undef LOG2_VL +#undef VADD8 +#undef VADD16 +#undef VADD32 +#undef VDPBUSD +#undef VLOAD +#undef VLOADU +#undef VMADD16 +#undef VMASKZ_LOADU +#undef VMULLO32 +#undef VSAD8 +#undef VSET1_8 +#undef VSET1_32 +#undef VSETZERO +#undef VSLL32 +#undef VUNPACKHI8 +#undef VUNPACKLO8 + +#undef SUFFIX +#undef ATTRIBUTES +#undef VL +#undef USE_VNNI +#undef USE_MASKING diff --git a/Sources/DEFLATE/x86/cpu_features.c b/Sources/DEFLATE/x86/cpu_features.c new file mode 100644 index 00000000..991aeb04 --- /dev/null +++ b/Sources/DEFLATE/x86/cpu_features.c @@ -0,0 +1,189 @@ +/* + * x86/cpu_features.c - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "../cpu_features_common.h" /* must be included first */ +#include "x86/cpu_features.h" + +#ifdef X86_CPU_FEATURES_KNOWN +/* Runtime x86 CPU feature detection is supported. */ + +/* Execute the CPUID instruction. */ +static inline void +cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) +{ +#ifdef _MSC_VER + int result[4]; + + __cpuidex(result, leaf, subleaf); + *a = result[0]; + *b = result[1]; + *c = result[2]; + *d = result[3]; +#else + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +#endif +} + +/* Read an extended control register. */ +static inline u64 +read_xcr(u32 index) +{ +#ifdef _MSC_VER + return _xgetbv(index); +#else + u32 d, a; + + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. + * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. 
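+ *
+ * The XCR0 value read here feeds the checks below: bit 1 is SSE state and
+ * bit 2 is AVX state, hence the (xcr0 & 0x6) == 0x6 tests, while bits 5-7
+ * are the opmask and upper-ZMM states required for AVX-512, hence
+ * (xcr0 & 0xe6) == 0xe6.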
+ */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (index)); + + return ((u64)d << 32) | a; +#endif +} + +static const struct cpu_feature x86_cpu_feature_table[] = { + {X86_CPU_FEATURE_SSE2, "sse2"}, + {X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"}, + {X86_CPU_FEATURE_AVX, "avx"}, + {X86_CPU_FEATURE_AVX2, "avx2"}, + {X86_CPU_FEATURE_BMI2, "bmi2"}, + {X86_CPU_FEATURE_ZMM, "zmm"}, + {X86_CPU_FEATURE_AVX512F, "avx512f"}, + {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, + {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, + {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, + {X86_CPU_FEATURE_AVX512VNNI, "avx512_vnni"}, + {X86_CPU_FEATURE_AVXVNNI, "avx_vnni"}, +}; + +volatile u32 libdeflate_x86_cpu_features = 0; + +/* + * Don't use 512-bit vectors on Intel CPUs before Rocket Lake and Sapphire + * Rapids, due to the downclocking penalty. + */ +static inline bool +allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model) +{ +#ifdef TEST_SUPPORT__DO_NOT_USE + return true; +#endif + if (memcmp(manufacturer, "GenuineIntel", 12) != 0) + return true; + if (family != 6) + return true; + switch (model) { + case 85: /* Skylake (Server), Cascade Lake, Cooper Lake */ + case 106: /* Ice Lake (Server) */ + case 108: /* Ice Lake (Server) */ + case 126: /* Ice Lake (Client) */ + case 140: /* Tiger Lake */ + case 141: /* Tiger Lake */ + return false; + } + return true; +} + +/* Initialize libdeflate_x86_cpu_features. */ +void libdeflate_init_x86_cpu_features(void) +{ + u32 max_leaf; + u32 manufacturer[3]; + u32 family, model; + u32 a, b, c, d; + u64 xcr0 = 0; + u32 features = 0; + + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2], + &manufacturer[1]); + if (max_leaf < 1) + goto out; + + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + family = (a >> 8) & 0xf; + model = (a >> 4) & 0xf; + if (family == 6 || family == 0xf) + model += (a >> 12) & 0xf0; + if (family == 0xf) + family += (a >> 20) & 0xff; + if (d & (1 << 26)) + features |= X86_CPU_FEATURE_SSE2; + if (c & (1 << 1)) + features |= X86_CPU_FEATURE_PCLMULQDQ; + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX; + + if (max_leaf < 7) + goto out; + + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX2; + if (b & (1 << 8)) + features |= X86_CPU_FEATURE_BMI2; + if (((xcr0 & 0xe6) == 0xe6) && + allow_512bit_vectors(manufacturer, family, model)) + features |= X86_CPU_FEATURE_ZMM; + if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512F; + if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512BW; + if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512VL; + if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_VPCLMULQDQ; + if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512VNNI; + + /* EAX=7, ECX=1: Extended Features */ + cpuid(7, 1, &a, &b, &c, &d); + if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVXVNNI; + +out: + disable_cpu_features_for_testing(&features, x86_cpu_feature_table, + ARRAY_LEN(x86_cpu_feature_table)); + + libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; +} + +#endif /* X86_CPU_FEATURES_KNOWN */ \ No newline at end of file diff --git a/Sources/DEFLATE/x86/cpu_features.h 
b/Sources/DEFLATE/x86/cpu_features.h new file mode 100644 index 00000000..d5d3f2ac --- /dev/null +++ b/Sources/DEFLATE/x86/cpu_features.h @@ -0,0 +1,176 @@ +/* + * x86/cpu_features.h - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_CPU_FEATURES_H +#define LIB_X86_CPU_FEATURES_H + +#include "../lib_common.h" + +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) + +#define X86_CPU_FEATURE_SSE2 (1 << 0) +#define X86_CPU_FEATURE_PCLMULQDQ (1 << 1) +#define X86_CPU_FEATURE_AVX (1 << 2) +#define X86_CPU_FEATURE_AVX2 (1 << 3) +#define X86_CPU_FEATURE_BMI2 (1 << 4) +/* + * ZMM indicates whether 512-bit vectors (zmm registers) should be used. On + * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU + * supports it, i.e. even if AVX512F is set. On these CPUs, we may still use + * AVX-512 instructions, but only with ymm and xmm registers. + */ +#define X86_CPU_FEATURE_ZMM (1 << 5) +#define X86_CPU_FEATURE_AVX512F (1 << 6) +#define X86_CPU_FEATURE_AVX512BW (1 << 7) +#define X86_CPU_FEATURE_AVX512VL (1 << 8) +#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9) +#define X86_CPU_FEATURE_AVX512VNNI (1 << 10) +#define X86_CPU_FEATURE_AVXVNNI (1 << 11) + +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +/* Runtime x86 CPU feature detection is supported. */ +# define X86_CPU_FEATURES_KNOWN (1U << 31) +extern volatile u32 libdeflate_x86_cpu_features; + +void libdeflate_init_x86_cpu_features(void); + +static inline u32 get_x86_cpu_features(void) +{ + if (libdeflate_x86_cpu_features == 0) + libdeflate_init_x86_cpu_features(); + return libdeflate_x86_cpu_features; +} +/* + * x86 intrinsics are also supported. Include the headers needed to use them. + * Normally just immintrin.h suffices. With clang in MSVC compatibility mode, + * immintrin.h incorrectly skips including sub-headers, so include those too. 
+ */ +# include +# if defined(_MSC_VER) && defined(__clang__) +# include +# include +# include +# include +# include +# include +# include +# include +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# endif +#else +static inline u32 get_x86_cpu_features(void) { return 0; } +#endif + +#if defined(__SSE2__) || \ + (defined(_MSC_VER) && \ + (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) +# define HAVE_SSE2(features) 1 +# define HAVE_SSE2_NATIVE 1 +#else +# define HAVE_SSE2(features) ((features) & X86_CPU_FEATURE_SSE2) +# define HAVE_SSE2_NATIVE 0 +#endif + +#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_PCLMULQDQ(features) 1 +#else +# define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ) +#endif + +#ifdef __AVX__ +# define HAVE_AVX(features) 1 +#else +# define HAVE_AVX(features) ((features) & X86_CPU_FEATURE_AVX) +#endif + +#ifdef __AVX2__ +# define HAVE_AVX2(features) 1 +#else +# define HAVE_AVX2(features) ((features) & X86_CPU_FEATURE_AVX2) +#endif + +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_BMI2(features) 1 +# define HAVE_BMI2_NATIVE 1 +#else +# define HAVE_BMI2(features) ((features) & X86_CPU_FEATURE_BMI2) +# define HAVE_BMI2_NATIVE 0 +#endif + +#ifdef __AVX512F__ +# define HAVE_AVX512F(features) 1 +#else +# define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F) +#endif + +#ifdef __AVX512BW__ +# define HAVE_AVX512BW(features) 1 +#else +# define HAVE_AVX512BW(features) ((features) & X86_CPU_FEATURE_AVX512BW) +#endif + +#ifdef __AVX512VL__ +# define HAVE_AVX512VL(features) 1 +#else +# define HAVE_AVX512VL(features) ((features) & X86_CPU_FEATURE_AVX512VL) +#endif + +#ifdef __VPCLMULQDQ__ +# define HAVE_VPCLMULQDQ(features) 1 +#else +# define HAVE_VPCLMULQDQ(features) ((features) & X86_CPU_FEATURE_VPCLMULQDQ) +#endif + +#ifdef __AVX512VNNI__ +# define HAVE_AVX512VNNI(features) 1 +#else +# define HAVE_AVX512VNNI(features) ((features) & X86_CPU_FEATURE_AVX512VNNI) +#endif + +#ifdef __AVXVNNI__ +# define HAVE_AVXVNNI(features) 1 +#else +# define HAVE_AVXVNNI(features) ((features) & X86_CPU_FEATURE_AVXVNNI) +#endif + +#endif /* ARCH_X86_32 || ARCH_X86_64 */ + +#endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/Sources/DEFLATE/x86/crc32_impl.h b/Sources/DEFLATE/x86/crc32_impl.h new file mode 100644 index 00000000..79494468 --- /dev/null +++ b/Sources/DEFLATE/x86/crc32_impl.h @@ -0,0 +1,137 @@ +/* + * x86/crc32_impl.h - x86 implementations of the gzip CRC-32 algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_CRC32_IMPL_H +#define LIB_X86_CRC32_IMPL_H + +#include "x86/cpu_features.h" + +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +/* PCLMULQDQ implementation */ +# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq +# define SUFFIX _pclmulqdq +# define ATTRIBUTES _target_attribute("pclmul") +# define VL 16 +# define FOLD_LESSTHAN16BYTES 0 +# define USE_TERNARYLOGIC 0 +# include "x86/crc32_pclmul_template.h" + +/* + * PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ + * implementation, this still uses 128-bit vectors, but it has two potential + * benefits. First, simply compiling against the AVX target can improve + * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without + * actually using any AVX intrinsics, probably due to the availability of + * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3 + * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient + * handling of partial blocks. (We *could* compile a variant with + * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) + */ +# define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx +# define SUFFIX _pclmulqdq_avx +# define ATTRIBUTES _target_attribute("pclmul,avx") +# define VL 16 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 +# include "x86/crc32_pclmul_template.h" +#endif + +/* + * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. + * + * Currently this can't be enabled with MSVC because MSVC has a bug where it + * incorrectly assumes that VPCLMULQDQ implies AVX-512: + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + */ +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) +# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 +# define SUFFIX _vpclmulqdq_avx2 +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 +# include "x86/crc32_pclmul_template.h" +#endif + +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +/* + * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage + * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. + * This can be useful on CPUs where 512-bit vectors cause downclocking. 
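+ *
+ * (When the implementation is selected at runtime, the 512-bit variant
+ * below is additionally gated on X86_CPU_FEATURE_ZMM, which cpu_features.c
+ * leaves clear on CPU models known to downclock, so those CPUs fall
+ * through to this 256-bit variant.)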
+ */ +# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 +# define SUFFIX _vpclmulqdq_avx512_vl256 +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "x86/crc32_pclmul_template.h" + +/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ +# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 +# define SUFFIX _vpclmulqdq_avx512_vl512 +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# define VL 64 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "x86/crc32_pclmul_template.h" +#endif + +static inline crc32_func_t +arch_select_crc32_func(void) +{ + const u32 features MAYBE_UNUSED = get_x86_cpu_features(); + +#ifdef crc32_x86_vpclmulqdq_avx512_vl512 + if ((features & X86_CPU_FEATURE_ZMM) && + HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512_vl512; +#endif +#ifdef crc32_x86_vpclmulqdq_avx512_vl256 + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512_vl256; +#endif +#ifdef crc32_x86_vpclmulqdq_avx2 + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX2(features)) + return crc32_x86_vpclmulqdq_avx2; +#endif +#ifdef crc32_x86_pclmulqdq_avx + if (HAVE_PCLMULQDQ(features) && HAVE_AVX(features)) + return crc32_x86_pclmulqdq_avx; +#endif +#ifdef crc32_x86_pclmulqdq + if (HAVE_PCLMULQDQ(features)) + return crc32_x86_pclmulqdq; +#endif + return NULL; +} +#define arch_select_crc32_func arch_select_crc32_func + +#endif /* LIB_X86_CRC32_IMPL_H */ diff --git a/Sources/DEFLATE/x86/crc32_pclmul_template.h b/Sources/DEFLATE/x86/crc32_pclmul_template.h new file mode 100644 index 00000000..4257d449 --- /dev/null +++ b/Sources/DEFLATE/x86/crc32_pclmul_template.h @@ -0,0 +1,487 @@ +/* + * x86/crc32_pclmul_template.h - gzip CRC-32 with PCLMULQDQ instructions + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating PCLMULQDQ-based crc32_x86 + * functions. The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. 
Must satisfy the dependencies of the + * other parameters as follows: + * VL=16 && FOLD_LESSTHAN16BYTES=0: at least pclmul + * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 + * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 + * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl + * VL=64: at least vpclmulqdq,pclmul,avx512vl + * VL: + * Vector length in bytes. Supported values are 16, 32, and 64. + * FOLD_LESSTHAN16BYTES: + * Use vector instructions to handle any partial blocks at the beginning + * and end, instead of falling back to scalar instructions for those parts. + * USE_TERNARYLOGIC: + * Use the vpternlog instruction to do three-argument XORs. + * + * The overall algorithm used is CRC folding with carryless multiplication + * instructions. Note that the x86 crc32 instruction cannot be used, as it is + * for a different polynomial, not the gzip one. For an explanation of CRC + * folding with carryless multiplication instructions, see + * scripts/gen_crc32_multipliers.c and the following paper: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + * + * The original pclmulqdq instruction does one 64x64 to 128-bit carryless + * multiplication. The VPCLMULQDQ feature added instructions that do two + * parallel 64x64 to 128-bit carryless multiplications in combination with AVX + * or AVX512VL, or four in combination with AVX512F. + */ + +#undef fold_vec128 +static forceinline ATTRIBUTES __m128i +ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) +{ + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); + return dst; +} +#define fold_vec128 ADD_SUFFIX(fold_vec128) + +#if VL >= 32 +#undef fold_vec256 +static forceinline ATTRIBUTES __m256i +ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i multipliers) +{ +#if USE_TERNARYLOGIC + return _mm256_ternarylogic_epi32( + _mm256_clmulepi64_epi128(src, multipliers, 0x00), + _mm256_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +#else + return _mm256_xor_si256( + _mm256_xor_si256(dst, + _mm256_clmulepi64_epi128(src, multipliers, 0x00)), + _mm256_clmulepi64_epi128(src, multipliers, 0x11)); +#endif +} +#define fold_vec256 ADD_SUFFIX(fold_vec256) +#endif /* VL >= 32 */ + +#if VL >= 64 +#undef fold_vec512 +static forceinline ATTRIBUTES __m512i +ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) +{ + return _mm512_ternarylogic_epi32( + _mm512_clmulepi64_epi128(src, multipliers, 0x00), + _mm512_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +} +#define fold_vec512 ADD_SUFFIX(fold_vec512) +#endif /* VL >= 64 */ + +#if VL == 16 +# define vec_t __m128i +# define fold_vec fold_vec128 +# define VLOAD_UNALIGNED(p) _mm_loadu_si128((const void *)(p)) +# define VXOR(a, b) _mm_xor_si128((a), (b)) +# define M128I_TO_VEC(a) a +# define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_4V _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_2V _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG) +# define MULTS_1V _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG) +#elif VL == 32 +# define vec_t __m256i +# define fold_vec fold_vec256 +# define VLOAD_UNALIGNED(p) _mm256_loadu_si256((const void *)(p)) +# define VXOR(a, b) _mm256_xor_si256((a), (b)) +# define M128I_TO_VEC(a) 
_mm256_castsi128_si256(a) +# define MULTS(a, b) _mm256_set_epi64x(a, b, a, b) +# define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_2V MULTS(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_1V MULTS(CRC32_X223_MODG, CRC32_X287_MODG) +#elif VL == 64 +# define vec_t __m512i +# define fold_vec fold_vec512 +# define VLOAD_UNALIGNED(p) _mm512_loadu_si512((const void *)(p)) +# define VXOR(a, b) _mm512_xor_si512((a), (b)) +# define M128I_TO_VEC(a) _mm512_castsi128_si512(a) +# define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b) +# define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG) +# define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_2V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_1V MULTS(CRC32_X479_MODG, CRC32_X543_MODG) +#else +# error "unsupported vector length" +#endif + +#if FOLD_LESSTHAN16BYTES +/* + * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to + * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and + * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes, + * respectively. Then fold x0 into x1 and return the result. + * Assumes that 'p + len - 16' is in-bounds. + */ +#undef fold_lessthan16bytes +static forceinline ATTRIBUTES __m128i +ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, + __m128i /* __v2du */ multipliers_128b) +{ + /* + * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. + * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. + */ + static const u8 shift_tab[48] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + __m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]); + __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]); + __m128i x0, x1; + + /* x0 = x left-shifted by '16 - len' bytes */ + x0 = _mm_shuffle_epi8(x, lshift); + + /* + * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len' + * bytes) followed by the remaining data. + */ + x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift), + _mm_loadu_si128((const void *)(p + len - 16)), + /* msb 0/1 of each byte selects byte from arg1/2 */ + rshift); + + return fold_vec128(x0, x1, multipliers_128b); +} +#define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes) +#endif /* FOLD_LESSTHAN16BYTES */ + +static u32 ATTRIBUTES +ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) +{ + const vec_t multipliers_8v = MULTS_8V; /* 8 vecs */ + const vec_t multipliers_4v = MULTS_4V; /* 4 vecs */ + const vec_t multipliers_2v = MULTS_2V; /* 2 vecs */ + const vec_t multipliers_1v = MULTS_1V; /* 1 vecs */ + const __m128i /* __v2du */ multipliers_128b = + _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); + const __m128i /* __v2du */ final_multiplier = + _mm_set_epi64x(0, CRC32_X63_MODG); + const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); + const __m128i /* __v2du */ barrett_reduction_constants = + _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, + CRC32_BARRETT_CONSTANT_1); + vec_t v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1; + + /* + * There are two overall code paths. 
The first path supports all + * lengths, but is intended for short lengths; it uses unaligned loads + * and does at most 4-way folds. The second path only supports longer + * lengths, aligns the pointer in order to do aligned loads, and does up + * to 8-way folds. The length check below decides which path to take. + */ + if (len < 64*VL) { + if (len < VL) + return crc32_slice1(crc, p, len); + + v0 = VXOR(VLOAD_UNALIGNED(p), + M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + p += VL; + + if (len >= 4*VL) { + v1 = VLOAD_UNALIGNED(p + 0*VL); + v2 = VLOAD_UNALIGNED(p + 1*VL); + v3 = VLOAD_UNALIGNED(p + 2*VL); + p += 3*VL; + while (len >= 8*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_4v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_4v); + v2 = fold_vec(v2, VLOAD_UNALIGNED(p + 2*VL), + multipliers_4v); + v3 = fold_vec(v3, VLOAD_UNALIGNED(p + 3*VL), + multipliers_4v); + p += 4*VL; + len -= 4*VL; + } + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_2v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_2v); + p += 2*VL; + } + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + } + } else { + if (len >= 2*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + if (len >= 3*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + } + } + } + } else { + size_t align = -(uintptr_t)p & (VL-1); + const vec_t *vp; + + /* Align p to the next VL-byte boundary. */ + if (align == 0) { + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + } else { + len -= align; + #if FOLD_LESSTHAN16BYTES + x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); + p += 16; + if (align & 15) { + x0 = fold_lessthan16bytes(x0, p, align & 15, + multipliers_128b); + p += align & 15; + align &= ~15; + } + while (align >= 16) { + x0 = fold_vec128(x0, *(const __m128i *)p, + multipliers_128b); + p += 16; + align -= 16; + } + v0 = M128I_TO_VEC(x0); + # if VL == 32 + v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1); + p += 16; + # elif VL == 64 + v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1); + p += 16; + v0 = _mm512_inserti64x4(v0, *(const __m256i *)p, 1); + p += 32; + # endif + vp = (const vec_t *)p; + #else + crc = crc32_slice1(crc, p, align); + p += align; + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + #endif + } + v1 = *vp++; + v2 = *vp++; + v3 = *vp++; + v4 = *vp++; + v5 = *vp++; + v6 = *vp++; + v7 = *vp++; + do { + v0 = fold_vec(v0, *vp++, multipliers_8v); + v1 = fold_vec(v1, *vp++, multipliers_8v); + v2 = fold_vec(v2, *vp++, multipliers_8v); + v3 = fold_vec(v3, *vp++, multipliers_8v); + v4 = fold_vec(v4, *vp++, multipliers_8v); + v5 = fold_vec(v5, *vp++, multipliers_8v); + v6 = fold_vec(v6, *vp++, multipliers_8v); + v7 = fold_vec(v7, *vp++, multipliers_8v); + len -= 8*VL; + } while (len >= 16*VL); + + /* + * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes) + * and fold in any VL-byte data segments that remain. 
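/*
 * Illustrative sketch, not part of the patch: how the 48-byte shift_tab
 * used by fold_lessthan16bytes() above turns pshufb into a variable byte
 * shift.  Loading the control mask from &shift_tab[len] left-shifts a
 * vector by '16 - len' bytes, and loading it from &shift_tab[len + 16]
 * right-shifts it by 'len' bytes, because control bytes with the high bit
 * set (0xff) zero the corresponding output byte.  Standalone example;
 * build with 'cc -mssse3'.
 */
#include <immintrin.h>
#include <stdio.h>

static const unsigned char shift_tab[48] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};

static void print_vec(const char *label, __m128i v)
{
	unsigned char out[16];
	int i;

	_mm_storeu_si128((__m128i *)out, v);
	printf("%-16s", label);
	for (i = 0; i < 16; i++)
		printf(" %02x", out[i]);
	printf("\n");
}

int main(void)
{
	unsigned char bytes[16];
	size_t len = 5;	/* pretend 5 data bytes remain */
	__m128i x, lshift, rshift;
	int i;

	for (i = 0; i < 16; i++)
		bytes[i] = 0x10 + i;	/* 10 11 12 ... 1f */
	x = _mm_loadu_si128((const __m128i *)bytes);

	lshift = _mm_loadu_si128((const __m128i *)&shift_tab[len]);
	rshift = _mm_loadu_si128((const __m128i *)&shift_tab[len + 16]);

	print_vec("x:", x);
	/* 11 zero bytes, then 10 11 12 13 14 */
	print_vec("x << (16-len):", _mm_shuffle_epi8(x, lshift));
	/* 15 16 ... 1f, then 5 zero bytes */
	print_vec("x >> len:", _mm_shuffle_epi8(x, rshift));
	return 0;
}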
+ */ + v0 = fold_vec(v0, v4, multipliers_4v); + v1 = fold_vec(v1, v5, multipliers_4v); + v2 = fold_vec(v2, v6, multipliers_4v); + v3 = fold_vec(v3, v7, multipliers_4v); + if (len & (4*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_4v); + v1 = fold_vec(v1, *vp++, multipliers_4v); + v2 = fold_vec(v2, *vp++, multipliers_4v); + v3 = fold_vec(v3, *vp++, multipliers_4v); + } + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_2v); + v1 = fold_vec(v1, *vp++, multipliers_2v); + } + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) + v0 = fold_vec(v0, *vp++, multipliers_1v); + p = (const u8 *)vp; + } + + /* + * Reduce v0 (length VL bytes) to x0 (length 16 bytes) + * and fold in any 16-byte data segments that remain. + */ +#if VL == 16 + x0 = v0; +#else + { +# if VL == 32 + __m256i y0 = v0; +# else + const __m256i multipliers_256b = + _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG, + CRC32_X223_MODG, CRC32_X287_MODG); + __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0), + _mm512_extracti64x4_epi64(v0, 1), + multipliers_256b); + if (len & 32) { + y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p), + multipliers_256b); + p += 32; + } +# endif + x0 = fold_vec128(_mm256_extracti128_si256(y0, 0), + _mm256_extracti128_si256(y0, 1), + multipliers_128b); + } + if (len & 16) { + x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p), + multipliers_128b); + p += 16; + } +#endif + len &= 15; + + /* + * If fold_lessthan16bytes() is available, handle any remainder + * of 1 to 15 bytes now, before reducing to 32 bits. + */ +#if FOLD_LESSTHAN16BYTES + if (len) + x0 = fold_lessthan16bytes(x0, p, len, multipliers_128b); +#endif + + /* + * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, + * which is equivalent to multiplying by x^32. This is needed because + * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). + */ + x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), + _mm_clmulepi64_si128(x0, multipliers_128b, 0x10)); + + /* Fold 96 => 64 bits. */ + x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), + _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), + final_multiplier, 0x00)); + + /* + * Reduce 64 => 32 bits using Barrett reduction. + * + * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to + * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): + * + * R(x) = (A(x)*x^32 + B(x)) mod G(x) + * = (A(x)*x^32) mod G(x) + B(x) + * + * Then, by the Division Algorithm there exists a unique q(x) such that: + * + * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) + * + * Since the left-hand side is of maximum degree 31, the right-hand side + * must be too. This implies that we can apply 'mod x^32' to the + * right-hand side without changing its value: + * + * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 + * + * Note that '+' is equivalent to '-' in polynomials over GF(2). + * + * We also know that: + * + * / A(x)*x^32 \ + * q(x) = floor ( --------- ) + * \ G(x) / + * + * To compute this efficiently, we can multiply the top and bottom by + * x^32 and move the division by G(x) to the top: + * + * / A(x) * floor(x^64 / G(x)) \ + * q(x) = floor ( ------------------------- ) + * \ x^32 / + * + * Note that floor(x^64 / G(x)) is a constant. 
+ * + * So finally we have: + * + * / A(x) * floor(x^64 / G(x)) \ + * R(x) = B(x) + G(x)*floor ( ------------------------- ) + * \ x^32 / + */ + x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), + barrett_reduction_constants, 0x00); + x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), + barrett_reduction_constants, 0x10); + x0 = _mm_xor_si128(x0, x1); +#if FOLD_LESSTHAN16BYTES + crc = _mm_extract_epi32(x0, 1); +#else + crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01)); + /* Process up to 15 bytes left over at the end. */ + crc = crc32_slice1(crc, p, len); +#endif + return crc; +} + +#undef vec_t +#undef fold_vec +#undef VLOAD_UNALIGNED +#undef VXOR +#undef M128I_TO_VEC +#undef MULTS +#undef MULTS_8V +#undef MULTS_4V +#undef MULTS_2V +#undef MULTS_1V + +#undef SUFFIX +#undef ATTRIBUTES +#undef VL +#undef FOLD_LESSTHAN16BYTES +#undef USE_TERNARYLOGIC diff --git a/Sources/DEFLATE/x86/decompress_impl.h b/Sources/DEFLATE/x86/decompress_impl.h new file mode 100644 index 00000000..85ca920a --- /dev/null +++ b/Sources/DEFLATE/x86/decompress_impl.h @@ -0,0 +1,57 @@ +#ifndef LIB_X86_DECOMPRESS_IMPL_H +#define LIB_X86_DECOMPRESS_IMPL_H + +#include "x86/cpu_features.h" + +/* + * BMI2 optimized decompression function. + * + * With gcc and clang we just compile the whole function with + * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically. + * + * With MSVC, there is no target function attribute, but it's still possible to + * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a + * case in which we do (see below), so we at least take advantage of that. + * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930). + */ +#if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930) +# define deflate_decompress_bmi2 deflate_decompress_bmi2 +# define FUNCNAME deflate_decompress_bmi2 +# define ATTRIBUTES _target_attribute("bmi2") + /* + * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the + * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic + * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)'; + * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'. + * Nevertheless, their implementation using the bzhi intrinsic is identical, + * as the bzhi instruction truncates the count to 8 bits implicitly. 
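/*
 * Hedged sketch, not part of the patch: what the _bzhi_*() intrinsics
 * referred to above compute.  _bzhi_u64(word, count) clears all bits of
 * 'word' at positions >= count, i.e. it matches 'word & BITMASK(count)',
 * and the hardware only looks at the low 8 bits of 'count'.  Standalone
 * example for a 64-bit target; build with 'cc -mbmi2'.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t word = 0x123456789abcdef0ull;
	unsigned count;

	for (count = 0; count <= 64; count += 16) {
		/* Portable equivalent of masking off the high bits. */
		uint64_t mask = (count >= 64) ? ~0ull : (1ull << count) - 1;

		printf("count=%2u  bzhi=%016llx  mask&=%016llx\n", count,
		       (unsigned long long)_bzhi_u64(word, count),
		       (unsigned long long)(word & mask));
	}
	return 0;
}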
+ */ +# ifndef __clang__ +# ifdef ARCH_X86_64 +# define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) +# else +# define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count)) +# endif +# endif +# include "../decompress_template.h" +#endif + +#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE +#define DEFAULT_IMPL deflate_decompress_bmi2 +#else +static inline decompress_func_t +arch_select_decompress_func(void) +{ +#ifdef deflate_decompress_bmi2 + if (HAVE_BMI2(get_x86_cpu_features())) + return deflate_decompress_bmi2; +#endif + return NULL; +} +#define arch_select_decompress_func arch_select_decompress_func +#endif + +#endif /* LIB_X86_DECOMPRESS_IMPL_H */ \ No newline at end of file diff --git a/Sources/DEFLATE/x86/matchfinder_impl.h b/Sources/DEFLATE/x86/matchfinder_impl.h new file mode 100644 index 00000000..080a7492 --- /dev/null +++ b/Sources/DEFLATE/x86/matchfinder_impl.h @@ -0,0 +1,122 @@ +/* + * x86/matchfinder_impl.h - x86 implementations of matchfinder functions + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
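/*
 * Hedged sketch, not part of the patch: the runtime-dispatch pattern that
 * arch_select_decompress_func() and arch_select_crc32_func() plug into.
 * The first call runs a dispatcher that probes the CPU once, caches the
 * chosen implementation in a function pointer, and every later call goes
 * straight to that implementation.  This standalone version uses
 * GCC/Clang's __builtin_cpu_supports() in place of the library's
 * get_x86_cpu_features() helper.
 */
#include <stdio.h>

typedef int (*impl_func_t)(int x);

static int impl_generic(int x)
{
	return 2 * x;
}

__attribute__((target("bmi2")))
static int impl_bmi2(int x)
{
	return 2 * x;	/* a real implementation would use BMI2 here */
}

static int dispatch(int x);

/* Starts out pointing at the dispatcher, then gets overwritten. */
static impl_func_t chosen_impl = dispatch;

static int dispatch(int x)
{
	impl_func_t f = impl_generic;

	if (__builtin_cpu_supports("bmi2"))
		f = impl_bmi2;
	chosen_impl = f;	/* later calls skip the CPU probe */
	return f(x);
}

int main(void)
{
	printf("%d %d\n", chosen_impl(21), chosen_impl(21));
	return 0;
}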
+ */ + +#ifndef LIB_X86_MATCHFINDER_IMPL_H +#define LIB_X86_MATCHFINDER_IMPL_H + +#include "x86/cpu_features.h" + +#ifdef __AVX2__ +static forceinline void +matchfinder_init_avx2(mf_pos_t *data, size_t size) +{ + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_avx2 + +static forceinline void +matchfinder_rebase_avx2(mf_pos_t *data, size_t size) +{ + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm256_adds_epi16(p[0], v); + p[1] = _mm256_adds_epi16(p[1], v); + p[2] = _mm256_adds_epi16(p[2], v); + p[3] = _mm256_adds_epi16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_avx2 + +#elif HAVE_SSE2_NATIVE +static forceinline void +matchfinder_init_sse2(mf_pos_t *data, size_t size) +{ + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_sse2 + +static forceinline void +matchfinder_rebase_sse2(mf_pos_t *data, size_t size) +{ + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm_adds_epi16(p[0], v); + p[1] = _mm_adds_epi16(p[1], v); + p[2] = _mm_adds_epi16(p[2], v); + p[3] = _mm_adds_epi16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_sse2 +#endif /* HAVE_SSE2_NATIVE */ + +#endif /* LIB_X86_MATCHFINDER_IMPL_H */ \ No newline at end of file diff --git a/Sources/DEFLATE/zlib_compress.c b/Sources/DEFLATE/zlib_compress.c index 12d43602..ecf38d8b 100644 --- a/Sources/DEFLATE/zlib_compress.c +++ b/Sources/DEFLATE/zlib_compress.c @@ -30,53 +30,53 @@ LIBDEFLATEAPI size_t libdeflate_zlib_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) { - u8 *out_next = out; - u16 hdr; - unsigned compression_level; - unsigned level_hint; - size_t deflate_size; - - if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) - return 0; - - /* 2 byte header: CMF and FLG */ - hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); - compression_level = libdeflate_get_compression_level(c); - if (compression_level < 2) - level_hint = ZLIB_FASTEST_COMPRESSION; - else if (compression_level < 6) - level_hint = ZLIB_FAST_COMPRESSION; - else if 
(compression_level < 8) - level_hint = ZLIB_DEFAULT_COMPRESSION; - else - level_hint = ZLIB_SLOWEST_COMPRESSION; - hdr |= level_hint << 6; - hdr |= 31 - (hdr % 31); - - put_unaligned_be16(hdr, out_next); - out_next += 2; - - /* Compressed data */ - deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, - out_nbytes_avail - ZLIB_MIN_OVERHEAD); - if (deflate_size == 0) - return 0; - out_next += deflate_size; - - /* ADLER32 */ - put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); - out_next += 4; - - return out_next - (u8 *)out; + u8 *out_next = out; + u16 hdr; + unsigned compression_level; + unsigned level_hint; + size_t deflate_size; + + if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) + return 0; + + /* 2 byte header: CMF and FLG */ + hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); + compression_level = libdeflate_get_compression_level(c); + if (compression_level < 2) + level_hint = ZLIB_FASTEST_COMPRESSION; + else if (compression_level < 6) + level_hint = ZLIB_FAST_COMPRESSION; + else if (compression_level < 8) + level_hint = ZLIB_DEFAULT_COMPRESSION; + else + level_hint = ZLIB_SLOWEST_COMPRESSION; + hdr |= level_hint << 6; + hdr |= 31 - (hdr % 31); + + put_unaligned_be16(hdr, out_next); + out_next += 2; + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - ZLIB_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* ADLER32 */ + put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); + out_next += 4; + + return out_next - (u8 *)out; } LIBDEFLATEAPI size_t libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) + size_t in_nbytes) { - return ZLIB_MIN_OVERHEAD + - libdeflate_deflate_compress_bound(c, in_nbytes); + return ZLIB_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); } diff --git a/Sources/DEFLATE/zlib_constants.h b/Sources/DEFLATE/zlib_constants.h index f304310c..7b6b42a1 100644 --- a/Sources/DEFLATE/zlib_constants.h +++ b/Sources/DEFLATE/zlib_constants.h @@ -5,17 +5,17 @@ #ifndef LIB_ZLIB_CONSTANTS_H #define LIB_ZLIB_CONSTANTS_H -#define ZLIB_MIN_HEADER_SIZE 2 -#define ZLIB_FOOTER_SIZE 4 -#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) +#define ZLIB_MIN_HEADER_SIZE 2 +#define ZLIB_FOOTER_SIZE 4 +#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) -#define ZLIB_CM_DEFLATE 8 +#define ZLIB_CM_DEFLATE 8 -#define ZLIB_CINFO_32K_WINDOW 7 +#define ZLIB_CINFO_32K_WINDOW 7 -#define ZLIB_FASTEST_COMPRESSION 0 -#define ZLIB_FAST_COMPRESSION 1 -#define ZLIB_DEFAULT_COMPRESSION 2 -#define ZLIB_SLOWEST_COMPRESSION 3 +#define ZLIB_FASTEST_COMPRESSION 0 +#define ZLIB_FAST_COMPRESSION 1 +#define ZLIB_DEFAULT_COMPRESSION 2 +#define ZLIB_SLOWEST_COMPRESSION 3 #endif /* LIB_ZLIB_CONSTANTS_H */ diff --git a/Sources/DEFLATE/zlib_decompress.c b/Sources/DEFLATE/zlib_decompress.c index f5e43eae..526f2706 100644 --- a/Sources/DEFLATE/zlib_decompress.c +++ b/Sources/DEFLATE/zlib_decompress.c @@ -30,75 +30,75 @@ LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) { - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - u16 hdr; - size_t 
actual_in_nbytes; - size_t actual_out_nbytes; - enum libdeflate_result result; - - if (in_nbytes < ZLIB_MIN_OVERHEAD) - return LIBDEFLATE_BAD_DATA; - - /* 2 byte header: CMF and FLG */ - hdr = get_unaligned_be16(in_next); - in_next += 2; - - /* FCHECK */ - if ((hdr % 31) != 0) - return LIBDEFLATE_BAD_DATA; - - /* CM */ - if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) - return LIBDEFLATE_BAD_DATA; - - /* CINFO */ - if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) - return LIBDEFLATE_BAD_DATA; - - /* FDICT */ - if ((hdr >> 5) & 1) - return LIBDEFLATE_BAD_DATA; - - /* Compressed data */ - result = libdeflate_deflate_decompress_ex(d, in_next, - in_end - ZLIB_FOOTER_SIZE - in_next, - out, out_nbytes_avail, - &actual_in_nbytes, actual_out_nbytes_ret); - if (result != LIBDEFLATE_SUCCESS) - return result; - - if (actual_out_nbytes_ret) - actual_out_nbytes = *actual_out_nbytes_ret; - else - actual_out_nbytes = out_nbytes_avail; - - in_next += actual_in_nbytes; - - /* ADLER32 */ - if (libdeflate_adler32(1, out, actual_out_nbytes) != - get_unaligned_be32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - if (actual_in_nbytes_ret) - *actual_in_nbytes_ret = in_next - (u8 *)in; - - return LIBDEFLATE_SUCCESS; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u16 hdr; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < ZLIB_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* 2 byte header: CMF and FLG */ + hdr = get_unaligned_be16(in_next); + in_next += 2; + + /* FCHECK */ + if ((hdr % 31) != 0) + return LIBDEFLATE_BAD_DATA; + + /* CM */ + if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + + /* CINFO */ + if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) + return LIBDEFLATE_BAD_DATA; + + /* FDICT */ + if ((hdr >> 5) & 1) + return LIBDEFLATE_BAD_DATA; + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - ZLIB_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* ADLER32 */ + if (libdeflate_adler32(1, out, actual_out_nbytes) != + get_unaligned_be32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; } LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) { - return libdeflate_zlib_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); + return libdeflate_zlib_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); }
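/*
 * Hedged sketch, not part of the patch: how the 2-byte zlib header built
 * by libdeflate_zlib_compress() and checked by
 * libdeflate_zlib_decompress_ex() fits together.  CMF carries the
 * compression method (8 = DEFLATE) and window size code (7 = 32 KiB);
 * FLG carries the level hint plus an FCHECK value chosen so the 16-bit
 * header is a multiple of 31.  Standalone example.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned ZLIB_CM_DEFLATE = 8;
	const unsigned ZLIB_CINFO_32K_WINDOW = 7;
	unsigned level_hint;

	for (level_hint = 0; level_hint <= 3; level_hint++) {
		uint16_t hdr = (ZLIB_CM_DEFLATE << 8) |
			       (ZLIB_CINFO_32K_WINDOW << 12);

		hdr |= level_hint << 6;		/* FLEVEL */
		hdr |= 31 - (hdr % 31);		/* FCHECK */

		/* Prints the familiar 78 01 / 78 5e / 78 9c / 78 da pairs. */
		printf("level_hint=%u  CMF=%02x FLG=%02x  hdr %% 31 = %u\n",
		       level_hint, (unsigned)(hdr >> 8),
		       (unsigned)(hdr & 0xff), (unsigned)(hdr % 31));
	}
	return 0;
}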