diff --git a/Package.swift b/Package.swift index 2cfe1e2f..9893420c 100644 --- a/Package.swift +++ b/Package.swift @@ -223,6 +223,16 @@ let package = Package( .target( name: "DEFLATE", + exclude: [ + // better cross-platform compatibility if we remove gzip + // support for now, gzip builds fine for macOS and even + // on iOS -- it is only when a user chooses to build an + // archive using one of the generic devices, that makes + // building gzip more complicated. + "crc32.c", + "gzip_compress.c", + "gzip_decompress.c", + ], publicHeadersPath: "include", cxxSettings: [ .headerSearchPath("."), diff --git a/Sources/DEFLATE/adler32.c b/Sources/DEFLATE/adler32.c index 3aaa7efb..c6085c49 100644 --- a/Sources/DEFLATE/adler32.c +++ b/Sources/DEFLATE/adler32.c @@ -35,55 +35,87 @@ * of s2 overflowing when it is represented as an unsigned 32-bit integer. This * value was computed using the following Python script: * - * divisor = 65521 - * count = 0 - * s1 = divisor - 1 - * s2 = divisor - 1 - * while True: - * s1 += 0xFF - * s2 += s1 - * if s2 > 0xFFFFFFFF: - * break - * count += 1 - * print(count) + * divisor = 65521 + * count = 0 + * s1 = divisor - 1 + * s2 = divisor - 1 + * while True: + * s1 += 0xFF + * s2 += s1 + * if s2 > 0xFFFFFFFF: + * break + * count += 1 + * print(count) * * Note that to get the correct worst-case value, we must assume that every byte * has value 0xFF and that s1 and s2 started with the highest possible values * modulo the divisor. */ -#define MAX_CHUNK_LEN 5552 +#define MAX_CHUNK_LEN 5552 + +/* + * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n, + * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither + * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes + * already processed after the last reduction must not exceed MAX_CHUNK_LEN. + * + * This uses only portable C code. This is used as a fallback when a vectorized + * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform. + * + * Some of the vectorized implementations also use this to handle the end of the + * data when the data isn't evenly divisible by the length the vectorized code + * works on. To avoid compiler errors about target-specific option mismatches + * when this is used in that way, this is a macro rather than a function. + * + * Although this is unvectorized, this does include an optimization where the + * main loop processes four bytes at a time using a strategy similar to that + * used by vectorized implementations. This provides increased instruction- + * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'. 
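+ *
+ * As a sketch of why the closing multiply-accumulate is equivalent to the
+ * traditional byte-at-a-time recurrence: one unrolled iteration that starts
+ * from (s1, s2) and reads bytes b0..b3 computes
+ *
+ *     s1' = s1 + b0 + b1 + b2 + b3
+ *     s2' = s2 + 4*s1 + 4*b0 + 3*b1 + 2*b2 + 1*b3
+ *
+ * so summing the contributions of all iterations gives
+ * s2 += 4*(s1_sum + byte_0_sum) + 3*byte_1_sum + 2*byte_2_sum + byte_3_sum,
+ * which is exactly the update the macro performs after its main loop.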
+ */ +#define ADLER32_CHUNK(s1, s2, p, n) \ +do { \ +if (n >= 4) { \ +u32 s1_sum = 0; \ +u32 byte_0_sum = 0; \ +u32 byte_1_sum = 0; \ +u32 byte_2_sum = 0; \ +u32 byte_3_sum = 0; \ +\ +do { \ +s1_sum += s1; \ +s1 += p[0] + p[1] + p[2] + p[3]; \ +byte_0_sum += p[0]; \ +byte_1_sum += p[1]; \ +byte_2_sum += p[2]; \ +byte_3_sum += p[3]; \ +p += 4; \ +n -= 4; \ +} while (n >= 4); \ +s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \ +(2 * byte_2_sum) + byte_3_sum; \ +} \ +for (; n; n--, p++) { \ +s1 += *p; \ +s2 += s1; \ +} \ +s1 %= DIVISOR; \ +s2 %= DIVISOR; \ +} while (0) static u32 MAYBE_UNUSED adler32_generic(u32 adler, const u8 *p, size_t len) { - u32 s1 = adler & 0xFFFF; - u32 s2 = adler >> 16; - const u8 * const end = p + len; - - while (p != end) { - size_t chunk_len = MIN(end - p, MAX_CHUNK_LEN); - const u8 *chunk_end = p + chunk_len; - size_t num_unrolled_iterations = chunk_len / 4; - - while (num_unrolled_iterations--) { - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - s1 += *p++; - s2 += s1; - } - while (p != chunk_end) { - s1 += *p++; - s2 += s1; - } - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - return (s2 << 16) | s1; + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + while (len) { + size_t n = MIN(len, MAX_CHUNK_LEN & ~3); + + len -= n; + ADLER32_CHUNK(s1, s2, p, n); + } + + return (s2 << 16) | s1; } /* Include architecture-specific implementation(s) if available. */ @@ -91,7 +123,7 @@ adler32_generic(u32 adler, const u8 *p, size_t len) #undef arch_select_adler32_func typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len); #if defined(ARCH_ARM32) || defined(ARCH_ARM64) -# include "adler32_impl.h" +# include "arm/adler32_impl.h" #elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/adler32_impl.h" #endif @@ -108,13 +140,13 @@ static volatile adler32_func_t adler32_impl = dispatch_adler32; /* Choose the best implementation at runtime. */ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) { - adler32_func_t f = arch_select_adler32_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - adler32_impl = f; - return f(adler, p, len); + adler32_func_t f = arch_select_adler32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + adler32_impl = f; + return f(adler, p, len); } #else /* The best implementation is statically known, so call it directly. */ @@ -124,7 +156,7 @@ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len) LIBDEFLATEAPI u32 libdeflate_adler32(u32 adler, const void *buffer, size_t len) { - if (buffer == NULL) /* Return initial value. */ - return 1; - return adler32_impl(adler, buffer, len); + if (buffer == NULL) /* Return initial value. 
*/ + return 1; + return adler32_impl(adler, buffer, len); } diff --git a/Sources/DEFLATE/adler32_impl.h b/Sources/DEFLATE/adler32_impl.h deleted file mode 100644 index 865547b8..00000000 --- a/Sources/DEFLATE/adler32_impl.h +++ /dev/null @@ -1,272 +0,0 @@ -/* - * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef LIB_ARM_ADLER32_IMPL_H -#define LIB_ARM_ADLER32_IMPL_H - -#include "cpu_features.h" - -/* Regular NEON implementation */ -#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() -# define adler32_neon adler32_neon -# define FUNCNAME adler32_neon -# define FUNCNAME_CHUNK adler32_neon_chunk -# define IMPL_ALIGNMENT 16 -# define IMPL_SEGMENT_LEN 64 -/* Prevent unsigned overflow of the 16-bit precision byte counters */ -# define IMPL_MAX_CHUNK_LEN (64 * (0xFFFF / 0xFF)) -# if HAVE_NEON_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("fpu=neon") -# else -# define ATTRIBUTES _target_attribute("+simd") -# endif -# endif -# include -static forceinline ATTRIBUTES void -adler32_neon_chunk(const uint8x16_t *p, const uint8x16_t * const end, - u32 *s1, u32 *s2) -{ - static const u16 _aligned_attribute(16) mults[64] = { - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const uint16x8_t mults_a = vld1q_u16(&mults[0]); - const uint16x8_t mults_b = vld1q_u16(&mults[8]); - const uint16x8_t mults_c = vld1q_u16(&mults[16]); - const uint16x8_t mults_d = vld1q_u16(&mults[24]); - const uint16x8_t mults_e = vld1q_u16(&mults[32]); - const uint16x8_t mults_f = vld1q_u16(&mults[40]); - const uint16x8_t mults_g = vld1q_u16(&mults[48]); - const uint16x8_t mults_h = vld1q_u16(&mults[56]); - - uint32x4_t v_s1 = vdupq_n_u32(0); - uint32x4_t v_s2 = vdupq_n_u32(0); - /* - * v_byte_sums_* contain the sum of the bytes at index i across all - * 64-byte segments, for each index 0..63. 
- */ - uint16x8_t v_byte_sums_a = vdupq_n_u16(0); - uint16x8_t v_byte_sums_b = vdupq_n_u16(0); - uint16x8_t v_byte_sums_c = vdupq_n_u16(0); - uint16x8_t v_byte_sums_d = vdupq_n_u16(0); - uint16x8_t v_byte_sums_e = vdupq_n_u16(0); - uint16x8_t v_byte_sums_f = vdupq_n_u16(0); - uint16x8_t v_byte_sums_g = vdupq_n_u16(0); - uint16x8_t v_byte_sums_h = vdupq_n_u16(0); - - do { - /* Load the next 64 bytes. */ - const uint8x16_t bytes1 = *p++; - const uint8x16_t bytes2 = *p++; - const uint8x16_t bytes3 = *p++; - const uint8x16_t bytes4 = *p++; - uint16x8_t tmp; - - /* - * Accumulate the previous s1 counters into the s2 counters. - * The needed multiplication by 64 is delayed to later. - */ - v_s2 = vaddq_u32(v_s2, v_s1); - - /* - * Add the 64 bytes to their corresponding v_byte_sums counters, - * while also accumulating the sums of each adjacent set of 4 - * bytes into v_s1. - */ - tmp = vpaddlq_u8(bytes1); - v_byte_sums_a = vaddw_u8(v_byte_sums_a, vget_low_u8(bytes1)); - v_byte_sums_b = vaddw_u8(v_byte_sums_b, vget_high_u8(bytes1)); - tmp = vpadalq_u8(tmp, bytes2); - v_byte_sums_c = vaddw_u8(v_byte_sums_c, vget_low_u8(bytes2)); - v_byte_sums_d = vaddw_u8(v_byte_sums_d, vget_high_u8(bytes2)); - tmp = vpadalq_u8(tmp, bytes3); - v_byte_sums_e = vaddw_u8(v_byte_sums_e, vget_low_u8(bytes3)); - v_byte_sums_f = vaddw_u8(v_byte_sums_f, vget_high_u8(bytes3)); - tmp = vpadalq_u8(tmp, bytes4); - v_byte_sums_g = vaddw_u8(v_byte_sums_g, vget_low_u8(bytes4)); - v_byte_sums_h = vaddw_u8(v_byte_sums_h, vget_high_u8(bytes4)); - v_s1 = vpadalq_u16(v_s1, tmp); - - } while (p != end); - - /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */ -#ifdef ARCH_ARM32 -# define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) -#else -# define umlal2 vmlal_high_u16 -#endif - v_s2 = vqshlq_n_u32(v_s2, 6); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), vget_low_u16(mults_a)); - v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), vget_low_u16(mults_b)); - v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), vget_low_u16(mults_c)); - v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), vget_low_u16(mults_d)); - v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), vget_low_u16(mults_e)); - v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), vget_low_u16(mults_f)); - v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), vget_low_u16(mults_g)); - v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g); - v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), vget_low_u16(mults_h)); - v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h); -#undef umlal2 - - /* Horizontal sum to finish up */ -#ifdef ARCH_ARM32 - *s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + - vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); - *s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + - vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); -#else - *s1 += vaddvq_u32(v_s1); - *s2 += vaddvq_u32(v_s2); -#endif -} -# include "adler32_vec_template.h" -#endif /* Regular NEON implementation */ - -/* NEON+dotprod implementation */ -#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() -# define adler32_neon_dotprod adler32_neon_dotprod -# define FUNCNAME adler32_neon_dotprod -# define FUNCNAME_CHUNK adler32_neon_dotprod_chunk -# define IMPL_ALIGNMENT 16 -# define 
IMPL_SEGMENT_LEN 64 -# define IMPL_MAX_CHUNK_LEN MAX_CHUNK_LEN -# if HAVE_DOTPROD_NATIVE -# define ATTRIBUTES -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("dotprod") - /* - * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the - * default target is armv8.3-a or later in which case it must be omitted. - * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. - */ -# elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES _target_attribute("+dotprod") -# else -# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") -# endif -# endif -# include -static forceinline ATTRIBUTES void -adler32_neon_dotprod_chunk(const uint8x16_t *p, const uint8x16_t * const end, - u32 *s1, u32 *s2) -{ - static const u8 _aligned_attribute(16) mults[64] = { - 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, - 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, - 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, - 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, - }; - const uint8x16_t mults_a = vld1q_u8(&mults[0]); - const uint8x16_t mults_b = vld1q_u8(&mults[16]); - const uint8x16_t mults_c = vld1q_u8(&mults[32]); - const uint8x16_t mults_d = vld1q_u8(&mults[48]); - const uint8x16_t ones = vdupq_n_u8(1); - uint32x4_t v_s1_a = vdupq_n_u32(0); - uint32x4_t v_s1_b = vdupq_n_u32(0); - uint32x4_t v_s1_c = vdupq_n_u32(0); - uint32x4_t v_s1_d = vdupq_n_u32(0); - uint32x4_t v_s2_a = vdupq_n_u32(0); - uint32x4_t v_s2_b = vdupq_n_u32(0); - uint32x4_t v_s2_c = vdupq_n_u32(0); - uint32x4_t v_s2_d = vdupq_n_u32(0); - uint32x4_t v_s1_sums_a = vdupq_n_u32(0); - uint32x4_t v_s1_sums_b = vdupq_n_u32(0); - uint32x4_t v_s1_sums_c = vdupq_n_u32(0); - uint32x4_t v_s1_sums_d = vdupq_n_u32(0); - uint32x4_t v_s1; - uint32x4_t v_s2; - uint32x4_t v_s1_sums; - - do { - uint8x16_t bytes_a = *p++; - uint8x16_t bytes_b = *p++; - uint8x16_t bytes_c = *p++; - uint8x16_t bytes_d = *p++; - - v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); - v_s1_a = vdotq_u32(v_s1_a, bytes_a, ones); - v_s2_a = vdotq_u32(v_s2_a, bytes_a, mults_a); - - v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); - v_s1_b = vdotq_u32(v_s1_b, bytes_b, ones); - v_s2_b = vdotq_u32(v_s2_b, bytes_b, mults_b); - - v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); - v_s1_c = vdotq_u32(v_s1_c, bytes_c, ones); - v_s2_c = vdotq_u32(v_s2_c, bytes_c, mults_c); - - v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); - v_s1_d = vdotq_u32(v_s1_d, bytes_d, ones); - v_s2_d = vdotq_u32(v_s2_d, bytes_d, mults_d); - } while (p != end); - - v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), vaddq_u32(v_s1_c, v_s1_d)); - v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), vaddq_u32(v_s2_c, v_s2_d)); - v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, v_s1_sums_b), - vaddq_u32(v_s1_sums_c, v_s1_sums_d)); - v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); - - *s1 += vaddvq_u32(v_s1); - *s2 += vaddvq_u32(v_s2); -} -# include "adler32_vec_template.h" -#endif /* NEON+dotprod implementation */ - -#if defined(adler32_neon_dotprod) && HAVE_DOTPROD_NATIVE -#define DEFAULT_IMPL adler32_neon_dotprod -#else -static inline adler32_func_t -arch_select_adler32_func(void) -{ - const u32 features MAYBE_UNUSED = get_arm_cpu_features(); - -#ifdef adler32_neon_dotprod - if (HAVE_NEON(features) && HAVE_DOTPROD(features)) - return adler32_neon_dotprod; -#endif -#ifdef adler32_neon - if (HAVE_NEON(features)) - return adler32_neon; -#endif - return NULL; -} -#define arch_select_adler32_func arch_select_adler32_func -#endif - -#endif /* 
LIB_ARM_ADLER32_IMPL_H */ diff --git a/Sources/DEFLATE/adler32_vec_template.h b/Sources/DEFLATE/adler32_vec_template.h deleted file mode 100644 index 98c086bb..00000000 --- a/Sources/DEFLATE/adler32_vec_template.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * adler32_vec_template.h - template for vectorized Adler-32 implementations - * - * Copyright 2016 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This file contains a template for vectorized Adler-32 implementations. - * - * The inner loop between reductions modulo 65521 of an unvectorized Adler-32 - * implementation looks something like this: - * - * do { - * s1 += *p; - * s2 += s1; - * } while (++p != chunk_end); - * - * For vectorized calculation of s1, we only need to sum the input bytes. They - * can be accumulated into multiple counters which are eventually summed - * together. - * - * For vectorized calculation of s2, the basic idea is that for each iteration - * that processes N bytes, we can perform the following vectorizable - * calculation: - * - * s2 += N*byte_1 + (N-1)*byte_2 + (N-2)*byte_3 + ... + 1*byte_N - * - * Or, equivalently, we can sum the byte_1...byte_N for each iteration into N - * separate counters, then do the multiplications by N...1 just once at the end - * rather than once per iteration. - * - * Also, we must account for how previous bytes will affect s2 by doing the - * following at beginning of each iteration: - * - * s2 += s1 * N - * - * Furthermore, like s1, "s2" can actually be multiple counters which are - * eventually summed together. - */ - -static u32 ATTRIBUTES MAYBE_UNUSED -FUNCNAME(u32 adler, const u8 *p, size_t len) -{ - const size_t max_chunk_len = - MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) - - (MIN(MAX_CHUNK_LEN, IMPL_MAX_CHUNK_LEN) % IMPL_SEGMENT_LEN); - u32 s1 = adler & 0xFFFF; - u32 s2 = adler >> 16; - const u8 * const end = p + len; - const u8 *vend; - - /* Process a byte at a time until the needed alignment is reached. */ - if (p != end && (uintptr_t)p % IMPL_ALIGNMENT) { - do { - s1 += *p++; - s2 += s1; - } while (p != end && (uintptr_t)p % IMPL_ALIGNMENT); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* - * Process "chunks" of bytes using vector instructions. Chunk lengths - * are limited to MAX_CHUNK_LEN, which guarantees that s1 and s2 never - * overflow before being reduced modulo DIVISOR. 
For vector processing, - * chunk lengths are also made evenly divisible by IMPL_SEGMENT_LEN and - * may be further limited to IMPL_MAX_CHUNK_LEN. - */ - STATIC_ASSERT(IMPL_SEGMENT_LEN % IMPL_ALIGNMENT == 0); - vend = end - ((size_t)(end - p) % IMPL_SEGMENT_LEN); - while (p != vend) { - size_t chunk_len = MIN((size_t)(vend - p), max_chunk_len); - - s2 += s1 * chunk_len; - - FUNCNAME_CHUNK((const void *)p, (const void *)(p + chunk_len), - &s1, &s2); - - p += chunk_len; - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - /* Process any remaining bytes. */ - if (p != end) { - do { - s1 += *p++; - s2 += s1; - } while (p != end); - s1 %= DIVISOR; - s2 %= DIVISOR; - } - - return (s2 << 16) | s1; -} - -#undef FUNCNAME -#undef FUNCNAME_CHUNK -#undef ATTRIBUTES -#undef IMPL_ALIGNMENT -#undef IMPL_SEGMENT_LEN -#undef IMPL_MAX_CHUNK_LEN diff --git a/Sources/DEFLATE/arm/adler32_impl.h b/Sources/DEFLATE/arm/adler32_impl.h new file mode 100644 index 00000000..c8892d47 --- /dev/null +++ b/Sources/DEFLATE/arm/adler32_impl.h @@ -0,0 +1,358 @@ +/* + * arm/adler32_impl.h - ARM implementations of Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIB_ARM_ADLER32_IMPL_H +#define LIB_ARM_ADLER32_IMPL_H + +#include "cpu_features.h" + +/* Regular NEON implementation */ +#if HAVE_NEON_INTRIN && CPU_IS_LITTLE_ENDIAN() +# define adler32_arm_neon adler32_arm_neon +# if HAVE_NEON_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=neon") +# else +# define ATTRIBUTES _target_attribute("+simd") +# endif +# endif +# include +static u32 ATTRIBUTES MAYBE_UNUSED +adler32_arm_neon(u32 adler, const u8 *p, size_t len) +{ + static const u16 _aligned_attribute(16) mults[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const uint16x8_t mults_a = vld1q_u16(&mults[0]); + const uint16x8_t mults_b = vld1q_u16(&mults[8]); + const uint16x8_t mults_c = vld1q_u16(&mults[16]); + const uint16x8_t mults_d = vld1q_u16(&mults[24]); + const uint16x8_t mults_e = vld1q_u16(&mults[32]); + const uint16x8_t mults_f = vld1q_u16(&mults[40]); + const uint16x8_t mults_g = vld1q_u16(&mults[48]); + const uint16x8_t mults_h = vld1q_u16(&mults[56]); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the misaligned load penalty. + */ + if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & 15); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~63); + + len -= n; + + if (n >= 64) { + uint32x4_t v_s1 = vdupq_n_u32(0); + uint32x4_t v_s2 = vdupq_n_u32(0); + /* + * v_byte_sums_* contain the sum of the bytes at index i + * across all 64-byte segments, for each index 0..63. + */ + uint16x8_t v_byte_sums_a = vdupq_n_u16(0); + uint16x8_t v_byte_sums_b = vdupq_n_u16(0); + uint16x8_t v_byte_sums_c = vdupq_n_u16(0); + uint16x8_t v_byte_sums_d = vdupq_n_u16(0); + uint16x8_t v_byte_sums_e = vdupq_n_u16(0); + uint16x8_t v_byte_sums_f = vdupq_n_u16(0); + uint16x8_t v_byte_sums_g = vdupq_n_u16(0); + uint16x8_t v_byte_sums_h = vdupq_n_u16(0); + + s2 += s1 * (n & ~63); + + do { + /* Load the next 64 data bytes. */ + const uint8x16_t data_a = vld1q_u8(p + 0); + const uint8x16_t data_b = vld1q_u8(p + 16); + const uint8x16_t data_c = vld1q_u8(p + 32); + const uint8x16_t data_d = vld1q_u8(p + 48); + uint16x8_t tmp; + + /* + * Accumulate the previous s1 counters into the + * s2 counters. The needed multiplication by 64 + * is delayed to later. + */ + v_s2 = vaddq_u32(v_s2, v_s1); + + /* + * Add the 64 data bytes to their v_byte_sums + * counters, while also accumulating the sums of + * each adjacent set of 4 bytes into v_s1. 
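+ *
+ * (vpaddlq_u8 pairwise-adds adjacent u8 lanes into u16 lanes;
+ * vpadalq_u8 does the same but also accumulates into its first
+ * operand; the final vpadalq_u16 then folds those u16 pair sums
+ * into the u32 lanes of v_s1.)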
+ */ + tmp = vpaddlq_u8(data_a); + v_byte_sums_a = vaddw_u8(v_byte_sums_a, + vget_low_u8(data_a)); + v_byte_sums_b = vaddw_u8(v_byte_sums_b, + vget_high_u8(data_a)); + tmp = vpadalq_u8(tmp, data_b); + v_byte_sums_c = vaddw_u8(v_byte_sums_c, + vget_low_u8(data_b)); + v_byte_sums_d = vaddw_u8(v_byte_sums_d, + vget_high_u8(data_b)); + tmp = vpadalq_u8(tmp, data_c); + v_byte_sums_e = vaddw_u8(v_byte_sums_e, + vget_low_u8(data_c)); + v_byte_sums_f = vaddw_u8(v_byte_sums_f, + vget_high_u8(data_c)); + tmp = vpadalq_u8(tmp, data_d); + v_byte_sums_g = vaddw_u8(v_byte_sums_g, + vget_low_u8(data_d)); + v_byte_sums_h = vaddw_u8(v_byte_sums_h, + vget_high_u8(data_d)); + v_s1 = vpadalq_u16(v_s1, tmp); + + p += 64; + n -= 64; + } while (n >= 64); + + /* s2 = 64*s2 + (64*bytesum0 + 63*bytesum1 + ... + 1*bytesum63) */ +#ifdef ARCH_ARM32 +# define umlal2(a, b, c) vmlal_u16((a), vget_high_u16(b), vget_high_u16(c)) +#else +# define umlal2 vmlal_high_u16 +#endif + v_s2 = vqshlq_n_u32(v_s2, 6); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_a), + vget_low_u16(mults_a)); + v_s2 = umlal2(v_s2, v_byte_sums_a, mults_a); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_b), + vget_low_u16(mults_b)); + v_s2 = umlal2(v_s2, v_byte_sums_b, mults_b); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_c), + vget_low_u16(mults_c)); + v_s2 = umlal2(v_s2, v_byte_sums_c, mults_c); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_d), + vget_low_u16(mults_d)); + v_s2 = umlal2(v_s2, v_byte_sums_d, mults_d); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_e), + vget_low_u16(mults_e)); + v_s2 = umlal2(v_s2, v_byte_sums_e, mults_e); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_f), + vget_low_u16(mults_f)); + v_s2 = umlal2(v_s2, v_byte_sums_f, mults_f); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_g), + vget_low_u16(mults_g)); + v_s2 = umlal2(v_s2, v_byte_sums_g, mults_g); + v_s2 = vmlal_u16(v_s2, vget_low_u16(v_byte_sums_h), + vget_low_u16(mults_h)); + v_s2 = umlal2(v_s2, v_byte_sums_h, mults_h); +#undef umlal2 + + /* Horizontal sum to finish up */ +#ifdef ARCH_ARM32 + s1 += vgetq_lane_u32(v_s1, 0) + vgetq_lane_u32(v_s1, 1) + + vgetq_lane_u32(v_s1, 2) + vgetq_lane_u32(v_s1, 3); + s2 += vgetq_lane_u32(v_s2, 0) + vgetq_lane_u32(v_s2, 1) + + vgetq_lane_u32(v_s2, 2) + vgetq_lane_u32(v_s2, 3); +#else + s1 += vaddvq_u32(v_s1); + s2 += vaddvq_u32(v_s2); +#endif + } + /* + * Process the last 0 <= n < 64 bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. + */ + ADLER32_CHUNK(s1, s2, p, n); + } + return (s2 << 16) | s1; +} +#undef ATTRIBUTES +#endif /* Regular NEON implementation */ + +/* NEON+dotprod implementation */ +#if HAVE_DOTPROD_INTRIN && CPU_IS_LITTLE_ENDIAN() +# define adler32_arm_neon_dotprod adler32_arm_neon_dotprod +# if HAVE_DOTPROD_NATIVE +# define ATTRIBUTES +# else +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("dotprod") +/* + * With gcc, arch=armv8.2-a is needed for dotprod intrinsics, unless the + * default target is armv8.3-a or later in which case it must be omitted. + * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. 
+ */ +# elif defined(__ARM_FEATURE_JCVT) +# define ATTRIBUTES _target_attribute("+dotprod") +# else +# define ATTRIBUTES _target_attribute("arch=armv8.2-a+dotprod") +# endif +# endif +# include +static u32 ATTRIBUTES +adler32_arm_neon_dotprod(u32 adler, const u8 *p, size_t len) +{ + static const u8 _aligned_attribute(16) mults[64] = { + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const uint8x16_t mults_a = vld1q_u8(&mults[0]); + const uint8x16_t mults_b = vld1q_u8(&mults[16]); + const uint8x16_t mults_c = vld1q_u8(&mults[32]); + const uint8x16_t mults_d = vld1q_u8(&mults[48]); + const uint8x16_t ones = vdupq_n_u8(1); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the misaligned load penalty. + */ + if (unlikely(len > 32768 && ((uintptr_t)p & 15))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & 15); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~63); + + len -= n; + + if (n >= 64) { + uint32x4_t v_s1_a = vdupq_n_u32(0); + uint32x4_t v_s1_b = vdupq_n_u32(0); + uint32x4_t v_s1_c = vdupq_n_u32(0); + uint32x4_t v_s1_d = vdupq_n_u32(0); + uint32x4_t v_s2_a = vdupq_n_u32(0); + uint32x4_t v_s2_b = vdupq_n_u32(0); + uint32x4_t v_s2_c = vdupq_n_u32(0); + uint32x4_t v_s2_d = vdupq_n_u32(0); + uint32x4_t v_s1_sums_a = vdupq_n_u32(0); + uint32x4_t v_s1_sums_b = vdupq_n_u32(0); + uint32x4_t v_s1_sums_c = vdupq_n_u32(0); + uint32x4_t v_s1_sums_d = vdupq_n_u32(0); + uint32x4_t v_s1; + uint32x4_t v_s2; + uint32x4_t v_s1_sums; + + s2 += s1 * (n & ~63); + + do { + uint8x16_t data_a = vld1q_u8(p + 0); + uint8x16_t data_b = vld1q_u8(p + 16); + uint8x16_t data_c = vld1q_u8(p + 32); + uint8x16_t data_d = vld1q_u8(p + 48); + + v_s1_sums_a = vaddq_u32(v_s1_sums_a, v_s1_a); + v_s1_a = vdotq_u32(v_s1_a, data_a, ones); + v_s2_a = vdotq_u32(v_s2_a, data_a, mults_a); + + v_s1_sums_b = vaddq_u32(v_s1_sums_b, v_s1_b); + v_s1_b = vdotq_u32(v_s1_b, data_b, ones); + v_s2_b = vdotq_u32(v_s2_b, data_b, mults_b); + + v_s1_sums_c = vaddq_u32(v_s1_sums_c, v_s1_c); + v_s1_c = vdotq_u32(v_s1_c, data_c, ones); + v_s2_c = vdotq_u32(v_s2_c, data_c, mults_c); + + v_s1_sums_d = vaddq_u32(v_s1_sums_d, v_s1_d); + v_s1_d = vdotq_u32(v_s1_d, data_d, ones); + v_s2_d = vdotq_u32(v_s2_d, data_d, mults_d); + + p += 64; + n -= 64; + } while (n >= 64); + + v_s1 = vaddq_u32(vaddq_u32(v_s1_a, v_s1_b), + vaddq_u32(v_s1_c, v_s1_d)); + v_s2 = vaddq_u32(vaddq_u32(v_s2_a, v_s2_b), + vaddq_u32(v_s2_c, v_s2_d)); + v_s1_sums = vaddq_u32(vaddq_u32(v_s1_sums_a, + v_s1_sums_b), + vaddq_u32(v_s1_sums_c, + v_s1_sums_d)); + v_s2 = vaddq_u32(v_s2, vqshlq_n_u32(v_s1_sums, 6)); + + s1 += vaddvq_u32(v_s1); + s2 += vaddvq_u32(v_s2); + } + /* + * Process the last 0 <= n < 64 bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. 
+ */ + ADLER32_CHUNK(s1, s2, p, n); + } + return (s2 << 16) | s1; +} +#undef ATTRIBUTES +#endif /* NEON+dotprod implementation */ + +#if defined(adler32_arm_neon_dotprod) && HAVE_DOTPROD_NATIVE +#define DEFAULT_IMPL adler32_arm_neon_dotprod +#else +static inline adler32_func_t +arch_select_adler32_func(void) +{ + const u32 features MAYBE_UNUSED = get_arm_cpu_features(); + +#ifdef adler32_arm_neon_dotprod + if (HAVE_NEON(features) && HAVE_DOTPROD(features)) + return adler32_arm_neon_dotprod; +#endif +#ifdef adler32_arm_neon + if (HAVE_NEON(features)) + return adler32_arm_neon; +#endif + return NULL; +} +#define arch_select_adler32_func arch_select_adler32_func +#endif + +#endif /* LIB_ARM_ADLER32_IMPL_H */ diff --git a/Sources/DEFLATE/arm/cpu_features.c b/Sources/DEFLATE/arm/cpu_features.c new file mode 100644 index 00000000..fdb2d7c4 --- /dev/null +++ b/Sources/DEFLATE/arm/cpu_features.c @@ -0,0 +1,212 @@ +/* + * arm/cpu_features.c - feature detection for ARM CPUs + * + * Copyright 2018 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * ARM CPUs don't have a standard way for unprivileged programs to detect CPU + * features. But an OS-specific way can be used when available. + */ + +#ifdef __APPLE__ +# undef _ANSI_SOURCE +# undef _DARWIN_C_SOURCE +# define _DARWIN_C_SOURCE /* for sysctlbyname() */ +#endif + +#include "../cpu_features_common.h" /* must be included first */ +#include "cpu_features.h" + +#if HAVE_DYNAMIC_ARM_CPU_FEATURES + +#ifdef __linux__ +/* + * On Linux, arm32 and arm64 CPU features can be detected by reading the + * AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv. + * + * Ideally we'd use the C library function getauxval(), but it's not guaranteed + * to be available: it was only added to glibc in 2.16, and in Android it was + * added to API level 18 for arm32 and level 21 for arm64. 
+ */ + +#include +#include +#include +#include + +#define AT_HWCAP 16 +#define AT_HWCAP2 26 + +static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) +{ + int fd; + unsigned long auxbuf[32]; + int filled = 0; + int i; + + fd = open("/proc/self/auxv", O_RDONLY); + if (fd < 0) + return; + + for (;;) { + do { + int ret = read(fd, &((char *)auxbuf)[filled], + sizeof(auxbuf) - filled); + if (ret <= 0) { + if (ret < 0 && errno == EINTR) + continue; + goto out; + } + filled += ret; + } while (filled < 2 * sizeof(long)); + + i = 0; + do { + unsigned long type = auxbuf[i]; + unsigned long value = auxbuf[i + 1]; + + if (type == AT_HWCAP) + *hwcap = value; + else if (type == AT_HWCAP2) + *hwcap2 = value; + i += 2; + filled -= 2 * sizeof(long); + } while (filled >= 2 * sizeof(long)); + + memmove(auxbuf, &auxbuf[i], filled); + } +out: + close(fd); +} + +static u32 query_arm_cpu_features(void) +{ + u32 features = 0; + unsigned long hwcap = 0; + unsigned long hwcap2 = 0; + + scan_auxv(&hwcap, &hwcap2); + +#ifdef ARCH_ARM32 + STATIC_ASSERT(sizeof(long) == 4); + if (hwcap & (1 << 12)) /* HWCAP_NEON */ + features |= ARM_CPU_FEATURE_NEON; + if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */ + features |= ARM_CPU_FEATURE_PMULL; + if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */ + features |= ARM_CPU_FEATURE_CRC32; +#else + STATIC_ASSERT(sizeof(long) == 8); + if (hwcap & (1 << 1)) /* HWCAP_ASIMD */ + features |= ARM_CPU_FEATURE_NEON; + if (hwcap & (1 << 4)) /* HWCAP_PMULL */ + features |= ARM_CPU_FEATURE_PMULL; + if (hwcap & (1 << 7)) /* HWCAP_CRC32 */ + features |= ARM_CPU_FEATURE_CRC32; + if (hwcap & (1 << 17)) /* HWCAP_SHA3 */ + features |= ARM_CPU_FEATURE_SHA3; + if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */ + features |= ARM_CPU_FEATURE_DOTPROD; +#endif + return features; +} + +#elif defined(__APPLE__) +/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). */ + +#include +#include + +static const struct { + const char *name; + u32 feature; +} feature_sysctls[] = { + { "hw.optional.neon", ARM_CPU_FEATURE_NEON }, + { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON }, + { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL }, + { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 }, + { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 }, + { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 }, + { "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD }, +}; + +static u32 query_arm_cpu_features(void) +{ + u32 features = 0; + size_t i; + + for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) { + const char *name = feature_sysctls[i].name; + u32 val = 0; + size_t valsize = sizeof(val); + + if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 && + valsize == sizeof(val) && val == 1) + features |= feature_sysctls[i].feature; + } + return features; +} +#elif defined(_WIN32) + +#include + +static u32 query_arm_cpu_features(void) +{ + u32 features = ARM_CPU_FEATURE_NEON; + + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_PMULL; + if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) + features |= ARM_CPU_FEATURE_CRC32; + + /* FIXME: detect SHA3 and DOTPROD support too. 
*/ + + return features; +} +#else +#error "unhandled case" +#endif + +static const struct cpu_feature arm_cpu_feature_table[] = { + {ARM_CPU_FEATURE_NEON, "neon"}, + {ARM_CPU_FEATURE_PMULL, "pmull"}, + {ARM_CPU_FEATURE_CRC32, "crc32"}, + {ARM_CPU_FEATURE_SHA3, "sha3"}, + {ARM_CPU_FEATURE_DOTPROD, "dotprod"}, +}; + +volatile u32 libdeflate_arm_cpu_features = 0; + +void libdeflate_init_arm_cpu_features(void) +{ + u32 features = query_arm_cpu_features(); + + disable_cpu_features_for_testing(&features, arm_cpu_feature_table, + ARRAY_LEN(arm_cpu_feature_table)); + + libdeflate_arm_cpu_features = features | ARM_CPU_FEATURES_KNOWN; +} + +#endif /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ diff --git a/Sources/DEFLATE/arm/cpu_features.h b/Sources/DEFLATE/arm/cpu_features.h new file mode 100644 index 00000000..39fdb40f --- /dev/null +++ b/Sources/DEFLATE/arm/cpu_features.h @@ -0,0 +1,262 @@ +/* + * arm/cpu_features.h - feature detection for ARM CPUs + * + * Copyright 2018 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef LIB_ARM_CPU_FEATURES_H +#define LIB_ARM_CPU_FEATURES_H + +#include "../lib_common.h" + +#define HAVE_DYNAMIC_ARM_CPU_FEATURES 0 + +#if defined(ARCH_ARM32) || defined(ARCH_ARM64) + +#if !defined(FREESTANDING) && \ +(defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER)) && \ +(defined(__linux__) || \ +(defined(__APPLE__) && defined(ARCH_ARM64)) || \ +(defined(_WIN32) && defined(ARCH_ARM64))) +# undef HAVE_DYNAMIC_ARM_CPU_FEATURES +# define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 +#endif + +#define ARM_CPU_FEATURE_NEON (1 << 0) +#define ARM_CPU_FEATURE_PMULL (1 << 1) +#define ARM_CPU_FEATURE_CRC32 (1 << 2) +#define ARM_CPU_FEATURE_SHA3 (1 << 3) +#define ARM_CPU_FEATURE_DOTPROD (1 << 4) + +#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) +#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) +#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32)) +#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3)) +#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD)) + +#if HAVE_DYNAMIC_ARM_CPU_FEATURES +#define ARM_CPU_FEATURES_KNOWN (1U << 31) +extern volatile u32 libdeflate_arm_cpu_features; + +void libdeflate_init_arm_cpu_features(void); + +static inline u32 get_arm_cpu_features(void) +{ + if (libdeflate_arm_cpu_features == 0) + libdeflate_init_arm_cpu_features(); + return libdeflate_arm_cpu_features; +} +#else /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ +static inline u32 get_arm_cpu_features(void) { return 0; } +#endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */ + +/* NEON */ +#if defined(__ARM_NEON) || defined(ARCH_ARM64) +# define HAVE_NEON_NATIVE 1 +#else +# define HAVE_NEON_NATIVE 0 +#endif +/* + * With both gcc and clang, NEON intrinsics require that the main target has + * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, + * r226563 for arm64), hardware floating point support is sufficient. + */ +#if HAVE_NEON_NATIVE || \ +(HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP)) +# define HAVE_NEON_INTRIN 1 +#else +# define HAVE_NEON_INTRIN 0 +#endif + +/* PMULL */ +#ifdef __ARM_FEATURE_CRYPTO +# define HAVE_PMULL_NATIVE 1 +#else +# define HAVE_PMULL_NATIVE 0 +#endif +#if HAVE_PMULL_NATIVE || \ +(HAVE_DYNAMIC_ARM_CPU_FEATURES && \ +HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ +(GCC_PREREQ(6, 1) || defined(__clang__) || defined(_MSC_VER)) && \ +/* +* On arm32 with clang, the crypto intrinsics (which include pmull) +* are not defined, even when using -mfpu=crypto-neon-fp-armv8, +* because clang's puts their definitions behind +* __aarch64__. +*/ \ +!(defined(ARCH_ARM32) && defined(__clang__))) +# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ +/* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ +# ifdef _MSC_VER +# define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) +# else +# define compat_vmull_p64(a, b) vmull_p64((a), (b)) +# endif +#else +# define HAVE_PMULL_INTRIN 0 +#endif +/* + * Set USE_PMULL_TARGET_EVEN_IF_NATIVE if a workaround for a gcc bug that was + * fixed by commit 11a113d501ff ("aarch64: Simplify feature definitions") in gcc + * 13 is needed. 
A minimal program that fails to build due to this bug when + * compiled with -mcpu=emag, at least with gcc 10 through 12, is: + * + * static inline __attribute__((always_inline,target("+crypto"))) void f() {} + * void g() { f(); } + * + * The error is: + * + * error: inlining failed in call to ‘always_inline’ ‘f’: target specific option mismatch + * + * The workaround is to explicitly add the crypto target to the non-inline + * function g(), even though this should not be required due to -mcpu=emag + * enabling 'crypto' natively and causing __ARM_FEATURE_CRYPTO to be defined. + */ +#if HAVE_PMULL_NATIVE && defined(ARCH_ARM64) && \ +GCC_PREREQ(6, 1) && !GCC_PREREQ(13, 1) +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 1 +#else +# define USE_PMULL_TARGET_EVEN_IF_NATIVE 0 +#endif + +/* CRC32 */ +#ifdef __ARM_FEATURE_CRC32 +# define HAVE_CRC32_NATIVE 1 +#else +# define HAVE_CRC32_NATIVE 0 +#endif +#undef HAVE_CRC32_INTRIN +#if HAVE_CRC32_NATIVE +# define HAVE_CRC32_INTRIN 1 +#elif HAVE_DYNAMIC_ARM_CPU_FEATURES +# if GCC_PREREQ(1, 0) +/* + * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled + * in the main target has been affected by two gcc bugs, which we must avoid + * by only allowing gcc versions that have the corresponding fixes. First, + * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a + * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, + * gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture directives + * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when + * binutils is 2.34 or later, due to + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second + * set of prerequisites, as they are stricter and we have no way to detect + * the binutils version directly from a C source file. + * + * Also exclude the cases where the main target arch is armv6kz or armv7e-m. + * In those cases, gcc doesn't let functions that use the main arch be + * inlined into functions that are targeted to armv8-a+crc. (armv8-a is + * necessary for crc to be accepted at all.) That causes build errors. + * This issue happens for these specific sub-archs because they are not a + * subset of armv8-a. Note: clang does not have this limitation. 
+ */ +# if (GCC_PREREQ(11, 3) || \ +(GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ +(GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))) && \ +!defined(__ARM_ARCH_6KZ__) && \ +!defined(__ARM_ARCH_7EM__) +# define HAVE_CRC32_INTRIN 1 +# endif +# elif defined(__clang__) || defined(_MSC_VER) +# define HAVE_CRC32_INTRIN 1 +# endif +#endif +#ifndef HAVE_CRC32_INTRIN +# define HAVE_CRC32_INTRIN 0 +#endif + +/* SHA3 (needed for the eor3 instruction) */ +#if defined(ARCH_ARM64) && !defined(_MSC_VER) +# ifdef __ARM_FEATURE_SHA3 +# define HAVE_SHA3_NATIVE 1 +# else +# define HAVE_SHA3_NATIVE 0 +# endif +# define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ +(GCC_PREREQ(8, 1) /* r256478 */ || \ +CLANG_PREREQ(7, 0, 10010463) /* r338010 */)) +# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \ +(HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ +(GCC_PREREQ(9, 1) /* r268049 */ || \ +CLANG_PREREQ(13, 0, 13160000))) +#else +# define HAVE_SHA3_NATIVE 0 +# define HAVE_SHA3_TARGET 0 +# define HAVE_SHA3_INTRIN 0 +#endif + +/* dotprod */ +#ifdef ARCH_ARM64 +# ifdef __ARM_FEATURE_DOTPROD +# define HAVE_DOTPROD_NATIVE 1 +# else +# define HAVE_DOTPROD_NATIVE 0 +# endif +# if HAVE_DOTPROD_NATIVE || \ +(HAVE_DYNAMIC_ARM_CPU_FEATURES && \ +(GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || \ +defined(_MSC_VER))) +# define HAVE_DOTPROD_INTRIN 1 +# else +# define HAVE_DOTPROD_INTRIN 0 +# endif +#else +# define HAVE_DOTPROD_NATIVE 0 +# define HAVE_DOTPROD_INTRIN 0 +#endif + +/* + * Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are + * only defined when the corresponding __ARM_FEATURE_* macro is defined. The + * intrinsics actually work in target attribute functions too if they are + * defined, though, so work around this by temporarily defining the + * corresponding __ARM_FEATURE_* macros while including the headers. 
+ */ +#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ +(defined(__clang__) || defined(ARCH_ARM32)) +# define __ARM_FEATURE_CRC32 1 +#endif +#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) +# define __ARM_FEATURE_SHA3 1 +#endif +#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) +# define __ARM_FEATURE_DOTPROD 1 +#endif +#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ +(defined(__clang__) || defined(ARCH_ARM32)) +# include +# undef __ARM_FEATURE_CRC32 +#endif +#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) +# include +# undef __ARM_FEATURE_SHA3 +#endif +#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) +# include +# undef __ARM_FEATURE_DOTPROD +#endif + +#endif /* ARCH_ARM32 || ARCH_ARM64 */ + +#endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/Sources/DEFLATE/arm/crc32_impl.h b/Sources/DEFLATE/arm/crc32_impl.h new file mode 100644 index 00000000..472bc00f --- /dev/null +++ b/Sources/DEFLATE/arm/crc32_impl.h @@ -0,0 +1,682 @@ +/* + * arm/crc32_impl.h - ARM implementations of the gzip CRC-32 algorithm + * + * Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_ARM_CRC32_IMPL_H +#define LIB_ARM_CRC32_IMPL_H + +#include "cpu_features.h" + +/* + * crc32_arm_crc() - implementation using crc32 instructions (only) + * + * In general this implementation is straightforward. However, naive use of the + * crc32 instructions is serial: one of the two inputs to each crc32 instruction + * is the output of the previous one. To take advantage of CPUs that can + * execute multiple crc32 instructions in parallel, when possible we interleave + * the checksumming of several adjacent chunks, then combine their CRCs. + * + * However, without pmull, combining CRCs is fairly slow. So in this pmull-less + * version, we only use a large chunk length, and thus we only do chunked + * processing if there is a lot of data to checksum. This also means that a + * variable chunk length wouldn't help much, so we just support a fixed length. + */ +#if HAVE_CRC32_INTRIN +# if HAVE_CRC32_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("armv8-a,crc") +# elif defined(__ARM_PCS_VFP) +/* + * +simd is needed to avoid a "selected architecture lacks an FPU" + * error with Debian arm-linux-gnueabihf-gcc when -mfpu is not + * explicitly specified on the command line. 
+ */ +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc+simd") +# else +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc") +# endif +# else +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("crc") +# else +# define ATTRIBUTES _target_attribute("+crc") +# endif +# endif +# endif + +#ifndef _MSC_VER +# include +#endif + +/* + * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN + * bytes each by computing: + * + * [ crc0*x^(3*8*L) + crc1*x^(2*8*L) + crc2*x^(1*8*L) + crc3 ] mod G(x) + * + * This has been optimized in several ways: + * + * - The needed multipliers (x to some power, reduced mod G(x)) were + * precomputed. + * + * - The 3 multiplications are interleaved. + * + * - The reduction mod G(x) is delayed to the end and done using __crc32d. + * Note that the use of __crc32d introduces an extra factor of x^32. To + * cancel that out along with the extra factor of x^1 that gets introduced + * because of how the 63-bit products are aligned in their 64-bit integers, + * the multipliers are actually x^(j*8*L - 33) instead of x^(j*8*L). + */ +static forceinline ATTRIBUTES u32 +combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3) +{ + u64 res0 = 0, res1 = 0, res2 = 0; + int i; + + /* Multiply crc{0,1,2} by CRC32_FIXED_CHUNK_MULT_{3,2,1}. */ + for (i = 0; i < 32; i++) { + if (CRC32_FIXED_CHUNK_MULT_3 & (1U << i)) + res0 ^= (u64)crc0 << i; + if (CRC32_FIXED_CHUNK_MULT_2 & (1U << i)) + res1 ^= (u64)crc1 << i; + if (CRC32_FIXED_CHUNK_MULT_1 & (1U << i)) + res2 ^= (u64)crc2 << i; + } + /* Add the different parts and reduce mod G(x). */ + return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; +} + +#define crc32_arm_crc crc32_arm_crc +static u32 ATTRIBUTES MAYBE_UNUSED +crc32_arm_crc(u32 crc, const u8 *p, size_t len) +{ + if (len >= 64) { + const size_t align = -(uintptr_t)p & 7; + + /* Align p to the next 8-byte boundary. */ + if (align) { + if (align & 1) + crc = __crc32b(crc, *p++); + if (align & 2) { + crc = __crc32h(crc, le16_bswap(*(u16 *)p)); + p += 2; + } + if (align & 4) { + crc = __crc32w(crc, le32_bswap(*(u32 *)p)); + p += 4; + } + len -= align; + } + /* + * Interleave the processing of multiple adjacent data chunks to + * take advantage of instruction-level parallelism. + * + * Some CPUs don't prefetch the data if it's being fetched in + * multiple interleaved streams, so do explicit prefetching. 
+ */ + while (len >= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN) { + const u64 *wp0 = (const u64 *)p; + const u64 * const wp0_end = + (const u64 *)(p + CRC32_FIXED_CHUNK_LEN); + u32 crc1 = 0, crc2 = 0, crc3 = 0; + + STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); + STATIC_ASSERT(CRC32_FIXED_CHUNK_LEN % (4 * 8) == 0); + do { + prefetchr(&wp0[64 + 0*CRC32_FIXED_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 1*CRC32_FIXED_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 2*CRC32_FIXED_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 3*CRC32_FIXED_CHUNK_LEN/8]); + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); + wp0++; + } while (wp0 != wp0_end); + crc = combine_crcs_slow(crc, crc1, crc2, crc3); + p += CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; + len -= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; + } + /* + * Due to the large fixed chunk length used above, there might + * still be a lot of data left. So use a 64-byte loop here, + * instead of a loop that is less unrolled. + */ + while (len >= 64) { + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 32))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 40))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 48))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 56))); + p += 64; + len -= 64; + } + } + if (len & 32) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + crc = __crc32d(crc, get_unaligned_le64(p + 16)); + crc = __crc32d(crc, get_unaligned_le64(p + 24)); + p += 32; + } + if (len & 16) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + p += 16; + } + if (len & 8) { + crc = __crc32d(crc, get_unaligned_le64(p)); + p += 8; + } + if (len & 4) { + crc = __crc32w(crc, get_unaligned_le32(p)); + p += 4; + } + if (len & 2) { + crc = __crc32h(crc, get_unaligned_le16(p)); + p += 2; + } + if (len & 1) + crc = __crc32b(crc, *p); + return crc; +} +#undef ATTRIBUTES +#endif /* crc32_arm_crc() */ + +/* + * crc32_arm_crc_pmullcombine() - implementation using crc32 instructions, plus + * pmull instructions for CRC combining + * + * This is similar to crc32_arm_crc(), but it enables the use of pmull + * (carryless multiplication) instructions for the steps where the CRCs of + * adjacent data chunks are combined. 
As this greatly speeds up CRC + * combination, this implementation also differs from crc32_arm_crc() in that it + * uses a variable chunk length which can get fairly small. The precomputed + * multipliers needed for the selected chunk length are loaded from a table. + * + * Note that pmull is used here only for combining the CRCs of separately + * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*() + * for implementations that use pmull for folding the data itself. + */ +#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN +# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8") +# else +# ifdef __clang__ +# define ATTRIBUTES _target_attribute("crc,aes") +# else +# define ATTRIBUTES _target_attribute("+crc,+crypto") +# endif +# endif +# endif + +#ifndef _MSC_VER +# include +#endif +#include + +/* Do carryless multiplication of two 32-bit values. */ +static forceinline ATTRIBUTES u64 +clmul_u32(u32 a, u32 b) +{ + uint64x2_t res = vreinterpretq_u64_p128( + compat_vmull_p64((poly64_t)a, (poly64_t)b)); + + return vgetq_lane_u64(res, 0); +} + +/* + * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more + * quickly, and supports a variable chunk length. The chunk length is + * 'i * CRC32_MIN_VARIABLE_CHUNK_LEN' + * where 1 <= i < ARRAY_LEN(crc32_mults_for_chunklen). + */ +static forceinline ATTRIBUTES u32 +combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) +{ + u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]); + u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]); + u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]); + + return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; +} + +#define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine +static u32 ATTRIBUTES MAYBE_UNUSED +crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) +{ + const size_t align = -(uintptr_t)p & 7; + + if (len >= align + CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { + /* Align p to the next 8-byte boundary. */ + if (align) { + if (align & 1) + crc = __crc32b(crc, *p++); + if (align & 2) { + crc = __crc32h(crc, le16_bswap(*(u16 *)p)); + p += 2; + } + if (align & 4) { + crc = __crc32w(crc, le32_bswap(*(u32 *)p)); + p += 4; + } + len -= align; + } + /* + * Handle CRC32_MAX_VARIABLE_CHUNK_LEN specially, so that better + * code is generated for it. 
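+ * (Presumably this helps because the chunk length, and hence the wp0[]
+ * offsets in the unrolled loop below, are compile-time constants here,
+ * whereas the variable-length case further down has to maintain four
+ * separate chunk pointers.)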
+ */ + while (len >= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN) { + const u64 *wp0 = (const u64 *)p; + const u64 * const wp0_end = + (const u64 *)(p + CRC32_MAX_VARIABLE_CHUNK_LEN); + u32 crc1 = 0, crc2 = 0, crc3 = 0; + + STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); + STATIC_ASSERT(CRC32_MAX_VARIABLE_CHUNK_LEN % (4 * 8) == 0); + do { + prefetchr(&wp0[64 + 0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + prefetchr(&wp0[64 + 3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); + wp0++; + } while (wp0 != wp0_end); + crc = combine_crcs_fast(crc, crc1, crc2, crc3, + ARRAY_LEN(crc32_mults_for_chunklen) - 1); + p += CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; + len -= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; + } + /* Handle up to one variable-length chunk. 
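+ * (Worked example with made-up numbers: if CRC32_MIN_VARIABLE_CHUNK_LEN were
+ * 2048 (the real value is a precomputed constant defined alongside
+ * crc32_mults_for_chunklen), then len = 100000 would give
+ * i = 100000 / (4 * 2048) = 12 and chunk_len = 12 * 2048 = 24576, so the four
+ * interleaved streams below would cover 4 * 24576 = 98304 bytes and leave
+ * 1696 bytes for the scalar loops that follow.)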
*/ + if (len >= CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { + const size_t i = len / (CRC32_NUM_CHUNKS * + CRC32_MIN_VARIABLE_CHUNK_LEN); + const size_t chunk_len = + i * CRC32_MIN_VARIABLE_CHUNK_LEN; + const u64 *wp0 = (const u64 *)(p + 0*chunk_len); + const u64 *wp1 = (const u64 *)(p + 1*chunk_len); + const u64 *wp2 = (const u64 *)(p + 2*chunk_len); + const u64 *wp3 = (const u64 *)(p + 3*chunk_len); + const u64 * const wp0_end = wp1; + u32 crc1 = 0, crc2 = 0, crc3 = 0; + + STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); + STATIC_ASSERT(CRC32_MIN_VARIABLE_CHUNK_LEN % (4 * 8) == 0); + do { + prefetchr(wp0 + 64); + prefetchr(wp1 + 64); + prefetchr(wp2 + 64); + prefetchr(wp3 + 64); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + crc = __crc32d(crc, le64_bswap(*wp0++)); + crc1 = __crc32d(crc1, le64_bswap(*wp1++)); + crc2 = __crc32d(crc2, le64_bswap(*wp2++)); + crc3 = __crc32d(crc3, le64_bswap(*wp3++)); + } while (wp0 != wp0_end); + crc = combine_crcs_fast(crc, crc1, crc2, crc3, i); + p += CRC32_NUM_CHUNKS * chunk_len; + len -= CRC32_NUM_CHUNKS * chunk_len; + } + + while (len >= 32) { + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); + crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); + p += 32; + len -= 32; + } + } else { + while (len >= 32) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + crc = __crc32d(crc, get_unaligned_le64(p + 16)); + crc = __crc32d(crc, get_unaligned_le64(p + 24)); + p += 32; + len -= 32; + } + } + if (len & 16) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + p += 16; + } + if (len & 8) { + crc = __crc32d(crc, get_unaligned_le64(p)); + p += 8; + } + if (len & 4) { + crc = __crc32w(crc, get_unaligned_le32(p)); + p += 4; + } + if (len & 2) { + crc = __crc32h(crc, get_unaligned_le16(p)); + p += 2; + } + if (len & 1) + crc = __crc32b(crc, *p); + return crc; +} +#undef ATTRIBUTES +#endif /* crc32_arm_crc_pmullcombine() */ + +/* + * crc32_arm_pmullx4() - implementation using "folding" with pmull instructions + * + * This implementation is intended for CPUs that support pmull instructions but + * not crc32 instructions. + */ +#if HAVE_PMULL_INTRIN +# define crc32_arm_pmullx4 crc32_arm_pmullx4 +# define SUFFIX _pmullx4 +# if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE +# define ATTRIBUTES +# else +# ifdef ARCH_ARM32 +# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8") +# else +# ifdef __clang__ +/* + * This used to use "crypto", but that stopped working with clang 16. + * Now only "aes" works. "aes" works with older versions too, so use + * that. No "+" prefix; clang 15 and earlier doesn't accept that. + */ +# define ATTRIBUTES _target_attribute("aes") +# else +/* + * With gcc, only "+crypto" works. Both the "+" prefix and the + * "crypto" (not "aes") are essential... 
+ */ +# define ATTRIBUTES _target_attribute("+crypto") +# endif +# endif +# endif +# define ENABLE_EOR3 0 +# include "crc32_pmull_helpers.h" + +static u32 ATTRIBUTES MAYBE_UNUSED +crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) +{ + static const u64 _aligned_attribute(16) mults[3][2] = { + { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ + { CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */ + { CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */ + }; + static const u64 _aligned_attribute(16) final_mults[3][2] = { + { CRC32_X63_MODG, 0 }, + { CRC32_BARRETT_CONSTANT_1, 0 }, + { CRC32_BARRETT_CONSTANT_2, 0 }, + }; + const uint8x16_t zeroes = vdupq_n_u8(0); + const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF)); + const poly64x2_t multipliers_1 = load_multipliers(mults[0]); + uint8x16_t v0, v1, v2, v3; + + if (len < 64 + 15) { + if (len < 16) + return crc32_slice1(crc, p, len); + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); + p += 16; + len -= 16; + while (len >= 16) { + v0 = fold_vec(v0, vld1q_u8(p), multipliers_1); + p += 16; + len -= 16; + } + } else { + const poly64x2_t multipliers_4 = load_multipliers(mults[1]); + const poly64x2_t multipliers_2 = load_multipliers(mults[2]); + const size_t align = -(uintptr_t)p & 15; + const uint8x16_t *vp; + + v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); + p += 16; + /* Align p to the next 16-byte boundary. */ + if (align) { + v0 = fold_partial_vec(v0, p, align, multipliers_1); + p += align; + len -= align; + } + vp = (const uint8x16_t *)p; + v1 = *vp++; + v2 = *vp++; + v3 = *vp++; + while (len >= 64 + 64) { + v0 = fold_vec(v0, *vp++, multipliers_4); + v1 = fold_vec(v1, *vp++, multipliers_4); + v2 = fold_vec(v2, *vp++, multipliers_4); + v3 = fold_vec(v3, *vp++, multipliers_4); + len -= 64; + } + v0 = fold_vec(v0, v2, multipliers_2); + v1 = fold_vec(v1, v3, multipliers_2); + if (len & 32) { + v0 = fold_vec(v0, *vp++, multipliers_2); + v1 = fold_vec(v1, *vp++, multipliers_2); + } + v0 = fold_vec(v0, v1, multipliers_1); + if (len & 16) + v0 = fold_vec(v0, *vp++, multipliers_1); + p = (const u8 *)vp; + len &= 15; + } + + /* Handle any remaining partial block now before reducing to 32 bits. */ + if (len) + v0 = fold_partial_vec(v0, p, len, multipliers_1); + + /* + * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, + * which is equivalent to multiplying by x^32. This is needed because + * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). + */ + + v0 = veorq_u8(vextq_u8(v0, zeroes, 8), + clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1)); + + /* Fold 96 => 64 bits. */ + v0 = veorq_u8(vextq_u8(v0, zeroes, 4), + clmul_low(vandq_u8(v0, mask32), + load_multipliers(final_mults[0]))); + + /* Reduce 64 => 32 bits using Barrett reduction. */ + v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1])); + v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2])); + return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1); +} +#undef SUFFIX +#undef ATTRIBUTES +#undef ENABLE_EOR3 +#endif /* crc32_arm_pmullx4() */ + +/* + * crc32_arm_pmullx12_crc() - large-stride implementation using "folding" with + * pmull instructions, where crc32 instructions are also available + * + * See crc32_pmull_wide.h for explanation. 
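+ * As a rough sketch of one folding step (the exponents here are inferred from
+ * the mults[] tables above, so treat this as an illustration rather than a
+ * new definition): folding an accumulator acc across a stride of d 16-byte
+ * vectors onto fresh data computes
+ *
+ *        acc' = clmul(acc.lo64, x^(128*d + 31) mod G) ^
+ *               clmul(acc.hi64, x^(128*d - 33) mod G) ^ data
+ *
+ * e.g. d = 1 uses CRC32_X159_MODG and CRC32_X95_MODG, and d = 4 uses
+ * CRC32_X543_MODG and CRC32_X479_MODG, matching fold_vec() above.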
+ */
+#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN
+# define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc
+# define SUFFIX _pmullx12_crc
+# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("aes,crc")
+# else
+# define ATTRIBUTES _target_attribute("+crypto,+crc")
+# endif
+# endif
+# define ENABLE_EOR3 0
+# include "crc32_pmull_wide.h"
+#endif
+
+/*
+ * crc32_arm_pmullx12_crc_eor3()
+ *
+ * This is like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from
+ * the sha3 extension) for even better performance.
+ *
+ * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than
+ * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3.
+ */
+#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \
+(HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE)
+# define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3
+# define SUFFIX _pmullx12_crc_eor3
+# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE && \
+!USE_PMULL_TARGET_EVEN_IF_NATIVE
+# define ATTRIBUTES
+# else
+# ifdef __clang__
+# define ATTRIBUTES _target_attribute("aes,crc,sha3")
+/*
+ * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the
+ * default target is armv8.3-a or later in which case it must be omitted.
+ * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT.
+ */
+# elif defined(__ARM_FEATURE_JCVT)
+# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3")
+# else
+# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3")
+# endif
+# endif
+# define ENABLE_EOR3 1
+# include "crc32_pmull_wide.h"
+#endif
+
+/*
+ * On the Apple M1 processor, crc32 instructions max out at about 25.5 GB/s in
+ * the best case of using a 3-way or greater interleaved chunked implementation,
+ * whereas a pmull-based implementation achieves 68 GB/s provided that the
+ * stride length is large enough (about 10+ vectors with eor3, or 12+ without).
+ *
+ * For now we assume that crc32 instructions are preferable in other cases.
+ */
+#define PREFER_PMULL_TO_CRC 0
+#ifdef __APPLE__
+# include <TargetConditionals.h>
+# if TARGET_OS_OSX
+# undef PREFER_PMULL_TO_CRC
+# define PREFER_PMULL_TO_CRC 1
+# endif
+#endif
+
+/*
+ * If the best implementation is statically available, use it unconditionally.
+ * Otherwise choose the best implementation at runtime.
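+ * (Illustrative scenario rather than an exhaustive description: an arm64
+ * build with -march=armv8-a+crc+crypto on a non-Apple target makes
+ * HAVE_CRC32_NATIVE and HAVE_PMULL_NATIVE true while PREFER_PMULL_TO_CRC
+ * stays 0, so DEFAULT_IMPL below resolves to crc32_arm_crc_pmullcombine and
+ * arch_select_crc32_func() is compiled out entirely. Without such flags, the
+ * first libdeflate_crc32() call goes through the runtime dispatcher, which
+ * probes get_arm_cpu_features() and caches the chosen function pointer.)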
+ */ +#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) && \ +HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE +# define DEFAULT_IMPL crc32_arm_pmullx12_crc_eor3 +#elif !PREFER_PMULL_TO_CRC && defined(crc32_arm_crc_pmullcombine) && \ +HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE +# define DEFAULT_IMPL crc32_arm_crc_pmullcombine +#else +static inline crc32_func_t +arch_select_crc32_func(void) +{ + const u32 features MAYBE_UNUSED = get_arm_cpu_features(); + +#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) + if (HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features)) + return crc32_arm_pmullx12_crc_eor3; +#endif +#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc) + if (HAVE_PMULL(features) && HAVE_CRC32(features)) + return crc32_arm_pmullx12_crc; +#endif +#ifdef crc32_arm_crc_pmullcombine + if (HAVE_CRC32(features) && HAVE_PMULL(features)) + return crc32_arm_crc_pmullcombine; +#endif +#ifdef crc32_arm_crc + if (HAVE_CRC32(features)) + return crc32_arm_crc; +#endif +#ifdef crc32_arm_pmullx4 + if (HAVE_PMULL(features)) + return crc32_arm_pmullx4; +#endif + return NULL; +} +#define arch_select_crc32_func arch_select_crc32_func +#endif + +#endif /* LIB_ARM_CRC32_IMPL_H */ diff --git a/Sources/DEFLATE/crc32_pmull_helpers.h b/Sources/DEFLATE/arm/crc32_pmull_helpers.h similarity index 51% rename from Sources/DEFLATE/crc32_pmull_helpers.h rename to Sources/DEFLATE/arm/crc32_pmull_helpers.h index 1cd1cc18..2c2172e2 100644 --- a/Sources/DEFLATE/crc32_pmull_helpers.h +++ b/Sources/DEFLATE/arm/crc32_pmull_helpers.h @@ -30,11 +30,11 @@ * with pmull instructions. It accepts the following parameters: * * SUFFIX: - * Name suffix to append to all instantiated functions. + * Name suffix to append to all instantiated functions. * ATTRIBUTES: - * Target function attributes to use. + * Target function attributes to use. * ENABLE_EOR3: - * Use the eor3 instruction (from the sha3 extension). + * Use the eor3 instruction (from the sha3 extension). */ #include @@ -44,29 +44,29 @@ static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(u32_to_bytevec)(u32 a) { - return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0)); + return vreinterpretq_u8_u32(vsetq_lane_u32(a, vdupq_n_u32(0), 0)); } -#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec) +#define u32_to_bytevec ADD_SUFFIX(u32_to_bytevec) /* Load two 64-bit values into a vector. */ #undef load_multipliers static forceinline ATTRIBUTES poly64x2_t ADD_SUFFIX(load_multipliers)(const u64 p[2]) { - return vreinterpretq_p64_u64(vld1q_u64(p)); + return vreinterpretq_p64_u64(vld1q_u64(p)); } -#define load_multipliers ADD_SUFFIX(load_multipliers) +#define load_multipliers ADD_SUFFIX(load_multipliers) /* Do carryless multiplication of the low halves of two vectors. */ #undef clmul_low static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(clmul_low)(uint8x16_t a, poly64x2_t b) { - return vreinterpretq_u8_p128( - compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), - vgetq_lane_p64(b, 0))); + return vreinterpretq_u8_p128( + compat_vmull_p64(vgetq_lane_p64(vreinterpretq_p64_u8(a), 0), + vgetq_lane_p64(b, 0))); } -#define clmul_low ADD_SUFFIX(clmul_low) +#define clmul_low ADD_SUFFIX(clmul_low) /* Do carryless multiplication of the high halves of two vectors. */ #undef clmul_high @@ -74,19 +74,19 @@ static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(clmul_high)(uint8x16_t a, poly64x2_t b) { #if defined(__clang__) && defined(ARCH_ARM64) - /* - * Use inline asm to ensure that pmull2 is really used. 
This works - * around clang bug https://github.com/llvm/llvm-project/issues/52868. - */ - uint8x16_t res; - - __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b)); - return res; + /* + * Use inline asm to ensure that pmull2 is really used. This works + * around clang bug https://github.com/llvm/llvm-project/issues/52868. + */ + uint8x16_t res; + + __asm__("pmull2 %0.1q, %1.2d, %2.2d" : "=w" (res) : "w" (a), "w" (b)); + return res; #else - return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b)); + return vreinterpretq_u8_p128(vmull_high_p64(vreinterpretq_p64_u8(a), b)); #endif } -#define clmul_high ADD_SUFFIX(clmul_high) +#define clmul_high ADD_SUFFIX(clmul_high) #undef eor3 static forceinline ATTRIBUTES uint8x16_t @@ -94,48 +94,48 @@ ADD_SUFFIX(eor3)(uint8x16_t a, uint8x16_t b, uint8x16_t c) { #if ENABLE_EOR3 #if HAVE_SHA3_INTRIN - return veor3q_u8(a, b, c); + return veor3q_u8(a, b, c); #else - uint8x16_t res; - - __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" - : "=w" (res) : "w" (a), "w" (b), "w" (c)); - return res; + uint8x16_t res; + + __asm__("eor3 %0.16b, %1.16b, %2.16b, %3.16b" + : "=w" (res) : "w" (a), "w" (b), "w" (c)); + return res; #endif #else /* ENABLE_EOR3 */ - return veorq_u8(veorq_u8(a, b), c); + return veorq_u8(veorq_u8(a, b), c); #endif /* !ENABLE_EOR3 */ } -#define eor3 ADD_SUFFIX(eor3) +#define eor3 ADD_SUFFIX(eor3) #undef fold_vec static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(fold_vec)(uint8x16_t src, uint8x16_t dst, poly64x2_t multipliers) { - uint8x16_t a = clmul_low(src, multipliers); - uint8x16_t b = clmul_high(src, multipliers); - - return eor3(a, b, dst); + uint8x16_t a = clmul_low(src, multipliers); + uint8x16_t b = clmul_high(src, multipliers); + + return eor3(a, b, dst); } -#define fold_vec ADD_SUFFIX(fold_vec) +#define fold_vec ADD_SUFFIX(fold_vec) #undef vtbl static forceinline ATTRIBUTES uint8x16_t ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices) { #ifdef ARCH_ARM64 - return vqtbl1q_u8(table, indices); + return vqtbl1q_u8(table, indices); #else - uint8x8x2_t tab2; - - tab2.val[0] = vget_low_u8(table); - tab2.val[1] = vget_high_u8(table); - - return vcombine_u8(vtbl2_u8(tab2, vget_low_u8(indices)), - vtbl2_u8(tab2, vget_high_u8(indices))); + uint8x8x2_t tab2; + + tab2.val[0] = vget_low_u8(table); + tab2.val[1] = vget_high_u8(table); + + return vcombine_u8(vtbl2_u8(tab2, vget_low_u8(indices)), + vtbl2_u8(tab2, vget_high_u8(indices))); #endif } -#define vtbl ADD_SUFFIX(vtbl) +#define vtbl ADD_SUFFIX(vtbl) /* * Given v containing a 16-byte polynomial, and a pointer 'p' that points to the @@ -147,38 +147,38 @@ ADD_SUFFIX(vtbl)(uint8x16_t table, uint8x16_t indices) #undef fold_partial_vec static forceinline ATTRIBUTES MAYBE_UNUSED uint8x16_t ADD_SUFFIX(fold_partial_vec)(uint8x16_t v, const u8 *p, size_t len, - poly64x2_t multipliers_1) + poly64x2_t multipliers_1) { - /* - * vtbl(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. - * vtbl(v, shift_tab[len+16..len+31]) right shifts v by len bytes. 
- */ - static const u8 shift_tab[48] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, - 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, - }; - const uint8x16_t lshift = vld1q_u8(&shift_tab[len]); - const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]); - uint8x16_t x0, x1, bsl_mask; - - /* x0 = v left-shifted by '16 - len' bytes */ - x0 = vtbl(v, lshift); - - /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ - bsl_mask = vreinterpretq_u8_s8( - vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7)); - - /* - * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' - * bytes) followed by the remaining data. - */ - x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */, - vld1q_u8(p + len - 16), vtbl(v, rshift)); - - return fold_vec(x0, x1, multipliers_1); + /* + * vtbl(v, shift_tab[len..len+15]) left shifts v by 16-len bytes. + * vtbl(v, shift_tab[len+16..len+31]) right shifts v by len bytes. + */ + static const u8 shift_tab[48] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + const uint8x16_t lshift = vld1q_u8(&shift_tab[len]); + const uint8x16_t rshift = vld1q_u8(&shift_tab[len + 16]); + uint8x16_t x0, x1, bsl_mask; + + /* x0 = v left-shifted by '16 - len' bytes */ + x0 = vtbl(v, lshift); + + /* Create a vector of '16 - len' 0x00 bytes, then 'len' 0xff bytes. */ + bsl_mask = vreinterpretq_u8_s8( + vshrq_n_s8(vreinterpretq_s8_u8(rshift), 7)); + + /* + * x1 = the last '16 - len' bytes from v (i.e. v right-shifted by 'len' + * bytes) followed by the remaining data. + */ + x1 = vbslq_u8(bsl_mask /* 0 bits select from arg3, 1 bits from arg2 */, + vld1q_u8(p + len - 16), vtbl(v, rshift)); + + return fold_vec(x0, x1, multipliers_1); } -#define fold_partial_vec ADD_SUFFIX(fold_partial_vec) +#define fold_partial_vec ADD_SUFFIX(fold_partial_vec) diff --git a/Sources/DEFLATE/arm/crc32_pmull_wide.h b/Sources/DEFLATE/arm/crc32_pmull_wide.h new file mode 100644 index 00000000..67453f63 --- /dev/null +++ b/Sources/DEFLATE/arm/crc32_pmull_wide.h @@ -0,0 +1,231 @@ +/* + * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version) + * + * Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating PMULL-based crc32_arm functions. + * The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. + * ENABLE_EOR3: + * Use the eor3 instruction (from the sha3 extension). + * + * This is the extra-wide version; it uses an unusually large stride length of + * 12, and it assumes that crc32 instructions are available too. It's intended + * for powerful CPUs that support both pmull and crc32 instructions, but where + * throughput of pmull and xor (given enough instructions issued in parallel) is + * significantly higher than that of crc32, thus making the crc32 instructions + * (counterintuitively) not actually the fastest way to compute the CRC-32. The + * Apple M1 processor is an example of such a CPU. + */ + +#ifndef _MSC_VER +# include +#endif +#include + +#include "crc32_pmull_helpers.h" + +static u32 ATTRIBUTES MAYBE_UNUSED +ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) +{ + uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; + + if (len < 3 * 192) { + static const u64 _aligned_attribute(16) mults[3][2] = { + { CRC32_X543_MODG, CRC32_X479_MODG }, /* 4 vecs */ + { CRC32_X287_MODG, CRC32_X223_MODG }, /* 2 vecs */ + { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ + }; + poly64x2_t multipliers_4, multipliers_2, multipliers_1; + + if (len < 64) + goto tail; + multipliers_4 = load_multipliers(mults[0]); + multipliers_2 = load_multipliers(mults[1]); + multipliers_1 = load_multipliers(mults[2]); + /* + * Short length; don't bother aligning the pointer, and fold + * 64 bytes (4 vectors) at a time, at most. + */ + v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc)); + v1 = vld1q_u8(p + 16); + v2 = vld1q_u8(p + 32); + v3 = vld1q_u8(p + 48); + p += 64; + len -= 64; + while (len >= 64) { + v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4); + v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4); + v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4); + v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4); + p += 64; + len -= 64; + } + v0 = fold_vec(v0, v2, multipliers_2); + v1 = fold_vec(v1, v3, multipliers_2); + if (len >= 32) { + v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2); + v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2); + p += 32; + len -= 32; + } + v0 = fold_vec(v0, v1, multipliers_1); + } else { + static const u64 _aligned_attribute(16) mults[4][2] = { + { CRC32_X1567_MODG, CRC32_X1503_MODG }, /* 12 vecs */ + { CRC32_X799_MODG, CRC32_X735_MODG }, /* 6 vecs */ + { CRC32_X415_MODG, CRC32_X351_MODG }, /* 3 vecs */ + { CRC32_X159_MODG, CRC32_X95_MODG }, /* 1 vecs */ + }; + const poly64x2_t multipliers_12 = load_multipliers(mults[0]); + const poly64x2_t multipliers_6 = load_multipliers(mults[1]); + const poly64x2_t multipliers_3 = load_multipliers(mults[2]); + const poly64x2_t multipliers_1 = load_multipliers(mults[3]); + const size_t align = -(uintptr_t)p & 15; + const uint8x16_t *vp; + + /* Align p to the next 16-byte boundary. 
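+ * ('align' is -(uintptr_t)p & 15, i.e. at most 15, so each of the
+ * byte/half/word/doubleword steps below runs at most once; e.g. an address
+ * ending in 0x9 needs 16 - 9 = 7 = 1 + 2 + 4 bytes of scalar crc32 work
+ * before the 16-byte vector loads start.)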
*/ + if (align) { + if (align & 1) + crc = __crc32b(crc, *p++); + if (align & 2) { + crc = __crc32h(crc, le16_bswap(*(u16 *)p)); + p += 2; + } + if (align & 4) { + crc = __crc32w(crc, le32_bswap(*(u32 *)p)); + p += 4; + } + if (align & 8) { + crc = __crc32d(crc, le64_bswap(*(u64 *)p)); + p += 8; + } + len -= align; + } + vp = (const uint8x16_t *)p; + v0 = veorq_u8(*vp++, u32_to_bytevec(crc)); + v1 = *vp++; + v2 = *vp++; + v3 = *vp++; + v4 = *vp++; + v5 = *vp++; + v6 = *vp++; + v7 = *vp++; + v8 = *vp++; + v9 = *vp++; + v10 = *vp++; + v11 = *vp++; + len -= 192; + /* Fold 192 bytes (12 vectors) at a time. */ + do { + v0 = fold_vec(v0, *vp++, multipliers_12); + v1 = fold_vec(v1, *vp++, multipliers_12); + v2 = fold_vec(v2, *vp++, multipliers_12); + v3 = fold_vec(v3, *vp++, multipliers_12); + v4 = fold_vec(v4, *vp++, multipliers_12); + v5 = fold_vec(v5, *vp++, multipliers_12); + v6 = fold_vec(v6, *vp++, multipliers_12); + v7 = fold_vec(v7, *vp++, multipliers_12); + v8 = fold_vec(v8, *vp++, multipliers_12); + v9 = fold_vec(v9, *vp++, multipliers_12); + v10 = fold_vec(v10, *vp++, multipliers_12); + v11 = fold_vec(v11, *vp++, multipliers_12); + len -= 192; + } while (len >= 192); + + /* + * Fewer than 192 bytes left. Fold v0-v11 down to just v0, + * while processing up to 144 more bytes. + */ + v0 = fold_vec(v0, v6, multipliers_6); + v1 = fold_vec(v1, v7, multipliers_6); + v2 = fold_vec(v2, v8, multipliers_6); + v3 = fold_vec(v3, v9, multipliers_6); + v4 = fold_vec(v4, v10, multipliers_6); + v5 = fold_vec(v5, v11, multipliers_6); + if (len >= 96) { + v0 = fold_vec(v0, *vp++, multipliers_6); + v1 = fold_vec(v1, *vp++, multipliers_6); + v2 = fold_vec(v2, *vp++, multipliers_6); + v3 = fold_vec(v3, *vp++, multipliers_6); + v4 = fold_vec(v4, *vp++, multipliers_6); + v5 = fold_vec(v5, *vp++, multipliers_6); + len -= 96; + } + v0 = fold_vec(v0, v3, multipliers_3); + v1 = fold_vec(v1, v4, multipliers_3); + v2 = fold_vec(v2, v5, multipliers_3); + if (len >= 48) { + v0 = fold_vec(v0, *vp++, multipliers_3); + v1 = fold_vec(v1, *vp++, multipliers_3); + v2 = fold_vec(v2, *vp++, multipliers_3); + len -= 48; + } + v0 = fold_vec(v0, v1, multipliers_1); + v0 = fold_vec(v0, v2, multipliers_1); + p = (const u8 *)vp; + } + /* Reduce 128 to 32 bits using crc32 instructions. */ + crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0)); + crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1)); +tail: + /* Finish up the remainder using crc32 instructions. 
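+ * (The bit tests below decompose the leftover byte count; e.g. 45 remaining
+ * bytes take the 32-, 8-, 4- and 1-byte branches, since 45 = 32 + 8 + 4 + 1,
+ * so each power of two is handled at most once.)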
*/ + if (len & 32) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + crc = __crc32d(crc, get_unaligned_le64(p + 16)); + crc = __crc32d(crc, get_unaligned_le64(p + 24)); + p += 32; + } + if (len & 16) { + crc = __crc32d(crc, get_unaligned_le64(p + 0)); + crc = __crc32d(crc, get_unaligned_le64(p + 8)); + p += 16; + } + if (len & 8) { + crc = __crc32d(crc, get_unaligned_le64(p)); + p += 8; + } + if (len & 4) { + crc = __crc32w(crc, get_unaligned_le32(p)); + p += 4; + } + if (len & 2) { + crc = __crc32h(crc, get_unaligned_le16(p)); + p += 2; + } + if (len & 1) + crc = __crc32b(crc, *p); + return crc; +} + +#undef SUFFIX +#undef ATTRIBUTES +#undef ENABLE_EOR3 diff --git a/Sources/DEFLATE/matchfinder_impl.h b/Sources/DEFLATE/arm/matchfinder_impl.h similarity index 66% rename from Sources/DEFLATE/matchfinder_impl.h rename to Sources/DEFLATE/arm/matchfinder_impl.h index b20f56a3..9917da4a 100644 --- a/Sources/DEFLATE/matchfinder_impl.h +++ b/Sources/DEFLATE/arm/matchfinder_impl.h @@ -35,42 +35,42 @@ static forceinline void matchfinder_init_neon(mf_pos_t *data, size_t size) { - int16x8_t *p = (int16x8_t *)data; - int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL); - - STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); - STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - - do { - p[0] = v; - p[1] = v; - p[2] = v; - p[3] = v; - p += 4; - size -= 4 * sizeof(*p); - } while (size != 0); + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = vdupq_n_s16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); } #define matchfinder_init matchfinder_init_neon static forceinline void matchfinder_rebase_neon(mf_pos_t *data, size_t size) { - int16x8_t *p = (int16x8_t *)data; - int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE); - - STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); - STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); - STATIC_ASSERT(sizeof(mf_pos_t) == 2); - - do { - p[0] = vqaddq_s16(p[0], v); - p[1] = vqaddq_s16(p[1], v); - p[2] = vqaddq_s16(p[2], v); - p[3] = vqaddq_s16(p[3], v); - p += 4; - size -= 4 * sizeof(*p); - } while (size != 0); + int16x8_t *p = (int16x8_t *)data; + int16x8_t v = vdupq_n_s16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = vqaddq_s16(p[0], v); + p[1] = vqaddq_s16(p[1], v); + p[2] = vqaddq_s16(p[2], v); + p[3] = vqaddq_s16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); } #define matchfinder_rebase matchfinder_rebase_neon diff --git a/Sources/DEFLATE/cpu_features.c b/Sources/DEFLATE/cpu_features.c deleted file mode 100644 index 72ab03da..00000000 --- a/Sources/DEFLATE/cpu_features.c +++ /dev/null @@ -1,212 +0,0 @@ -/* - * arm/cpu_features.c - feature detection for ARM CPUs - * - * Copyright 2018 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, 
merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * ARM CPUs don't have a standard way for unprivileged programs to detect CPU - * features. But an OS-specific way can be used when available. - */ - -#ifdef __APPLE__ -# undef _ANSI_SOURCE -# undef _DARWIN_C_SOURCE -# define _DARWIN_C_SOURCE /* for sysctlbyname() */ -#endif - -#include "../cpu_features_common.h" /* must be included first */ -#include "cpu_features.h" - -#if HAVE_DYNAMIC_ARM_CPU_FEATURES - -#ifdef __linux__ -/* - * On Linux, arm32 and arm64 CPU features can be detected by reading the - * AT_HWCAP and AT_HWCAP2 values from /proc/self/auxv. - * - * Ideally we'd use the C library function getauxval(), but it's not guaranteed - * to be available: it was only added to glibc in 2.16, and in Android it was - * added to API level 18 for arm32 and level 21 for arm64. - */ - -#include -#include -#include -#include - -#define AT_HWCAP 16 -#define AT_HWCAP2 26 - -static void scan_auxv(unsigned long *hwcap, unsigned long *hwcap2) -{ - int fd; - unsigned long auxbuf[32]; - int filled = 0; - int i; - - fd = open("/proc/self/auxv", O_RDONLY); - if (fd < 0) - return; - - for (;;) { - do { - int ret = read(fd, &((char *)auxbuf)[filled], - sizeof(auxbuf) - filled); - if (ret <= 0) { - if (ret < 0 && errno == EINTR) - continue; - goto out; - } - filled += ret; - } while (filled < 2 * sizeof(long)); - - i = 0; - do { - unsigned long type = auxbuf[i]; - unsigned long value = auxbuf[i + 1]; - - if (type == AT_HWCAP) - *hwcap = value; - else if (type == AT_HWCAP2) - *hwcap2 = value; - i += 2; - filled -= 2 * sizeof(long); - } while (filled >= 2 * sizeof(long)); - - memmove(auxbuf, &auxbuf[i], filled); - } -out: - close(fd); -} - -static u32 query_arm_cpu_features(void) -{ - u32 features = 0; - unsigned long hwcap = 0; - unsigned long hwcap2 = 0; - - scan_auxv(&hwcap, &hwcap2); - -#ifdef ARCH_ARM32 - STATIC_ASSERT(sizeof(long) == 4); - if (hwcap & (1 << 12)) /* HWCAP_NEON */ - features |= ARM_CPU_FEATURE_NEON; - if (hwcap2 & (1 << 1)) /* HWCAP2_PMULL */ - features |= ARM_CPU_FEATURE_PMULL; - if (hwcap2 & (1 << 4)) /* HWCAP2_CRC32 */ - features |= ARM_CPU_FEATURE_CRC32; -#else - STATIC_ASSERT(sizeof(long) == 8); - if (hwcap & (1 << 1)) /* HWCAP_ASIMD */ - features |= ARM_CPU_FEATURE_NEON; - if (hwcap & (1 << 4)) /* HWCAP_PMULL */ - features |= ARM_CPU_FEATURE_PMULL; - if (hwcap & (1 << 7)) /* HWCAP_CRC32 */ - features |= ARM_CPU_FEATURE_CRC32; - if (hwcap & (1 << 17)) /* HWCAP_SHA3 */ - features |= ARM_CPU_FEATURE_SHA3; - if (hwcap & (1 << 20)) /* HWCAP_ASIMDDP */ - features |= ARM_CPU_FEATURE_DOTPROD; -#endif - return features; -} - -#elif defined(__APPLE__) -/* On Apple platforms, arm64 CPU features can be detected via sysctlbyname(). 
*/ - -#include -#include - -static const struct { - const char *name; - u32 feature; -} feature_sysctls[] = { - { "hw.optional.neon", ARM_CPU_FEATURE_NEON }, - { "hw.optional.AdvSIMD", ARM_CPU_FEATURE_NEON }, - { "hw.optional.arm.FEAT_PMULL", ARM_CPU_FEATURE_PMULL }, - { "hw.optional.armv8_crc32", ARM_CPU_FEATURE_CRC32 }, - { "hw.optional.armv8_2_sha3", ARM_CPU_FEATURE_SHA3 }, - { "hw.optional.arm.FEAT_SHA3", ARM_CPU_FEATURE_SHA3 }, - { "hw.optional.arm.FEAT_DotProd", ARM_CPU_FEATURE_DOTPROD }, -}; - -static u32 query_arm_cpu_features(void) -{ - u32 features = 0; - size_t i; - - for (i = 0; i < ARRAY_LEN(feature_sysctls); i++) { - const char *name = feature_sysctls[i].name; - u32 val = 0; - size_t valsize = sizeof(val); - - if (sysctlbyname(name, &val, &valsize, NULL, 0) == 0 && - valsize == sizeof(val) && val == 1) - features |= feature_sysctls[i].feature; - } - return features; -} -#elif defined(_WIN32) - -#include - -static u32 query_arm_cpu_features(void) -{ - u32 features = ARM_CPU_FEATURE_NEON; - - if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) - features |= ARM_CPU_FEATURE_PMULL; - if (IsProcessorFeaturePresent(PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE)) - features |= ARM_CPU_FEATURE_CRC32; - - /* FIXME: detect SHA3 and DOTPROD support too. */ - - return features; -} -#else -#error "unhandled case" -#endif - -static const struct cpu_feature arm_cpu_feature_table[] = { - {ARM_CPU_FEATURE_NEON, "neon"}, - {ARM_CPU_FEATURE_PMULL, "pmull"}, - {ARM_CPU_FEATURE_CRC32, "crc32"}, - {ARM_CPU_FEATURE_SHA3, "sha3"}, - {ARM_CPU_FEATURE_DOTPROD, "dotprod"}, -}; - -volatile u32 libdeflate_arm_cpu_features = 0; - -void libdeflate_init_arm_cpu_features(void) -{ - u32 features = query_arm_cpu_features(); - - disable_cpu_features_for_testing(&features, arm_cpu_feature_table, - ARRAY_LEN(arm_cpu_feature_table)); - - libdeflate_arm_cpu_features = features | ARM_CPU_FEATURES_KNOWN; -} - -#endif /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ diff --git a/Sources/DEFLATE/cpu_features.h b/Sources/DEFLATE/cpu_features.h deleted file mode 100644 index c55f007c..00000000 --- a/Sources/DEFLATE/cpu_features.h +++ /dev/null @@ -1,265 +0,0 @@ -/* - * arm/cpu_features.h - feature detection for ARM CPUs - * - * Copyright 2018 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#ifndef LIB_ARM_CPU_FEATURES_H -#define LIB_ARM_CPU_FEATURES_H - -#include "../lib_common.h" - -#define HAVE_DYNAMIC_ARM_CPU_FEATURES 0 - -#if defined(ARCH_ARM32) || defined(ARCH_ARM64) - -#if !defined(FREESTANDING) && \ - (COMPILER_SUPPORTS_TARGET_FUNCTION_ATTRIBUTE || defined(_MSC_VER)) && \ - (defined(__linux__) || \ - (defined(__APPLE__) && defined(ARCH_ARM64)) || \ - (defined(_WIN32) && defined(ARCH_ARM64))) -# undef HAVE_DYNAMIC_ARM_CPU_FEATURES -# define HAVE_DYNAMIC_ARM_CPU_FEATURES 1 -#endif - -#define ARM_CPU_FEATURE_NEON 0x00000001 -#define ARM_CPU_FEATURE_PMULL 0x00000002 -#define ARM_CPU_FEATURE_CRC32 0x00000004 -#define ARM_CPU_FEATURE_SHA3 0x00000008 -#define ARM_CPU_FEATURE_DOTPROD 0x00000010 - -#define HAVE_NEON(features) (HAVE_NEON_NATIVE || ((features) & ARM_CPU_FEATURE_NEON)) -#define HAVE_PMULL(features) (HAVE_PMULL_NATIVE || ((features) & ARM_CPU_FEATURE_PMULL)) -#define HAVE_CRC32(features) (HAVE_CRC32_NATIVE || ((features) & ARM_CPU_FEATURE_CRC32)) -#define HAVE_SHA3(features) (HAVE_SHA3_NATIVE || ((features) & ARM_CPU_FEATURE_SHA3)) -#define HAVE_DOTPROD(features) (HAVE_DOTPROD_NATIVE || ((features) & ARM_CPU_FEATURE_DOTPROD)) - -#if HAVE_DYNAMIC_ARM_CPU_FEATURES -#define ARM_CPU_FEATURES_KNOWN 0x80000000 -extern volatile u32 libdeflate_arm_cpu_features; - -void libdeflate_init_arm_cpu_features(void); - -static inline u32 get_arm_cpu_features(void) -{ - if (libdeflate_arm_cpu_features == 0) - libdeflate_init_arm_cpu_features(); - return libdeflate_arm_cpu_features; -} -#else /* HAVE_DYNAMIC_ARM_CPU_FEATURES */ -static inline u32 get_arm_cpu_features(void) { return 0; } -#endif /* !HAVE_DYNAMIC_ARM_CPU_FEATURES */ - -/* NEON */ -#if defined(__ARM_NEON) || defined(ARCH_ARM64) -# define HAVE_NEON_NATIVE 1 -#else -# define HAVE_NEON_NATIVE 0 -#endif -/* - * With both gcc and clang, NEON intrinsics require that the main target has - * NEON enabled already. Exception: with gcc 6.1 and later (r230411 for arm32, - * r226563 for arm64), hardware floating point support is sufficient. - */ -#if HAVE_NEON_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && GCC_PREREQ(6, 1) && defined(__ARM_FP)) -# define HAVE_NEON_INTRIN 1 -#else -# define HAVE_NEON_INTRIN 0 -#endif - -/* PMULL */ -#ifdef __ARM_FEATURE_CRYPTO -# define HAVE_PMULL_NATIVE 1 -#else -# define HAVE_PMULL_NATIVE 0 -#endif -#if HAVE_PMULL_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - HAVE_NEON_INTRIN /* needed to exclude soft float arm32 case */ && \ - (GCC_PREREQ(6, 1) || CLANG_PREREQ(3, 5, 6010000) || \ - defined(_MSC_VER)) && \ - /* - * On arm32 with clang, the crypto intrinsics (which include pmull) - * are not defined, even when using -mfpu=crypto-neon-fp-armv8, - * because clang's puts their definitions behind - * __aarch64__. - */ \ - !(defined(ARCH_ARM32) && defined(__clang__))) -# define HAVE_PMULL_INTRIN CPU_IS_LITTLE_ENDIAN() /* untested on big endian */ - /* Work around MSVC's vmull_p64() taking poly64x1_t instead of poly64_t */ -# ifdef _MSC_VER -# define compat_vmull_p64(a, b) vmull_p64(vcreate_p64(a), vcreate_p64(b)) -# else -# define compat_vmull_p64(a, b) vmull_p64((a), (b)) -# endif -#else -# define HAVE_PMULL_INTRIN 0 -#endif -/* - * Set USE_PMULL_TARGET_EVEN_IF_NATIVE if a workaround for a gcc bug that was - * fixed by commit 11a113d501ff ("aarch64: Simplify feature definitions") in gcc - * 13 is needed. 
A minimal program that fails to build due to this bug when - * compiled with -mcpu=emag, at least with gcc 10 through 12, is: - * - * static inline __attribute__((always_inline,target("+crypto"))) void f() {} - * void g() { f(); } - * - * The error is: - * - * error: inlining failed in call to ‘always_inline’ ‘f’: target specific option mismatch - * - * The workaround is to explicitly add the crypto target to the non-inline - * function g(), even though this should not be required due to -mcpu=emag - * enabling 'crypto' natively and causing __ARM_FEATURE_CRYPTO to be defined. - */ -#if HAVE_PMULL_NATIVE && defined(ARCH_ARM64) && \ - GCC_PREREQ(6, 1) && !GCC_PREREQ(13, 1) -# define USE_PMULL_TARGET_EVEN_IF_NATIVE 1 -#else -# define USE_PMULL_TARGET_EVEN_IF_NATIVE 0 -#endif - -/* CRC32 */ -#ifdef __ARM_FEATURE_CRC32 -# define HAVE_CRC32_NATIVE 1 -#else -# define HAVE_CRC32_NATIVE 0 -#endif -#undef HAVE_CRC32_INTRIN -#if HAVE_CRC32_NATIVE -# define HAVE_CRC32_INTRIN 1 -#elif HAVE_DYNAMIC_ARM_CPU_FEATURES -# if GCC_PREREQ(1, 0) - /* - * Support for ARM CRC32 intrinsics when CRC32 instructions are not enabled - * in the main target has been affected by two gcc bugs, which we must avoid - * by only allowing gcc versions that have the corresponding fixes. First, - * gcc commit 943766d37ae4 ("[arm] Fix use of CRC32 intrinsics with Armv8-a - * and hard-float"), i.e. gcc 8.4+, 9.3+, 10.1+, or 11+, is needed. Second, - * gcc commit c1cdabe3aab8 ("arm: reorder assembler architecture directives - * [PR101723]"), i.e. gcc 9.5+, 10.4+, 11.3+, or 12+, is needed when - * binutils is 2.34 or later, due to - * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=104439. We use the second - * set of prerequisites, as they are stricter and we have no way to detect - * the binutils version directly from a C source file. - * - * Also exclude the cases where the main target arch is armv6kz or armv7e-m. - * In those cases, gcc doesn't let functions that use the main arch be - * inlined into functions that are targeted to armv8-a+crc. (armv8-a is - * necessary for crc to be accepted at all.) That causes build errors. - * This issue happens for these specific sub-archs because they are not a - * subset of armv8-a. Note: clang does not have this limitation. 
- */ -# if (GCC_PREREQ(11, 3) || \ - (GCC_PREREQ(10, 4) && !GCC_PREREQ(11, 0)) || \ - (GCC_PREREQ(9, 5) && !GCC_PREREQ(10, 0))) && \ - !defined(__ARM_ARCH_6KZ__) && \ - !defined(__ARM_ARCH_7EM__) -# define HAVE_CRC32_INTRIN 1 -# endif -# elif CLANG_PREREQ(3, 4, 6000000) -# define HAVE_CRC32_INTRIN 1 -# elif defined(_MSC_VER) -# define HAVE_CRC32_INTRIN 1 -# endif -#endif -#ifndef HAVE_CRC32_INTRIN -# define HAVE_CRC32_INTRIN 0 -#endif - -/* SHA3 (needed for the eor3 instruction) */ -#if defined(ARCH_ARM64) && !defined(_MSC_VER) -# ifdef __ARM_FEATURE_SHA3 -# define HAVE_SHA3_NATIVE 1 -# else -# define HAVE_SHA3_NATIVE 0 -# endif -# define HAVE_SHA3_TARGET (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) /* r256478 */ || \ - CLANG_PREREQ(7, 0, 10010463) /* r338010 */)) -# define HAVE_SHA3_INTRIN (HAVE_NEON_INTRIN && \ - (HAVE_SHA3_NATIVE || HAVE_SHA3_TARGET) && \ - (GCC_PREREQ(9, 1) /* r268049 */ || \ - CLANG_PREREQ(13, 0, 13160000))) -#else -# define HAVE_SHA3_NATIVE 0 -# define HAVE_SHA3_TARGET 0 -# define HAVE_SHA3_INTRIN 0 -#endif - -/* dotprod */ -#ifdef ARCH_ARM64 -# ifdef __ARM_FEATURE_DOTPROD -# define HAVE_DOTPROD_NATIVE 1 -# else -# define HAVE_DOTPROD_NATIVE 0 -# endif -# if HAVE_DOTPROD_NATIVE || \ - (HAVE_DYNAMIC_ARM_CPU_FEATURES && \ - (GCC_PREREQ(8, 1) || CLANG_PREREQ(7, 0, 10010000) || \ - defined(_MSC_VER))) -# define HAVE_DOTPROD_INTRIN 1 -# else -# define HAVE_DOTPROD_INTRIN 0 -# endif -#else -# define HAVE_DOTPROD_NATIVE 0 -# define HAVE_DOTPROD_INTRIN 0 -#endif - -/* - * Work around bugs in arm_acle.h and arm_neon.h where sometimes intrinsics are - * only defined when the corresponding __ARM_FEATURE_* macro is defined. The - * intrinsics actually work in target attribute functions too if they are - * defined, though, so work around this by temporarily defining the - * corresponding __ARM_FEATURE_* macros while including the headers. - */ -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(ARCH_ARM32)) -# define __ARM_FEATURE_CRC32 1 -#endif -#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) -# define __ARM_FEATURE_SHA3 1 -#endif -#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) -# define __ARM_FEATURE_DOTPROD 1 -#endif -#if HAVE_CRC32_INTRIN && !HAVE_CRC32_NATIVE && \ - (defined(__clang__) || defined(ARCH_ARM32)) -# include -# undef __ARM_FEATURE_CRC32 -#endif -#if HAVE_SHA3_INTRIN && !HAVE_SHA3_NATIVE && defined(__clang__) -# include -# undef __ARM_FEATURE_SHA3 -#endif -#if HAVE_DOTPROD_INTRIN && !HAVE_DOTPROD_NATIVE && defined(__clang__) -# include -# undef __ARM_FEATURE_DOTPROD -#endif - -#endif /* ARCH_ARM32 || ARCH_ARM64 */ - -#endif /* LIB_ARM_CPU_FEATURES_H */ diff --git a/Sources/DEFLATE/cpu_features_common.h b/Sources/DEFLATE/cpu_features_common.h index d0531d5c..04b9cedf 100644 --- a/Sources/DEFLATE/cpu_features_common.h +++ b/Sources/DEFLATE/cpu_features_common.h @@ -29,7 +29,7 @@ #define LIB_CPU_FEATURES_COMMON_H #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) - /* for strdup() and strtok_r() */ +/* for strdup() and strtok_r() */ # undef _ANSI_SOURCE # ifndef __APPLE__ # undef _GNU_SOURCE @@ -43,49 +43,49 @@ #include "lib_common.h" struct cpu_feature { - u32 bit; - const char *name; + u32 bit; + const char *name; }; #if defined(TEST_SUPPORT__DO_NOT_USE) && !defined(FREESTANDING) /* Disable any features that are listed in $LIBDEFLATE_DISABLE_CPU_FEATURES. 
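 * (Usage sketch, for test builds only and only when TEST_SUPPORT__DO_NOT_USE
 * is defined: setting e.g. LIBDEFLATE_DISABLE_CPU_FEATURES=crc32,pmull before
 * running forces the portable fallbacks; the accepted names are whatever the
 * per-architecture feature tables pass into this function.)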
*/ static inline void disable_cpu_features_for_testing(u32 *features, - const struct cpu_feature *feature_table, - size_t feature_table_length) + const struct cpu_feature *feature_table, + size_t feature_table_length) { - char *env_value, *strbuf, *p, *saveptr = NULL; - size_t i; - - env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); - if (!env_value) - return; - strbuf = strdup(env_value); - if (!strbuf) - abort(); - p = strtok_r(strbuf, ",", &saveptr); - while (p) { - for (i = 0; i < feature_table_length; i++) { - if (strcmp(p, feature_table[i].name) == 0) { - *features &= ~feature_table[i].bit; - break; - } - } - if (i == feature_table_length) { - fprintf(stderr, - "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", - p); - abort(); - } - p = strtok_r(NULL, ",", &saveptr); - } - free(strbuf); + char *env_value, *strbuf, *p, *saveptr = NULL; + size_t i; + + env_value = getenv("LIBDEFLATE_DISABLE_CPU_FEATURES"); + if (!env_value) + return; + strbuf = strdup(env_value); + if (!strbuf) + abort(); + p = strtok_r(strbuf, ",", &saveptr); + while (p) { + for (i = 0; i < feature_table_length; i++) { + if (strcmp(p, feature_table[i].name) == 0) { + *features &= ~feature_table[i].bit; + break; + } + } + if (i == feature_table_length) { + fprintf(stderr, + "unrecognized feature in LIBDEFLATE_DISABLE_CPU_FEATURES: \"%s\"\n", + p); + abort(); + } + p = strtok_r(NULL, ",", &saveptr); + } + free(strbuf); } #else /* TEST_SUPPORT__DO_NOT_USE */ static inline void disable_cpu_features_for_testing(u32 *features, - const struct cpu_feature *feature_table, - size_t feature_table_length) + const struct cpu_feature *feature_table, + size_t feature_table_length) { } #endif /* !TEST_SUPPORT__DO_NOT_USE */ diff --git a/Sources/DEFLATE/crc32.c b/Sources/DEFLATE/crc32.c index 213dd665..24a15418 100644 --- a/Sources/DEFLATE/crc32.c +++ b/Sources/DEFLATE/crc32.c @@ -33,7 +33,7 @@ * polynomial M(x) with coefficients in GF(2) (the field of integers modulo 2), * where the coefficient of 'x^i' is 'bits[len - i]'. Then, compute: * - * R(x) = M(x)*x^n mod G(x) + * R(x) = M(x)*x^n mod G(x) * * where G(x) is a selected "generator" polynomial of degree 'n'. The remainder * R(x) is a polynomial of max degree 'n - 1'. The CRC of 'bits' is R(x) @@ -44,17 +44,17 @@ * * In the gzip format (RFC 1952): * - * - The bitstring to checksum is formed from the bytes of the uncompressed - * data by concatenating the bits from the bytes in order, proceeding - * from the low-order bit to the high-order bit within each byte. + * - The bitstring to checksum is formed from the bytes of the uncompressed + * data by concatenating the bits from the bytes in order, proceeding + * from the low-order bit to the high-order bit within each byte. * - * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 + - * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1. - * Consequently, the CRC length is 32 bits ("CRC-32"). + * - The generator polynomial G(x) is: x^32 + x^26 + x^23 + x^22 + x^16 + + * x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1. + * Consequently, the CRC length is 32 bits ("CRC-32"). * - * - The highest order 32 coefficients of M(x)*x^n are inverted. + * - The highest order 32 coefficients of M(x)*x^n are inverted. * - * - All 32 coefficients of R(x) are inverted. + * - All 32 coefficients of R(x) are inverted. 
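 *
 *   (Sanity-check value: with this definition, the CRC-32 of the 9-byte
 *   ASCII string "123456789" is 0xCBF43926, the customary check value for
 *   this polynomial.)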
* * The two inversions cause added leading and trailing zero bits to affect the * resulting CRC, whereas with a regular CRC such bits would have no effect on @@ -70,35 +70,35 @@ * subtraction can be implemented as bitwise exclusive OR (since we are working * in GF(2)). Here is an unoptimized implementation: * - * static u32 crc32_gzip(const u8 *p, size_t len) - * { - * u32 crc = 0; - * const u32 divisor = 0xEDB88320; + * static u32 crc32_gzip(const u8 *p, size_t len) + * { + * u32 crc = 0; + * const u32 divisor = 0xEDB88320; * - * for (size_t i = 0; i < len * 8 + 32; i++) { - * int bit; - * u32 multiple; + * for (size_t i = 0; i < len * 8 + 32; i++) { + * int bit; + * u32 multiple; * - * if (i < len * 8) - * bit = (p[i / 8] >> (i % 8)) & 1; - * else - * bit = 0; // one of the 32 appended 0 bits + * if (i < len * 8) + * bit = (p[i / 8] >> (i % 8)) & 1; + * else + * bit = 0; // one of the 32 appended 0 bits * - * if (i < 32) // the first 32 bits are inverted - * bit ^= 1; + * if (i < 32) // the first 32 bits are inverted + * bit ^= 1; * - * if (crc & 1) - * multiple = divisor; - * else - * multiple = 0; + * if (crc & 1) + * multiple = divisor; + * else + * multiple = 0; * - * crc >>= 1; - * crc |= (u32)bit << 31; - * crc ^= multiple; - * } + * crc >>= 1; + * crc |= (u32)bit << 31; + * crc ^= multiple; + * } * - * return ~crc; - * } + * return ~crc; + * } * * In this implementation, the 32-bit integer 'crc' maintains the remainder of * the currently processed portion of the message (with 32 zero bits appended) @@ -114,27 +114,27 @@ * 'multiple' until 32 bits later, we need not actually add each message bit * until that point: * - * static u32 crc32_gzip(const u8 *p, size_t len) - * { - * u32 crc = ~0; - * const u32 divisor = 0xEDB88320; + * static u32 crc32_gzip(const u8 *p, size_t len) + * { + * u32 crc = ~0; + * const u32 divisor = 0xEDB88320; * - * for (size_t i = 0; i < len * 8; i++) { - * int bit; - * u32 multiple; + * for (size_t i = 0; i < len * 8; i++) { + * int bit; + * u32 multiple; * - * bit = (p[i / 8] >> (i % 8)) & 1; - * crc ^= bit; - * if (crc & 1) - * multiple = divisor; - * else - * multiple = 0; - * crc >>= 1; - * crc ^= multiple; - * } + * bit = (p[i / 8] >> (i % 8)) & 1; + * crc ^= bit; + * if (crc & 1) + * multiple = divisor; + * else + * multiple = 0; + * crc >>= 1; + * crc ^= multiple; + * } * - * return ~crc; - * } + * return ~crc; + * } * * With the above implementation we get the effect of 32 appended 0 bits for * free; they never affect the choice of a divisor, nor would they change the @@ -165,7 +165,7 @@ * intermediate remainder (which we never actually store explicitly) is 96 bits. * * On CPUs that support fast carryless multiplication, CRCs can be computed even - * more quickly via "folding". See e.g. the x86 PCLMUL implementation. + * more quickly via "folding". See e.g. the x86 PCLMUL implementations. 
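 * (For scale: the slice-by-8 fallback below needs 8 tables of 256 u32
 * entries, i.e. 8 * 256 * 4 = 8 KiB of lookup tables, whereas the pmull
 * folding implementations only carry a few 16-byte multiplier constants.)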
*/ #include "lib_common.h" @@ -176,31 +176,31 @@ static u32 MAYBE_UNUSED crc32_slice8(u32 crc, const u8 *p, size_t len) { - const u8 * const end = p + len; - const u8 *end64; - - for (; ((uintptr_t)p & 7) && p != end; p++) - crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; - - end64 = p + ((end - p) & ~7); - for (; p != end64; p += 8) { - u32 v1 = le32_bswap(*(const u32 *)(p + 0)); - u32 v2 = le32_bswap(*(const u32 *)(p + 4)); - - crc = crc32_slice8_table[0x700 + (u8)((crc ^ v1) >> 0)] ^ - crc32_slice8_table[0x600 + (u8)((crc ^ v1) >> 8)] ^ - crc32_slice8_table[0x500 + (u8)((crc ^ v1) >> 16)] ^ - crc32_slice8_table[0x400 + (u8)((crc ^ v1) >> 24)] ^ - crc32_slice8_table[0x300 + (u8)(v2 >> 0)] ^ - crc32_slice8_table[0x200 + (u8)(v2 >> 8)] ^ - crc32_slice8_table[0x100 + (u8)(v2 >> 16)] ^ - crc32_slice8_table[0x000 + (u8)(v2 >> 24)]; - } - - for (; p != end; p++) - crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; - - return crc; + const u8 * const end = p + len; + const u8 *end64; + + for (; ((uintptr_t)p & 7) && p != end; p++) + crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; + + end64 = p + ((end - p) & ~7); + for (; p != end64; p += 8) { + u32 v1 = le32_bswap(*(const u32 *)(p + 0)); + u32 v2 = le32_bswap(*(const u32 *)(p + 4)); + + crc = crc32_slice8_table[0x700 + (u8)((crc ^ v1) >> 0)] ^ + crc32_slice8_table[0x600 + (u8)((crc ^ v1) >> 8)] ^ + crc32_slice8_table[0x500 + (u8)((crc ^ v1) >> 16)] ^ + crc32_slice8_table[0x400 + (u8)((crc ^ v1) >> 24)] ^ + crc32_slice8_table[0x300 + (u8)(v2 >> 0)] ^ + crc32_slice8_table[0x200 + (u8)(v2 >> 8)] ^ + crc32_slice8_table[0x100 + (u8)(v2 >> 16)] ^ + crc32_slice8_table[0x000 + (u8)(v2 >> 24)]; + } + + for (; p != end; p++) + crc = (crc >> 8) ^ crc32_slice8_table[(u8)crc ^ *p]; + + return crc; } /* @@ -211,11 +211,11 @@ crc32_slice8(u32 crc, const u8 *p, size_t len) static forceinline u32 MAYBE_UNUSED crc32_slice1(u32 crc, const u8 *p, size_t len) { - size_t i; - - for (i = 0; i < len; i++) - crc = (crc >> 8) ^ crc32_slice1_table[(u8)crc ^ p[i]]; - return crc; + size_t i; + + for (i = 0; i < len; i++) + crc = (crc >> 8) ^ crc32_slice1_table[(u8)crc ^ p[i]]; + return crc; } /* Include architecture-specific implementation(s) if available. */ @@ -223,7 +223,7 @@ crc32_slice1(u32 crc, const u8 *p, size_t len) #undef arch_select_crc32_func typedef u32 (*crc32_func_t)(u32 crc, const u8 *p, size_t len); #if defined(ARCH_ARM32) || defined(ARCH_ARM64) -# include "crc32_impl.h" +# include "arm/crc32_impl.h" #elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/crc32_impl.h" #endif @@ -240,13 +240,13 @@ static volatile crc32_func_t crc32_impl = dispatch_crc32; /* Choose the best implementation at runtime. */ static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len) { - crc32_func_t f = arch_select_crc32_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - crc32_impl = f; - return f(crc, p, len); + crc32_func_t f = arch_select_crc32_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + crc32_impl = f; + return f(crc, p, len); } #else /* The best implementation is statically known, so call it directly. */ @@ -256,7 +256,7 @@ static u32 dispatch_crc32(u32 crc, const u8 *p, size_t len) LIBDEFLATEAPI u32 libdeflate_crc32(u32 crc, const void *p, size_t len) { - if (p == NULL) /* Return initial value. */ - return 0; - return ~crc32_impl(~crc, p, len); + if (p == NULL) /* Return initial value. 
*/ + return 0; + return ~crc32_impl(~crc, p, len); } diff --git a/Sources/DEFLATE/crc32_impl.h b/Sources/DEFLATE/crc32_impl.h deleted file mode 100644 index c802cdf0..00000000 --- a/Sources/DEFLATE/crc32_impl.h +++ /dev/null @@ -1,682 +0,0 @@ -/* - * arm/crc32_impl.h - ARM implementations of the gzip CRC-32 algorithm - * - * Copyright 2022 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#ifndef LIB_ARM_CRC32_IMPL_H -#define LIB_ARM_CRC32_IMPL_H - -#include "cpu_features.h" - -/* - * crc32_arm_crc() - implementation using crc32 instructions (only) - * - * In general this implementation is straightforward. However, naive use of the - * crc32 instructions is serial: one of the two inputs to each crc32 instruction - * is the output of the previous one. To take advantage of CPUs that can - * execute multiple crc32 instructions in parallel, when possible we interleave - * the checksumming of several adjacent chunks, then combine their CRCs. - * - * However, without pmull, combining CRCs is fairly slow. So in this pmull-less - * version, we only use a large chunk length, and thus we only do chunked - * processing if there is a lot of data to checksum. This also means that a - * variable chunk length wouldn't help much, so we just support a fixed length. - */ -#if HAVE_CRC32_INTRIN -# if HAVE_CRC32_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("armv8-a,crc") -# elif defined(__ARM_PCS_VFP) - /* - * +simd is needed to avoid a "selected architecture lacks an FPU" - * error with Debian arm-linux-gnueabihf-gcc when -mfpu is not - * explicitly specified on the command line. - */ -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc+simd") -# else -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc") -# endif -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("crc") -# else -# define ATTRIBUTES _target_attribute("+crc") -# endif -# endif -# endif - -#ifndef _MSC_VER -# include -#endif - -/* - * Combine the CRCs for 4 adjacent chunks of length L = CRC32_FIXED_CHUNK_LEN - * bytes each by computing: - * - * [ crc0*x^(3*8*L) + crc1*x^(2*8*L) + crc2*x^(1*8*L) + crc3 ] mod G(x) - * - * This has been optimized in several ways: - * - * - The needed multipliers (x to some power, reduced mod G(x)) were - * precomputed. - * - * - The 3 multiplications are interleaved. - * - * - The reduction mod G(x) is delayed to the end and done using __crc32d. 
- * Note that the use of __crc32d introduces an extra factor of x^32. To - * cancel that out along with the extra factor of x^1 that gets introduced - * because of how the 63-bit products are aligned in their 64-bit integers, - * the multipliers are actually x^(j*8*L - 33) instead of x^(j*8*L). - */ -static forceinline ATTRIBUTES u32 -combine_crcs_slow(u32 crc0, u32 crc1, u32 crc2, u32 crc3) -{ - u64 res0 = 0, res1 = 0, res2 = 0; - int i; - - /* Multiply crc{0,1,2} by CRC32_FIXED_CHUNK_MULT_{3,2,1}. */ - for (i = 0; i < 32; i++) { - if (CRC32_FIXED_CHUNK_MULT_3 & (1U << i)) - res0 ^= (u64)crc0 << i; - if (CRC32_FIXED_CHUNK_MULT_2 & (1U << i)) - res1 ^= (u64)crc1 << i; - if (CRC32_FIXED_CHUNK_MULT_1 & (1U << i)) - res2 ^= (u64)crc2 << i; - } - /* Add the different parts and reduce mod G(x). */ - return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; -} - -#define crc32_arm_crc crc32_arm_crc -static u32 ATTRIBUTES MAYBE_UNUSED -crc32_arm_crc(u32 crc, const u8 *p, size_t len) -{ - if (len >= 64) { - const size_t align = -(uintptr_t)p & 7; - - /* Align p to the next 8-byte boundary. */ - if (align) { - if (align & 1) - crc = __crc32b(crc, *p++); - if (align & 2) { - crc = __crc32h(crc, le16_bswap(*(u16 *)p)); - p += 2; - } - if (align & 4) { - crc = __crc32w(crc, le32_bswap(*(u32 *)p)); - p += 4; - } - len -= align; - } - /* - * Interleave the processing of multiple adjacent data chunks to - * take advantage of instruction-level parallelism. - * - * Some CPUs don't prefetch the data if it's being fetched in - * multiple interleaved streams, so do explicit prefetching. - */ - while (len >= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN) { - const u64 *wp0 = (const u64 *)p; - const u64 * const wp0_end = - (const u64 *)(p + CRC32_FIXED_CHUNK_LEN); - u32 crc1 = 0, crc2 = 0, crc3 = 0; - - STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); - STATIC_ASSERT(CRC32_FIXED_CHUNK_LEN % (4 * 8) == 0); - do { - prefetchr(&wp0[64 + 0*CRC32_FIXED_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 1*CRC32_FIXED_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 2*CRC32_FIXED_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 3*CRC32_FIXED_CHUNK_LEN/8]); - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_FIXED_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_FIXED_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_FIXED_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_FIXED_CHUNK_LEN/8])); - wp0++; - } while (wp0 != wp0_end); - crc = combine_crcs_slow(crc, crc1, crc2, crc3); - p += CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; - len -= CRC32_NUM_CHUNKS * CRC32_FIXED_CHUNK_LEN; - } - /* - * Due to the large fixed chunk length used above, there might - * still be a lot of data left. 
So use a 64-byte loop here, - * instead of a loop that is less unrolled. - */ - while (len >= 64) { - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 32))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 40))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 48))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 56))); - p += 64; - len -= 64; - } - } - if (len & 32) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - crc = __crc32d(crc, get_unaligned_le64(p + 16)); - crc = __crc32d(crc, get_unaligned_le64(p + 24)); - p += 32; - } - if (len & 16) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - p += 16; - } - if (len & 8) { - crc = __crc32d(crc, get_unaligned_le64(p)); - p += 8; - } - if (len & 4) { - crc = __crc32w(crc, get_unaligned_le32(p)); - p += 4; - } - if (len & 2) { - crc = __crc32h(crc, get_unaligned_le16(p)); - p += 2; - } - if (len & 1) - crc = __crc32b(crc, *p); - return crc; -} -#undef ATTRIBUTES -#endif /* crc32_arm_crc() */ - -/* - * crc32_arm_crc_pmullcombine() - implementation using crc32 instructions, plus - * pmull instructions for CRC combining - * - * This is similar to crc32_arm_crc(), but it enables the use of pmull - * (carryless multiplication) instructions for the steps where the CRCs of - * adjacent data chunks are combined. As this greatly speeds up CRC - * combination, this implementation also differs from crc32_arm_crc() in that it - * uses a variable chunk length which can get fairly small. The precomputed - * multipliers needed for the selected chunk length are loaded from a table. - * - * Note that pmull is used here only for combining the CRCs of separately - * checksummed chunks, not for folding the data itself. See crc32_arm_pmull*() - * for implementations that use pmull for folding the data itself. - */ -#if HAVE_CRC32_INTRIN && HAVE_PMULL_INTRIN -# if HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("arch=armv8-a+crc,fpu=crypto-neon-fp-armv8") -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("crc,aes") -# else -# define ATTRIBUTES _target_attribute("+crc,+crypto") -# endif -# endif -# endif - -#ifndef _MSC_VER -# include <arm_acle.h> -#endif -#include <arm_neon.h> - -/* Do carryless multiplication of two 32-bit values. */ -static forceinline ATTRIBUTES u64 -clmul_u32(u32 a, u32 b) -{ - uint64x2_t res = vreinterpretq_u64_p128( - compat_vmull_p64((poly64_t)a, (poly64_t)b)); - - return vgetq_lane_u64(res, 0); -} - -/* - * Like combine_crcs_slow(), but uses vmull_p64 to do the multiplications more - * quickly, and supports a variable chunk length. The chunk length is - * 'i * CRC32_MIN_VARIABLE_CHUNK_LEN' - * where 1 <= i < ARRAY_LEN(crc32_mults_for_chunklen). 
- */ -static forceinline ATTRIBUTES u32 -combine_crcs_fast(u32 crc0, u32 crc1, u32 crc2, u32 crc3, size_t i) -{ - u64 res0 = clmul_u32(crc0, crc32_mults_for_chunklen[i][0]); - u64 res1 = clmul_u32(crc1, crc32_mults_for_chunklen[i][1]); - u64 res2 = clmul_u32(crc2, crc32_mults_for_chunklen[i][2]); - - return __crc32d(0, res0 ^ res1 ^ res2) ^ crc3; -} - -#define crc32_arm_crc_pmullcombine crc32_arm_crc_pmullcombine -static u32 ATTRIBUTES MAYBE_UNUSED -crc32_arm_crc_pmullcombine(u32 crc, const u8 *p, size_t len) -{ - const size_t align = -(uintptr_t)p & 7; - - if (len >= align + CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { - /* Align p to the next 8-byte boundary. */ - if (align) { - if (align & 1) - crc = __crc32b(crc, *p++); - if (align & 2) { - crc = __crc32h(crc, le16_bswap(*(u16 *)p)); - p += 2; - } - if (align & 4) { - crc = __crc32w(crc, le32_bswap(*(u32 *)p)); - p += 4; - } - len -= align; - } - /* - * Handle CRC32_MAX_VARIABLE_CHUNK_LEN specially, so that better - * code is generated for it. - */ - while (len >= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN) { - const u64 *wp0 = (const u64 *)p; - const u64 * const wp0_end = - (const u64 *)(p + CRC32_MAX_VARIABLE_CHUNK_LEN); - u32 crc1 = 0, crc2 = 0, crc3 = 0; - - STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); - STATIC_ASSERT(CRC32_MAX_VARIABLE_CHUNK_LEN % (4 * 8) == 0); - do { - prefetchr(&wp0[64 + 0*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 1*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 2*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - prefetchr(&wp0[64 + 3*CRC32_MAX_VARIABLE_CHUNK_LEN/8]); - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - crc = __crc32d(crc, le64_bswap(wp0[0*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc1 = __crc32d(crc1, le64_bswap(wp0[1*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc2 = __crc32d(crc2, le64_bswap(wp0[2*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - crc3 = __crc32d(crc3, le64_bswap(wp0[3*CRC32_MAX_VARIABLE_CHUNK_LEN/8])); - wp0++; - } while (wp0 != wp0_end); - crc = combine_crcs_fast(crc, crc1, crc2, crc3, - ARRAY_LEN(crc32_mults_for_chunklen) - 1); - p += CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; - len -= CRC32_NUM_CHUNKS * CRC32_MAX_VARIABLE_CHUNK_LEN; - } - /* Handle up to one variable-length chunk. 
*/ - if (len >= CRC32_NUM_CHUNKS * CRC32_MIN_VARIABLE_CHUNK_LEN) { - const size_t i = len / (CRC32_NUM_CHUNKS * - CRC32_MIN_VARIABLE_CHUNK_LEN); - const size_t chunk_len = - i * CRC32_MIN_VARIABLE_CHUNK_LEN; - const u64 *wp0 = (const u64 *)(p + 0*chunk_len); - const u64 *wp1 = (const u64 *)(p + 1*chunk_len); - const u64 *wp2 = (const u64 *)(p + 2*chunk_len); - const u64 *wp3 = (const u64 *)(p + 3*chunk_len); - const u64 * const wp0_end = wp1; - u32 crc1 = 0, crc2 = 0, crc3 = 0; - - STATIC_ASSERT(CRC32_NUM_CHUNKS == 4); - STATIC_ASSERT(CRC32_MIN_VARIABLE_CHUNK_LEN % (4 * 8) == 0); - do { - prefetchr(wp0 + 64); - prefetchr(wp1 + 64); - prefetchr(wp2 + 64); - prefetchr(wp3 + 64); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - crc = __crc32d(crc, le64_bswap(*wp0++)); - crc1 = __crc32d(crc1, le64_bswap(*wp1++)); - crc2 = __crc32d(crc2, le64_bswap(*wp2++)); - crc3 = __crc32d(crc3, le64_bswap(*wp3++)); - } while (wp0 != wp0_end); - crc = combine_crcs_fast(crc, crc1, crc2, crc3, i); - p += CRC32_NUM_CHUNKS * chunk_len; - len -= CRC32_NUM_CHUNKS * chunk_len; - } - - while (len >= 32) { - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 0))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 8))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 16))); - crc = __crc32d(crc, le64_bswap(*(u64 *)(p + 24))); - p += 32; - len -= 32; - } - } else { - while (len >= 32) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - crc = __crc32d(crc, get_unaligned_le64(p + 16)); - crc = __crc32d(crc, get_unaligned_le64(p + 24)); - p += 32; - len -= 32; - } - } - if (len & 16) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - p += 16; - } - if (len & 8) { - crc = __crc32d(crc, get_unaligned_le64(p)); - p += 8; - } - if (len & 4) { - crc = __crc32w(crc, get_unaligned_le32(p)); - p += 4; - } - if (len & 2) { - crc = __crc32h(crc, get_unaligned_le16(p)); - p += 2; - } - if (len & 1) - crc = __crc32b(crc, *p); - return crc; -} -#undef ATTRIBUTES -#endif /* crc32_arm_crc_pmullcombine() */ - -/* - * crc32_arm_pmullx4() - implementation using "folding" with pmull instructions - * - * This implementation is intended for CPUs that support pmull instructions but - * not crc32 instructions. - */ -#if HAVE_PMULL_INTRIN -# define crc32_arm_pmullx4 crc32_arm_pmullx4 -# define SUFFIX _pmullx4 -# if HAVE_PMULL_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef ARCH_ARM32 -# define ATTRIBUTES _target_attribute("fpu=crypto-neon-fp-armv8") -# else -# ifdef __clang__ - /* - * This used to use "crypto", but that stopped working with clang 16. - * Now only "aes" works. "aes" works with older versions too, so use - * that. No "+" prefix; clang 15 and earlier doesn't accept that. - */ -# define ATTRIBUTES _target_attribute("aes") -# else - /* - * With gcc, only "+crypto" works. Both the "+" prefix and the - * "crypto" (not "aes") are essential... 
- */ -# define ATTRIBUTES _target_attribute("+crypto") -# endif -# endif -# endif -# define ENABLE_EOR3 0 -# include "crc32_pmull_helpers.h" - -static u32 ATTRIBUTES MAYBE_UNUSED -crc32_arm_pmullx4(u32 crc, const u8 *p, size_t len) -{ - static const u64 _aligned_attribute(16) mults[3][2] = { - CRC32_1VECS_MULTS, - CRC32_4VECS_MULTS, - CRC32_2VECS_MULTS, - }; - static const u64 _aligned_attribute(16) final_mults[3][2] = { - { CRC32_FINAL_MULT, 0 }, - { CRC32_BARRETT_CONSTANT_1, 0 }, - { CRC32_BARRETT_CONSTANT_2, 0 }, - }; - const uint8x16_t zeroes = vdupq_n_u8(0); - const uint8x16_t mask32 = vreinterpretq_u8_u64(vdupq_n_u64(0xFFFFFFFF)); - const poly64x2_t multipliers_1 = load_multipliers(mults[0]); - uint8x16_t v0, v1, v2, v3; - - if (len < 64 + 15) { - if (len < 16) - return crc32_slice1(crc, p, len); - v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); - p += 16; - len -= 16; - while (len >= 16) { - v0 = fold_vec(v0, vld1q_u8(p), multipliers_1); - p += 16; - len -= 16; - } - } else { - const poly64x2_t multipliers_4 = load_multipliers(mults[1]); - const poly64x2_t multipliers_2 = load_multipliers(mults[2]); - const size_t align = -(uintptr_t)p & 15; - const uint8x16_t *vp; - - v0 = veorq_u8(vld1q_u8(p), u32_to_bytevec(crc)); - p += 16; - /* Align p to the next 16-byte boundary. */ - if (align) { - v0 = fold_partial_vec(v0, p, align, multipliers_1); - p += align; - len -= align; - } - vp = (const uint8x16_t *)p; - v1 = *vp++; - v2 = *vp++; - v3 = *vp++; - while (len >= 64 + 64) { - v0 = fold_vec(v0, *vp++, multipliers_4); - v1 = fold_vec(v1, *vp++, multipliers_4); - v2 = fold_vec(v2, *vp++, multipliers_4); - v3 = fold_vec(v3, *vp++, multipliers_4); - len -= 64; - } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); - if (len & 32) { - v0 = fold_vec(v0, *vp++, multipliers_2); - v1 = fold_vec(v1, *vp++, multipliers_2); - } - v0 = fold_vec(v0, v1, multipliers_1); - if (len & 16) - v0 = fold_vec(v0, *vp++, multipliers_1); - p = (const u8 *)vp; - len &= 15; - } - - /* Handle any remaining partial block now before reducing to 32 bits. */ - if (len) - v0 = fold_partial_vec(v0, p, len, multipliers_1); - - /* - * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, - * which is equivalent to multiplying by x^32. This is needed because - * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). - */ - - v0 = veorq_u8(vextq_u8(v0, zeroes, 8), - clmul_high(vextq_u8(zeroes, v0, 8), multipliers_1)); - - /* Fold 96 => 64 bits. */ - v0 = veorq_u8(vextq_u8(v0, zeroes, 4), - clmul_low(vandq_u8(v0, mask32), - load_multipliers(final_mults[0]))); - - /* Reduce 64 => 32 bits using Barrett reduction. */ - v1 = clmul_low(vandq_u8(v0, mask32), load_multipliers(final_mults[1])); - v1 = clmul_low(vandq_u8(v1, mask32), load_multipliers(final_mults[2])); - return vgetq_lane_u32(vreinterpretq_u32_u8(veorq_u8(v0, v1)), 1); -} -#undef SUFFIX -#undef ATTRIBUTES -#undef ENABLE_EOR3 -#endif /* crc32_arm_pmullx4() */ - -/* - * crc32_arm_pmullx12_crc() - large-stride implementation using "folding" with - * pmull instructions, where crc32 instructions are also available - * - * See crc32_pmull_wide.h for explanation. 
*/ -#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN -# define crc32_arm_pmullx12_crc crc32_arm_pmullx12_crc -# define SUFFIX _pmullx12_crc -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("aes,crc") -# else -# define ATTRIBUTES _target_attribute("+crypto,+crc") -# endif -# endif -# define ENABLE_EOR3 0 -# include "crc32_pmull_wide.h" -#endif - -/* - * crc32_arm_pmullx12_crc_eor3() - * - * This like crc32_arm_pmullx12_crc(), but it adds the eor3 instruction (from - * the sha3 extension) for even better performance. - * - * Note: we require HAVE_SHA3_TARGET (or HAVE_SHA3_NATIVE) rather than - * HAVE_SHA3_INTRIN, as we have an inline asm fallback for eor3. - */ -#if defined(ARCH_ARM64) && HAVE_PMULL_INTRIN && HAVE_CRC32_INTRIN && \ - (HAVE_SHA3_TARGET || HAVE_SHA3_NATIVE) -# define crc32_arm_pmullx12_crc_eor3 crc32_arm_pmullx12_crc_eor3 -# define SUFFIX _pmullx12_crc_eor3 -# if HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE && \ - !USE_PMULL_TARGET_EVEN_IF_NATIVE -# define ATTRIBUTES -# else -# ifdef __clang__ -# define ATTRIBUTES _target_attribute("aes,crc,sha3") - /* - * With gcc, arch=armv8.2-a is needed for the sha3 intrinsics, unless the - * default target is armv8.3-a or later in which case it must be omitted. - * armv8.3-a or later can be detected by checking for __ARM_FEATURE_JCVT. - */ -# elif defined(__ARM_FEATURE_JCVT) -# define ATTRIBUTES _target_attribute("+crypto,+crc,+sha3") -# else -# define ATTRIBUTES _target_attribute("arch=armv8.2-a+crypto+crc+sha3") -# endif -# endif -# define ENABLE_EOR3 1 -# include "crc32_pmull_wide.h" -#endif - -/* - * On the Apple M1 processor, crc32 instructions max out at about 25.5 GB/s in - * the best case of using a 3-way or greater interleaved chunked implementation, - * whereas a pmull-based implementation achieves 68 GB/s provided that the - * stride length is large enough (about 10+ vectors with eor3, or 12+ without). - * - * For now we assume that crc32 instructions are preferable in other cases. - */ -#define PREFER_PMULL_TO_CRC 0 -#ifdef __APPLE__ -# include <TargetConditionals.h> -# if TARGET_OS_OSX -# undef PREFER_PMULL_TO_CRC -# define PREFER_PMULL_TO_CRC 1 -# endif -#endif - -/* - * If the best implementation is statically available, use it unconditionally. - * Otherwise choose the best implementation at runtime. 
- */ -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) && \ - HAVE_PMULL_NATIVE && HAVE_CRC32_NATIVE && HAVE_SHA3_NATIVE -# define DEFAULT_IMPL crc32_arm_pmullx12_crc_eor3 -#elif !PREFER_PMULL_TO_CRC && defined(crc32_arm_crc_pmullcombine) && \ - HAVE_CRC32_NATIVE && HAVE_PMULL_NATIVE -# define DEFAULT_IMPL crc32_arm_crc_pmullcombine -#else -static inline crc32_func_t -arch_select_crc32_func(void) -{ - const u32 features MAYBE_UNUSED = get_arm_cpu_features(); - -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc_eor3) - if (HAVE_PMULL(features) && HAVE_CRC32(features) && HAVE_SHA3(features)) - return crc32_arm_pmullx12_crc_eor3; -#endif -#if PREFER_PMULL_TO_CRC && defined(crc32_arm_pmullx12_crc) - if (HAVE_PMULL(features) && HAVE_CRC32(features)) - return crc32_arm_pmullx12_crc; -#endif -#ifdef crc32_arm_crc_pmullcombine - if (HAVE_CRC32(features) && HAVE_PMULL(features)) - return crc32_arm_crc_pmullcombine; -#endif -#ifdef crc32_arm_crc - if (HAVE_CRC32(features)) - return crc32_arm_crc; -#endif -#ifdef crc32_arm_pmullx4 - if (HAVE_PMULL(features)) - return crc32_arm_pmullx4; -#endif - return NULL; -} -#define arch_select_crc32_func arch_select_crc32_func -#endif - -#endif /* LIB_ARM_CRC32_IMPL_H */ diff --git a/Sources/DEFLATE/crc32_multipliers.h b/Sources/DEFLATE/crc32_multipliers.h index 580b775b..d8e92adb 100644 --- a/Sources/DEFLATE/crc32_multipliers.h +++ b/Sources/DEFLATE/crc32_multipliers.h @@ -4,55 +4,103 @@ * THIS FILE WAS GENERATED BY gen_crc32_multipliers.c. DO NOT EDIT. */ -#define CRC32_1VECS_MULT_1 0xae689191 /* x^159 mod G(x) */ -#define CRC32_1VECS_MULT_2 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_1VECS_MULTS { CRC32_1VECS_MULT_1, CRC32_1VECS_MULT_2 } +#define CRC32_X159_MODG 0xae689191 /* x^159 mod G(x) */ +#define CRC32_X95_MODG 0xccaa009e /* x^95 mod G(x) */ -#define CRC32_2VECS_MULT_1 0xf1da05aa /* x^287 mod G(x) */ -#define CRC32_2VECS_MULT_2 0x81256527 /* x^223 mod G(x) */ -#define CRC32_2VECS_MULTS { CRC32_2VECS_MULT_1, CRC32_2VECS_MULT_2 } +#define CRC32_X287_MODG 0xf1da05aa /* x^287 mod G(x) */ +#define CRC32_X223_MODG 0x81256527 /* x^223 mod G(x) */ -#define CRC32_3VECS_MULT_1 0x3db1ecdc /* x^415 mod G(x) */ -#define CRC32_3VECS_MULT_2 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_3VECS_MULTS { CRC32_3VECS_MULT_1, CRC32_3VECS_MULT_2 } +#define CRC32_X415_MODG 0x3db1ecdc /* x^415 mod G(x) */ +#define CRC32_X351_MODG 0xaf449247 /* x^351 mod G(x) */ -#define CRC32_4VECS_MULT_1 0x8f352d95 /* x^543 mod G(x) */ -#define CRC32_4VECS_MULT_2 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_4VECS_MULTS { CRC32_4VECS_MULT_1, CRC32_4VECS_MULT_2 } +#define CRC32_X543_MODG 0x8f352d95 /* x^543 mod G(x) */ +#define CRC32_X479_MODG 0x1d9513d7 /* x^479 mod G(x) */ -#define CRC32_5VECS_MULT_1 0x1c279815 /* x^671 mod G(x) */ -#define CRC32_5VECS_MULT_2 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_5VECS_MULTS { CRC32_5VECS_MULT_1, CRC32_5VECS_MULT_2 } +#define CRC32_X671_MODG 0x1c279815 /* x^671 mod G(x) */ +#define CRC32_X607_MODG 0xae0b5394 /* x^607 mod G(x) */ -#define CRC32_6VECS_MULT_1 0xdf068dc2 /* x^799 mod G(x) */ -#define CRC32_6VECS_MULT_2 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_6VECS_MULTS { CRC32_6VECS_MULT_1, CRC32_6VECS_MULT_2 } +#define CRC32_X799_MODG 0xdf068dc2 /* x^799 mod G(x) */ +#define CRC32_X735_MODG 0x57c54819 /* x^735 mod G(x) */ -#define CRC32_7VECS_MULT_1 0x31f8303f /* x^927 mod G(x) */ -#define CRC32_7VECS_MULT_2 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_7VECS_MULTS { CRC32_7VECS_MULT_1, CRC32_7VECS_MULT_2 } +#define 
CRC32_X927_MODG 0x31f8303f /* x^927 mod G(x) */ +#define CRC32_X863_MODG 0x0cbec0ed /* x^863 mod G(x) */ -#define CRC32_8VECS_MULT_1 0x33fff533 /* x^1055 mod G(x) */ -#define CRC32_8VECS_MULT_2 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_8VECS_MULTS { CRC32_8VECS_MULT_1, CRC32_8VECS_MULT_2 } +#define CRC32_X1055_MODG 0x33fff533 /* x^1055 mod G(x) */ +#define CRC32_X991_MODG 0x910eeec1 /* x^991 mod G(x) */ -#define CRC32_9VECS_MULT_1 0x26b70c3d /* x^1183 mod G(x) */ -#define CRC32_9VECS_MULT_2 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_9VECS_MULTS { CRC32_9VECS_MULT_1, CRC32_9VECS_MULT_2 } +#define CRC32_X1183_MODG 0x26b70c3d /* x^1183 mod G(x) */ +#define CRC32_X1119_MODG 0x3f41287a /* x^1119 mod G(x) */ -#define CRC32_10VECS_MULT_1 0xe3543be0 /* x^1311 mod G(x) */ -#define CRC32_10VECS_MULT_2 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_10VECS_MULTS { CRC32_10VECS_MULT_1, CRC32_10VECS_MULT_2 } +#define CRC32_X1311_MODG 0xe3543be0 /* x^1311 mod G(x) */ +#define CRC32_X1247_MODG 0x9026d5b1 /* x^1247 mod G(x) */ -#define CRC32_11VECS_MULT_1 0x5a1bb05d /* x^1439 mod G(x) */ -#define CRC32_11VECS_MULT_2 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_11VECS_MULTS { CRC32_11VECS_MULT_1, CRC32_11VECS_MULT_2 } +#define CRC32_X1439_MODG 0x5a1bb05d /* x^1439 mod G(x) */ +#define CRC32_X1375_MODG 0xd1df2327 /* x^1375 mod G(x) */ -#define CRC32_12VECS_MULT_1 0x596c8d81 /* x^1567 mod G(x) */ -#define CRC32_12VECS_MULT_2 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_12VECS_MULTS { CRC32_12VECS_MULT_1, CRC32_12VECS_MULT_2 } +#define CRC32_X1567_MODG 0x596c8d81 /* x^1567 mod G(x) */ +#define CRC32_X1503_MODG 0xf5e48c85 /* x^1503 mod G(x) */ -#define CRC32_FINAL_MULT 0xb8bc6765 /* x^63 mod G(x) */ +#define CRC32_X1695_MODG 0x682bdd4f /* x^1695 mod G(x) */ +#define CRC32_X1631_MODG 0x3c656ced /* x^1631 mod G(x) */ + +#define CRC32_X1823_MODG 0x4a28bd43 /* x^1823 mod G(x) */ +#define CRC32_X1759_MODG 0xfe807bbd /* x^1759 mod G(x) */ + +#define CRC32_X1951_MODG 0x0077f00d /* x^1951 mod G(x) */ +#define CRC32_X1887_MODG 0x1f0c2cdd /* x^1887 mod G(x) */ + +#define CRC32_X2079_MODG 0xce3371cb /* x^2079 mod G(x) */ +#define CRC32_X2015_MODG 0xe95c1271 /* x^2015 mod G(x) */ + +#define CRC32_X2207_MODG 0xa749e894 /* x^2207 mod G(x) */ +#define CRC32_X2143_MODG 0xb918a347 /* x^2143 mod G(x) */ + +#define CRC32_X2335_MODG 0x2c538639 /* x^2335 mod G(x) */ +#define CRC32_X2271_MODG 0x71d54a59 /* x^2271 mod G(x) */ + +#define CRC32_X2463_MODG 0x32b0733c /* x^2463 mod G(x) */ +#define CRC32_X2399_MODG 0xff6f2fc2 /* x^2399 mod G(x) */ + +#define CRC32_X2591_MODG 0x0e9bd5cc /* x^2591 mod G(x) */ +#define CRC32_X2527_MODG 0xcec97417 /* x^2527 mod G(x) */ + +#define CRC32_X2719_MODG 0x76278617 /* x^2719 mod G(x) */ +#define CRC32_X2655_MODG 0x1c63267b /* x^2655 mod G(x) */ + +#define CRC32_X2847_MODG 0xc51b93e3 /* x^2847 mod G(x) */ +#define CRC32_X2783_MODG 0xf183c71b /* x^2783 mod G(x) */ + +#define CRC32_X2975_MODG 0x7eaed122 /* x^2975 mod G(x) */ +#define CRC32_X2911_MODG 0x9b9bdbd0 /* x^2911 mod G(x) */ + +#define CRC32_X3103_MODG 0x2ce423f1 /* x^3103 mod G(x) */ +#define CRC32_X3039_MODG 0xd31343ea /* x^3039 mod G(x) */ + +#define CRC32_X3231_MODG 0x8b8d8645 /* x^3231 mod G(x) */ +#define CRC32_X3167_MODG 0x4470ac44 /* x^3167 mod G(x) */ + +#define CRC32_X3359_MODG 0x4b700aa8 /* x^3359 mod G(x) */ +#define CRC32_X3295_MODG 0xeea395c4 /* x^3295 mod G(x) */ + +#define CRC32_X3487_MODG 0xeff5e99d /* x^3487 mod G(x) */ +#define CRC32_X3423_MODG 0xf9d9c7ee /* x^3423 mod G(x) */ + +#define CRC32_X3615_MODG 
0xad0d2bb2 /* x^3615 mod G(x) */ +#define CRC32_X3551_MODG 0xcd669a40 /* x^3551 mod G(x) */ + +#define CRC32_X3743_MODG 0x9fb66bd3 /* x^3743 mod G(x) */ +#define CRC32_X3679_MODG 0x6d40f445 /* x^3679 mod G(x) */ + +#define CRC32_X3871_MODG 0xc2dcc467 /* x^3871 mod G(x) */ +#define CRC32_X3807_MODG 0x9ee62949 /* x^3807 mod G(x) */ + +#define CRC32_X3999_MODG 0x398e2ff2 /* x^3999 mod G(x) */ +#define CRC32_X3935_MODG 0x145575d5 /* x^3935 mod G(x) */ + +#define CRC32_X4127_MODG 0x1072db28 /* x^4127 mod G(x) */ +#define CRC32_X4063_MODG 0x0c30f51d /* x^4063 mod G(x) */ + +#define CRC32_X63_MODG 0xb8bc6765 /* x^63 mod G(x) */ #define CRC32_BARRETT_CONSTANT_1 0x00000001f7011641ULL /* floor(x^64 / G(x)) */ #define CRC32_BARRETT_CONSTANT_2 0x00000001db710641ULL /* G(x) */ #define CRC32_BARRETT_CONSTANTS { CRC32_BARRETT_CONSTANT_1, CRC32_BARRETT_CONSTANT_2 } @@ -63,263 +111,263 @@ /* Multipliers for implementations that use a variable chunk length */ static const u32 crc32_mults_for_chunklen[][CRC32_NUM_CHUNKS - 1] MAYBE_UNUSED = { - { 0 /* unused row */ }, - /* chunk_len=128 */ - { 0xd31343ea /* x^3039 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, 0x910eeec1 /* x^991 mod G(x) */, }, - /* chunk_len=256 */ - { 0x1d6708a0 /* x^6111 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, }, - /* chunk_len=384 */ - { 0xdb3839f3 /* x^9183 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, 0xd31343ea /* x^3039 mod G(x) */, }, - /* chunk_len=512 */ - { 0x1753ab84 /* x^12255 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, }, - /* chunk_len=640 */ - { 0x3796455c /* x^15327 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, 0xc352f6de /* x^5087 mod G(x) */, }, - /* chunk_len=768 */ - { 0x3954de39 /* x^18399 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, }, - /* chunk_len=896 */ - { 0x632d78c5 /* x^21471 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, 0x9a1b53c8 /* x^7135 mod G(x) */, }, - /* chunk_len=1024 */ - { 0xa0decef3 /* x^24543 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, }, - /* chunk_len=1152 */ - { 0xe9c09bb0 /* x^27615 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, 0xdb3839f3 /* x^9183 mod G(x) */, }, - /* chunk_len=1280 */ - { 0xd51917a4 /* x^30687 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, }, - /* chunk_len=1408 */ - { 0x154a8a62 /* x^33759 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, 0x3e9a43cd /* x^11231 mod G(x) */, }, - /* chunk_len=1536 */ - { 0xf196555d /* x^36831 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, }, - /* chunk_len=1664 */ - { 0x8eec2999 /* x^39903 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, 0x6044fbb0 /* x^13279 mod G(x) */, }, - /* chunk_len=1792 */ - { 0x27892abf /* x^42975 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, }, - /* chunk_len=1920 */ - { 0x77bc2419 /* x^46047 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, 0x3796455c /* x^15327 mod G(x) */, }, - /* chunk_len=2048 */ - { 0xcea114a5 /* x^49119 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, }, - /* chunk_len=2176 */ - { 0xa1077e85 /* x^52191 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, 0x0c21f835 /* x^17375 mod G(x) */, }, - /* chunk_len=2304 */ - { 0xc5ed75e1 /* x^55263 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, }, - /* chunk_len=2432 */ - { 0xca4fba3f /* x^58335 mod G(x) */, 0x0acfa26f /* x^38879 
mod G(x) */, 0x6cb21510 /* x^19423 mod G(x) */, }, - /* chunk_len=2560 */ - { 0xcf5bcdc4 /* x^61407 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, }, - /* chunk_len=2688 */ - { 0xf36b9d16 /* x^64479 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, 0x632d78c5 /* x^21471 mod G(x) */, }, - /* chunk_len=2816 */ - { 0xf76fd988 /* x^67551 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, }, - /* chunk_len=2944 */ - { 0x6c45d92e /* x^70623 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, 0x0c46baec /* x^23519 mod G(x) */, }, - /* chunk_len=3072 */ - { 0x6116b82b /* x^73695 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, }, - /* chunk_len=3200 */ - { 0x4d9899bb /* x^76767 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, 0x53deb236 /* x^25567 mod G(x) */, }, - /* chunk_len=3328 */ - { 0x3e7c93b9 /* x^79839 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, }, - /* chunk_len=3456 */ - { 0x388b20ac /* x^82911 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, 0xe9c09bb0 /* x^27615 mod G(x) */, }, - /* chunk_len=3584 */ - { 0x0956d953 /* x^85983 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, }, - /* chunk_len=3712 */ - { 0x55cb4dfe /* x^89055 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, 0xc07331b3 /* x^29663 mod G(x) */, }, - /* chunk_len=3840 */ - { 0x52222fea /* x^92127 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, }, - /* chunk_len=3968 */ - { 0x0603989b /* x^95199 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, 0x5e04b9a5 /* x^31711 mod G(x) */, }, - /* chunk_len=4096 */ - { 0x4470c029 /* x^98271 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, }, - /* chunk_len=4224 */ - { 0xb6f35093 /* x^101343 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, 0x154a8a62 /* x^33759 mod G(x) */, }, - /* chunk_len=4352 */ - { 0xc46805ba /* x^104415 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, }, - /* chunk_len=4480 */ - { 0xc3876592 /* x^107487 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, 0xc35cf6e7 /* x^35807 mod G(x) */, }, - /* chunk_len=4608 */ - { 0x5b0c98b9 /* x^110559 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, }, - /* chunk_len=4736 */ - { 0x30d13e5f /* x^113631 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, 0x8c224466 /* x^37855 mod G(x) */, }, - /* chunk_len=4864 */ - { 0x54afca53 /* x^116703 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, }, - /* chunk_len=4992 */ - { 0x93102436 /* x^119775 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, 0x8eec2999 /* x^39903 mod G(x) */, }, - /* chunk_len=5120 */ - { 0xbd2655a8 /* x^122847 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, }, - /* chunk_len=5248 */ - { 0x70cd7f26 /* x^125919 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, 0x1691be45 /* x^41951 mod G(x) */, }, - /* chunk_len=5376 */ - { 0x2d546c53 /* x^128991 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, }, - /* chunk_len=5504 */ - { 0xb53410a8 /* x^132063 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, 0x161f3c12 /* x^43999 mod G(x) */, }, - /* chunk_len=5632 */ - { 0x67a93f75 /* x^135135 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, }, - /* chunk_len=5760 */ - { 0x9830ac33 /* x^138207 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, 0x77bc2419 /* 
x^46047 mod G(x) */, }, - /* chunk_len=5888 */ - { 0xb0b6fc3e /* x^141279 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, }, - /* chunk_len=6016 */ - { 0x84170f16 /* x^144351 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, 0x30de0f98 /* x^48095 mod G(x) */, }, - /* chunk_len=6144 */ - { 0xd7017a0c /* x^147423 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, }, - /* chunk_len=6272 */ - { 0xadb25de6 /* x^150495 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, 0x2b7e0e1b /* x^50143 mod G(x) */, }, - /* chunk_len=6400 */ - { 0x8282fddc /* x^153567 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, }, - /* chunk_len=6528 */ - { 0x46362bee /* x^156639 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, 0xa1077e85 /* x^52191 mod G(x) */, }, - /* chunk_len=6656 */ - { 0xb9077a01 /* x^159711 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, }, - /* chunk_len=6784 */ - { 0xf51d9bc6 /* x^162783 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, 0x7e774cf6 /* x^54239 mod G(x) */, }, - /* chunk_len=6912 */ - { 0x4ca19a29 /* x^165855 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, }, - /* chunk_len=7040 */ - { 0xdc0fc3fc /* x^168927 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, 0x3678fed2 /* x^56287 mod G(x) */, }, - /* chunk_len=7168 */ - { 0x63c3d167 /* x^171999 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, }, - /* chunk_len=7296 */ - { 0x5851d254 /* x^175071 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, 0xca4fba3f /* x^58335 mod G(x) */, }, - /* chunk_len=7424 */ - { 0xfeacf2a1 /* x^178143 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, }, - /* chunk_len=7552 */ - { 0x93b7edc8 /* x^181215 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, 0x58fa96ee /* x^60383 mod G(x) */, }, - /* chunk_len=7680 */ - { 0x5539e44a /* x^184287 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, }, - /* chunk_len=7808 */ - { 0xde32a3d2 /* x^187359 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, 0x6a6a3694 /* x^62431 mod G(x) */, }, - /* chunk_len=7936 */ - { 0xf0baeeb6 /* x^190431 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, }, - /* chunk_len=8064 */ - { 0xbe15887f /* x^193503 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, 0xf36b9d16 /* x^64479 mod G(x) */, }, - /* chunk_len=8192 */ - { 0x64f34a05 /* x^196575 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, }, - /* chunk_len=8320 */ - { 0x1b6d1aea /* x^199647 mod G(x) */, 0xfeafb67c /* x^133087 mod G(x) */, 0x4fb001a8 /* x^66527 mod G(x) */, }, - /* chunk_len=8448 */ - { 0x82adb0b8 /* x^202719 mod G(x) */, 0x67a93f75 /* x^135135 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, }, - /* chunk_len=8576 */ - { 0x694587c7 /* x^205791 mod G(x) */, 0x3b34408b /* x^137183 mod G(x) */, 0xeccb2978 /* x^68575 mod G(x) */, }, - /* chunk_len=8704 */ - { 0xd2fc57c3 /* x^208863 mod G(x) */, 0x07fcf8c6 /* x^139231 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, }, - /* chunk_len=8832 */ - { 0x9dd6837c /* x^211935 mod G(x) */, 0xb0b6fc3e /* x^141279 mod G(x) */, 0x6c45d92e /* x^70623 mod G(x) */, }, - /* chunk_len=8960 */ - { 0x3a9d1f97 /* x^215007 mod G(x) */, 0xefd033b2 /* x^143327 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, }, - /* chunk_len=9088 */ - { 0x1eee1d2a /* x^218079 mod G(x) */, 0xf2a6e46e /* x^145375 mod G(x) */, 
0x55b4c814 /* x^72671 mod G(x) */, }, - /* chunk_len=9216 */ - { 0xb57c7728 /* x^221151 mod G(x) */, 0xd7017a0c /* x^147423 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, }, - /* chunk_len=9344 */ - { 0xf2fc5d61 /* x^224223 mod G(x) */, 0x242aac86 /* x^149471 mod G(x) */, 0x05245cf0 /* x^74719 mod G(x) */, }, - /* chunk_len=9472 */ - { 0x26387824 /* x^227295 mod G(x) */, 0xc15c4ca5 /* x^151519 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, }, - /* chunk_len=9600 */ - { 0x8c151e77 /* x^230367 mod G(x) */, 0x8282fddc /* x^153567 mod G(x) */, 0x4d9899bb /* x^76767 mod G(x) */, }, - /* chunk_len=9728 */ - { 0x8ea1f680 /* x^233439 mod G(x) */, 0xf5ff6cdd /* x^155615 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, }, - /* chunk_len=9856 */ - { 0xe8cf3d2a /* x^236511 mod G(x) */, 0x338b1fb1 /* x^157663 mod G(x) */, 0xeda61f70 /* x^78815 mod G(x) */, }, - /* chunk_len=9984 */ - { 0x21f15b59 /* x^239583 mod G(x) */, 0xb9077a01 /* x^159711 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, }, - /* chunk_len=10112 */ - { 0x6f68d64a /* x^242655 mod G(x) */, 0x901b0161 /* x^161759 mod G(x) */, 0xb9fd3537 /* x^80863 mod G(x) */, }, - /* chunk_len=10240 */ - { 0x71b74d95 /* x^245727 mod G(x) */, 0xf5ddd5ad /* x^163807 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, }, - /* chunk_len=10368 */ - { 0x4c2e7261 /* x^248799 mod G(x) */, 0x4ca19a29 /* x^165855 mod G(x) */, 0x388b20ac /* x^82911 mod G(x) */, }, - /* chunk_len=10496 */ - { 0x8a2d38e8 /* x^251871 mod G(x) */, 0xd27ee0a1 /* x^167903 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, }, - /* chunk_len=10624 */ - { 0x7e58ca17 /* x^254943 mod G(x) */, 0x69dfedd2 /* x^169951 mod G(x) */, 0x3a76805e /* x^84959 mod G(x) */, }, - /* chunk_len=10752 */ - { 0xf997967f /* x^258015 mod G(x) */, 0x63c3d167 /* x^171999 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, }, - /* chunk_len=10880 */ - { 0x48215963 /* x^261087 mod G(x) */, 0x71e1dfe0 /* x^174047 mod G(x) */, 0x42a6d410 /* x^87007 mod G(x) */, }, - /* chunk_len=11008 */ - { 0xa704b94c /* x^264159 mod G(x) */, 0x679f198a /* x^176095 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, }, - /* chunk_len=11136 */ - { 0x1d699056 /* x^267231 mod G(x) */, 0xfeacf2a1 /* x^178143 mod G(x) */, 0x55cb4dfe /* x^89055 mod G(x) */, }, - /* chunk_len=11264 */ - { 0x6800bcc5 /* x^270303 mod G(x) */, 0x16024f15 /* x^180191 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, }, - /* chunk_len=11392 */ - { 0x2d48e4ca /* x^273375 mod G(x) */, 0xbe61582f /* x^182239 mod G(x) */, 0x46026283 /* x^91103 mod G(x) */, }, - /* chunk_len=11520 */ - { 0x4c4c2b55 /* x^276447 mod G(x) */, 0x5539e44a /* x^184287 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, }, - /* chunk_len=11648 */ - { 0xd8ce94cb /* x^279519 mod G(x) */, 0xbc613c26 /* x^186335 mod G(x) */, 0x33776b4b /* x^93151 mod G(x) */, }, - /* chunk_len=11776 */ - { 0xd0b5a02b /* x^282591 mod G(x) */, 0x490d3cc6 /* x^188383 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, }, - /* chunk_len=11904 */ - { 0xa223f7ec /* x^285663 mod G(x) */, 0xf0baeeb6 /* x^190431 mod G(x) */, 0x0603989b /* x^95199 mod G(x) */, }, - /* chunk_len=12032 */ - { 0x58de337a /* x^288735 mod G(x) */, 0x3bf3d597 /* x^192479 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, }, - /* chunk_len=12160 */ - { 0x37f5d8f4 /* x^291807 mod G(x) */, 0x4d5b699b /* x^194527 mod G(x) */, 0xd7262e5f /* x^97247 mod G(x) */, }, - /* chunk_len=12288 */ - { 0xfa8a435d /* x^294879 mod G(x) */, 0x64f34a05 /* x^196575 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, }, - /* chunk_len=12416 */ - { 0x238709fe /* x^297951 mod G(x) */, 
0x52e7458f /* x^198623 mod G(x) */, 0x9a174cd3 /* x^99295 mod G(x) */, }, - /* chunk_len=12544 */ - { 0x9e1ba6f5 /* x^301023 mod G(x) */, 0xef0272f7 /* x^200671 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, }, - /* chunk_len=12672 */ - { 0xcd8b57fa /* x^304095 mod G(x) */, 0x82adb0b8 /* x^202719 mod G(x) */, 0xb6f35093 /* x^101343 mod G(x) */, }, - /* chunk_len=12800 */ - { 0x0aed142f /* x^307167 mod G(x) */, 0xb1650290 /* x^204767 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, }, - /* chunk_len=12928 */ - { 0xd1f064db /* x^310239 mod G(x) */, 0x6e7340d3 /* x^206815 mod G(x) */, 0x5c28cb52 /* x^103391 mod G(x) */, }, - /* chunk_len=13056 */ - { 0x464ac895 /* x^313311 mod G(x) */, 0xd2fc57c3 /* x^208863 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, }, - /* chunk_len=13184 */ - { 0xa0e6beea /* x^316383 mod G(x) */, 0xcfeec3d0 /* x^210911 mod G(x) */, 0x0225d214 /* x^105439 mod G(x) */, }, - /* chunk_len=13312 */ - { 0x78703ce0 /* x^319455 mod G(x) */, 0xc60f6075 /* x^212959 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, }, - /* chunk_len=13440 */ - { 0xfea48165 /* x^322527 mod G(x) */, 0x3a9d1f97 /* x^215007 mod G(x) */, 0xc3876592 /* x^107487 mod G(x) */, }, - /* chunk_len=13568 */ - { 0xdb89b8db /* x^325599 mod G(x) */, 0xa6172211 /* x^217055 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, }, - /* chunk_len=13696 */ - { 0x7ca03731 /* x^328671 mod G(x) */, 0x1db42849 /* x^219103 mod G(x) */, 0xc5df246e /* x^109535 mod G(x) */, }, - /* chunk_len=13824 */ - { 0x8801d0aa /* x^331743 mod G(x) */, 0xb57c7728 /* x^221151 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, }, - /* chunk_len=13952 */ - { 0xf89cd7f0 /* x^334815 mod G(x) */, 0xcc396a0b /* x^223199 mod G(x) */, 0xdb799c51 /* x^111583 mod G(x) */, }, - /* chunk_len=14080 */ - { 0x1611a808 /* x^337887 mod G(x) */, 0xaeae6105 /* x^225247 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, }, - /* chunk_len=14208 */ - { 0xe3cdb888 /* x^340959 mod G(x) */, 0x26387824 /* x^227295 mod G(x) */, 0x30d13e5f /* x^113631 mod G(x) */, }, - /* chunk_len=14336 */ - { 0x552a4cf6 /* x^344031 mod G(x) */, 0xee2d04bb /* x^229343 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, }, - /* chunk_len=14464 */ - { 0x85e248e9 /* x^347103 mod G(x) */, 0x0a79663f /* x^231391 mod G(x) */, 0x53339cf7 /* x^115679 mod G(x) */, }, - /* chunk_len=14592 */ - { 0x1c61c3e9 /* x^350175 mod G(x) */, 0x8ea1f680 /* x^233439 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, }, - /* chunk_len=14720 */ - { 0xb14cfc2b /* x^353247 mod G(x) */, 0x2e073302 /* x^235487 mod G(x) */, 0x10897992 /* x^117727 mod G(x) */, }, - /* chunk_len=14848 */ - { 0x6ec444cc /* x^356319 mod G(x) */, 0x9e819f13 /* x^237535 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, }, - /* chunk_len=14976 */ - { 0xe2fa5f80 /* x^359391 mod G(x) */, 0x21f15b59 /* x^239583 mod G(x) */, 0x93102436 /* x^119775 mod G(x) */, }, - /* chunk_len=15104 */ - { 0x6d33f4c6 /* x^362463 mod G(x) */, 0x31a27455 /* x^241631 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, }, - /* chunk_len=15232 */ - { 0xb6dec609 /* x^365535 mod G(x) */, 0x4d437056 /* x^243679 mod G(x) */, 0x42eb1e2a /* x^121823 mod G(x) */, }, - /* chunk_len=15360 */ - { 0x1846c518 /* x^368607 mod G(x) */, 0x71b74d95 /* x^245727 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, }, - /* chunk_len=15488 */ - { 0x9f947f8a /* x^371679 mod G(x) */, 0x2b501619 /* x^247775 mod G(x) */, 0xa4924b0e /* x^123871 mod G(x) */, }, - /* chunk_len=15616 */ - { 0xb7442f4d /* x^374751 mod G(x) */, 0xba30a5d8 /* x^249823 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, }, - /* 
chunk_len=15744 */ - { 0xe2c93242 /* x^377823 mod G(x) */, 0x8a2d38e8 /* x^251871 mod G(x) */, 0x70cd7f26 /* x^125919 mod G(x) */, }, - /* chunk_len=15872 */ - { 0xcd6863df /* x^380895 mod G(x) */, 0x78fd88dc /* x^253919 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, }, - /* chunk_len=16000 */ - { 0xd512001d /* x^383967 mod G(x) */, 0xe6612dff /* x^255967 mod G(x) */, 0x5c4d0ca9 /* x^127967 mod G(x) */, }, - /* chunk_len=16128 */ - { 0x4e8d6b6c /* x^387039 mod G(x) */, 0xf997967f /* x^258015 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, }, - /* chunk_len=16256 */ - { 0xfa653ba1 /* x^390111 mod G(x) */, 0xc99014d4 /* x^260063 mod G(x) */, 0xa0c9fd27 /* x^130015 mod G(x) */, }, - /* chunk_len=16384 */ - { 0x49893408 /* x^393183 mod G(x) */, 0x29c2448b /* x^262111 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, }, + { 0 /* unused row */ }, + /* chunk_len=128 */ + { 0xd31343ea /* x^3039 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, 0x910eeec1 /* x^991 mod G(x) */, }, + /* chunk_len=256 */ + { 0x1d6708a0 /* x^6111 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, 0xe95c1271 /* x^2015 mod G(x) */, }, + /* chunk_len=384 */ + { 0xdb3839f3 /* x^9183 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, 0xd31343ea /* x^3039 mod G(x) */, }, + /* chunk_len=512 */ + { 0x1753ab84 /* x^12255 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, 0x0c30f51d /* x^4063 mod G(x) */, }, + /* chunk_len=640 */ + { 0x3796455c /* x^15327 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, 0xc352f6de /* x^5087 mod G(x) */, }, + /* chunk_len=768 */ + { 0x3954de39 /* x^18399 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, 0x1d6708a0 /* x^6111 mod G(x) */, }, + /* chunk_len=896 */ + { 0x632d78c5 /* x^21471 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, 0x9a1b53c8 /* x^7135 mod G(x) */, }, + /* chunk_len=1024 */ + { 0xa0decef3 /* x^24543 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, 0xbbf2f6d6 /* x^8159 mod G(x) */, }, + /* chunk_len=1152 */ + { 0xe9c09bb0 /* x^27615 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, 0xdb3839f3 /* x^9183 mod G(x) */, }, + /* chunk_len=1280 */ + { 0xd51917a4 /* x^30687 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, 0xb8e0e4a8 /* x^10207 mod G(x) */, }, + /* chunk_len=1408 */ + { 0x154a8a62 /* x^33759 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, 0x3e9a43cd /* x^11231 mod G(x) */, }, + /* chunk_len=1536 */ + { 0xf196555d /* x^36831 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, 0x1753ab84 /* x^12255 mod G(x) */, }, + /* chunk_len=1664 */ + { 0x8eec2999 /* x^39903 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, 0x6044fbb0 /* x^13279 mod G(x) */, }, + /* chunk_len=1792 */ + { 0x27892abf /* x^42975 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, 0x3fc33de4 /* x^14303 mod G(x) */, }, + /* chunk_len=1920 */ + { 0x77bc2419 /* x^46047 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, 0x3796455c /* x^15327 mod G(x) */, }, + /* chunk_len=2048 */ + { 0xcea114a5 /* x^49119 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, 0x7b4aa8b7 /* x^16351 mod G(x) */, }, + /* chunk_len=2176 */ + { 0xa1077e85 /* x^52191 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, 0x0c21f835 /* x^17375 mod G(x) */, }, + /* chunk_len=2304 */ + { 0xc5ed75e1 /* x^55263 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, 0x3954de39 /* x^18399 mod G(x) */, }, + /* chunk_len=2432 */ + { 0xca4fba3f /* x^58335 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, 0x6cb21510 /* x^19423 mod G(x) */, }, + /* chunk_len=2560 */ + { 0xcf5bcdc4 /* x^61407 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, 0xcae68461 /* x^20447 mod G(x) */, }, + /* chunk_len=2688 */ 
+ { 0xf36b9d16 /* x^64479 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, 0x632d78c5 /* x^21471 mod G(x) */, }, + /* chunk_len=2816 */ + { 0xf76fd988 /* x^67551 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, 0x41e7589c /* x^22495 mod G(x) */, }, + /* chunk_len=2944 */ + { 0x6c45d92e /* x^70623 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, 0x0c46baec /* x^23519 mod G(x) */, }, + /* chunk_len=3072 */ + { 0x6116b82b /* x^73695 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, 0xa0decef3 /* x^24543 mod G(x) */, }, + /* chunk_len=3200 */ + { 0x4d9899bb /* x^76767 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, 0x53deb236 /* x^25567 mod G(x) */, }, + /* chunk_len=3328 */ + { 0x3e7c93b9 /* x^79839 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, 0xefb0a128 /* x^26591 mod G(x) */, }, + /* chunk_len=3456 */ + { 0x388b20ac /* x^82911 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, 0xe9c09bb0 /* x^27615 mod G(x) */, }, + /* chunk_len=3584 */ + { 0x0956d953 /* x^85983 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, 0x48d72bb1 /* x^28639 mod G(x) */, }, + /* chunk_len=3712 */ + { 0x55cb4dfe /* x^89055 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, 0xc07331b3 /* x^29663 mod G(x) */, }, + /* chunk_len=3840 */ + { 0x52222fea /* x^92127 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, 0xd51917a4 /* x^30687 mod G(x) */, }, + /* chunk_len=3968 */ + { 0x0603989b /* x^95199 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, 0x5e04b9a5 /* x^31711 mod G(x) */, }, + /* chunk_len=4096 */ + { 0x4470c029 /* x^98271 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, 0x68c0a2c5 /* x^32735 mod G(x) */, }, + /* chunk_len=4224 */ + { 0xb6f35093 /* x^101343 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, 0x154a8a62 /* x^33759 mod G(x) */, }, + /* chunk_len=4352 */ + { 0xc46805ba /* x^104415 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, 0x188cc628 /* x^34783 mod G(x) */, }, + /* chunk_len=4480 */ + { 0xc3876592 /* x^107487 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, 0xc35cf6e7 /* x^35807 mod G(x) */, }, + /* chunk_len=4608 */ + { 0x5b0c98b9 /* x^110559 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, 0xf196555d /* x^36831 mod G(x) */, }, + /* chunk_len=4736 */ + { 0x30d13e5f /* x^113631 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, 0x8c224466 /* x^37855 mod G(x) */, }, + /* chunk_len=4864 */ + { 0x54afca53 /* x^116703 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, 0x0acfa26f /* x^38879 mod G(x) */, }, + /* chunk_len=4992 */ + { 0x93102436 /* x^119775 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, 0x8eec2999 /* x^39903 mod G(x) */, }, + /* chunk_len=5120 */ + { 0xbd2655a8 /* x^122847 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, 0x4fae7fc0 /* x^40927 mod G(x) */, }, + /* chunk_len=5248 */ + { 0x70cd7f26 /* x^125919 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, 0x1691be45 /* x^41951 mod G(x) */, }, + /* chunk_len=5376 */ + { 0x2d546c53 /* x^128991 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, 0x27892abf /* x^42975 mod G(x) */, }, + /* chunk_len=5504 */ + { 0xb53410a8 /* x^132063 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, 0x161f3c12 /* x^43999 mod G(x) */, }, + /* chunk_len=5632 */ + { 0x67a93f75 /* x^135135 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, 0xed5c39b1 /* x^45023 mod G(x) */, }, + /* chunk_len=5760 */ + { 0x9830ac33 /* x^138207 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, 0x77bc2419 /* x^46047 mod G(x) */, }, + /* chunk_len=5888 */ + { 0xb0b6fc3e /* x^141279 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, 0xff809fcd /* x^47071 mod G(x) */, }, + /* chunk_len=6016 */ + { 0x84170f16 /* x^144351 mod 
G(x) */, 0xced90d99 /* x^96223 mod G(x) */, 0x30de0f98 /* x^48095 mod G(x) */, }, + /* chunk_len=6144 */ + { 0xd7017a0c /* x^147423 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, 0xcea114a5 /* x^49119 mod G(x) */, }, + /* chunk_len=6272 */ + { 0xadb25de6 /* x^150495 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, 0x2b7e0e1b /* x^50143 mod G(x) */, }, + /* chunk_len=6400 */ + { 0x8282fddc /* x^153567 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, 0x9f9d8d9c /* x^51167 mod G(x) */, }, + /* chunk_len=6528 */ + { 0x46362bee /* x^156639 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, 0xa1077e85 /* x^52191 mod G(x) */, }, + /* chunk_len=6656 */ + { 0xb9077a01 /* x^159711 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, 0x6666b805 /* x^53215 mod G(x) */, }, + /* chunk_len=6784 */ + { 0xf51d9bc6 /* x^162783 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, 0x7e774cf6 /* x^54239 mod G(x) */, }, + /* chunk_len=6912 */ + { 0x4ca19a29 /* x^165855 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, 0xc5ed75e1 /* x^55263 mod G(x) */, }, + /* chunk_len=7040 */ + { 0xdc0fc3fc /* x^168927 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, 0x3678fed2 /* x^56287 mod G(x) */, }, + /* chunk_len=7168 */ + { 0x63c3d167 /* x^171999 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, 0x97fbdb14 /* x^57311 mod G(x) */, }, + /* chunk_len=7296 */ + { 0x5851d254 /* x^175071 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, 0xca4fba3f /* x^58335 mod G(x) */, }, + /* chunk_len=7424 */ + { 0xfeacf2a1 /* x^178143 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, 0x1b37c832 /* x^59359 mod G(x) */, }, + /* chunk_len=7552 */ + { 0x93b7edc8 /* x^181215 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, 0x58fa96ee /* x^60383 mod G(x) */, }, + /* chunk_len=7680 */ + { 0x5539e44a /* x^184287 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, 0xcf5bcdc4 /* x^61407 mod G(x) */, }, + /* chunk_len=7808 */ + { 0xde32a3d2 /* x^187359 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, 0x6a6a3694 /* x^62431 mod G(x) */, }, + /* chunk_len=7936 */ + { 0xf0baeeb6 /* x^190431 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, 0xb03c8112 /* x^63455 mod G(x) */, }, + /* chunk_len=8064 */ + { 0xbe15887f /* x^193503 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, 0xf36b9d16 /* x^64479 mod G(x) */, }, + /* chunk_len=8192 */ + { 0x64f34a05 /* x^196575 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, 0x2339d155 /* x^65503 mod G(x) */, }, + /* chunk_len=8320 */ + { 0x1b6d1aea /* x^199647 mod G(x) */, 0xfeafb67c /* x^133087 mod G(x) */, 0x4fb001a8 /* x^66527 mod G(x) */, }, + /* chunk_len=8448 */ + { 0x82adb0b8 /* x^202719 mod G(x) */, 0x67a93f75 /* x^135135 mod G(x) */, 0xf76fd988 /* x^67551 mod G(x) */, }, + /* chunk_len=8576 */ + { 0x694587c7 /* x^205791 mod G(x) */, 0x3b34408b /* x^137183 mod G(x) */, 0xeccb2978 /* x^68575 mod G(x) */, }, + /* chunk_len=8704 */ + { 0xd2fc57c3 /* x^208863 mod G(x) */, 0x07fcf8c6 /* x^139231 mod G(x) */, 0x416f9449 /* x^69599 mod G(x) */, }, + /* chunk_len=8832 */ + { 0x9dd6837c /* x^211935 mod G(x) */, 0xb0b6fc3e /* x^141279 mod G(x) */, 0x6c45d92e /* x^70623 mod G(x) */, }, + /* chunk_len=8960 */ + { 0x3a9d1f97 /* x^215007 mod G(x) */, 0xefd033b2 /* x^143327 mod G(x) */, 0x4b809189 /* x^71647 mod G(x) */, }, + /* chunk_len=9088 */ + { 0x1eee1d2a /* x^218079 mod G(x) */, 0xf2a6e46e /* x^145375 mod G(x) */, 0x55b4c814 /* x^72671 mod G(x) */, }, + /* chunk_len=9216 */ + { 0xb57c7728 /* x^221151 mod G(x) */, 0xd7017a0c /* x^147423 mod G(x) */, 0x6116b82b /* x^73695 mod G(x) */, }, + /* chunk_len=9344 */ + { 0xf2fc5d61 /* 
x^224223 mod G(x) */, 0x242aac86 /* x^149471 mod G(x) */, 0x05245cf0 /* x^74719 mod G(x) */, }, + /* chunk_len=9472 */ + { 0x26387824 /* x^227295 mod G(x) */, 0xc15c4ca5 /* x^151519 mod G(x) */, 0x4c5a315a /* x^75743 mod G(x) */, }, + /* chunk_len=9600 */ + { 0x8c151e77 /* x^230367 mod G(x) */, 0x8282fddc /* x^153567 mod G(x) */, 0x4d9899bb /* x^76767 mod G(x) */, }, + /* chunk_len=9728 */ + { 0x8ea1f680 /* x^233439 mod G(x) */, 0xf5ff6cdd /* x^155615 mod G(x) */, 0xbccfa2c1 /* x^77791 mod G(x) */, }, + /* chunk_len=9856 */ + { 0xe8cf3d2a /* x^236511 mod G(x) */, 0x338b1fb1 /* x^157663 mod G(x) */, 0xeda61f70 /* x^78815 mod G(x) */, }, + /* chunk_len=9984 */ + { 0x21f15b59 /* x^239583 mod G(x) */, 0xb9077a01 /* x^159711 mod G(x) */, 0x3e7c93b9 /* x^79839 mod G(x) */, }, + /* chunk_len=10112 */ + { 0x6f68d64a /* x^242655 mod G(x) */, 0x901b0161 /* x^161759 mod G(x) */, 0xb9fd3537 /* x^80863 mod G(x) */, }, + /* chunk_len=10240 */ + { 0x71b74d95 /* x^245727 mod G(x) */, 0xf5ddd5ad /* x^163807 mod G(x) */, 0x3e116c9d /* x^81887 mod G(x) */, }, + /* chunk_len=10368 */ + { 0x4c2e7261 /* x^248799 mod G(x) */, 0x4ca19a29 /* x^165855 mod G(x) */, 0x388b20ac /* x^82911 mod G(x) */, }, + /* chunk_len=10496 */ + { 0x8a2d38e8 /* x^251871 mod G(x) */, 0xd27ee0a1 /* x^167903 mod G(x) */, 0x408e57f2 /* x^83935 mod G(x) */, }, + /* chunk_len=10624 */ + { 0x7e58ca17 /* x^254943 mod G(x) */, 0x69dfedd2 /* x^169951 mod G(x) */, 0x3a76805e /* x^84959 mod G(x) */, }, + /* chunk_len=10752 */ + { 0xf997967f /* x^258015 mod G(x) */, 0x63c3d167 /* x^171999 mod G(x) */, 0x0956d953 /* x^85983 mod G(x) */, }, + /* chunk_len=10880 */ + { 0x48215963 /* x^261087 mod G(x) */, 0x71e1dfe0 /* x^174047 mod G(x) */, 0x42a6d410 /* x^87007 mod G(x) */, }, + /* chunk_len=11008 */ + { 0xa704b94c /* x^264159 mod G(x) */, 0x679f198a /* x^176095 mod G(x) */, 0x42ebf0ad /* x^88031 mod G(x) */, }, + /* chunk_len=11136 */ + { 0x1d699056 /* x^267231 mod G(x) */, 0xfeacf2a1 /* x^178143 mod G(x) */, 0x55cb4dfe /* x^89055 mod G(x) */, }, + /* chunk_len=11264 */ + { 0x6800bcc5 /* x^270303 mod G(x) */, 0x16024f15 /* x^180191 mod G(x) */, 0xcf3233e4 /* x^90079 mod G(x) */, }, + /* chunk_len=11392 */ + { 0x2d48e4ca /* x^273375 mod G(x) */, 0xbe61582f /* x^182239 mod G(x) */, 0x46026283 /* x^91103 mod G(x) */, }, + /* chunk_len=11520 */ + { 0x4c4c2b55 /* x^276447 mod G(x) */, 0x5539e44a /* x^184287 mod G(x) */, 0x52222fea /* x^92127 mod G(x) */, }, + /* chunk_len=11648 */ + { 0xd8ce94cb /* x^279519 mod G(x) */, 0xbc613c26 /* x^186335 mod G(x) */, 0x33776b4b /* x^93151 mod G(x) */, }, + /* chunk_len=11776 */ + { 0xd0b5a02b /* x^282591 mod G(x) */, 0x490d3cc6 /* x^188383 mod G(x) */, 0x2fde73f8 /* x^94175 mod G(x) */, }, + /* chunk_len=11904 */ + { 0xa223f7ec /* x^285663 mod G(x) */, 0xf0baeeb6 /* x^190431 mod G(x) */, 0x0603989b /* x^95199 mod G(x) */, }, + /* chunk_len=12032 */ + { 0x58de337a /* x^288735 mod G(x) */, 0x3bf3d597 /* x^192479 mod G(x) */, 0xced90d99 /* x^96223 mod G(x) */, }, + /* chunk_len=12160 */ + { 0x37f5d8f4 /* x^291807 mod G(x) */, 0x4d5b699b /* x^194527 mod G(x) */, 0xd7262e5f /* x^97247 mod G(x) */, }, + /* chunk_len=12288 */ + { 0xfa8a435d /* x^294879 mod G(x) */, 0x64f34a05 /* x^196575 mod G(x) */, 0x4470c029 /* x^98271 mod G(x) */, }, + /* chunk_len=12416 */ + { 0x238709fe /* x^297951 mod G(x) */, 0x52e7458f /* x^198623 mod G(x) */, 0x9a174cd3 /* x^99295 mod G(x) */, }, + /* chunk_len=12544 */ + { 0x9e1ba6f5 /* x^301023 mod G(x) */, 0xef0272f7 /* x^200671 mod G(x) */, 0x84f40beb /* x^100319 mod G(x) */, }, + /* 
chunk_len=12672 */ + { 0xcd8b57fa /* x^304095 mod G(x) */, 0x82adb0b8 /* x^202719 mod G(x) */, 0xb6f35093 /* x^101343 mod G(x) */, }, + /* chunk_len=12800 */ + { 0x0aed142f /* x^307167 mod G(x) */, 0xb1650290 /* x^204767 mod G(x) */, 0xec855937 /* x^102367 mod G(x) */, }, + /* chunk_len=12928 */ + { 0xd1f064db /* x^310239 mod G(x) */, 0x6e7340d3 /* x^206815 mod G(x) */, 0x5c28cb52 /* x^103391 mod G(x) */, }, + /* chunk_len=13056 */ + { 0x464ac895 /* x^313311 mod G(x) */, 0xd2fc57c3 /* x^208863 mod G(x) */, 0xc46805ba /* x^104415 mod G(x) */, }, + /* chunk_len=13184 */ + { 0xa0e6beea /* x^316383 mod G(x) */, 0xcfeec3d0 /* x^210911 mod G(x) */, 0x0225d214 /* x^105439 mod G(x) */, }, + /* chunk_len=13312 */ + { 0x78703ce0 /* x^319455 mod G(x) */, 0xc60f6075 /* x^212959 mod G(x) */, 0xdf7a24ac /* x^106463 mod G(x) */, }, + /* chunk_len=13440 */ + { 0xfea48165 /* x^322527 mod G(x) */, 0x3a9d1f97 /* x^215007 mod G(x) */, 0xc3876592 /* x^107487 mod G(x) */, }, + /* chunk_len=13568 */ + { 0xdb89b8db /* x^325599 mod G(x) */, 0xa6172211 /* x^217055 mod G(x) */, 0x2b52dc39 /* x^108511 mod G(x) */, }, + /* chunk_len=13696 */ + { 0x7ca03731 /* x^328671 mod G(x) */, 0x1db42849 /* x^219103 mod G(x) */, 0xc5df246e /* x^109535 mod G(x) */, }, + /* chunk_len=13824 */ + { 0x8801d0aa /* x^331743 mod G(x) */, 0xb57c7728 /* x^221151 mod G(x) */, 0x5b0c98b9 /* x^110559 mod G(x) */, }, + /* chunk_len=13952 */ + { 0xf89cd7f0 /* x^334815 mod G(x) */, 0xcc396a0b /* x^223199 mod G(x) */, 0xdb799c51 /* x^111583 mod G(x) */, }, + /* chunk_len=14080 */ + { 0x1611a808 /* x^337887 mod G(x) */, 0xaeae6105 /* x^225247 mod G(x) */, 0xb939fcdf /* x^112607 mod G(x) */, }, + /* chunk_len=14208 */ + { 0xe3cdb888 /* x^340959 mod G(x) */, 0x26387824 /* x^227295 mod G(x) */, 0x30d13e5f /* x^113631 mod G(x) */, }, + /* chunk_len=14336 */ + { 0x552a4cf6 /* x^344031 mod G(x) */, 0xee2d04bb /* x^229343 mod G(x) */, 0x70f9947d /* x^114655 mod G(x) */, }, + /* chunk_len=14464 */ + { 0x85e248e9 /* x^347103 mod G(x) */, 0x0a79663f /* x^231391 mod G(x) */, 0x53339cf7 /* x^115679 mod G(x) */, }, + /* chunk_len=14592 */ + { 0x1c61c3e9 /* x^350175 mod G(x) */, 0x8ea1f680 /* x^233439 mod G(x) */, 0x54afca53 /* x^116703 mod G(x) */, }, + /* chunk_len=14720 */ + { 0xb14cfc2b /* x^353247 mod G(x) */, 0x2e073302 /* x^235487 mod G(x) */, 0x10897992 /* x^117727 mod G(x) */, }, + /* chunk_len=14848 */ + { 0x6ec444cc /* x^356319 mod G(x) */, 0x9e819f13 /* x^237535 mod G(x) */, 0x7a3c0a6a /* x^118751 mod G(x) */, }, + /* chunk_len=14976 */ + { 0xe2fa5f80 /* x^359391 mod G(x) */, 0x21f15b59 /* x^239583 mod G(x) */, 0x93102436 /* x^119775 mod G(x) */, }, + /* chunk_len=15104 */ + { 0x6d33f4c6 /* x^362463 mod G(x) */, 0x31a27455 /* x^241631 mod G(x) */, 0x1fea4d2a /* x^120799 mod G(x) */, }, + /* chunk_len=15232 */ + { 0xb6dec609 /* x^365535 mod G(x) */, 0x4d437056 /* x^243679 mod G(x) */, 0x42eb1e2a /* x^121823 mod G(x) */, }, + /* chunk_len=15360 */ + { 0x1846c518 /* x^368607 mod G(x) */, 0x71b74d95 /* x^245727 mod G(x) */, 0xbd2655a8 /* x^122847 mod G(x) */, }, + /* chunk_len=15488 */ + { 0x9f947f8a /* x^371679 mod G(x) */, 0x2b501619 /* x^247775 mod G(x) */, 0xa4924b0e /* x^123871 mod G(x) */, }, + /* chunk_len=15616 */ + { 0xb7442f4d /* x^374751 mod G(x) */, 0xba30a5d8 /* x^249823 mod G(x) */, 0x4ff61aa1 /* x^124895 mod G(x) */, }, + /* chunk_len=15744 */ + { 0xe2c93242 /* x^377823 mod G(x) */, 0x8a2d38e8 /* x^251871 mod G(x) */, 0x70cd7f26 /* x^125919 mod G(x) */, }, + /* chunk_len=15872 */ + { 0xcd6863df /* x^380895 mod G(x) */, 0x78fd88dc /* 
x^253919 mod G(x) */, 0x7ae2f6f4 /* x^126943 mod G(x) */, }, + /* chunk_len=16000 */ + { 0xd512001d /* x^383967 mod G(x) */, 0xe6612dff /* x^255967 mod G(x) */, 0x5c4d0ca9 /* x^127967 mod G(x) */, }, + /* chunk_len=16128 */ + { 0x4e8d6b6c /* x^387039 mod G(x) */, 0xf997967f /* x^258015 mod G(x) */, 0x2d546c53 /* x^128991 mod G(x) */, }, + /* chunk_len=16256 */ + { 0xfa653ba1 /* x^390111 mod G(x) */, 0xc99014d4 /* x^260063 mod G(x) */, 0xa0c9fd27 /* x^130015 mod G(x) */, }, + /* chunk_len=16384 */ + { 0x49893408 /* x^393183 mod G(x) */, 0x29c2448b /* x^262111 mod G(x) */, 0xe0ee5efe /* x^131039 mod G(x) */, }, }; /* Multipliers for implementations that use a large fixed chunk length */ diff --git a/Sources/DEFLATE/crc32_pmull_wide.h b/Sources/DEFLATE/crc32_pmull_wide.h deleted file mode 100644 index a72e1d87..00000000 --- a/Sources/DEFLATE/crc32_pmull_wide.h +++ /dev/null @@ -1,227 +0,0 @@ -/* - * arm/crc32_pmull_wide.h - gzip CRC-32 with PMULL (extra-wide version) - * - * Copyright 2022 Eric Biggers - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -/* - * This file is a "template" for instantiating PMULL-based crc32_arm functions. - * The "parameters" are: - * - * SUFFIX: - * Name suffix to append to all instantiated functions. - * ATTRIBUTES: - * Target function attributes to use. - * ENABLE_EOR3: - * Use the eor3 instruction (from the sha3 extension). - * - * This is the extra-wide version; it uses an unusually large stride length of - * 12, and it assumes that crc32 instructions are available too. It's intended - * for powerful CPUs that support both pmull and crc32 instructions, but where - * throughput of pmull and xor (given enough instructions issued in parallel) is - * significantly higher than that of crc32, thus making the crc32 instructions - * (counterintuitively) not actually the fastest way to compute the CRC-32. The - * Apple M1 processor is an example of such a CPU. 
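Each entry in the table above is a folding constant of the form x^k mod G(x), where G(x) is the CRC-32 generator polynomial: carrylessly multiplying a partial CRC state by such a constant advances that state across a fixed number of message bytes, which is what lets the vectorized code fold several 128-bit accumulators in parallel and combine them only at the end. The values themselves are machine-generated. As a minimal sketch of the underlying arithmetic only -- the helper names (gf2_mul_x, xpow_mod_g) are illustrative rather than libdeflate APIs, the plain MSB-first bit order is assumed, and libdeflate's generated constants use its own reflected convention, so the printed value will not match the hex numbers above -- the exponentiation can be done like this:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/*
 * CRC-32 generator polynomial G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 +
 * x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x + 1, with the x^32 term left
 * implicit.  Bit 31 holds the coefficient of x^31 (MSB-first order).
 */
#define CRC32_GEN_POLY UINT32_C(0x04C11DB7)

/* Multiply a polynomial of degree < 32 by x, then reduce modulo G(x). */
static uint32_t gf2_mul_x(uint32_t a)
{
    uint32_t carry = a & UINT32_C(0x80000000);  /* coefficient of x^31 */

    a <<= 1;                     /* a(x) * x */
    if (carry)
        a ^= CRC32_GEN_POLY;     /* x^32 == the low 32 bits of G(x), mod G(x) */
    return a;
}

/*
 * Compute x^n mod G(x).  A real generator would use square-and-multiply, but a
 * linear loop illustrates the idea and still runs instantly for exponents in
 * the few-hundred-thousand range annotated in the table above.
 */
static uint32_t xpow_mod_g(unsigned long n)
{
    uint32_t r = 1;              /* x^0 */

    while (n--)
        r = gf2_mul_x(r);
    return r;
}

int main(void)
{
    /* 147423 is one of the exponents annotated above (chunk_len=6144). */
    printf("x^147423 mod G(x) = 0x%08" PRIx32 "\n", xpow_mod_g(147423));
    return 0;
}

The crc32_pmull_wide.h file removed just below consumed constants of exactly this kind through its fold_vec() helper: each 128-bit accumulator is carrylessly multiplied by a precomputed x^k value with PMULL and XORed with the next block of data, so up to twelve vectors' worth of CRC state can be carried in parallel before the final reduction.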
- */ -#ifndef _MSC_VER -# include <arm_acle.h> -#endif -#include <arm_neon.h> - -#include "crc32_pmull_helpers.h" - -static u32 ATTRIBUTES MAYBE_UNUSED -ADD_SUFFIX(crc32_arm)(u32 crc, const u8 *p, size_t len) -{ - uint8x16_t v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11; - - if (len < 3 * 192) { - static const u64 _aligned_attribute(16) mults[3][2] = { - CRC32_4VECS_MULTS, CRC32_2VECS_MULTS, CRC32_1VECS_MULTS, - }; - poly64x2_t multipliers_4, multipliers_2, multipliers_1; - - if (len < 64) - goto tail; - multipliers_4 = load_multipliers(mults[0]); - multipliers_2 = load_multipliers(mults[1]); - multipliers_1 = load_multipliers(mults[2]); - /* - * Short length; don't bother aligning the pointer, and fold - * 64 bytes (4 vectors) at a time, at most. - */ - v0 = veorq_u8(vld1q_u8(p + 0), u32_to_bytevec(crc)); - v1 = vld1q_u8(p + 16); - v2 = vld1q_u8(p + 32); - v3 = vld1q_u8(p + 48); - p += 64; - len -= 64; - while (len >= 64) { - v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_4); - v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_4); - v2 = fold_vec(v2, vld1q_u8(p + 32), multipliers_4); - v3 = fold_vec(v3, vld1q_u8(p + 48), multipliers_4); - p += 64; - len -= 64; - } - v0 = fold_vec(v0, v2, multipliers_2); - v1 = fold_vec(v1, v3, multipliers_2); - if (len >= 32) { - v0 = fold_vec(v0, vld1q_u8(p + 0), multipliers_2); - v1 = fold_vec(v1, vld1q_u8(p + 16), multipliers_2); - p += 32; - len -= 32; - } - v0 = fold_vec(v0, v1, multipliers_1); - } else { - static const u64 _aligned_attribute(16) mults[4][2] = { - CRC32_12VECS_MULTS, CRC32_6VECS_MULTS, - CRC32_3VECS_MULTS, CRC32_1VECS_MULTS, - }; - const poly64x2_t multipliers_12 = load_multipliers(mults[0]); - const poly64x2_t multipliers_6 = load_multipliers(mults[1]); - const poly64x2_t multipliers_3 = load_multipliers(mults[2]); - const poly64x2_t multipliers_1 = load_multipliers(mults[3]); - const size_t align = -(uintptr_t)p & 15; - const uint8x16_t *vp; - - /* Align p to the next 16-byte boundary. */ - if (align) { - if (align & 1) - crc = __crc32b(crc, *p++); - if (align & 2) { - crc = __crc32h(crc, le16_bswap(*(u16 *)p)); - p += 2; - } - if (align & 4) { - crc = __crc32w(crc, le32_bswap(*(u32 *)p)); - p += 4; - } - if (align & 8) { - crc = __crc32d(crc, le64_bswap(*(u64 *)p)); - p += 8; - } - len -= align; - } - vp = (const uint8x16_t *)p; - v0 = veorq_u8(*vp++, u32_to_bytevec(crc)); - v1 = *vp++; - v2 = *vp++; - v3 = *vp++; - v4 = *vp++; - v5 = *vp++; - v6 = *vp++; - v7 = *vp++; - v8 = *vp++; - v9 = *vp++; - v10 = *vp++; - v11 = *vp++; - len -= 192; - /* Fold 192 bytes (12 vectors) at a time. */ - do { - v0 = fold_vec(v0, *vp++, multipliers_12); - v1 = fold_vec(v1, *vp++, multipliers_12); - v2 = fold_vec(v2, *vp++, multipliers_12); - v3 = fold_vec(v3, *vp++, multipliers_12); - v4 = fold_vec(v4, *vp++, multipliers_12); - v5 = fold_vec(v5, *vp++, multipliers_12); - v6 = fold_vec(v6, *vp++, multipliers_12); - v7 = fold_vec(v7, *vp++, multipliers_12); - v8 = fold_vec(v8, *vp++, multipliers_12); - v9 = fold_vec(v9, *vp++, multipliers_12); - v10 = fold_vec(v10, *vp++, multipliers_12); - v11 = fold_vec(v11, *vp++, multipliers_12); - len -= 192; - } while (len >= 192); - - /* - * Fewer than 192 bytes left. Fold v0-v11 down to just v0, - * while processing up to 144 more bytes.
- */ - v0 = fold_vec(v0, v6, multipliers_6); - v1 = fold_vec(v1, v7, multipliers_6); - v2 = fold_vec(v2, v8, multipliers_6); - v3 = fold_vec(v3, v9, multipliers_6); - v4 = fold_vec(v4, v10, multipliers_6); - v5 = fold_vec(v5, v11, multipliers_6); - if (len >= 96) { - v0 = fold_vec(v0, *vp++, multipliers_6); - v1 = fold_vec(v1, *vp++, multipliers_6); - v2 = fold_vec(v2, *vp++, multipliers_6); - v3 = fold_vec(v3, *vp++, multipliers_6); - v4 = fold_vec(v4, *vp++, multipliers_6); - v5 = fold_vec(v5, *vp++, multipliers_6); - len -= 96; - } - v0 = fold_vec(v0, v3, multipliers_3); - v1 = fold_vec(v1, v4, multipliers_3); - v2 = fold_vec(v2, v5, multipliers_3); - if (len >= 48) { - v0 = fold_vec(v0, *vp++, multipliers_3); - v1 = fold_vec(v1, *vp++, multipliers_3); - v2 = fold_vec(v2, *vp++, multipliers_3); - len -= 48; - } - v0 = fold_vec(v0, v1, multipliers_1); - v0 = fold_vec(v0, v2, multipliers_1); - p = (const u8 *)vp; - } - /* Reduce 128 to 32 bits using crc32 instructions. */ - crc = __crc32d(0, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 0)); - crc = __crc32d(crc, vgetq_lane_u64(vreinterpretq_u64_u8(v0), 1)); -tail: - /* Finish up the remainder using crc32 instructions. */ - if (len & 32) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - crc = __crc32d(crc, get_unaligned_le64(p + 16)); - crc = __crc32d(crc, get_unaligned_le64(p + 24)); - p += 32; - } - if (len & 16) { - crc = __crc32d(crc, get_unaligned_le64(p + 0)); - crc = __crc32d(crc, get_unaligned_le64(p + 8)); - p += 16; - } - if (len & 8) { - crc = __crc32d(crc, get_unaligned_le64(p)); - p += 8; - } - if (len & 4) { - crc = __crc32w(crc, get_unaligned_le32(p)); - p += 4; - } - if (len & 2) { - crc = __crc32h(crc, get_unaligned_le16(p)); - p += 2; - } - if (len & 1) - crc = __crc32b(crc, *p); - return crc; -} - -#undef SUFFIX -#undef ATTRIBUTES -#undef ENABLE_EOR3 diff --git a/Sources/DEFLATE/crc32_tables.h b/Sources/DEFLATE/crc32_tables.h index 86228c72..5a4c1c96 100644 --- a/Sources/DEFLATE/crc32_tables.h +++ b/Sources/DEFLATE/crc32_tables.h @@ -5,583 +5,583 @@ */ static const u32 crc32_slice1_table[] MAYBE_UNUSED = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 
0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 
0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, }; static const u32 crc32_slice8_table[] MAYBE_UNUSED = { - 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, - 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, - 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, - 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, - 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, - 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, - 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, - 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, - 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, - 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, - 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, - 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, - 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, - 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, - 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, - 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, - 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, - 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, - 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, - 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, - 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, - 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, - 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, - 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, - 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, - 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, - 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, - 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, - 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, - 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, - 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, - 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, - 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, - 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, - 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, - 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, - 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, - 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, - 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, - 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, - 0xd6d6a3e8, 0xa1d1937e, 
0x38d8c2c4, 0x4fdff252, - 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, - 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, - 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, - 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, - 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, - 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, - 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, - 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, - 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, - 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, - 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, - 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, - 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, - 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, - 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, - 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, - 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, - 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, - 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, - 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, - 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, - 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, - 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, - 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, - 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, - 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, - 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, - 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192, - 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, - 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, - 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, - 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, - 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, - 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, - 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, - 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, - 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, - 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, - 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, - 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, - 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, - 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, - 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, - 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, - 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, - 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, - 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, - 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, - 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, - 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, - 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, - 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, - 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, - 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, - 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, - 0x3b83984b, 0x2298a90a, 0x09b5fac9, 0x10aecb88, - 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, - 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, - 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, - 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, - 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, - 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, - 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, - 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, - 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, - 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, - 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, - 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, - 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, - 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, - 0x4870e1b4, 0x516bd0f5, 
0x7a468336, 0x635db277, - 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, - 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, - 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, - 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, - 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc, - 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, - 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, - 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, - 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, - 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, - 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, - 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, - 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, - 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, - 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, - 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, - 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, - 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, - 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, - 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, - 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, - 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, - 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, - 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, - 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, - 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, - 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, - 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd, - 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, - 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, - 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, - 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, - 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, - 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, - 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, - 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, - 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, - 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, - 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, - 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, - 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, - 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, - 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, - 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, - 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, - 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, - 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, - 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, - 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, - 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, - 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, - 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, - 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, - 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, - 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, - 0xf4094194, 0xf5cb2ba3, 0xf78d95fa, 0xf64fffcd, - 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, - 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, - 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, - 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, - 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, - 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, - 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, - 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, - 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, - 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, - 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, - 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, - 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, - 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, - 0x839a6488, 0x82580ebf, 
0x801eb0e6, 0x81dcdad1, - 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, - 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, - 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, - 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, - 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, - 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, - 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, - 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, - 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, - 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, - 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, - 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, - 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, - 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, - 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, - 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, - 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, - 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, - 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, - 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, - 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, - 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, - 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, - 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, - 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, - 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, - 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, - 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c, - 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, - 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, - 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, - 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, - 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, - 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, - 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, - 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, - 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, - 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, - 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, - 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, - 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, - 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, - 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, - 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, - 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, - 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, - 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, - 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, - 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, - 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, - 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, - 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, - 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, - 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, - 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, - 0xd8c66675, 0x607a0110, 0x72cfaefe, 0xca73c99b, - 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, - 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, - 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, - 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, - 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, - 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, - 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, - 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, - 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, - 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, - 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, - 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, - 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, - 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, - 0x090481f0, 0xb1b8e695, 
0xa30d497b, 0x1bb12e1e, - 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, - 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, - 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, - 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, - 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, - 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, - 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, - 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, - 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, - 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, - 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, - 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, - 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, - 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, - 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, - 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, - 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, - 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, - 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, - 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, - 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, - 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, - 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, - 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, - 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, - 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, - 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d, - 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, - 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, - 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, - 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, - 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, - 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, - 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, - 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, - 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, - 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, - 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, - 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, - 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, - 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, - 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, - 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, - 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, - 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, - 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, - 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, - 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, - 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, - 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, - 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, - 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, - 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, - 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, - 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, - 0x7c75d999, 0x4115f029, 0x06b58af9, 0x3bd5a349, - 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, - 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, - 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, - 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, - 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, - 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, - 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, - 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, - 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, - 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, - 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, - 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, - 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, - 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, - 0x03d6029b, 0xc88ad13e, 
0x4e1ea390, 0x85427035, - 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, - 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, - 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e, - 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, - 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, - 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, - 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, - 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, - 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, - 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, - 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, - 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, - 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, - 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, - 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, - 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, - 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, - 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, - 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, - 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, - 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, - 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, - 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, - 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, - 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, - 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, - 0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, - 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, - 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, - 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, - 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, - 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, - 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, - 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, - 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, - 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, - 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, - 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, - 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, - 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, - 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, - 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, - 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, - 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, - 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, - 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, - 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, - 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, - 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, - 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, - 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, - 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, - 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, - 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, - 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, - 0x15921919, 0xdececabc, 0x585ab812, 0x93066bb7, - 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, - 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, - 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, - 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, - 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, - 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, - 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, - 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, - 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, - 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, - 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, - 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, - 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, - 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, - 0x081d53e8, 0xae6a585c, 
0x9f8242c1, 0x39f54975, - 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, - 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, - 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, - 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, - 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, - 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, - 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, - 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, - 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, - 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, - 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, - 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, - 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, - 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, - 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, - 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, - 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, - 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, - 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, - 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, - 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, - 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, - 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, - 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, - 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, - 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, - 0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6, - 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, - 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, - 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, - 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, - 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, - 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, - 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, - 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, - 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, - 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, - 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, - 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, - 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, - 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, - 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, - 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, - 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, - 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, - 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, - 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, - 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, - 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, - 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, - 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, - 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, - 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, - 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, - 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, - 0x57af154f, 0x9b0515d1, 0x158a1232, 0xd92012ac, - 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, - 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, - 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, - 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, - 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, - 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, - 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, - 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, - 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, - 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, - 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, - 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, - 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, - 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, - 0xfa78d958, 0x36d2d9c6, 
0xb85dde25, 0x74f7debb, - 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, - 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, - 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, - 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, - 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, - 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, - 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, - 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, - 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, - 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, - 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, - 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, - 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, - 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, - 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, - 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, - 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, - 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, - 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, - 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, - 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, - 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, - 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, - 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, - 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, - 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, - 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, - 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, - 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, - 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, - 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, - 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, - 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, - 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, - 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, - 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, - 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, - 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, - 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, - 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, - 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, - 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, - 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, - 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, - 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, + 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, + 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, + 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, + 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, + 0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, + 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, + 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, + 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, + 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, + 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, + 0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, + 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, + 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, + 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, + 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, + 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, + 0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, + 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, + 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, + 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, + 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, + 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, + 0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, + 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, + 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, + 0x4adfa541, 0x3dd895d7, 
0xa4d1c46d, 0xd3d6f4fb, + 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, + 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, + 0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, + 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, + 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, + 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, + 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, + 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, + 0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, + 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, + 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, + 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, + 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, + 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, + 0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, + 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, + 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, + 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, + 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, + 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, + 0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, + 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, + 0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, + 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, + 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, + 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, + 0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, + 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, + 0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, + 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, + 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, + 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, + 0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, + 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, + 0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, + 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, + 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, + 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d, + 0x00000000, 0x191b3141, 0x32366282, 0x2b2d53c3, + 0x646cc504, 0x7d77f445, 0x565aa786, 0x4f4196c7, + 0xc8d98a08, 0xd1c2bb49, 0xfaefe88a, 0xe3f4d9cb, + 0xacb54f0c, 0xb5ae7e4d, 0x9e832d8e, 0x87981ccf, + 0x4ac21251, 0x53d92310, 0x78f470d3, 0x61ef4192, + 0x2eaed755, 0x37b5e614, 0x1c98b5d7, 0x05838496, + 0x821b9859, 0x9b00a918, 0xb02dfadb, 0xa936cb9a, + 0xe6775d5d, 0xff6c6c1c, 0xd4413fdf, 0xcd5a0e9e, + 0x958424a2, 0x8c9f15e3, 0xa7b24620, 0xbea97761, + 0xf1e8e1a6, 0xe8f3d0e7, 0xc3de8324, 0xdac5b265, + 0x5d5daeaa, 0x44469feb, 0x6f6bcc28, 0x7670fd69, + 0x39316bae, 0x202a5aef, 0x0b07092c, 0x121c386d, + 0xdf4636f3, 0xc65d07b2, 0xed705471, 0xf46b6530, + 0xbb2af3f7, 0xa231c2b6, 0x891c9175, 0x9007a034, + 0x179fbcfb, 0x0e848dba, 0x25a9de79, 0x3cb2ef38, + 0x73f379ff, 0x6ae848be, 0x41c51b7d, 0x58de2a3c, + 0xf0794f05, 0xe9627e44, 0xc24f2d87, 0xdb541cc6, + 0x94158a01, 0x8d0ebb40, 0xa623e883, 0xbf38d9c2, + 0x38a0c50d, 0x21bbf44c, 0x0a96a78f, 0x138d96ce, + 0x5ccc0009, 0x45d73148, 0x6efa628b, 0x77e153ca, + 0xbabb5d54, 0xa3a06c15, 0x888d3fd6, 0x91960e97, + 0xded79850, 0xc7cca911, 0xece1fad2, 0xf5facb93, + 0x7262d75c, 0x6b79e61d, 0x4054b5de, 0x594f849f, + 0x160e1258, 0x0f152319, 0x243870da, 0x3d23419b, + 0x65fd6ba7, 0x7ce65ae6, 0x57cb0925, 0x4ed03864, + 0x0191aea3, 0x188a9fe2, 0x33a7cc21, 0x2abcfd60, + 0xad24e1af, 0xb43fd0ee, 0x9f12832d, 0x8609b26c, + 0xc94824ab, 0xd05315ea, 0xfb7e4629, 0xe2657768, + 0x2f3f79f6, 0x362448b7, 0x1d091b74, 0x04122a35, + 0x4b53bcf2, 0x52488db3, 0x7965de70, 0x607eef31, + 0xe7e6f3fe, 0xfefdc2bf, 0xd5d0917c, 0xcccba03d, + 0x838a36fa, 0x9a9107bb, 0xb1bc5478, 0xa8a76539, + 0x3b83984b, 0x2298a90a, 
0x09b5fac9, 0x10aecb88, + 0x5fef5d4f, 0x46f46c0e, 0x6dd93fcd, 0x74c20e8c, + 0xf35a1243, 0xea412302, 0xc16c70c1, 0xd8774180, + 0x9736d747, 0x8e2de606, 0xa500b5c5, 0xbc1b8484, + 0x71418a1a, 0x685abb5b, 0x4377e898, 0x5a6cd9d9, + 0x152d4f1e, 0x0c367e5f, 0x271b2d9c, 0x3e001cdd, + 0xb9980012, 0xa0833153, 0x8bae6290, 0x92b553d1, + 0xddf4c516, 0xc4eff457, 0xefc2a794, 0xf6d996d5, + 0xae07bce9, 0xb71c8da8, 0x9c31de6b, 0x852aef2a, + 0xca6b79ed, 0xd37048ac, 0xf85d1b6f, 0xe1462a2e, + 0x66de36e1, 0x7fc507a0, 0x54e85463, 0x4df36522, + 0x02b2f3e5, 0x1ba9c2a4, 0x30849167, 0x299fa026, + 0xe4c5aeb8, 0xfdde9ff9, 0xd6f3cc3a, 0xcfe8fd7b, + 0x80a96bbc, 0x99b25afd, 0xb29f093e, 0xab84387f, + 0x2c1c24b0, 0x350715f1, 0x1e2a4632, 0x07317773, + 0x4870e1b4, 0x516bd0f5, 0x7a468336, 0x635db277, + 0xcbfad74e, 0xd2e1e60f, 0xf9ccb5cc, 0xe0d7848d, + 0xaf96124a, 0xb68d230b, 0x9da070c8, 0x84bb4189, + 0x03235d46, 0x1a386c07, 0x31153fc4, 0x280e0e85, + 0x674f9842, 0x7e54a903, 0x5579fac0, 0x4c62cb81, + 0x8138c51f, 0x9823f45e, 0xb30ea79d, 0xaa1596dc, + 0xe554001b, 0xfc4f315a, 0xd7626299, 0xce7953d8, + 0x49e14f17, 0x50fa7e56, 0x7bd72d95, 0x62cc1cd4, + 0x2d8d8a13, 0x3496bb52, 0x1fbbe891, 0x06a0d9d0, + 0x5e7ef3ec, 0x4765c2ad, 0x6c48916e, 0x7553a02f, + 0x3a1236e8, 0x230907a9, 0x0824546a, 0x113f652b, + 0x96a779e4, 0x8fbc48a5, 0xa4911b66, 0xbd8a2a27, + 0xf2cbbce0, 0xebd08da1, 0xc0fdde62, 0xd9e6ef23, + 0x14bce1bd, 0x0da7d0fc, 0x268a833f, 0x3f91b27e, + 0x70d024b9, 0x69cb15f8, 0x42e6463b, 0x5bfd777a, + 0xdc656bb5, 0xc57e5af4, 0xee530937, 0xf7483876, + 0xb809aeb1, 0xa1129ff0, 0x8a3fcc33, 0x9324fd72, + 0x00000000, 0x01c26a37, 0x0384d46e, 0x0246be59, + 0x0709a8dc, 0x06cbc2eb, 0x048d7cb2, 0x054f1685, + 0x0e1351b8, 0x0fd13b8f, 0x0d9785d6, 0x0c55efe1, + 0x091af964, 0x08d89353, 0x0a9e2d0a, 0x0b5c473d, + 0x1c26a370, 0x1de4c947, 0x1fa2771e, 0x1e601d29, + 0x1b2f0bac, 0x1aed619b, 0x18abdfc2, 0x1969b5f5, + 0x1235f2c8, 0x13f798ff, 0x11b126a6, 0x10734c91, + 0x153c5a14, 0x14fe3023, 0x16b88e7a, 0x177ae44d, + 0x384d46e0, 0x398f2cd7, 0x3bc9928e, 0x3a0bf8b9, + 0x3f44ee3c, 0x3e86840b, 0x3cc03a52, 0x3d025065, + 0x365e1758, 0x379c7d6f, 0x35dac336, 0x3418a901, + 0x3157bf84, 0x3095d5b3, 0x32d36bea, 0x331101dd, + 0x246be590, 0x25a98fa7, 0x27ef31fe, 0x262d5bc9, + 0x23624d4c, 0x22a0277b, 0x20e69922, 0x2124f315, + 0x2a78b428, 0x2bbade1f, 0x29fc6046, 0x283e0a71, + 0x2d711cf4, 0x2cb376c3, 0x2ef5c89a, 0x2f37a2ad, + 0x709a8dc0, 0x7158e7f7, 0x731e59ae, 0x72dc3399, + 0x7793251c, 0x76514f2b, 0x7417f172, 0x75d59b45, + 0x7e89dc78, 0x7f4bb64f, 0x7d0d0816, 0x7ccf6221, + 0x798074a4, 0x78421e93, 0x7a04a0ca, 0x7bc6cafd, + 0x6cbc2eb0, 0x6d7e4487, 0x6f38fade, 0x6efa90e9, + 0x6bb5866c, 0x6a77ec5b, 0x68315202, 0x69f33835, + 0x62af7f08, 0x636d153f, 0x612bab66, 0x60e9c151, + 0x65a6d7d4, 0x6464bde3, 0x662203ba, 0x67e0698d, + 0x48d7cb20, 0x4915a117, 0x4b531f4e, 0x4a917579, + 0x4fde63fc, 0x4e1c09cb, 0x4c5ab792, 0x4d98dda5, + 0x46c49a98, 0x4706f0af, 0x45404ef6, 0x448224c1, + 0x41cd3244, 0x400f5873, 0x4249e62a, 0x438b8c1d, + 0x54f16850, 0x55330267, 0x5775bc3e, 0x56b7d609, + 0x53f8c08c, 0x523aaabb, 0x507c14e2, 0x51be7ed5, + 0x5ae239e8, 0x5b2053df, 0x5966ed86, 0x58a487b1, + 0x5deb9134, 0x5c29fb03, 0x5e6f455a, 0x5fad2f6d, + 0xe1351b80, 0xe0f771b7, 0xe2b1cfee, 0xe373a5d9, + 0xe63cb35c, 0xe7fed96b, 0xe5b86732, 0xe47a0d05, + 0xef264a38, 0xeee4200f, 0xeca29e56, 0xed60f461, + 0xe82fe2e4, 0xe9ed88d3, 0xebab368a, 0xea695cbd, + 0xfd13b8f0, 0xfcd1d2c7, 0xfe976c9e, 0xff5506a9, + 0xfa1a102c, 0xfbd87a1b, 0xf99ec442, 0xf85cae75, + 0xf300e948, 0xf2c2837f, 0xf0843d26, 0xf1465711, + 0xf4094194, 0xf5cb2ba3, 
0xf78d95fa, 0xf64fffcd, + 0xd9785d60, 0xd8ba3757, 0xdafc890e, 0xdb3ee339, + 0xde71f5bc, 0xdfb39f8b, 0xddf521d2, 0xdc374be5, + 0xd76b0cd8, 0xd6a966ef, 0xd4efd8b6, 0xd52db281, + 0xd062a404, 0xd1a0ce33, 0xd3e6706a, 0xd2241a5d, + 0xc55efe10, 0xc49c9427, 0xc6da2a7e, 0xc7184049, + 0xc25756cc, 0xc3953cfb, 0xc1d382a2, 0xc011e895, + 0xcb4dafa8, 0xca8fc59f, 0xc8c97bc6, 0xc90b11f1, + 0xcc440774, 0xcd866d43, 0xcfc0d31a, 0xce02b92d, + 0x91af9640, 0x906dfc77, 0x922b422e, 0x93e92819, + 0x96a63e9c, 0x976454ab, 0x9522eaf2, 0x94e080c5, + 0x9fbcc7f8, 0x9e7eadcf, 0x9c381396, 0x9dfa79a1, + 0x98b56f24, 0x99770513, 0x9b31bb4a, 0x9af3d17d, + 0x8d893530, 0x8c4b5f07, 0x8e0de15e, 0x8fcf8b69, + 0x8a809dec, 0x8b42f7db, 0x89044982, 0x88c623b5, + 0x839a6488, 0x82580ebf, 0x801eb0e6, 0x81dcdad1, + 0x8493cc54, 0x8551a663, 0x8717183a, 0x86d5720d, + 0xa9e2d0a0, 0xa820ba97, 0xaa6604ce, 0xaba46ef9, + 0xaeeb787c, 0xaf29124b, 0xad6fac12, 0xacadc625, + 0xa7f18118, 0xa633eb2f, 0xa4755576, 0xa5b73f41, + 0xa0f829c4, 0xa13a43f3, 0xa37cfdaa, 0xa2be979d, + 0xb5c473d0, 0xb40619e7, 0xb640a7be, 0xb782cd89, + 0xb2cddb0c, 0xb30fb13b, 0xb1490f62, 0xb08b6555, + 0xbbd72268, 0xba15485f, 0xb853f606, 0xb9919c31, + 0xbcde8ab4, 0xbd1ce083, 0xbf5a5eda, 0xbe9834ed, + 0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, + 0x8f629757, 0x37def032, 0x256b5fdc, 0x9dd738b9, + 0xc5b428ef, 0x7d084f8a, 0x6fbde064, 0xd7018701, + 0x4ad6bfb8, 0xf26ad8dd, 0xe0df7733, 0x58631056, + 0x5019579f, 0xe8a530fa, 0xfa109f14, 0x42acf871, + 0xdf7bc0c8, 0x67c7a7ad, 0x75720843, 0xcdce6f26, + 0x95ad7f70, 0x2d111815, 0x3fa4b7fb, 0x8718d09e, + 0x1acfe827, 0xa2738f42, 0xb0c620ac, 0x087a47c9, + 0xa032af3e, 0x188ec85b, 0x0a3b67b5, 0xb28700d0, + 0x2f503869, 0x97ec5f0c, 0x8559f0e2, 0x3de59787, + 0x658687d1, 0xdd3ae0b4, 0xcf8f4f5a, 0x7733283f, + 0xeae41086, 0x525877e3, 0x40edd80d, 0xf851bf68, + 0xf02bf8a1, 0x48979fc4, 0x5a22302a, 0xe29e574f, + 0x7f496ff6, 0xc7f50893, 0xd540a77d, 0x6dfcc018, + 0x359fd04e, 0x8d23b72b, 0x9f9618c5, 0x272a7fa0, + 0xbafd4719, 0x0241207c, 0x10f48f92, 0xa848e8f7, + 0x9b14583d, 0x23a83f58, 0x311d90b6, 0x89a1f7d3, + 0x1476cf6a, 0xaccaa80f, 0xbe7f07e1, 0x06c36084, + 0x5ea070d2, 0xe61c17b7, 0xf4a9b859, 0x4c15df3c, + 0xd1c2e785, 0x697e80e0, 0x7bcb2f0e, 0xc377486b, + 0xcb0d0fa2, 0x73b168c7, 0x6104c729, 0xd9b8a04c, + 0x446f98f5, 0xfcd3ff90, 0xee66507e, 0x56da371b, + 0x0eb9274d, 0xb6054028, 0xa4b0efc6, 0x1c0c88a3, + 0x81dbb01a, 0x3967d77f, 0x2bd27891, 0x936e1ff4, + 0x3b26f703, 0x839a9066, 0x912f3f88, 0x299358ed, + 0xb4446054, 0x0cf80731, 0x1e4da8df, 0xa6f1cfba, + 0xfe92dfec, 0x462eb889, 0x549b1767, 0xec277002, + 0x71f048bb, 0xc94c2fde, 0xdbf98030, 0x6345e755, + 0x6b3fa09c, 0xd383c7f9, 0xc1366817, 0x798a0f72, + 0xe45d37cb, 0x5ce150ae, 0x4e54ff40, 0xf6e89825, + 0xae8b8873, 0x1637ef16, 0x048240f8, 0xbc3e279d, + 0x21e91f24, 0x99557841, 0x8be0d7af, 0x335cb0ca, + 0xed59b63b, 0x55e5d15e, 0x47507eb0, 0xffec19d5, + 0x623b216c, 0xda874609, 0xc832e9e7, 0x708e8e82, + 0x28ed9ed4, 0x9051f9b1, 0x82e4565f, 0x3a58313a, + 0xa78f0983, 0x1f336ee6, 0x0d86c108, 0xb53aa66d, + 0xbd40e1a4, 0x05fc86c1, 0x1749292f, 0xaff54e4a, + 0x322276f3, 0x8a9e1196, 0x982bbe78, 0x2097d91d, + 0x78f4c94b, 0xc048ae2e, 0xd2fd01c0, 0x6a4166a5, + 0xf7965e1c, 0x4f2a3979, 0x5d9f9697, 0xe523f1f2, + 0x4d6b1905, 0xf5d77e60, 0xe762d18e, 0x5fdeb6eb, + 0xc2098e52, 0x7ab5e937, 0x680046d9, 0xd0bc21bc, + 0x88df31ea, 0x3063568f, 0x22d6f961, 0x9a6a9e04, + 0x07bda6bd, 0xbf01c1d8, 0xadb46e36, 0x15080953, + 0x1d724e9a, 0xa5ce29ff, 0xb77b8611, 0x0fc7e174, + 0x9210d9cd, 0x2aacbea8, 0x38191146, 0x80a57623, + 0xd8c66675, 0x607a0110, 
0x72cfaefe, 0xca73c99b, + 0x57a4f122, 0xef189647, 0xfdad39a9, 0x45115ecc, + 0x764dee06, 0xcef18963, 0xdc44268d, 0x64f841e8, + 0xf92f7951, 0x41931e34, 0x5326b1da, 0xeb9ad6bf, + 0xb3f9c6e9, 0x0b45a18c, 0x19f00e62, 0xa14c6907, + 0x3c9b51be, 0x842736db, 0x96929935, 0x2e2efe50, + 0x2654b999, 0x9ee8defc, 0x8c5d7112, 0x34e11677, + 0xa9362ece, 0x118a49ab, 0x033fe645, 0xbb838120, + 0xe3e09176, 0x5b5cf613, 0x49e959fd, 0xf1553e98, + 0x6c820621, 0xd43e6144, 0xc68bceaa, 0x7e37a9cf, + 0xd67f4138, 0x6ec3265d, 0x7c7689b3, 0xc4caeed6, + 0x591dd66f, 0xe1a1b10a, 0xf3141ee4, 0x4ba87981, + 0x13cb69d7, 0xab770eb2, 0xb9c2a15c, 0x017ec639, + 0x9ca9fe80, 0x241599e5, 0x36a0360b, 0x8e1c516e, + 0x866616a7, 0x3eda71c2, 0x2c6fde2c, 0x94d3b949, + 0x090481f0, 0xb1b8e695, 0xa30d497b, 0x1bb12e1e, + 0x43d23e48, 0xfb6e592d, 0xe9dbf6c3, 0x516791a6, + 0xccb0a91f, 0x740cce7a, 0x66b96194, 0xde0506f1, + 0x00000000, 0x3d6029b0, 0x7ac05360, 0x47a07ad0, + 0xf580a6c0, 0xc8e08f70, 0x8f40f5a0, 0xb220dc10, + 0x30704bc1, 0x0d106271, 0x4ab018a1, 0x77d03111, + 0xc5f0ed01, 0xf890c4b1, 0xbf30be61, 0x825097d1, + 0x60e09782, 0x5d80be32, 0x1a20c4e2, 0x2740ed52, + 0x95603142, 0xa80018f2, 0xefa06222, 0xd2c04b92, + 0x5090dc43, 0x6df0f5f3, 0x2a508f23, 0x1730a693, + 0xa5107a83, 0x98705333, 0xdfd029e3, 0xe2b00053, + 0xc1c12f04, 0xfca106b4, 0xbb017c64, 0x866155d4, + 0x344189c4, 0x0921a074, 0x4e81daa4, 0x73e1f314, + 0xf1b164c5, 0xccd14d75, 0x8b7137a5, 0xb6111e15, + 0x0431c205, 0x3951ebb5, 0x7ef19165, 0x4391b8d5, + 0xa121b886, 0x9c419136, 0xdbe1ebe6, 0xe681c256, + 0x54a11e46, 0x69c137f6, 0x2e614d26, 0x13016496, + 0x9151f347, 0xac31daf7, 0xeb91a027, 0xd6f18997, + 0x64d15587, 0x59b17c37, 0x1e1106e7, 0x23712f57, + 0x58f35849, 0x659371f9, 0x22330b29, 0x1f532299, + 0xad73fe89, 0x9013d739, 0xd7b3ade9, 0xead38459, + 0x68831388, 0x55e33a38, 0x124340e8, 0x2f236958, + 0x9d03b548, 0xa0639cf8, 0xe7c3e628, 0xdaa3cf98, + 0x3813cfcb, 0x0573e67b, 0x42d39cab, 0x7fb3b51b, + 0xcd93690b, 0xf0f340bb, 0xb7533a6b, 0x8a3313db, + 0x0863840a, 0x3503adba, 0x72a3d76a, 0x4fc3feda, + 0xfde322ca, 0xc0830b7a, 0x872371aa, 0xba43581a, + 0x9932774d, 0xa4525efd, 0xe3f2242d, 0xde920d9d, + 0x6cb2d18d, 0x51d2f83d, 0x167282ed, 0x2b12ab5d, + 0xa9423c8c, 0x9422153c, 0xd3826fec, 0xeee2465c, + 0x5cc29a4c, 0x61a2b3fc, 0x2602c92c, 0x1b62e09c, + 0xf9d2e0cf, 0xc4b2c97f, 0x8312b3af, 0xbe729a1f, + 0x0c52460f, 0x31326fbf, 0x7692156f, 0x4bf23cdf, + 0xc9a2ab0e, 0xf4c282be, 0xb362f86e, 0x8e02d1de, + 0x3c220dce, 0x0142247e, 0x46e25eae, 0x7b82771e, + 0xb1e6b092, 0x8c869922, 0xcb26e3f2, 0xf646ca42, + 0x44661652, 0x79063fe2, 0x3ea64532, 0x03c66c82, + 0x8196fb53, 0xbcf6d2e3, 0xfb56a833, 0xc6368183, + 0x74165d93, 0x49767423, 0x0ed60ef3, 0x33b62743, + 0xd1062710, 0xec660ea0, 0xabc67470, 0x96a65dc0, + 0x248681d0, 0x19e6a860, 0x5e46d2b0, 0x6326fb00, + 0xe1766cd1, 0xdc164561, 0x9bb63fb1, 0xa6d61601, + 0x14f6ca11, 0x2996e3a1, 0x6e369971, 0x5356b0c1, + 0x70279f96, 0x4d47b626, 0x0ae7ccf6, 0x3787e546, + 0x85a73956, 0xb8c710e6, 0xff676a36, 0xc2074386, + 0x4057d457, 0x7d37fde7, 0x3a978737, 0x07f7ae87, + 0xb5d77297, 0x88b75b27, 0xcf1721f7, 0xf2770847, + 0x10c70814, 0x2da721a4, 0x6a075b74, 0x576772c4, + 0xe547aed4, 0xd8278764, 0x9f87fdb4, 0xa2e7d404, + 0x20b743d5, 0x1dd76a65, 0x5a7710b5, 0x67173905, + 0xd537e515, 0xe857cca5, 0xaff7b675, 0x92979fc5, + 0xe915e8db, 0xd475c16b, 0x93d5bbbb, 0xaeb5920b, + 0x1c954e1b, 0x21f567ab, 0x66551d7b, 0x5b3534cb, + 0xd965a31a, 0xe4058aaa, 0xa3a5f07a, 0x9ec5d9ca, + 0x2ce505da, 0x11852c6a, 0x562556ba, 0x6b457f0a, + 0x89f57f59, 0xb49556e9, 0xf3352c39, 0xce550589, + 0x7c75d999, 0x4115f029, 
0x06b58af9, 0x3bd5a349, + 0xb9853498, 0x84e51d28, 0xc34567f8, 0xfe254e48, + 0x4c059258, 0x7165bbe8, 0x36c5c138, 0x0ba5e888, + 0x28d4c7df, 0x15b4ee6f, 0x521494bf, 0x6f74bd0f, + 0xdd54611f, 0xe03448af, 0xa794327f, 0x9af41bcf, + 0x18a48c1e, 0x25c4a5ae, 0x6264df7e, 0x5f04f6ce, + 0xed242ade, 0xd044036e, 0x97e479be, 0xaa84500e, + 0x4834505d, 0x755479ed, 0x32f4033d, 0x0f942a8d, + 0xbdb4f69d, 0x80d4df2d, 0xc774a5fd, 0xfa148c4d, + 0x78441b9c, 0x4524322c, 0x028448fc, 0x3fe4614c, + 0x8dc4bd5c, 0xb0a494ec, 0xf704ee3c, 0xca64c78c, + 0x00000000, 0xcb5cd3a5, 0x4dc8a10b, 0x869472ae, + 0x9b914216, 0x50cd91b3, 0xd659e31d, 0x1d0530b8, + 0xec53826d, 0x270f51c8, 0xa19b2366, 0x6ac7f0c3, + 0x77c2c07b, 0xbc9e13de, 0x3a0a6170, 0xf156b2d5, + 0x03d6029b, 0xc88ad13e, 0x4e1ea390, 0x85427035, + 0x9847408d, 0x531b9328, 0xd58fe186, 0x1ed33223, + 0xef8580f6, 0x24d95353, 0xa24d21fd, 0x6911f258, + 0x7414c2e0, 0xbf481145, 0x39dc63eb, 0xf280b04e, + 0x07ac0536, 0xccf0d693, 0x4a64a43d, 0x81387798, + 0x9c3d4720, 0x57619485, 0xd1f5e62b, 0x1aa9358e, + 0xebff875b, 0x20a354fe, 0xa6372650, 0x6d6bf5f5, + 0x706ec54d, 0xbb3216e8, 0x3da66446, 0xf6fab7e3, + 0x047a07ad, 0xcf26d408, 0x49b2a6a6, 0x82ee7503, + 0x9feb45bb, 0x54b7961e, 0xd223e4b0, 0x197f3715, + 0xe82985c0, 0x23755665, 0xa5e124cb, 0x6ebdf76e, + 0x73b8c7d6, 0xb8e41473, 0x3e7066dd, 0xf52cb578, + 0x0f580a6c, 0xc404d9c9, 0x4290ab67, 0x89cc78c2, + 0x94c9487a, 0x5f959bdf, 0xd901e971, 0x125d3ad4, + 0xe30b8801, 0x28575ba4, 0xaec3290a, 0x659ffaaf, + 0x789aca17, 0xb3c619b2, 0x35526b1c, 0xfe0eb8b9, + 0x0c8e08f7, 0xc7d2db52, 0x4146a9fc, 0x8a1a7a59, + 0x971f4ae1, 0x5c439944, 0xdad7ebea, 0x118b384f, + 0xe0dd8a9a, 0x2b81593f, 0xad152b91, 0x6649f834, + 0x7b4cc88c, 0xb0101b29, 0x36846987, 0xfdd8ba22, + 0x08f40f5a, 0xc3a8dcff, 0x453cae51, 0x8e607df4, + 0x93654d4c, 0x58399ee9, 0xdeadec47, 0x15f13fe2, + 0xe4a78d37, 0x2ffb5e92, 0xa96f2c3c, 0x6233ff99, + 0x7f36cf21, 0xb46a1c84, 0x32fe6e2a, 0xf9a2bd8f, + 0x0b220dc1, 0xc07ede64, 0x46eaacca, 0x8db67f6f, + 0x90b34fd7, 0x5bef9c72, 0xdd7beedc, 0x16273d79, + 0xe7718fac, 0x2c2d5c09, 0xaab92ea7, 0x61e5fd02, + 0x7ce0cdba, 0xb7bc1e1f, 0x31286cb1, 0xfa74bf14, + 0x1eb014d8, 0xd5ecc77d, 0x5378b5d3, 0x98246676, + 0x852156ce, 0x4e7d856b, 0xc8e9f7c5, 0x03b52460, + 0xf2e396b5, 0x39bf4510, 0xbf2b37be, 0x7477e41b, + 0x6972d4a3, 0xa22e0706, 0x24ba75a8, 0xefe6a60d, + 0x1d661643, 0xd63ac5e6, 0x50aeb748, 0x9bf264ed, + 0x86f75455, 0x4dab87f0, 0xcb3ff55e, 0x006326fb, + 0xf135942e, 0x3a69478b, 0xbcfd3525, 0x77a1e680, + 0x6aa4d638, 0xa1f8059d, 0x276c7733, 0xec30a496, + 0x191c11ee, 0xd240c24b, 0x54d4b0e5, 0x9f886340, + 0x828d53f8, 0x49d1805d, 0xcf45f2f3, 0x04192156, + 0xf54f9383, 0x3e134026, 0xb8873288, 0x73dbe12d, + 0x6eded195, 0xa5820230, 0x2316709e, 0xe84aa33b, + 0x1aca1375, 0xd196c0d0, 0x5702b27e, 0x9c5e61db, + 0x815b5163, 0x4a0782c6, 0xcc93f068, 0x07cf23cd, + 0xf6999118, 0x3dc542bd, 0xbb513013, 0x700de3b6, + 0x6d08d30e, 0xa65400ab, 0x20c07205, 0xeb9ca1a0, + 0x11e81eb4, 0xdab4cd11, 0x5c20bfbf, 0x977c6c1a, + 0x8a795ca2, 0x41258f07, 0xc7b1fda9, 0x0ced2e0c, + 0xfdbb9cd9, 0x36e74f7c, 0xb0733dd2, 0x7b2fee77, + 0x662adecf, 0xad760d6a, 0x2be27fc4, 0xe0beac61, + 0x123e1c2f, 0xd962cf8a, 0x5ff6bd24, 0x94aa6e81, + 0x89af5e39, 0x42f38d9c, 0xc467ff32, 0x0f3b2c97, + 0xfe6d9e42, 0x35314de7, 0xb3a53f49, 0x78f9ecec, + 0x65fcdc54, 0xaea00ff1, 0x28347d5f, 0xe368aefa, + 0x16441b82, 0xdd18c827, 0x5b8cba89, 0x90d0692c, + 0x8dd55994, 0x46898a31, 0xc01df89f, 0x0b412b3a, + 0xfa1799ef, 0x314b4a4a, 0xb7df38e4, 0x7c83eb41, + 0x6186dbf9, 0xaada085c, 0x2c4e7af2, 0xe712a957, + 0x15921919, 0xdececabc, 
0x585ab812, 0x93066bb7, + 0x8e035b0f, 0x455f88aa, 0xc3cbfa04, 0x089729a1, + 0xf9c19b74, 0x329d48d1, 0xb4093a7f, 0x7f55e9da, + 0x6250d962, 0xa90c0ac7, 0x2f987869, 0xe4c4abcc, + 0x00000000, 0xa6770bb4, 0x979f1129, 0x31e81a9d, + 0xf44f2413, 0x52382fa7, 0x63d0353a, 0xc5a73e8e, + 0x33ef4e67, 0x959845d3, 0xa4705f4e, 0x020754fa, + 0xc7a06a74, 0x61d761c0, 0x503f7b5d, 0xf64870e9, + 0x67de9cce, 0xc1a9977a, 0xf0418de7, 0x56368653, + 0x9391b8dd, 0x35e6b369, 0x040ea9f4, 0xa279a240, + 0x5431d2a9, 0xf246d91d, 0xc3aec380, 0x65d9c834, + 0xa07ef6ba, 0x0609fd0e, 0x37e1e793, 0x9196ec27, + 0xcfbd399c, 0x69ca3228, 0x582228b5, 0xfe552301, + 0x3bf21d8f, 0x9d85163b, 0xac6d0ca6, 0x0a1a0712, + 0xfc5277fb, 0x5a257c4f, 0x6bcd66d2, 0xcdba6d66, + 0x081d53e8, 0xae6a585c, 0x9f8242c1, 0x39f54975, + 0xa863a552, 0x0e14aee6, 0x3ffcb47b, 0x998bbfcf, + 0x5c2c8141, 0xfa5b8af5, 0xcbb39068, 0x6dc49bdc, + 0x9b8ceb35, 0x3dfbe081, 0x0c13fa1c, 0xaa64f1a8, + 0x6fc3cf26, 0xc9b4c492, 0xf85cde0f, 0x5e2bd5bb, + 0x440b7579, 0xe27c7ecd, 0xd3946450, 0x75e36fe4, + 0xb044516a, 0x16335ade, 0x27db4043, 0x81ac4bf7, + 0x77e43b1e, 0xd19330aa, 0xe07b2a37, 0x460c2183, + 0x83ab1f0d, 0x25dc14b9, 0x14340e24, 0xb2430590, + 0x23d5e9b7, 0x85a2e203, 0xb44af89e, 0x123df32a, + 0xd79acda4, 0x71edc610, 0x4005dc8d, 0xe672d739, + 0x103aa7d0, 0xb64dac64, 0x87a5b6f9, 0x21d2bd4d, + 0xe47583c3, 0x42028877, 0x73ea92ea, 0xd59d995e, + 0x8bb64ce5, 0x2dc14751, 0x1c295dcc, 0xba5e5678, + 0x7ff968f6, 0xd98e6342, 0xe86679df, 0x4e11726b, + 0xb8590282, 0x1e2e0936, 0x2fc613ab, 0x89b1181f, + 0x4c162691, 0xea612d25, 0xdb8937b8, 0x7dfe3c0c, + 0xec68d02b, 0x4a1fdb9f, 0x7bf7c102, 0xdd80cab6, + 0x1827f438, 0xbe50ff8c, 0x8fb8e511, 0x29cfeea5, + 0xdf879e4c, 0x79f095f8, 0x48188f65, 0xee6f84d1, + 0x2bc8ba5f, 0x8dbfb1eb, 0xbc57ab76, 0x1a20a0c2, + 0x8816eaf2, 0x2e61e146, 0x1f89fbdb, 0xb9fef06f, + 0x7c59cee1, 0xda2ec555, 0xebc6dfc8, 0x4db1d47c, + 0xbbf9a495, 0x1d8eaf21, 0x2c66b5bc, 0x8a11be08, + 0x4fb68086, 0xe9c18b32, 0xd82991af, 0x7e5e9a1b, + 0xefc8763c, 0x49bf7d88, 0x78576715, 0xde206ca1, + 0x1b87522f, 0xbdf0599b, 0x8c184306, 0x2a6f48b2, + 0xdc27385b, 0x7a5033ef, 0x4bb82972, 0xedcf22c6, + 0x28681c48, 0x8e1f17fc, 0xbff70d61, 0x198006d5, + 0x47abd36e, 0xe1dcd8da, 0xd034c247, 0x7643c9f3, + 0xb3e4f77d, 0x1593fcc9, 0x247be654, 0x820cede0, + 0x74449d09, 0xd23396bd, 0xe3db8c20, 0x45ac8794, + 0x800bb91a, 0x267cb2ae, 0x1794a833, 0xb1e3a387, + 0x20754fa0, 0x86024414, 0xb7ea5e89, 0x119d553d, + 0xd43a6bb3, 0x724d6007, 0x43a57a9a, 0xe5d2712e, + 0x139a01c7, 0xb5ed0a73, 0x840510ee, 0x22721b5a, + 0xe7d525d4, 0x41a22e60, 0x704a34fd, 0xd63d3f49, + 0xcc1d9f8b, 0x6a6a943f, 0x5b828ea2, 0xfdf58516, + 0x3852bb98, 0x9e25b02c, 0xafcdaab1, 0x09baa105, + 0xfff2d1ec, 0x5985da58, 0x686dc0c5, 0xce1acb71, + 0x0bbdf5ff, 0xadcafe4b, 0x9c22e4d6, 0x3a55ef62, + 0xabc30345, 0x0db408f1, 0x3c5c126c, 0x9a2b19d8, + 0x5f8c2756, 0xf9fb2ce2, 0xc813367f, 0x6e643dcb, + 0x982c4d22, 0x3e5b4696, 0x0fb35c0b, 0xa9c457bf, + 0x6c636931, 0xca146285, 0xfbfc7818, 0x5d8b73ac, + 0x03a0a617, 0xa5d7ada3, 0x943fb73e, 0x3248bc8a, + 0xf7ef8204, 0x519889b0, 0x6070932d, 0xc6079899, + 0x304fe870, 0x9638e3c4, 0xa7d0f959, 0x01a7f2ed, + 0xc400cc63, 0x6277c7d7, 0x539fdd4a, 0xf5e8d6fe, + 0x647e3ad9, 0xc209316d, 0xf3e12bf0, 0x55962044, + 0x90311eca, 0x3646157e, 0x07ae0fe3, 0xa1d90457, + 0x579174be, 0xf1e67f0a, 0xc00e6597, 0x66796e23, + 0xa3de50ad, 0x05a95b19, 0x34414184, 0x92364a30, + 0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, + 0x844a0efa, 0x48e00e64, 0xc66f0987, 0x0ac50919, + 0xd3e51bb5, 0x1f4f1b2b, 0x91c01cc8, 0x5d6a1c56, + 0x57af154f, 0x9b0515d1, 
0x158a1232, 0xd92012ac, + 0x7cbb312b, 0xb01131b5, 0x3e9e3656, 0xf23436c8, + 0xf8f13fd1, 0x345b3f4f, 0xbad438ac, 0x767e3832, + 0xaf5e2a9e, 0x63f42a00, 0xed7b2de3, 0x21d12d7d, + 0x2b142464, 0xe7be24fa, 0x69312319, 0xa59b2387, + 0xf9766256, 0x35dc62c8, 0xbb53652b, 0x77f965b5, + 0x7d3c6cac, 0xb1966c32, 0x3f196bd1, 0xf3b36b4f, + 0x2a9379e3, 0xe639797d, 0x68b67e9e, 0xa41c7e00, + 0xaed97719, 0x62737787, 0xecfc7064, 0x205670fa, + 0x85cd537d, 0x496753e3, 0xc7e85400, 0x0b42549e, + 0x01875d87, 0xcd2d5d19, 0x43a25afa, 0x8f085a64, + 0x562848c8, 0x9a824856, 0x140d4fb5, 0xd8a74f2b, + 0xd2624632, 0x1ec846ac, 0x9047414f, 0x5ced41d1, + 0x299dc2ed, 0xe537c273, 0x6bb8c590, 0xa712c50e, + 0xadd7cc17, 0x617dcc89, 0xeff2cb6a, 0x2358cbf4, + 0xfa78d958, 0x36d2d9c6, 0xb85dde25, 0x74f7debb, + 0x7e32d7a2, 0xb298d73c, 0x3c17d0df, 0xf0bdd041, + 0x5526f3c6, 0x998cf358, 0x1703f4bb, 0xdba9f425, + 0xd16cfd3c, 0x1dc6fda2, 0x9349fa41, 0x5fe3fadf, + 0x86c3e873, 0x4a69e8ed, 0xc4e6ef0e, 0x084cef90, + 0x0289e689, 0xce23e617, 0x40ace1f4, 0x8c06e16a, + 0xd0eba0bb, 0x1c41a025, 0x92cea7c6, 0x5e64a758, + 0x54a1ae41, 0x980baedf, 0x1684a93c, 0xda2ea9a2, + 0x030ebb0e, 0xcfa4bb90, 0x412bbc73, 0x8d81bced, + 0x8744b5f4, 0x4beeb56a, 0xc561b289, 0x09cbb217, + 0xac509190, 0x60fa910e, 0xee7596ed, 0x22df9673, + 0x281a9f6a, 0xe4b09ff4, 0x6a3f9817, 0xa6959889, + 0x7fb58a25, 0xb31f8abb, 0x3d908d58, 0xf13a8dc6, + 0xfbff84df, 0x37558441, 0xb9da83a2, 0x7570833c, + 0x533b85da, 0x9f918544, 0x111e82a7, 0xddb48239, + 0xd7718b20, 0x1bdb8bbe, 0x95548c5d, 0x59fe8cc3, + 0x80de9e6f, 0x4c749ef1, 0xc2fb9912, 0x0e51998c, + 0x04949095, 0xc83e900b, 0x46b197e8, 0x8a1b9776, + 0x2f80b4f1, 0xe32ab46f, 0x6da5b38c, 0xa10fb312, + 0xabcaba0b, 0x6760ba95, 0xe9efbd76, 0x2545bde8, + 0xfc65af44, 0x30cfafda, 0xbe40a839, 0x72eaa8a7, + 0x782fa1be, 0xb485a120, 0x3a0aa6c3, 0xf6a0a65d, + 0xaa4de78c, 0x66e7e712, 0xe868e0f1, 0x24c2e06f, + 0x2e07e976, 0xe2ade9e8, 0x6c22ee0b, 0xa088ee95, + 0x79a8fc39, 0xb502fca7, 0x3b8dfb44, 0xf727fbda, + 0xfde2f2c3, 0x3148f25d, 0xbfc7f5be, 0x736df520, + 0xd6f6d6a7, 0x1a5cd639, 0x94d3d1da, 0x5879d144, + 0x52bcd85d, 0x9e16d8c3, 0x1099df20, 0xdc33dfbe, + 0x0513cd12, 0xc9b9cd8c, 0x4736ca6f, 0x8b9ccaf1, + 0x8159c3e8, 0x4df3c376, 0xc37cc495, 0x0fd6c40b, + 0x7aa64737, 0xb60c47a9, 0x3883404a, 0xf42940d4, + 0xfeec49cd, 0x32464953, 0xbcc94eb0, 0x70634e2e, + 0xa9435c82, 0x65e95c1c, 0xeb665bff, 0x27cc5b61, + 0x2d095278, 0xe1a352e6, 0x6f2c5505, 0xa386559b, + 0x061d761c, 0xcab77682, 0x44387161, 0x889271ff, + 0x825778e6, 0x4efd7878, 0xc0727f9b, 0x0cd87f05, + 0xd5f86da9, 0x19526d37, 0x97dd6ad4, 0x5b776a4a, + 0x51b26353, 0x9d1863cd, 0x1397642e, 0xdf3d64b0, + 0x83d02561, 0x4f7a25ff, 0xc1f5221c, 0x0d5f2282, + 0x079a2b9b, 0xcb302b05, 0x45bf2ce6, 0x89152c78, + 0x50353ed4, 0x9c9f3e4a, 0x121039a9, 0xdeba3937, + 0xd47f302e, 0x18d530b0, 0x965a3753, 0x5af037cd, + 0xff6b144a, 0x33c114d4, 0xbd4e1337, 0x71e413a9, + 0x7b211ab0, 0xb78b1a2e, 0x39041dcd, 0xf5ae1d53, + 0x2c8e0fff, 0xe0240f61, 0x6eab0882, 0xa201081c, + 0xa8c40105, 0x646e019b, 0xeae10678, 0x264b06e6, }; diff --git a/Sources/DEFLATE/decompress_template.h b/Sources/DEFLATE/decompress_template.h index 3c1da677..3344323d 100644 --- a/Sources/DEFLATE/decompress_template.h +++ b/Sources/DEFLATE/decompress_template.h @@ -35,740 +35,740 @@ # define ATTRIBUTES #endif #ifndef EXTRACT_VARBITS -# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) +# define EXTRACT_VARBITS(word, count) ((word) & BITMASK(count)) #endif #ifndef EXTRACT_VARBITS8 -# define EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) +# define 
EXTRACT_VARBITS8(word, count) ((word) & BITMASK((u8)(count))) #endif static enum libdeflate_result ATTRIBUTES MAYBE_UNUSED FUNCNAME(struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) { - u8 *out_next = out; - u8 * const out_end = out_next + out_nbytes_avail; - u8 * const out_fastloop_end = - out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); - - /* Input bitstream state; see deflate_decompress.c for documentation */ - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - const u8 * const in_fastloop_end = - in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); - bitbuf_t bitbuf = 0; - bitbuf_t saved_bitbuf; - u32 bitsleft = 0; - size_t overread_count = 0; - - bool is_final_block; - unsigned block_type; - unsigned num_litlen_syms; - unsigned num_offset_syms; - bitbuf_t litlen_tablemask; - u32 entry; - + u8 *out_next = out; + u8 * const out_end = out_next + out_nbytes_avail; + u8 * const out_fastloop_end = + out_end - MIN(out_nbytes_avail, FASTLOOP_MAX_BYTES_WRITTEN); + + /* Input bitstream state; see deflate_decompress.c for documentation */ + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + const u8 * const in_fastloop_end = + in_end - MIN(in_nbytes, FASTLOOP_MAX_BYTES_READ); + bitbuf_t bitbuf = 0; + bitbuf_t saved_bitbuf; + u32 bitsleft = 0; + size_t overread_count = 0; + + bool is_final_block; + unsigned block_type; + unsigned num_litlen_syms; + unsigned num_offset_syms; + bitbuf_t litlen_tablemask; + u32 entry; + next_block: - /* Starting to read the next block */ - ; - - STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); - REFILL_BITS(); - - /* BFINAL: 1 bit */ - is_final_block = bitbuf & BITMASK(1); - - /* BTYPE: 2 bits */ - block_type = (bitbuf >> 1) & BITMASK(2); - - if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { - - /* Dynamic Huffman block */ - - /* The order in which precode lengths are stored */ - static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 - }; - - unsigned num_explicit_precode_lens; - unsigned i; - - /* Read the codeword length counts. */ - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); - num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); - - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); - num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); - - STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); - num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); - - d->static_codes_loaded = false; - - /* - * Read the precode codeword lengths. - * - * A 64-bit bitbuffer is just one bit too small to hold the - * maximum number of precode lens, so to minimize branches we - * merge one len with the previous fields. 
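
As an aside on the header fields read above: after BFINAL (1 bit) and BTYPE (2 bits), a dynamic block stores HLIT (5 bits, 257 + value litlen codes), HDIST (5 bits, 1 + value offset codes) and HCLEN (4 bits, 4 + value explicit precode lengths), followed by HCLEN 3-bit precode lengths in the permuted order of deflate_precode_lens_permutation. A minimal, unoptimized sketch of the same parse, assuming a hypothetical LSB-first bit reader (read_bits) rather than the REFILL_BITS()/bitbuf machinery used in this template:

#include <stddef.h>
#include <stdint.h>

/* Hypothetical LSB-first bit reader over a byte buffer. */
struct bitreader {
	const uint8_t *p;
	size_t bitpos;
};

static unsigned read_bits(struct bitreader *br, unsigned n)
{
	unsigned v = 0;

	for (unsigned i = 0; i < n; i++, br->bitpos++)
		v |= ((br->p[br->bitpos >> 3] >> (br->bitpos & 7)) & 1u) << i;
	return v;
}

/* Same storage order as deflate_precode_lens_permutation above. */
static const uint8_t precode_order[19] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15
};

/*
 * Read the dynamic-block header fields, assuming the reader is already
 * positioned just past BFINAL and BTYPE.
 */
static void read_dynamic_header(struct bitreader *br,
				unsigned *num_litlen, unsigned *num_offset,
				uint8_t precode_lens[19])
{
	unsigned hclen, i;

	*num_litlen = 257 + read_bits(br, 5);	/* HLIT */
	*num_offset = 1 + read_bits(br, 5);	/* HDIST */
	hclen = 4 + read_bits(br, 4);		/* HCLEN */

	for (i = 0; i < 19; i++)
		precode_lens[i] = 0;
	for (i = 0; i < hclen; i++)
		precode_lens[precode_order[i]] = read_bits(br, 3);
}

Note that the 1 + 2 + 5 + 5 + 4 + 3 operand of STATIC_ASSERT(CAN_CONSUME(...)) above is exactly these header bits plus the first 3-bit precode length, which is why the optimized path can shift the bit buffer by 20 in a single step.
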
- */ - STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); - if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { - d->u.precode_lens[deflate_precode_lens_permutation[0]] = - (bitbuf >> 17) & BITMASK(3); - bitbuf >>= 20; - bitsleft -= 20; - REFILL_BITS(); - i = 1; - do { - d->u.precode_lens[deflate_precode_lens_permutation[i]] = - bitbuf & BITMASK(3); - bitbuf >>= 3; - bitsleft -= 3; - } while (++i < num_explicit_precode_lens); - } else { - bitbuf >>= 17; - bitsleft -= 17; - i = 0; - do { - if ((u8)bitsleft < 3) - REFILL_BITS(); - d->u.precode_lens[deflate_precode_lens_permutation[i]] = - bitbuf & BITMASK(3); - bitbuf >>= 3; - bitsleft -= 3; - } while (++i < num_explicit_precode_lens); - } - for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) - d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; - - /* Build the decode table for the precode. */ - SAFETY_CHECK(build_precode_decode_table(d)); - - /* Decode the litlen and offset codeword lengths. */ - i = 0; - do { - unsigned presym; - u8 rep_val; - unsigned rep_count; - - if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) - REFILL_BITS(); - - /* - * The code below assumes that the precode decode table - * doesn't have any subtables. - */ - STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); - - /* Decode the next precode symbol. */ - entry = d->u.l.precode_decode_table[ - bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; - bitbuf >>= (u8)entry; - bitsleft -= entry; /* optimization: subtract full entry */ - presym = entry >> 16; - - if (presym < 16) { - /* Explicit codeword length */ - d->u.l.lens[i++] = presym; - continue; - } - - /* Run-length encoded codeword lengths */ - - /* - * Note: we don't need to immediately verify that the - * repeat count doesn't overflow the number of elements, - * since we've sized the lens array to have enough extra - * space to allow for the worst-case overrun (138 zeroes - * when only 1 length was remaining). - * - * In the case of the small repeat counts (presyms 16 - * and 17), it is fastest to always write the maximum - * number of entries. That gets rid of branches that - * would otherwise be required. - * - * It is not just because of the numerical order that - * our checks go in the order 'presym < 16', 'presym == - * 16', and 'presym == 17'. For typical data this is - * ordered from most frequent to least frequent case. - */ - STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); - - if (presym == 16) { - /* Repeat the previous length 3 - 6 times. */ - SAFETY_CHECK(i != 0); - rep_val = d->u.l.lens[i - 1]; - STATIC_ASSERT(3 + BITMASK(2) == 6); - rep_count = 3 + (bitbuf & BITMASK(2)); - bitbuf >>= 2; - bitsleft -= 2; - d->u.l.lens[i + 0] = rep_val; - d->u.l.lens[i + 1] = rep_val; - d->u.l.lens[i + 2] = rep_val; - d->u.l.lens[i + 3] = rep_val; - d->u.l.lens[i + 4] = rep_val; - d->u.l.lens[i + 5] = rep_val; - i += rep_count; - } else if (presym == 17) { - /* Repeat zero 3 - 10 times. */ - STATIC_ASSERT(3 + BITMASK(3) == 10); - rep_count = 3 + (bitbuf & BITMASK(3)); - bitbuf >>= 3; - bitsleft -= 3; - d->u.l.lens[i + 0] = 0; - d->u.l.lens[i + 1] = 0; - d->u.l.lens[i + 2] = 0; - d->u.l.lens[i + 3] = 0; - d->u.l.lens[i + 4] = 0; - d->u.l.lens[i + 5] = 0; - d->u.l.lens[i + 6] = 0; - d->u.l.lens[i + 7] = 0; - d->u.l.lens[i + 8] = 0; - d->u.l.lens[i + 9] = 0; - i += rep_count; - } else { - /* Repeat zero 11 - 138 times. 
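
The presym handling above is DEFLATE's run-length coding of the litlen/offset code lengths: presyms 0-15 are literal lengths, 16 repeats the previous length 3-6 times (2 extra bits), 17 writes 3-10 zeroes (3 extra bits), and 18 writes 11-138 zeroes (7 extra bits). A plain, branch-per-case sketch of the same expansion, without the always-write-the-maximum optimization used above, reusing the hypothetical read_bits() reader from the previous sketch:

/*
 * Expand one decoded precode symbol into lens[] (the concatenated litlen +
 * offset code lengths, num_lens entries total).  Returns the new index,
 * or -1 on malformed input.
 */
static int expand_presym(unsigned presym, struct bitreader *br,
			 uint8_t *lens, int i, int num_lens)
{
	unsigned count;
	uint8_t val;

	if (presym < 16) {			/* explicit length 0..15 */
		if (i >= num_lens)
			return -1;
		lens[i++] = presym;
		return i;
	}
	if (presym == 16) {			/* repeat previous length 3..6 times */
		if (i == 0)
			return -1;
		val = lens[i - 1];
		count = 3 + read_bits(br, 2);
	} else if (presym == 17) {		/* repeat zero 3..10 times */
		val = 0;
		count = 3 + read_bits(br, 3);
	} else {				/* presym == 18: repeat zero 11..138 times */
		val = 0;
		count = 11 + read_bits(br, 7);
	}
	if (i + (int)count > num_lens)
		return -1;
	while (count--)
		lens[i++] = val;
	return i;
}
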
*/ - STATIC_ASSERT(11 + BITMASK(7) == 138); - rep_count = 11 + (bitbuf & BITMASK(7)); - bitbuf >>= 7; - bitsleft -= 7; - memset(&d->u.l.lens[i], 0, - rep_count * sizeof(d->u.l.lens[i])); - i += rep_count; - } - } while (i < num_litlen_syms + num_offset_syms); - - /* Unnecessary, but check this for consistency with zlib. */ - SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); - - } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { - u16 len, nlen; - - /* - * Uncompressed block: copy 'len' bytes literally from the input - * buffer to the output buffer. - */ - - bitsleft -= 3; /* for BTYPE and BFINAL */ - - /* - * Align the bitstream to the next byte boundary. This means - * the next byte boundary as if we were reading a byte at a - * time. Therefore, we have to rewind 'in_next' by any bytes - * that have been refilled but not actually consumed yet (not - * counting overread bytes, which don't increment 'in_next'). - */ - bitsleft = (u8)bitsleft; - SAFETY_CHECK(overread_count <= (bitsleft >> 3)); - in_next -= (bitsleft >> 3) - overread_count; - overread_count = 0; - bitbuf = 0; - bitsleft = 0; - - SAFETY_CHECK(in_end - in_next >= 4); - len = get_unaligned_le16(in_next); - nlen = get_unaligned_le16(in_next + 2); - in_next += 4; - - SAFETY_CHECK(len == (u16)~nlen); - if (unlikely(len > out_end - out_next)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - SAFETY_CHECK(len <= in_end - in_next); - - memcpy(out_next, in_next, len); - in_next += len; - out_next += len; - - goto block_done; - - } else { - unsigned i; - - SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); - - /* - * Static Huffman block: build the decode tables for the static - * codes. Skip doing so if the tables are already set up from - * an earlier static block; this speeds up decompression of - * degenerate input of many empty or very short static blocks. - * - * Afterwards, the remainder is the same as decompressing a - * dynamic Huffman block. - */ - - bitbuf >>= 3; /* for BTYPE and BFINAL */ - bitsleft -= 3; - - if (d->static_codes_loaded) - goto have_decode_tables; - - d->static_codes_loaded = true; - - STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); - STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); - - for (i = 0; i < 144; i++) - d->u.l.lens[i] = 8; - for (; i < 256; i++) - d->u.l.lens[i] = 9; - for (; i < 280; i++) - d->u.l.lens[i] = 7; - for (; i < 288; i++) - d->u.l.lens[i] = 8; - - for (; i < 288 + 32; i++) - d->u.l.lens[i] = 5; - - num_litlen_syms = 288; - num_offset_syms = 32; - } - - /* Decompressing a Huffman block (either dynamic or static) */ - - SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); - SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); + /* Starting to read the next block */ + ; + + STATIC_ASSERT(CAN_CONSUME(1 + 2 + 5 + 5 + 4 + 3)); + REFILL_BITS(); + + /* BFINAL: 1 bit */ + is_final_block = bitbuf & BITMASK(1); + + /* BTYPE: 2 bits */ + block_type = (bitbuf >> 1) & BITMASK(2); + + if (block_type == DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN) { + + /* Dynamic Huffman block */ + + /* The order in which precode lengths are stored */ + static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + }; + + unsigned num_explicit_precode_lens; + unsigned i; + + /* Read the codeword length counts. 
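
On the stored-block path above, LEN/NLEN is the only integrity check: NLEN must be the one's complement of LEN. A small standalone sketch of that header check, with le16() standing in for the template's get_unaligned_le16():

#include <stdbool.h>
#include <stdint.h>

/* Read a little-endian 16-bit value (stand-in for get_unaligned_le16). */
static uint16_t le16(const uint8_t *p)
{
	return (uint16_t)(p[0] | ((uint16_t)p[1] << 8));
}

/*
 * Validate a stored-block header and return the payload length, or return
 * false if NLEN is not the one's complement of LEN.
 */
static bool parse_stored_header(const uint8_t *hdr, uint16_t *len_out)
{
	uint16_t len = le16(hdr);
	uint16_t nlen = le16(hdr + 2);

	if (len != (uint16_t)~nlen)
		return false;
	*len_out = len;
	return true;
}
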
*/ + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 257 + BITMASK(5)); + num_litlen_syms = 257 + ((bitbuf >> 3) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 1 + BITMASK(5)); + num_offset_syms = 1 + ((bitbuf >> 8) & BITMASK(5)); + + STATIC_ASSERT(DEFLATE_NUM_PRECODE_SYMS == 4 + BITMASK(4)); + num_explicit_precode_lens = 4 + ((bitbuf >> 13) & BITMASK(4)); + + d->static_codes_loaded = false; + + /* + * Read the precode codeword lengths. + * + * A 64-bit bitbuffer is just one bit too small to hold the + * maximum number of precode lens, so to minimize branches we + * merge one len with the previous fields. + */ + STATIC_ASSERT(DEFLATE_MAX_PRE_CODEWORD_LEN == (1 << 3) - 1); + if (CAN_CONSUME(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + d->u.precode_lens[deflate_precode_lens_permutation[0]] = + (bitbuf >> 17) & BITMASK(3); + bitbuf >>= 20; + bitsleft -= 20; + REFILL_BITS(); + i = 1; + do { + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } else { + bitbuf >>= 17; + bitsleft -= 17; + i = 0; + do { + if ((u8)bitsleft < 3) + REFILL_BITS(); + d->u.precode_lens[deflate_precode_lens_permutation[i]] = + bitbuf & BITMASK(3); + bitbuf >>= 3; + bitsleft -= 3; + } while (++i < num_explicit_precode_lens); + } + for (; i < DEFLATE_NUM_PRECODE_SYMS; i++) + d->u.precode_lens[deflate_precode_lens_permutation[i]] = 0; + + /* Build the decode table for the precode. */ + SAFETY_CHECK(build_precode_decode_table(d)); + + /* Decode the litlen and offset codeword lengths. */ + i = 0; + do { + unsigned presym; + u8 rep_val; + unsigned rep_count; + + if ((u8)bitsleft < DEFLATE_MAX_PRE_CODEWORD_LEN + 7) + REFILL_BITS(); + + /* + * The code below assumes that the precode decode table + * doesn't have any subtables. + */ + STATIC_ASSERT(PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN); + + /* Decode the next precode symbol. */ + entry = d->u.l.precode_decode_table[ + bitbuf & BITMASK(DEFLATE_MAX_PRE_CODEWORD_LEN)]; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + presym = entry >> 16; + + if (presym < 16) { + /* Explicit codeword length */ + d->u.l.lens[i++] = presym; + continue; + } + + /* Run-length encoded codeword lengths */ + + /* + * Note: we don't need to immediately verify that the + * repeat count doesn't overflow the number of elements, + * since we've sized the lens array to have enough extra + * space to allow for the worst-case overrun (138 zeroes + * when only 1 length was remaining). + * + * In the case of the small repeat counts (presyms 16 + * and 17), it is fastest to always write the maximum + * number of entries. That gets rid of branches that + * would otherwise be required. + * + * It is not just because of the numerical order that + * our checks go in the order 'presym < 16', 'presym == + * 16', and 'presym == 17'. For typical data this is + * ordered from most frequent to least frequent case. + */ + STATIC_ASSERT(DEFLATE_MAX_LENS_OVERRUN == 138 - 1); + + if (presym == 16) { + /* Repeat the previous length 3 - 6 times. 
*/ + SAFETY_CHECK(i != 0); + rep_val = d->u.l.lens[i - 1]; + STATIC_ASSERT(3 + BITMASK(2) == 6); + rep_count = 3 + (bitbuf & BITMASK(2)); + bitbuf >>= 2; + bitsleft -= 2; + d->u.l.lens[i + 0] = rep_val; + d->u.l.lens[i + 1] = rep_val; + d->u.l.lens[i + 2] = rep_val; + d->u.l.lens[i + 3] = rep_val; + d->u.l.lens[i + 4] = rep_val; + d->u.l.lens[i + 5] = rep_val; + i += rep_count; + } else if (presym == 17) { + /* Repeat zero 3 - 10 times. */ + STATIC_ASSERT(3 + BITMASK(3) == 10); + rep_count = 3 + (bitbuf & BITMASK(3)); + bitbuf >>= 3; + bitsleft -= 3; + d->u.l.lens[i + 0] = 0; + d->u.l.lens[i + 1] = 0; + d->u.l.lens[i + 2] = 0; + d->u.l.lens[i + 3] = 0; + d->u.l.lens[i + 4] = 0; + d->u.l.lens[i + 5] = 0; + d->u.l.lens[i + 6] = 0; + d->u.l.lens[i + 7] = 0; + d->u.l.lens[i + 8] = 0; + d->u.l.lens[i + 9] = 0; + i += rep_count; + } else { + /* Repeat zero 11 - 138 times. */ + STATIC_ASSERT(11 + BITMASK(7) == 138); + rep_count = 11 + (bitbuf & BITMASK(7)); + bitbuf >>= 7; + bitsleft -= 7; + memset(&d->u.l.lens[i], 0, + rep_count * sizeof(d->u.l.lens[i])); + i += rep_count; + } + } while (i < num_litlen_syms + num_offset_syms); + + /* Unnecessary, but check this for consistency with zlib. */ + SAFETY_CHECK(i == num_litlen_syms + num_offset_syms); + + } else if (block_type == DEFLATE_BLOCKTYPE_UNCOMPRESSED) { + u16 len, nlen; + + /* + * Uncompressed block: copy 'len' bytes literally from the input + * buffer to the output buffer. + */ + + bitsleft -= 3; /* for BTYPE and BFINAL */ + + /* + * Align the bitstream to the next byte boundary. This means + * the next byte boundary as if we were reading a byte at a + * time. Therefore, we have to rewind 'in_next' by any bytes + * that have been refilled but not actually consumed yet (not + * counting overread bytes, which don't increment 'in_next'). + */ + bitsleft = (u8)bitsleft; + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + in_next -= (bitsleft >> 3) - overread_count; + overread_count = 0; + bitbuf = 0; + bitsleft = 0; + + SAFETY_CHECK(in_end - in_next >= 4); + len = get_unaligned_le16(in_next); + nlen = get_unaligned_le16(in_next + 2); + in_next += 4; + + SAFETY_CHECK(len == (u16)~nlen); + if (unlikely(len > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + SAFETY_CHECK(len <= in_end - in_next); + + memcpy(out_next, in_next, len); + in_next += len; + out_next += len; + + goto block_done; + + } else { + unsigned i; + + SAFETY_CHECK(block_type == DEFLATE_BLOCKTYPE_STATIC_HUFFMAN); + + /* + * Static Huffman block: build the decode tables for the static + * codes. Skip doing so if the tables are already set up from + * an earlier static block; this speeds up decompression of + * degenerate input of many empty or very short static blocks. + * + * Afterwards, the remainder is the same as decompressing a + * dynamic Huffman block. 
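
A quick sanity check on the fixed code lengths installed for static blocks here (144 litlen symbols of length 8, 112 of length 9, 24 of length 7, 8 of length 8, plus 32 offset symbols of length 5): they are Kraft-complete, i.e. the lengths fill the code space exactly, which is what lets the canonical table construction succeed. A tiny sketch that verifies this with integer arithmetic:

#include <assert.h>

/*
 * Kraft completeness check for the RFC 1951 fixed codes, scaled by 2^9
 * (litlen) and 2^5 (offset) so everything stays in integers.
 */
int main(void)
{
	int litlen = 144 * (1 << (9 - 8)) +	/* symbols   0-143: length 8 */
		     112 * (1 << (9 - 9)) +	/* symbols 144-255: length 9 */
		      24 * (1 << (9 - 7)) +	/* symbols 256-279: length 7 */
		       8 * (1 << (9 - 8));	/* symbols 280-287: length 8 */
	int offset = 32 * (1 << (5 - 5));	/* 32 offset symbols of length 5 */

	assert(litlen == 1 << 9);	/* 288 + 112 + 96 + 16 == 512 */
	assert(offset == 1 << 5);
	return 0;
}
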
+ */ + + bitbuf >>= 3; /* for BTYPE and BFINAL */ + bitsleft -= 3; + + if (d->static_codes_loaded) + goto have_decode_tables; + + d->static_codes_loaded = true; + + STATIC_ASSERT(DEFLATE_NUM_LITLEN_SYMS == 288); + STATIC_ASSERT(DEFLATE_NUM_OFFSET_SYMS == 32); + + for (i = 0; i < 144; i++) + d->u.l.lens[i] = 8; + for (; i < 256; i++) + d->u.l.lens[i] = 9; + for (; i < 280; i++) + d->u.l.lens[i] = 7; + for (; i < 288; i++) + d->u.l.lens[i] = 8; + + for (; i < 288 + 32; i++) + d->u.l.lens[i] = 5; + + num_litlen_syms = 288; + num_offset_syms = 32; + } + + /* Decompressing a Huffman block (either dynamic or static) */ + + SAFETY_CHECK(build_offset_decode_table(d, num_litlen_syms, num_offset_syms)); + SAFETY_CHECK(build_litlen_decode_table(d, num_litlen_syms, num_offset_syms)); have_decode_tables: - litlen_tablemask = BITMASK(d->litlen_tablebits); - - /* - * This is the "fastloop" for decoding literals and matches. It does - * bounds checks on in_next and out_next in the loop conditions so that - * additional bounds checks aren't needed inside the loop body. - * - * To reduce latency, the bitbuffer is refilled and the next litlen - * decode table entry is preloaded before each loop iteration. - */ - if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) - goto generic_loop; - REFILL_BITS_IN_FASTLOOP(); - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - do { - u32 length, offset, lit; - const u8 *src; - u8 *dst; - - /* - * Consume the bits for the litlen decode table entry. Save the - * original bitbuf for later, in case the extra match length - * bits need to be extracted from it. - */ - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; /* optimization: subtract full entry */ - - /* - * Begin by checking for a "fast" literal, i.e. a literal that - * doesn't need a subtable. - */ - if (entry & HUFFDEC_LITERAL) { - /* - * On 64-bit platforms, we decode up to 2 extra fast - * literals in addition to the primary item, as this - * increases performance and still leaves enough bits - * remaining for what follows. We could actually do 3, - * assuming LITLEN_TABLEBITS=11, but that actually - * decreases performance slightly (perhaps by messing - * with the branch prediction of the conditional refill - * that happens later while decoding the match offset). - * - * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN - * and FASTLOOP_MAX_BYTES_READ need to be updated if the - * number of extra literals decoded here is changed. - */ - if (/* enough bits for 2 fast literals + length + offset preload? */ - CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + - LENGTH_MAXBITS, - OFFSET_TABLEBITS) && - /* enough bits for 2 fast literals + slow literal + litlen preload? */ - CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + - DEFLATE_MAX_LITLEN_CODEWORD_LEN, - LITLEN_TABLEBITS)) { - /* 1st extra fast literal */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - *out_next++ = lit; - if (entry & HUFFDEC_LITERAL) { - /* 2nd extra fast literal */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - *out_next++ = lit; - if (entry & HUFFDEC_LITERAL) { - /* - * Another fast literal, but - * this one is in lieu of the - * primary item, so it doesn't - * count as one of the extras. 
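
The preloaded table lookups in the fastloop above follow the usual table-driven Huffman pattern: index a table with the low litlen_tablebits bits of the bit buffer and let the entry report both the decoded item and how many bits to consume. libdeflate packs flags (HUFFDEC_LITERAL, HUFFDEC_EXCEPTIONAL, ...) and pre-baked extra-bit information into its entries; the sketch below deliberately uses a much simpler two-field entry (symbol << 16 | bits) just to show the shape of one decode step:

#include <stdint.h>

#define TABLEBITS 11	/* hypothetical main-table size: 2^11 entries */

/*
 * Simplified decoder: each table entry holds the symbol in the high 16 bits
 * and the number of bits to consume in the low 8 bits.  Codewords longer
 * than TABLEBITS would need the subtable indirection handled elsewhere in
 * this template; that case is omitted here.
 */
struct simple_decoder {
	uint32_t table[1u << TABLEBITS];
	uint64_t bitbuf;	/* next input bits, LSB first */
	unsigned bitsleft;
};

/* Decode one symbol, assuming bitsleft covers the longest codeword. */
static unsigned decode_one(struct simple_decoder *d)
{
	uint32_t entry = d->table[d->bitbuf & ((1u << TABLEBITS) - 1)];
	unsigned len = entry & 0xFF;

	d->bitbuf >>= len;
	d->bitsleft -= len;
	return entry >> 16;
}
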
- */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - *out_next++ = lit; - continue; - } - } - } else { - /* - * Decode a literal. While doing so, preload - * the next litlen decode table entry and refill - * the bitbuffer. To reduce latency, we've - * arranged for there to be enough "preloadable" - * bits remaining to do the table preload - * independently of the refill. - */ - STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( - LITLEN_TABLEBITS, LITLEN_TABLEBITS)); - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - *out_next++ = lit; - continue; - } - } - - /* - * It's not a literal entry, so it can be a length entry, a - * subtable pointer entry, or an end-of-block entry. Detect the - * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. - */ - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - /* Subtable pointer or end-of-block entry */ - - if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) - goto block_done; - - /* - * A subtable is required. Load and consume the - * subtable entry. The subtable entry can be of any - * type: literal, length, or end-of-block. - */ - entry = d->u.litlen_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - - /* - * 32-bit platforms that use the byte-at-a-time refill - * method have to do a refill here for there to always - * be enough bits to decode a literal that requires a - * subtable, then preload the next litlen decode table - * entry; or to decode a match length that requires a - * subtable, then preload the offset decode table entry. - */ - if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, - LITLEN_TABLEBITS) || - !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, - OFFSET_TABLEBITS)) - REFILL_BITS_IN_FASTLOOP(); - if (entry & HUFFDEC_LITERAL) { - /* Decode a literal that required a subtable. */ - lit = entry >> 16; - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - *out_next++ = lit; - continue; - } - if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) - goto block_done; - /* Else, it's a length that required a subtable. */ - } - - /* - * Decode the match length: the length base value associated - * with the litlen symbol (which we extract from the decode - * table entry), plus the extra length bits. We don't need to - * consume the extra length bits here, as they were included in - * the bits consumed by the entry earlier. We also don't need - * to check for too-long matches here, as this is inside the - * fastloop where it's already been verified that the output - * buffer has enough space remaining to copy a max-length match. - */ - length = entry >> 16; - length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); - - /* - * Decode the match offset. There are enough "preloadable" bits - * remaining to preload the offset decode table entry, but a - * refill might be needed before consuming it. - */ - STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, - OFFSET_TABLEBITS)); - entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; - if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, - LITLEN_TABLEBITS)) { - /* - * Decoding a match offset on a 64-bit platform. We may - * need to refill once, but then we can decode the whole - * offset and preload the next litlen table entry. 
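
The "base value plus extra bits" step above is the standard DEFLATE encoding of match lengths (3-258) and offsets (1-32768); the corresponding base/extra tables appear verbatim further down in this diff in deflate_compress.c. A direct, unoptimized rendering of that step, again on top of the hypothetical read_bits() reader from the earlier sketch:

/* Length slot -> base value and extra-bit count (RFC 1951, lengths 3..258). */
static const unsigned length_base[29] = {
	3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
	35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258
};
static const unsigned length_extra[29] = {
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
	3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0
};

/* Offset slot -> base value and extra-bit count (offsets 1..32768). */
static const unsigned offset_base[30] = {
	1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
	257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
	8193, 12289, 16385, 24577
};
static const unsigned offset_extra[30] = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
	7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13
};

/* Given a decoded slot, recover the actual match length or offset. */
static unsigned decode_length(struct bitreader *br, unsigned slot)
{
	return length_base[slot] + read_bits(br, length_extra[slot]);
}

static unsigned decode_offset(struct bitreader *br, unsigned slot)
{
	return offset_base[slot] + read_bits(br, offset_extra[slot]);
}
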
- */ - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - /* Offset codeword requires a subtable */ - if (unlikely((u8)bitsleft < OFFSET_MAXBITS + - LITLEN_TABLEBITS - PRELOAD_SLACK)) - REFILL_BITS_IN_FASTLOOP(); - bitbuf >>= OFFSET_TABLEBITS; - bitsleft -= OFFSET_TABLEBITS; - entry = d->offset_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + - LITLEN_TABLEBITS - PRELOAD_SLACK)) - REFILL_BITS_IN_FASTLOOP(); - } else { - /* Decoding a match offset on a 32-bit platform */ - REFILL_BITS_IN_FASTLOOP(); - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - /* Offset codeword requires a subtable */ - bitbuf >>= OFFSET_TABLEBITS; - bitsleft -= OFFSET_TABLEBITS; - entry = d->offset_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - REFILL_BITS_IN_FASTLOOP(); - /* No further refill needed before extra bits */ - STATIC_ASSERT(CAN_CONSUME( - OFFSET_MAXBITS - OFFSET_TABLEBITS)); - } else { - /* No refill needed before extra bits */ - STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); - } - } - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; /* optimization: subtract full entry */ - offset = entry >> 16; - offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); - - /* Validate the match offset; needed even in the fastloop. */ - SAFETY_CHECK(offset <= out_next - (const u8 *)out); - src = out_next - offset; - dst = out_next; - out_next += length; - - /* - * Before starting to issue the instructions to copy the match, - * refill the bitbuffer and preload the litlen decode table - * entry for the next loop iteration. This can increase - * performance by allowing the latency of the match copy to - * overlap with these other operations. To further reduce - * latency, we've arranged for there to be enough bits remaining - * to do the table preload independently of the refill, except - * on 32-bit platforms using the byte-at-a-time refill method. - */ - if (!CAN_CONSUME_AND_THEN_PRELOAD( - MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, - OFFSET_MAXFASTBITS), - LITLEN_TABLEBITS) && - unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) - REFILL_BITS_IN_FASTLOOP(); - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - REFILL_BITS_IN_FASTLOOP(); - - /* - * Copy the match. On most CPUs the fastest method is a - * word-at-a-time copy, unconditionally copying about 5 words - * since this is enough for most matches without being too much. - * - * The normal word-at-a-time copy works for offset >= WORDBYTES, - * which is most cases. The case of offset == 1 is also common - * and is worth optimizing for, since it is just RLE encoding of - * the previous byte, which is the result of compressing long - * runs of the same byte. - * - * Writing past the match 'length' is allowed here, since it's - * been ensured there is enough output space left for a slight - * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if - * the maximum possible overrun here is changed. 
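
One detail worth keeping in mind for the copy code below: an LZ77 match may overlap its own output (offset < length), so a plain memcpy() is not generally correct here. The byte-at-a-time fallback is the semantically simplest form of the operation; copy_match() below is a hypothetical minimal version that captures just that invariant:

#include <stddef.h>
#include <stdint.h>

/*
 * Copy an LZ77 match of 'length' bytes starting 'offset' bytes back in the
 * already-written output.  Copying byte by byte, in order, is what makes the
 * overlapping case work: when offset == 1 this naturally replicates the
 * previous byte (run-length behaviour), and when offset < length the source
 * keeps reading bytes that this same copy has just produced.
 */
static uint8_t *copy_match(uint8_t *out_next, size_t offset, size_t length)
{
	const uint8_t *src = out_next - offset;

	while (length--)
		*out_next++ = *src++;
	return out_next;
}

The word-at-a-time branches that follow get the same result faster by deliberately over-copying in word-sized chunks, which is why FASTLOOP_MAX_BYTES_WRITTEN reserves slack at the end of the output buffer.
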
- */ - if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - while (dst < out_next) { - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - store_word_unaligned(load_word_unaligned(src), dst); - src += WORDBYTES; - dst += WORDBYTES; - } - } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { - machine_word_t v; - - /* - * This part tends to get auto-vectorized, so keep it - * copying a multiple of 16 bytes at a time. - */ - v = (machine_word_t)0x0101010101010101 * src[0]; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - while (dst < out_next) { - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - store_word_unaligned(v, dst); - dst += WORDBYTES; - } - } else if (UNALIGNED_ACCESS_IS_FAST) { - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - do { - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - store_word_unaligned(load_word_unaligned(src), dst); - src += offset; - dst += offset; - } while (dst < out_next); - } else { - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < out_next); - } - } while (in_next < in_fastloop_end && out_next < out_fastloop_end); - - /* - * This is the generic loop for decoding literals and matches. This - * handles cases where in_next and out_next are close to the end of - * their respective buffers. Usually this loop isn't performance- - * critical, as most time is spent in the fastloop above instead. We - * therefore omit some optimizations here in favor of smaller code. - */ + litlen_tablemask = BITMASK(d->litlen_tablebits); + + /* + * This is the "fastloop" for decoding literals and matches. It does + * bounds checks on in_next and out_next in the loop conditions so that + * additional bounds checks aren't needed inside the loop body. + * + * To reduce latency, the bitbuffer is refilled and the next litlen + * decode table entry is preloaded before each loop iteration. + */ + if (in_next >= in_fastloop_end || out_next >= out_fastloop_end) + goto generic_loop; + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + do { + u32 length, offset, lit; + const u8 *src; + u8 *dst; + + /* + * Consume the bits for the litlen decode table entry. Save the + * original bitbuf for later, in case the extra match length + * bits need to be extracted from it. 
+ */ + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + + /* + * Begin by checking for a "fast" literal, i.e. a literal that + * doesn't need a subtable. + */ + if (entry & HUFFDEC_LITERAL) { + /* + * On 64-bit platforms, we decode up to 2 extra fast + * literals in addition to the primary item, as this + * increases performance and still leaves enough bits + * remaining for what follows. We could actually do 3, + * assuming LITLEN_TABLEBITS=11, but that actually + * decreases performance slightly (perhaps by messing + * with the branch prediction of the conditional refill + * that happens later while decoding the match offset). + * + * Note: the definitions of FASTLOOP_MAX_BYTES_WRITTEN + * and FASTLOOP_MAX_BYTES_READ need to be updated if the + * number of extra literals decoded here is changed. + */ + if (/* enough bits for 2 fast literals + length + offset preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + LENGTH_MAXBITS, + OFFSET_TABLEBITS) && + /* enough bits for 2 fast literals + slow literal + litlen preload? */ + CAN_CONSUME_AND_THEN_PRELOAD(2 * LITLEN_TABLEBITS + + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS)) { + /* 1st extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* 2nd extra fast literal */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + *out_next++ = lit; + if (entry & HUFFDEC_LITERAL) { + /* + * Another fast literal, but + * this one is in lieu of the + * primary item, so it doesn't + * count as one of the extras. + */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + } else { + /* + * Decode a literal. While doing so, preload + * the next litlen decode table entry and refill + * the bitbuffer. To reduce latency, we've + * arranged for there to be enough "preloadable" + * bits remaining to do the table preload + * independently of the refill. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD( + LITLEN_TABLEBITS, LITLEN_TABLEBITS)); + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + } + + /* + * It's not a literal entry, so it can be a length entry, a + * subtable pointer entry, or an end-of-block entry. Detect the + * two unlikely cases by testing the HUFFDEC_EXCEPTIONAL flag. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Subtable pointer or end-of-block entry */ + + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + + /* + * A subtable is required. Load and consume the + * subtable entry. The subtable entry can be of any + * type: literal, length, or end-of-block. + */ + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + + /* + * 32-bit platforms that use the byte-at-a-time refill + * method have to do a refill here for there to always + * be enough bits to decode a literal that requires a + * subtable, then preload the next litlen decode table + * entry; or to decode a match length that requires a + * subtable, then preload the offset decode table entry. 
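
The subtable path above is the usual two-level refinement for codewords longer than the main table covers: instead of a symbol, the first-level entry carries a subtable base index plus a count of further low bits that select the second-level entry. Continuing the deliberately simplified entry layout from the earlier decode_one() sketch (this is not libdeflate's real entry packing):

#define SUBTABLE_FLAG 0x100	/* hypothetical "entry is a subtable pointer" bit */

/*
 * Two-level lookup: a pointer entry's low byte holds the first-level bits
 * to consume, bits 9..15 hold how many following bits index the subtable,
 * and the high 16 bits hold the subtable's base index in table[].
 */
static unsigned decode_one_2level(struct simple_decoder *d)
{
	uint32_t entry = d->table[d->bitbuf & ((1u << TABLEBITS) - 1)];

	/* Consume the bits matched by the first-level lookup. */
	d->bitbuf >>= entry & 0xFF;
	d->bitsleft -= entry & 0xFF;

	if (entry & SUBTABLE_FLAG) {
		unsigned subbits = (entry >> 9) & 0x7F;

		/* Second level: the next 'subbits' bits pick the entry. */
		entry = d->table[(entry >> 16) +
				 (d->bitbuf & ((1u << subbits) - 1))];
		d->bitbuf >>= entry & 0xFF;
		d->bitsleft -= entry & 0xFF;
	}
	return entry >> 16;
}
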
+ */ + if (!CAN_CONSUME_AND_THEN_PRELOAD(DEFLATE_MAX_LITLEN_CODEWORD_LEN, + LITLEN_TABLEBITS) || + !CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXBITS, + OFFSET_TABLEBITS)) + REFILL_BITS_IN_FASTLOOP(); + if (entry & HUFFDEC_LITERAL) { + /* Decode a literal that required a subtable. */ + lit = entry >> 16; + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + *out_next++ = lit; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + /* Else, it's a length that required a subtable. */ + } + + /* + * Decode the match length: the length base value associated + * with the litlen symbol (which we extract from the decode + * table entry), plus the extra length bits. We don't need to + * consume the extra length bits here, as they were included in + * the bits consumed by the entry earlier. We also don't need + * to check for too-long matches here, as this is inside the + * fastloop where it's already been verified that the output + * buffer has enough space remaining to copy a max-length match. + */ + length = entry >> 16; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* + * Decode the match offset. There are enough "preloadable" bits + * remaining to preload the offset decode table entry, but a + * refill might be needed before consuming it. + */ + STATIC_ASSERT(CAN_CONSUME_AND_THEN_PRELOAD(LENGTH_MAXFASTBITS, + OFFSET_TABLEBITS)); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (CAN_CONSUME_AND_THEN_PRELOAD(OFFSET_MAXBITS, + LITLEN_TABLEBITS)) { + /* + * Decoding a match offset on a 64-bit platform. We may + * need to refill once, but then we can decode the whole + * offset and preload the next litlen table entry. + */ + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + if (unlikely((u8)bitsleft < OFFSET_MAXBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + } else if (unlikely((u8)bitsleft < OFFSET_MAXFASTBITS + + LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + } else { + /* Decoding a match offset on a 32-bit platform */ + REFILL_BITS_IN_FASTLOOP(); + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + /* Offset codeword requires a subtable */ + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + REFILL_BITS_IN_FASTLOOP(); + /* No further refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME( + OFFSET_MAXBITS - OFFSET_TABLEBITS)); + } else { + /* No refill needed before extra bits */ + STATIC_ASSERT(CAN_CONSUME(OFFSET_MAXFASTBITS)); + } + } + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; /* optimization: subtract full entry */ + offset = entry >> 16; + offset += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + + /* Validate the match offset; needed even in the fastloop. */ + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + /* + * Before starting to issue the instructions to copy the match, + * refill the bitbuffer and preload the litlen decode table + * entry for the next loop iteration. This can increase + * performance by allowing the latency of the match copy to + * overlap with these other operations. 
To further reduce + * latency, we've arranged for there to be enough bits remaining + * to do the table preload independently of the refill, except + * on 32-bit platforms using the byte-at-a-time refill method. + */ + if (!CAN_CONSUME_AND_THEN_PRELOAD( + MAX(OFFSET_MAXBITS - OFFSET_TABLEBITS, + OFFSET_MAXFASTBITS), + LITLEN_TABLEBITS) && + unlikely((u8)bitsleft < LITLEN_TABLEBITS - PRELOAD_SLACK)) + REFILL_BITS_IN_FASTLOOP(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + REFILL_BITS_IN_FASTLOOP(); + + /* + * Copy the match. On most CPUs the fastest method is a + * word-at-a-time copy, unconditionally copying about 5 words + * since this is enough for most matches without being too much. + * + * The normal word-at-a-time copy works for offset >= WORDBYTES, + * which is most cases. The case of offset == 1 is also common + * and is worth optimizing for, since it is just RLE encoding of + * the previous byte, which is the result of compressing long + * runs of the same byte. + * + * Writing past the match 'length' is allowed here, since it's + * been ensured there is enough output space left for a slight + * overrun. FASTLOOP_MAX_BYTES_WRITTEN needs to be updated if + * the maximum possible overrun here is changed. + */ + if (UNALIGNED_ACCESS_IS_FAST && offset >= WORDBYTES) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + store_word_unaligned(load_word_unaligned(src), dst); + src += WORDBYTES; + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST && offset == 1) { + machine_word_t v; + + /* + * This part tends to get auto-vectorized, so keep it + * copying a multiple of 16 bytes at a time. 
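
The 0x0101010101010101 multiply in the offset == 1 branch is the classic byte-broadcast trick: multiplying a byte by a word of repeated 0x01 replicates it into every byte lane, so a one-byte RLE match can be emitted with word-sized stores. A tiny standalone illustration:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint8_t b = 0xAB;
	/* Each 0x01 byte of the multiplier picks up one copy of 'b'. */
	uint64_t v = (uint64_t)0x0101010101010101ULL * b;

	assert(v == 0xABABABABABABABABULL);
	return 0;
}
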
+ */ + v = (machine_word_t)0x0101010101010101 * src[0]; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + while (dst < out_next) { + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + store_word_unaligned(v, dst); + dst += WORDBYTES; + } + } else if (UNALIGNED_ACCESS_IS_FAST) { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + do { + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + store_word_unaligned(load_word_unaligned(src), dst); + src += offset; + dst += offset; + } while (dst < out_next); + } else { + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + } while (in_next < in_fastloop_end && out_next < out_fastloop_end); + + /* + * This is the generic loop for decoding literals and matches. This + * handles cases where in_next and out_next are close to the end of + * their respective buffers. Usually this loop isn't performance- + * critical, as most time is spent in the fastloop above instead. We + * therefore omit some optimizations here in favor of smaller code. + */ generic_loop: - for (;;) { - u32 length, offset; - const u8 *src; - u8 *dst; - - REFILL_BITS(); - entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { - entry = d->u.litlen_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - saved_bitbuf = bitbuf; - bitbuf >>= (u8)entry; - bitsleft -= entry; - } - length = entry >> 16; - if (entry & HUFFDEC_LITERAL) { - if (unlikely(out_next == out_end)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - *out_next++ = length; - continue; - } - if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) - goto block_done; - length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); - if (unlikely(length > out_end - out_next)) - return LIBDEFLATE_INSUFFICIENT_SPACE; - - if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) - REFILL_BITS(); - entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; - if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { - bitbuf >>= OFFSET_TABLEBITS; - bitsleft -= OFFSET_TABLEBITS; - entry = d->offset_decode_table[(entry >> 16) + - EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; - if (!CAN_CONSUME(OFFSET_MAXBITS)) - REFILL_BITS(); - } - offset = entry >> 16; - offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); - bitbuf >>= (u8)entry; - bitsleft -= entry; - - SAFETY_CHECK(offset <= out_next - (const u8 *)out); - src = out_next - offset; - dst = out_next; - out_next += length; - - STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); - *dst++ = *src++; - *dst++ = *src++; - do { - *dst++ = *src++; - } while (dst < out_next); - } - + for (;;) { + u32 length, offset; + const u8 *src; + u8 *dst; + + REFILL_BITS(); + entry = d->u.litlen_decode_table[bitbuf & litlen_tablemask]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft -= entry; + if (unlikely(entry & HUFFDEC_SUBTABLE_POINTER)) { + entry = d->u.litlen_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + saved_bitbuf = bitbuf; + bitbuf >>= (u8)entry; + bitsleft 
-= entry; + } + length = entry >> 16; + if (entry & HUFFDEC_LITERAL) { + if (unlikely(out_next == out_end)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + *out_next++ = length; + continue; + } + if (unlikely(entry & HUFFDEC_END_OF_BLOCK)) + goto block_done; + length += EXTRACT_VARBITS8(saved_bitbuf, entry) >> (u8)(entry >> 8); + if (unlikely(length > out_end - out_next)) + return LIBDEFLATE_INSUFFICIENT_SPACE; + + if (!CAN_CONSUME(LENGTH_MAXBITS + OFFSET_MAXBITS)) + REFILL_BITS(); + entry = d->offset_decode_table[bitbuf & BITMASK(OFFSET_TABLEBITS)]; + if (unlikely(entry & HUFFDEC_EXCEPTIONAL)) { + bitbuf >>= OFFSET_TABLEBITS; + bitsleft -= OFFSET_TABLEBITS; + entry = d->offset_decode_table[(entry >> 16) + + EXTRACT_VARBITS(bitbuf, (entry >> 8) & 0x3F)]; + if (!CAN_CONSUME(OFFSET_MAXBITS)) + REFILL_BITS(); + } + offset = entry >> 16; + offset += EXTRACT_VARBITS8(bitbuf, entry) >> (u8)(entry >> 8); + bitbuf >>= (u8)entry; + bitsleft -= entry; + + SAFETY_CHECK(offset <= out_next - (const u8 *)out); + src = out_next - offset; + dst = out_next; + out_next += length; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN == 3); + *dst++ = *src++; + *dst++ = *src++; + do { + *dst++ = *src++; + } while (dst < out_next); + } + block_done: - /* Finished decoding a block */ - - if (!is_final_block) - goto next_block; - - /* That was the last block. */ - - bitsleft = (u8)bitsleft; - - /* - * If any of the implicit appended zero bytes were consumed (not just - * refilled) before hitting end of stream, then the data is bad. - */ - SAFETY_CHECK(overread_count <= (bitsleft >> 3)); - - /* Optionally return the actual number of bytes consumed. */ - if (actual_in_nbytes_ret) { - /* Don't count bytes that were refilled but not consumed. */ - in_next -= (bitsleft >> 3) - overread_count; - - *actual_in_nbytes_ret = in_next - (u8 *)in; - } - - /* Optionally return the actual number of bytes written. */ - if (actual_out_nbytes_ret) { - *actual_out_nbytes_ret = out_next - (u8 *)out; - } else { - if (out_next != out_end) - return LIBDEFLATE_SHORT_OUTPUT; - } - return LIBDEFLATE_SUCCESS; + /* Finished decoding a block */ + + if (!is_final_block) + goto next_block; + + /* That was the last block. */ + + bitsleft = (u8)bitsleft; + + /* + * If any of the implicit appended zero bytes were consumed (not just + * refilled) before hitting end of stream, then the data is bad. + */ + SAFETY_CHECK(overread_count <= (bitsleft >> 3)); + + /* Optionally return the actual number of bytes consumed. */ + if (actual_in_nbytes_ret) { + /* Don't count bytes that were refilled but not consumed. */ + in_next -= (bitsleft >> 3) - overread_count; + + *actual_in_nbytes_ret = in_next - (u8 *)in; + } + + /* Optionally return the actual number of bytes written. */ + if (actual_out_nbytes_ret) { + *actual_out_nbytes_ret = out_next - (u8 *)out; + } else { + if (out_next != out_end) + return LIBDEFLATE_SHORT_OUTPUT; + } + return LIBDEFLATE_SUCCESS; } #undef FUNCNAME diff --git a/Sources/DEFLATE/deflate_compress.c b/Sources/DEFLATE/deflate_compress.c index 32c736d8..fe71dd8d 100644 --- a/Sources/DEFLATE/deflate_compress.c +++ b/Sources/DEFLATE/deflate_compress.c @@ -45,7 +45,7 @@ * algorithms. However, it is slow. If this parameter is defined to 0, then * levels 10-12 will be the same as level 9 and will use the lazy2 algorithm. */ -#define SUPPORT_NEAR_OPTIMAL_PARSING 1 +#define SUPPORT_NEAR_OPTIMAL_PARSING 1 /* * This is the minimum block length that the compressor will use, in @@ -63,7 +63,7 @@ * reasonable upper bound on the compressed size. 
It's also needed because our * block splitting algorithm doesn't work well on very short blocks. */ -#define MIN_BLOCK_LENGTH 5000 +#define MIN_BLOCK_LENGTH 5000 /* * For the greedy, lazy, lazy2, and near-optimal compressors: This is the soft @@ -78,7 +78,7 @@ * increasing/decreasing this parameter will increase/decrease per-compressor * memory usage linearly. */ -#define SOFT_MAX_BLOCK_LENGTH 300000 +#define SOFT_MAX_BLOCK_LENGTH 300000 /* * For the greedy, lazy, and lazy2 compressors: this is the length of the @@ -90,7 +90,7 @@ * being ended normally before then. Increasing/decreasing this value will * increase/decrease per-compressor memory usage linearly. */ -#define SEQ_STORE_LENGTH 50000 +#define SEQ_STORE_LENGTH 50000 /* * For deflate_compress_fastest(): This is the soft maximum block length. @@ -99,13 +99,13 @@ * FAST_SEQ_STORE_LENGTH matches. Therefore, this value should be lower than * the regular SOFT_MAX_BLOCK_LENGTH. */ -#define FAST_SOFT_MAX_BLOCK_LENGTH 65535 +#define FAST_SOFT_MAX_BLOCK_LENGTH 65535 /* * For deflate_compress_fastest(): this is the length of the sequence store. * This is like SEQ_STORE_LENGTH, but this should be a lower value. */ -#define FAST_SEQ_STORE_LENGTH 8192 +#define FAST_SEQ_STORE_LENGTH 8192 /* * These are the maximum codeword lengths, in bits, the compressor will use for @@ -114,9 +114,9 @@ * negligible effect on compression ratio but allows some optimizations when * outputting bits. (It allows 4 literals to be written at once rather than 3.) */ -#define MAX_LITLEN_CODEWORD_LEN 14 -#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN -#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN +#define MAX_LITLEN_CODEWORD_LEN 14 +#define MAX_OFFSET_CODEWORD_LEN DEFLATE_MAX_OFFSET_CODEWORD_LEN +#define MAX_PRE_CODEWORD_LEN DEFLATE_MAX_PRE_CODEWORD_LEN #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -137,7 +137,7 @@ * BIT_COST doesn't apply to deflate_flush_block() and * deflate_compute_true_cost(), which consider whole bits. */ -#define BIT_COST 16 +#define BIT_COST 16 /* * The NOSTAT_BITS value for a given alphabet is the number of bits assumed to @@ -146,23 +146,23 @@ * optimization pass. However, the cost should be relatively high because the * symbol probably won't be used very many times (if at all). */ -#define LITERAL_NOSTAT_BITS 13 -#define LENGTH_NOSTAT_BITS 13 -#define OFFSET_NOSTAT_BITS 10 +#define LITERAL_NOSTAT_BITS 13 +#define LENGTH_NOSTAT_BITS 13 +#define OFFSET_NOSTAT_BITS 10 /* * This is (slightly less than) the maximum number of matches that the * near-optimal compressor will cache per block. This behaves similarly to * SEQ_STORE_LENGTH for the other compressors. */ -#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) +#define MATCH_CACHE_LENGTH (SOFT_MAX_BLOCK_LENGTH * 5) #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ /******************************************************************************/ /* Include the needed matchfinders. */ -#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER +#define MATCHFINDER_WINDOW_ORDER DEFLATE_WINDOW_ORDER #include "hc_matchfinder.h" #include "ht_matchfinder.h" #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -174,8 +174,8 @@ * an upper bound. (This says nothing about whether it is worthwhile to * consider so many matches; this is just defining the worst case.) 
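
For concreteness, assuming the standard DEFLATE match-length bounds of 3 and 258 (DEFLATE_MIN_MATCH_LEN == 3 is asserted in the decompress template above, and 258 is the last entry of deflate_length_slot_base below), the definition that follows works out to MAX_MATCHES_PER_POS == 258 - 3 + 1 == 256; the MATCH_CACHE_LENGTH assertion further down then amounts to 5000 * 256 == 1,280,000 <= 300,000 * 5 == 1,500,000, which holds with the values chosen above.
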
*/ -#define MAX_MATCHES_PER_POS \ - (DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) +#define MAX_MATCHES_PER_POS \ +(DEFLATE_MAX_MATCH_LEN - DEFLATE_MIN_MATCH_LEN + 1) #endif /* @@ -185,103 +185,103 @@ * occurs when the lazy2 compressor chooses two literals and a maximum-length * match, starting at SOFT_MAX_BLOCK_LENGTH - 1. */ -#define MAX_BLOCK_LENGTH \ - MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ - SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) +#define MAX_BLOCK_LENGTH \ +MAX(SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH - 1, \ +SOFT_MAX_BLOCK_LENGTH + 1 + DEFLATE_MAX_MATCH_LEN) static forceinline void check_buildtime_parameters(void) { - /* - * Verify that MIN_BLOCK_LENGTH is being honored, as - * libdeflate_deflate_compress_bound() depends on it. - */ - STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); - STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); - STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= - MIN_BLOCK_LENGTH); - STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= - MIN_BLOCK_LENGTH); + /* + * Verify that MIN_BLOCK_LENGTH is being honored, as + * libdeflate_deflate_compress_bound() depends on it. + */ + STATIC_ASSERT(SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH >= MIN_BLOCK_LENGTH); + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN >= + MIN_BLOCK_LENGTH); #if SUPPORT_NEAR_OPTIMAL_PARSING - STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= - MATCH_CACHE_LENGTH); + STATIC_ASSERT(MIN_BLOCK_LENGTH * MAX_MATCHES_PER_POS <= + MATCH_CACHE_LENGTH); #endif - - /* The definition of MAX_BLOCK_LENGTH assumes this. */ - STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); - - /* Verify that the sequence stores aren't uselessly large. */ - STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= - SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); - STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= - FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); - - /* Verify that the maximum codeword lengths are valid. */ - STATIC_ASSERT( - MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); - STATIC_ASSERT( - MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); - STATIC_ASSERT( - MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); - STATIC_ASSERT( - (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); - STATIC_ASSERT( - (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); - STATIC_ASSERT( - (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); + + /* The definition of MAX_BLOCK_LENGTH assumes this. */ + STATIC_ASSERT(FAST_SOFT_MAX_BLOCK_LENGTH <= SOFT_MAX_BLOCK_LENGTH); + + /* Verify that the sequence stores aren't uselessly large. */ + STATIC_ASSERT(SEQ_STORE_LENGTH * DEFLATE_MIN_MATCH_LEN <= + SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + STATIC_ASSERT(FAST_SEQ_STORE_LENGTH * HT_MATCHFINDER_MIN_MATCH_LEN <= + FAST_SOFT_MAX_BLOCK_LENGTH + MIN_BLOCK_LENGTH); + + /* Verify that the maximum codeword lengths are valid. 
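For the MAX_BLOCK_LENGTH definition above, a quick worked check of which term of the MAX() dominates under the current parameter values (a standalone sketch, not part of the source):

#include <assert.h>

int main(void)
{
    const unsigned soft_max = 300000;             /* SOFT_MAX_BLOCK_LENGTH */
    const unsigned min_len = 5000;                /* MIN_BLOCK_LENGTH */
    const unsigned max_match = 258;               /* DEFLATE_MAX_MATCH_LEN */
    const unsigned a = soft_max + min_len - 1;    /* 304999 */
    const unsigned b = soft_max + 1 + max_match;  /* 300259 */

    assert((a > b ? a : b) == 304999);            /* the first term wins */
    return 0;
}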
*/ + STATIC_ASSERT( + MAX_LITLEN_CODEWORD_LEN <= DEFLATE_MAX_LITLEN_CODEWORD_LEN); + STATIC_ASSERT( + MAX_OFFSET_CODEWORD_LEN <= DEFLATE_MAX_OFFSET_CODEWORD_LEN); + STATIC_ASSERT( + MAX_PRE_CODEWORD_LEN <= DEFLATE_MAX_PRE_CODEWORD_LEN); + STATIC_ASSERT( + (1U << MAX_LITLEN_CODEWORD_LEN) >= DEFLATE_NUM_LITLEN_SYMS); + STATIC_ASSERT( + (1U << MAX_OFFSET_CODEWORD_LEN) >= DEFLATE_NUM_OFFSET_SYMS); + STATIC_ASSERT( + (1U << MAX_PRE_CODEWORD_LEN) >= DEFLATE_NUM_PRECODE_SYMS); } /******************************************************************************/ /* Table: length slot => length slot base value */ static const unsigned deflate_length_slot_base[] = { - 3, 4, 5, 6, 7, 8, 9, 10, - 11, 13, 15, 17, 19, 23, 27, 31, - 35, 43, 51, 59, 67, 83, 99, 115, - 131, 163, 195, 227, 258, + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, + 131, 163, 195, 227, 258, }; /* Table: length slot => number of extra length bits */ static const u8 deflate_extra_length_bits[] = { - 0, 0, 0, 0, 0, 0, 0, 0, - 1, 1, 1, 1, 2, 2, 2, 2, - 3, 3, 3, 3, 4, 4, 4, 4, - 5, 5, 5, 5, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, + 5, 5, 5, 5, 0, }; /* Table: offset slot => offset slot base value */ static const unsigned deflate_offset_slot_base[] = { - 1, 2, 3, 4, 5, 7, 9, 13, - 17, 25, 33, 49, 65, 97, 129, 193, - 257, 385, 513, 769, 1025, 1537, 2049, 3073, - 4097, 6145, 8193, 12289, 16385, 24577, + 1, 2, 3, 4, 5, 7, 9, 13, + 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, + 4097, 6145, 8193, 12289, 16385, 24577, }; /* Table: offset slot => number of extra offset bits */ static const u8 deflate_extra_offset_bits[] = { - 0, 0, 0, 0, 1, 1, 2, 2, - 3, 3, 4, 4, 5, 5, 6, 6, - 7, 7, 8, 8, 9, 9, 10, 10, - 11, 11, 12, 12, 13, 13, + 0, 0, 0, 0, 1, 1, 2, 2, + 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, + 11, 11, 12, 12, 13, 13, }; /* Table: length => length slot */ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { - 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, - 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, - 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, - 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, - 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, - 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, - 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, - 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, - 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, - 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, - 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, - 27, 27, 28, + 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, + 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 16, + 16, 16, 16, 17, 17, 17, 17, 17, 17, 17, 17, 18, 18, 18, 18, 18, 18, 18, + 18, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 23, 23, 
23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 28, }; /* @@ -289,38 +289,38 @@ static const u8 deflate_length_slot[DEFLATE_MAX_MATCH_LEN + 1] = { * This was generated by scripts/gen_offset_slot_map.py. */ static const u8 deflate_offset_slot[256] = { - 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, - 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, - 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, - 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, - 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, - 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, + 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9, + 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, }; /* The order in which precode codeword lengths are stored */ static const u8 deflate_precode_lens_permutation[DEFLATE_NUM_PRECODE_SYMS] = { - 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15 }; /* Table: precode symbol => number of extra bits */ static const u8 deflate_extra_precode_bits[DEFLATE_NUM_PRECODE_SYMS] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 3, 7 }; /* Codewords for the DEFLATE Huffman codes */ struct deflate_codewords { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 
offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* @@ -328,20 +328,20 @@ struct deflate_codewords { * A zero length means the corresponding symbol had zero frequency. */ struct deflate_lens { - u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u8 offset[DEFLATE_NUM_OFFSET_SYMS]; + u8 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u8 offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* Codewords and lengths for the DEFLATE Huffman codes */ struct deflate_codes { - struct deflate_codewords codewords; - struct deflate_lens lens; + struct deflate_codewords codewords; + struct deflate_lens lens; }; /* Symbol frequency counters for the DEFLATE Huffman codes */ struct deflate_freqs { - u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; - u32 offset[DEFLATE_NUM_OFFSET_SYMS]; + u32 litlen[DEFLATE_NUM_LITLEN_SYMS]; + u32 offset[DEFLATE_NUM_OFFSET_SYMS]; }; /* @@ -351,47 +351,47 @@ struct deflate_freqs { * block's Huffman codes have been computed. */ struct deflate_sequence { - - /* - * Bits 0..22: the number of literals in this run. This may be 0 and - * can be at most MAX_BLOCK_LENGTH. The literals are not stored - * explicitly in this structure; instead, they are read directly from - * the uncompressed data. - * - * Bits 23..31: the length of the match which follows the literals, or 0 - * if this literal run was the last in the block, so there is no match - * which follows it. - */ + + /* + * Bits 0..22: the number of literals in this run. This may be 0 and + * can be at most MAX_BLOCK_LENGTH. The literals are not stored + * explicitly in this structure; instead, they are read directly from + * the uncompressed data. + * + * Bits 23..31: the length of the match which follows the literals, or 0 + * if this literal run was the last in the block, so there is no match + * which follows it. + */ #define SEQ_LENGTH_SHIFT 23 #define SEQ_LITRUNLEN_MASK (((u32)1 << SEQ_LENGTH_SHIFT) - 1) - u32 litrunlen_and_length; - - /* - * If 'length' doesn't indicate end-of-block, then this is the offset of - * the match which follows the literals. - */ - u16 offset; - - /* - * If 'length' doesn't indicate end-of-block, then this is the offset - * slot of the match which follows the literals. - */ - u16 offset_slot; + u32 litrunlen_and_length; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset of + * the match which follows the literals. + */ + u16 offset; + + /* + * If 'length' doesn't indicate end-of-block, then this is the offset + * slot of the match which follows the literals. + */ + u16 offset_slot; }; #if SUPPORT_NEAR_OPTIMAL_PARSING /* Costs for the near-optimal parsing algorithm */ struct deflate_costs { - - /* The cost to output each possible literal */ - u32 literal[DEFLATE_NUM_LITERALS]; - - /* The cost to output each possible match length */ - u32 length[DEFLATE_MAX_MATCH_LEN + 1]; - - /* The cost to output a match offset of each possible offset slot */ - u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; + + /* The cost to output each possible literal */ + u32 literal[DEFLATE_NUM_LITERALS]; + + /* The cost to output each possible match length */ + u32 length[DEFLATE_MAX_MATCH_LEN + 1]; + + /* The cost to output a match offset of each possible offset slot */ + u32 offset_slot[DEFLATE_NUM_OFFSET_SYMS]; }; /* @@ -406,31 +406,31 @@ struct deflate_costs { * But these "edges" are actually stored elsewhere (in 'match_cache'). Here we * associate with each node just two pieces of information: * - * 'cost_to_end' is the minimum cost to reach the end of the block from - * this position. 
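To illustrate the litrunlen_and_length packing described above, here is a minimal standalone sketch; seq_pack() is a hypothetical helper, and the shift/mask values are copied from the definitions above:

#include <assert.h>
#include <stdint.h>

#define SEQ_LENGTH_SHIFT   23
#define SEQ_LITRUNLEN_MASK (((uint32_t)1 << SEQ_LENGTH_SHIFT) - 1)

/* Hypothetical helper: pack a literal-run length and a match length. */
static uint32_t seq_pack(uint32_t litrunlen, uint32_t length)
{
    assert(litrunlen <= SEQ_LITRUNLEN_MASK);                   /* bits 0..22 */
    assert(length < ((uint32_t)1 << (32 - SEQ_LENGTH_SHIFT))); /* bits 23..31 */
    return litrunlen | (length << SEQ_LENGTH_SHIFT);
}

int main(void)
{
    uint32_t v = seq_pack(300000, 258);  /* long literal run + maximum-length match */

    assert((v & SEQ_LITRUNLEN_MASK) == 300000);
    assert((v >> SEQ_LENGTH_SHIFT) == 258);
    return 0;
}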
+ * 'cost_to_end' is the minimum cost to reach the end of the block from + * this position. * - * 'item' represents the literal or match that must be chosen from here to - * reach the end of the block with the minimum cost. Equivalently, this - * can be interpreted as the label of the outgoing edge on the minimum-cost - * path to the "end of block" node from this node. + * 'item' represents the literal or match that must be chosen from here to + * reach the end of the block with the minimum cost. Equivalently, this + * can be interpreted as the label of the outgoing edge on the minimum-cost + * path to the "end of block" node from this node. */ struct deflate_optimum_node { - - u32 cost_to_end; - - /* - * Notes on the match/literal representation used here: - * - * The low bits of 'item' are the length: 1 if this is a literal, - * or the match length if this is a match. - * - * The high bits of 'item' are the actual literal byte if this is a - * literal, or the match offset if this is a match. - */ + + u32 cost_to_end; + + /* + * Notes on the match/literal representation used here: + * + * The low bits of 'item' are the length: 1 if this is a literal, + * or the match length if this is a match. + * + * The high bits of 'item' are the actual literal byte if this is a + * literal, or the match offset if this is a match. + */ #define OPTIMUM_OFFSET_SHIFT 9 #define OPTIMUM_LEN_MASK (((u32)1 << OPTIMUM_OFFSET_SHIFT) - 1) - u32 item; - + u32 item; + }; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ @@ -439,226 +439,226 @@ struct deflate_optimum_node { #define NUM_LITERAL_OBSERVATION_TYPES 8 #define NUM_MATCH_OBSERVATION_TYPES 2 #define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ - NUM_MATCH_OBSERVATION_TYPES) +NUM_MATCH_OBSERVATION_TYPES) #define NUM_OBSERVATIONS_PER_BLOCK_CHECK 512 struct block_split_stats { - u32 new_observations[NUM_OBSERVATION_TYPES]; - u32 observations[NUM_OBSERVATION_TYPES]; - u32 num_new_observations; - u32 num_observations; + u32 new_observations[NUM_OBSERVATION_TYPES]; + u32 observations[NUM_OBSERVATION_TYPES]; + u32 num_new_observations; + u32 num_observations; }; struct deflate_output_bitstream; /* The main DEFLATE compressor structure */ struct libdeflate_compressor { - - /* Pointer to the compress() implementation chosen at allocation time */ - void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, - size_t in_nbytes, struct deflate_output_bitstream *os); - - /* The free() function for this struct, chosen at allocation time */ - free_func_t free_func; - - /* The compression level with which this compressor was created */ - unsigned compression_level; - - /* Anything of this size or less we won't bother trying to compress. 
*/ - size_t max_passthrough_size; - - /* - * The maximum search depth: consider at most this many potential - * matches at each position - */ - unsigned max_search_depth; - - /* - * The "nice" match length: if a match of this length is found, choose - * it immediately without further consideration - */ - unsigned nice_match_length; - - /* Frequency counters for the current block */ - struct deflate_freqs freqs; - - /* Block split statistics for the current block */ - struct block_split_stats split_stats; - - /* Dynamic Huffman codes for the current block */ - struct deflate_codes codes; - - /* The static Huffman codes defined by the DEFLATE format */ - struct deflate_codes static_codes; - - /* Temporary space for block flushing */ - union { - /* Information about the precode */ - struct { - u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; - u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; - u8 lens[DEFLATE_NUM_PRECODE_SYMS]; - unsigned items[DEFLATE_NUM_LITLEN_SYMS + - DEFLATE_NUM_OFFSET_SYMS]; - unsigned num_litlen_syms; - unsigned num_offset_syms; - unsigned num_explicit_lens; - unsigned num_items; - } precode; - /* - * The "full" length codewords. Used only after the information - * in 'precode' is no longer needed. - */ - struct { - u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; - u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; - } length; - } o; - - union { - /* Data for greedy or lazy parsing */ - struct { - /* Hash chains matchfinder */ - struct hc_matchfinder hc_mf; - - /* Matches and literals chosen for the current block */ - struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; - - } g; /* (g)reedy */ - - /* Data for fastest parsing */ - struct { - /* Hash table matchfinder */ - struct ht_matchfinder ht_mf; - - /* Matches and literals chosen for the current block */ - struct deflate_sequence sequences[ - FAST_SEQ_STORE_LENGTH + 1]; - - } f; /* (f)astest */ - - #if SUPPORT_NEAR_OPTIMAL_PARSING - /* Data for near-optimal parsing */ - struct { - - /* Binary tree matchfinder */ - struct bt_matchfinder bt_mf; - - /* - * Cached matches for the current block. This array - * contains the matches that were found at each position - * in the block. Specifically, for each position, there - * is a list of matches found at that position, if any, - * sorted by strictly increasing length. In addition, - * following the matches for each position, there is a - * special 'struct lz_match' whose 'length' member - * contains the number of matches found at that - * position, and whose 'offset' member contains the - * literal at that position. - * - * Note: in rare cases, there will be a very high number - * of matches in the block and this array will overflow. - * If this happens, we force the end of the current - * block. MATCH_CACHE_LENGTH is the length at which we - * actually check for overflow. The extra slots beyond - * this are enough to absorb the worst case overflow, - * which occurs if starting at - * &match_cache[MATCH_CACHE_LENGTH - 1], we write - * MAX_MATCHES_PER_POS matches and a match count header, - * then skip searching for matches at - * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the - * match count header for each. - */ - struct lz_match match_cache[MATCH_CACHE_LENGTH + - MAX_MATCHES_PER_POS + - DEFLATE_MAX_MATCH_LEN - 1]; - - /* - * Array of nodes, one per position, for running the - * minimum-cost path algorithm. - * - * This array must be large enough to accommodate the - * worst-case number of nodes, which is MAX_BLOCK_LENGTH - * plus 1 for the end-of-block node. 
- */ - struct deflate_optimum_node optimum_nodes[ - MAX_BLOCK_LENGTH + 1]; - - /* The current cost model being used */ - struct deflate_costs costs; - - /* Saved cost model */ - struct deflate_costs costs_saved; - - /* - * A table that maps match offset to offset slot. This - * differs from deflate_offset_slot[] in that this is a - * full map, not a condensed one. The full map is more - * appropriate for the near-optimal parser, since the - * near-optimal parser does more offset => offset_slot - * translations, it doesn't intersperse them with - * matchfinding (so cache evictions are less of a - * concern), and it uses more memory anyway. - */ - u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; - - /* Literal/match statistics saved from previous block */ - u32 prev_observations[NUM_OBSERVATION_TYPES]; - u32 prev_num_observations; - - /* - * Approximate match length frequencies based on a - * greedy parse, gathered during matchfinding. This is - * used for setting the initial symbol costs. - */ - u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; - u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; - - /* - * The maximum number of optimization passes - * (min-cost path searches) per block. - * Larger values = more compression. - */ - unsigned max_optim_passes; - - /* - * If an optimization pass improves the cost by fewer - * than this number of bits, then optimization will stop - * early, before max_optim_passes has been reached. - * Smaller values = more compression. - */ - unsigned min_improvement_to_continue; - - /* - * The minimum number of bits that would need to be - * saved for it to be considered worth the time to - * regenerate and use the min-cost path from a previous - * optimization pass, in the case where the final - * optimization pass actually increased the cost. - * Smaller values = more compression. - */ - unsigned min_bits_to_use_nonfinal_path; - - /* - * The maximum block length, in uncompressed bytes, at - * which to find and consider the optimal match/literal - * list for the static Huffman codes. This strategy - * improves the compression ratio produced by static - * Huffman blocks and can discover more cases in which - * static blocks are worthwhile. This helps mostly with - * small blocks, hence why this parameter is a max_len. - * - * Above this block length, static Huffman blocks are - * only used opportunistically. I.e. a static Huffman - * block is only used if a static block using the same - * match/literal list as the optimized dynamic block - * happens to be cheaper than the dynamic block itself. - */ - unsigned max_len_to_optimize_static_block; - - } n; /* (n)ear-optimal */ - #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - - } p; /* (p)arser */ + + /* Pointer to the compress() implementation chosen at allocation time */ + void (*impl)(struct libdeflate_compressor *restrict c, const u8 *in, + size_t in_nbytes, struct deflate_output_bitstream *os); + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; + + /* The compression level with which this compressor was created */ + unsigned compression_level; + + /* Anything of this size or less we won't bother trying to compress. 
*/ + size_t max_passthrough_size; + + /* + * The maximum search depth: consider at most this many potential + * matches at each position + */ + unsigned max_search_depth; + + /* + * The "nice" match length: if a match of this length is found, choose + * it immediately without further consideration + */ + unsigned nice_match_length; + + /* Frequency counters for the current block */ + struct deflate_freqs freqs; + + /* Block split statistics for the current block */ + struct block_split_stats split_stats; + + /* Dynamic Huffman codes for the current block */ + struct deflate_codes codes; + + /* The static Huffman codes defined by the DEFLATE format */ + struct deflate_codes static_codes; + + /* Temporary space for block flushing */ + union { + /* Information about the precode */ + struct { + u32 freqs[DEFLATE_NUM_PRECODE_SYMS]; + u32 codewords[DEFLATE_NUM_PRECODE_SYMS]; + u8 lens[DEFLATE_NUM_PRECODE_SYMS]; + unsigned items[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS]; + unsigned num_litlen_syms; + unsigned num_offset_syms; + unsigned num_explicit_lens; + unsigned num_items; + } precode; + /* + * The "full" length codewords. Used only after the information + * in 'precode' is no longer needed. + */ + struct { + u32 codewords[DEFLATE_MAX_MATCH_LEN + 1]; + u8 lens[DEFLATE_MAX_MATCH_LEN + 1]; + } length; + } o; + + union { + /* Data for greedy or lazy parsing */ + struct { + /* Hash chains matchfinder */ + struct hc_matchfinder hc_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[SEQ_STORE_LENGTH + 1]; + + } g; /* (g)reedy */ + + /* Data for fastest parsing */ + struct { + /* Hash table matchfinder */ + struct ht_matchfinder ht_mf; + + /* Matches and literals chosen for the current block */ + struct deflate_sequence sequences[ + FAST_SEQ_STORE_LENGTH + 1]; + + } f; /* (f)astest */ + +#if SUPPORT_NEAR_OPTIMAL_PARSING + /* Data for near-optimal parsing */ + struct { + + /* Binary tree matchfinder */ + struct bt_matchfinder bt_mf; + + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a list of matches found at that position, if any, + * sorted by strictly increasing length. In addition, + * following the matches for each position, there is a + * special 'struct lz_match' whose 'length' member + * contains the number of matches found at that + * position, and whose 'offset' member contains the + * literal at that position. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. MATCH_CACHE_LENGTH is the length at which we + * actually check for overflow. The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at + * &match_cache[MATCH_CACHE_LENGTH - 1], we write + * MAX_MATCHES_PER_POS matches and a match count header, + * then skip searching for matches at + * 'DEFLATE_MAX_MATCH_LEN - 1' positions and write the + * match count header for each. + */ + struct lz_match match_cache[MATCH_CACHE_LENGTH + + MAX_MATCHES_PER_POS + + DEFLATE_MAX_MATCH_LEN - 1]; + + /* + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. + * + * This array must be large enough to accommodate the + * worst-case number of nodes, which is MAX_BLOCK_LENGTH + * plus 1 for the end-of-block node. 
+ */ + struct deflate_optimum_node optimum_nodes[ + MAX_BLOCK_LENGTH + 1]; + + /* The current cost model being used */ + struct deflate_costs costs; + + /* Saved cost model */ + struct deflate_costs costs_saved; + + /* + * A table that maps match offset to offset slot. This + * differs from deflate_offset_slot[] in that this is a + * full map, not a condensed one. The full map is more + * appropriate for the near-optimal parser, since the + * near-optimal parser does more offset => offset_slot + * translations, it doesn't intersperse them with + * matchfinding (so cache evictions are less of a + * concern), and it uses more memory anyway. + */ + u8 offset_slot_full[DEFLATE_MAX_MATCH_OFFSET + 1]; + + /* Literal/match statistics saved from previous block */ + u32 prev_observations[NUM_OBSERVATION_TYPES]; + u32 prev_num_observations; + + /* + * Approximate match length frequencies based on a + * greedy parse, gathered during matchfinding. This is + * used for setting the initial symbol costs. + */ + u32 new_match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + u32 match_len_freqs[DEFLATE_MAX_MATCH_LEN + 1]; + + /* + * The maximum number of optimization passes + * (min-cost path searches) per block. + * Larger values = more compression. + */ + unsigned max_optim_passes; + + /* + * If an optimization pass improves the cost by fewer + * than this number of bits, then optimization will stop + * early, before max_optim_passes has been reached. + * Smaller values = more compression. + */ + unsigned min_improvement_to_continue; + + /* + * The minimum number of bits that would need to be + * saved for it to be considered worth the time to + * regenerate and use the min-cost path from a previous + * optimization pass, in the case where the final + * optimization pass actually increased the cost. + * Smaller values = more compression. + */ + unsigned min_bits_to_use_nonfinal_path; + + /* + * The maximum block length, in uncompressed bytes, at + * which to find and consider the optimal match/literal + * list for the static Huffman codes. This strategy + * improves the compression ratio produced by static + * Huffman blocks and can discover more cases in which + * static blocks are worthwhile. This helps mostly with + * small blocks, hence why this parameter is a max_len. + * + * Above this block length, static Huffman blocks are + * only used opportunistically. I.e. a static Huffman + * block is only used if a static block using the same + * match/literal list as the optimized dynamic block + * happens to be cheaper than the dynamic block itself. + */ + unsigned max_len_to_optimize_static_block; + + } n; /* (n)ear-optimal */ +#endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ + + } p; /* (p)arser */ }; /* @@ -672,41 +672,41 @@ typedef machine_word_t bitbuf_t; * The capacity of the bitbuffer, in bits. This is 1 less than the real size, * in order to avoid undefined behavior when doing bitbuf >>= bitcount & ~7. */ -#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) +#define BITBUF_NBITS (8 * sizeof(bitbuf_t) - 1) /* * Can the specified number of bits always be added to 'bitbuf' after any * pending bytes have been flushed? There can be up to 7 bits remaining after a * flush, so the count must not exceed BITBUF_NBITS after adding 'n' more bits. 
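A quick worked check of the capacity argument in the comment above, assuming a 64-bit bitbuf_t: after a flush up to 7 bits may remain, leaving room for 56 more, which is exactly four 14-bit literal codewords (standalone sketch, not from the source):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    const unsigned bitbuf_nbits = 8 * sizeof(uint64_t) - 1;  /* 63 on a 64-bit build */

    assert(7 + 4 * 14 <= bitbuf_nbits);  /* four 14-bit literals always fit */
    assert(7 + 4 * 15 > bitbuf_nbits);   /* four 15-bit codewords might not */
    return 0;
}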
*/ -#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) +#define CAN_BUFFER(n) (7 + (n) <= BITBUF_NBITS) /* * Structure to keep track of the current state of sending bits to the * compressed output buffer */ struct deflate_output_bitstream { - - /* Bits that haven't yet been written to the output buffer */ - bitbuf_t bitbuf; - - /* - * Number of bits currently held in @bitbuf. This can be between 0 and - * BITBUF_NBITS in general, or between 0 and 7 after a flush. - */ - unsigned bitcount; - - /* - * Pointer to the position in the output buffer at which the next byte - * should be written - */ - u8 *next; - - /* Pointer to the end of the output buffer */ - u8 *end; - - /* true if the output buffer ran out of space */ - bool overflow; + + /* Bits that haven't yet been written to the output buffer */ + bitbuf_t bitbuf; + + /* + * Number of bits currently held in @bitbuf. This can be between 0 and + * BITBUF_NBITS in general, or between 0 and 7 after a flush. + */ + unsigned bitcount; + + /* + * Pointer to the position in the output buffer at which the next byte + * should be written + */ + u8 *next; + + /* Pointer to the end of the output buffer */ + u8 *end; + + /* true if the output buffer ran out of space */ + bool overflow; }; /* @@ -714,11 +714,11 @@ struct deflate_output_bitstream { * must ensure that 'bitcount + n <= BITBUF_NBITS', by calling FLUSH_BITS() * frequently enough. */ -#define ADD_BITS(bits, n) \ -do { \ - bitbuf |= (bitbuf_t)(bits) << bitcount; \ - bitcount += (n); \ - ASSERT(bitcount <= BITBUF_NBITS); \ +#define ADD_BITS(bits, n) \ +do { \ +bitbuf |= (bitbuf_t)(bits) << bitcount; \ +bitcount += (n); \ +ASSERT(bitcount <= BITBUF_NBITS); \ } while (0) /* @@ -731,23 +731,23 @@ do { \ * flush a whole word, even though that's fastest. Therefore, flush a whole * word if there is space for it, otherwise flush a byte at a time. */ -#define FLUSH_BITS() \ -do { \ - if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ - /* Flush a whole word (branchlessly). */ \ - put_unaligned_leword(bitbuf, out_next); \ - bitbuf >>= bitcount & ~7; \ - out_next += bitcount >> 3; \ - bitcount &= 7; \ - } else { \ - /* Flush a byte at a time. */ \ - while (bitcount >= 8) { \ - ASSERT(out_next < os->end); \ - *out_next++ = bitbuf; \ - bitcount -= 8; \ - bitbuf >>= 8; \ - } \ - } \ +#define FLUSH_BITS() \ +do { \ +if (UNALIGNED_ACCESS_IS_FAST && likely(out_next < out_fast_end)) { \ +/* Flush a whole word (branchlessly). */ \ +put_unaligned_leword(bitbuf, out_next); \ +bitbuf >>= bitcount & ~7; \ +out_next += bitcount >> 3; \ +bitcount &= 7; \ +} else { \ +/* Flush a byte at a time. 
*/ \ +while (bitcount >= 8) { \ +ASSERT(out_next < os->end); \ +*out_next++ = bitbuf; \ +bitcount -= 8; \ +bitbuf >>= 8; \ +} \ +} \ } while (0) /* @@ -759,21 +759,21 @@ do { \ static void heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) { - unsigned parent_idx; - unsigned child_idx; - u32 v; - - v = A[subtree_idx]; - parent_idx = subtree_idx; - while ((child_idx = parent_idx * 2) <= length) { - if (child_idx < length && A[child_idx + 1] > A[child_idx]) - child_idx++; - if (v >= A[child_idx]) - break; - A[parent_idx] = A[child_idx]; - parent_idx = child_idx; - } - A[parent_idx] = v; + unsigned parent_idx; + unsigned child_idx; + u32 v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; } /* @@ -783,10 +783,10 @@ heapify_subtree(u32 A[], unsigned length, unsigned subtree_idx) static void heapify_array(u32 A[], unsigned length) { - unsigned subtree_idx; - - for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) - heapify_subtree(A, length, subtree_idx); + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); } /* @@ -798,26 +798,26 @@ heapify_array(u32 A[], unsigned length) static void heap_sort(u32 A[], unsigned length) { - A--; /* Use 1-based indices */ - - heapify_array(A, length); - - while (length >= 2) { - u32 tmp = A[length]; - - A[length] = A[1]; - A[1] = tmp; - length--; - heapify_subtree(A, length, 1); - } + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + u32 tmp = A[length]; + + A[length] = A[1]; + A[1] = tmp; + length--; + heapify_subtree(A, length, 1); + } } #define NUM_SYMBOL_BITS 10 -#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) -#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) -#define FREQ_MASK (~SYMBOL_MASK) +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) -#define GET_NUM_COUNTERS(num_syms) (num_syms) +#define GET_NUM_COUNTERS(num_syms) (num_syms) /* * Sort the symbols primarily by frequency and secondarily by symbol value. @@ -827,18 +827,18 @@ heap_sort(u32 A[], unsigned length) * contain the frequency. * * @num_syms - * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. * * @freqs[num_syms] - * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. * * @lens[num_syms] - * An array that eventually will hold the length of each codeword. This - * function only fills in the codeword lengths for symbols that have zero - * frequency, which are not well defined per se but will be set to 0. + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. * * @symout[num_syms] - * The output array, described above. + * The output array, described above. * * Returns the number of entries in 'symout' that were filled. This is the * number of symbols that have nonzero frequency. 
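The sort key layout documented above (symbol in the low NUM_SYMBOL_BITS bits, frequency in the remaining high bits) means that sorting the packed u32 values orders primarily by frequency and secondarily by symbol value. A small standalone sketch; the two constants are copied from the definitions above and the rest is illustrative:

#include <assert.h>
#include <stdint.h>

#define NUM_SYMBOL_BITS 10
#define SYMBOL_MASK     ((1 << NUM_SYMBOL_BITS) - 1)

int main(void)
{
    uint32_t a = 7 | (100u << NUM_SYMBOL_BITS);  /* symbol 7, frequency 100 */
    uint32_t b = 3 | (100u << NUM_SYMBOL_BITS);  /* symbol 3, frequency 100 */
    uint32_t c = 9 | (2u << NUM_SYMBOL_BITS);    /* symbol 9, frequency 2   */

    assert(c < b && b < a);                 /* frequency first, then symbol */
    assert((a & SYMBOL_MASK) == 7);         /* symbol recovered from the low bits */
    assert((a >> NUM_SYMBOL_BITS) == 100);  /* frequency from the high bits */
    return 0;
}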
@@ -846,75 +846,75 @@ heap_sort(u32 A[], unsigned length) static unsigned sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) { - unsigned sym; - unsigned i; - unsigned num_used_syms; - unsigned num_counters; - unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; - - /* - * We use heapsort, but with an added optimization. Since often most - * symbol frequencies are low, we first do a count sort using a limited - * number of counters. High frequencies are counted in the last - * counter, and only they will be sorted with heapsort. - * - * Note: with more symbols, it is generally beneficial to have more - * counters. About 1 counter per symbol seems fastest. - */ - - num_counters = GET_NUM_COUNTERS(num_syms); - - memset(counters, 0, num_counters * sizeof(counters[0])); - - /* Count the frequencies. */ - for (sym = 0; sym < num_syms; sym++) - counters[MIN(freqs[sym], num_counters - 1)]++; - - /* - * Make the counters cumulative, ignoring the zero-th, which counted - * symbols with zero frequency. As a side effect, this calculates the - * number of symbols with nonzero frequency. - */ - num_used_syms = 0; - for (i = 1; i < num_counters; i++) { - unsigned count = counters[i]; - - counters[i] = num_used_syms; - num_used_syms += count; - } - - /* - * Sort nonzero-frequency symbols using the counters. At the same time, - * set the codeword lengths of zero-frequency symbols to 0. - */ - for (sym = 0; sym < num_syms; sym++) { - u32 freq = freqs[sym]; - - if (freq != 0) { - symout[counters[MIN(freq, num_counters - 1)]++] = - sym | (freq << NUM_SYMBOL_BITS); - } else { - lens[sym] = 0; - } - } - - /* Sort the symbols counted in the last counter. */ - heap_sort(symout + counters[num_counters - 2], - counters[num_counters - 1] - counters[num_counters - 2]); - - return num_used_syms; + unsigned sym; + unsigned i; + unsigned num_used_syms; + unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(DEFLATE_MAX_NUM_SYMS)]; + + /* + * We use heapsort, but with an added optimization. Since often most + * symbol frequencies are low, we first do a count sort using a limited + * number of counters. High frequencies are counted in the last + * counter, and only they will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per symbol seems fastest. + */ + + num_counters = GET_NUM_COUNTERS(num_syms); + + memset(counters, 0, num_counters * sizeof(counters[0])); + + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[MIN(freqs[sym], num_counters - 1)]++; + + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ + num_used_syms = 0; + for (i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ + for (sym = 0; sym < num_syms; sym++) { + u32 freq = freqs[sym]; + + if (freq != 0) { + symout[counters[MIN(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; } /* * Build a Huffman tree. 
* * This is an optimized implementation that - * (a) takes advantage of the frequencies being already sorted; - * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman - * tree are sufficient to generate a canonical code; - * (c) Only stores parent pointers, not child pointers; - * (d) Produces the nodes in the same memory used for input frequency - * information. + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input frequency + * information. * * Array 'A', which contains 'sym_count' entries, is used for both input and * output. For this function, 'sym_count' must be at least 2. @@ -939,59 +939,59 @@ sort_symbols(unsigned num_syms, const u32 freqs[], u8 lens[], u32 symout[]) static void build_tree(u32 A[], unsigned sym_count) { - const unsigned last_idx = sym_count - 1; - - /* Index of the next lowest frequency leaf that still needs a parent */ - unsigned i = 0; - - /* - * Index of the next lowest frequency non-leaf that still needs a - * parent, or 'e' if there is currently no such node - */ - unsigned b = 0; - - /* Index of the next spot for a non-leaf (will overwrite a leaf) */ - unsigned e = 0; - - do { - u32 new_freq; - - /* - * Select the next two lowest frequency nodes among the leaves - * A[i] and non-leaves A[b], and create a new node A[e] to be - * their parent. Set the new node's frequency to the sum of the - * frequencies of its two children. - * - * Usually the next two lowest frequency nodes are of the same - * type (leaf or non-leaf), so check those cases first. - */ - if (i + 1 <= last_idx && - (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { - /* Two leaves */ - new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); - i += 2; - } else if (b + 2 <= e && - (i > last_idx || - (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { - /* Two non-leaves */ - new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); - A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); - A[b + 1] = (e << NUM_SYMBOL_BITS) | - (A[b + 1] & SYMBOL_MASK); - b += 2; - } else { - /* One leaf and one non-leaf */ - new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); - A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); - i++; - b++; - } - A[e] = new_freq | (A[e] & SYMBOL_MASK); - /* - * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the - * tree is complete once we've created 'n - 1' non-leaves. - */ - } while (++e < last_idx); + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ + unsigned i = 0; + + /* + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node + */ + unsigned b = 0; + + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ + unsigned e = 0; + + do { + u32 new_freq; + + /* + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. + * + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. 
+ */ + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); + /* + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. + */ + } while (++e < last_idx); } /* @@ -1000,94 +1000,94 @@ build_tree(u32 A[], unsigned sym_count) * into account the length-limited constraint. * * @A - * The array produced by build_tree(), containing parent index information - * for the non-leaf nodes of the Huffman tree. Each entry in this array is - * a node; a node's parent always has a greater index than that node - * itself. This function will overwrite the parent index information in - * this array, so essentially it will destroy the tree. However, the data - * in the low NUM_SYMBOL_BITS of each entry will be preserved. + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. * * @root_idx - * The 0-based index of the root node in 'A', and consequently one less - * than the number of tree node entries in 'A'. (Or, really 2 less than - * the actual length of 'A'.) + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) * * @len_counts - * An array of length ('max_codeword_len' + 1) in which the number of - * codewords having each length <= max_codeword_len will be returned. + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be returned. * * @max_codeword_len - * The maximum permissible codeword length. + * The maximum permissible codeword length. */ static void compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], - unsigned max_codeword_len) + unsigned max_codeword_len) { - unsigned len; - int node; - - /* - * The key observations are: - * - * (1) We can traverse the non-leaf nodes of the tree, always visiting a - * parent before its children, by simply iterating through the array - * in reverse order. Consequently, we can compute the depth of each - * node in one pass, overwriting the parent indices with depths. - * - * (2) We can initially assume that in the real Huffman tree, both - * children of the root are leaves. This corresponds to two - * codewords of length 1. Then, whenever we visit a (non-leaf) node - * during the traversal, we modify this assumption to account for - * the current node *not* being a leaf, but rather its two children - * being leaves. 
This causes the loss of one codeword for the - * current depth and the addition of two codewords for the current - * depth plus one. - * - * (3) We can handle the length-limited constraint fairly easily by - * simply using the largest length available when a depth exceeds - * max_codeword_len. - */ - - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - len_counts[1] = 2; - - /* Set the root node's depth to 0. */ - A[root_idx] &= SYMBOL_MASK; - - for (node = root_idx - 1; node >= 0; node--) { - - /* Calculate the depth of this node. */ - - unsigned parent = A[node] >> NUM_SYMBOL_BITS; - unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; - unsigned depth = parent_depth + 1; - - /* - * Set the depth of this node so that it is available when its - * children (if any) are processed. - */ - A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); - - /* - * If needed, decrease the length to meet the length-limited - * constraint. This is not the optimal method for generating - * length-limited Huffman codes! But it should be good enough. - */ - if (depth >= max_codeword_len) { - depth = max_codeword_len; - do { - depth--; - } while (len_counts[depth] == 0); - } - - /* - * Account for the fact that we have a non-leaf node at the - * current depth. - */ - len_counts[depth]--; - len_counts[depth + 1] += 2; - } + unsigned len; + int node; + + /* + * The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. + */ + + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ + if (depth >= max_codeword_len) { + depth = max_codeword_len; + do { + depth--; + } while (len_counts[depth] == 0); + } + + /* + * Account for the fact that we have a non-leaf node at the + * current depth. 
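A worked instance of observation (2) above, for the four-symbol frequency set {4, 2, 1, 1}: the Huffman tree has two non-root non-leaves, at depths 1 and 2, and applying the adjustment rule yields the expected codeword lengths 1, 2, 3, 3 (standalone sketch, not from the source):

#include <assert.h>

int main(void)
{
    unsigned len_counts[4 + 1] = { 0 };
    /* Depths of the non-root non-leaves for frequencies {4, 2, 1, 1}. */
    const unsigned depths[] = { 1, 2 };
    unsigned i;

    len_counts[1] = 2;  /* start by assuming both children of the root are leaves */
    for (i = 0; i < 2; i++) {
        len_counts[depths[i]]--;        /* this node turned out to be a non-leaf... */
        len_counts[depths[i] + 1] += 2; /* ...so its two children are leaves instead */
    }

    assert(len_counts[1] == 1);  /* the frequency-4 symbol gets a 1-bit codeword */
    assert(len_counts[2] == 1);  /* the frequency-2 symbol gets 2 bits */
    assert(len_counts[3] == 2);  /* the two frequency-1 symbols get 3 bits each */
    return 0;
}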
+ */ + len_counts[depth]--; + len_counts[depth + 1] += 2; + } } /* @@ -1103,51 +1103,51 @@ compute_length_counts(u32 A[], unsigned root_idx, unsigned len_counts[], #ifdef rbit32 static forceinline u32 reverse_codeword(u32 codeword, u8 len) { - return rbit32(codeword) >> ((32 - len) & 31); + return rbit32(codeword) >> ((32 - len) & 31); } #else /* Generated by scripts/gen_bitreverse_tab.py */ static const u8 bitreverse_tab[256] = { - 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, - 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, - 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, - 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, - 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, - 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, - 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, - 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, - 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, - 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, - 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, - 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, - 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, - 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, - 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, - 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, - 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, - 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, - 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, - 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, - 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, - 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, - 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, - 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, - 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, - 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, - 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, - 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, - 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, - 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, - 0x0f, 0x8f, 0x4f, 0xcf, 0x2f, 0xaf, 0x6f, 0xef, - 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, + 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0, + 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0, + 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8, + 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8, + 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4, + 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4, + 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec, + 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc, + 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2, + 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2, + 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea, + 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa, + 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6, + 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6, + 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee, + 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe, + 0x01, 0x81, 0x41, 0xc1, 0x21, 0xa1, 0x61, 0xe1, + 0x11, 0x91, 0x51, 0xd1, 0x31, 0xb1, 0x71, 0xf1, + 0x09, 0x89, 0x49, 0xc9, 0x29, 0xa9, 0x69, 0xe9, + 0x19, 0x99, 0x59, 0xd9, 0x39, 0xb9, 0x79, 0xf9, + 0x05, 0x85, 0x45, 0xc5, 0x25, 0xa5, 0x65, 0xe5, + 0x15, 0x95, 0x55, 0xd5, 0x35, 0xb5, 0x75, 0xf5, + 0x0d, 0x8d, 0x4d, 0xcd, 0x2d, 0xad, 0x6d, 0xed, + 0x1d, 0x9d, 0x5d, 0xdd, 0x3d, 0xbd, 0x7d, 0xfd, + 0x03, 0x83, 0x43, 0xc3, 0x23, 0xa3, 0x63, 0xe3, + 0x13, 0x93, 0x53, 0xd3, 0x33, 0xb3, 0x73, 0xf3, + 0x0b, 0x8b, 0x4b, 0xcb, 0x2b, 0xab, 0x6b, 0xeb, + 0x1b, 0x9b, 0x5b, 0xdb, 0x3b, 0xbb, 0x7b, 0xfb, + 0x07, 0x87, 0x47, 0xc7, 0x27, 0xa7, 0x67, 0xe7, + 0x17, 0x97, 0x57, 0xd7, 0x37, 0xb7, 0x77, 0xf7, + 0x0f, 0x8f, 0x4f, 
0xcf, 0x2f, 0xaf, 0x6f, 0xef, + 0x1f, 0x9f, 0x5f, 0xdf, 0x3f, 0xbf, 0x7f, 0xff, }; static forceinline u32 reverse_codeword(u32 codeword, u8 len) { - STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); - codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | - bitreverse_tab[codeword >> 8]; - return codeword >> (16 - len); + STATIC_ASSERT(DEFLATE_MAX_CODEWORD_LEN <= 16); + codeword = ((u32)bitreverse_tab[codeword & 0xff] << 8) | + bitreverse_tab[codeword >> 8]; + return codeword >> (16 - len); } #endif /* !rbit32 */ @@ -1155,98 +1155,98 @@ static forceinline u32 reverse_codeword(u32 codeword, u8 len) * Generate the codewords for a canonical Huffman code. * * @A - * The output array for codewords. In addition, initially this - * array must contain the symbols, sorted primarily by frequency and - * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of - * each entry. + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. * * @len - * Output array for codeword lengths. + * Output array for codeword lengths. * * @len_counts - * An array that provides the number of codewords that will have - * each possible length <= max_codeword_len. + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. * * @max_codeword_len - * Maximum length, in bits, of each codeword. + * Maximum length, in bits, of each codeword. * * @num_syms - * Number of symbols in the alphabet, including symbols with zero - * frequency. This is the length of the 'A' and 'len' arrays. + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. */ static void gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], - unsigned max_codeword_len, unsigned num_syms) + unsigned max_codeword_len, unsigned num_syms) { - u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned i; - unsigned len; - unsigned sym; - - /* - * Given the number of codewords that will have each length, assign - * codeword lengths to symbols. We do this by assigning the lengths in - * decreasing order to the symbols sorted primarily by increasing - * frequency and secondarily by increasing symbol value. - */ - for (i = 0, len = max_codeword_len; len >= 1; len--) { - unsigned count = len_counts[len]; - - while (count--) - lens[A[i++] & SYMBOL_MASK] = len; - } - - /* - * Generate the codewords themselves. We initialize the - * 'next_codewords' array to provide the lexicographically first - * codeword of each length, then assign codewords in symbol order. This - * produces a canonical code. - */ - next_codewords[0] = 0; - next_codewords[1] = 0; - for (len = 2; len <= max_codeword_len; len++) - next_codewords[len] = - (next_codewords[len - 1] + len_counts[len - 1]) << 1; - - for (sym = 0; sym < num_syms; sym++) { - /* DEFLATE requires bit-reversed codewords. */ - A[sym] = reverse_codeword(next_codewords[lens[sym]]++, - lens[sym]); - } + u32 next_codewords[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. 
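Since DEFLATE writes Huffman codewords LSB-first, the canonical codewords must be bit-reversed before use; the table-based and rbit32 variants above are fast ways of doing what this naive standalone sketch does:

#include <assert.h>
#include <stdint.h>

/* Naive LSB-first reversal of a 'len'-bit codeword (illustrative only). */
static uint32_t reverse_codeword_slow(uint32_t codeword, unsigned len)
{
    uint32_t reversed = 0;
    unsigned i;

    for (i = 0; i < len; i++) {
        reversed = (reversed << 1) | (codeword & 1);
        codeword >>= 1;
    }
    return reversed;
}

int main(void)
{
    assert(reverse_codeword_slow(0x1, 3) == 0x4);  /* 001 -> 100 */
    assert(reverse_codeword_slow(0x6, 4) == 0x6);  /* 0110 is a palindrome */
    return 0;
}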
+ */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* + * Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. + */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (sym = 0; sym < num_syms; sym++) { + /* DEFLATE requires bit-reversed codewords. */ + A[sym] = reverse_codeword(next_codewords[lens[sym]]++, + lens[sym]); + } } /* * --------------------------------------------------------------------- - * deflate_make_huffman_code() + * deflate_make_huffman_code() * --------------------------------------------------------------------- * * Given an alphabet and the frequency of each symbol in it, construct a * length-limited canonical Huffman code. * * @num_syms - * The number of symbols in the alphabet. The symbols are the integers in - * the range [0, num_syms - 1]. This parameter must be at least 2 and - * must not exceed (1 << NUM_SYMBOL_BITS). + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * must not exceed (1 << NUM_SYMBOL_BITS). * * @max_codeword_len - * The maximum permissible codeword length. + * The maximum permissible codeword length. * * @freqs - * An array of length @num_syms that gives the frequency of each symbol. - * It is valid for some, none, or all of the frequencies to be 0. The sum - * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. * * @lens - * An array of @num_syms entries in which this function will return the - * length, in bits, of the codeword assigned to each symbol. Symbols with - * 0 frequency will not have codewords per se, but their entries in this - * array will be set to 0. No lengths greater than @max_codeword_len will - * be assigned. + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. * * @codewords - * An array of @num_syms entries in which this function will return the - * codeword for each symbol, right-justified and padded on the left with - * zeroes. Codewords for symbols with 0 frequency will be undefined. + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. * * --------------------------------------------------------------------- * @@ -1300,13 +1300,13 @@ gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], * with depth information as part of the process of extracting codeword lengths * from the tree. 
So in summary, we do NOT need a big structure like: * - * struct huffman_tree_node { - * unsigned int symbol; - * unsigned int frequency; - * unsigned int depth; - * struct huffman_tree_node *left_child; - * struct huffman_tree_node *right_child; - * }; + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; * * * ... which often gets used in "naive" implementations of Huffman code @@ -1317,82 +1317,82 @@ gen_codewords(u32 A[], u8 lens[], const unsigned len_counts[], */ static void deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, - const u32 freqs[], u8 lens[], u32 codewords[]) + const u32 freqs[], u8 lens[], u32 codewords[]) { - u32 *A = codewords; - unsigned num_used_syms; - - STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); - STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); - - /* - * We begin by sorting the symbols primarily by frequency and - * secondarily by symbol value. As an optimization, the array used for - * this purpose ('A') shares storage with the space in which we will - * eventually return the codewords. - */ - num_used_syms = sort_symbols(num_syms, freqs, lens, A); - /* - * 'num_used_syms' is the number of symbols with nonzero frequency. - * This may be less than @num_syms. 'num_used_syms' is also the number - * of entries in 'A' that are valid. Each entry consists of a distinct - * symbol and a nonzero frequency packed into a 32-bit integer. - */ - - /* - * A complete Huffman code must contain at least 2 codewords. Yet, it's - * possible that fewer than 2 symbols were used. When this happens, - * it's usually for the offset code (0-1 symbols used). But it's also - * theoretically possible for the litlen and pre codes (1 symbol used). - * - * The DEFLATE RFC explicitly allows the offset code to contain just 1 - * codeword, or even be completely empty. But it's silent about the - * other codes. It also doesn't say whether, in the 1-codeword case, - * the codeword (which it says must be 1 bit) is '0' or '1'. - * - * In any case, some DEFLATE decompressors reject these cases. zlib - * generally allows them, but it does reject precodes that have just 1 - * codeword. More problematically, zlib v1.2.1 and earlier rejected - * empty offset codes, and this behavior can also be seen in Windows - * Explorer's ZIP unpacker (supposedly even still in Windows 11). - * - * Other DEFLATE compressors, including zlib, always send at least 2 - * codewords in order to make a complete Huffman code. Therefore, this - * is a case where practice does not entirely match the specification. - * We follow practice by generating 2 codewords of length 1: codeword - * '0' for symbol 0, and codeword '1' for another symbol -- the used - * symbol if it exists and is not symbol 0, otherwise symbol 1. This - * does worsen the compression ratio by having to send an unnecessary - * offset codeword length. But this only affects rare cases such as - * blocks containing all literals, and it only makes a tiny difference. - */ - if (unlikely(num_used_syms < 2)) { - unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; - unsigned nonzero_idx = sym ? sym : 1; - - codewords[0] = 0; - lens[0] = 1; - codewords[nonzero_idx] = 1; - lens[nonzero_idx] = 1; - return; - } - - /* - * Build a stripped-down version of the Huffman tree, sharing the array - * 'A' with the symbol values. 
Then extract length counts from the tree - * and use them to generate the final codewords. - */ - - build_tree(A, num_used_syms); - - { - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - - compute_length_counts(A, num_used_syms - 2, - len_counts, max_codeword_len); - - gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); - } + u32 *A = codewords; + unsigned num_used_syms; + + STATIC_ASSERT(DEFLATE_MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + STATIC_ASSERT(MAX_BLOCK_LENGTH <= ((u32)1 << NUM_FREQ_BITS) - 1); + + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ + + /* + * A complete Huffman code must contain at least 2 codewords. Yet, it's + * possible that fewer than 2 symbols were used. When this happens, + * it's usually for the offset code (0-1 symbols used). But it's also + * theoretically possible for the litlen and pre codes (1 symbol used). + * + * The DEFLATE RFC explicitly allows the offset code to contain just 1 + * codeword, or even be completely empty. But it's silent about the + * other codes. It also doesn't say whether, in the 1-codeword case, + * the codeword (which it says must be 1 bit) is '0' or '1'. + * + * In any case, some DEFLATE decompressors reject these cases. zlib + * generally allows them, but it does reject precodes that have just 1 + * codeword. More problematically, zlib v1.2.1 and earlier rejected + * empty offset codes, and this behavior can also be seen in Windows + * Explorer's ZIP unpacker (supposedly even still in Windows 11). + * + * Other DEFLATE compressors, including zlib, always send at least 2 + * codewords in order to make a complete Huffman code. Therefore, this + * is a case where practice does not entirely match the specification. + * We follow practice by generating 2 codewords of length 1: codeword + * '0' for symbol 0, and codeword '1' for another symbol -- the used + * symbol if it exists and is not symbol 0, otherwise symbol 1. This + * does worsen the compression ratio by having to send an unnecessary + * offset codeword length. But this only affects rare cases such as + * blocks containing all literals, and it only makes a tiny difference. + */ + if (unlikely(num_used_syms < 2)) { + unsigned sym = num_used_syms ? (A[0] & SYMBOL_MASK) : 0; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. 
+ */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } } /* @@ -1402,7 +1402,7 @@ deflate_make_huffman_code(unsigned num_syms, unsigned max_codeword_len, static void deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) { - memset(&c->freqs, 0, sizeof(c->freqs)); + memset(&c->freqs, 0, sizeof(c->freqs)); } /* @@ -1413,145 +1413,145 @@ deflate_reset_symbol_frequencies(struct libdeflate_compressor *c) */ static void deflate_make_huffman_codes(const struct deflate_freqs *freqs, - struct deflate_codes *codes) + struct deflate_codes *codes) { - deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, - MAX_LITLEN_CODEWORD_LEN, - freqs->litlen, - codes->lens.litlen, - codes->codewords.litlen); - - deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, - MAX_OFFSET_CODEWORD_LEN, - freqs->offset, - codes->lens.offset, - codes->codewords.offset); + deflate_make_huffman_code(DEFLATE_NUM_LITLEN_SYMS, + MAX_LITLEN_CODEWORD_LEN, + freqs->litlen, + codes->lens.litlen, + codes->codewords.litlen); + + deflate_make_huffman_code(DEFLATE_NUM_OFFSET_SYMS, + MAX_OFFSET_CODEWORD_LEN, + freqs->offset, + codes->lens.offset, + codes->codewords.offset); } /* Initialize c->static_codes. */ static void deflate_init_static_codes(struct libdeflate_compressor *c) { - unsigned i; - - for (i = 0; i < 144; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - for (; i < 256; i++) - c->freqs.litlen[i] = 1 << (9 - 9); - for (; i < 280; i++) - c->freqs.litlen[i] = 1 << (9 - 7); - for (; i < 288; i++) - c->freqs.litlen[i] = 1 << (9 - 8); - - for (i = 0; i < 32; i++) - c->freqs.offset[i] = 1 << (5 - 5); - - deflate_make_huffman_codes(&c->freqs, &c->static_codes); + unsigned i; + + for (i = 0; i < 144; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + for (; i < 256; i++) + c->freqs.litlen[i] = 1 << (9 - 9); + for (; i < 280; i++) + c->freqs.litlen[i] = 1 << (9 - 7); + for (; i < 288; i++) + c->freqs.litlen[i] = 1 << (9 - 8); + + for (i = 0; i < 32; i++) + c->freqs.offset[i] = 1 << (5 - 5); + + deflate_make_huffman_codes(&c->freqs, &c->static_codes); } /* Return the offset slot for the given match offset, using the small map. */ static forceinline unsigned deflate_get_offset_slot(u32 offset) { - /* - * 1 <= offset <= 32768 here. For 1 <= offset <= 256, - * deflate_offset_slot[offset - 1] gives the slot. - * - * For 257 <= offset <= 32768, we take advantage of the fact that 257 is - * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == - * 128 times larger than each slot [2..16) (since the number of extra - * bits increases by 1 every 2 slots). Thus, the slot is: - * - * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) - * == deflate_offset_slot[((offset - 1) >> 7)] + 14 - * - * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: - * - * deflate_offset_slot[(offset - 1) >> n] + (n << 1) - * - * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with - * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. - */ - unsigned n = (256 - offset) >> 29; - - return deflate_offset_slot[(offset - 1) >> n] + (n << 1); + /* + * 1 <= offset <= 32768 here. For 1 <= offset <= 256, + * deflate_offset_slot[offset - 1] gives the slot. 
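/*
 * Standalone sanity check, for illustration, of the branchless selector
 * described above: with 32-bit unsigned arithmetic, (256 - offset) >> 29 is 0
 * for offset <= 256 and 7 for every DEFLATE offset in 257..32768, so it can
 * stand in for '(offset <= 256) ? 0 : 7'.
 */
#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint32_t offset;

	for (offset = 1; offset <= 32768; offset++) {
		unsigned branchy = (offset <= 256) ? 0 : 7;
		unsigned branchless = (uint32_t)(256 - offset) >> 29;

		/*
		 * For offset <= 256 the subtraction stays small and
		 * nonnegative, so the shift yields 0.  For larger offsets it
		 * wraps to at least 0xFFFF8100, whose top three bits are all
		 * ones, so the shift yields 7.
		 */
		assert(branchy == branchless);
	}
	return 0;
}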
+ * + * For 257 <= offset <= 32768, we take advantage of the fact that 257 is + * the beginning of slot 16, and each slot [16..30) is exactly 1 << 7 == + * 128 times larger than each slot [2..16) (since the number of extra + * bits increases by 1 every 2 slots). Thus, the slot is: + * + * deflate_offset_slot[2 + ((offset - 257) >> 7)] + (16 - 2) + * == deflate_offset_slot[((offset - 1) >> 7)] + 14 + * + * Define 'n = (offset <= 256) ? 0 : 7'. Then any offset is handled by: + * + * deflate_offset_slot[(offset - 1) >> n] + (n << 1) + * + * For better performance, replace 'n = (offset <= 256) ? 0 : 7' with + * the equivalent (for offset <= 536871168) 'n = (256 - offset) >> 29'. + */ + unsigned n = (256 - offset) >> 29; + + return deflate_offset_slot[(offset - 1) >> n] + (n << 1); } static unsigned deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, - u32 precode_freqs[], unsigned precode_items[]) + u32 precode_freqs[], unsigned precode_items[]) { - unsigned *itemptr; - unsigned run_start; - unsigned run_end; - unsigned extra_bits; - u8 len; - - memset(precode_freqs, 0, - DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); - - itemptr = precode_items; - run_start = 0; - do { - /* Find the next run of codeword lengths. */ - - /* len = the length being repeated */ - len = lens[run_start]; - - /* Extend the run. */ - run_end = run_start; - do { - run_end++; - } while (run_end != num_lens && len == lens[run_end]); - - if (len == 0) { - /* Run of zeroes. */ - - /* Symbol 18: RLE 11 to 138 zeroes at a time. */ - while ((run_end - run_start) >= 11) { - extra_bits = MIN((run_end - run_start) - 11, - 0x7F); - precode_freqs[18]++; - *itemptr++ = 18 | (extra_bits << 5); - run_start += 11 + extra_bits; - } - - /* Symbol 17: RLE 3 to 10 zeroes at a time. */ - if ((run_end - run_start) >= 3) { - extra_bits = MIN((run_end - run_start) - 3, - 0x7); - precode_freqs[17]++; - *itemptr++ = 17 | (extra_bits << 5); - run_start += 3 + extra_bits; - } - } else { - - /* A run of nonzero lengths. */ - - /* Symbol 16: RLE 3 to 6 of the previous length. */ - if ((run_end - run_start) >= 4) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - do { - extra_bits = MIN((run_end - run_start) - - 3, 0x3); - precode_freqs[16]++; - *itemptr++ = 16 | (extra_bits << 5); - run_start += 3 + extra_bits; - } while ((run_end - run_start) >= 3); - } - } - - /* Output any remaining lengths without RLE. */ - while (run_start != run_end) { - precode_freqs[len]++; - *itemptr++ = len; - run_start++; - } - } while (run_start != num_lens); - - return itemptr - precode_items; + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + u8 len; + + memset(precode_freqs, 0, + DEFLATE_NUM_PRECODE_SYMS * sizeof(precode_freqs[0])); + + itemptr = precode_items; + run_start = 0; + do { + /* Find the next run of codeword lengths. */ + + /* len = the length being repeated */ + len = lens[run_start]; + + /* Extend the run. */ + run_end = run_start; + do { + run_end++; + } while (run_end != num_lens && len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 11 to 138 zeroes at a time. */ + while ((run_end - run_start) >= 11) { + extra_bits = MIN((run_end - run_start) - 11, + 0x7F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 11 + extra_bits; + } + + /* Symbol 17: RLE 3 to 10 zeroes at a time. 
*/ + if ((run_end - run_start) >= 3) { + extra_bits = MIN((run_end - run_start) - 3, + 0x7); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 3 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 16: RLE 3 to 6 of the previous length. */ + if ((run_end - run_start) >= 4) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + do { + extra_bits = MIN((run_end - run_start) - + 3, 0x3); + precode_freqs[16]++; + *itemptr++ = 16 | (extra_bits << 5); + run_start += 3 + extra_bits; + } while ((run_end - run_start) >= 3); + } + } + + /* Output any remaining lengths without RLE. */ + while (run_start != run_end) { + precode_freqs[len]++; + *itemptr++ = len; + run_start++; + } + } while (run_start != num_lens); + + return itemptr - precode_items; } /* @@ -1568,64 +1568,64 @@ deflate_compute_precode_items(const u8 lens[], const unsigned num_lens, static void deflate_precompute_huffman_header(struct libdeflate_compressor *c) { - /* Compute how many litlen and offset symbols are needed. */ - - for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; - c->o.precode.num_litlen_syms > 257; - c->o.precode.num_litlen_syms--) - if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) - break; - - for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; - c->o.precode.num_offset_syms > 1; - c->o.precode.num_offset_syms--) - if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) - break; - - /* - * If we're not using the full set of literal/length codeword lengths, - * then temporarily move the offset codeword lengths over so that the - * literal/length and offset codeword lengths are contiguous. - */ - STATIC_ASSERT(offsetof(struct deflate_lens, offset) == - DEFLATE_NUM_LITLEN_SYMS); - if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, - (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - c->o.precode.num_offset_syms); - } - - /* - * Compute the "items" (RLE / literal tokens and extra bits) with which - * the codeword lengths in the larger code will be output. - */ - c->o.precode.num_items = - deflate_compute_precode_items((u8 *)&c->codes.lens, - c->o.precode.num_litlen_syms + - c->o.precode.num_offset_syms, - c->o.precode.freqs, - c->o.precode.items); - - /* Build the precode. */ - deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, - MAX_PRE_CODEWORD_LEN, - c->o.precode.freqs, c->o.precode.lens, - c->o.precode.codewords); - - /* Count how many precode lengths we actually need to output. */ - for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; - c->o.precode.num_explicit_lens > 4; - c->o.precode.num_explicit_lens--) - if (c->o.precode.lens[deflate_precode_lens_permutation[ - c->o.precode.num_explicit_lens - 1]] != 0) - break; - - /* Restore the offset codeword lengths if needed. */ - if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { - memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, - (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, - c->o.precode.num_offset_syms); - } + /* Compute how many litlen and offset symbols are needed. 
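/*
 * Illustrative decoder, not used by the compressor, for the packed precode
 * "items" built above: the low 5 bits hold the precode symbol and the
 * remaining bits hold the extra-bits value.  Per RFC 1951, symbol 16 repeats
 * the previous length 3-6 times (2 extra bits), 17 emits 3-10 zero lengths
 * (3 extra bits), and 18 emits 11-138 zero lengths (7 extra bits).
 */
#include <assert.h>
#include <stdio.h>

/* Expand one packed item; returns how many codeword lengths it represents. */
static unsigned demo_expand_item(unsigned item, unsigned prev_len,
				 unsigned *len_out)
{
	unsigned sym = item & 0x1F;
	unsigned extra = item >> 5;

	if (sym == 16) {	/* repeat the previous length 3-6 times */
		*len_out = prev_len;
		return 3 + extra;
	}
	if (sym == 17) {	/* repeat zero 3-10 times */
		*len_out = 0;
		return 3 + extra;
	}
	if (sym == 18) {	/* repeat zero 11-138 times */
		*len_out = 0;
		return 11 + extra;
	}
	*len_out = sym;		/* an explicit length 0-15 */
	return 1;
}

int main(void)
{
	unsigned len, count;

	/* '18 | (20 << 5)' stands for a run of 11 + 20 = 31 zero lengths. */
	count = demo_expand_item(18 | (20 << 5), 0, &len);
	assert(count == 31 && len == 0);

	/* '16 | (1 << 5)' repeats the previous length 3 + 1 = 4 times. */
	count = demo_expand_item(16 | (1 << 5), 8, &len);
	assert(count == 4 && len == 8);

	printf("precode item checks passed\n");
	return 0;
}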
*/ + + for (c->o.precode.num_litlen_syms = DEFLATE_NUM_LITLEN_SYMS; + c->o.precode.num_litlen_syms > 257; + c->o.precode.num_litlen_syms--) + if (c->codes.lens.litlen[c->o.precode.num_litlen_syms - 1] != 0) + break; + + for (c->o.precode.num_offset_syms = DEFLATE_NUM_OFFSET_SYMS; + c->o.precode.num_offset_syms > 1; + c->o.precode.num_offset_syms--) + if (c->codes.lens.offset[c->o.precode.num_offset_syms - 1] != 0) + break; + + /* + * If we're not using the full set of literal/length codeword lengths, + * then temporarily move the offset codeword lengths over so that the + * literal/length and offset codeword lengths are contiguous. + */ + STATIC_ASSERT(offsetof(struct deflate_lens, offset) == + DEFLATE_NUM_LITLEN_SYMS); + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove((u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + (u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + c->o.precode.num_offset_syms); + } + + /* + * Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. + */ + c->o.precode.num_items = + deflate_compute_precode_items((u8 *)&c->codes.lens, + c->o.precode.num_litlen_syms + + c->o.precode.num_offset_syms, + c->o.precode.freqs, + c->o.precode.items); + + /* Build the precode. */ + deflate_make_huffman_code(DEFLATE_NUM_PRECODE_SYMS, + MAX_PRE_CODEWORD_LEN, + c->o.precode.freqs, c->o.precode.lens, + c->o.precode.codewords); + + /* Count how many precode lengths we actually need to output. */ + for (c->o.precode.num_explicit_lens = DEFLATE_NUM_PRECODE_SYMS; + c->o.precode.num_explicit_lens > 4; + c->o.precode.num_explicit_lens--) + if (c->o.precode.lens[deflate_precode_lens_permutation[ + c->o.precode.num_explicit_lens - 1]] != 0) + break; + + /* Restore the offset codeword lengths if needed. */ + if (c->o.precode.num_litlen_syms != DEFLATE_NUM_LITLEN_SYMS) { + memmove((u8 *)&c->codes.lens + DEFLATE_NUM_LITLEN_SYMS, + (u8 *)&c->codes.lens + c->o.precode.num_litlen_syms, + c->o.precode.num_offset_syms); + } } /* @@ -1635,60 +1635,60 @@ deflate_precompute_huffman_header(struct libdeflate_compressor *c) */ static void deflate_compute_full_len_codewords(struct libdeflate_compressor *c, - const struct deflate_codes *codes) + const struct deflate_codes *codes) { - unsigned len; - - STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + - DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); - - for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { - unsigned slot = deflate_length_slot[len]; - unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; - u32 extra_bits = len - deflate_length_slot_base[slot]; - - c->o.length.codewords[len] = - codes->codewords.litlen[litlen_sym] | - (extra_bits << codes->lens.litlen[litlen_sym]); - c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + - deflate_extra_length_bits[slot]; - } + unsigned len; + + STATIC_ASSERT(MAX_LITLEN_CODEWORD_LEN + + DEFLATE_MAX_EXTRA_LENGTH_BITS <= 32); + + for (len = DEFLATE_MIN_MATCH_LEN; len <= DEFLATE_MAX_MATCH_LEN; len++) { + unsigned slot = deflate_length_slot[len]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + slot; + u32 extra_bits = len - deflate_length_slot_base[slot]; + + c->o.length.codewords[len] = + codes->codewords.litlen[litlen_sym] | + (extra_bits << codes->lens.litlen[litlen_sym]); + c->o.length.lens[len] = codes->lens.litlen[litlen_sym] + + deflate_extra_length_bits[slot]; + } } /* Write a match to the output buffer. 
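/*
 * A minimal sketch, for illustration, of the fixed-width dynamic-block header
 * fields that the counts computed above feed into (RFC 1951): HLIT =
 * num_litlen_syms - 257 in 5 bits, HDIST = num_offset_syms - 1 in 5 bits,
 * HCLEN = num_explicit_lens - 4 in 4 bits, followed by 3 bits per explicit
 * precode length in the order 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3,
 * 13, 2, 14, 1, 15.  The counts below are hypothetical example values.
 */
#include <assert.h>
#include <stdio.h>

/* RFC 1951 transmission order for the precode codeword lengths. */
static const unsigned char demo_precode_order[19] = {
	16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15,
};

int main(void)
{
	unsigned num_litlen_syms = 280;		/* 257..286 per RFC 1951 */
	unsigned num_offset_syms = 24;		/* 1..32 */
	unsigned num_explicit_lens = 15;	/* 4..19 */

	unsigned hlit = num_litlen_syms - 257;	/* 5-bit field */
	unsigned hdist = num_offset_syms - 1;	/* 5-bit field */
	unsigned hclen = num_explicit_lens - 4;	/* 4-bit field */
	unsigned header_bits = 5 + 5 + 4 + 3 * num_explicit_lens;

	assert(hlit < 32 && hdist < 32 && hclen < 16);
	printf("HLIT=%u HDIST=%u HCLEN=%u; first precode length sent is for "
	       "symbol %u; %u header bits before the precoded lengths\n",
	       hlit, hdist, hclen, demo_precode_order[0], header_bits);
	return 0;
}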
*/ -#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ -do { \ - const struct libdeflate_compressor *c__ = (c_); \ - const struct deflate_codes *codes__ = (codes_); \ - unsigned length__ = (length_); \ - unsigned offset__ = (offset_); \ - unsigned offset_slot__ = (offset_slot_); \ - \ - /* Litlen symbol and extra length bits */ \ - STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ - ADD_BITS(c__->o.length.codewords[length__], \ - c__->o.length.lens[length__]); \ - \ - if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS + \ - MAX_OFFSET_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ - FLUSH_BITS(); \ - \ - /* Offset symbol */ \ - ADD_BITS(codes__->codewords.offset[offset_slot__], \ - codes__->lens.offset[offset_slot__]); \ - \ - if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ - FLUSH_BITS(); \ - \ - /* Extra offset bits */ \ - ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ - deflate_extra_offset_bits[offset_slot__]); \ - \ - FLUSH_BITS(); \ +#define WRITE_MATCH(c_, codes_, length_, offset_, offset_slot_) \ +do { \ +const struct libdeflate_compressor *c__ = (c_); \ +const struct deflate_codes *codes__ = (codes_); \ +unsigned length__ = (length_); \ +unsigned offset__ = (offset_); \ +unsigned offset_slot__ = (offset_slot_); \ +\ +/* Litlen symbol and extra length bits */ \ +STATIC_ASSERT(CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS)); \ +ADD_BITS(c__->o.length.codewords[length__], \ +c__->o.length.lens[length__]); \ +\ +if (!CAN_BUFFER(MAX_LITLEN_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS + \ +MAX_OFFSET_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ +FLUSH_BITS(); \ +\ +/* Offset symbol */ \ +ADD_BITS(codes__->codewords.offset[offset_slot__], \ +codes__->lens.offset[offset_slot__]); \ +\ +if (!CAN_BUFFER(MAX_OFFSET_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS)) \ +FLUSH_BITS(); \ +\ +/* Extra offset bits */ \ +ADD_BITS(offset__ - deflate_offset_slot_base[offset_slot__], \ +deflate_extra_offset_bits[offset_slot__]); \ +\ +FLUSH_BITS(); \ } while (0) /* @@ -1703,349 +1703,349 @@ do { \ */ static void deflate_flush_block(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 *block_begin, u32 block_length, - const struct deflate_sequence *sequences, - bool is_final_block) + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) { - /* - * It is hard to get compilers to understand that writes to 'os->next' - * don't alias 'os'. That hurts performance significantly, as - * everything in 'os' would keep getting re-loaded. ('restrict' - * *should* do the trick, but it's unreliable.) Therefore, we keep all - * the output bitstream state in local variables, and output bits using - * macros. This is similar to what the decompressor does. - */ - const u8 *in_next = block_begin; - const u8 * const in_end = block_begin + block_length; - bitbuf_t bitbuf = os->bitbuf; - unsigned bitcount = os->bitcount; - u8 *out_next = os->next; - u8 * const out_fast_end = - os->end - MIN(WORDBYTES - 1, os->end - out_next); - /* - * The cost for each block type, in bits. Start with the cost of the - * block header which is 3 bits. 
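/*
 * A reference mapping, for illustration only, from a match to the symbols and
 * extra bits that WRITE_MATCH emits, using the base/extra-bit tables from
 * RFC 1951 rather than this file's lookup tables: a length in 3..258 selects
 * a litlen symbol in 257..285 plus 0-5 extra bits, and an offset in 1..32768
 * selects one of 30 offset slots plus 0-13 extra bits.
 */
#include <assert.h>
#include <stdio.h>

/* RFC 1951 base values and extra-bit counts for length symbols 257..285. */
static const unsigned demo_len_base[29] = {
	3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, 35, 43, 51,
	59, 67, 83, 99, 115, 131, 163, 195, 227, 258,
};
static const unsigned demo_len_extra[29] = {
	0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
	4, 5, 5, 5, 5, 0,
};

/* RFC 1951 base values and extra-bit counts for offset slots 0..29. */
static const unsigned demo_off_base[30] = {
	1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, 257, 385,
	513, 769, 1025, 1537, 2049, 3073, 4097, 6145, 8193, 12289, 16385,
	24577,
};
static const unsigned demo_off_extra[30] = {
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
	10, 11, 11, 12, 12, 13, 13,
};

/* Return the largest index whose base value does not exceed 'value'. */
static unsigned demo_slot(const unsigned *base, unsigned n, unsigned value)
{
	unsigned i = 0;

	while (i + 1 < n && base[i + 1] <= value)
		i++;
	return i;
}

int main(void)
{
	unsigned length = 18, offset = 100;
	unsigned lslot = demo_slot(demo_len_base, 29, length);
	unsigned oslot = demo_slot(demo_off_base, 30, offset);

	/* length 18 -> litlen symbol 268, 1 extra bit with value 1 */
	assert(257 + lslot == 268);
	assert(demo_len_extra[lslot] == 1 &&
	       length - demo_len_base[lslot] == 1);

	/* offset 100 -> offset slot 13, 5 extra bits with value 3 */
	assert(oslot == 13);
	assert(demo_off_extra[oslot] == 5 &&
	       offset - demo_off_base[oslot] == 3);

	printf("match (len=%u, off=%u): litlen sym %u + %u extra bits, "
	       "offset slot %u + %u extra bits\n",
	       length, offset, 257 + lslot, demo_len_extra[lslot],
	       oslot, demo_off_extra[oslot]);
	return 0;
}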
- */ - u32 dynamic_cost = 3; - u32 static_cost = 3; - u32 uncompressed_cost = 3; - u32 best_cost; - struct deflate_codes *codes; - unsigned sym; - - ASSERT(block_length >= MIN_BLOCK_LENGTH || - (is_final_block && block_length > 0)); - ASSERT(block_length <= MAX_BLOCK_LENGTH); - ASSERT(bitcount <= 7); - ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); - ASSERT(out_next <= os->end); - ASSERT(!os->overflow); - - /* Precompute the precode items and build the precode. */ - deflate_precompute_huffman_header(c); - - /* Account for the cost of encoding dynamic Huffman codes. */ - dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); - for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { - u32 extra = deflate_extra_precode_bits[sym]; - - dynamic_cost += c->o.precode.freqs[sym] * - (extra + c->o.precode.lens[sym]); - } - - /* Account for the cost of encoding literals. */ - for (sym = 0; sym < 144; sym++) { - dynamic_cost += c->freqs.litlen[sym] * - c->codes.lens.litlen[sym]; - static_cost += c->freqs.litlen[sym] * 8; - } - for (; sym < 256; sym++) { - dynamic_cost += c->freqs.litlen[sym] * - c->codes.lens.litlen[sym]; - static_cost += c->freqs.litlen[sym] * 9; - } - - /* Account for the cost of encoding the end-of-block symbol. */ - dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; - static_cost += 7; - - /* Account for the cost of encoding lengths. */ - for (sym = DEFLATE_FIRST_LEN_SYM; - sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); - sym++) { - u32 extra = deflate_extra_length_bits[ - sym - DEFLATE_FIRST_LEN_SYM]; - - dynamic_cost += c->freqs.litlen[sym] * - (extra + c->codes.lens.litlen[sym]); - static_cost += c->freqs.litlen[sym] * - (extra + c->static_codes.lens.litlen[sym]); - } - - /* Account for the cost of encoding offsets. */ - for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { - u32 extra = deflate_extra_offset_bits[sym]; - - dynamic_cost += c->freqs.offset[sym] * - (extra + c->codes.lens.offset[sym]); - static_cost += c->freqs.offset[sym] * (extra + 5); - } - - /* Compute the cost of using uncompressed blocks. */ - uncompressed_cost += (-(bitcount + 3) & 7) + 32 + - (40 * (DIV_ROUND_UP(block_length, - UINT16_MAX) - 1)) + - (8 * block_length); - - /* - * Choose and output the cheapest type of block. If there is a tie, - * prefer uncompressed, then static, then dynamic. - */ - - best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); - - /* If the block isn't going to fit, then stop early. */ - if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { - os->overflow = true; - return; - } - /* - * Else, now we know that the block fits, so no further bounds checks on - * the output buffer are required until the next block. - */ - - if (best_cost == uncompressed_cost) { - /* - * Uncompressed block(s). DEFLATE limits the length of - * uncompressed blocks to UINT16_MAX bytes, so if the length of - * the "block" we're flushing is over UINT16_MAX, we actually - * output multiple blocks. - */ - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = is_final_block; - len = in_end - in_next; - } - /* It was already checked that there is enough space. */ - ASSERT(os->end - out_next >= - DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); - /* - * Output BFINAL (1 bit) and BTYPE (2 bits), then align - * to a byte boundary. 
- */ - STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); - *out_next++ = (bfinal << bitcount) | bitbuf; - if (bitcount > 5) - *out_next++ = 0; - bitbuf = 0; - bitcount = 0; - /* Output LEN and NLEN, then the data itself. */ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - /* Done outputting uncompressed block(s) */ - goto out; - } - - if (best_cost == static_cost) { - /* Static Huffman block */ - codes = &c->static_codes; - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); - FLUSH_BITS(); - } else { - const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; - const unsigned num_precode_items = c->o.precode.num_items; - unsigned precode_sym, precode_item; - unsigned i; - - /* Dynamic Huffman block */ - - codes = &c->codes; - STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); - ADD_BITS(is_final_block, 1); - ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); - ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); - ADD_BITS(c->o.precode.num_offset_syms - 1, 5); - ADD_BITS(num_explicit_lens - 4, 4); - - /* Output the lengths of the codewords in the precode. */ - if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { - /* - * A 64-bit bitbuffer is just one bit too small to hold - * the maximum number of precode lens, so to minimize - * flushes we merge one len with the previous fields. - */ - precode_sym = deflate_precode_lens_permutation[0]; - ADD_BITS(c->o.precode.lens[precode_sym], 3); - FLUSH_BITS(); - i = 1; /* num_explicit_lens >= 4 */ - do { - precode_sym = - deflate_precode_lens_permutation[i]; - ADD_BITS(c->o.precode.lens[precode_sym], 3); - } while (++i < num_explicit_lens); - FLUSH_BITS(); - } else { - FLUSH_BITS(); - i = 0; - do { - precode_sym = - deflate_precode_lens_permutation[i]; - ADD_BITS(c->o.precode.lens[precode_sym], 3); - FLUSH_BITS(); - } while (++i < num_explicit_lens); - } - - /* - * Output the lengths of the codewords in the litlen and offset - * codes, encoded by the precode. - */ - i = 0; - do { - precode_item = c->o.precode.items[i]; - precode_sym = precode_item & 0x1F; - STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); - ADD_BITS(c->o.precode.codewords[precode_sym], - c->o.precode.lens[precode_sym]); - ADD_BITS(precode_item >> 5, - deflate_extra_precode_bits[precode_sym]); - FLUSH_BITS(); - } while (++i < num_precode_items); - } - - /* Output the literals and matches for a dynamic or static block. */ - ASSERT(bitcount <= 7); - deflate_compute_full_len_codewords(c, codes); + /* + * It is hard to get compilers to understand that writes to 'os->next' + * don't alias 'os'. That hurts performance significantly, as + * everything in 'os' would keep getting re-loaded. ('restrict' + * *should* do the trick, but it's unreliable.) Therefore, we keep all + * the output bitstream state in local variables, and output bits using + * macros. This is similar to what the decompressor does. + */ + const u8 *in_next = block_begin; + const u8 * const in_end = block_begin + block_length; + bitbuf_t bitbuf = os->bitbuf; + unsigned bitcount = os->bitcount; + u8 *out_next = os->next; + u8 * const out_fast_end = + os->end - MIN(WORDBYTES - 1, os->end - out_next); + /* + * The cost for each block type, in bits. Start with the cost of the + * block header which is 3 bits. 
+ */ + u32 dynamic_cost = 3; + u32 static_cost = 3; + u32 uncompressed_cost = 3; + u32 best_cost; + struct deflate_codes *codes; + unsigned sym; + + ASSERT(block_length >= MIN_BLOCK_LENGTH || + (is_final_block && block_length > 0)); + ASSERT(block_length <= MAX_BLOCK_LENGTH); + ASSERT(bitcount <= 7); + ASSERT((bitbuf & ~(((bitbuf_t)1 << bitcount) - 1)) == 0); + ASSERT(out_next <= os->end); + ASSERT(!os->overflow); + + /* Precompute the precode items and build the precode. */ + deflate_precompute_huffman_header(c); + + /* Account for the cost of encoding dynamic Huffman codes. */ + dynamic_cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + u32 extra = deflate_extra_precode_bits[sym]; + + dynamic_cost += c->o.precode.freqs[sym] * + (extra + c->o.precode.lens[sym]); + } + + /* Account for the cost of encoding literals. */ + for (sym = 0; sym < 144; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 8; + } + for (; sym < 256; sym++) { + dynamic_cost += c->freqs.litlen[sym] * + c->codes.lens.litlen[sym]; + static_cost += c->freqs.litlen[sym] * 9; + } + + /* Account for the cost of encoding the end-of-block symbol. */ + dynamic_cost += c->codes.lens.litlen[DEFLATE_END_OF_BLOCK]; + static_cost += 7; + + /* Account for the cost of encoding lengths. */ + for (sym = DEFLATE_FIRST_LEN_SYM; + sym < DEFLATE_FIRST_LEN_SYM + ARRAY_LEN(deflate_extra_length_bits); + sym++) { + u32 extra = deflate_extra_length_bits[ + sym - DEFLATE_FIRST_LEN_SYM]; + + dynamic_cost += c->freqs.litlen[sym] * + (extra + c->codes.lens.litlen[sym]); + static_cost += c->freqs.litlen[sym] * + (extra + c->static_codes.lens.litlen[sym]); + } + + /* Account for the cost of encoding offsets. */ + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) { + u32 extra = deflate_extra_offset_bits[sym]; + + dynamic_cost += c->freqs.offset[sym] * + (extra + c->codes.lens.offset[sym]); + static_cost += c->freqs.offset[sym] * (extra + 5); + } + + /* Compute the cost of using uncompressed blocks. */ + uncompressed_cost += (-(bitcount + 3) & 7) + 32 + + (40 * (DIV_ROUND_UP(block_length, + UINT16_MAX) - 1)) + + (8 * block_length); + + /* + * Choose and output the cheapest type of block. If there is a tie, + * prefer uncompressed, then static, then dynamic. + */ + + best_cost = MIN(dynamic_cost, MIN(static_cost, uncompressed_cost)); + + /* If the block isn't going to fit, then stop early. */ + if (DIV_ROUND_UP(bitcount + best_cost, 8) > os->end - out_next) { + os->overflow = true; + return; + } + /* + * Else, now we know that the block fits, so no further bounds checks on + * the output buffer are required until the next block. + */ + + if (best_cost == uncompressed_cost) { + /* + * Uncompressed block(s). DEFLATE limits the length of + * uncompressed blocks to UINT16_MAX bytes, so if the length of + * the "block" we're flushing is over UINT16_MAX, we actually + * output multiple blocks. + */ + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = is_final_block; + len = in_end - in_next; + } + /* It was already checked that there is enough space. */ + ASSERT(os->end - out_next >= + DIV_ROUND_UP(bitcount + 3, 8) + 4 + len); + /* + * Output BFINAL (1 bit) and BTYPE (2 bits), then align + * to a byte boundary. 
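/*
 * Worked example, for illustration, of the uncompressed-cost formula above:
 * 3 header bits, padding up to the next byte boundary, 32 bits of LEN/NLEN
 * per stored block, 40 more bits for every additional stored block once the
 * data exceeds UINT16_MAX bytes, and 8 bits per input byte.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

/* Bit cost of storing 'block_length' bytes starting at bit offset 'bitcount'. */
static uint32_t demo_uncompressed_cost(unsigned bitcount, uint32_t block_length)
{
	return 3 +				/* BFINAL and BTYPE        */
	       (-(bitcount + 3) & 7) +		/* pad to a byte boundary  */
	       32 +				/* LEN and NLEN            */
	       40 * (DEMO_DIV_ROUND_UP(block_length, UINT16_MAX) - 1) +
	       8 * block_length;		/* the stored bytes        */
}

int main(void)
{
	/*
	 * 1000 bytes flushed on a byte boundary: 1 header byte, 4 bytes of
	 * LEN/NLEN and 1000 data bytes = 1005 bytes = 8040 bits.
	 */
	assert(demo_uncompressed_cost(0, 1000) == 8040);
	printf("uncompressed-cost check passed\n");
	return 0;
}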
+ */ + STATIC_ASSERT(DEFLATE_BLOCKTYPE_UNCOMPRESSED == 0); + *out_next++ = (bfinal << bitcount) | bitbuf; + if (bitcount > 5) + *out_next++ = 0; + bitbuf = 0; + bitcount = 0; + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + /* Done outputting uncompressed block(s) */ + goto out; + } + + if (best_cost == static_cost) { + /* Static Huffman block */ + codes = &c->static_codes; + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_STATIC_HUFFMAN, 2); + FLUSH_BITS(); + } else { + const unsigned num_explicit_lens = c->o.precode.num_explicit_lens; + const unsigned num_precode_items = c->o.precode.num_items; + unsigned precode_sym, precode_item; + unsigned i; + + /* Dynamic Huffman block */ + + codes = &c->codes; + STATIC_ASSERT(CAN_BUFFER(1 + 2 + 5 + 5 + 4 + 3)); + ADD_BITS(is_final_block, 1); + ADD_BITS(DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN, 2); + ADD_BITS(c->o.precode.num_litlen_syms - 257, 5); + ADD_BITS(c->o.precode.num_offset_syms - 1, 5); + ADD_BITS(num_explicit_lens - 4, 4); + + /* Output the lengths of the codewords in the precode. */ + if (CAN_BUFFER(3 * (DEFLATE_NUM_PRECODE_SYMS - 1))) { + /* + * A 64-bit bitbuffer is just one bit too small to hold + * the maximum number of precode lens, so to minimize + * flushes we merge one len with the previous fields. + */ + precode_sym = deflate_precode_lens_permutation[0]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + i = 1; /* num_explicit_lens >= 4 */ + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + } while (++i < num_explicit_lens); + FLUSH_BITS(); + } else { + FLUSH_BITS(); + i = 0; + do { + precode_sym = + deflate_precode_lens_permutation[i]; + ADD_BITS(c->o.precode.lens[precode_sym], 3); + FLUSH_BITS(); + } while (++i < num_explicit_lens); + } + + /* + * Output the lengths of the codewords in the litlen and offset + * codes, encoded by the precode. + */ + i = 0; + do { + precode_item = c->o.precode.items[i]; + precode_sym = precode_item & 0x1F; + STATIC_ASSERT(CAN_BUFFER(MAX_PRE_CODEWORD_LEN + 7)); + ADD_BITS(c->o.precode.codewords[precode_sym], + c->o.precode.lens[precode_sym]); + ADD_BITS(precode_item >> 5, + deflate_extra_precode_bits[precode_sym]); + FLUSH_BITS(); + } while (++i < num_precode_items); + } + + /* Output the literals and matches for a dynamic or static block. 
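/*
 * A minimal, self-contained sketch of the LSB-first bit-accumulation pattern
 * behind the ADD_BITS()/FLUSH_BITS() macros used in this function.  For
 * illustration only: it assumes a 64-bit buffer and an output array known to
 * be large enough, and omits the bounds handling the real code needs.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct demo_bitwriter {
	uint64_t bitbuf;	/* bits not yet written, LSB first */
	unsigned bitcount;	/* number of valid bits in bitbuf  */
	uint8_t *next;		/* next output byte                */
};

static void demo_add_bits(struct demo_bitwriter *w, uint32_t bits, unsigned n)
{
	w->bitbuf |= (uint64_t)bits << w->bitcount;
	w->bitcount += n;
}

static void demo_flush_bits(struct demo_bitwriter *w)
{
	while (w->bitcount >= 8) {
		*w->next++ = (uint8_t)w->bitbuf;
		w->bitbuf >>= 8;
		w->bitcount -= 8;
	}
}

int main(void)
{
	uint8_t out[8] = { 0 };
	struct demo_bitwriter w = { 0, 0, out };

	/* BFINAL=1, BTYPE=01 (static), then a 7-bit codeword of all zeroes. */
	demo_add_bits(&w, 1, 1);
	demo_add_bits(&w, 1, 2);
	demo_add_bits(&w, 0, 7);
	demo_flush_bits(&w);

	assert(out[0] == 0x03 && w.bitcount == 2);
	printf("first output byte: 0x%02x, %u bits still buffered\n",
	       out[0], w.bitcount);
	return 0;
}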
*/ + ASSERT(bitcount <= 7); + deflate_compute_full_len_codewords(c, codes); #if SUPPORT_NEAR_OPTIMAL_PARSING - if (sequences == NULL) { - /* Output the literals and matches from the minimum-cost path */ - struct deflate_optimum_node *cur_node = - &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node * const end_node = - &c->p.n.optimum_nodes[block_length]; - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> - OPTIMUM_OFFSET_SHIFT; - if (length == 1) { - /* Literal */ - ADD_BITS(codes->codewords.litlen[offset], - codes->lens.litlen[offset]); - FLUSH_BITS(); - } else { - /* Match */ - WRITE_MATCH(c, codes, length, offset, - c->p.n.offset_slot_full[offset]); - } - cur_node += length; - } while (cur_node != end_node); - } else + if (sequences == NULL) { + /* Output the literals and matches from the minimum-cost path */ + struct deflate_optimum_node *cur_node = + &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node * const end_node = + &c->p.n.optimum_nodes[block_length]; + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> + OPTIMUM_OFFSET_SHIFT; + if (length == 1) { + /* Literal */ + ADD_BITS(codes->codewords.litlen[offset], + codes->lens.litlen[offset]); + FLUSH_BITS(); + } else { + /* Match */ + WRITE_MATCH(c, codes, length, offset, + c->p.n.offset_slot_full[offset]); + } + cur_node += length; + } while (cur_node != end_node); + } else #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - { - /* Output the literals and matches from the sequences list. */ - const struct deflate_sequence *seq; - - for (seq = sequences; ; seq++) { - u32 litrunlen = seq->litrunlen_and_length & - SEQ_LITRUNLEN_MASK; - unsigned length = seq->litrunlen_and_length >> - SEQ_LENGTH_SHIFT; - unsigned lit; - - /* Output a run of literals. */ - if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { - for (; litrunlen >= 4; litrunlen -= 4) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - FLUSH_BITS(); - } - if (litrunlen-- != 0) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - if (litrunlen-- != 0) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - if (litrunlen-- != 0) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - } - } - FLUSH_BITS(); - } - } else { - while (litrunlen--) { - lit = *in_next++; - ADD_BITS(codes->codewords.litlen[lit], - codes->lens.litlen[lit]); - FLUSH_BITS(); - } - } - - if (length == 0) { /* Last sequence? */ - ASSERT(in_next == in_end); - break; - } - - /* Output a match. */ - WRITE_MATCH(c, codes, length, seq->offset, - seq->offset_slot); - in_next += length; - } - } - - /* Output the end-of-block symbol. */ - ASSERT(bitcount <= 7); - ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], - codes->lens.litlen[DEFLATE_END_OF_BLOCK]); - FLUSH_BITS(); + { + /* Output the literals and matches from the sequences list. */ + const struct deflate_sequence *seq; + + for (seq = sequences; ; seq++) { + u32 litrunlen = seq->litrunlen_and_length & + SEQ_LITRUNLEN_MASK; + unsigned length = seq->litrunlen_and_length >> + SEQ_LENGTH_SHIFT; + unsigned lit; + + /* Output a run of literals. 
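/*
 * Illustration of how a packed 'litrunlen_and_length' word is consumed above.
 * The actual SEQ_LENGTH_SHIFT and SEQ_LITRUNLEN_MASK are defined elsewhere in
 * this file; the 23-bit split used here is only an assumed layout so that the
 * sketch is self-contained.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Assumed packing for this sketch only: low 23 bits hold the literal run
 * length, the high bits hold the match length (0 marks the last sequence). */
#define DEMO_SEQ_LENGTH_SHIFT	23
#define DEMO_SEQ_LITRUNLEN_MASK	(((uint32_t)1 << DEMO_SEQ_LENGTH_SHIFT) - 1)

int main(void)
{
	/* A run of 5 literals followed by a match of length 30. */
	uint32_t packed = 5 | ((uint32_t)30 << DEMO_SEQ_LENGTH_SHIFT);
	uint32_t litrunlen = packed & DEMO_SEQ_LITRUNLEN_MASK;
	unsigned length = packed >> DEMO_SEQ_LENGTH_SHIFT;

	assert(litrunlen == 5 && length == 30);
	printf("litrunlen=%u, match length=%u\n", (unsigned)litrunlen, length);
	return 0;
}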
*/ + if (CAN_BUFFER(4 * MAX_LITLEN_CODEWORD_LEN)) { + for (; litrunlen >= 4; litrunlen -= 4) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + if (litrunlen-- != 0) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + } + } + FLUSH_BITS(); + } + } else { + while (litrunlen--) { + lit = *in_next++; + ADD_BITS(codes->codewords.litlen[lit], + codes->lens.litlen[lit]); + FLUSH_BITS(); + } + } + + if (length == 0) { /* Last sequence? */ + ASSERT(in_next == in_end); + break; + } + + /* Output a match. */ + WRITE_MATCH(c, codes, length, seq->offset, + seq->offset_slot); + in_next += length; + } + } + + /* Output the end-of-block symbol. */ + ASSERT(bitcount <= 7); + ADD_BITS(codes->codewords.litlen[DEFLATE_END_OF_BLOCK], + codes->lens.litlen[DEFLATE_END_OF_BLOCK]); + FLUSH_BITS(); out: - ASSERT(bitcount <= 7); - /* - * Assert that the block cost was computed correctly. This is relied on - * above for the bounds check on the output buffer. Also, - * libdeflate_deflate_compress_bound() relies on this via the assumption - * that uncompressed blocks will always be used when cheapest. - */ - ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); - os->bitbuf = bitbuf; - os->bitcount = bitcount; - os->next = out_next; + ASSERT(bitcount <= 7); + /* + * Assert that the block cost was computed correctly. This is relied on + * above for the bounds check on the output buffer. Also, + * libdeflate_deflate_compress_bound() relies on this via the assumption + * that uncompressed blocks will always be used when cheapest. 
+ */ + ASSERT(8 * (out_next - os->next) + bitcount - os->bitcount == best_cost); + os->bitbuf = bitbuf; + os->bitcount = bitcount; + os->next = out_next; } static void deflate_finish_block(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 *block_begin, u32 block_length, - const struct deflate_sequence *sequences, - bool is_final_block) + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct deflate_sequence *sequences, + bool is_final_block) { - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - deflate_make_huffman_codes(&c->freqs, &c->codes); - deflate_flush_block(c, os, block_begin, block_length, sequences, - is_final_block); + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + deflate_make_huffman_codes(&c->freqs, &c->codes); + deflate_flush_block(c, os, block_begin, block_length, sequences, + is_final_block); } /******************************************************************************/ @@ -2090,14 +2090,14 @@ deflate_finish_block(struct libdeflate_compressor *c, static void init_block_split_stats(struct block_split_stats *stats) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->new_observations[i] = 0; - stats->observations[i] = 0; - } - stats->num_new_observations = 0; - stats->num_observations = 0; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->new_observations[i] = 0; + stats->observations[i] = 0; + } + stats->num_new_observations = 0; + stats->num_observations = 0; } /* @@ -2107,8 +2107,8 @@ init_block_split_stats(struct block_split_stats *stats) static forceinline void observe_literal(struct block_split_stats *stats, u8 lit) { - stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; - stats->num_new_observations++; + stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++; + stats->num_new_observations++; } /* @@ -2118,147 +2118,147 @@ observe_literal(struct block_split_stats *stats, u8 lit) static forceinline void observe_match(struct block_split_stats *stats, unsigned length) { - stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + - (length >= 9)]++; - stats->num_new_observations++; + stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + + (length >= 9)]++; + stats->num_new_observations++; } static void merge_new_observations(struct block_split_stats *stats) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - stats->observations[i] += stats->new_observations[i]; - stats->new_observations[i] = 0; - } - stats->num_observations += stats->num_new_observations; - stats->num_new_observations = 0; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + stats->observations[i] += stats->new_observations[i]; + stats->new_observations[i] = 0; + } + stats->num_observations += stats->num_new_observations; + stats->num_new_observations = 0; } static bool do_end_block_check(struct block_split_stats *stats, u32 block_length) { - if (stats->num_observations > 0) { - /* - * Compute the sum of absolute differences of probabilities. To - * avoid needing to use floating point arithmetic or do slow - * divisions, we do all arithmetic with the probabilities - * multiplied by num_observations * num_new_observations. E.g., - * for the "old" observations the probabilities would be - * (double)observations[i] / num_observations, but since we - * multiply by both num_observations and num_new_observations we - * really do observations[i] * num_new_observations. 
- */ - u32 total_delta = 0; - u32 num_items; - u32 cutoff; - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - u32 expected = stats->observations[i] * - stats->num_new_observations; - u32 actual = stats->new_observations[i] * - stats->num_observations; - u32 delta = (actual > expected) ? actual - expected : - expected - actual; - - total_delta += delta; - } - - num_items = stats->num_observations + - stats->num_new_observations; - /* - * Heuristic: the cutoff is when the sum of absolute differences - * of probabilities becomes at least 200/512. As above, the - * probability is multiplied by both num_new_observations and - * num_observations. Be careful to avoid integer overflow. - */ - cutoff = stats->num_new_observations * 200 / 512 * - stats->num_observations; - /* - * Very short blocks have a lot of overhead for the Huffman - * codes, so only use them if it clearly seems worthwhile. - * (This is an additional penalty, which adds to the smaller - * penalty below which scales more slowly.) - */ - if (block_length < 10000 && num_items < 8192) - cutoff += (u64)cutoff * (8192 - num_items) / 8192; - - /* Ready to end the block? */ - if (total_delta + - (block_length / 4096) * stats->num_observations >= cutoff) - return true; - } - merge_new_observations(stats); - return false; + if (stats->num_observations > 0) { + /* + * Compute the sum of absolute differences of probabilities. To + * avoid needing to use floating point arithmetic or do slow + * divisions, we do all arithmetic with the probabilities + * multiplied by num_observations * num_new_observations. E.g., + * for the "old" observations the probabilities would be + * (double)observations[i] / num_observations, but since we + * multiply by both num_observations and num_new_observations we + * really do observations[i] * num_new_observations. + */ + u32 total_delta = 0; + u32 num_items; + u32 cutoff; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u32 expected = stats->observations[i] * + stats->num_new_observations; + u32 actual = stats->new_observations[i] * + stats->num_observations; + u32 delta = (actual > expected) ? actual - expected : + expected - actual; + + total_delta += delta; + } + + num_items = stats->num_observations + + stats->num_new_observations; + /* + * Heuristic: the cutoff is when the sum of absolute differences + * of probabilities becomes at least 200/512. As above, the + * probability is multiplied by both num_new_observations and + * num_observations. Be careful to avoid integer overflow. + */ + cutoff = stats->num_new_observations * 200 / 512 * + stats->num_observations; + /* + * Very short blocks have a lot of overhead for the Huffman + * codes, so only use them if it clearly seems worthwhile. + * (This is an additional penalty, which adds to the smaller + * penalty below which scales more slowly.) + */ + if (block_length < 10000 && num_items < 8192) + cutoff += (u64)cutoff * (8192 - num_items) / 8192; + + /* Ready to end the block? 
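/*
 * Simplified standalone sketch of the scaled absolute-difference test
 * described above, reduced to two observation types for illustration: both
 * sides are cross-multiplied by the opposite observation count so that no
 * division or floating point is needed, and the 200/512 threshold is the one
 * quoted in the comments.  The block-length and short-block adjustments of
 * the real check are omitted here.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEMO_NUM_TYPES 2

/* Return true if the new observations differ enough from the old ones. */
static bool demo_should_split(const uint32_t old_obs[DEMO_NUM_TYPES],
			      uint32_t num_old,
			      const uint32_t new_obs[DEMO_NUM_TYPES],
			      uint32_t num_new)
{
	uint32_t total_delta = 0;
	uint32_t cutoff;
	int i;

	for (i = 0; i < DEMO_NUM_TYPES; i++) {
		uint32_t expected = old_obs[i] * num_new;
		uint32_t actual = new_obs[i] * num_old;

		total_delta += (actual > expected) ? actual - expected
						   : expected - actual;
	}
	/* "Different enough" = summed probability delta of at least 200/512. */
	cutoff = num_new * 200 / 512 * num_old;
	return total_delta >= cutoff;
}

int main(void)
{
	/* Old data: 90% type 0.  New data: 50% type 0 -> end the block. */
	const uint32_t old_obs[DEMO_NUM_TYPES] = { 900, 100 };
	const uint32_t new_obs[DEMO_NUM_TYPES] = { 256, 256 };

	printf("end block? %s\n",
	       demo_should_split(old_obs, 1000, new_obs, 512) ? "yes" : "no");
	return 0;
}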
*/ + if (total_delta + + (block_length / 4096) * stats->num_observations >= cutoff) + return true; + } + merge_new_observations(stats); + return false; } static forceinline bool ready_to_check_block(const struct block_split_stats *stats, - const u8 *in_block_begin, const u8 *in_next, - const u8 *in_end) + const u8 *in_block_begin, const u8 *in_next, + const u8 *in_end) { - return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK - && in_next - in_block_begin >= MIN_BLOCK_LENGTH - && in_end - in_next >= MIN_BLOCK_LENGTH; + return stats->num_new_observations >= NUM_OBSERVATIONS_PER_BLOCK_CHECK + && in_next - in_block_begin >= MIN_BLOCK_LENGTH + && in_end - in_next >= MIN_BLOCK_LENGTH; } static forceinline bool should_end_block(struct block_split_stats *stats, - const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) + const u8 *in_block_begin, const u8 *in_next, const u8 *in_end) { - /* Ready to try to end the block (again)? */ - if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) - return false; - - return do_end_block_check(stats, in_next - in_block_begin); + /* Ready to try to end the block (again)? */ + if (!ready_to_check_block(stats, in_block_begin, in_next, in_end)) + return false; + + return do_end_block_check(stats, in_next - in_block_begin); } /******************************************************************************/ static void deflate_begin_sequences(struct libdeflate_compressor *c, - struct deflate_sequence *first_seq) + struct deflate_sequence *first_seq) { - deflate_reset_symbol_frequencies(c); - first_seq->litrunlen_and_length = 0; + deflate_reset_symbol_frequencies(c); + first_seq->litrunlen_and_length = 0; } static forceinline void deflate_choose_literal(struct libdeflate_compressor *c, unsigned literal, - bool gather_split_stats, struct deflate_sequence *seq) + bool gather_split_stats, struct deflate_sequence *seq) { - c->freqs.litlen[literal]++; - - if (gather_split_stats) - observe_literal(&c->split_stats, literal); - - STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); - seq->litrunlen_and_length++; + c->freqs.litlen[literal]++; + + if (gather_split_stats) + observe_literal(&c->split_stats, literal); + + STATIC_ASSERT(MAX_BLOCK_LENGTH <= SEQ_LITRUNLEN_MASK); + seq->litrunlen_and_length++; } static forceinline void deflate_choose_match(struct libdeflate_compressor *c, - unsigned length, unsigned offset, bool gather_split_stats, - struct deflate_sequence **seq_p) + unsigned length, unsigned offset, bool gather_split_stats, + struct deflate_sequence **seq_p) { - struct deflate_sequence *seq = *seq_p; - unsigned length_slot = deflate_length_slot[length]; - unsigned offset_slot = deflate_get_offset_slot(offset); - - c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; - c->freqs.offset[offset_slot]++; - if (gather_split_stats) - observe_match(&c->split_stats, length); - - seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; - seq->offset = offset; - seq->offset_slot = offset_slot; - - seq++; - seq->litrunlen_and_length = 0; - *seq_p = seq; + struct deflate_sequence *seq = *seq_p; + unsigned length_slot = deflate_length_slot[length]; + unsigned offset_slot = deflate_get_offset_slot(offset); + + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + length_slot]++; + c->freqs.offset[offset_slot]++; + if (gather_split_stats) + observe_match(&c->split_stats, length); + + seq->litrunlen_and_length |= (u32)length << SEQ_LENGTH_SHIFT; + seq->offset = offset; + seq->offset_slot = offset_slot; + + seq++; + seq->litrunlen_and_length = 0; + 
*seq_p = seq; } /* @@ -2268,10 +2268,10 @@ deflate_choose_match(struct libdeflate_compressor *c, static forceinline void adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) { - if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { - *max_len = remaining; - *nice_len = MIN(*nice_len, *max_len); - } + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + *max_len = remaining; + *nice_len = MIN(*nice_len, *max_len); + } } /* @@ -2293,62 +2293,62 @@ adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining) static unsigned choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth) { - /* map from num_used_literals to min_len */ - static const u8 min_lens[] = { - 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, - 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* The rest is implicitly 3. */ - }; - unsigned min_len; - - STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); - STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); - - if (num_used_literals >= ARRAY_LEN(min_lens)) - return 3; - min_len = min_lens[num_used_literals]; - /* - * With a low max_search_depth, it may be too hard to find long matches. - */ - if (max_search_depth < 16) { - if (max_search_depth < 5) - min_len = MIN(min_len, 4); - else if (max_search_depth < 10) - min_len = MIN(min_len, 5); - else - min_len = MIN(min_len, 7); - } - return min_len; + /* map from num_used_literals to min_len */ + static const u8 min_lens[] = { + 9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + /* The rest is implicitly 3. */ + }; + unsigned min_len; + + STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3); + STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1); + + if (num_used_literals >= ARRAY_LEN(min_lens)) + return 3; + min_len = min_lens[num_used_literals]; + /* + * With a low max_search_depth, it may be too hard to find long matches. + */ + if (max_search_depth < 16) { + if (max_search_depth < 5) + min_len = MIN(min_len, 4); + else if (max_search_depth < 10) + min_len = MIN(min_len, 5); + else + min_len = MIN(min_len, 7); + } + return min_len; } static unsigned calculate_min_match_len(const u8 *data, size_t data_len, - unsigned max_search_depth) + unsigned max_search_depth) { - u8 used[256] = { 0 }; - unsigned num_used_literals = 0; - size_t i; - - /* - * For very short inputs, the static Huffman code has a good chance of - * being best, in which case there is no reason to avoid short matches. - */ - if (data_len < 512) - return DEFLATE_MIN_MATCH_LEN; - - /* - * For an initial approximation, scan the first 4 KiB of data. The - * caller may use recalculate_min_match_len() to update min_len later. - */ - data_len = MIN(data_len, 4096); - for (i = 0; i < data_len; i++) - used[data[i]] = 1; - for (i = 0; i < 256; i++) - num_used_literals += used[i]; - return choose_min_match_len(num_used_literals, max_search_depth); + u8 used[256] = { 0 }; + unsigned num_used_literals = 0; + size_t i; + + /* + * For very short inputs, the static Huffman code has a good chance of + * being best, in which case there is no reason to avoid short matches. 
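/*
 * Standalone sketch, for illustration, of the idea behind
 * calculate_min_match_len() above: count the distinct byte values in a prefix
 * of the data and require longer matches when only a few literals are in use,
 * since those literals then get very short codewords and short matches stop
 * paying off.  The thresholds below are a coarse approximation of the
 * min_lens[] table above, not the table itself.
 */
#include <stddef.h>
#include <stdio.h>

/* Count distinct byte values in the first 'n' bytes (at most 4096 scanned). */
static unsigned demo_count_used_literals(const unsigned char *data, size_t n)
{
	unsigned char used[256] = { 0 };
	unsigned count = 0;
	size_t i;

	if (n > 4096)
		n = 4096;
	for (i = 0; i < n; i++)
		used[data[i]] = 1;
	for (i = 0; i < 256; i++)
		count += used[i];
	return count;
}

/* Coarse thresholds: fewer distinct literals -> demand longer matches. */
static unsigned demo_min_match_len(unsigned num_used_literals)
{
	if (num_used_literals <= 5)
		return 9;
	if (num_used_literals <= 15)
		return 6;
	if (num_used_literals <= 44)
		return 5;
	if (num_used_literals <= 79)
		return 4;
	return 3;
}

int main(void)
{
	const unsigned char sample[] = "abababababababababab";
	unsigned used = demo_count_used_literals(sample, sizeof(sample) - 1);

	printf("distinct literals: %u -> minimum match length: %u\n",
	       used, demo_min_match_len(used));
	return 0;
}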
+ */ + if (data_len < 512) + return DEFLATE_MIN_MATCH_LEN; + + /* + * For an initial approximation, scan the first 4 KiB of data. The + * caller may use recalculate_min_match_len() to update min_len later. + */ + data_len = MIN(data_len, 4096); + for (i = 0; i < data_len; i++) + used[data[i]] = 1; + for (i = 0; i < 256; i++) + num_used_literals += used[i]; + return choose_min_match_len(num_used_literals, max_search_depth); } /* @@ -2357,32 +2357,32 @@ calculate_min_match_len(const u8 *data, size_t data_len, */ static unsigned recalculate_min_match_len(const struct deflate_freqs *freqs, - unsigned max_search_depth) + unsigned max_search_depth) { - u32 literal_freq = 0; - u32 cutoff; - unsigned num_used_literals = 0; - int i; - - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - literal_freq += freqs->litlen[i]; - - cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */ - - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - if (freqs->litlen[i] > cutoff) - num_used_literals++; - } - return choose_min_match_len(num_used_literals, max_search_depth); + u32 literal_freq = 0; + u32 cutoff; + unsigned num_used_literals = 0; + int i; + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + literal_freq += freqs->litlen[i]; + + cutoff = literal_freq >> 10; /* Ignore literals used very rarely. */ + + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (freqs->litlen[i] > cutoff) + num_used_literals++; + } + return choose_min_match_len(num_used_literals, max_search_depth); } static forceinline const u8 * choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, - size_t soft_max_len) + size_t soft_max_len) { - if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) - return in_end; - return in_block_begin + soft_max_len; + if (in_end - in_block_begin < soft_max_len + MIN_BLOCK_LENGTH) + return in_end; + return in_block_begin + soft_max_len; } /* @@ -2390,55 +2390,55 @@ choose_max_block_end(const u8 *in_block_begin, const u8 *in_end, */ static size_t deflate_compress_none(const u8 *in, size_t in_nbytes, - u8 *out, size_t out_nbytes_avail) + u8 *out, size_t out_nbytes_avail) { - const u8 *in_next = in; - const u8 * const in_end = in + in_nbytes; - u8 *out_next = out; - u8 * const out_end = out + out_nbytes_avail; - - /* - * If the input is zero-length, we still must output a block in order - * for the output to be a valid DEFLATE stream. Handle this case - * specially to avoid potentially passing NULL to memcpy() below. - */ - if (unlikely(in_nbytes == 0)) { - if (out_nbytes_avail < 5) - return 0; - /* BFINAL and BTYPE */ - *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); - /* LEN and NLEN */ - put_unaligned_le32(0xFFFF0000, out_next); - return 5; - } - - do { - u8 bfinal = 0; - size_t len = UINT16_MAX; - - if (in_end - in_next <= UINT16_MAX) { - bfinal = 1; - len = in_end - in_next; - } - if (out_end - out_next < 5 + len) - return 0; - /* - * Output BFINAL and BTYPE. The stream is already byte-aligned - * here, so this step always requires outputting exactly 1 byte. - */ - *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); - - /* Output LEN and NLEN, then the data itself. 
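/*
 * Concrete illustration of the zero-length special case above: a single
 * stored block with BFINAL=1, BTYPE=00, LEN=0x0000 and NLEN=0xFFFF, i.e. the
 * five bytes 01 00 00 ff ff, which is the smallest valid DEFLATE stream.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint8_t out[5];

	out[0] = 1 | (0 << 1);	/* BFINAL=1, BTYPE=00 (stored), zero padding */
	out[1] = 0x00;		/* LEN  = 0x0000, little endian              */
	out[2] = 0x00;
	out[3] = 0xFF;		/* NLEN = 0xFFFF = ~LEN                      */
	out[4] = 0xFF;

	assert(out[0] == 0x01);
	printf("empty DEFLATE stream: %02x %02x %02x %02x %02x\n",
	       out[0], out[1], out[2], out[3], out[4]);
	return 0;
}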
*/ - put_unaligned_le16(len, out_next); - out_next += 2; - put_unaligned_le16(~len, out_next); - out_next += 2; - memcpy(out_next, in_next, len); - out_next += len; - in_next += len; - } while (in_next != in_end); - - return out_next - out; + const u8 *in_next = in; + const u8 * const in_end = in + in_nbytes; + u8 *out_next = out; + u8 * const out_end = out + out_nbytes_avail; + + /* + * If the input is zero-length, we still must output a block in order + * for the output to be a valid DEFLATE stream. Handle this case + * specially to avoid potentially passing NULL to memcpy() below. + */ + if (unlikely(in_nbytes == 0)) { + if (out_nbytes_avail < 5) + return 0; + /* BFINAL and BTYPE */ + *out_next++ = 1 | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + /* LEN and NLEN */ + put_unaligned_le32(0xFFFF0000, out_next); + return 5; + } + + do { + u8 bfinal = 0; + size_t len = UINT16_MAX; + + if (in_end - in_next <= UINT16_MAX) { + bfinal = 1; + len = in_end - in_next; + } + if (out_end - out_next < 5 + len) + return 0; + /* + * Output BFINAL and BTYPE. The stream is already byte-aligned + * here, so this step always requires outputting exactly 1 byte. + */ + *out_next++ = bfinal | (DEFLATE_BLOCKTYPE_UNCOMPRESSED << 1); + + /* Output LEN and NLEN, then the data itself. */ + put_unaligned_le16(len, out_next); + out_next += 2; + put_unaligned_le16(~len, out_next); + out_next += 2; + memcpy(out_next, in_next, len); + out_next += len; + in_next += len; + } while (in_next != in_end); + + return out_next - out; } /* @@ -2449,76 +2449,76 @@ deflate_compress_none(const u8 *in, size_t in_nbytes, */ static void deflate_compress_fastest(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hash = 0; - - ht_matchfinder_init(&c->p.f.ht_mf); - - do { - /* Starting a new DEFLATE block */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = choose_max_block_end( - in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); - struct deflate_sequence *seq = c->p.f.sequences; - - deflate_begin_sequences(c, seq); - - do { - u32 length; - u32 offset; - size_t remaining = in_end - in_next; - - if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { - max_len = remaining; - if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { - do { - deflate_choose_literal(c, - *in_next++, false, seq); - } while (--max_len); - break; - } - nice_len = MIN(nice_len, max_len); - } - length = ht_matchfinder_longest_match(&c->p.f.ht_mf, - &in_cur_base, - in_next, - max_len, - nice_len, - &next_hash, - &offset); - if (length) { - /* Match found */ - deflate_choose_match(c, length, offset, false, - &seq); - ht_matchfinder_skip_bytes(&c->p.f.ht_mf, - &in_cur_base, - in_next + 1, - in_end, - length - 1, - &next_hash); - in_next += length; - } else { - /* No match found */ - deflate_choose_literal(c, *in_next++, false, - seq); - } - - /* Check if it's time to output another block. 
*/ - } while (in_next < in_max_block_end && - seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); - - deflate_finish_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.f.sequences, in_next == in_end); - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hash = 0; + + ht_matchfinder_init(&c->p.f.ht_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, FAST_SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.f.sequences; + + deflate_begin_sequences(c, seq); + + do { + u32 length; + u32 offset; + size_t remaining = in_end - in_next; + + if (unlikely(remaining < DEFLATE_MAX_MATCH_LEN)) { + max_len = remaining; + if (max_len < HT_MATCHFINDER_REQUIRED_NBYTES) { + do { + deflate_choose_literal(c, + *in_next++, false, seq); + } while (--max_len); + break; + } + nice_len = MIN(nice_len, max_len); + } + length = ht_matchfinder_longest_match(&c->p.f.ht_mf, + &in_cur_base, + in_next, + max_len, + nice_len, + &next_hash, + &offset); + if (length) { + /* Match found */ + deflate_choose_match(c, length, offset, false, + &seq); + ht_matchfinder_skip_bytes(&c->p.f.ht_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + &next_hash); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, false, + seq); + } + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.f.sequences[FAST_SEQ_STORE_LENGTH]); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.f.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2526,284 +2526,284 @@ deflate_compress_fastest(struct libdeflate_compressor * restrict c, */ static void deflate_compress_greedy(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = choose_max_block_end( - in_next, in_end, SOFT_MAX_BLOCK_LENGTH); - struct deflate_sequence *seq = c->p.g.sequences; - unsigned min_len; - - init_block_split_stats(&c->split_stats); - deflate_begin_sequences(c, seq); - min_len = calculate_min_match_len(in_next, - in_max_block_end - in_next, - c->max_search_depth); - do { - u32 length; - u32 offset; - - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - length = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next, - min_len - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &offset); - - if (length >= min_len && - (length > DEFLATE_MIN_MATCH_LEN || - offset <= 4096)) { - /* Match found */ - deflate_choose_match(c, length, offset, true, - &seq); - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next + 1, - in_end, - length - 1, - next_hashes); - in_next += length; - } 
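A back-of-the-envelope way to see why length-3 matches are only kept at small offsets (offset <= 4096 in the greedy parser, and > 8192 rejected in the lazy parser below): a match costs roughly a length symbol plus an offset symbol plus the offset's extra bits, while three literals cost roughly 24 bits. The symbol costs in the sketch below are illustrative assumptions, not libdeflate's cost model, and bsr-style math is done with a GCC/Clang builtin.

#include <stdint.h>
#include <stdio.h>

/* Number of extra offset bits DEFLATE needs for distance d (1..32768). */
static unsigned
offset_extra_bits(uint32_t d)
{
	if (d <= 4)
		return 0;
	return 31 - (unsigned)__builtin_clz(d - 1) - 1;	/* floor(log2(d-1)) - 1 */
}

int main(void)
{
	/* Assumed, purely illustrative costs: ~8 bits per literal,
	 * ~7 bits for a length symbol, ~5 bits for an offset symbol. */
	const unsigned lit_bits = 8, len_sym_bits = 7, off_sym_bits = 5;
	const uint32_t offsets[] = { 64, 4096, 8192, 32768 };
	size_t i;

	for (i = 0; i < sizeof(offsets) / sizeof(offsets[0]); i++) {
		unsigned match_bits = len_sym_bits + off_sym_bits +
				      offset_extra_bits(offsets[i]);
		printf("len-3 match at offset %5u: ~%2u bits vs 3 literals: ~%u bits\n",
		       offsets[i], match_bits, 3 * lit_bits);
	}
	/* Prints ~16, ~22, ~23 and ~25 bits: a far length-3 match saves little
	 * or nothing over 3 literals, which motivates the offset cutoffs. */
	return 0;
}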
else { - /* No match found */ - deflate_choose_literal(c, *in_next++, true, - seq); - } - - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && - !should_end_block(&c->split_stats, - in_block_begin, in_next, in_end)); - - deflate_finish_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + u32 length; + u32 offset; + + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + length = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &offset); + + if (length >= min_len && + (length > DEFLATE_MIN_MATCH_LEN || + offset <= 4096)) { + /* Match found */ + deflate_choose_match(c, length, offset, true, + &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next + 1, + in_end, + length - 1, + next_hashes); + in_next += length; + } else { + /* No match found */ + deflate_choose_literal(c, *in_next++, true, + seq); + } + + /* Check if it's time to output another block. */ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } static forceinline void deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os, bool lazy2) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os, bool lazy2) { - const u8 *in_next = in; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - u32 next_hashes[2] = {0, 0}; - - hc_matchfinder_init(&c->p.g.hc_mf); - - do { - /* Starting a new DEFLATE block */ - - const u8 * const in_block_begin = in_next; - const u8 * const in_max_block_end = choose_max_block_end( - in_next, in_end, SOFT_MAX_BLOCK_LENGTH); - const u8 *next_recalc_min_len = - in_next + MIN(in_end - in_next, 10000); - struct deflate_sequence *seq = c->p.g.sequences; - unsigned min_len; - - init_block_split_stats(&c->split_stats); - deflate_begin_sequences(c, seq); - min_len = calculate_min_match_len(in_next, - in_max_block_end - in_next, - c->max_search_depth); - do { - unsigned cur_len; - unsigned cur_offset; - unsigned next_len; - unsigned next_offset; - - /* - * Recalculate the minimum match length if it hasn't - * been done recently. 
- */ - if (in_next >= next_recalc_min_len) { - min_len = recalculate_min_match_len( - &c->freqs, - c->max_search_depth); - next_recalc_min_len += - MIN(in_end - next_recalc_min_len, - in_next - in_block_begin); - } - - /* Find the longest match at the current position. */ - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - cur_len = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next, - min_len - 1, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - &cur_offset); - if (cur_len < min_len || - (cur_len == DEFLATE_MIN_MATCH_LEN && - cur_offset > 8192)) { - /* No match found. Choose a literal. */ - deflate_choose_literal(c, *in_next++, true, - seq); - continue; - } - in_next++; - -have_cur_match: - /* - * We have a match at the current position. - * If it's very long, choose it immediately. - */ - if (cur_len >= nice_len) { - deflate_choose_match(c, cur_len, cur_offset, - true, &seq); - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 1, - next_hashes); - in_next += cur_len - 1; - continue; - } - - /* - * Try to find a better match at the next position. - * - * Note: since we already have a match at the *current* - * position, we use only half the 'max_search_depth' - * when checking the *next* position. This is a useful - * trade-off because it's more worthwhile to use a - * greater search depth on the initial match. - * - * Note: it's possible to structure the code such that - * there's only one call to longest_match(), which - * handles both the "find the initial match" and "try to - * find a better match" cases. However, it is faster to - * have two call sites, with longest_match() inlined at - * each. - */ - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - next_len = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next++, - cur_len - 1, - max_len, - nice_len, - c->max_search_depth >> 1, - next_hashes, - &next_offset); - if (next_len >= cur_len && - 4 * (int)(next_len - cur_len) + - ((int)bsr32(cur_offset) - - (int)bsr32(next_offset)) > 2) { - /* - * Found a better match at the next position. - * Output a literal. Then the next match - * becomes the current match. - */ - deflate_choose_literal(c, *(in_next - 2), true, - seq); - cur_len = next_len; - cur_offset = next_offset; - goto have_cur_match; - } - - if (lazy2) { - /* In lazy2 mode, look ahead another position */ - adjust_max_and_nice_len(&max_len, &nice_len, - in_end - in_next); - next_len = hc_matchfinder_longest_match( - &c->p.g.hc_mf, - &in_cur_base, - in_next++, - cur_len - 1, - max_len, - nice_len, - c->max_search_depth >> 2, - next_hashes, - &next_offset); - if (next_len >= cur_len && - 4 * (int)(next_len - cur_len) + - ((int)bsr32(cur_offset) - - (int)bsr32(next_offset)) > 6) { - /* - * There's a much better match two - * positions ahead, so use two literals. - */ - deflate_choose_literal( - c, *(in_next - 3), true, seq); - deflate_choose_literal( - c, *(in_next - 2), true, seq); - cur_len = next_len; - cur_offset = next_offset; - goto have_cur_match; - } - /* - * No better match at either of the next 2 - * positions. Output the current match. - */ - deflate_choose_match(c, cur_len, cur_offset, - true, &seq); - if (cur_len > 3) { - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 3, - next_hashes); - in_next += cur_len - 3; - } - } else { /* !lazy2 */ - /* - * No better match at the next position. Output - * the current match. 
- */ - deflate_choose_match(c, cur_len, cur_offset, - true, &seq); - hc_matchfinder_skip_bytes(&c->p.g.hc_mf, - &in_cur_base, - in_next, - in_end, - cur_len - 2, - next_hashes); - in_next += cur_len - 2; - } - /* Check if it's time to output another block. */ - } while (in_next < in_max_block_end && - seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && - !should_end_block(&c->split_stats, - in_block_begin, in_next, in_end)); - - deflate_finish_block(c, os, in_block_begin, - in_next - in_block_begin, - c->p.g.sequences, in_next == in_end); - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + u32 next_hashes[2] = {0, 0}; + + hc_matchfinder_init(&c->p.g.hc_mf); + + do { + /* Starting a new DEFLATE block */ + + const u8 * const in_block_begin = in_next; + const u8 * const in_max_block_end = choose_max_block_end( + in_next, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *next_recalc_min_len = + in_next + MIN(in_end - in_next, 10000); + struct deflate_sequence *seq = c->p.g.sequences; + unsigned min_len; + + init_block_split_stats(&c->split_stats); + deflate_begin_sequences(c, seq); + min_len = calculate_min_match_len(in_next, + in_max_block_end - in_next, + c->max_search_depth); + do { + unsigned cur_len; + unsigned cur_offset; + unsigned next_len; + unsigned next_offset; + + /* + * Recalculate the minimum match length if it hasn't + * been done recently. + */ + if (in_next >= next_recalc_min_len) { + min_len = recalculate_min_match_len( + &c->freqs, + c->max_search_depth); + next_recalc_min_len += + MIN(in_end - next_recalc_min_len, + in_next - in_block_begin); + } + + /* Find the longest match at the current position. */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + cur_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next, + min_len - 1, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); + if (cur_len < min_len || + (cur_len == DEFLATE_MIN_MATCH_LEN && + cur_offset > 8192)) { + /* No match found. Choose a literal. */ + deflate_choose_literal(c, *in_next++, true, + seq); + continue; + } + in_next++; + + have_cur_match: + /* + * We have a match at the current position. + * If it's very long, choose it immediately. + */ + if (cur_len >= nice_len) { + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 1, + next_hashes); + in_next += cur_len - 1; + continue; + } + + /* + * Try to find a better match at the next position. + * + * Note: since we already have a match at the *current* + * position, we use only half the 'max_search_depth' + * when checking the *next* position. This is a useful + * trade-off because it's more worthwhile to use a + * greater search depth on the initial match. + * + * Note: it's possible to structure the code such that + * there's only one call to longest_match(), which + * handles both the "find the initial match" and "try to + * find a better match" cases. However, it is faster to + * have two call sites, with longest_match() inlined at + * each. 
+ */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 1, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 2) { + /* + * Found a better match at the next position. + * Output a literal. Then the next match + * becomes the current match. + */ + deflate_choose_literal(c, *(in_next - 2), true, + seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + + if (lazy2) { + /* In lazy2 mode, look ahead another position */ + adjust_max_and_nice_len(&max_len, &nice_len, + in_end - in_next); + next_len = hc_matchfinder_longest_match( + &c->p.g.hc_mf, + &in_cur_base, + in_next++, + cur_len - 1, + max_len, + nice_len, + c->max_search_depth >> 2, + next_hashes, + &next_offset); + if (next_len >= cur_len && + 4 * (int)(next_len - cur_len) + + ((int)bsr32(cur_offset) - + (int)bsr32(next_offset)) > 6) { + /* + * There's a much better match two + * positions ahead, so use two literals. + */ + deflate_choose_literal( + c, *(in_next - 3), true, seq); + deflate_choose_literal( + c, *(in_next - 2), true, seq); + cur_len = next_len; + cur_offset = next_offset; + goto have_cur_match; + } + /* + * No better match at either of the next 2 + * positions. Output the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + if (cur_len > 3) { + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 3, + next_hashes); + in_next += cur_len - 3; + } + } else { /* !lazy2 */ + /* + * No better match at the next position. Output + * the current match. + */ + deflate_choose_match(c, cur_len, cur_offset, + true, &seq); + hc_matchfinder_skip_bytes(&c->p.g.hc_mf, + &in_cur_base, + in_next, + in_end, + cur_len - 2, + next_hashes); + in_next += cur_len - 2; + } + /* Check if it's time to output another block. 
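The '4 * (next_len - cur_len) + (bsr32(cur_offset) - bsr32(next_offset)) > 2' test above weighs each extra byte of match length as about four points against roughly one point per halving of the offset, using the offset's highest set bit as a proxy for its extra-bit cost (the two-ahead check uses the stricter threshold 6). The short sketch below just evaluates that score for two made-up match pairs; bsr32() is implemented here with a GCC/Clang builtin and the sample numbers are illustrative only.

#include <stdint.h>
#include <stdio.h>

static unsigned
bsr32(uint32_t x)	/* index of the highest set bit; x must be nonzero */
{
	return 31 - (unsigned)__builtin_clz(x);
}

/* Nonzero if the match found one position ahead looks better. */
static int
prefer_next_match(unsigned cur_len, uint32_t cur_offset,
		  unsigned next_len, uint32_t next_offset)
{
	return next_len >= cur_len &&
	       4 * (int)(next_len - cur_len) +
	       ((int)bsr32(cur_offset) - (int)bsr32(next_offset)) > 2;
}

int main(void)
{
	/* Longer match at a much closer offset: 4*1 + (11 - 6) = 9 > 2 -> prefer it. */
	printf("%d\n", prefer_next_match(5, 4000, 6, 100));
	/* Same length, only slightly closer offset: 4*0 + (11 - 10) = 1 -> keep current. */
	printf("%d\n", prefer_next_match(5, 4000, 5, 2000));
	return 0;
}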
*/ + } while (in_next < in_max_block_end && + seq < &c->p.g.sequences[SEQ_STORE_LENGTH] && + !should_end_block(&c->split_stats, + in_block_begin, in_next, in_end)); + + deflate_finish_block(c, os, in_block_begin, + in_next - in_block_begin, + c->p.g.sequences, in_next == in_end); + } while (in_next != in_end && !os->overflow); } /* @@ -2813,10 +2813,10 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c, */ static void deflate_compress_lazy(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - deflate_compress_lazy_generic(c, in, in_nbytes, os, false); + deflate_compress_lazy_generic(c, in, in_nbytes, os, false); } /* @@ -2826,10 +2826,10 @@ deflate_compress_lazy(struct libdeflate_compressor * restrict c, */ static void deflate_compress_lazy2(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - deflate_compress_lazy_generic(c, in, in_nbytes, os, true); + deflate_compress_lazy_generic(c, in, in_nbytes, os, true); } #if SUPPORT_NEAR_OPTIMAL_PARSING @@ -2842,42 +2842,42 @@ deflate_compress_lazy2(struct libdeflate_compressor * restrict c, static void deflate_tally_item_list(struct libdeflate_compressor *c, u32 block_length) { - struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; - struct deflate_optimum_node *end_node = - &c->p.n.optimum_nodes[block_length]; - - do { - unsigned length = cur_node->item & OPTIMUM_LEN_MASK; - unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; - - if (length == 1) { - /* Literal */ - c->freqs.litlen[offset]++; - } else { - /* Match */ - c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + - deflate_length_slot[length]]++; - c->freqs.offset[c->p.n.offset_slot_full[offset]]++; - } - cur_node += length; - } while (cur_node != end_node); - - /* Tally the end-of-block symbol. */ - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + struct deflate_optimum_node *cur_node = &c->p.n.optimum_nodes[0]; + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + + do { + unsigned length = cur_node->item & OPTIMUM_LEN_MASK; + unsigned offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + + if (length == 1) { + /* Literal */ + c->freqs.litlen[offset]++; + } else { + /* Match */ + c->freqs.litlen[DEFLATE_FIRST_LEN_SYM + + deflate_length_slot[length]]++; + c->freqs.offset[c->p.n.offset_slot_full[offset]]++; + } + cur_node += length; + } while (cur_node != end_node); + + /* Tally the end-of-block symbol. 
*/ + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; } static void deflate_choose_all_literals(struct libdeflate_compressor *c, - const u8 *block, u32 block_length) + const u8 *block, u32 block_length) { - u32 i; - - deflate_reset_symbol_frequencies(c); - for (i = 0; i < block_length; i++) - c->freqs.litlen[block[i]]++; - c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; - - deflate_make_huffman_codes(&c->freqs, &c->codes); + u32 i; + + deflate_reset_symbol_frequencies(c); + for (i = 0; i < block_length; i++) + c->freqs.litlen[block[i]]++; + c->freqs.litlen[DEFLATE_END_OF_BLOCK]++; + + deflate_make_huffman_codes(&c->freqs, &c->codes); } /* @@ -2888,71 +2888,71 @@ deflate_choose_all_literals(struct libdeflate_compressor *c, static u32 deflate_compute_true_cost(struct libdeflate_compressor *c) { - u32 cost = 0; - unsigned sym; - - deflate_precompute_huffman_header(c); - - memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, - DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); - - cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); - for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { - cost += c->o.precode.freqs[sym] * - (c->o.precode.lens[sym] + - deflate_extra_precode_bits[sym]); - } - - for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) - cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; - - for (; sym < DEFLATE_FIRST_LEN_SYM + - ARRAY_LEN(deflate_extra_length_bits); sym++) - cost += c->freqs.litlen[sym] * - (c->codes.lens.litlen[sym] + - deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); - - for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) - cost += c->freqs.offset[sym] * - (c->codes.lens.offset[sym] + - deflate_extra_offset_bits[sym]); - return cost; + u32 cost = 0; + unsigned sym; + + deflate_precompute_huffman_header(c); + + memset(&c->codes.lens.litlen[c->o.precode.num_litlen_syms], 0, + DEFLATE_NUM_LITLEN_SYMS - c->o.precode.num_litlen_syms); + + cost += 5 + 5 + 4 + (3 * c->o.precode.num_explicit_lens); + for (sym = 0; sym < DEFLATE_NUM_PRECODE_SYMS; sym++) { + cost += c->o.precode.freqs[sym] * + (c->o.precode.lens[sym] + + deflate_extra_precode_bits[sym]); + } + + for (sym = 0; sym < DEFLATE_FIRST_LEN_SYM; sym++) + cost += c->freqs.litlen[sym] * c->codes.lens.litlen[sym]; + + for (; sym < DEFLATE_FIRST_LEN_SYM + + ARRAY_LEN(deflate_extra_length_bits); sym++) + cost += c->freqs.litlen[sym] * + (c->codes.lens.litlen[sym] + + deflate_extra_length_bits[sym - DEFLATE_FIRST_LEN_SYM]); + + for (sym = 0; sym < ARRAY_LEN(deflate_extra_offset_bits); sym++) + cost += c->freqs.offset[sym] * + (c->codes.lens.offset[sym] + + deflate_extra_offset_bits[sym]); + return cost; } /* Set the current cost model from the codeword lengths specified in @lens. */ static void deflate_set_costs_from_codes(struct libdeflate_compressor *c, - const struct deflate_lens *lens) + const struct deflate_lens *lens) { - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - u32 bits = (lens->litlen[i] ? - lens->litlen[i] : LITERAL_NOSTAT_BITS); - - c->p.n.costs.literal[i] = bits * BIT_COST; - } - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { - unsigned length_slot = deflate_length_slot[i]; - unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; - u32 bits = (lens->litlen[litlen_sym] ? 
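The '5 + 5 + 4 + (3 * num_explicit_lens)' term above is the dynamic-Huffman block header from RFC 1951: a 5-bit HLIT field, a 5-bit HDIST field, a 4-bit HCLEN field, plus 3 bits for each precode (code-length code) length that is explicitly transmitted. A trivial worked computation, with a made-up count of explicit precode lengths:

#include <stdio.h>

/*
 * Bits in a dynamic-Huffman block header, not counting the coded code
 * lengths themselves: HLIT(5) + HDIST(5) + HCLEN(4) + 3 per explicit
 * precode length.
 */
static unsigned
dynamic_header_bits(unsigned num_explicit_precode_lens)
{
	return 5 + 5 + 4 + 3 * num_explicit_precode_lens;
}

int main(void)
{
	/* If all 19 precode lengths were transmitted: 5 + 5 + 4 + 57 = 71 bits. */
	printf("%u\n", dynamic_header_bits(19));
	return 0;
}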
- lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); - - bits += deflate_extra_length_bits[length_slot]; - c->p.n.costs.length[i] = bits * BIT_COST; - } - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { - u32 bits = (lens->offset[i] ? - lens->offset[i] : OFFSET_NOSTAT_BITS); - - bits += deflate_extra_offset_bits[i]; - c->p.n.costs.offset_slot[i] = bits * BIT_COST; - } + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + u32 bits = (lens->litlen[i] ? + lens->litlen[i] : LITERAL_NOSTAT_BITS); + + c->p.n.costs.literal[i] = bits * BIT_COST; + } + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) { + unsigned length_slot = deflate_length_slot[i]; + unsigned litlen_sym = DEFLATE_FIRST_LEN_SYM + length_slot; + u32 bits = (lens->litlen[litlen_sym] ? + lens->litlen[litlen_sym] : LENGTH_NOSTAT_BITS); + + bits += deflate_extra_length_bits[length_slot]; + c->p.n.costs.length[i] = bits * BIT_COST; + } + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) { + u32 bits = (lens->offset[i] ? + lens->offset[i] : OFFSET_NOSTAT_BITS); + + bits += deflate_extra_offset_bits[i]; + c->p.n.costs.offset_slot[i] = bits * BIT_COST; + } } /* @@ -2962,14 +2962,14 @@ deflate_set_costs_from_codes(struct libdeflate_compressor *c, * * This table is indexed first by the estimated match probability: * - * i=0: data doesn't contain many matches [match_prob=0.25] - * i=1: neutral [match_prob=0.50] - * i=2: data contains lots of matches [match_prob=0.75] + * i=0: data doesn't contain many matches [match_prob=0.25] + * i=1: neutral [match_prob=0.50] + * i=2: data contains lots of matches [match_prob=0.75] * * This lookup produces a subtable which maps the number of distinct used * literals to the default cost of a literal symbol, i.e.: * - * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) + * int(-log2((1 - match_prob) / num_used_literals) * BIT_COST) * * ... for num_used_literals in [1, 256] (and 0, which is copied from 1). This * accounts for literals usually getting cheaper as the number of distinct @@ -2977,127 +2977,127 @@ deflate_set_costs_from_codes(struct libdeflate_compressor *c, * * The lookup also produces the cost of a length symbol, which is: * - * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) + * int(-log2(match_prob/NUM_LEN_SLOTS) * BIT_COST) * * Note: we don't currently assign different costs to different literal symbols, * or to different length symbols, as this is hard to do in a useful way. 
*/ static const struct { - u8 used_lits_to_lit_cost[257]; - u8 len_sym_cost; + u8 used_lits_to_lit_cost[257]; + u8 len_sym_cost; } default_litlen_costs[] = { - { /* match_prob = 0.25 */ - .used_lits_to_lit_cost = { - 6, 6, 22, 32, 38, 43, 48, 51, - 54, 57, 59, 61, 64, 65, 67, 69, - 70, 72, 73, 74, 75, 76, 77, 79, - 80, 80, 81, 82, 83, 84, 85, 85, - 86, 87, 88, 88, 89, 89, 90, 91, - 91, 92, 92, 93, 93, 94, 95, 95, - 96, 96, 96, 97, 97, 98, 98, 99, - 99, 99, 100, 100, 101, 101, 101, 102, - 102, 102, 103, 103, 104, 104, 104, 105, - 105, 105, 105, 106, 106, 106, 107, 107, - 107, 108, 108, 108, 108, 109, 109, 109, - 109, 110, 110, 110, 111, 111, 111, 111, - 112, 112, 112, 112, 112, 113, 113, 113, - 113, 114, 114, 114, 114, 114, 115, 115, - 115, 115, 115, 116, 116, 116, 116, 116, - 117, 117, 117, 117, 117, 118, 118, 118, - 118, 118, 118, 119, 119, 119, 119, 119, - 120, 120, 120, 120, 120, 120, 121, 121, - 121, 121, 121, 121, 121, 122, 122, 122, - 122, 122, 122, 123, 123, 123, 123, 123, - 123, 123, 124, 124, 124, 124, 124, 124, - 124, 125, 125, 125, 125, 125, 125, 125, - 125, 126, 126, 126, 126, 126, 126, 126, - 127, 127, 127, 127, 127, 127, 127, 127, - 128, 128, 128, 128, 128, 128, 128, 128, - 128, 129, 129, 129, 129, 129, 129, 129, - 129, 129, 130, 130, 130, 130, 130, 130, - 130, 130, 130, 131, 131, 131, 131, 131, - 131, 131, 131, 131, 131, 132, 132, 132, - 132, 132, 132, 132, 132, 132, 132, 133, - 133, 133, 133, 133, 133, 133, 133, 133, - 133, 134, 134, 134, 134, 134, 134, 134, - 134, - }, - .len_sym_cost = 109, - }, { /* match_prob = 0.5 */ - .used_lits_to_lit_cost = { - 16, 16, 32, 41, 48, 53, 57, 60, - 64, 66, 69, 71, 73, 75, 76, 78, - 80, 81, 82, 83, 85, 86, 87, 88, - 89, 90, 91, 92, 92, 93, 94, 95, - 96, 96, 97, 98, 98, 99, 99, 100, - 101, 101, 102, 102, 103, 103, 104, 104, - 105, 105, 106, 106, 107, 107, 108, 108, - 108, 109, 109, 110, 110, 110, 111, 111, - 112, 112, 112, 113, 113, 113, 114, 114, - 114, 115, 115, 115, 115, 116, 116, 116, - 117, 117, 117, 118, 118, 118, 118, 119, - 119, 119, 119, 120, 120, 120, 120, 121, - 121, 121, 121, 122, 122, 122, 122, 122, - 123, 123, 123, 123, 124, 124, 124, 124, - 124, 125, 125, 125, 125, 125, 126, 126, - 126, 126, 126, 127, 127, 127, 127, 127, - 128, 128, 128, 128, 128, 128, 129, 129, - 129, 129, 129, 129, 130, 130, 130, 130, - 130, 130, 131, 131, 131, 131, 131, 131, - 131, 132, 132, 132, 132, 132, 132, 133, - 133, 133, 133, 133, 133, 133, 134, 134, - 134, 134, 134, 134, 134, 134, 135, 135, - 135, 135, 135, 135, 135, 135, 136, 136, - 136, 136, 136, 136, 136, 136, 137, 137, - 137, 137, 137, 137, 137, 137, 138, 138, - 138, 138, 138, 138, 138, 138, 138, 139, - 139, 139, 139, 139, 139, 139, 139, 139, - 140, 140, 140, 140, 140, 140, 140, 140, - 140, 141, 141, 141, 141, 141, 141, 141, - 141, 141, 141, 142, 142, 142, 142, 142, - 142, 142, 142, 142, 142, 142, 143, 143, - 143, 143, 143, 143, 143, 143, 143, 143, - 144, - }, - .len_sym_cost = 93, - }, { /* match_prob = 0.75 */ - .used_lits_to_lit_cost = { - 32, 32, 48, 57, 64, 69, 73, 76, - 80, 82, 85, 87, 89, 91, 92, 94, - 96, 97, 98, 99, 101, 102, 103, 104, - 105, 106, 107, 108, 108, 109, 110, 111, - 112, 112, 113, 114, 114, 115, 115, 116, - 117, 117, 118, 118, 119, 119, 120, 120, - 121, 121, 122, 122, 123, 123, 124, 124, - 124, 125, 125, 126, 126, 126, 127, 127, - 128, 128, 128, 129, 129, 129, 130, 130, - 130, 131, 131, 131, 131, 132, 132, 132, - 133, 133, 133, 134, 134, 134, 134, 135, - 135, 135, 135, 136, 136, 136, 136, 137, - 137, 137, 137, 138, 138, 138, 138, 138, - 139, 139, 139, 139, 140, 140, 140, 
140, - 140, 141, 141, 141, 141, 141, 142, 142, - 142, 142, 142, 143, 143, 143, 143, 143, - 144, 144, 144, 144, 144, 144, 145, 145, - 145, 145, 145, 145, 146, 146, 146, 146, - 146, 146, 147, 147, 147, 147, 147, 147, - 147, 148, 148, 148, 148, 148, 148, 149, - 149, 149, 149, 149, 149, 149, 150, 150, - 150, 150, 150, 150, 150, 150, 151, 151, - 151, 151, 151, 151, 151, 151, 152, 152, - 152, 152, 152, 152, 152, 152, 153, 153, - 153, 153, 153, 153, 153, 153, 154, 154, - 154, 154, 154, 154, 154, 154, 154, 155, - 155, 155, 155, 155, 155, 155, 155, 155, - 156, 156, 156, 156, 156, 156, 156, 156, - 156, 157, 157, 157, 157, 157, 157, 157, - 157, 157, 157, 158, 158, 158, 158, 158, - 158, 158, 158, 158, 158, 158, 159, 159, - 159, 159, 159, 159, 159, 159, 159, 159, - 160, - }, - .len_sym_cost = 84, - }, + { /* match_prob = 0.25 */ + .used_lits_to_lit_cost = { + 6, 6, 22, 32, 38, 43, 48, 51, + 54, 57, 59, 61, 64, 65, 67, 69, + 70, 72, 73, 74, 75, 76, 77, 79, + 80, 80, 81, 82, 83, 84, 85, 85, + 86, 87, 88, 88, 89, 89, 90, 91, + 91, 92, 92, 93, 93, 94, 95, 95, + 96, 96, 96, 97, 97, 98, 98, 99, + 99, 99, 100, 100, 101, 101, 101, 102, + 102, 102, 103, 103, 104, 104, 104, 105, + 105, 105, 105, 106, 106, 106, 107, 107, + 107, 108, 108, 108, 108, 109, 109, 109, + 109, 110, 110, 110, 111, 111, 111, 111, + 112, 112, 112, 112, 112, 113, 113, 113, + 113, 114, 114, 114, 114, 114, 115, 115, + 115, 115, 115, 116, 116, 116, 116, 116, + 117, 117, 117, 117, 117, 118, 118, 118, + 118, 118, 118, 119, 119, 119, 119, 119, + 120, 120, 120, 120, 120, 120, 121, 121, + 121, 121, 121, 121, 121, 122, 122, 122, + 122, 122, 122, 123, 123, 123, 123, 123, + 123, 123, 124, 124, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 125, 125, + 125, 126, 126, 126, 126, 126, 126, 126, + 127, 127, 127, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 128, 128, + 128, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 133, 133, + 133, 134, 134, 134, 134, 134, 134, 134, + 134, + }, + .len_sym_cost = 109, + }, { /* match_prob = 0.5 */ + .used_lits_to_lit_cost = { + 16, 16, 32, 41, 48, 53, 57, 60, + 64, 66, 69, 71, 73, 75, 76, 78, + 80, 81, 82, 83, 85, 86, 87, 88, + 89, 90, 91, 92, 92, 93, 94, 95, + 96, 96, 97, 98, 98, 99, 99, 100, + 101, 101, 102, 102, 103, 103, 104, 104, + 105, 105, 106, 106, 107, 107, 108, 108, + 108, 109, 109, 110, 110, 110, 111, 111, + 112, 112, 112, 113, 113, 113, 114, 114, + 114, 115, 115, 115, 115, 116, 116, 116, + 117, 117, 117, 118, 118, 118, 118, 119, + 119, 119, 119, 120, 120, 120, 120, 121, + 121, 121, 121, 122, 122, 122, 122, 122, + 123, 123, 123, 123, 124, 124, 124, 124, + 124, 125, 125, 125, 125, 125, 126, 126, + 126, 126, 126, 127, 127, 127, 127, 127, + 128, 128, 128, 128, 128, 128, 129, 129, + 129, 129, 129, 129, 130, 130, 130, 130, + 130, 130, 131, 131, 131, 131, 131, 131, + 131, 132, 132, 132, 132, 132, 132, 133, + 133, 133, 133, 133, 133, 133, 134, 134, + 134, 134, 134, 134, 134, 134, 135, 135, + 135, 135, 135, 135, 135, 135, 136, 136, + 136, 136, 136, 136, 136, 136, 137, 137, + 137, 137, 137, 137, 137, 137, 138, 138, + 138, 138, 138, 138, 138, 138, 138, 139, + 139, 139, 139, 139, 139, 139, 139, 139, + 140, 140, 140, 140, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 142, 142, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 143, 143, + 143, 143, 143, 143, 143, 143, 143, 143, + 144, + }, + 
.len_sym_cost = 93, + }, { /* match_prob = 0.75 */ + .used_lits_to_lit_cost = { + 32, 32, 48, 57, 64, 69, 73, 76, + 80, 82, 85, 87, 89, 91, 92, 94, + 96, 97, 98, 99, 101, 102, 103, 104, + 105, 106, 107, 108, 108, 109, 110, 111, + 112, 112, 113, 114, 114, 115, 115, 116, + 117, 117, 118, 118, 119, 119, 120, 120, + 121, 121, 122, 122, 123, 123, 124, 124, + 124, 125, 125, 126, 126, 126, 127, 127, + 128, 128, 128, 129, 129, 129, 130, 130, + 130, 131, 131, 131, 131, 132, 132, 132, + 133, 133, 133, 134, 134, 134, 134, 135, + 135, 135, 135, 136, 136, 136, 136, 137, + 137, 137, 137, 138, 138, 138, 138, 138, + 139, 139, 139, 139, 140, 140, 140, 140, + 140, 141, 141, 141, 141, 141, 142, 142, + 142, 142, 142, 143, 143, 143, 143, 143, + 144, 144, 144, 144, 144, 144, 145, 145, + 145, 145, 145, 145, 146, 146, 146, 146, + 146, 146, 147, 147, 147, 147, 147, 147, + 147, 148, 148, 148, 148, 148, 148, 149, + 149, 149, 149, 149, 149, 149, 150, 150, + 150, 150, 150, 150, 150, 150, 151, 151, + 151, 151, 151, 151, 151, 151, 152, 152, + 152, 152, 152, 152, 152, 152, 153, 153, + 153, 153, 153, 153, 153, 153, 154, 154, + 154, 154, 154, 154, 154, 154, 154, 155, + 155, 155, 155, 155, 155, 155, 155, 155, + 156, 156, 156, 156, 156, 156, 156, 156, + 156, 157, 157, 157, 157, 157, 157, 157, + 157, 157, 157, 158, 158, 158, 158, 158, + 158, 158, 158, 158, 158, 158, 159, 159, + 159, 159, 159, 159, 159, 159, 159, 159, + 160, + }, + .len_sym_cost = 84, + }, }; /* @@ -3106,141 +3106,141 @@ static const struct { */ static void deflate_choose_default_litlen_costs(struct libdeflate_compressor *c, - const u8 *block_begin, u32 block_length, - u32 *lit_cost, u32 *len_sym_cost) + const u8 *block_begin, u32 block_length, + u32 *lit_cost, u32 *len_sym_cost) { - unsigned num_used_literals = 0; - u32 literal_freq = block_length; - u32 match_freq = 0; - u32 cutoff; - u32 i; - - /* Calculate the number of distinct literals that exist in the data. */ - memset(c->freqs.litlen, 0, - DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); - cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ - for (i = 0; i < block_length; i++) - c->freqs.litlen[block_begin[i]]++; - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { - if (c->freqs.litlen[i] > cutoff) - num_used_literals++; - } - if (num_used_literals == 0) - num_used_literals = 1; - - /* - * Estimate the relative frequency of literals and matches in the - * optimal parsing solution. We don't know the optimal solution, so - * this can only be a very rough estimate. Therefore, we basically use - * the match frequency from a greedy parse. We also apply the min_len - * heuristic used by the greedy and lazy parsers, to avoid counting too - * many matches when literals are cheaper than short matches. 
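The tables above can be reproduced from the formulas quoted in the comment. With BIT_COST = 16 (asserted below) and the 29 DEFLATE length symbols, int(-log2(match_prob / 29) * 16) gives 109, 93, and 84 for match_prob = 0.25, 0.5, and 0.75, and int(-log2((1 - match_prob) / num_used_literals) * 16) reproduces the literal costs, e.g. 144 for match_prob = 0.5 with all 256 literals used. A small recomputation sketch (constant names local to the sketch; compile with -lm):

#include <math.h>
#include <stdio.h>

#define BIT_COST	16	/* fixed-point scale used by the cost model */
#define NUM_LEN_SLOTS	29	/* DEFLATE length symbols 257..285 */

static unsigned
lit_cost(double match_prob, unsigned num_used_literals)
{
	return (unsigned)(-log2((1.0 - match_prob) / num_used_literals) * BIT_COST);
}

static unsigned
len_sym_cost(double match_prob)
{
	return (unsigned)(-log2(match_prob / NUM_LEN_SLOTS) * BIT_COST);
}

int main(void)
{
	/* Prints 109 93 84, matching the .len_sym_cost fields above. */
	printf("%u %u %u\n", len_sym_cost(0.25), len_sym_cost(0.5), len_sym_cost(0.75));
	/* Prints 144, the last used_lits_to_lit_cost entry for match_prob = 0.5. */
	printf("%u\n", lit_cost(0.5, 256));
	return 0;
}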
- */ - match_freq = 0; - i = choose_min_match_len(num_used_literals, c->max_search_depth); - for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { - match_freq += c->p.n.match_len_freqs[i]; - literal_freq -= i * c->p.n.match_len_freqs[i]; - } - if ((s32)literal_freq < 0) /* shouldn't happen */ - literal_freq = 0; - - if (match_freq > literal_freq) - i = 2; /* many matches */ - else if (match_freq * 4 > literal_freq) - i = 1; /* neutral */ - else - i = 0; /* few matches */ - - STATIC_ASSERT(BIT_COST == 16); - *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ - num_used_literals]; - *len_sym_cost = default_litlen_costs[i].len_sym_cost; + unsigned num_used_literals = 0; + u32 literal_freq = block_length; + u32 match_freq = 0; + u32 cutoff; + u32 i; + + /* Calculate the number of distinct literals that exist in the data. */ + memset(c->freqs.litlen, 0, + DEFLATE_NUM_LITERALS * sizeof(c->freqs.litlen[0])); + cutoff = literal_freq >> 11; /* Ignore literals used very rarely. */ + for (i = 0; i < block_length; i++) + c->freqs.litlen[block_begin[i]]++; + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) { + if (c->freqs.litlen[i] > cutoff) + num_used_literals++; + } + if (num_used_literals == 0) + num_used_literals = 1; + + /* + * Estimate the relative frequency of literals and matches in the + * optimal parsing solution. We don't know the optimal solution, so + * this can only be a very rough estimate. Therefore, we basically use + * the match frequency from a greedy parse. We also apply the min_len + * heuristic used by the greedy and lazy parsers, to avoid counting too + * many matches when literals are cheaper than short matches. + */ + match_freq = 0; + i = choose_min_match_len(num_used_literals, c->max_search_depth); + for (; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + match_freq += c->p.n.match_len_freqs[i]; + literal_freq -= i * c->p.n.match_len_freqs[i]; + } + if ((s32)literal_freq < 0) /* shouldn't happen */ + literal_freq = 0; + + if (match_freq > literal_freq) + i = 2; /* many matches */ + else if (match_freq * 4 > literal_freq) + i = 1; /* neutral */ + else + i = 0; /* few matches */ + + STATIC_ASSERT(BIT_COST == 16); + *lit_cost = default_litlen_costs[i].used_lits_to_lit_cost[ + num_used_literals]; + *len_sym_cost = default_litlen_costs[i].len_sym_cost; } static forceinline u32 deflate_default_length_cost(unsigned len, u32 len_sym_cost) { - unsigned slot = deflate_length_slot[len]; - u32 num_extra_bits = deflate_extra_length_bits[slot]; - - return len_sym_cost + (num_extra_bits * BIT_COST); + unsigned slot = deflate_length_slot[len]; + u32 num_extra_bits = deflate_extra_length_bits[slot]; + + return len_sym_cost + (num_extra_bits * BIT_COST); } static forceinline u32 deflate_default_offset_slot_cost(unsigned slot) { - u32 num_extra_bits = deflate_extra_offset_bits[slot]; - /* - * Assume that all offset symbols are equally probable. - * The resulting cost is 'int(-log2(1/30) * BIT_COST)', - * where 30 is the number of potentially-used offset symbols. - */ - u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; - - return offset_sym_cost + (num_extra_bits * BIT_COST); + u32 num_extra_bits = deflate_extra_offset_bits[slot]; + /* + * Assume that all offset symbols are equally probable. + * The resulting cost is 'int(-log2(1/30) * BIT_COST)', + * where 30 is the number of potentially-used offset symbols. 
+ */ + u32 offset_sym_cost = 4*BIT_COST + (907*BIT_COST)/1000; + + return offset_sym_cost + (num_extra_bits * BIT_COST); } /* Set default symbol costs for the first block's first optimization pass. */ static void deflate_set_default_costs(struct libdeflate_compressor *c, - u32 lit_cost, u32 len_sym_cost) + u32 lit_cost, u32 len_sym_cost) { - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - c->p.n.costs.literal[i] = lit_cost; - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - c->p.n.costs.length[i] = - deflate_default_length_cost(i, len_sym_cost); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - c->p.n.costs.offset_slot[i] = - deflate_default_offset_slot_cost(i); + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + c->p.n.costs.literal[i] = lit_cost; + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + c->p.n.costs.length[i] = + deflate_default_length_cost(i, len_sym_cost); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + c->p.n.costs.offset_slot[i] = + deflate_default_offset_slot_cost(i); } static forceinline void deflate_adjust_cost(u32 *cost_p, u32 default_cost, int change_amount) { - if (change_amount == 0) - /* Block is very similar to previous; prefer previous costs. */ - *cost_p = (default_cost + 3 * *cost_p) / 4; - else if (change_amount == 1) - *cost_p = (default_cost + *cost_p) / 2; - else if (change_amount == 2) - *cost_p = (5 * default_cost + 3 * *cost_p) / 8; - else - /* Block differs greatly from previous; prefer default costs. */ - *cost_p = (3 * default_cost + *cost_p) / 4; + if (change_amount == 0) + /* Block is very similar to previous; prefer previous costs. */ + *cost_p = (default_cost + 3 * *cost_p) / 4; + else if (change_amount == 1) + *cost_p = (default_cost + *cost_p) / 2; + else if (change_amount == 2) + *cost_p = (5 * default_cost + 3 * *cost_p) / 8; + else + /* Block differs greatly from previous; prefer default costs. 
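As a quick sanity check of the constant above: log2(30) is about 4.907, so int(-log2(1/30) * 16) = 78, and the integer expression 4*BIT_COST + (907*BIT_COST)/1000 = 64 + 14 = 78 as well (note the integer division). A throwaway verification, compiled with -lm:

#include <math.h>
#include <stdio.h>

#define BIT_COST 16

int main(void)
{
	unsigned from_formula = (unsigned)(-log2(1.0 / 30.0) * BIT_COST);
	unsigned from_code    = 4 * BIT_COST + (907 * BIT_COST) / 1000;

	printf("%u %u\n", from_formula, from_code);	/* both print 78 */
	return 0;
}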
*/ + *cost_p = (3 * default_cost + *cost_p) / 4; } static forceinline void deflate_adjust_costs_impl(struct libdeflate_compressor *c, - u32 lit_cost, u32 len_sym_cost, int change_amount) + u32 lit_cost, u32 len_sym_cost, int change_amount) { - unsigned i; - - /* Literals */ - for (i = 0; i < DEFLATE_NUM_LITERALS; i++) - deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, - change_amount); - - /* Lengths */ - for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) - deflate_adjust_cost(&c->p.n.costs.length[i], - deflate_default_length_cost(i, - len_sym_cost), - change_amount); - - /* Offset slots */ - for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) - deflate_adjust_cost(&c->p.n.costs.offset_slot[i], - deflate_default_offset_slot_cost(i), - change_amount); + unsigned i; + + /* Literals */ + for (i = 0; i < DEFLATE_NUM_LITERALS; i++) + deflate_adjust_cost(&c->p.n.costs.literal[i], lit_cost, + change_amount); + + /* Lengths */ + for (i = DEFLATE_MIN_MATCH_LEN; i <= DEFLATE_MAX_MATCH_LEN; i++) + deflate_adjust_cost(&c->p.n.costs.length[i], + deflate_default_length_cost(i, + len_sym_cost), + change_amount); + + /* Offset slots */ + for (i = 0; i < ARRAY_LEN(deflate_offset_slot_base); i++) + deflate_adjust_cost(&c->p.n.costs.offset_slot[i], + deflate_default_offset_slot_cost(i), + change_amount); } /* @@ -3254,59 +3254,59 @@ deflate_adjust_costs_impl(struct libdeflate_compressor *c, */ static void deflate_adjust_costs(struct libdeflate_compressor *c, - u32 lit_cost, u32 len_sym_cost) + u32 lit_cost, u32 len_sym_cost) { - u64 total_delta = 0; - u64 cutoff; - int i; - - /* - * Decide how different the current block is from the previous block, - * using the block splitting statistics from the current and previous - * blocks. The more different the current block is, the more we prefer - * the default costs rather than the previous block's costs. - * - * The algorithm here is similar to the end-of-block check one, but here - * we compare two entire blocks rather than a partial block with a small - * extra part, and therefore we need 64-bit numbers in some places. - */ - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { - u64 prev = (u64)c->p.n.prev_observations[i] * - c->split_stats.num_observations; - u64 cur = (u64)c->split_stats.observations[i] * - c->p.n.prev_num_observations; - - total_delta += prev > cur ? prev - cur : cur - prev; - } - cutoff = ((u64)c->p.n.prev_num_observations * - c->split_stats.num_observations * 200) / 512; - - if (total_delta > 3 * cutoff) - /* Big change in the data; just use the default costs. */ - deflate_set_default_costs(c, lit_cost, len_sym_cost); - else if (4 * total_delta > 9 * cutoff) - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); - else if (2 * total_delta > 3 * cutoff) - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); - else if (2 * total_delta > cutoff) - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); - else - deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); + u64 total_delta = 0; + u64 cutoff; + int i; + + /* + * Decide how different the current block is from the previous block, + * using the block splitting statistics from the current and previous + * blocks. The more different the current block is, the more we prefer + * the default costs rather than the previous block's costs. + * + * The algorithm here is similar to the end-of-block check one, but here + * we compare two entire blocks rather than a partial block with a small + * extra part, and therefore we need 64-bit numbers in some places. 
+ */ + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) { + u64 prev = (u64)c->p.n.prev_observations[i] * + c->split_stats.num_observations; + u64 cur = (u64)c->split_stats.observations[i] * + c->p.n.prev_num_observations; + + total_delta += prev > cur ? prev - cur : cur - prev; + } + cutoff = ((u64)c->p.n.prev_num_observations * + c->split_stats.num_observations * 200) / 512; + + if (total_delta > 3 * cutoff) + /* Big change in the data; just use the default costs. */ + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else if (4 * total_delta > 9 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 3); + else if (2 * total_delta > 3 * cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 2); + else if (2 * total_delta > cutoff) + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 1); + else + deflate_adjust_costs_impl(c, lit_cost, len_sym_cost, 0); } static void deflate_set_initial_costs(struct libdeflate_compressor *c, - const u8 *block_begin, u32 block_length, - bool is_first_block) + const u8 *block_begin, u32 block_length, + bool is_first_block) { - u32 lit_cost, len_sym_cost; - - deflate_choose_default_litlen_costs(c, block_begin, block_length, - &lit_cost, &len_sym_cost); - if (is_first_block) - deflate_set_default_costs(c, lit_cost, len_sym_cost); - else - deflate_adjust_costs(c, lit_cost, len_sym_cost); + u32 lit_cost, len_sym_cost; + + deflate_choose_default_litlen_costs(c, block_begin, block_length, + &lit_cost, &len_sym_cost); + if (is_first_block) + deflate_set_default_costs(c, lit_cost, len_sym_cost); + else + deflate_adjust_costs(c, lit_cost, len_sym_cost); } /* @@ -3325,76 +3325,76 @@ deflate_set_initial_costs(struct libdeflate_compressor *c, */ static void deflate_find_min_cost_path(struct libdeflate_compressor *c, - const u32 block_length, - const struct lz_match *cache_ptr) + const u32 block_length, + const struct lz_match *cache_ptr) { - struct deflate_optimum_node *end_node = - &c->p.n.optimum_nodes[block_length]; - struct deflate_optimum_node *cur_node = end_node; - - cur_node->cost_to_end = 0; - do { - unsigned num_matches; - unsigned literal; - u32 best_cost_to_end; - - cur_node--; - cache_ptr--; - - num_matches = cache_ptr->length; - literal = cache_ptr->offset; - - /* It's always possible to choose a literal. */ - best_cost_to_end = c->p.n.costs.literal[literal] + - (cur_node + 1)->cost_to_end; - cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; - - /* Also consider matches if there are any. */ - if (num_matches) { - const struct lz_match *match; - unsigned len; - unsigned offset; - unsigned offset_slot; - u32 offset_cost; - u32 cost_to_end; - - /* - * Consider each length from the minimum - * (DEFLATE_MIN_MATCH_LEN) to the length of the longest - * match found at this position. For each length, we - * consider only the smallest offset for which that - * length is available. Although this is not guaranteed - * to be optimal due to the possibility of a larger - * offset costing less than a smaller offset to code, - * this is a very useful heuristic. 
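Because the two histograms being compared above may have been built from different numbers of observations, each side is cross-multiplied by the other side's observation count before the absolute differences are summed, and the cutoff is scaled the same way. The hedged sketch below replays that arithmetic for two made-up 4-bucket histograms; the bucket count and values are invented for illustration and do not reflect libdeflate's NUM_OBSERVATION_TYPES.

#include <stdint.h>
#include <stdio.h>

#define N 4	/* invented bucket count, just for this example */

/*
 * Decide how different the current histogram is from the previous one,
 * mirroring the scaled comparison above: 0 = very similar ... 4 = very
 * different (the caller would then blend toward the default costs).
 */
static int
histogram_change_amount(const uint32_t prev[N], uint32_t prev_total,
			const uint32_t cur[N], uint32_t cur_total)
{
	uint64_t total_delta = 0, cutoff;
	int i;

	for (i = 0; i < N; i++) {
		uint64_t p = (uint64_t)prev[i] * cur_total;
		uint64_t c = (uint64_t)cur[i] * prev_total;

		total_delta += p > c ? p - c : c - p;
	}
	cutoff = ((uint64_t)prev_total * cur_total * 200) / 512;

	if (total_delta > 3 * cutoff)
		return 4;	/* big change: just use the default costs */
	if (4 * total_delta > 9 * cutoff)
		return 3;
	if (2 * total_delta > 3 * cutoff)
		return 2;
	if (2 * total_delta > cutoff)
		return 1;
	return 0;
}

int main(void)
{
	uint32_t prev[N] = { 50, 30, 15, 5 }, cur[N] = { 10, 20, 30, 40 };

	/* total_delta = 10000, cutoff = 3906 -> prints 3 (quite different). */
	printf("%d\n", histogram_change_amount(prev, 100, cur, 100));
	return 0;
}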
- */ - match = cache_ptr - num_matches; - len = DEFLATE_MIN_MATCH_LEN; - do { - offset = match->offset; - offset_slot = c->p.n.offset_slot_full[offset]; - offset_cost = - c->p.n.costs.offset_slot[offset_slot]; - do { - cost_to_end = offset_cost + - c->p.n.costs.length[len] + - (cur_node + len)->cost_to_end; - if (cost_to_end < best_cost_to_end) { - best_cost_to_end = cost_to_end; - cur_node->item = len | - ((u32)offset << - OPTIMUM_OFFSET_SHIFT); - } - } while (++len <= match->length); - } while (++match != cache_ptr); - cache_ptr -= num_matches; - } - cur_node->cost_to_end = best_cost_to_end; - } while (cur_node != &c->p.n.optimum_nodes[0]); - - deflate_reset_symbol_frequencies(c); - deflate_tally_item_list(c, block_length); - deflate_make_huffman_codes(&c->freqs, &c->codes); + struct deflate_optimum_node *end_node = + &c->p.n.optimum_nodes[block_length]; + struct deflate_optimum_node *cur_node = end_node; + + cur_node->cost_to_end = 0; + do { + unsigned num_matches; + unsigned literal; + u32 best_cost_to_end; + + cur_node--; + cache_ptr--; + + num_matches = cache_ptr->length; + literal = cache_ptr->offset; + + /* It's always possible to choose a literal. */ + best_cost_to_end = c->p.n.costs.literal[literal] + + (cur_node + 1)->cost_to_end; + cur_node->item = ((u32)literal << OPTIMUM_OFFSET_SHIFT) | 1; + + /* Also consider matches if there are any. */ + if (num_matches) { + const struct lz_match *match; + unsigned len; + unsigned offset; + unsigned offset_slot; + u32 offset_cost; + u32 cost_to_end; + + /* + * Consider each length from the minimum + * (DEFLATE_MIN_MATCH_LEN) to the length of the longest + * match found at this position. For each length, we + * consider only the smallest offset for which that + * length is available. Although this is not guaranteed + * to be optimal due to the possibility of a larger + * offset costing less than a smaller offset to code, + * this is a very useful heuristic. 
+ */ + match = cache_ptr - num_matches; + len = DEFLATE_MIN_MATCH_LEN; + do { + offset = match->offset; + offset_slot = c->p.n.offset_slot_full[offset]; + offset_cost = + c->p.n.costs.offset_slot[offset_slot]; + do { + cost_to_end = offset_cost + + c->p.n.costs.length[len] + + (cur_node + len)->cost_to_end; + if (cost_to_end < best_cost_to_end) { + best_cost_to_end = cost_to_end; + cur_node->item = len | + ((u32)offset << + OPTIMUM_OFFSET_SHIFT); + } + } while (++len <= match->length); + } while (++match != cache_ptr); + cache_ptr -= num_matches; + } + cur_node->cost_to_end = best_cost_to_end; + } while (cur_node != &c->p.n.optimum_nodes[0]); + + deflate_reset_symbol_frequencies(c); + deflate_tally_item_list(c, block_length); + deflate_make_huffman_codes(&c->freqs, &c->codes); } /* @@ -3414,139 +3414,139 @@ deflate_find_min_cost_path(struct libdeflate_compressor *c, */ static void deflate_optimize_and_flush_block(struct libdeflate_compressor *c, - struct deflate_output_bitstream *os, - const u8 *block_begin, u32 block_length, - const struct lz_match *cache_ptr, - bool is_first_block, bool is_final_block, - bool *used_only_literals) + struct deflate_output_bitstream *os, + const u8 *block_begin, u32 block_length, + const struct lz_match *cache_ptr, + bool is_first_block, bool is_final_block, + bool *used_only_literals) { - unsigned num_passes_remaining = c->p.n.max_optim_passes; - u32 best_true_cost = UINT32_MAX; - u32 true_cost; - u32 only_lits_cost; - u32 static_cost = UINT32_MAX; - struct deflate_sequence seq_; - struct deflate_sequence *seq = NULL; - u32 i; - - /* - * On some data, using only literals (no matches) ends up being better - * than what the iterative optimization algorithm produces. Therefore, - * consider using only literals. - */ - deflate_choose_all_literals(c, block_begin, block_length); - only_lits_cost = deflate_compute_true_cost(c); - - /* - * Force the block to really end at the desired length, even if some - * matches extend beyond it. - */ - for (i = block_length; - i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, - ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) - c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; - - /* - * Sometimes a static Huffman block ends up being cheapest, particularly - * if the block is small. So, if the block is sufficiently small, find - * the optimal static block solution and remember its cost. - */ - if (block_length <= c->p.n.max_len_to_optimize_static_block) { - /* Save c->p.n.costs temporarily. */ - c->p.n.costs_saved = c->p.n.costs; - - deflate_set_costs_from_codes(c, &c->static_codes.lens); - deflate_find_min_cost_path(c, block_length, cache_ptr); - static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; - static_cost += 7; /* for the end-of-block symbol */ - - /* Restore c->p.n.costs. */ - c->p.n.costs = c->p.n.costs_saved; - } - - /* Initialize c->p.n.costs with default costs. */ - deflate_set_initial_costs(c, block_begin, block_length, is_first_block); - - do { - /* - * Find the minimum-cost path for this pass. - * Also set c->freqs and c->codes to match the path. - */ - deflate_find_min_cost_path(c, block_length, cache_ptr); - - /* - * Compute the exact cost of the block if the path were to be - * used. Note that this differs from - * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses - * the actual Huffman codes instead of c->p.n.costs. 
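Stripped of the match cache and the DEFLATE-specific cost model, the backward pass above is a standard dynamic-programming recurrence: cost_to_end[i] = min(literal_cost(i) + cost_to_end[i+1], min over available matches of match_cost + cost_to_end[i+len]). The hedged toy sketch below runs that recurrence over an 8-byte block where every literal costs 8 "bits" and each position carries at most one candidate match, a simplification of the real code, which considers every usable length of every cached match.

#include <stdint.h>
#include <stdio.h>

#define BLOCK_LEN 8

struct toy_match {
	unsigned len;	/* 0 = no match available at this position */
	unsigned cost;	/* total bits to code this match */
};

int main(void)
{
	/* Toy input: a length-4 match costing 20 bits is available at
	 * position 2; literals cost 8 bits each. */
	struct toy_match matches[BLOCK_LEN] = { { 0, 0 } };
	uint32_t cost_to_end[BLOCK_LEN + 1];
	int i;

	matches[2].len = 4;
	matches[2].cost = 20;

	cost_to_end[BLOCK_LEN] = 0;
	for (i = BLOCK_LEN - 1; i >= 0; i--) {
		uint32_t best = 8 + cost_to_end[i + 1];		/* literal choice */

		if (matches[i].len && i + matches[i].len <= BLOCK_LEN) {
			uint32_t c = matches[i].cost +
				     cost_to_end[i + matches[i].len];
			if (c < best)
				best = c;
		}
		cost_to_end[i] = best;
	}
	/* 2 literals + the 20-bit match + 2 literals = 16 + 20 + 16 = 52 bits. */
	printf("%u\n", cost_to_end[0]);
	return 0;
}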
- */ - true_cost = deflate_compute_true_cost(c); - - /* - * If the cost didn't improve much from the previous pass, then - * doing more passes probably won't be helpful, so stop early. - */ - if (true_cost + c->p.n.min_improvement_to_continue > - best_true_cost) - break; - - best_true_cost = true_cost; - - /* Save the cost model that gave 'best_true_cost'. */ - c->p.n.costs_saved = c->p.n.costs; - - /* Update the cost model from the Huffman codes. */ - deflate_set_costs_from_codes(c, &c->codes.lens); - - } while (--num_passes_remaining); - - *used_only_literals = false; - if (MIN(only_lits_cost, static_cost) < best_true_cost) { - if (only_lits_cost < static_cost) { - /* Using only literals ended up being best! */ - deflate_choose_all_literals(c, block_begin, block_length); - deflate_set_costs_from_codes(c, &c->codes.lens); - seq_.litrunlen_and_length = block_length; - seq = &seq_; - *used_only_literals = true; - } else { - /* Static block ended up being best! */ - deflate_set_costs_from_codes(c, &c->static_codes.lens); - deflate_find_min_cost_path(c, block_length, cache_ptr); - } - } else if (true_cost >= - best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { - /* - * The best solution was actually from a non-final optimization - * pass, so recover and use the min-cost path from that pass. - */ - c->p.n.costs = c->p.n.costs_saved; - deflate_find_min_cost_path(c, block_length, cache_ptr); - deflate_set_costs_from_codes(c, &c->codes.lens); - } - deflate_flush_block(c, os, block_begin, block_length, seq, - is_final_block); + unsigned num_passes_remaining = c->p.n.max_optim_passes; + u32 best_true_cost = UINT32_MAX; + u32 true_cost; + u32 only_lits_cost; + u32 static_cost = UINT32_MAX; + struct deflate_sequence seq_; + struct deflate_sequence *seq = NULL; + u32 i; + + /* + * On some data, using only literals (no matches) ends up being better + * than what the iterative optimization algorithm produces. Therefore, + * consider using only literals. + */ + deflate_choose_all_literals(c, block_begin, block_length); + only_lits_cost = deflate_compute_true_cost(c); + + /* + * Force the block to really end at the desired length, even if some + * matches extend beyond it. + */ + for (i = block_length; + i <= MIN(block_length - 1 + DEFLATE_MAX_MATCH_LEN, + ARRAY_LEN(c->p.n.optimum_nodes) - 1); i++) + c->p.n.optimum_nodes[i].cost_to_end = 0x80000000; + + /* + * Sometimes a static Huffman block ends up being cheapest, particularly + * if the block is small. So, if the block is sufficiently small, find + * the optimal static block solution and remember its cost. + */ + if (block_length <= c->p.n.max_len_to_optimize_static_block) { + /* Save c->p.n.costs temporarily. */ + c->p.n.costs_saved = c->p.n.costs; + + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + static_cost = c->p.n.optimum_nodes[0].cost_to_end / BIT_COST; + static_cost += 7; /* for the end-of-block symbol */ + + /* Restore c->p.n.costs. */ + c->p.n.costs = c->p.n.costs_saved; + } + + /* Initialize c->p.n.costs with default costs. */ + deflate_set_initial_costs(c, block_begin, block_length, is_first_block); + + do { + /* + * Find the minimum-cost path for this pass. + * Also set c->freqs and c->codes to match the path. + */ + deflate_find_min_cost_path(c, block_length, cache_ptr); + + /* + * Compute the exact cost of the block if the path were to be + * used. 
Note that this differs from + * c->p.n.optimum_nodes[0].cost_to_end in that true_cost uses + * the actual Huffman codes instead of c->p.n.costs. + */ + true_cost = deflate_compute_true_cost(c); + + /* + * If the cost didn't improve much from the previous pass, then + * doing more passes probably won't be helpful, so stop early. + */ + if (true_cost + c->p.n.min_improvement_to_continue > + best_true_cost) + break; + + best_true_cost = true_cost; + + /* Save the cost model that gave 'best_true_cost'. */ + c->p.n.costs_saved = c->p.n.costs; + + /* Update the cost model from the Huffman codes. */ + deflate_set_costs_from_codes(c, &c->codes.lens); + + } while (--num_passes_remaining); + + *used_only_literals = false; + if (MIN(only_lits_cost, static_cost) < best_true_cost) { + if (only_lits_cost < static_cost) { + /* Using only literals ended up being best! */ + deflate_choose_all_literals(c, block_begin, block_length); + deflate_set_costs_from_codes(c, &c->codes.lens); + seq_.litrunlen_and_length = block_length; + seq = &seq_; + *used_only_literals = true; + } else { + /* Static block ended up being best! */ + deflate_set_costs_from_codes(c, &c->static_codes.lens); + deflate_find_min_cost_path(c, block_length, cache_ptr); + } + } else if (true_cost >= + best_true_cost + c->p.n.min_bits_to_use_nonfinal_path) { + /* + * The best solution was actually from a non-final optimization + * pass, so recover and use the min-cost path from that pass. + */ + c->p.n.costs = c->p.n.costs_saved; + deflate_find_min_cost_path(c, block_length, cache_ptr); + deflate_set_costs_from_codes(c, &c->codes.lens); + } + deflate_flush_block(c, os, block_begin, block_length, seq, + is_final_block); } static void deflate_near_optimal_init_stats(struct libdeflate_compressor *c) { - init_block_split_stats(&c->split_stats); - memset(c->p.n.new_match_len_freqs, 0, - sizeof(c->p.n.new_match_len_freqs)); - memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); + init_block_split_stats(&c->split_stats); + memset(c->p.n.new_match_len_freqs, 0, + sizeof(c->p.n.new_match_len_freqs)); + memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); } static void deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) { - unsigned i; - - merge_new_observations(&c->split_stats); - for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { - c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; - c->p.n.new_match_len_freqs[i] = 0; - } + unsigned i; + + merge_new_observations(&c->split_stats); + for (i = 0; i < ARRAY_LEN(c->p.n.match_len_freqs); i++) { + c->p.n.match_len_freqs[i] += c->p.n.new_match_len_freqs[i]; + c->p.n.new_match_len_freqs[i] = 0; + } } /* @@ -3557,22 +3557,22 @@ deflate_near_optimal_merge_stats(struct libdeflate_compressor *c) static void deflate_near_optimal_save_stats(struct libdeflate_compressor *c) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) - c->p.n.prev_observations[i] = c->split_stats.observations[i]; - c->p.n.prev_num_observations = c->split_stats.num_observations; + int i; + + for (i = 0; i < NUM_OBSERVATION_TYPES; i++) + c->p.n.prev_observations[i] = c->split_stats.observations[i]; + c->p.n.prev_num_observations = c->split_stats.num_observations; } static void deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) { - int i; - - for (i = 0; i < NUM_OBSERVATION_TYPES; i++) - c->split_stats.observations[i] = 0; - c->split_stats.num_observations = 0; - memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); + int i; + + for (i = 0; i < 
NUM_OBSERVATION_TYPES; i++) + c->split_stats.observations[i] = 0; + c->split_stats.num_observations = 0; + memset(c->p.n.match_len_freqs, 0, sizeof(c->p.n.match_len_freqs)); } /* @@ -3590,538 +3590,538 @@ deflate_near_optimal_clear_old_stats(struct libdeflate_compressor *c) */ static void deflate_compress_near_optimal(struct libdeflate_compressor * restrict c, - const u8 *in, size_t in_nbytes, - struct deflate_output_bitstream *os) + const u8 *in, size_t in_nbytes, + struct deflate_output_bitstream *os) { - const u8 *in_next = in; - const u8 *in_block_begin = in_next; - const u8 *in_end = in_next + in_nbytes; - const u8 *in_cur_base = in_next; - const u8 *in_next_slide = - in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); - unsigned max_len = DEFLATE_MAX_MATCH_LEN; - unsigned nice_len = MIN(c->nice_match_length, max_len); - struct lz_match *cache_ptr = c->p.n.match_cache; - u32 next_hashes[2] = {0, 0}; - bool prev_block_used_only_literals = false; - - bt_matchfinder_init(&c->p.n.bt_mf); - deflate_near_optimal_init_stats(c); - - do { - /* Starting a new DEFLATE block */ - const u8 * const in_max_block_end = choose_max_block_end( - in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); - const u8 *prev_end_block_check = NULL; - bool change_detected = false; - const u8 *next_observation = in_next; - unsigned min_len; - - /* - * Use the minimum match length heuristic to improve the - * literal/match statistics gathered during matchfinding. - * However, the actual near-optimal parse won't respect min_len, - * as it can accurately assess the costs of different matches. - * - * If the "use only literals" strategy happened to be the best - * strategy on the previous block, then probably the - * min_match_len heuristic is still not aggressive enough for - * the data, so force gathering literal stats only. - */ - if (prev_block_used_only_literals) - min_len = DEFLATE_MAX_MATCH_LEN + 1; - else - min_len = calculate_min_match_len( - in_block_begin, - in_max_block_end - in_block_begin, - c->max_search_depth); - - /* - * Find matches until we decide to end the block. We end the - * block if any of the following is true: - * - * (1) Maximum block length has been reached - * (2) Match catch may overflow. - * (3) Block split heuristic says to split now. - */ - for (;;) { - struct lz_match *matches; - unsigned best_len; - size_t remaining = in_end - in_next; - - /* Slide the window forward if needed. */ - if (in_next == in_next_slide) { - bt_matchfinder_slide_window(&c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + - MIN(remaining, MATCHFINDER_WINDOW_SIZE); - } - - /* - * Find matches with the current position using the - * binary tree matchfinder and save them in match_cache. - * - * Note: the binary tree matchfinder is more suited for - * optimal parsing than the hash chain matchfinder. The - * reasons for this include: - * - * - The binary tree matchfinder can find more matches - * in the same number of steps. - * - One of the major advantages of hash chains is that - * skipping positions (not searching for matches at - * them) is faster; however, with optimal parsing we - * search for matches at almost all positions, so this - * advantage of hash chains is negated. 
- */ - matches = cache_ptr; - best_len = 0; - adjust_max_and_nice_len(&max_len, &nice_len, remaining); - if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { - cache_ptr = bt_matchfinder_get_matches( - &c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - max_len, - nice_len, - c->max_search_depth, - next_hashes, - matches); - if (cache_ptr > matches) - best_len = cache_ptr[-1].length; - } - if (in_next >= next_observation) { - if (best_len >= min_len) { - observe_match(&c->split_stats, - best_len); - next_observation = in_next + best_len; - c->p.n.new_match_len_freqs[best_len]++; - } else { - observe_literal(&c->split_stats, - *in_next); - next_observation = in_next + 1; - } - } - - cache_ptr->length = cache_ptr - matches; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - - /* - * If there was a very long match found, don't cache any - * matches for the bytes covered by that match. This - * avoids degenerate behavior when compressing highly - * redundant data, where the number of matches can be - * very large. - * - * This heuristic doesn't actually hurt the compression - * ratio very much. If there's a long match, then the - * data must be highly compressible, so it doesn't - * matter much what we do. - */ - if (best_len >= DEFLATE_MIN_MATCH_LEN && - best_len >= nice_len) { - --best_len; - do { - remaining = in_end - in_next; - if (in_next == in_next_slide) { - bt_matchfinder_slide_window( - &c->p.n.bt_mf); - in_cur_base = in_next; - in_next_slide = in_next + - MIN(remaining, - MATCHFINDER_WINDOW_SIZE); - } - adjust_max_and_nice_len(&max_len, - &nice_len, - remaining); - if (max_len >= - BT_MATCHFINDER_REQUIRED_NBYTES) { - bt_matchfinder_skip_byte( - &c->p.n.bt_mf, - in_cur_base, - in_next - in_cur_base, - nice_len, - c->max_search_depth, - next_hashes); - } - cache_ptr->length = 0; - cache_ptr->offset = *in_next; - in_next++; - cache_ptr++; - } while (--best_len); - } - /* Maximum block length or end of input reached? */ - if (in_next >= in_max_block_end) - break; - /* Match cache overflowed? */ - if (cache_ptr >= - &c->p.n.match_cache[MATCH_CACHE_LENGTH]) - break; - /* Not ready to try to end the block (again)? */ - if (!ready_to_check_block(&c->split_stats, - in_block_begin, in_next, - in_end)) - continue; - /* Check if it would be worthwhile to end the block. */ - if (do_end_block_check(&c->split_stats, - in_next - in_block_begin)) { - change_detected = true; - break; - } - /* Ending the block doesn't seem worthwhile here. */ - deflate_near_optimal_merge_stats(c); - prev_end_block_check = in_next; - } - /* - * All the matches for this block have been cached. Now choose - * the precise end of the block and the sequence of items to - * output to represent it, then flush the block. - */ - if (change_detected && prev_end_block_check != NULL) { - /* - * The block is being ended because a recent chunk of - * data differs from the rest of the block. We could - * end the block at 'in_next' like the greedy and lazy - * compressors do, but that's not ideal since it would - * include the differing chunk in the block. The - * near-optimal compressor has time to do a better job. - * Therefore, we rewind to just before the chunk, and - * output a block that only goes up to there. - * - * We then set things up to correctly start the next - * block, considering that some work has already been - * done on it (some matches found and stats gathered). 
- */ - struct lz_match *orig_cache_ptr = cache_ptr; - const u8 *in_block_end = prev_end_block_check; - u32 block_length = in_block_end - in_block_begin; - bool is_first = (in_block_begin == in); - bool is_final = false; - u32 num_bytes_to_rewind = in_next - in_block_end; - size_t cache_len_rewound; - - /* Rewind the match cache. */ - do { - cache_ptr--; - cache_ptr -= cache_ptr->length; - } while (--num_bytes_to_rewind); - cache_len_rewound = orig_cache_ptr - cache_ptr; - - deflate_optimize_and_flush_block( - c, os, in_block_begin, - block_length, cache_ptr, - is_first, is_final, - &prev_block_used_only_literals); - memmove(c->p.n.match_cache, cache_ptr, - cache_len_rewound * sizeof(*cache_ptr)); - cache_ptr = &c->p.n.match_cache[cache_len_rewound]; - deflate_near_optimal_save_stats(c); - /* - * Clear the stats for the just-flushed block, leaving - * just the stats for the beginning of the next block. - */ - deflate_near_optimal_clear_old_stats(c); - in_block_begin = in_block_end; - } else { - /* - * The block is being ended for a reason other than a - * differing data chunk being detected. Don't rewind at - * all; just end the block at the current position. - */ - u32 block_length = in_next - in_block_begin; - bool is_first = (in_block_begin == in); - bool is_final = (in_next == in_end); - - deflate_near_optimal_merge_stats(c); - deflate_optimize_and_flush_block( - c, os, in_block_begin, - block_length, cache_ptr, - is_first, is_final, - &prev_block_used_only_literals); - cache_ptr = &c->p.n.match_cache[0]; - deflate_near_optimal_save_stats(c); - deflate_near_optimal_init_stats(c); - in_block_begin = in_next; - } - } while (in_next != in_end && !os->overflow); + const u8 *in_next = in; + const u8 *in_block_begin = in_next; + const u8 *in_end = in_next + in_nbytes; + const u8 *in_cur_base = in_next; + const u8 *in_next_slide = + in_next + MIN(in_end - in_next, MATCHFINDER_WINDOW_SIZE); + unsigned max_len = DEFLATE_MAX_MATCH_LEN; + unsigned nice_len = MIN(c->nice_match_length, max_len); + struct lz_match *cache_ptr = c->p.n.match_cache; + u32 next_hashes[2] = {0, 0}; + bool prev_block_used_only_literals = false; + + bt_matchfinder_init(&c->p.n.bt_mf); + deflate_near_optimal_init_stats(c); + + do { + /* Starting a new DEFLATE block */ + const u8 * const in_max_block_end = choose_max_block_end( + in_block_begin, in_end, SOFT_MAX_BLOCK_LENGTH); + const u8 *prev_end_block_check = NULL; + bool change_detected = false; + const u8 *next_observation = in_next; + unsigned min_len; + + /* + * Use the minimum match length heuristic to improve the + * literal/match statistics gathered during matchfinding. + * However, the actual near-optimal parse won't respect min_len, + * as it can accurately assess the costs of different matches. + * + * If the "use only literals" strategy happened to be the best + * strategy on the previous block, then probably the + * min_match_len heuristic is still not aggressive enough for + * the data, so force gathering literal stats only. + */ + if (prev_block_used_only_literals) + min_len = DEFLATE_MAX_MATCH_LEN + 1; + else + min_len = calculate_min_match_len( + in_block_begin, + in_max_block_end - in_block_begin, + c->max_search_depth); + + /* + * Find matches until we decide to end the block. We end the + * block if any of the following is true: + * + * (1) Maximum block length has been reached + * (2) Match catch may overflow. + * (3) Block split heuristic says to split now. 
+ */ + for (;;) { + struct lz_match *matches; + unsigned best_len; + size_t remaining = in_end - in_next; + + /* Slide the window forward if needed. */ + if (in_next == in_next_slide) { + bt_matchfinder_slide_window(&c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, MATCHFINDER_WINDOW_SIZE); + } + + /* + * Find matches with the current position using the + * binary tree matchfinder and save them in match_cache. + * + * Note: the binary tree matchfinder is more suited for + * optimal parsing than the hash chain matchfinder. The + * reasons for this include: + * + * - The binary tree matchfinder can find more matches + * in the same number of steps. + * - One of the major advantages of hash chains is that + * skipping positions (not searching for matches at + * them) is faster; however, with optimal parsing we + * search for matches at almost all positions, so this + * advantage of hash chains is negated. + */ + matches = cache_ptr; + best_len = 0; + adjust_max_and_nice_len(&max_len, &nice_len, remaining); + if (likely(max_len >= BT_MATCHFINDER_REQUIRED_NBYTES)) { + cache_ptr = bt_matchfinder_get_matches( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + max_len, + nice_len, + c->max_search_depth, + next_hashes, + matches); + if (cache_ptr > matches) + best_len = cache_ptr[-1].length; + } + if (in_next >= next_observation) { + if (best_len >= min_len) { + observe_match(&c->split_stats, + best_len); + next_observation = in_next + best_len; + c->p.n.new_match_len_freqs[best_len]++; + } else { + observe_literal(&c->split_stats, + *in_next); + next_observation = in_next + 1; + } + } + + cache_ptr->length = cache_ptr - matches; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + + /* + * If there was a very long match found, don't cache any + * matches for the bytes covered by that match. This + * avoids degenerate behavior when compressing highly + * redundant data, where the number of matches can be + * very large. + * + * This heuristic doesn't actually hurt the compression + * ratio very much. If there's a long match, then the + * data must be highly compressible, so it doesn't + * matter much what we do. + */ + if (best_len >= DEFLATE_MIN_MATCH_LEN && + best_len >= nice_len) { + --best_len; + do { + remaining = in_end - in_next; + if (in_next == in_next_slide) { + bt_matchfinder_slide_window( + &c->p.n.bt_mf); + in_cur_base = in_next; + in_next_slide = in_next + + MIN(remaining, + MATCHFINDER_WINDOW_SIZE); + } + adjust_max_and_nice_len(&max_len, + &nice_len, + remaining); + if (max_len >= + BT_MATCHFINDER_REQUIRED_NBYTES) { + bt_matchfinder_skip_byte( + &c->p.n.bt_mf, + in_cur_base, + in_next - in_cur_base, + nice_len, + c->max_search_depth, + next_hashes); + } + cache_ptr->length = 0; + cache_ptr->offset = *in_next; + in_next++; + cache_ptr++; + } while (--best_len); + } + /* Maximum block length or end of input reached? */ + if (in_next >= in_max_block_end) + break; + /* Match cache overflowed? */ + if (cache_ptr >= + &c->p.n.match_cache[MATCH_CACHE_LENGTH]) + break; + /* Not ready to try to end the block (again)? */ + if (!ready_to_check_block(&c->split_stats, + in_block_begin, in_next, + in_end)) + continue; + /* Check if it would be worthwhile to end the block. */ + if (do_end_block_check(&c->split_stats, + in_next - in_block_begin)) { + change_detected = true; + break; + } + /* Ending the block doesn't seem worthwhile here. 
*/ + deflate_near_optimal_merge_stats(c); + prev_end_block_check = in_next; + } + /* + * All the matches for this block have been cached. Now choose + * the precise end of the block and the sequence of items to + * output to represent it, then flush the block. + */ + if (change_detected && prev_end_block_check != NULL) { + /* + * The block is being ended because a recent chunk of + * data differs from the rest of the block. We could + * end the block at 'in_next' like the greedy and lazy + * compressors do, but that's not ideal since it would + * include the differing chunk in the block. The + * near-optimal compressor has time to do a better job. + * Therefore, we rewind to just before the chunk, and + * output a block that only goes up to there. + * + * We then set things up to correctly start the next + * block, considering that some work has already been + * done on it (some matches found and stats gathered). + */ + struct lz_match *orig_cache_ptr = cache_ptr; + const u8 *in_block_end = prev_end_block_check; + u32 block_length = in_block_end - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = false; + u32 num_bytes_to_rewind = in_next - in_block_end; + size_t cache_len_rewound; + + /* Rewind the match cache. */ + do { + cache_ptr--; + cache_ptr -= cache_ptr->length; + } while (--num_bytes_to_rewind); + cache_len_rewound = orig_cache_ptr - cache_ptr; + + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + memmove(c->p.n.match_cache, cache_ptr, + cache_len_rewound * sizeof(*cache_ptr)); + cache_ptr = &c->p.n.match_cache[cache_len_rewound]; + deflate_near_optimal_save_stats(c); + /* + * Clear the stats for the just-flushed block, leaving + * just the stats for the beginning of the next block. + */ + deflate_near_optimal_clear_old_stats(c); + in_block_begin = in_block_end; + } else { + /* + * The block is being ended for a reason other than a + * differing data chunk being detected. Don't rewind at + * all; just end the block at the current position. + */ + u32 block_length = in_next - in_block_begin; + bool is_first = (in_block_begin == in); + bool is_final = (in_next == in_end); + + deflate_near_optimal_merge_stats(c); + deflate_optimize_and_flush_block( + c, os, in_block_begin, + block_length, cache_ptr, + is_first, is_final, + &prev_block_used_only_literals); + cache_ptr = &c->p.n.match_cache[0]; + deflate_near_optimal_save_stats(c); + deflate_near_optimal_init_stats(c); + in_block_begin = in_next; + } + } while (in_next != in_end && !os->overflow); } /* Initialize c->p.n.offset_slot_full. 
*/ static void deflate_init_offset_slot_full(struct libdeflate_compressor *c) { - unsigned offset_slot; - unsigned offset; - unsigned offset_end; - - for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); - offset_slot++) { - offset = deflate_offset_slot_base[offset_slot]; - offset_end = offset + - (1 << deflate_extra_offset_bits[offset_slot]); - do { - c->p.n.offset_slot_full[offset] = offset_slot; - } while (++offset != offset_end); - } + unsigned offset_slot; + unsigned offset; + unsigned offset_end; + + for (offset_slot = 0; offset_slot < ARRAY_LEN(deflate_offset_slot_base); + offset_slot++) { + offset = deflate_offset_slot_base[offset_slot]; + offset_end = offset + + (1 << deflate_extra_offset_bits[offset_slot]); + do { + c->p.n.offset_slot_full[offset] = offset_slot; + } while (++offset != offset_end); + } } #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ LIBDEFLATEAPI struct libdeflate_compressor * libdeflate_alloc_compressor_ex(int compression_level, - const struct libdeflate_options *options) + const struct libdeflate_options *options) { - struct libdeflate_compressor *c; - size_t size = offsetof(struct libdeflate_compressor, p); - - check_buildtime_parameters(); - - /* - * Note: if more fields are added to libdeflate_options, this code will - * need to be updated to support both the old and new structs. - */ - if (options->sizeof_options != sizeof(*options)) - return NULL; - - if (compression_level < 0 || compression_level > 12) - return NULL; - + struct libdeflate_compressor *c; + size_t size = offsetof(struct libdeflate_compressor, p); + + check_buildtime_parameters(); + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + if (compression_level < 0 || compression_level > 12) + return NULL; + #if SUPPORT_NEAR_OPTIMAL_PARSING - if (compression_level >= 10) - size += sizeof(c->p.n); - else + if (compression_level >= 10) + size += sizeof(c->p.n); + else #endif - { - if (compression_level >= 2) - size += sizeof(c->p.g); - else if (compression_level == 1) - size += sizeof(c->p.f); - } - - c = libdeflate_aligned_malloc(options->malloc_func ? - options->malloc_func : - libdeflate_default_malloc_func, - MATCHFINDER_MEM_ALIGNMENT, size); - if (!c) - return NULL; - c->free_func = options->free_func ? - options->free_func : libdeflate_default_free_func; - - c->compression_level = compression_level; - - /* - * The higher the compression level, the more we should bother trying to - * compress very small inputs. - */ - c->max_passthrough_size = 55 - (compression_level * 4); - - switch (compression_level) { - case 0: - c->max_passthrough_size = SIZE_MAX; - c->impl = NULL; /* not used */ - break; - case 1: - c->impl = deflate_compress_fastest; - /* max_search_depth is unused. 
*/ - c->nice_match_length = 32; - break; - case 2: - c->impl = deflate_compress_greedy; - c->max_search_depth = 6; - c->nice_match_length = 10; - break; - case 3: - c->impl = deflate_compress_greedy; - c->max_search_depth = 12; - c->nice_match_length = 14; - break; - case 4: - c->impl = deflate_compress_greedy; - c->max_search_depth = 16; - c->nice_match_length = 30; - break; - case 5: - c->impl = deflate_compress_lazy; - c->max_search_depth = 16; - c->nice_match_length = 30; - break; - case 6: - c->impl = deflate_compress_lazy; - c->max_search_depth = 35; - c->nice_match_length = 65; - break; - case 7: - c->impl = deflate_compress_lazy; - c->max_search_depth = 100; - c->nice_match_length = 130; - break; - case 8: - c->impl = deflate_compress_lazy2; - c->max_search_depth = 300; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - break; - case 9: + { + if (compression_level >= 2) + size += sizeof(c->p.g); + else if (compression_level == 1) + size += sizeof(c->p.f); + } + + c = libdeflate_aligned_malloc(options->malloc_func ? + options->malloc_func : + libdeflate_default_malloc_func, + MATCHFINDER_MEM_ALIGNMENT, size); + if (!c) + return NULL; + c->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; + + c->compression_level = compression_level; + + /* + * The higher the compression level, the more we should bother trying to + * compress very small inputs. + */ + c->max_passthrough_size = 55 - (compression_level * 4); + + switch (compression_level) { + case 0: + c->max_passthrough_size = SIZE_MAX; + c->impl = NULL; /* not used */ + break; + case 1: + c->impl = deflate_compress_fastest; + /* max_search_depth is unused. */ + c->nice_match_length = 32; + break; + case 2: + c->impl = deflate_compress_greedy; + c->max_search_depth = 6; + c->nice_match_length = 10; + break; + case 3: + c->impl = deflate_compress_greedy; + c->max_search_depth = 12; + c->nice_match_length = 14; + break; + case 4: + c->impl = deflate_compress_greedy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 5: + c->impl = deflate_compress_lazy; + c->max_search_depth = 16; + c->nice_match_length = 30; + break; + case 6: + c->impl = deflate_compress_lazy; + c->max_search_depth = 35; + c->nice_match_length = 65; + break; + case 7: + c->impl = deflate_compress_lazy; + c->max_search_depth = 100; + c->nice_match_length = 130; + break; + case 8: + c->impl = deflate_compress_lazy2; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; + case 9: #if !SUPPORT_NEAR_OPTIMAL_PARSING - default: + default: #endif - c->impl = deflate_compress_lazy2; - c->max_search_depth = 600; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - break; + c->impl = deflate_compress_lazy2; + c->max_search_depth = 600; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + break; #if SUPPORT_NEAR_OPTIMAL_PARSING - case 10: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 35; - c->nice_match_length = 75; - c->p.n.max_optim_passes = 2; - c->p.n.min_improvement_to_continue = 32; - c->p.n.min_bits_to_use_nonfinal_path = 32; - c->p.n.max_len_to_optimize_static_block = 0; - deflate_init_offset_slot_full(c); - break; - case 11: - c->impl = deflate_compress_near_optimal; - c->max_search_depth = 100; - c->nice_match_length = 150; - c->p.n.max_optim_passes = 4; - c->p.n.min_improvement_to_continue = 16; - c->p.n.min_bits_to_use_nonfinal_path = 16; - c->p.n.max_len_to_optimize_static_block = 1000; - deflate_init_offset_slot_full(c); - break; - case 12: - default: - c->impl = 
deflate_compress_near_optimal; - c->max_search_depth = 300; - c->nice_match_length = DEFLATE_MAX_MATCH_LEN; - c->p.n.max_optim_passes = 10; - c->p.n.min_improvement_to_continue = 1; - c->p.n.min_bits_to_use_nonfinal_path = 1; - c->p.n.max_len_to_optimize_static_block = 10000; - deflate_init_offset_slot_full(c); - break; + case 10: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 35; + c->nice_match_length = 75; + c->p.n.max_optim_passes = 2; + c->p.n.min_improvement_to_continue = 32; + c->p.n.min_bits_to_use_nonfinal_path = 32; + c->p.n.max_len_to_optimize_static_block = 0; + deflate_init_offset_slot_full(c); + break; + case 11: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 100; + c->nice_match_length = 150; + c->p.n.max_optim_passes = 4; + c->p.n.min_improvement_to_continue = 16; + c->p.n.min_bits_to_use_nonfinal_path = 16; + c->p.n.max_len_to_optimize_static_block = 1000; + deflate_init_offset_slot_full(c); + break; + case 12: + default: + c->impl = deflate_compress_near_optimal; + c->max_search_depth = 300; + c->nice_match_length = DEFLATE_MAX_MATCH_LEN; + c->p.n.max_optim_passes = 10; + c->p.n.min_improvement_to_continue = 1; + c->p.n.min_bits_to_use_nonfinal_path = 1; + c->p.n.max_len_to_optimize_static_block = 10000; + deflate_init_offset_slot_full(c); + break; #endif /* SUPPORT_NEAR_OPTIMAL_PARSING */ - } - - deflate_init_static_codes(c); - - return c; + } + + deflate_init_static_codes(c); + + return c; } LIBDEFLATEAPI struct libdeflate_compressor * libdeflate_alloc_compressor(int compression_level) { - static const struct libdeflate_options defaults = { - .sizeof_options = sizeof(defaults), - }; - return libdeflate_alloc_compressor_ex(compression_level, &defaults); + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_compressor_ex(compression_level, &defaults); } LIBDEFLATEAPI size_t libdeflate_deflate_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) { - struct deflate_output_bitstream os; - - /* - * For extremely short inputs, or for compression level 0, just output - * uncompressed blocks. - */ - if (unlikely(in_nbytes <= c->max_passthrough_size)) - return deflate_compress_none(in, in_nbytes, - out, out_nbytes_avail); - - /* Initialize the output bitstream structure. */ - os.bitbuf = 0; - os.bitcount = 0; - os.next = out; - os.end = os.next + out_nbytes_avail; - os.overflow = false; - - /* Call the actual compression function. */ - (*c->impl)(c, in, in_nbytes, &os); - - /* Return 0 if the output buffer is too small. */ - if (os.overflow) - return 0; - - /* - * Write the final byte if needed. This can't overflow the output - * buffer because deflate_flush_block() would have set the overflow flag - * if there wasn't enough space remaining for the full final block. - */ - ASSERT(os.bitcount <= 7); - if (os.bitcount) { - ASSERT(os.next < os.end); - *os.next++ = os.bitbuf; - } - - /* Return the compressed size in bytes. */ - return os.next - (u8 *)out; + struct deflate_output_bitstream os; + + /* + * For extremely short inputs, or for compression level 0, just output + * uncompressed blocks. + */ + if (unlikely(in_nbytes <= c->max_passthrough_size)) + return deflate_compress_none(in, in_nbytes, + out, out_nbytes_avail); + + /* Initialize the output bitstream structure. 
*/ + os.bitbuf = 0; + os.bitcount = 0; + os.next = out; + os.end = os.next + out_nbytes_avail; + os.overflow = false; + + /* Call the actual compression function. */ + (*c->impl)(c, in, in_nbytes, &os); + + /* Return 0 if the output buffer is too small. */ + if (os.overflow) + return 0; + + /* + * Write the final byte if needed. This can't overflow the output + * buffer because deflate_flush_block() would have set the overflow flag + * if there wasn't enough space remaining for the full final block. + */ + ASSERT(os.bitcount <= 7); + if (os.bitcount) { + ASSERT(os.next < os.end); + *os.next++ = os.bitbuf; + } + + /* Return the compressed size in bytes. */ + return os.next - (u8 *)out; } LIBDEFLATEAPI void libdeflate_free_compressor(struct libdeflate_compressor *c) { - if (c) - libdeflate_aligned_free(c->free_func, c); + if (c) + libdeflate_aligned_free(c->free_func, c); } unsigned int libdeflate_get_compression_level(struct libdeflate_compressor *c) { - return c->compression_level; + return c->compression_level; } LIBDEFLATEAPI size_t libdeflate_deflate_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) + size_t in_nbytes) { - size_t max_blocks; - - /* - * Since the compressor never uses a compressed block when an - * uncompressed block is cheaper, the worst case can be no worse than - * the case where only uncompressed blocks are used. - * - * This is true even though up to 7 bits are "wasted" to byte-align the - * bitstream when a compressed block is followed by an uncompressed - * block. This is because a compressed block wouldn't have been used if - * it wasn't cheaper than an uncompressed block, and uncompressed blocks - * always end on a byte boundary. So the alignment bits will, at worst, - * go up to the place where the uncompressed block would have ended. - */ - - /* - * Calculate the maximum number of uncompressed blocks that the - * compressor can use for 'in_nbytes' of data. - * - * The minimum length that is passed to deflate_flush_block() is - * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If - * deflate_flush_block() decides to use an uncompressed block, it - * actually will (in general) output a series of uncompressed blocks in - * order to stay within the UINT16_MAX limit of DEFLATE. But this can - * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', - * as in that case this behavior can't result in more blocks than the - * case where deflate_flush_block() is called with min-length inputs. - * - * So the number of uncompressed blocks needed would be bounded by - * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs - * need 1 (empty) block, which gives the final expression below. - */ - STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); - max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); - - /* - * Each uncompressed block has 5 bytes of overhead, for the BFINAL, - * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the - * alignment bits at the very start of the block can be disregarded; - * they would otherwise increase the overhead to 6 bytes per block.) - * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. - * To get the final bound, add the number of uncompressed bytes. - */ - return (5 * max_blocks) + in_nbytes; + size_t max_blocks; + + /* + * Since the compressor never uses a compressed block when an + * uncompressed block is cheaper, the worst case can be no worse than + * the case where only uncompressed blocks are used. 
+ * + * This is true even though up to 7 bits are "wasted" to byte-align the + * bitstream when a compressed block is followed by an uncompressed + * block. This is because a compressed block wouldn't have been used if + * it wasn't cheaper than an uncompressed block, and uncompressed blocks + * always end on a byte boundary. So the alignment bits will, at worst, + * go up to the place where the uncompressed block would have ended. + */ + + /* + * Calculate the maximum number of uncompressed blocks that the + * compressor can use for 'in_nbytes' of data. + * + * The minimum length that is passed to deflate_flush_block() is + * MIN_BLOCK_LENGTH bytes, except for the final block if needed. If + * deflate_flush_block() decides to use an uncompressed block, it + * actually will (in general) output a series of uncompressed blocks in + * order to stay within the UINT16_MAX limit of DEFLATE. But this can + * be disregarded here as long as '2 * MIN_BLOCK_LENGTH <= UINT16_MAX', + * as in that case this behavior can't result in more blocks than the + * case where deflate_flush_block() is called with min-length inputs. + * + * So the number of uncompressed blocks needed would be bounded by + * DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH). However, empty inputs + * need 1 (empty) block, which gives the final expression below. + */ + STATIC_ASSERT(2 * MIN_BLOCK_LENGTH <= UINT16_MAX); + max_blocks = MAX(DIV_ROUND_UP(in_nbytes, MIN_BLOCK_LENGTH), 1); + + /* + * Each uncompressed block has 5 bytes of overhead, for the BFINAL, + * BTYPE, LEN, and NLEN fields. (For the reason explained earlier, the + * alignment bits at the very start of the block can be disregarded; + * they would otherwise increase the overhead to 6 bytes per block.) + * Therefore, the maximum number of overhead bytes is '5 * max_blocks'. + * To get the final bound, add the number of uncompressed bytes. + */ + return (5 * max_blocks) + in_nbytes; } diff --git a/Sources/DEFLATE/deflate_constants.h b/Sources/DEFLATE/deflate_constants.h index 95c9e0a5..a2da4baa 100644 --- a/Sources/DEFLATE/deflate_constants.h +++ b/Sources/DEFLATE/deflate_constants.h @@ -6,51 +6,51 @@ #define LIB_DEFLATE_CONSTANTS_H /* Valid block types */ -#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 -#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 -#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 +#define DEFLATE_BLOCKTYPE_UNCOMPRESSED 0 +#define DEFLATE_BLOCKTYPE_STATIC_HUFFMAN 1 +#define DEFLATE_BLOCKTYPE_DYNAMIC_HUFFMAN 2 /* Minimum and maximum supported match lengths (in bytes) */ -#define DEFLATE_MIN_MATCH_LEN 3 -#define DEFLATE_MAX_MATCH_LEN 258 +#define DEFLATE_MIN_MATCH_LEN 3 +#define DEFLATE_MAX_MATCH_LEN 258 /* Maximum supported match offset (in bytes) */ -#define DEFLATE_MAX_MATCH_OFFSET 32768 +#define DEFLATE_MAX_MATCH_OFFSET 32768 /* log2 of DEFLATE_MAX_MATCH_OFFSET */ -#define DEFLATE_WINDOW_ORDER 15 +#define DEFLATE_WINDOW_ORDER 15 /* Number of symbols in each Huffman code. Note: for the literal/length * and offset codes, these are actually the maximum values; a given block * might use fewer symbols. 
*/ -#define DEFLATE_NUM_PRECODE_SYMS 19 -#define DEFLATE_NUM_LITLEN_SYMS 288 -#define DEFLATE_NUM_OFFSET_SYMS 32 +#define DEFLATE_NUM_PRECODE_SYMS 19 +#define DEFLATE_NUM_LITLEN_SYMS 288 +#define DEFLATE_NUM_OFFSET_SYMS 32 /* The maximum number of symbols across all codes */ -#define DEFLATE_MAX_NUM_SYMS 288 +#define DEFLATE_MAX_NUM_SYMS 288 /* Division of symbols in the literal/length code */ -#define DEFLATE_NUM_LITERALS 256 -#define DEFLATE_END_OF_BLOCK 256 -#define DEFLATE_FIRST_LEN_SYM 257 +#define DEFLATE_NUM_LITERALS 256 +#define DEFLATE_END_OF_BLOCK 256 +#define DEFLATE_FIRST_LEN_SYM 257 /* Maximum codeword length, in bits, within each Huffman code */ -#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 -#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 -#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 +#define DEFLATE_MAX_PRE_CODEWORD_LEN 7 +#define DEFLATE_MAX_LITLEN_CODEWORD_LEN 15 +#define DEFLATE_MAX_OFFSET_CODEWORD_LEN 15 /* The maximum codeword length across all codes */ -#define DEFLATE_MAX_CODEWORD_LEN 15 +#define DEFLATE_MAX_CODEWORD_LEN 15 /* Maximum possible overrun when decoding codeword lengths */ -#define DEFLATE_MAX_LENS_OVERRUN 137 +#define DEFLATE_MAX_LENS_OVERRUN 137 /* * Maximum number of extra bits that may be required to represent a match * length or offset. */ -#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 -#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 +#define DEFLATE_MAX_EXTRA_LENGTH_BITS 5 +#define DEFLATE_MAX_EXTRA_OFFSET_BITS 13 #endif /* LIB_DEFLATE_CONSTANTS_H */ diff --git a/Sources/DEFLATE/deflate_decompress.c b/Sources/DEFLATE/deflate_decompress.c index 63726c7a..11a49d69 100644 --- a/Sources/DEFLATE/deflate_decompress.c +++ b/Sources/DEFLATE/deflate_decompress.c @@ -55,44 +55,44 @@ */ #if 0 # pragma message("UNSAFE DECOMPRESSION IS ENABLED. THIS MUST ONLY BE USED IF THE DECOMPRESSOR INPUT WILL ALWAYS BE TRUSTED!") -# define SAFETY_CHECK(expr) (void)(expr) +# define SAFETY_CHECK(expr) (void)(expr) #else -# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA +# define SAFETY_CHECK(expr) if (unlikely(!(expr))) return LIBDEFLATE_BAD_DATA #endif /***************************************************************************** - * Input bitstream * + * Input bitstream * *****************************************************************************/ /* * The state of the "input bitstream" consists of the following variables: * - * - in_next: a pointer to the next unread byte in the input buffer + * - in_next: a pointer to the next unread byte in the input buffer * - * - in_end: a pointer to just past the end of the input buffer + * - in_end: a pointer to just past the end of the input buffer * - * - bitbuf: a word-sized variable containing bits that have been read from - * the input buffer or from the implicit appended zero bytes + * - bitbuf: a word-sized variable containing bits that have been read from + * the input buffer or from the implicit appended zero bytes * - * - bitsleft: the number of bits in 'bitbuf' available to be consumed. - * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually - * contain more bits than this. However, only the bits counted - * by 'bitsleft' can actually be consumed; the rest can only be - * used for preloading. + * - bitsleft: the number of bits in 'bitbuf' available to be consumed. + * After REFILL_BITS_BRANCHLESS(), 'bitbuf' can actually + * contain more bits than this. However, only the bits counted + * by 'bitsleft' can actually be consumed; the rest can only be + * used for preloading. 
* - * As a micro-optimization, we allow bits 8 and higher of - * 'bitsleft' to contain garbage. When consuming the bits - * associated with a decode table entry, this allows us to do - * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. - * On some CPUs, this helps reduce instruction dependencies. - * This does have the disadvantage that 'bitsleft' sometimes - * needs to be cast to 'u8', such as when it's used as a shift - * amount in REFILL_BITS_BRANCHLESS(). But that one happens - * for free since most CPUs ignore high bits in shift amounts. + * As a micro-optimization, we allow bits 8 and higher of + * 'bitsleft' to contain garbage. When consuming the bits + * associated with a decode table entry, this allows us to do + * 'bitsleft -= entry' instead of 'bitsleft -= (u8)entry'. + * On some CPUs, this helps reduce instruction dependencies. + * This does have the disadvantage that 'bitsleft' sometimes + * needs to be cast to 'u8', such as when it's used as a shift + * amount in REFILL_BITS_BRANCHLESS(). But that one happens + * for free since most CPUs ignore high bits in shift amounts. * - * - overread_count: the total number of implicit appended zero bytes that - * have been loaded into the bitbuffer, including any - * counted by 'bitsleft' and any already consumed + * - overread_count: the total number of implicit appended zero bytes that + * have been loaded into the bitbuffer, including any + * counted by 'bitsleft' and any already consumed */ /* @@ -103,18 +103,18 @@ * which they don't have to refill as often. */ typedef machine_word_t bitbuf_t; -#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) +#define BITBUF_NBITS (8 * (int)sizeof(bitbuf_t)) /* BITMASK(n) returns a bitmask of length 'n'. */ -#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) +#define BITMASK(n) (((bitbuf_t)1 << (n)) - 1) /* * MAX_BITSLEFT is the maximum number of consumable bits, i.e. the maximum value * of '(u8)bitsleft'. This is the size of the bitbuffer variable, minus 1 if * the branchless refill method is being used (see REFILL_BITS_BRANCHLESS()). */ -#define MAX_BITSLEFT \ - (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) +#define MAX_BITSLEFT \ +(UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS - 1 : BITBUF_NBITS) /* * CONSUMABLE_NBITS is the minimum number of bits that are guaranteed to be @@ -122,7 +122,7 @@ typedef machine_word_t bitbuf_t; * Since only whole bytes can be added to 'bitsleft', the worst case is * 'MAX_BITSLEFT - 7': the smallest amount where another byte doesn't fit. */ -#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) +#define CONSUMABLE_NBITS (MAX_BITSLEFT - 7) /* * FASTLOOP_PRELOADABLE_NBITS is the minimum number of bits that are guaranteed @@ -132,8 +132,8 @@ typedef machine_word_t bitbuf_t; * number of consumable bits (counted by 'bitsleft'). Any bits not counted in * 'bitsleft' can only be used for precomputation and cannot be consumed. */ -#define FASTLOOP_PRELOADABLE_NBITS \ - (UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) +#define FASTLOOP_PRELOADABLE_NBITS \ +(UNALIGNED_ACCESS_IS_FAST ? BITBUF_NBITS : CONSUMABLE_NBITS) /* * PRELOAD_SLACK is the minimum number of bits that are guaranteed to be @@ -141,14 +141,14 @@ typedef machine_word_t bitbuf_t; * subsequent consumptions. This is 1 bit if the branchless refill method is * being used, and 0 bits otherwise. 
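 *
 * As a concrete illustration (assuming a typical 64-bit build with fast
 * unaligned access, so that bitbuf_t is 8 bytes), the constants above work
 * out to: BITBUF_NBITS = 64, MAX_BITSLEFT = 63, CONSUMABLE_NBITS = 56,
 * FASTLOOP_PRELOADABLE_NBITS = 64, and PRELOAD_SLACK = 64 - 63 = 1.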
*/ -#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) +#define PRELOAD_SLACK MAX(0, FASTLOOP_PRELOADABLE_NBITS - MAX_BITSLEFT) /* * CAN_CONSUME(n) is true if it's guaranteed that if the bitbuffer has just been * refilled, then it's always possible to consume 'n' bits from it. 'n' should * be a compile-time constant, to enable compile-time evaluation. */ -#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) +#define CAN_CONSUME(n) (CONSUMABLE_NBITS >= (n)) /* * CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) is true if it's @@ -156,9 +156,9 @@ typedef machine_word_t bitbuf_t; * consume 'consume_nbits' bits, then preload 'preload_nbits' bits. The * arguments should be compile-time constants to enable compile-time evaluation. */ -#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ - (CONSUMABLE_NBITS >= (consume_nbits) && \ - FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) +#define CAN_CONSUME_AND_THEN_PRELOAD(consume_nbits, preload_nbits) \ +(CONSUMABLE_NBITS >= (consume_nbits) && \ +FASTLOOP_PRELOADABLE_NBITS >= (consume_nbits) + (preload_nbits)) /* * REFILL_BITS_BRANCHLESS() branchlessly refills the bitbuffer variable by @@ -169,13 +169,13 @@ typedef machine_word_t bitbuf_t; * * The simplest way of branchlessly updating 'bitsleft' would be: * - * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; + * bitsleft += (MAX_BITSLEFT - bitsleft) & ~7; * * To make it faster, we define MAX_BITSLEFT to be 'WORDBITS - 1' rather than * WORDBITS, so that in binary it looks like 111111 or 11111. Then, we update * 'bitsleft' by just setting the bits above the low 3 bits: * - * bitsleft |= MAX_BITSLEFT & ~7; + * bitsleft |= MAX_BITSLEFT & ~7; * * That compiles down to a single instruction like 'or $0x38, %rbp'. Using * 'MAX_BITSLEFT == WORDBITS - 1' also has the advantage that refills can be @@ -183,17 +183,17 @@ typedef machine_word_t bitbuf_t; * * The simplest way of branchlessly updating 'in_next' would be: * - * in_next += (MAX_BITSLEFT - bitsleft) >> 3; + * in_next += (MAX_BITSLEFT - bitsleft) >> 3; * * With 'MAX_BITSLEFT == WORDBITS - 1' we could use an XOR instead, though this * isn't really better: * - * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; + * in_next += (MAX_BITSLEFT ^ bitsleft) >> 3; * * An alternative which can be marginally better is the following: * - * in_next += sizeof(bitbuf_t) - 1; - * in_next -= (bitsleft >> 3) & 0x7; + * in_next += sizeof(bitbuf_t) - 1; + * in_next -= (bitsleft >> 3) & 0x7; * * It seems this would increase the number of CPU instructions from 3 (sub, shr, * add) to 4 (add, shr, and, sub). However, if the CPU has a bitfield @@ -203,12 +203,12 @@ typedef machine_word_t bitbuf_t; * high bits in 'bitsleft', so it is compatible with the micro-optimization we * use where we let the high bits of 'bitsleft' contain garbage. */ -#define REFILL_BITS_BRANCHLESS() \ -do { \ - bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ - in_next += sizeof(bitbuf_t) - 1; \ - in_next -= (bitsleft >> 3) & 0x7; \ - bitsleft |= MAX_BITSLEFT & ~7; \ +#define REFILL_BITS_BRANCHLESS() \ +do { \ +bitbuf |= get_unaligned_leword(in_next) << (u8)bitsleft; \ +in_next += sizeof(bitbuf_t) - 1; \ +in_next -= (bitsleft >> 3) & 0x7; \ +bitsleft |= MAX_BITSLEFT & ~7; \ } while (0) /* @@ -233,42 +233,42 @@ do { \ * or return an error. However, we do it to be slightly more friendly to the * not-recommended use case of decompressing with an unknown output size.) 
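 *
 * As a rough worked example of the branchless fast path (assuming a 64-bit
 * bitbuf_t, so MAX_BITSLEFT == 63): if bitsleft == 5 before a refill, then
 * 7 whole bytes fit above the existing bits, so in_next advances by
 * 7 - ((5 >> 3) & 0x7) == 7 and bitsleft becomes 5 | (63 & ~7) == 61,
 * i.e. the 5 old bits plus 56 newly loaded bits.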
*/ -#define REFILL_BITS() \ -do { \ - if (UNALIGNED_ACCESS_IS_FAST && \ - likely(in_end - in_next >= sizeof(bitbuf_t))) { \ - REFILL_BITS_BRANCHLESS(); \ - } else { \ - while ((u8)bitsleft < CONSUMABLE_NBITS) { \ - if (likely(in_next != in_end)) { \ - bitbuf |= (bitbuf_t)*in_next++ << \ - (u8)bitsleft; \ - } else { \ - overread_count++; \ - SAFETY_CHECK(overread_count <= \ - sizeof(bitbuf_t)); \ - } \ - bitsleft += 8; \ - } \ - } \ +#define REFILL_BITS() \ +do { \ +if (UNALIGNED_ACCESS_IS_FAST && \ +likely(in_end - in_next >= sizeof(bitbuf_t))) { \ +REFILL_BITS_BRANCHLESS(); \ +} else { \ +while ((u8)bitsleft < CONSUMABLE_NBITS) { \ +if (likely(in_next != in_end)) { \ +bitbuf |= (bitbuf_t)*in_next++ << \ +(u8)bitsleft; \ +} else { \ +overread_count++; \ +SAFETY_CHECK(overread_count <= \ +sizeof(bitbuf_t)); \ +} \ +bitsleft += 8; \ +} \ +} \ } while (0) /* * REFILL_BITS_IN_FASTLOOP() is like REFILL_BITS(), but it doesn't check for the * end of the input. It can only be used in the fastloop. */ -#define REFILL_BITS_IN_FASTLOOP() \ -do { \ - STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ - FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ - if (UNALIGNED_ACCESS_IS_FAST) { \ - REFILL_BITS_BRANCHLESS(); \ - } else { \ - while ((u8)bitsleft < CONSUMABLE_NBITS) { \ - bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ - bitsleft += 8; \ - } \ - } \ +#define REFILL_BITS_IN_FASTLOOP() \ +do { \ +STATIC_ASSERT(UNALIGNED_ACCESS_IS_FAST || \ +FASTLOOP_PRELOADABLE_NBITS == CONSUMABLE_NBITS); \ +if (UNALIGNED_ACCESS_IS_FAST) { \ +REFILL_BITS_BRANCHLESS(); \ +} else { \ +while ((u8)bitsleft < CONSUMABLE_NBITS) { \ +bitbuf |= (bitbuf_t)*in_next++ << (u8)bitsleft; \ +bitsleft += 8; \ +} \ +} \ } while (0) /* @@ -277,8 +277,8 @@ do { \ * match of length DEFLATE_MAX_MATCH_LEN. Additionally, some slack space must * be included for the intentional overrun in the match copy implementation. */ -#define FASTLOOP_MAX_BYTES_WRITTEN \ - (2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) +#define FASTLOOP_MAX_BYTES_WRITTEN \ +(2 + DEFLATE_MAX_MATCH_LEN + (5 * WORDBYTES) - 1) /* * This is the worst-case maximum number of input bytes that are read during @@ -291,10 +291,10 @@ do { \ * can be advanced. Finally, we add sizeof(bitbuf_t) to account for * REFILL_BITS_BRANCHLESS() reading a word past 'in_next'. */ -#define FASTLOOP_MAX_BYTES_READ \ - (DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ - LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ - sizeof(bitbuf_t)) +#define FASTLOOP_MAX_BYTES_READ \ +(DIV_ROUND_UP(MAX_BITSLEFT + (2 * LITLEN_TABLEBITS) + \ +LENGTH_MAXBITS + OFFSET_MAXBITS, 8) + \ +sizeof(bitbuf_t)) /***************************************************************************** * Huffman decoding * @@ -361,18 +361,18 @@ do { \ * worst-case maximum number of decode table entries, including the main table * and all subtables. The ENOUGH value depends on three parameters: * - * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) - * (2) the maximum number of main table bits (*_TABLEBITS) - * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) + * (1) the maximum number of symbols in the code (DEFLATE_NUM_*_SYMS) + * (2) the maximum number of main table bits (*_TABLEBITS) + * (3) the maximum allowed codeword length (DEFLATE_MAX_*_CODEWORD_LEN) * * The ENOUGH values were computed using the utility program 'enough' from zlib. 
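 *
 * For example, with the values chosen below, the main table always has
 * 1 << TABLEBITS entries, so the worst-case subtable space is ENOUGH minus
 * that: 2342 - 2048 = 294 entries for the litlen code, 402 - 256 = 146 for
 * the offset code, and 128 - 128 = 0 for the precode, which never needs
 * subtables since PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN.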
*/ -#define PRECODE_TABLEBITS 7 -#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ -#define LITLEN_TABLEBITS 11 -#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ -#define OFFSET_TABLEBITS 8 -#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ +#define PRECODE_TABLEBITS 7 +#define PRECODE_ENOUGH 128 /* enough 19 7 7 */ +#define LITLEN_TABLEBITS 11 +#define LITLEN_ENOUGH 2342 /* enough 288 11 15 */ +#define OFFSET_TABLEBITS 8 +#define OFFSET_ENOUGH 402 /* enough 32 8 15 */ /* * make_decode_table_entry() creates a decode table entry for the given symbol @@ -387,16 +387,16 @@ do { \ static forceinline u32 make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) { - return decode_results[sym] + (len << 8) + len; + return decode_results[sym] + (len << 8) + len; } /* * Here is the format of our precode decode table entries. Bits not explicitly * described contain zeroes: * - * Bit 20-16: presym - * Bit 10-8: codeword length [not used] - * Bit 2-0: codeword length + * Bit 20-16: presym + * Bit 10-8: codeword length [not used] + * Bit 2-0: codeword length * * The precode decode table never has subtables, since we use * PRECODE_TABLEBITS == DEFLATE_MAX_PRE_CODEWORD_LEN. @@ -405,226 +405,226 @@ make_decode_table_entry(const u32 decode_results[], u32 sym, u32 len) * symbol. make_decode_table_entry() produces the final entries. */ static const u32 precode_decode_results[] = { -#define ENTRY(presym) ((u32)presym << 16) - ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , - ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , - ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , - ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , - ENTRY(16) , ENTRY(17) , ENTRY(18) , +#define ENTRY(presym) ((u32)presym << 16) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , #undef ENTRY }; /* Litlen and offset decode table entry flags */ /* Indicates a literal entry in the litlen decode table */ -#define HUFFDEC_LITERAL 0x80000000 +#define HUFFDEC_LITERAL 0x80000000 /* Indicates that HUFFDEC_SUBTABLE_POINTER or HUFFDEC_END_OF_BLOCK is set */ -#define HUFFDEC_EXCEPTIONAL 0x00008000 +#define HUFFDEC_EXCEPTIONAL 0x00008000 /* Indicates a subtable pointer entry in the litlen or offset decode table */ -#define HUFFDEC_SUBTABLE_POINTER 0x00004000 +#define HUFFDEC_SUBTABLE_POINTER 0x00004000 /* Indicates an end-of-block entry in the litlen decode table */ -#define HUFFDEC_END_OF_BLOCK 0x00002000 +#define HUFFDEC_END_OF_BLOCK 0x00002000 /* Maximum number of bits that can be consumed by decoding a match length */ -#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS) -#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ - DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXBITS (DEFLATE_MAX_LITLEN_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS) +#define LENGTH_MAXFASTBITS (LITLEN_TABLEBITS /* no subtable needed */ + \ +DEFLATE_MAX_EXTRA_LENGTH_BITS) /* * Here is the format of our litlen decode table entries. 
Bits not explicitly * described contain zeroes: * - * Literals: - * Bit 31: 1 (HUFFDEC_LITERAL) - * Bit 23-16: literal value - * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) - * Bit 11-8: remaining codeword length [not used] - * Bit 3-0: remaining codeword length - * Lengths: - * Bit 31: 0 (!HUFFDEC_LITERAL) - * Bit 24-16: length base value - * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) - * Bit 11-8: remaining codeword length - * Bit 4-0: remaining codeword length + number of extra bits - * End of block: - * Bit 31: 0 (!HUFFDEC_LITERAL) - * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) - * Bit 11-8: remaining codeword length [not used] - * Bit 3-0: remaining codeword length - * Subtable pointer: - * Bit 31: 0 (!HUFFDEC_LITERAL) - * Bit 30-16: index of start of subtable - * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) - * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) - * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) - * Bit 11-8: number of subtable bits - * Bit 3-0: number of main table bits + * Literals: + * Bit 31: 1 (HUFFDEC_LITERAL) + * Bit 23-16: literal value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Lengths: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 24-16: length base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * End of block: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 1 (HUFFDEC_END_OF_BLOCK) + * Bit 11-8: remaining codeword length [not used] + * Bit 3-0: remaining codeword length + * Subtable pointer: + * Bit 31: 0 (!HUFFDEC_LITERAL) + * Bit 30-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 13: 0 (!HUFFDEC_END_OF_BLOCK) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits * * This format has several desirable properties: * - * - The codeword length, length slot base, and number of extra length bits - * are all built in. This eliminates the need to separately look up this - * information by indexing separate arrays by symbol or length slot. + * - The codeword length, length slot base, and number of extra length bits + * are all built in. This eliminates the need to separately look up this + * information by indexing separate arrays by symbol or length slot. * - * - The HUFFDEC_* flags enable easily distinguishing between the different - * types of entries. The HUFFDEC_LITERAL flag enables a fast path for - * literals; the high bit is used for this, as some CPUs can test the - * high bit more easily than other bits. The HUFFDEC_EXCEPTIONAL flag - * makes it possible to detect the two unlikely cases (subtable pointer - * and end of block) in a single bit flag test. + * - The HUFFDEC_* flags enable easily distinguishing between the different + * types of entries. The HUFFDEC_LITERAL flag enables a fast path for + * literals; the high bit is used for this, as some CPUs can test the + * high bit more easily than other bits. 
The HUFFDEC_EXCEPTIONAL flag + * makes it possible to detect the two unlikely cases (subtable pointer + * and end of block) in a single bit flag test. * - * - The low byte is the number of bits that need to be removed from the - * bitstream; this makes this value easily accessible, and it enables the - * micro-optimization of doing 'bitsleft -= entry' instead of - * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, - * so they don't need to be removed separately. + * - The low byte is the number of bits that need to be removed from the + * bitstream; this makes this value easily accessible, and it enables the + * micro-optimization of doing 'bitsleft -= entry' instead of + * 'bitsleft -= (u8)entry'. It also includes the number of extra bits, + * so they don't need to be removed separately. * - * - The flags in bits 15-13 are arranged to be 0 when the - * "remaining codeword length" in bits 11-8 is needed, making this value - * fairly easily accessible as well via a shift and downcast. + * - The flags in bits 15-13 are arranged to be 0 when the + * "remaining codeword length" in bits 11-8 is needed, making this value + * fairly easily accessible as well via a shift and downcast. * - * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are - * needed, making it possible to extract this value with '& 0x3F' rather - * than '& 0xF'. This value is only used as a shift amount, so this can - * save an 'and' instruction as the masking by 0x3F happens implicitly. + * - Similarly, bits 13-12 are 0 when the "subtable bits" in bits 11-8 are + * needed, making it possible to extract this value with '& 0x3F' rather + * than '& 0xF'. This value is only used as a shift amount, so this can + * save an 'and' instruction as the masking by 0x3F happens implicitly. * * litlen_decode_results[] contains the static part of the entry for each * symbol. make_decode_table_entry() produces the final entries. 
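 *
 * As a small worked example of this layout: the main table entry for literal
 * 0x41 with an 8-bit codeword would be
 *
 *     make_decode_table_entry(litlen_decode_results, 0x41, 8)
 *         == (HUFFDEC_LITERAL | (0x41 << 16)) + (8 << 8) + 8
 *         == 0x80410808
 *
 * i.e. bit 31 set, the literal value in bits 23-16, and the codeword length
 * in bits 11-8 and 3-0. The low byte (0x08) is the number of bits to remove
 * from the bitstream, which is what makes 'bitsleft -= entry' work.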
*/ static const u32 litlen_decode_results[] = { - - /* Literals */ -#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal << 16)) - ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , - ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , - ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , - ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , - ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , - ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , - ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , - ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , - ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , - ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , - ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , - ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , - ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , - ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , - ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , - ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , - ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , - ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , - ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , - ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , - ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , - ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , - ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , - ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , - ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , - ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , - ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , - ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , - ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , - ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , - ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , - ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , - ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , - ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , - ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , - ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , - ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , - ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , - ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , - ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , - ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , - ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , - ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , - ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , - ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , - ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , - ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , - ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , - ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , - ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , - ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , - ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , - ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , - ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , - ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , - ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , - ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , - ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , - ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , - ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , - ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , - ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , - ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , - ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , + + /* Literals */ +#define ENTRY(literal) (HUFFDEC_LITERAL | ((u32)literal 
<< 16)) + ENTRY(0) , ENTRY(1) , ENTRY(2) , ENTRY(3) , + ENTRY(4) , ENTRY(5) , ENTRY(6) , ENTRY(7) , + ENTRY(8) , ENTRY(9) , ENTRY(10) , ENTRY(11) , + ENTRY(12) , ENTRY(13) , ENTRY(14) , ENTRY(15) , + ENTRY(16) , ENTRY(17) , ENTRY(18) , ENTRY(19) , + ENTRY(20) , ENTRY(21) , ENTRY(22) , ENTRY(23) , + ENTRY(24) , ENTRY(25) , ENTRY(26) , ENTRY(27) , + ENTRY(28) , ENTRY(29) , ENTRY(30) , ENTRY(31) , + ENTRY(32) , ENTRY(33) , ENTRY(34) , ENTRY(35) , + ENTRY(36) , ENTRY(37) , ENTRY(38) , ENTRY(39) , + ENTRY(40) , ENTRY(41) , ENTRY(42) , ENTRY(43) , + ENTRY(44) , ENTRY(45) , ENTRY(46) , ENTRY(47) , + ENTRY(48) , ENTRY(49) , ENTRY(50) , ENTRY(51) , + ENTRY(52) , ENTRY(53) , ENTRY(54) , ENTRY(55) , + ENTRY(56) , ENTRY(57) , ENTRY(58) , ENTRY(59) , + ENTRY(60) , ENTRY(61) , ENTRY(62) , ENTRY(63) , + ENTRY(64) , ENTRY(65) , ENTRY(66) , ENTRY(67) , + ENTRY(68) , ENTRY(69) , ENTRY(70) , ENTRY(71) , + ENTRY(72) , ENTRY(73) , ENTRY(74) , ENTRY(75) , + ENTRY(76) , ENTRY(77) , ENTRY(78) , ENTRY(79) , + ENTRY(80) , ENTRY(81) , ENTRY(82) , ENTRY(83) , + ENTRY(84) , ENTRY(85) , ENTRY(86) , ENTRY(87) , + ENTRY(88) , ENTRY(89) , ENTRY(90) , ENTRY(91) , + ENTRY(92) , ENTRY(93) , ENTRY(94) , ENTRY(95) , + ENTRY(96) , ENTRY(97) , ENTRY(98) , ENTRY(99) , + ENTRY(100) , ENTRY(101) , ENTRY(102) , ENTRY(103) , + ENTRY(104) , ENTRY(105) , ENTRY(106) , ENTRY(107) , + ENTRY(108) , ENTRY(109) , ENTRY(110) , ENTRY(111) , + ENTRY(112) , ENTRY(113) , ENTRY(114) , ENTRY(115) , + ENTRY(116) , ENTRY(117) , ENTRY(118) , ENTRY(119) , + ENTRY(120) , ENTRY(121) , ENTRY(122) , ENTRY(123) , + ENTRY(124) , ENTRY(125) , ENTRY(126) , ENTRY(127) , + ENTRY(128) , ENTRY(129) , ENTRY(130) , ENTRY(131) , + ENTRY(132) , ENTRY(133) , ENTRY(134) , ENTRY(135) , + ENTRY(136) , ENTRY(137) , ENTRY(138) , ENTRY(139) , + ENTRY(140) , ENTRY(141) , ENTRY(142) , ENTRY(143) , + ENTRY(144) , ENTRY(145) , ENTRY(146) , ENTRY(147) , + ENTRY(148) , ENTRY(149) , ENTRY(150) , ENTRY(151) , + ENTRY(152) , ENTRY(153) , ENTRY(154) , ENTRY(155) , + ENTRY(156) , ENTRY(157) , ENTRY(158) , ENTRY(159) , + ENTRY(160) , ENTRY(161) , ENTRY(162) , ENTRY(163) , + ENTRY(164) , ENTRY(165) , ENTRY(166) , ENTRY(167) , + ENTRY(168) , ENTRY(169) , ENTRY(170) , ENTRY(171) , + ENTRY(172) , ENTRY(173) , ENTRY(174) , ENTRY(175) , + ENTRY(176) , ENTRY(177) , ENTRY(178) , ENTRY(179) , + ENTRY(180) , ENTRY(181) , ENTRY(182) , ENTRY(183) , + ENTRY(184) , ENTRY(185) , ENTRY(186) , ENTRY(187) , + ENTRY(188) , ENTRY(189) , ENTRY(190) , ENTRY(191) , + ENTRY(192) , ENTRY(193) , ENTRY(194) , ENTRY(195) , + ENTRY(196) , ENTRY(197) , ENTRY(198) , ENTRY(199) , + ENTRY(200) , ENTRY(201) , ENTRY(202) , ENTRY(203) , + ENTRY(204) , ENTRY(205) , ENTRY(206) , ENTRY(207) , + ENTRY(208) , ENTRY(209) , ENTRY(210) , ENTRY(211) , + ENTRY(212) , ENTRY(213) , ENTRY(214) , ENTRY(215) , + ENTRY(216) , ENTRY(217) , ENTRY(218) , ENTRY(219) , + ENTRY(220) , ENTRY(221) , ENTRY(222) , ENTRY(223) , + ENTRY(224) , ENTRY(225) , ENTRY(226) , ENTRY(227) , + ENTRY(228) , ENTRY(229) , ENTRY(230) , ENTRY(231) , + ENTRY(232) , ENTRY(233) , ENTRY(234) , ENTRY(235) , + ENTRY(236) , ENTRY(237) , ENTRY(238) , ENTRY(239) , + ENTRY(240) , ENTRY(241) , ENTRY(242) , ENTRY(243) , + ENTRY(244) , ENTRY(245) , ENTRY(246) , ENTRY(247) , + ENTRY(248) , ENTRY(249) , ENTRY(250) , ENTRY(251) , + ENTRY(252) , ENTRY(253) , ENTRY(254) , ENTRY(255) , #undef ENTRY - - /* End of block */ - HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, - - /* Lengths */ -#define ENTRY(length_base, num_extra_bits) \ - (((u32)(length_base) << 16) | (num_extra_bits)) - 
ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), - ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), - ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), - ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), - ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), - ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), - ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), - ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , + + /* End of block */ + HUFFDEC_EXCEPTIONAL | HUFFDEC_END_OF_BLOCK, + + /* Lengths */ +#define ENTRY(length_base, num_extra_bits) \ +(((u32)(length_base) << 16) | (num_extra_bits)) + ENTRY(3 , 0) , ENTRY(4 , 0) , ENTRY(5 , 0) , ENTRY(6 , 0), + ENTRY(7 , 0) , ENTRY(8 , 0) , ENTRY(9 , 0) , ENTRY(10 , 0), + ENTRY(11 , 1) , ENTRY(13 , 1) , ENTRY(15 , 1) , ENTRY(17 , 1), + ENTRY(19 , 2) , ENTRY(23 , 2) , ENTRY(27 , 2) , ENTRY(31 , 2), + ENTRY(35 , 3) , ENTRY(43 , 3) , ENTRY(51 , 3) , ENTRY(59 , 3), + ENTRY(67 , 4) , ENTRY(83 , 4) , ENTRY(99 , 4) , ENTRY(115, 4), + ENTRY(131, 5) , ENTRY(163, 5) , ENTRY(195, 5) , ENTRY(227, 5), + ENTRY(258, 0) , ENTRY(258, 0) , ENTRY(258, 0) , #undef ENTRY }; /* Maximum number of bits that can be consumed by decoding a match offset */ -#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS) -#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ - DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXBITS (DEFLATE_MAX_OFFSET_CODEWORD_LEN + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS) +#define OFFSET_MAXFASTBITS (OFFSET_TABLEBITS /* no subtable needed */ + \ +DEFLATE_MAX_EXTRA_OFFSET_BITS) /* * Here is the format of our offset decode table entries. Bits not explicitly * described contain zeroes: * - * Offsets: - * Bit 31-16: offset base value - * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) - * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) - * Bit 11-8: remaining codeword length - * Bit 4-0: remaining codeword length + number of extra bits - * Subtable pointer: - * Bit 31-16: index of start of subtable - * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) - * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) - * Bit 11-8: number of subtable bits - * Bit 3-0: number of main table bits + * Offsets: + * Bit 31-16: offset base value + * Bit 15: 0 (!HUFFDEC_EXCEPTIONAL) + * Bit 14: 0 (!HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: remaining codeword length + * Bit 4-0: remaining codeword length + number of extra bits + * Subtable pointer: + * Bit 31-16: index of start of subtable + * Bit 15: 1 (HUFFDEC_EXCEPTIONAL) + * Bit 14: 1 (HUFFDEC_SUBTABLE_POINTER) + * Bit 11-8: number of subtable bits + * Bit 3-0: number of main table bits * * These work the same way as the length entries and subtable pointer entries in * the litlen decode table; see litlen_decode_results[] above. 
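
As a quick worked example of consuming one of these entries (a sketch; the ENTRY macro below repeats the static layout with the base in bits 31-16 and the extra-bit count in the low bits, while the surrounding main() is hypothetical): the slot with base 257 carries 7 extra bits, so an extra-bit value of 12 read from the bitstream decodes to offset 257 + 12 = 269.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Static-entry layout described above: offset base in bits 31-16, number of
 * extra bits in the low bits (the codeword length is merged in later by
 * make_decode_table_entry()). */
#define ENTRY(offset_base, num_extra_bits) \
    (((u32)(offset_base) << 16) | (num_extra_bits))

int main(void)
{
    u32 entry = ENTRY(257, 7); /* slot covering offsets 257..384 */
    u32 extra = 12;            /* pretend these 7 bits came from the bitstream */
    u32 offset = (entry >> 16) + extra;

    printf("base=%u extra_bits=%u offset=%u\n",
           (unsigned)(entry >> 16), (unsigned)(entry & 0x1F),
           (unsigned)offset);
    return 0;
}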
*/ static const u32 offset_decode_results[] = { -#define ENTRY(offset_base, num_extra_bits) \ - (((u32)(offset_base) << 16) | (num_extra_bits)) - ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , - ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , - ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , - ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , - ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , - ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , - ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , - ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , +#define ENTRY(offset_base, num_extra_bits) \ +(((u32)(offset_base) << 16) | (num_extra_bits)) + ENTRY(1 , 0) , ENTRY(2 , 0) , ENTRY(3 , 0) , ENTRY(4 , 0) , + ENTRY(5 , 1) , ENTRY(7 , 1) , ENTRY(9 , 2) , ENTRY(13 , 2) , + ENTRY(17 , 3) , ENTRY(25 , 3) , ENTRY(33 , 4) , ENTRY(49 , 4) , + ENTRY(65 , 5) , ENTRY(97 , 5) , ENTRY(129 , 6) , ENTRY(193 , 6) , + ENTRY(257 , 7) , ENTRY(385 , 7) , ENTRY(513 , 8) , ENTRY(769 , 8) , + ENTRY(1025 , 9) , ENTRY(1537 , 9) , ENTRY(2049 , 10) , ENTRY(3073 , 10) , + ENTRY(4097 , 11) , ENTRY(6145 , 11) , ENTRY(8193 , 12) , ENTRY(12289 , 12) , + ENTRY(16385 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , ENTRY(24577 , 13) , #undef ENTRY }; @@ -640,40 +640,40 @@ static const u32 offset_decode_results[] = { * are decoded without an intervening dynamic block, even across streams. */ struct libdeflate_decompressor { - - /* - * The arrays aren't all needed at the same time. 'precode_lens' and - * 'precode_decode_table' are unneeded after 'lens' has been filled. - * Furthermore, 'lens' need not be retained after building the litlen - * and offset decode tables. In fact, 'lens' can be in union with - * 'litlen_decode_table' provided that 'offset_decode_table' is separate - * and is built first. - */ - - union { - u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; - - struct { - u8 lens[DEFLATE_NUM_LITLEN_SYMS + - DEFLATE_NUM_OFFSET_SYMS + - DEFLATE_MAX_LENS_OVERRUN]; - - u32 precode_decode_table[PRECODE_ENOUGH]; - } l; - - u32 litlen_decode_table[LITLEN_ENOUGH]; - } u; - - u32 offset_decode_table[OFFSET_ENOUGH]; - - /* used only during build_decode_table() */ - u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; - - bool static_codes_loaded; - unsigned litlen_tablebits; - - /* The free() function for this struct, chosen at allocation time */ - free_func_t free_func; + + /* + * The arrays aren't all needed at the same time. 'precode_lens' and + * 'precode_decode_table' are unneeded after 'lens' has been filled. + * Furthermore, 'lens' need not be retained after building the litlen + * and offset decode tables. In fact, 'lens' can be in union with + * 'litlen_decode_table' provided that 'offset_decode_table' is separate + * and is built first. + */ + + union { + u8 precode_lens[DEFLATE_NUM_PRECODE_SYMS]; + + struct { + u8 lens[DEFLATE_NUM_LITLEN_SYMS + + DEFLATE_NUM_OFFSET_SYMS + + DEFLATE_MAX_LENS_OVERRUN]; + + u32 precode_decode_table[PRECODE_ENOUGH]; + } l; + + u32 litlen_decode_table[LITLEN_ENOUGH]; + } u; + + u32 offset_decode_table[OFFSET_ENOUGH]; + + /* used only during build_decode_table() */ + u16 sorted_syms[DEFLATE_MAX_NUM_SYMS]; + + bool static_codes_loaded; + unsigned litlen_tablebits; + + /* The free() function for this struct, chosen at allocation time */ + free_func_t free_func; }; /* @@ -686,383 +686,383 @@ struct libdeflate_decompressor { * Huffman codes in DEFLATE. 
* * @decode_table - * The array in which the decode table will be generated. This array must - * have sufficient length; see the definition of the ENOUGH numbers. + * The array in which the decode table will be generated. This array must + * have sufficient length; see the definition of the ENOUGH numbers. * @lens - * An array which provides, for each symbol, the length of the - * corresponding codeword in bits, or 0 if the symbol is unused. This may - * alias @decode_table, since nothing is written to @decode_table until all - * @lens have been consumed. All codeword lengths are assumed to be <= - * @max_codeword_len but are otherwise considered untrusted. If they do - * not form a valid Huffman code, then the decode table is not built and - * %false is returned. + * An array which provides, for each symbol, the length of the + * corresponding codeword in bits, or 0 if the symbol is unused. This may + * alias @decode_table, since nothing is written to @decode_table until all + * @lens have been consumed. All codeword lengths are assumed to be <= + * @max_codeword_len but are otherwise considered untrusted. If they do + * not form a valid Huffman code, then the decode table is not built and + * %false is returned. * @num_syms - * The number of symbols in the code, including all unused symbols. + * The number of symbols in the code, including all unused symbols. * @decode_results - * An array which gives the incomplete decode result for each symbol. The - * needed values in this array will be combined with codeword lengths to - * make the final decode table entries using make_decode_table_entry(). + * An array which gives the incomplete decode result for each symbol. The + * needed values in this array will be combined with codeword lengths to + * make the final decode table entries using make_decode_table_entry(). * @table_bits - * The log base-2 of the number of main table entries to use. - * If @table_bits_ret != NULL, then @table_bits is treated as a maximum - * value and it will be decreased if a smaller table would be sufficient. + * The log base-2 of the number of main table entries to use. + * If @table_bits_ret != NULL, then @table_bits is treated as a maximum + * value and it will be decreased if a smaller table would be sufficient. * @max_codeword_len - * The maximum allowed codeword length for this Huffman code. - * Must be <= DEFLATE_MAX_CODEWORD_LEN. + * The maximum allowed codeword length for this Huffman code. + * Must be <= DEFLATE_MAX_CODEWORD_LEN. * @sorted_syms - * A temporary array of length @num_syms. + * A temporary array of length @num_syms. * @table_bits_ret - * If non-NULL, then the dynamic table_bits is enabled, and the actual - * table_bits value will be returned here. + * If non-NULL, then the dynamic table_bits is enabled, and the actual + * table_bits value will be returned here. * * Returns %true if successful; %false if the codeword lengths do not form a * valid Huffman code. 
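
One point worth illustrating before the function body is the validity condition it describes: the codeword lengths must exactly fill the codespace, a Kraft-style check. The standalone sketch below applies the same rule to classify a length array as incomplete, complete, or overfull; check_code() is a hypothetical helper written for clarity, whereas the real function folds this computation into its sorting pass.

#include <stdint.h>
#include <stdio.h>

/*
 * A codeword of length 'len' consumes 2^(max_len - len) units out of a total
 * codespace of 2^max_len.  Returns -1 for an incomplete code, 0 for a
 * complete code, and 1 for an overfull (invalid) code.
 */
static int check_code(const uint8_t lens[], unsigned num_syms, unsigned max_len)
{
    unsigned len_counts[16] = { 0 };  /* max_len is at most 15 in DEFLATE */
    uint32_t codespace_used = 0;

    for (unsigned sym = 0; sym < num_syms; sym++)
        len_counts[lens[sym]]++;
    for (unsigned len = 1; len <= max_len; len++)
        codespace_used = (codespace_used << 1) + len_counts[len];

    if (codespace_used > (1U << max_len))
        return 1;
    if (codespace_used < (1U << max_len))
        return -1;
    return 0;
}

int main(void)
{
    const uint8_t complete[] = { 2, 2, 2, 3, 3 };  /* exactly fills the codespace */
    const uint8_t overfull[] = { 1, 1, 1 };        /* overflows it */

    printf("%d %d\n", check_code(complete, 5, 3), check_code(overfull, 3, 3));
    return 0;
}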
*/ static bool build_decode_table(u32 decode_table[], - const u8 lens[], - const unsigned num_syms, - const u32 decode_results[], - unsigned table_bits, - unsigned max_codeword_len, - u16 *sorted_syms, - unsigned *table_bits_ret) + const u8 lens[], + const unsigned num_syms, + const u32 decode_results[], + unsigned table_bits, + unsigned max_codeword_len, + u16 *sorted_syms, + unsigned *table_bits_ret) { - unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; - unsigned sym; /* current symbol */ - unsigned codeword; /* current codeword, bit-reversed */ - unsigned len; /* current codeword length in bits */ - unsigned count; /* num codewords remaining with this length */ - u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ - unsigned cur_table_end; /* end index of current table */ - unsigned subtable_prefix; /* codeword prefix of current subtable */ - unsigned subtable_start; /* start index of current subtable */ - unsigned subtable_bits; /* log2 of current subtable length */ - - /* Count how many codewords have each length, including 0. */ - for (len = 0; len <= max_codeword_len; len++) - len_counts[len] = 0; - for (sym = 0; sym < num_syms; sym++) - len_counts[lens[sym]]++; - - /* - * Determine the actual maximum codeword length that was used, and - * decrease table_bits to it if allowed. - */ - while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) - max_codeword_len--; - if (table_bits_ret != NULL) { - table_bits = MIN(table_bits, max_codeword_len); - *table_bits_ret = table_bits; - } - - /* - * Sort the symbols primarily by increasing codeword length and - * secondarily by increasing symbol value; or equivalently by their - * codewords in lexicographic order, since a canonical code is assumed. - * - * For efficiency, also compute 'codespace_used' in the same pass over - * 'len_counts[]' used to build 'offsets[]' for sorting. - */ - - /* Ensure that 'codespace_used' cannot overflow. */ - STATIC_ASSERT(sizeof(codespace_used) == 4); - STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= - DEFLATE_MAX_NUM_SYMS); - - offsets[0] = 0; - offsets[1] = len_counts[0]; - codespace_used = 0; - for (len = 1; len < max_codeword_len; len++) { - offsets[len + 1] = offsets[len] + len_counts[len]; - codespace_used = (codespace_used << 1) + len_counts[len]; - } - codespace_used = (codespace_used << 1) + len_counts[len]; - - for (sym = 0; sym < num_syms; sym++) - sorted_syms[offsets[lens[sym]]++] = sym; - - sorted_syms += offsets[0]; /* Skip unused symbols */ - - /* lens[] is done being used, so we can write to decode_table[] now. */ - - /* - * Check whether the lengths form a complete code (exactly fills the - * codespace), an incomplete code (doesn't fill the codespace), or an - * overfull code (overflows the codespace). A codeword of length 'n' - * uses proportion '1/(2^n)' of the codespace. An overfull code is - * nonsensical, so is considered invalid. An incomplete code is - * considered valid only in two specific cases; see below. - */ - - /* overfull code? */ - if (unlikely(codespace_used > (1U << max_codeword_len))) - return false; - - /* incomplete code? */ - if (unlikely(codespace_used < (1U << max_codeword_len))) { - u32 entry; - unsigned i; - - /* - * The DEFLATE RFC explicitly allows the offset code to be - * incomplete in two cases: a code containing just 1 codeword, - * if that codeword has length 1; and a code containing no - * codewords. 
Note: the list of offset codeword lengths is - * always nonempty, but lengths of 0 don't count as codewords. - * - * The RFC doesn't say whether the same cases are allowed for - * the litlen and pre codes. It's actually impossible for no - * symbols to be used from these codes; however, it's - * technically possible for only one symbol to be used. zlib - * allows 1 codeword for the litlen code, but not the precode. - * The RFC also doesn't say whether, when there is 1 codeword, - * that codeword is '0' or '1'. zlib uses '0'. - * - * We accept what zlib accepts, plus a bit more. First, we - * don't treat the precode more strictly than the litlen and - * offset codes. There's no convincing reason to add a special - * case for the precode here. - * - * Second, we just map each allowed incompete code to a complete - * code with only real symbols. To do this, we choose a symbol, - * either the used symbol (for codes with 1 codeword) or an - * arbitrary symbol (for empty codes), and give it both - * codewords '0' and '1'. zlib instead uses a special ERROR - * symbol in the part of the codespace the code doesn't use. - * However, having an ERROR symbol reduces the performance of - * the Huffman decoder, for no real benefit. Our approach also - * avoids having to decide whether '0' or '1' is correct. - * - * Like zlib, we still reject all incomplete codes that contain - * more than 1 codeword or a codeword length greater than 1. - */ - if (codespace_used == 0) { - sym = 0; /* arbitrary */ - } else { - if (codespace_used != (1U << (max_codeword_len - 1)) || - len_counts[1] != 1) - return false; - sym = sorted_syms[0]; - } - entry = make_decode_table_entry(decode_results, sym, 1); - for (i = 0; i < (1U << table_bits); i++) - decode_table[i] = entry; - return true; - } - - /* - * The lengths form a complete code. Now, enumerate the codewords in - * lexicographic order and fill the decode table entries for each one. - * - * First, process all codewords with len <= table_bits. Each one gets - * '2^(table_bits-len)' direct entries in the table. - * - * Since DEFLATE uses bit-reversed codewords, these entries aren't - * consecutive but rather are spaced '2^len' entries apart. This makes - * filling them naively somewhat awkward and inefficient, since strided - * stores are less cache-friendly and preclude the use of word or - * vector-at-a-time stores to fill multiple entries per instruction. - * - * To optimize this, we incrementally double the table size. When - * processing codewords with length 'len', the table is treated as - * having only '2^len' entries, so each codeword uses just one entry. - * Then, each time 'len' is incremented, the table size is doubled and - * the first half is copied to the second half. This significantly - * improves performance over naively doing strided stores. - * - * Note that some entries copied for each table doubling may not have - * been initialized yet, but it doesn't matter since they're guaranteed - * to be initialized later (because the Huffman code is complete). - */ - codeword = 0; - len = 1; - while ((count = len_counts[len]) == 0) - len++; - cur_table_end = 1U << len; - while (len <= table_bits) { - /* Process all 'count' codewords with length 'len' bits. */ - do { - unsigned bit; - - /* Fill the first entry for the current codeword. 
*/ - decode_table[codeword] = - make_decode_table_entry(decode_results, - *sorted_syms++, len); - - if (codeword == cur_table_end - 1) { - /* Last codeword (all 1's) */ - for (; len < table_bits; len++) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * - sizeof(decode_table[0])); - cur_table_end <<= 1; - } - return true; - } - /* - * To advance to the lexicographically next codeword in - * the canonical code, the codeword must be incremented, - * then 0's must be appended to the codeword as needed - * to match the next codeword's length. - * - * Since the codeword is bit-reversed, appending 0's is - * a no-op. However, incrementing it is nontrivial. To - * do so efficiently, use the 'bsr' instruction to find - * the last (highest order) 0 bit in the codeword, set - * it, and clear any later (higher order) 1 bits. But - * 'bsr' actually finds the highest order 1 bit, so to - * use it first flip all bits in the codeword by XOR'ing - * it with (1U << len) - 1 == cur_table_end - 1. - */ - bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); - codeword &= bit - 1; - codeword |= bit; - } while (--count); - - /* Advance to the next codeword length. */ - do { - if (++len <= table_bits) { - memcpy(&decode_table[cur_table_end], - decode_table, - cur_table_end * sizeof(decode_table[0])); - cur_table_end <<= 1; - } - } while ((count = len_counts[len]) == 0); - } - - /* Process codewords with len > table_bits. These require subtables. */ - cur_table_end = 1U << table_bits; - subtable_prefix = -1; - subtable_start = 0; - for (;;) { - u32 entry; - unsigned i; - unsigned stride; - unsigned bit; - - /* - * Start a new subtable if the first 'table_bits' bits of the - * codeword don't match the prefix of the current subtable. - */ - if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { - subtable_prefix = (codeword & ((1U << table_bits) - 1)); - subtable_start = cur_table_end; - /* - * Calculate the subtable length. If the codeword has - * length 'table_bits + n', then the subtable needs - * '2^n' entries. But it may need more; if fewer than - * '2^n' codewords of length 'table_bits + n' remain, - * then the length will need to be incremented to bring - * in longer codewords until the subtable can be - * completely filled. Note that because the Huffman - * code is complete, it will always be possible to fill - * the subtable eventually. - */ - subtable_bits = len - table_bits; - codespace_used = count; - while (codespace_used < (1U << subtable_bits)) { - subtable_bits++; - codespace_used = (codespace_used << 1) + - len_counts[table_bits + subtable_bits]; - } - cur_table_end = subtable_start + (1U << subtable_bits); - - /* - * Create the entry that points from the main table to - * the subtable. - */ - decode_table[subtable_prefix] = - ((u32)subtable_start << 16) | - HUFFDEC_EXCEPTIONAL | - HUFFDEC_SUBTABLE_POINTER | - (subtable_bits << 8) | table_bits; - } - - /* Fill the subtable entries for the current codeword. */ - entry = make_decode_table_entry(decode_results, *sorted_syms++, - len - table_bits); - i = subtable_start + (codeword >> table_bits); - stride = 1U << (len - table_bits); - do { - decode_table[i] = entry; - i += stride; - } while (i < cur_table_end); - - /* Advance to the next codeword. */ - if (codeword == (1U << len) - 1) /* last codeword (all 1's)? 
*/ - return true; - bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); - codeword &= bit - 1; - codeword |= bit; - count--; - while (count == 0) - count = len_counts[++len]; - } + unsigned len_counts[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned offsets[DEFLATE_MAX_CODEWORD_LEN + 1]; + unsigned sym; /* current symbol */ + unsigned codeword; /* current codeword, bit-reversed */ + unsigned len; /* current codeword length in bits */ + unsigned count; /* num codewords remaining with this length */ + u32 codespace_used; /* codespace used out of '2^max_codeword_len' */ + unsigned cur_table_end; /* end index of current table */ + unsigned subtable_prefix; /* codeword prefix of current subtable */ + unsigned subtable_start; /* start index of current subtable */ + unsigned subtable_bits; /* log2 of current subtable length */ + + /* Count how many codewords have each length, including 0. */ + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + for (sym = 0; sym < num_syms; sym++) + len_counts[lens[sym]]++; + + /* + * Determine the actual maximum codeword length that was used, and + * decrease table_bits to it if allowed. + */ + while (max_codeword_len > 1 && len_counts[max_codeword_len] == 0) + max_codeword_len--; + if (table_bits_ret != NULL) { + table_bits = MIN(table_bits, max_codeword_len); + *table_bits_ret = table_bits; + } + + /* + * Sort the symbols primarily by increasing codeword length and + * secondarily by increasing symbol value; or equivalently by their + * codewords in lexicographic order, since a canonical code is assumed. + * + * For efficiency, also compute 'codespace_used' in the same pass over + * 'len_counts[]' used to build 'offsets[]' for sorting. + */ + + /* Ensure that 'codespace_used' cannot overflow. */ + STATIC_ASSERT(sizeof(codespace_used) == 4); + STATIC_ASSERT(UINT32_MAX / (1U << (DEFLATE_MAX_CODEWORD_LEN - 1)) >= + DEFLATE_MAX_NUM_SYMS); + + offsets[0] = 0; + offsets[1] = len_counts[0]; + codespace_used = 0; + for (len = 1; len < max_codeword_len; len++) { + offsets[len + 1] = offsets[len] + len_counts[len]; + codespace_used = (codespace_used << 1) + len_counts[len]; + } + codespace_used = (codespace_used << 1) + len_counts[len]; + + for (sym = 0; sym < num_syms; sym++) + sorted_syms[offsets[lens[sym]]++] = sym; + + sorted_syms += offsets[0]; /* Skip unused symbols */ + + /* lens[] is done being used, so we can write to decode_table[] now. */ + + /* + * Check whether the lengths form a complete code (exactly fills the + * codespace), an incomplete code (doesn't fill the codespace), or an + * overfull code (overflows the codespace). A codeword of length 'n' + * uses proportion '1/(2^n)' of the codespace. An overfull code is + * nonsensical, so is considered invalid. An incomplete code is + * considered valid only in two specific cases; see below. + */ + + /* overfull code? */ + if (unlikely(codespace_used > (1U << max_codeword_len))) + return false; + + /* incomplete code? */ + if (unlikely(codespace_used < (1U << max_codeword_len))) { + u32 entry; + unsigned i; + + /* + * The DEFLATE RFC explicitly allows the offset code to be + * incomplete in two cases: a code containing just 1 codeword, + * if that codeword has length 1; and a code containing no + * codewords. Note: the list of offset codeword lengths is + * always nonempty, but lengths of 0 don't count as codewords. + * + * The RFC doesn't say whether the same cases are allowed for + * the litlen and pre codes. 
It's actually impossible for no + * symbols to be used from these codes; however, it's + * technically possible for only one symbol to be used. zlib + * allows 1 codeword for the litlen code, but not the precode. + * The RFC also doesn't say whether, when there is 1 codeword, + * that codeword is '0' or '1'. zlib uses '0'. + * + * We accept what zlib accepts, plus a bit more. First, we + * don't treat the precode more strictly than the litlen and + * offset codes. There's no convincing reason to add a special + * case for the precode here. + * + * Second, we just map each allowed incompete code to a complete + * code with only real symbols. To do this, we choose a symbol, + * either the used symbol (for codes with 1 codeword) or an + * arbitrary symbol (for empty codes), and give it both + * codewords '0' and '1'. zlib instead uses a special ERROR + * symbol in the part of the codespace the code doesn't use. + * However, having an ERROR symbol reduces the performance of + * the Huffman decoder, for no real benefit. Our approach also + * avoids having to decide whether '0' or '1' is correct. + * + * Like zlib, we still reject all incomplete codes that contain + * more than 1 codeword or a codeword length greater than 1. + */ + if (codespace_used == 0) { + sym = 0; /* arbitrary */ + } else { + if (codespace_used != (1U << (max_codeword_len - 1)) || + len_counts[1] != 1) + return false; + sym = sorted_syms[0]; + } + entry = make_decode_table_entry(decode_results, sym, 1); + for (i = 0; i < (1U << table_bits); i++) + decode_table[i] = entry; + return true; + } + + /* + * The lengths form a complete code. Now, enumerate the codewords in + * lexicographic order and fill the decode table entries for each one. + * + * First, process all codewords with len <= table_bits. Each one gets + * '2^(table_bits-len)' direct entries in the table. + * + * Since DEFLATE uses bit-reversed codewords, these entries aren't + * consecutive but rather are spaced '2^len' entries apart. This makes + * filling them naively somewhat awkward and inefficient, since strided + * stores are less cache-friendly and preclude the use of word or + * vector-at-a-time stores to fill multiple entries per instruction. + * + * To optimize this, we incrementally double the table size. When + * processing codewords with length 'len', the table is treated as + * having only '2^len' entries, so each codeword uses just one entry. + * Then, each time 'len' is incremented, the table size is doubled and + * the first half is copied to the second half. This significantly + * improves performance over naively doing strided stores. + * + * Note that some entries copied for each table doubling may not have + * been initialized yet, but it doesn't matter since they're guaranteed + * to be initialized later (because the Huffman code is complete). + */ + codeword = 0; + len = 1; + while ((count = len_counts[len]) == 0) + len++; + cur_table_end = 1U << len; + while (len <= table_bits) { + /* Process all 'count' codewords with length 'len' bits. */ + do { + unsigned bit; + + /* Fill the first entry for the current codeword. 
*/ + decode_table[codeword] = + make_decode_table_entry(decode_results, + *sorted_syms++, len); + + if (codeword == cur_table_end - 1) { + /* Last codeword (all 1's) */ + for (; len < table_bits; len++) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * + sizeof(decode_table[0])); + cur_table_end <<= 1; + } + return true; + } + /* + * To advance to the lexicographically next codeword in + * the canonical code, the codeword must be incremented, + * then 0's must be appended to the codeword as needed + * to match the next codeword's length. + * + * Since the codeword is bit-reversed, appending 0's is + * a no-op. However, incrementing it is nontrivial. To + * do so efficiently, use the 'bsr' instruction to find + * the last (highest order) 0 bit in the codeword, set + * it, and clear any later (higher order) 1 bits. But + * 'bsr' actually finds the highest order 1 bit, so to + * use it first flip all bits in the codeword by XOR'ing + * it with (1U << len) - 1 == cur_table_end - 1. + */ + bit = 1U << bsr32(codeword ^ (cur_table_end - 1)); + codeword &= bit - 1; + codeword |= bit; + } while (--count); + + /* Advance to the next codeword length. */ + do { + if (++len <= table_bits) { + memcpy(&decode_table[cur_table_end], + decode_table, + cur_table_end * sizeof(decode_table[0])); + cur_table_end <<= 1; + } + } while ((count = len_counts[len]) == 0); + } + + /* Process codewords with len > table_bits. These require subtables. */ + cur_table_end = 1U << table_bits; + subtable_prefix = -1; + subtable_start = 0; + for (;;) { + u32 entry; + unsigned i; + unsigned stride; + unsigned bit; + + /* + * Start a new subtable if the first 'table_bits' bits of the + * codeword don't match the prefix of the current subtable. + */ + if ((codeword & ((1U << table_bits) - 1)) != subtable_prefix) { + subtable_prefix = (codeword & ((1U << table_bits) - 1)); + subtable_start = cur_table_end; + /* + * Calculate the subtable length. If the codeword has + * length 'table_bits + n', then the subtable needs + * '2^n' entries. But it may need more; if fewer than + * '2^n' codewords of length 'table_bits + n' remain, + * then the length will need to be incremented to bring + * in longer codewords until the subtable can be + * completely filled. Note that because the Huffman + * code is complete, it will always be possible to fill + * the subtable eventually. + */ + subtable_bits = len - table_bits; + codespace_used = count; + while (codespace_used < (1U << subtable_bits)) { + subtable_bits++; + codespace_used = (codespace_used << 1) + + len_counts[table_bits + subtable_bits]; + } + cur_table_end = subtable_start + (1U << subtable_bits); + + /* + * Create the entry that points from the main table to + * the subtable. + */ + decode_table[subtable_prefix] = + ((u32)subtable_start << 16) | + HUFFDEC_EXCEPTIONAL | + HUFFDEC_SUBTABLE_POINTER | + (subtable_bits << 8) | table_bits; + } + + /* Fill the subtable entries for the current codeword. */ + entry = make_decode_table_entry(decode_results, *sorted_syms++, + len - table_bits); + i = subtable_start + (codeword >> table_bits); + stride = 1U << (len - table_bits); + do { + decode_table[i] = entry; + i += stride; + } while (i < cur_table_end); + + /* Advance to the next codeword. */ + if (codeword == (1U << len) - 1) /* last codeword (all 1's)? 
*/ + return true; + bit = 1U << bsr32(codeword ^ ((1U << len) - 1)); + codeword &= bit - 1; + codeword |= bit; + count--; + while (count == 0) + count = len_counts[++len]; + } } /* Build the decode table for the precode. */ static bool build_precode_decode_table(struct libdeflate_decompressor *d) { - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); - - STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == - DEFLATE_NUM_PRECODE_SYMS); - - return build_decode_table(d->u.l.precode_decode_table, - d->u.precode_lens, - DEFLATE_NUM_PRECODE_SYMS, - precode_decode_results, - PRECODE_TABLEBITS, - DEFLATE_MAX_PRE_CODEWORD_LEN, - d->sorted_syms, - NULL); + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(PRECODE_TABLEBITS == 7 && PRECODE_ENOUGH == 128); + + STATIC_ASSERT(ARRAY_LEN(precode_decode_results) == + DEFLATE_NUM_PRECODE_SYMS); + + return build_decode_table(d->u.l.precode_decode_table, + d->u.precode_lens, + DEFLATE_NUM_PRECODE_SYMS, + precode_decode_results, + PRECODE_TABLEBITS, + DEFLATE_MAX_PRE_CODEWORD_LEN, + d->sorted_syms, + NULL); } /* Build the decode table for the literal/length code. */ static bool build_litlen_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) + unsigned num_litlen_syms, unsigned num_offset_syms) { - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); - - STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == - DEFLATE_NUM_LITLEN_SYMS); - - return build_decode_table(d->u.litlen_decode_table, - d->u.l.lens, - num_litlen_syms, - litlen_decode_results, - LITLEN_TABLEBITS, - DEFLATE_MAX_LITLEN_CODEWORD_LEN, - d->sorted_syms, - &d->litlen_tablebits); + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ + STATIC_ASSERT(LITLEN_TABLEBITS == 11 && LITLEN_ENOUGH == 2342); + + STATIC_ASSERT(ARRAY_LEN(litlen_decode_results) == + DEFLATE_NUM_LITLEN_SYMS); + + return build_decode_table(d->u.litlen_decode_table, + d->u.l.lens, + num_litlen_syms, + litlen_decode_results, + LITLEN_TABLEBITS, + DEFLATE_MAX_LITLEN_CODEWORD_LEN, + d->sorted_syms, + &d->litlen_tablebits); } /* Build the decode table for the offset code. */ static bool build_offset_decode_table(struct libdeflate_decompressor *d, - unsigned num_litlen_syms, unsigned num_offset_syms) + unsigned num_litlen_syms, unsigned num_offset_syms) { - /* When you change TABLEBITS, you must change ENOUGH, and vice versa! */ - STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); - - STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == - DEFLATE_NUM_OFFSET_SYMS); - - return build_decode_table(d->offset_decode_table, - d->u.l.lens + num_litlen_syms, - num_offset_syms, - offset_decode_results, - OFFSET_TABLEBITS, - DEFLATE_MAX_OFFSET_CODEWORD_LEN, - d->sorted_syms, - NULL); + /* When you change TABLEBITS, you must change ENOUGH, and vice versa! 
*/ + STATIC_ASSERT(OFFSET_TABLEBITS == 8 && OFFSET_ENOUGH == 402); + + STATIC_ASSERT(ARRAY_LEN(offset_decode_results) == + DEFLATE_NUM_OFFSET_SYMS); + + return build_decode_table(d->offset_decode_table, + d->u.l.lens + num_litlen_syms, + num_offset_syms, + offset_decode_results, + OFFSET_TABLEBITS, + DEFLATE_MAX_OFFSET_CODEWORD_LEN, + d->sorted_syms, + NULL); } /***************************************************************************** @@ -1070,10 +1070,10 @@ build_offset_decode_table(struct libdeflate_decompressor *d, *****************************************************************************/ typedef enum libdeflate_result (*decompress_func_t) - (struct libdeflate_decompressor * restrict d, - const void * restrict in, size_t in_nbytes, - void * restrict out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); +(struct libdeflate_decompressor * restrict d, + const void * restrict in, size_t in_nbytes, + void * restrict out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); #define FUNCNAME deflate_decompress_default #undef ATTRIBUTES @@ -1095,27 +1095,27 @@ typedef enum libdeflate_result (*decompress_func_t) #ifdef arch_select_decompress_func static enum libdeflate_result dispatch_decomp(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret); static volatile decompress_func_t decompress_impl = dispatch_decomp; /* Choose the best implementation at runtime. */ static enum libdeflate_result dispatch_decomp(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, size_t *actual_out_nbytes_ret) { - decompress_func_t f = arch_select_decompress_func(); - - if (f == NULL) - f = DEFAULT_IMPL; - - decompress_impl = f; - return f(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); + decompress_func_t f = arch_select_decompress_func(); + + if (f == NULL) + f = DEFAULT_IMPL; + + decompress_impl = f; + return f(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); } #else /* The best implementation is statically known, so call it directly. 
*/ @@ -1132,77 +1132,77 @@ dispatch_decomp(struct libdeflate_decompressor *d, */ LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress_ex(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) { - return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, - actual_in_nbytes_ret, actual_out_nbytes_ret); + return decompress_impl(d, in, in_nbytes, out, out_nbytes_avail, + actual_in_nbytes_ret, actual_out_nbytes_ret); } LIBDEFLATEAPI enum libdeflate_result libdeflate_deflate_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) { - return libdeflate_deflate_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); + return libdeflate_deflate_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); } LIBDEFLATEAPI struct libdeflate_decompressor * libdeflate_alloc_decompressor_ex(const struct libdeflate_options *options) { - struct libdeflate_decompressor *d; - - /* - * Note: if more fields are added to libdeflate_options, this code will - * need to be updated to support both the old and new structs. - */ - if (options->sizeof_options != sizeof(*options)) - return NULL; - - d = (options->malloc_func ? options->malloc_func : - libdeflate_default_malloc_func)(sizeof(*d)); - if (d == NULL) - return NULL; - /* - * Note that only certain parts of the decompressor actually must be - * initialized here: - * - * - 'static_codes_loaded' must be initialized to false. - * - * - The first half of the main portion of each decode table must be - * initialized to any value, to avoid reading from uninitialized - * memory during table expansion in build_decode_table(). (Although, - * this is really just to avoid warnings with dynamic tools like - * valgrind, since build_decode_table() is guaranteed to initialize - * all entries eventually anyway.) - * - * - 'free_func' must be set. - * - * But for simplicity, we currently just zero the whole decompressor. - */ - memset(d, 0, sizeof(*d)); - d->free_func = options->free_func ? - options->free_func : libdeflate_default_free_func; - return d; + struct libdeflate_decompressor *d; + + /* + * Note: if more fields are added to libdeflate_options, this code will + * need to be updated to support both the old and new structs. + */ + if (options->sizeof_options != sizeof(*options)) + return NULL; + + d = (options->malloc_func ? options->malloc_func : + libdeflate_default_malloc_func)(sizeof(*d)); + if (d == NULL) + return NULL; + /* + * Note that only certain parts of the decompressor actually must be + * initialized here: + * + * - 'static_codes_loaded' must be initialized to false. + * + * - The first half of the main portion of each decode table must be + * initialized to any value, to avoid reading from uninitialized + * memory during table expansion in build_decode_table(). (Although, + * this is really just to avoid warnings with dynamic tools like + * valgrind, since build_decode_table() is guaranteed to initialize + * all entries eventually anyway.) + * + * - 'free_func' must be set. 
+ * + * But for simplicity, we currently just zero the whole decompressor. + */ + memset(d, 0, sizeof(*d)); + d->free_func = options->free_func ? + options->free_func : libdeflate_default_free_func; + return d; } LIBDEFLATEAPI struct libdeflate_decompressor * libdeflate_alloc_decompressor(void) { - static const struct libdeflate_options defaults = { - .sizeof_options = sizeof(defaults), - }; - return libdeflate_alloc_decompressor_ex(&defaults); + static const struct libdeflate_options defaults = { + .sizeof_options = sizeof(defaults), + }; + return libdeflate_alloc_decompressor_ex(&defaults); } LIBDEFLATEAPI void libdeflate_free_decompressor(struct libdeflate_decompressor *d) { - if (d) - d->free_func(d); + if (d) + d->free_func(d); } diff --git a/Sources/DEFLATE/gzip_compress.c b/Sources/DEFLATE/gzip_compress.c index b7d5076e..016e638d 100644 --- a/Sources/DEFLATE/gzip_compress.c +++ b/Sources/DEFLATE/gzip_compress.c @@ -30,61 +30,61 @@ LIBDEFLATEAPI size_t libdeflate_gzip_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) { - u8 *out_next = out; - unsigned compression_level; - u8 xfl; - size_t deflate_size; - - if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) - return 0; - - /* ID1 */ - *out_next++ = GZIP_ID1; - /* ID2 */ - *out_next++ = GZIP_ID2; - /* CM */ - *out_next++ = GZIP_CM_DEFLATE; - /* FLG */ - *out_next++ = 0; - /* MTIME */ - put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next); - out_next += 4; - /* XFL */ - xfl = 0; - compression_level = libdeflate_get_compression_level(c); - if (compression_level < 2) - xfl |= GZIP_XFL_FASTEST_COMPRESSION; - else if (compression_level >= 8) - xfl |= GZIP_XFL_SLOWEST_COMPRESSION; - *out_next++ = xfl; - /* OS */ - *out_next++ = GZIP_OS_UNKNOWN; /* OS */ - - /* Compressed data */ - deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, - out_nbytes_avail - GZIP_MIN_OVERHEAD); - if (deflate_size == 0) - return 0; - out_next += deflate_size; - - /* CRC32 */ - put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next); - out_next += 4; - - /* ISIZE */ - put_unaligned_le32((u32)in_nbytes, out_next); - out_next += 4; - - return out_next - (u8 *)out; + u8 *out_next = out; + unsigned compression_level; + u8 xfl; + size_t deflate_size; + + if (out_nbytes_avail <= GZIP_MIN_OVERHEAD) + return 0; + + /* ID1 */ + *out_next++ = GZIP_ID1; + /* ID2 */ + *out_next++ = GZIP_ID2; + /* CM */ + *out_next++ = GZIP_CM_DEFLATE; + /* FLG */ + *out_next++ = 0; + /* MTIME */ + put_unaligned_le32(GZIP_MTIME_UNAVAILABLE, out_next); + out_next += 4; + /* XFL */ + xfl = 0; + compression_level = libdeflate_get_compression_level(c); + if (compression_level < 2) + xfl |= GZIP_XFL_FASTEST_COMPRESSION; + else if (compression_level >= 8) + xfl |= GZIP_XFL_SLOWEST_COMPRESSION; + *out_next++ = xfl; + /* OS */ + *out_next++ = GZIP_OS_UNKNOWN; /* OS */ + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - GZIP_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* CRC32 */ + put_unaligned_le32(libdeflate_crc32(0, in, in_nbytes), out_next); + out_next += 4; + + /* ISIZE */ + put_unaligned_le32((u32)in_nbytes, out_next); + out_next += 4; + + return out_next - (u8 *)out; } LIBDEFLATEAPI size_t libdeflate_gzip_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) + size_t in_nbytes) { - return GZIP_MIN_OVERHEAD + - 
libdeflate_deflate_compress_bound(c, in_nbytes); + return GZIP_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); } diff --git a/Sources/DEFLATE/gzip_constants.h b/Sources/DEFLATE/gzip_constants.h index 35e4728d..24b100c7 100644 --- a/Sources/DEFLATE/gzip_constants.h +++ b/Sources/DEFLATE/gzip_constants.h @@ -5,41 +5,41 @@ #ifndef LIB_GZIP_CONSTANTS_H #define LIB_GZIP_CONSTANTS_H -#define GZIP_MIN_HEADER_SIZE 10 -#define GZIP_FOOTER_SIZE 8 -#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) - -#define GZIP_ID1 0x1F -#define GZIP_ID2 0x8B - -#define GZIP_CM_DEFLATE 8 - -#define GZIP_FTEXT 0x01 -#define GZIP_FHCRC 0x02 -#define GZIP_FEXTRA 0x04 -#define GZIP_FNAME 0x08 -#define GZIP_FCOMMENT 0x10 -#define GZIP_FRESERVED 0xE0 - -#define GZIP_MTIME_UNAVAILABLE 0 - -#define GZIP_XFL_SLOWEST_COMPRESSION 0x02 -#define GZIP_XFL_FASTEST_COMPRESSION 0x04 - -#define GZIP_OS_FAT 0 -#define GZIP_OS_AMIGA 1 -#define GZIP_OS_VMS 2 -#define GZIP_OS_UNIX 3 -#define GZIP_OS_VM_CMS 4 -#define GZIP_OS_ATARI_TOS 5 -#define GZIP_OS_HPFS 6 -#define GZIP_OS_MACINTOSH 7 -#define GZIP_OS_Z_SYSTEM 8 -#define GZIP_OS_CP_M 9 -#define GZIP_OS_TOPS_20 10 -#define GZIP_OS_NTFS 11 -#define GZIP_OS_QDOS 12 -#define GZIP_OS_RISCOS 13 -#define GZIP_OS_UNKNOWN 255 +#define GZIP_MIN_HEADER_SIZE 10 +#define GZIP_FOOTER_SIZE 8 +#define GZIP_MIN_OVERHEAD (GZIP_MIN_HEADER_SIZE + GZIP_FOOTER_SIZE) + +#define GZIP_ID1 0x1F +#define GZIP_ID2 0x8B + +#define GZIP_CM_DEFLATE 8 + +#define GZIP_FTEXT 0x01 +#define GZIP_FHCRC 0x02 +#define GZIP_FEXTRA 0x04 +#define GZIP_FNAME 0x08 +#define GZIP_FCOMMENT 0x10 +#define GZIP_FRESERVED 0xE0 + +#define GZIP_MTIME_UNAVAILABLE 0 + +#define GZIP_XFL_SLOWEST_COMPRESSION 0x02 +#define GZIP_XFL_FASTEST_COMPRESSION 0x04 + +#define GZIP_OS_FAT 0 +#define GZIP_OS_AMIGA 1 +#define GZIP_OS_VMS 2 +#define GZIP_OS_UNIX 3 +#define GZIP_OS_VM_CMS 4 +#define GZIP_OS_ATARI_TOS 5 +#define GZIP_OS_HPFS 6 +#define GZIP_OS_MACINTOSH 7 +#define GZIP_OS_Z_SYSTEM 8 +#define GZIP_OS_CP_M 9 +#define GZIP_OS_TOPS_20 10 +#define GZIP_OS_NTFS 11 +#define GZIP_OS_QDOS 12 +#define GZIP_OS_RISCOS 13 +#define GZIP_OS_UNKNOWN 255 #endif /* LIB_GZIP_CONSTANTS_H */ diff --git a/Sources/DEFLATE/gzip_decompress.c b/Sources/DEFLATE/gzip_decompress.c index 76b74f69..53aa3979 100644 --- a/Sources/DEFLATE/gzip_decompress.c +++ b/Sources/DEFLATE/gzip_decompress.c @@ -30,115 +30,115 @@ LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress_ex(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) { - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - u8 flg; - size_t actual_in_nbytes; - size_t actual_out_nbytes; - enum libdeflate_result result; - - if (in_nbytes < GZIP_MIN_OVERHEAD) - return LIBDEFLATE_BAD_DATA; - - /* ID1 */ - if (*in_next++ != GZIP_ID1) - return LIBDEFLATE_BAD_DATA; - /* ID2 */ - if (*in_next++ != GZIP_ID2) - return LIBDEFLATE_BAD_DATA; - /* CM */ - if (*in_next++ != GZIP_CM_DEFLATE) - return LIBDEFLATE_BAD_DATA; - flg = *in_next++; - /* MTIME */ - in_next += 4; - /* XFL */ - in_next += 1; - /* OS */ - in_next += 1; - - if (flg & GZIP_FRESERVED) - return LIBDEFLATE_BAD_DATA; - - /* Extra field */ - if (flg & GZIP_FEXTRA) { - u16 xlen = get_unaligned_le16(in_next); - in_next += 2; - - if (in_end - 
in_next < (u32)xlen + GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - - in_next += xlen; - } - - /* Original file name (zero terminated) */ - if (flg & GZIP_FNAME) { - while (*in_next++ != 0 && in_next != in_end) - ; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* File comment (zero terminated) */ - if (flg & GZIP_FCOMMENT) { - while (*in_next++ != 0 && in_next != in_end) - ; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* CRC16 for gzip header */ - if (flg & GZIP_FHCRC) { - in_next += 2; - if (in_end - in_next < GZIP_FOOTER_SIZE) - return LIBDEFLATE_BAD_DATA; - } - - /* Compressed data */ - result = libdeflate_deflate_decompress_ex(d, in_next, - in_end - GZIP_FOOTER_SIZE - in_next, - out, out_nbytes_avail, - &actual_in_nbytes, - actual_out_nbytes_ret); - if (result != LIBDEFLATE_SUCCESS) - return result; - - if (actual_out_nbytes_ret) - actual_out_nbytes = *actual_out_nbytes_ret; - else - actual_out_nbytes = out_nbytes_avail; - - in_next += actual_in_nbytes; - - /* CRC32 */ - if (libdeflate_crc32(0, out, actual_out_nbytes) != - get_unaligned_le32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - /* ISIZE */ - if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - if (actual_in_nbytes_ret) - *actual_in_nbytes_ret = in_next - (u8 *)in; - - return LIBDEFLATE_SUCCESS; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u8 flg; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < GZIP_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* ID1 */ + if (*in_next++ != GZIP_ID1) + return LIBDEFLATE_BAD_DATA; + /* ID2 */ + if (*in_next++ != GZIP_ID2) + return LIBDEFLATE_BAD_DATA; + /* CM */ + if (*in_next++ != GZIP_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + flg = *in_next++; + /* MTIME */ + in_next += 4; + /* XFL */ + in_next += 1; + /* OS */ + in_next += 1; + + if (flg & GZIP_FRESERVED) + return LIBDEFLATE_BAD_DATA; + + /* Extra field */ + if (flg & GZIP_FEXTRA) { + u16 xlen = get_unaligned_le16(in_next); + in_next += 2; + + if (in_end - in_next < (u32)xlen + GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + + in_next += xlen; + } + + /* Original file name (zero terminated) */ + if (flg & GZIP_FNAME) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* File comment (zero terminated) */ + if (flg & GZIP_FCOMMENT) { + while (*in_next++ != 0 && in_next != in_end) + ; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* CRC16 for gzip header */ + if (flg & GZIP_FHCRC) { + in_next += 2; + if (in_end - in_next < GZIP_FOOTER_SIZE) + return LIBDEFLATE_BAD_DATA; + } + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - GZIP_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, + actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* CRC32 */ + if (libdeflate_crc32(0, out, actual_out_nbytes) != + get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + /* ISIZE */ + if ((u32)actual_out_nbytes != get_unaligned_le32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = 
in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; } LIBDEFLATEAPI enum libdeflate_result libdeflate_gzip_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) { - return libdeflate_gzip_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); + return libdeflate_gzip_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); } diff --git a/Sources/DEFLATE/lib_common.h b/Sources/DEFLATE/lib_common.h index 8c9ff5fe..68bf734b 100644 --- a/Sources/DEFLATE/lib_common.h +++ b/Sources/DEFLATE/lib_common.h @@ -6,10 +6,10 @@ #define LIB_LIB_COMMON_H #ifdef LIBDEFLATE_H - /* - * When building the library, LIBDEFLATEAPI needs to be defined properly before - * including libdeflate.h. - */ +/* + * When building the library, LIBDEFLATEAPI needs to be defined properly before + * including libdeflate.h. + */ # error "lib_common.h must always be included before libdeflate.h" #endif @@ -35,7 +35,7 @@ # define LIBDEFLATE_ALIGN_STACK #endif -#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK +#define LIBDEFLATEAPI LIBDEFLATE_EXPORT_SYM LIBDEFLATE_ALIGN_STACK #include "../common_defs.h" @@ -46,7 +46,7 @@ extern malloc_func_t libdeflate_default_malloc_func; extern free_func_t libdeflate_default_free_func; void *libdeflate_aligned_malloc(malloc_func_t malloc_func, - size_t alignment, size_t size); + size_t alignment, size_t size); void libdeflate_aligned_free(free_func_t free_func, void *ptr); #ifdef FREESTANDING @@ -63,16 +63,16 @@ void libdeflate_aligned_free(free_func_t free_func, void *ptr); * We still need the actual function definitions in case gcc calls them. 
*/ void *memset(void *s, int c, size_t n); -#define memset(s, c, n) __builtin_memset((s), (c), (n)) +#define memset(s, c, n) __builtin_memset((s), (c), (n)) void *memcpy(void *dest, const void *src, size_t n); -#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) +#define memcpy(dest, src, n) __builtin_memcpy((dest), (src), (n)) void *memmove(void *dest, const void *src, size_t n); -#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) +#define memmove(dest, src, n) __builtin_memmove((dest), (src), (n)) int memcmp(const void *s1, const void *s2, size_t n); -#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) +#define memcmp(s1, s2, n) __builtin_memcmp((s1), (s2), (n)) #undef LIBDEFLATE_ENABLE_ASSERTIONS #else @@ -86,13 +86,13 @@ int memcmp(const void *s1, const void *s2, size_t n); #ifdef LIBDEFLATE_ENABLE_ASSERTIONS void libdeflate_assertion_failed(const char *expr, const char *file, int line); #define ASSERT(expr) { if (unlikely(!(expr))) \ - libdeflate_assertion_failed(#expr, __FILE__, __LINE__); } +libdeflate_assertion_failed(#expr, __FILE__, __LINE__); } #else #define ASSERT(expr) (void)(expr) #endif -#define CONCAT_IMPL(a, b) a##b -#define CONCAT(a, b) CONCAT_IMPL(a, b) -#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) +#define CONCAT_IMPL(a, b) a##b +#define CONCAT(a, b) CONCAT_IMPL(a, b) +#define ADD_SUFFIX(name) CONCAT(name, SUFFIX) #endif /* LIB_LIB_COMMON_H */ diff --git a/Sources/DEFLATE/matchfinder_common.h b/Sources/DEFLATE/matchfinder_common.h index a47d1070..8094c2b2 100644 --- a/Sources/DEFLATE/matchfinder_common.h +++ b/Sources/DEFLATE/matchfinder_common.h @@ -20,10 +20,10 @@ static forceinline u32 loaded_u32_to_u24(u32 v) { - if (CPU_IS_LITTLE_ENDIAN()) - return v & 0xFFFFFF; - else - return v >> 8; + if (CPU_IS_LITTLE_ENDIAN()) + return v & 0xFFFFFF; + else + return v >> 8; } /* @@ -35,12 +35,12 @@ static forceinline u32 load_u24_unaligned(const u8 *p) { #if UNALIGNED_ACCESS_IS_FAST - return loaded_u32_to_u24(load_u32_unaligned(p)); + return loaded_u32_to_u24(load_u32_unaligned(p)); #else - if (CPU_IS_LITTLE_ENDIAN()) - return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); - else - return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); + if (CPU_IS_LITTLE_ENDIAN()) + return ((u32)p[0] << 0) | ((u32)p[1] << 8) | ((u32)p[2] << 16); + else + return ((u32)p[2] << 0) | ((u32)p[1] << 8) | ((u32)p[0] << 16); #endif } @@ -51,18 +51,43 @@ typedef s16 mf_pos_t; #define MATCHFINDER_INITVAL ((mf_pos_t)-MATCHFINDER_WINDOW_SIZE) /* - * Required alignment of the matchfinder buffer pointer and size. The values - * here come from the AVX-2 implementation, which is the worst case. + * This is the memory address alignment, in bytes, required for the matchfinder + * buffers by the architecture-specific implementations of matchfinder_init() + * and matchfinder_rebase(). "Matchfinder buffer" means an entire struct + * hc_matchfinder, bt_matchfinder, or ht_matchfinder; the next_tab field of + * struct hc_matchfinder; or the child_tab field of struct bt_matchfinder. + * + * This affects how the entire 'struct deflate_compressor' is allocated, since + * the matchfinder structures are embedded inside it. + * + * Currently the maximum memory address alignment required is 32 bytes, needed + * by the AVX-2 matchfinder functions. 
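+ *
+ * In practice this is satisfied by allocating the compressor through
+ * libdeflate_aligned_malloc() (see utils.c in this patch), which
+ * over-allocates by 'alignment - 1' bytes plus one pointer, rounds the
+ * address up to the requested boundary, and stores the original pointer
+ * just below the aligned block so libdeflate_aligned_free() can recover it.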
+ */ +#define MATCHFINDER_MEM_ALIGNMENT 32 + +/* + * This declares a size, in bytes, that is guaranteed to divide the sizes of the + * matchfinder buffers (where "matchfinder buffers" is as defined for + * MATCHFINDER_MEM_ALIGNMENT). The architecture-specific implementations of + * matchfinder_init() and matchfinder_rebase() take advantage of this value. + * + * Currently the maximum size alignment required is 128 bytes, needed by + * the AVX-2 matchfinder functions. However, the RISC-V Vector Extension + * matchfinder functions can, in principle, take advantage of a larger size + * alignment. Therefore, we set this to 1024, which still easily divides the + * actual sizes that result from the current matchfinder struct definitions. + * This value can safely be changed to any power of two that is >= 128. */ -#define MATCHFINDER_MEM_ALIGNMENT 32 -#define MATCHFINDER_SIZE_ALIGNMENT 128 +#define MATCHFINDER_SIZE_ALIGNMENT 1024 #undef matchfinder_init #undef matchfinder_rebase #ifdef _aligned_attribute # define MATCHFINDER_ALIGNED _aligned_attribute(MATCHFINDER_MEM_ALIGNMENT) # if defined(ARCH_ARM32) || defined(ARCH_ARM64) -# include "matchfinder_impl.h" +# include "arm/matchfinder_impl.h" +# elif defined(ARCH_RISCV) +# include "riscv/matchfinder_impl.h" # elif defined(ARCH_X86_32) || defined(ARCH_X86_64) # include "x86/matchfinder_impl.h" # endif @@ -82,11 +107,11 @@ typedef s16 mf_pos_t; static forceinline void matchfinder_init(mf_pos_t *data, size_t size) { - size_t num_entries = size / sizeof(*data); - size_t i; - - for (i = 0; i < num_entries; i++) - data[i] = MATCHFINDER_INITVAL; + size_t num_entries = size / sizeof(*data); + size_t i; + + for (i = 0; i < num_entries; i++) + data[i] = MATCHFINDER_INITVAL; } #endif @@ -111,25 +136,25 @@ matchfinder_init(mf_pos_t *data, size_t size) static forceinline void matchfinder_rebase(mf_pos_t *data, size_t size) { - size_t num_entries = size / sizeof(*data); - size_t i; - - if (MATCHFINDER_WINDOW_SIZE == 32768) { - /* - * Branchless version for 32768-byte windows. Clear all bits if - * the value was already negative, then set the sign bit. This - * is equivalent to subtracting 32768 with signed saturation. - */ - for (i = 0; i < num_entries; i++) - data[i] = 0x8000 | (data[i] & ~(data[i] >> 15)); - } else { - for (i = 0; i < num_entries; i++) { - if (data[i] >= 0) - data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - else - data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; - } - } + size_t num_entries = size / sizeof(*data); + size_t i; + + if (MATCHFINDER_WINDOW_SIZE == 32768) { + /* + * Branchless version for 32768-byte windows. Clear all bits if + * the value was already negative, then set the sign bit. This + * is equivalent to subtracting 32768 with signed saturation. 
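+ *
+ * Worked example (assuming the usual arithmetic right shift of signed
+ * values): an entry of 100 (0x0064) has 100 >> 15 == 0, so the mask keeps
+ * every bit and OR-ing in 0x8000 gives 0x8064 == -32668 == 100 - 32768.
+ * An already-negative entry such as -5 (0xFFFB) has -5 >> 15 == -1, so the
+ * mask clears all bits and the result is 0x8000 == -32768, i.e. saturated.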
+ */ + for (i = 0; i < num_entries; i++) + data[i] = 0x8000 | (data[i] & ~(data[i] >> 15)); + } else { + for (i = 0; i < num_entries; i++) { + if (data[i] >= 0) + data[i] -= (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + else + data[i] = (mf_pos_t)-MATCHFINDER_WINDOW_SIZE; + } + } } #endif @@ -143,7 +168,7 @@ matchfinder_rebase(mf_pos_t *data, size_t size) static forceinline u32 lz_hash(u32 seq, unsigned num_bits) { - return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); + return (u32)(seq * 0x1E35A7BD) >> (32 - num_bits); } /* @@ -152,48 +177,48 @@ lz_hash(u32 seq, unsigned num_bits) */ static forceinline unsigned lz_extend(const u8 * const strptr, const u8 * const matchptr, - const unsigned start_len, const unsigned max_len) + const unsigned start_len, const unsigned max_len) { - unsigned len = start_len; - machine_word_t v_word; - - if (UNALIGNED_ACCESS_IS_FAST) { - - if (likely(max_len - len >= 4 * WORDBYTES)) { - - #define COMPARE_WORD_STEP \ - v_word = load_word_unaligned(&matchptr[len]) ^ \ - load_word_unaligned(&strptr[len]); \ - if (v_word != 0) \ - goto word_differs; \ - len += WORDBYTES; \ - - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - COMPARE_WORD_STEP - #undef COMPARE_WORD_STEP - } - - while (len + WORDBYTES <= max_len) { - v_word = load_word_unaligned(&matchptr[len]) ^ - load_word_unaligned(&strptr[len]); - if (v_word != 0) - goto word_differs; - len += WORDBYTES; - } - } - - while (len < max_len && matchptr[len] == strptr[len]) - len++; - return len; - + unsigned len = start_len; + machine_word_t v_word; + + if (UNALIGNED_ACCESS_IS_FAST) { + + if (likely(max_len - len >= 4 * WORDBYTES)) { + +#define COMPARE_WORD_STEP \ +v_word = load_word_unaligned(&matchptr[len]) ^ \ +load_word_unaligned(&strptr[len]); \ +if (v_word != 0) \ +goto word_differs; \ +len += WORDBYTES; \ + + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP + COMPARE_WORD_STEP +#undef COMPARE_WORD_STEP + } + + while (len + WORDBYTES <= max_len) { + v_word = load_word_unaligned(&matchptr[len]) ^ + load_word_unaligned(&strptr[len]); + if (v_word != 0) + goto word_differs; + len += WORDBYTES; + } + } + + while (len < max_len && matchptr[len] == strptr[len]) + len++; + return len; + word_differs: - if (CPU_IS_LITTLE_ENDIAN()) - len += (bsfw(v_word) >> 3); - else - len += (WORDBITS - 1 - bsrw(v_word)) >> 3; - return len; + if (CPU_IS_LITTLE_ENDIAN()) + len += (bsfw(v_word) >> 3); + else + len += (WORDBITS - 1 - bsrw(v_word)) >> 3; + return len; } #endif /* LIB_MATCHFINDER_COMMON_H */ diff --git a/Sources/DEFLATE/riscv/matchfinder_impl.h b/Sources/DEFLATE/riscv/matchfinder_impl.h new file mode 100644 index 00000000..76081fa7 --- /dev/null +++ b/Sources/DEFLATE/riscv/matchfinder_impl.h @@ -0,0 +1,97 @@ +/* + * riscv/matchfinder_impl.h - RISC-V implementations of matchfinder functions + * + * Copyright 2024 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_RISCV_MATCHFINDER_IMPL_H +#define LIB_RISCV_MATCHFINDER_IMPL_H + +#if defined(ARCH_RISCV) && defined(__riscv_vector) +#include + +/* + * Return the maximum number of 16-bit (mf_pos_t) elements that fit in 8 RISC-V + * vector registers and also evenly divide the sizes of the matchfinder buffers. + */ +static forceinline size_t +riscv_matchfinder_vl(void) +{ + const size_t vl = __riscv_vsetvlmax_e16m8(); + + STATIC_ASSERT(sizeof(mf_pos_t) == sizeof(s16)); + /* + * MATCHFINDER_SIZE_ALIGNMENT is a power of 2, as is 'vl' because the + * RISC-V Vector Extension requires that the vector register length + * (VLEN) be a power of 2. Thus, a simple MIN() gives the correct + * answer here; rounding to a power of 2 is not required. + */ + STATIC_ASSERT((MATCHFINDER_SIZE_ALIGNMENT & + (MATCHFINDER_SIZE_ALIGNMENT - 1)) == 0); + ASSERT((vl & (vl - 1)) == 0); + return MIN(vl, MATCHFINDER_SIZE_ALIGNMENT / sizeof(mf_pos_t)); +} + +/* matchfinder_init() optimized using the RISC-V Vector Extension */ +static forceinline void +matchfinder_init_rvv(mf_pos_t *p, size_t size) +{ + const size_t vl = riscv_matchfinder_vl(); + const vint16m8_t v = __riscv_vmv_v_x_i16m8(MATCHFINDER_INITVAL, vl); + + ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); + do { + __riscv_vse16_v_i16m8(p, v, vl); + p += vl; + size -= vl * sizeof(p[0]); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_rvv + +/* matchfinder_rebase() optimized using the RISC-V Vector Extension */ +static forceinline void +matchfinder_rebase_rvv(mf_pos_t *p, size_t size) +{ + const size_t vl = riscv_matchfinder_vl(); + + ASSERT(size > 0 && size % (vl * sizeof(p[0])) == 0); + do { + vint16m8_t v = __riscv_vle16_v_i16m8(p, vl); + + /* + * This should generate the vsadd.vx instruction + * (Vector Saturating Add, integer vector-scalar) + */ + v = __riscv_vsadd_vx_i16m8(v, (s16)-MATCHFINDER_WINDOW_SIZE, + vl); + __riscv_vse16_v_i16m8(p, v, vl); + p += vl; + size -= vl * sizeof(p[0]); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_rvv + +#endif /* ARCH_RISCV && __riscv_vector */ + +#endif /* LIB_RISCV_MATCHFINDER_IMPL_H */ \ No newline at end of file diff --git a/Sources/DEFLATE/utils.c b/Sources/DEFLATE/utils.c index c1e4cc26..3eea24c8 100644 --- a/Sources/DEFLATE/utils.c +++ b/Sources/DEFLATE/utils.c @@ -39,31 +39,31 @@ free_func_t libdeflate_default_free_func = free; void * libdeflate_aligned_malloc(malloc_func_t malloc_func, - size_t alignment, size_t size) + size_t alignment, size_t size) { - void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); - - if (ptr) { - void *orig_ptr = ptr; - - ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); - ((void **)ptr)[-1] = orig_ptr; - } - return ptr; + void *ptr = (*malloc_func)(sizeof(void *) + alignment - 1 + size); + + if (ptr) { + void *orig_ptr = ptr; + + ptr = (void *)ALIGN((uintptr_t)ptr + sizeof(void *), alignment); + ((void **)ptr)[-1] = orig_ptr; + } + return ptr; } void libdeflate_aligned_free(free_func_t free_func, void *ptr) { - 
(*free_func)(((void **)ptr)[-1]); + (*free_func)(((void **)ptr)[-1]); } LIBDEFLATEAPI void libdeflate_set_memory_allocator(malloc_func_t malloc_func, - free_func_t free_func) + free_func_t free_func) { - libdeflate_default_malloc_func = malloc_func; - libdeflate_default_free_func = free_func; + libdeflate_default_malloc_func = malloc_func; + libdeflate_default_free_func = free_func; } /* @@ -76,56 +76,56 @@ libdeflate_set_memory_allocator(malloc_func_t malloc_func, void * __attribute__((weak)) memset(void *s, int c, size_t n) { - u8 *p = s; - size_t i; - - for (i = 0; i < n; i++) - p[i] = c; - return s; + u8 *p = s; + size_t i; + + for (i = 0; i < n; i++) + p[i] = c; + return s; } #undef memcpy void * __attribute__((weak)) memcpy(void *dest, const void *src, size_t n) { - u8 *d = dest; - const u8 *s = src; - size_t i; - - for (i = 0; i < n; i++) - d[i] = s[i]; - return dest; + u8 *d = dest; + const u8 *s = src; + size_t i; + + for (i = 0; i < n; i++) + d[i] = s[i]; + return dest; } #undef memmove void * __attribute__((weak)) memmove(void *dest, const void *src, size_t n) { - u8 *d = dest; - const u8 *s = src; - size_t i; - - if (d <= s) - return memcpy(d, s, n); - - for (i = n; i > 0; i--) - d[i - 1] = s[i - 1]; - return dest; + u8 *d = dest; + const u8 *s = src; + size_t i; + + if (d <= s) + return memcpy(d, s, n); + + for (i = n; i > 0; i--) + d[i - 1] = s[i - 1]; + return dest; } #undef memcmp int __attribute__((weak)) memcmp(const void *s1, const void *s2, size_t n) { - const u8 *p1 = s1; - const u8 *p2 = s2; - size_t i; - - for (i = 0; i < n; i++) { - if (p1[i] != p2[i]) - return (int)p1[i] - (int)p2[i]; - } - return 0; + const u8 *p1 = s1; + const u8 *p2 = s2; + size_t i; + + for (i = 0; i < n; i++) { + if (p1[i] != p2[i]) + return (int)p1[i] - (int)p2[i]; + } + return 0; } #endif /* FREESTANDING */ @@ -135,7 +135,7 @@ memcmp(const void *s1, const void *s2, size_t n) void libdeflate_assertion_failed(const char *expr, const char *file, int line) { - fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line); - abort(); + fprintf(stderr, "Assertion failed: %s at %s:%d\n", expr, file, line); + abort(); } #endif /* LIBDEFLATE_ENABLE_ASSERTIONS */ diff --git a/Sources/DEFLATE/x86/adler32_impl.h b/Sources/DEFLATE/x86/adler32_impl.h new file mode 100644 index 00000000..0aacdda3 --- /dev/null +++ b/Sources/DEFLATE/x86/adler32_impl.h @@ -0,0 +1,127 @@ +/* + * x86/adler32_impl.h - x86 implementations of Adler-32 checksum algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_ADLER32_IMPL_H +#define LIB_X86_ADLER32_IMPL_H + +#include "x86/cpu_features.h" + +/* SSE2 and AVX2 implementations. Used on older CPUs. */ +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +# define adler32_x86_sse2 adler32_x86_sse2 +# define SUFFIX _x86_sse2 +# define ATTRIBUTES _target_attribute("sse2") +# define VL 16 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" + +# define adler32_x86_avx2 adler32_x86_avx2 +# define SUFFIX _x86_avx2 +# define ATTRIBUTES _target_attribute("avx2") +# define VL 32 +# define USE_VNNI 0 +# define USE_MASKING 0 +# include "adler32_template.h" +#endif + +/* + * AVX-VNNI implementation. This is used on CPUs that have AVX2 and AVX-VNNI + * but don't have AVX-512, for example Intel Alder Lake. + */ +#if GCC_PREREQ(11, 1) || CLANG_PREREQ(12, 0, 13000000) || MSVC_PREREQ(1930) +# define adler32_x86_avx2_vnni adler32_x86_avx2_vnni +# define SUFFIX _x86_avx2_vnni +# define ATTRIBUTES _target_attribute("avx2,avxvnni") +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 0 +# include "adler32_template.h" +#endif + +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +/* + * AVX512VNNI implementation using 256-bit vectors. This is very similar to the + * AVX-VNNI implementation but takes advantage of masking and more registers. + * This is used on CPUs that support AVX-512 but where using 512-bit vectors + * causes downclocking. This should also be the optimal implementation on CPUs + * that support AVX10/256 but not AVX10/512. + */ +# define adler32_x86_avx512_vl256_vnni adler32_x86_avx512_vl256_vnni +# define SUFFIX _x86_avx512_vl256_vnni +# define ATTRIBUTES _target_attribute("avx512bw,avx512vl,avx512vnni") +# define VL 32 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" + +/* + * AVX512VNNI implementation using 512-bit vectors. This is used on CPUs that + * have a good AVX-512 implementation including AVX512VNNI. This should also be + * the optimal implementation on CPUs that support AVX10/512. 
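+ *
+ * None of these variants is called by name; arch_select_adler32_func()
+ * below checks the CPU feature bits at runtime and returns the most capable
+ * variant that was compiled in (the 512-bit AVX512VNNI one first, then the
+ * 256-bit AVX512VNNI, AVX-VNNI, AVX2 and SSE2 ones), or NULL if none apply.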
+ */ +# define adler32_x86_avx512_vl512_vnni adler32_x86_avx512_vl512_vnni +# define SUFFIX _x86_avx512_vl512_vnni +# define ATTRIBUTES _target_attribute("avx512bw,avx512vnni") +# define VL 64 +# define USE_VNNI 1 +# define USE_MASKING 1 +# include "adler32_template.h" +#endif + +static inline adler32_func_t +arch_select_adler32_func(void) +{ + const u32 features MAYBE_UNUSED = get_x86_cpu_features(); + +#ifdef adler32_x86_avx512_vl512_vnni + if ((features & X86_CPU_FEATURE_ZMM) && + HAVE_AVX512BW(features) && HAVE_AVX512VNNI(features)) + return adler32_x86_avx512_vl512_vnni; +#endif +#ifdef adler32_x86_avx512_vl256_vnni + if (HAVE_AVX512BW(features) && HAVE_AVX512VL(features) && + HAVE_AVX512VNNI(features)) + return adler32_x86_avx512_vl256_vnni; +#endif +#ifdef adler32_x86_avx2_vnni + if (HAVE_AVX2(features) && HAVE_AVXVNNI(features)) + return adler32_x86_avx2_vnni; +#endif +#ifdef adler32_x86_avx2 + if (HAVE_AVX2(features)) + return adler32_x86_avx2; +#endif +#ifdef adler32_x86_sse2 + if (HAVE_SSE2(features)) + return adler32_x86_sse2; +#endif + return NULL; +} +#define arch_select_adler32_func arch_select_adler32_func + +#endif /* LIB_X86_ADLER32_IMPL_H */ diff --git a/Sources/DEFLATE/x86/adler32_template.h b/Sources/DEFLATE/x86/adler32_template.h new file mode 100644 index 00000000..77087741 --- /dev/null +++ b/Sources/DEFLATE/x86/adler32_template.h @@ -0,0 +1,512 @@ +/* + * x86/adler32_template.h - template for vectorized Adler-32 implementations + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating Adler-32 functions for x86. + * The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. Must satisfy the dependencies of the + * other parameters as follows: + * VL=16 && USE_VNNI=0 && USE_MASKING=0: at least sse2 + * VL=32 && USE_VNNI=0 && USE_MASKING=0: at least avx2 + * VL=32 && USE_VNNI=1 && USE_MASKING=0: at least avx2,avxvnni + * VL=32 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vl,avx512vnni + * VL=64 && USE_VNNI=1 && USE_MASKING=1: at least avx512bw,avx512vnni + * (Other combinations are not useful and have not been tested.) + * VL: + * Vector length in bytes. Must be 16, 32, and 64. + * USE_VNNI: + * If 1, use the VNNI dot product based algorithm. + * If 0, use the legacy SSE2 and AVX2 compatible algorithm. 
+ * USE_MASKING: + * If 1, use AVX-512 features such as masking. + * If 0, assume that the CPU might not support AVX-512. + */ + +#if VL == 16 +# define vec_t __m128i +# define mask_t u16 +# define LOG2_VL 4 +# define VADD8(a, b) _mm_add_epi8((a), (b)) +# define VADD16(a, b) _mm_add_epi16((a), (b)) +# define VADD32(a, b) _mm_add_epi32((a), (b)) +# if USE_MASKING +# define VDPBUSD(a, b, c) _mm_dpbusd_epi32((a), (b), (c)) +# else +# define VDPBUSD(a, b, c) _mm_dpbusd_avx_epi32((a), (b), (c)) +# endif +# define VLOAD(p) _mm_load_si128((const void *)(p)) +# define VLOADU(p) _mm_loadu_si128((const void *)(p)) +# define VMADD16(a, b) _mm_madd_epi16((a), (b)) +# define VMASKZ_LOADU(mask, p) _mm_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm_mullo_epi32((a), (b)) +# define VSAD8(a, b) _mm_sad_epu8((a), (b)) +# define VSET1_32(a) _mm_set1_epi32(a) +# define VSET1_8(a) _mm_set1_epi8(a) +# define VSETZERO() _mm_setzero_si128() +# define VSLL32(a, b) _mm_slli_epi32((a), (b)) +# define VUNPACKHI8(a, b) _mm_unpackhi_epi8((a), (b)) +# define VUNPACKLO8(a, b) _mm_unpacklo_epi8((a), (b)) +#elif VL == 32 +# define vec_t __m256i +# define mask_t u32 +# define LOG2_VL 5 +# define VADD8(a, b) _mm256_add_epi8((a), (b)) +# define VADD16(a, b) _mm256_add_epi16((a), (b)) +# define VADD32(a, b) _mm256_add_epi32((a), (b)) +# if USE_MASKING +# define VDPBUSD(a, b, c) _mm256_dpbusd_epi32((a), (b), (c)) +# else +# define VDPBUSD(a, b, c) _mm256_dpbusd_avx_epi32((a), (b), (c)) +# endif +# define VLOAD(p) _mm256_load_si256((const void *)(p)) +# define VLOADU(p) _mm256_loadu_si256((const void *)(p)) +# define VMADD16(a, b) _mm256_madd_epi16((a), (b)) +# define VMASKZ_LOADU(mask, p) _mm256_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm256_mullo_epi32((a), (b)) +# define VSAD8(a, b) _mm256_sad_epu8((a), (b)) +# define VSET1_32(a) _mm256_set1_epi32(a) +# define VSET1_8(a) _mm256_set1_epi8(a) +# define VSETZERO() _mm256_setzero_si256() +# define VSLL32(a, b) _mm256_slli_epi32((a), (b)) +# define VUNPACKHI8(a, b) _mm256_unpackhi_epi8((a), (b)) +# define VUNPACKLO8(a, b) _mm256_unpacklo_epi8((a), (b)) +#elif VL == 64 +# define vec_t __m512i +# define mask_t u64 +# define LOG2_VL 6 +# define VADD8(a, b) _mm512_add_epi8((a), (b)) +# define VADD32(a, b) _mm512_add_epi32((a), (b)) +# define VDPBUSD(a, b, c) _mm512_dpbusd_epi32((a), (b), (c)) +# define VLOAD(p) _mm512_load_si512((const void *)(p)) +# define VLOADU(p) _mm512_loadu_si512((const void *)(p)) +# define VMASKZ_LOADU(mask, p) _mm512_maskz_loadu_epi8((mask), (p)) +# define VMULLO32(a, b) _mm512_mullo_epi32((a), (b)) +# define VSET1_32(a) _mm512_set1_epi32(a) +# define VSET1_8(a) _mm512_set1_epi8(a) +# define VSETZERO() _mm512_setzero_si512() +# define VSLL32(a, b) _mm512_slli_epi32((a), (b)) +#else +# error "unsupported vector length" +#endif + +#define VADD32_3X(a, b, c) VADD32(VADD32((a), (b)), (c)) +#define VADD32_4X(a, b, c, d) VADD32(VADD32((a), (b)), VADD32((c), (d))) +#define VADD32_5X(a, b, c, d, e) VADD32((a), VADD32_4X((b), (c), (d), (e))) +#define VADD32_7X(a, b, c, d, e, f, g) \ +VADD32(VADD32_3X((a), (b), (c)), VADD32_4X((d), (e), (f), (g))) + +/* Sum the 32-bit elements of v_s1 and add them to s1, and likewise for s2. 
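+ *
+ * The reduction proceeds by halving: a 512-bit vector is split into two
+ * 256-bit halves and added, 256 bits into two 128-bit halves, and the
+ * 128-bit remainder is folded lane-wise with _mm_shuffle_epi32() plus
+ * _mm_add_epi32() until the total sits in lane 0, where
+ * _mm_cvtsi128_si32() reads it out.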
*/ +#undef reduce_to_32bits +static forceinline ATTRIBUTES void +ADD_SUFFIX(reduce_to_32bits)(vec_t v_s1, vec_t v_s2, u32 *s1_p, u32 *s2_p) +{ + __m128i v_s1_128, v_s2_128; +#if VL == 16 + { + v_s1_128 = v_s1; + v_s2_128 = v_s2; + } +#else + { + __m256i v_s1_256, v_s2_256; +#if VL == 32 + v_s1_256 = v_s1; + v_s2_256 = v_s2; +#else + /* Reduce 512 bits to 256 bits. */ + v_s1_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s1, 0), + _mm512_extracti64x4_epi64(v_s1, 1)); + v_s2_256 = _mm256_add_epi32(_mm512_extracti64x4_epi64(v_s2, 0), + _mm512_extracti64x4_epi64(v_s2, 1)); +#endif + /* Reduce 256 bits to 128 bits. */ + v_s1_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s1_256, 0), + _mm256_extracti128_si256(v_s1_256, 1)); + v_s2_128 = _mm_add_epi32(_mm256_extracti128_si256(v_s2_256, 0), + _mm256_extracti128_si256(v_s2_256, 1)); + } +#endif + + /* + * Reduce 128 bits to 32 bits. + * + * If the bytes were summed into v_s1 using psadbw + paddd, then ignore + * the odd-indexed elements of v_s1_128 since they are zero. + */ +#if USE_VNNI + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x31)); +#endif + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x31)); + v_s1_128 = _mm_add_epi32(v_s1_128, _mm_shuffle_epi32(v_s1_128, 0x02)); + v_s2_128 = _mm_add_epi32(v_s2_128, _mm_shuffle_epi32(v_s2_128, 0x02)); + + *s1_p += (u32)_mm_cvtsi128_si32(v_s1_128); + *s2_p += (u32)_mm_cvtsi128_si32(v_s2_128); +} +#define reduce_to_32bits ADD_SUFFIX(reduce_to_32bits) + +static u32 ATTRIBUTES +ADD_SUFFIX(adler32)(u32 adler, const u8 *p, size_t len) +{ +#if USE_VNNI + /* This contains the bytes [VL, VL-1, VL-2, ..., 1]. */ + static const u8 _aligned_attribute(VL) raw_mults[VL] = { +#if VL == 64 + 64, 63, 62, 61, 60, 59, 58, 57, 56, 55, 54, 53, 52, 51, 50, 49, + 48, 47, 46, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, +#endif +#if VL >= 32 + 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, +#endif + 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, + }; + const vec_t ones = VSET1_8(1); +#else + /* + * This contains the 16-bit values [2*VL, 2*VL - 1, 2*VL - 2, ..., 1]. + * For VL==32 the ordering is weird because it has to match the way that + * vpunpcklbw and vpunpckhbw work on 128-bit lanes separately. + */ + static const u16 _aligned_attribute(VL) raw_mults[4][VL / 2] = { +#if VL == 16 + { 32, 31, 30, 29, 28, 27, 26, 25 }, + { 24, 23, 22, 21, 20, 19, 18, 17 }, + { 16, 15, 14, 13, 12, 11, 10, 9 }, + { 8, 7, 6, 5, 4, 3, 2, 1 }, +#elif VL == 32 + { 64, 63, 62, 61, 60, 59, 58, 57, 48, 47, 46, 45, 44, 43, 42, 41 }, + { 56, 55, 54, 53, 52, 51, 50, 49, 40, 39, 38, 37, 36, 35, 34, 33 }, + { 32, 31, 30, 29, 28, 27, 26, 25, 16, 15, 14, 13, 12, 11, 10, 9 }, + { 24, 23, 22, 21, 20, 19, 18, 17, 8, 7, 6, 5, 4, 3, 2, 1 }, +#else +# error "unsupported parameters" +#endif + }; + const vec_t mults_a = VLOAD(raw_mults[0]); + const vec_t mults_b = VLOAD(raw_mults[1]); + const vec_t mults_c = VLOAD(raw_mults[2]); + const vec_t mults_d = VLOAD(raw_mults[3]); +#endif + const vec_t zeroes = VSETZERO(); + u32 s1 = adler & 0xFFFF; + u32 s2 = adler >> 16; + + /* + * If the length is large and the pointer is misaligned, align it. + * For smaller lengths, just take the misaligned load penalty. + */ + if (unlikely(len > 65536 && ((uintptr_t)p & (VL-1)))) { + do { + s1 += *p++; + s2 += s1; + len--; + } while ((uintptr_t)p & (VL-1)); + s1 %= DIVISOR; + s2 %= DIVISOR; + } + +#if USE_VNNI + /* + * This is Adler-32 using the vpdpbusd instruction from AVX512-VNNI or + * AVX-VNNI. 
vpdpbusd multiplies the unsigned bytes of one vector by + * the signed bytes of another vector and adds the sums in groups of 4 + * to the 32-bit elements of a third vector. We use it in two ways: + * multiplying the data bytes by a sequence like 64,63,62,...,1 for + * calculating part of s2, and multiplying the data bytes by an all-ones + * sequence 1,1,1,...,1 for calculating s1 and part of s2. The all-ones + * trick seems to be faster than the alternative of vpsadbw + vpaddd. + */ + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX. + */ + size_t n = MIN(len, MAX_CHUNK_LEN & ~(4*VL - 1)); + vec_t mults = VLOAD(raw_mults); + vec_t v_s1 = zeroes; + vec_t v_s2 = zeroes; + + s2 += s1 * n; + len -= n; + + if (n >= 4*VL) { + vec_t v_s1_b = zeroes; + vec_t v_s1_c = zeroes; + vec_t v_s1_d = zeroes; + vec_t v_s2_b = zeroes; + vec_t v_s2_c = zeroes; + vec_t v_s2_d = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_s1_sums_b = zeroes; + vec_t v_s1_sums_c = zeroes; + vec_t v_s1_sums_d = zeroes; + vec_t tmp0, tmp1; + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + vec_t data_c = VLOADU(p + 2*VL); + vec_t data_d = VLOADU(p + 3*VL); + + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ +#if GCC_PREREQ(1, 0) + __asm__("" : "+v" (data_a), "+v" (data_b), + "+v" (data_c), "+v" (data_d)); +#endif + + v_s2 = VDPBUSD(v_s2, data_a, mults); + v_s2_b = VDPBUSD(v_s2_b, data_b, mults); + v_s2_c = VDPBUSD(v_s2_c, data_c, mults); + v_s2_d = VDPBUSD(v_s2_d, data_d, mults); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_s1_sums_b = VADD32(v_s1_sums_b, v_s1_b); + v_s1_sums_c = VADD32(v_s1_sums_c, v_s1_c); + v_s1_sums_d = VADD32(v_s1_sums_d, v_s1_d); + + v_s1 = VDPBUSD(v_s1, data_a, ones); + v_s1_b = VDPBUSD(v_s1_b, data_b, ones); + v_s1_c = VDPBUSD(v_s1_c, data_c, ones); + v_s1_d = VDPBUSD(v_s1_d, data_d, ones); + + /* Same gcc bug workaround. See above */ +#if GCC_PREREQ(1, 0) && !defined(ARCH_X86_32) + __asm__("" : "+v" (v_s2), "+v" (v_s2_b), + "+v" (v_s2_c), "+v" (v_s2_d), + "+v" (v_s1_sums), + "+v" (v_s1_sums_b), + "+v" (v_s1_sums_c), + "+v" (v_s1_sums_d), + "+v" (v_s1), "+v" (v_s1_b), + "+v" (v_s1_c), "+v" (v_s1_d)); +#endif + p += 4*VL; + n -= 4*VL; + } while (n >= 4*VL); + + /* + * Reduce into v_s1 and v_s2 as follows: + * + * v_s2 = v_s2 + v_s2_b + v_s2_c + v_s2_d + + * (4*VL)*(v_s1_sums + v_s1_sums_b + + * v_s1_sums_c + v_s1_sums_d) + + * (3*VL)*v_s1 + (2*VL)*v_s1_b + VL*v_s1_c + * v_s1 = v_s1 + v_s1_b + v_s1_c + v_s1_d + */ + tmp0 = VADD32(v_s1, v_s1_b); + tmp1 = VADD32(v_s1, v_s1_c); + v_s1_sums = VADD32_4X(v_s1_sums, v_s1_sums_b, + v_s1_sums_c, v_s1_sums_d); + v_s1 = VADD32_3X(tmp0, v_s1_c, v_s1_d); + v_s2 = VADD32_7X(VSLL32(v_s1_sums, LOG2_VL + 2), + VSLL32(tmp0, LOG2_VL + 1), + VSLL32(tmp1, LOG2_VL), + v_s2, v_s2_b, v_s2_c, v_s2_d); + } + + /* Process the last 0 <= n < 4*VL bytes of the chunk. */ + if (n >= 2*VL) { + const vec_t data_a = VLOADU(p + 0*VL); + const vec_t data_b = VLOADU(p + 1*VL); + + v_s2 = VADD32(v_s2, VSLL32(v_s1, LOG2_VL + 1)); + v_s1 = VDPBUSD(v_s1, data_a, ones); + v_s1 = VDPBUSD(v_s1, data_b, ones); + v_s2 = VDPBUSD(v_s2, data_a, VSET1_8(VL)); + v_s2 = VDPBUSD(v_s2, data_a, mults); + v_s2 = VDPBUSD(v_s2, data_b, mults); + p += 2*VL; + n -= 2*VL; + } + if (n) { + /* Process the last 0 < n < 2*VL bytes of the chunk. 
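+ * The multipliers are rebased with VADD8 so that the last byte of the
+ * chunk still gets weight 1: e.g. with VL == 16 and n == 5 they become
+ * [5, 4, 3, 2, 1, 0, -1, ...], and since the bytes past index 4 are
+ * loaded as zeroes (masked load or zero-padded copy) the meaningless
+ * trailing multipliers contribute nothing.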
*/ + vec_t data; + + v_s2 = VADD32(v_s2, VMULLO32(v_s1, VSET1_32(n))); + + mults = VADD8(mults, VSET1_8((int)n - VL)); + if (n > VL) { + data = VLOADU(p); + v_s1 = VDPBUSD(v_s1, data, ones); + v_s2 = VDPBUSD(v_s2, data, mults); + p += VL; + n -= VL; + mults = VADD8(mults, VSET1_8(-VL)); + } + /* + * Process the last 0 < n <= VL bytes of the chunk. + * Utilize a masked load if it's available. + */ +#if USE_MASKING + data = VMASKZ_LOADU((mask_t)-1 >> (VL - n), p); +#else + data = zeroes; + memcpy(&data, p, n); +#endif + v_s1 = VDPBUSD(v_s1, data, ones); + v_s2 = VDPBUSD(v_s2, data, mults); + p += n; + } + + reduce_to_32bits(v_s1, v_s2, &s1, &s2); + s1 %= DIVISOR; + s2 %= DIVISOR; + } +#else /* USE_VNNI */ + /* + * This is Adler-32 for SSE2 and AVX2. + * + * To horizontally sum bytes, use psadbw + paddd, where one of the + * arguments to psadbw is all-zeroes. + * + * For the s2 contribution from (2*VL - i)*data[i] for each of the 2*VL + * bytes of each iteration of the inner loop, use punpck{l,h}bw + paddw + * to sum, for each i across iterations, byte i into a corresponding + * 16-bit counter in v_byte_sums_*. After the inner loop, use pmaddwd + * to multiply each counter by (2*VL - i), then add the products to s2. + * + * An alternative implementation would use pmaddubsw and pmaddwd in the + * inner loop to do (2*VL - i)*data[i] directly and add the products in + * groups of 4 to 32-bit counters. However, on average that approach + * seems to be slower than the current approach which delays the + * multiplications. Also, pmaddubsw requires SSSE3; the current + * approach keeps the implementation aligned between SSE2 and AVX2. + * + * The inner loop processes 2*VL bytes per iteration. Increasing this + * to 4*VL doesn't seem to be helpful here. + */ + while (len) { + /* + * Calculate the length of the next data chunk such that s1 and + * s2 are guaranteed to not exceed UINT32_MAX, and every + * v_byte_sums_* counter is guaranteed to not exceed INT16_MAX. + * It's INT16_MAX, not UINT16_MAX, because v_byte_sums_* are + * used with pmaddwd which does signed multiplication. In the + * SSE2 case this limits chunks to 4096 bytes instead of 5504. + */ + size_t n = MIN(len, MIN(2 * VL * (INT16_MAX / UINT8_MAX), + MAX_CHUNK_LEN) & ~(2*VL - 1)); + len -= n; + + if (n >= 2*VL) { + vec_t v_s1 = zeroes; + vec_t v_s1_sums = zeroes; + vec_t v_byte_sums_a = zeroes; + vec_t v_byte_sums_b = zeroes; + vec_t v_byte_sums_c = zeroes; + vec_t v_byte_sums_d = zeroes; + vec_t v_s2; + + s2 += s1 * (n & ~(2*VL - 1)); + + do { + vec_t data_a = VLOADU(p + 0*VL); + vec_t data_b = VLOADU(p + 1*VL); + + v_s1_sums = VADD32(v_s1_sums, v_s1); + v_byte_sums_a = VADD16(v_byte_sums_a, + VUNPACKLO8(data_a, zeroes)); + v_byte_sums_b = VADD16(v_byte_sums_b, + VUNPACKHI8(data_a, zeroes)); + v_byte_sums_c = VADD16(v_byte_sums_c, + VUNPACKLO8(data_b, zeroes)); + v_byte_sums_d = VADD16(v_byte_sums_d, + VUNPACKHI8(data_b, zeroes)); + v_s1 = VADD32(v_s1, + VADD32(VSAD8(data_a, zeroes), + VSAD8(data_b, zeroes))); + /* + * Workaround for gcc bug where it generates + * unnecessary move instructions + * (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=107892) + */ +#if GCC_PREREQ(1, 0) + __asm__("" : "+x" (v_s1), "+x" (v_s1_sums), + "+x" (v_byte_sums_a), + "+x" (v_byte_sums_b), + "+x" (v_byte_sums_c), + "+x" (v_byte_sums_d)); +#endif + p += 2*VL; + n -= 2*VL; + } while (n >= 2*VL); + + /* + * Calculate v_s2 as (2*VL)*v_s1_sums + + * [2*VL, 2*VL - 1, 2*VL - 2, ..., 1] * v_byte_sums. + * Then update s1 and s2 from v_s1 and v_s2. 
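+ *
+ * This is equivalent because a byte read at offset i of some iteration
+ * needs a weight of (2*VL - i) within its own iteration plus an extra
+ * 2*VL for every later iteration; the latter is exactly what v_s1_sums
+ * accumulates, since v_s1 is added into it once per iteration before
+ * being updated.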
+ */ + v_s2 = VADD32_5X(VSLL32(v_s1_sums, LOG2_VL + 1), + VMADD16(v_byte_sums_a, mults_a), + VMADD16(v_byte_sums_b, mults_b), + VMADD16(v_byte_sums_c, mults_c), + VMADD16(v_byte_sums_d, mults_d)); + reduce_to_32bits(v_s1, v_s2, &s1, &s2); + } + /* + * Process the last 0 <= n < 2*VL bytes of the chunk using + * scalar instructions and reduce s1 and s2 mod DIVISOR. + */ + ADLER32_CHUNK(s1, s2, p, n); + } +#endif /* !USE_VNNI */ + return (s2 << 16) | s1; +} + +#undef vec_t +#undef mask_t +#undef LOG2_VL +#undef VADD8 +#undef VADD16 +#undef VADD32 +#undef VDPBUSD +#undef VLOAD +#undef VLOADU +#undef VMADD16 +#undef VMASKZ_LOADU +#undef VMULLO32 +#undef VSAD8 +#undef VSET1_8 +#undef VSET1_32 +#undef VSETZERO +#undef VSLL32 +#undef VUNPACKHI8 +#undef VUNPACKLO8 + +#undef SUFFIX +#undef ATTRIBUTES +#undef VL +#undef USE_VNNI +#undef USE_MASKING diff --git a/Sources/DEFLATE/x86/cpu_features.c b/Sources/DEFLATE/x86/cpu_features.c new file mode 100644 index 00000000..991aeb04 --- /dev/null +++ b/Sources/DEFLATE/x86/cpu_features.c @@ -0,0 +1,189 @@ +/* + * x86/cpu_features.c - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "../cpu_features_common.h" /* must be included first */ +#include "x86/cpu_features.h" + +#ifdef X86_CPU_FEATURES_KNOWN +/* Runtime x86 CPU feature detection is supported. */ + +/* Execute the CPUID instruction. */ +static inline void +cpuid(u32 leaf, u32 subleaf, u32 *a, u32 *b, u32 *c, u32 *d) +{ +#ifdef _MSC_VER + int result[4]; + + __cpuidex(result, leaf, subleaf); + *a = result[0]; + *b = result[1]; + *c = result[2]; + *d = result[3]; +#else + __asm__ volatile(".ifnc %%ebx, %1; mov %%ebx, %1; .endif\n" + "cpuid \n" + ".ifnc %%ebx, %1; xchg %%ebx, %1; .endif\n" + : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d) + : "a" (leaf), "c" (subleaf)); +#endif +} + +/* Read an extended control register. */ +static inline u64 +read_xcr(u32 index) +{ +#ifdef _MSC_VER + return _xgetbv(index); +#else + u32 d, a; + + /* + * Execute the "xgetbv" instruction. Old versions of binutils do not + * recognize this instruction, so list the raw bytes instead. + * + * This must be 'volatile' to prevent this code from being moved out + * from under the check for OSXSAVE. 
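+ *
+ * The XCR0 value read here feeds the checks below: bit 1 is SSE state and
+ * bit 2 is AVX state, hence the (xcr0 & 0x6) == 0x6 tests, while bits 5-7
+ * are the opmask and upper-ZMM states required for AVX-512, hence
+ * (xcr0 & 0xe6) == 0xe6.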
+ */ + __asm__ volatile(".byte 0x0f, 0x01, 0xd0" : + "=d" (d), "=a" (a) : "c" (index)); + + return ((u64)d << 32) | a; +#endif +} + +static const struct cpu_feature x86_cpu_feature_table[] = { + {X86_CPU_FEATURE_SSE2, "sse2"}, + {X86_CPU_FEATURE_PCLMULQDQ, "pclmulqdq"}, + {X86_CPU_FEATURE_AVX, "avx"}, + {X86_CPU_FEATURE_AVX2, "avx2"}, + {X86_CPU_FEATURE_BMI2, "bmi2"}, + {X86_CPU_FEATURE_ZMM, "zmm"}, + {X86_CPU_FEATURE_AVX512F, "avx512f"}, + {X86_CPU_FEATURE_AVX512BW, "avx512bw"}, + {X86_CPU_FEATURE_AVX512VL, "avx512vl"}, + {X86_CPU_FEATURE_VPCLMULQDQ, "vpclmulqdq"}, + {X86_CPU_FEATURE_AVX512VNNI, "avx512_vnni"}, + {X86_CPU_FEATURE_AVXVNNI, "avx_vnni"}, +}; + +volatile u32 libdeflate_x86_cpu_features = 0; + +/* + * Don't use 512-bit vectors on Intel CPUs before Rocket Lake and Sapphire + * Rapids, due to the downclocking penalty. + */ +static inline bool +allow_512bit_vectors(const u32 manufacturer[3], u32 family, u32 model) +{ +#ifdef TEST_SUPPORT__DO_NOT_USE + return true; +#endif + if (memcmp(manufacturer, "GenuineIntel", 12) != 0) + return true; + if (family != 6) + return true; + switch (model) { + case 85: /* Skylake (Server), Cascade Lake, Cooper Lake */ + case 106: /* Ice Lake (Server) */ + case 108: /* Ice Lake (Server) */ + case 126: /* Ice Lake (Client) */ + case 140: /* Tiger Lake */ + case 141: /* Tiger Lake */ + return false; + } + return true; +} + +/* Initialize libdeflate_x86_cpu_features. */ +void libdeflate_init_x86_cpu_features(void) +{ + u32 max_leaf; + u32 manufacturer[3]; + u32 family, model; + u32 a, b, c, d; + u64 xcr0 = 0; + u32 features = 0; + + /* EAX=0: Highest Function Parameter and Manufacturer ID */ + cpuid(0, 0, &max_leaf, &manufacturer[0], &manufacturer[2], + &manufacturer[1]); + if (max_leaf < 1) + goto out; + + /* EAX=1: Processor Info and Feature Bits */ + cpuid(1, 0, &a, &b, &c, &d); + family = (a >> 8) & 0xf; + model = (a >> 4) & 0xf; + if (family == 6 || family == 0xf) + model += (a >> 12) & 0xf0; + if (family == 0xf) + family += (a >> 20) & 0xff; + if (d & (1 << 26)) + features |= X86_CPU_FEATURE_SSE2; + if (c & (1 << 1)) + features |= X86_CPU_FEATURE_PCLMULQDQ; + if (c & (1 << 27)) + xcr0 = read_xcr(0); + if ((c & (1 << 28)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX; + + if (max_leaf < 7) + goto out; + + /* EAX=7, ECX=0: Extended Features */ + cpuid(7, 0, &a, &b, &c, &d); + if ((b & (1 << 5)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVX2; + if (b & (1 << 8)) + features |= X86_CPU_FEATURE_BMI2; + if (((xcr0 & 0xe6) == 0xe6) && + allow_512bit_vectors(manufacturer, family, model)) + features |= X86_CPU_FEATURE_ZMM; + if ((b & (1 << 16)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512F; + if ((b & (1 << 30)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512BW; + if ((b & (1U << 31)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512VL; + if ((c & (1 << 10)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_VPCLMULQDQ; + if ((c & (1 << 11)) && ((xcr0 & 0xe6) == 0xe6)) + features |= X86_CPU_FEATURE_AVX512VNNI; + + /* EAX=7, ECX=1: Extended Features */ + cpuid(7, 1, &a, &b, &c, &d); + if ((a & (1 << 4)) && ((xcr0 & 0x6) == 0x6)) + features |= X86_CPU_FEATURE_AVXVNNI; + +out: + disable_cpu_features_for_testing(&features, x86_cpu_feature_table, + ARRAY_LEN(x86_cpu_feature_table)); + + libdeflate_x86_cpu_features = features | X86_CPU_FEATURES_KNOWN; +} + +#endif /* X86_CPU_FEATURES_KNOWN */ \ No newline at end of file diff --git a/Sources/DEFLATE/x86/cpu_features.h 
b/Sources/DEFLATE/x86/cpu_features.h new file mode 100644 index 00000000..d5d3f2ac --- /dev/null +++ b/Sources/DEFLATE/x86/cpu_features.h @@ -0,0 +1,176 @@ +/* + * x86/cpu_features.h - feature detection for x86 CPUs + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_CPU_FEATURES_H +#define LIB_X86_CPU_FEATURES_H + +#include "../lib_common.h" + +#if defined(ARCH_X86_32) || defined(ARCH_X86_64) + +#define X86_CPU_FEATURE_SSE2 (1 << 0) +#define X86_CPU_FEATURE_PCLMULQDQ (1 << 1) +#define X86_CPU_FEATURE_AVX (1 << 2) +#define X86_CPU_FEATURE_AVX2 (1 << 3) +#define X86_CPU_FEATURE_BMI2 (1 << 4) +/* + * ZMM indicates whether 512-bit vectors (zmm registers) should be used. On + * some CPUs, to avoid downclocking issues we don't set ZMM even if the CPU + * supports it, i.e. even if AVX512F is set. On these CPUs, we may still use + * AVX-512 instructions, but only with ymm and xmm registers. + */ +#define X86_CPU_FEATURE_ZMM (1 << 5) +#define X86_CPU_FEATURE_AVX512F (1 << 6) +#define X86_CPU_FEATURE_AVX512BW (1 << 7) +#define X86_CPU_FEATURE_AVX512VL (1 << 8) +#define X86_CPU_FEATURE_VPCLMULQDQ (1 << 9) +#define X86_CPU_FEATURE_AVX512VNNI (1 << 10) +#define X86_CPU_FEATURE_AVXVNNI (1 << 11) + +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +/* Runtime x86 CPU feature detection is supported. */ +# define X86_CPU_FEATURES_KNOWN (1U << 31) +extern volatile u32 libdeflate_x86_cpu_features; + +void libdeflate_init_x86_cpu_features(void); + +static inline u32 get_x86_cpu_features(void) +{ + if (libdeflate_x86_cpu_features == 0) + libdeflate_init_x86_cpu_features(); + return libdeflate_x86_cpu_features; +} +/* + * x86 intrinsics are also supported. Include the headers needed to use them. + * Normally just immintrin.h suffices. With clang in MSVC compatibility mode, + * immintrin.h incorrectly skips including sub-headers, so include those too. 
+ */ +# include +# if defined(_MSC_VER) && defined(__clang__) +# include +# include +# include +# include +# include +# include +# include +# include +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# if __has_include() +# include +# endif +# endif +#else +static inline u32 get_x86_cpu_features(void) { return 0; } +#endif + +#if defined(__SSE2__) || \ + (defined(_MSC_VER) && \ + (defined(ARCH_X86_64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 2))) +# define HAVE_SSE2(features) 1 +# define HAVE_SSE2_NATIVE 1 +#else +# define HAVE_SSE2(features) ((features) & X86_CPU_FEATURE_SSE2) +# define HAVE_SSE2_NATIVE 0 +#endif + +#if defined(__PCLMUL__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_PCLMULQDQ(features) 1 +#else +# define HAVE_PCLMULQDQ(features) ((features) & X86_CPU_FEATURE_PCLMULQDQ) +#endif + +#ifdef __AVX__ +# define HAVE_AVX(features) 1 +#else +# define HAVE_AVX(features) ((features) & X86_CPU_FEATURE_AVX) +#endif + +#ifdef __AVX2__ +# define HAVE_AVX2(features) 1 +#else +# define HAVE_AVX2(features) ((features) & X86_CPU_FEATURE_AVX2) +#endif + +#if defined(__BMI2__) || (defined(_MSC_VER) && defined(__AVX2__)) +# define HAVE_BMI2(features) 1 +# define HAVE_BMI2_NATIVE 1 +#else +# define HAVE_BMI2(features) ((features) & X86_CPU_FEATURE_BMI2) +# define HAVE_BMI2_NATIVE 0 +#endif + +#ifdef __AVX512F__ +# define HAVE_AVX512F(features) 1 +#else +# define HAVE_AVX512F(features) ((features) & X86_CPU_FEATURE_AVX512F) +#endif + +#ifdef __AVX512BW__ +# define HAVE_AVX512BW(features) 1 +#else +# define HAVE_AVX512BW(features) ((features) & X86_CPU_FEATURE_AVX512BW) +#endif + +#ifdef __AVX512VL__ +# define HAVE_AVX512VL(features) 1 +#else +# define HAVE_AVX512VL(features) ((features) & X86_CPU_FEATURE_AVX512VL) +#endif + +#ifdef __VPCLMULQDQ__ +# define HAVE_VPCLMULQDQ(features) 1 +#else +# define HAVE_VPCLMULQDQ(features) ((features) & X86_CPU_FEATURE_VPCLMULQDQ) +#endif + +#ifdef __AVX512VNNI__ +# define HAVE_AVX512VNNI(features) 1 +#else +# define HAVE_AVX512VNNI(features) ((features) & X86_CPU_FEATURE_AVX512VNNI) +#endif + +#ifdef __AVXVNNI__ +# define HAVE_AVXVNNI(features) 1 +#else +# define HAVE_AVXVNNI(features) ((features) & X86_CPU_FEATURE_AVXVNNI) +#endif + +#endif /* ARCH_X86_32 || ARCH_X86_64 */ + +#endif /* LIB_X86_CPU_FEATURES_H */ diff --git a/Sources/DEFLATE/x86/crc32_impl.h b/Sources/DEFLATE/x86/crc32_impl.h new file mode 100644 index 00000000..79494468 --- /dev/null +++ b/Sources/DEFLATE/x86/crc32_impl.h @@ -0,0 +1,137 @@ +/* + * x86/crc32_impl.h - x86 implementations of the gzip CRC-32 algorithm + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef LIB_X86_CRC32_IMPL_H +#define LIB_X86_CRC32_IMPL_H + +#include "x86/cpu_features.h" + +#if defined(__GNUC__) || defined(__clang__) || defined(_MSC_VER) +/* PCLMULQDQ implementation */ +# define crc32_x86_pclmulqdq crc32_x86_pclmulqdq +# define SUFFIX _pclmulqdq +# define ATTRIBUTES _target_attribute("pclmul") +# define VL 16 +# define FOLD_LESSTHAN16BYTES 0 +# define USE_TERNARYLOGIC 0 +# include "x86/crc32_pclmul_template.h" + +/* + * PCLMULQDQ/AVX implementation. Compared to the regular PCLMULQDQ + * implementation, this still uses 128-bit vectors, but it has two potential + * benefits. First, simply compiling against the AVX target can improve + * performance significantly (e.g. 10100 MB/s to 16700 MB/s on Skylake) without + * actually using any AVX intrinsics, probably due to the availability of + * non-destructive VEX-encoded instructions. Second, AVX support implies SSSE3 + * and SSE4.1 support, and we can use SSSE3 and SSE4.1 intrinsics for efficient + * handling of partial blocks. (We *could* compile a variant with + * PCLMULQDQ+SSE4.1 without AVX, but for simplicity we don't currently bother.) + */ +# define crc32_x86_pclmulqdq_avx crc32_x86_pclmulqdq_avx +# define SUFFIX _pclmulqdq_avx +# define ATTRIBUTES _target_attribute("pclmul,avx") +# define VL 16 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 +# include "x86/crc32_pclmul_template.h" +#endif + +/* + * VPCLMULQDQ/AVX2 implementation. Uses 256-bit vectors. + * + * Currently this can't be enabled with MSVC because MSVC has a bug where it + * incorrectly assumes that VPCLMULQDQ implies AVX-512: + * https://developercommunity.visualstudio.com/t/Compiler-incorrectly-assumes-VAES-and-VP/10578785?space=62&q=AVX512&sort=newest + */ +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) +# define crc32_x86_vpclmulqdq_avx2 crc32_x86_vpclmulqdq_avx2 +# define SUFFIX _vpclmulqdq_avx2 +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx2") +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 0 +# include "x86/crc32_pclmul_template.h" +#endif + +#if GCC_PREREQ(8, 1) || CLANG_PREREQ(6, 0, 10000000) || MSVC_PREREQ(1920) +/* + * VPCLMULQDQ/AVX512 implementation with 256-bit vectors. This takes advantage + * of some AVX-512 instructions but uses 256-bit vectors rather than 512-bit. + * This can be useful on CPUs where 512-bit vectors cause downclocking. 
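+ *
+ * (When the implementation is selected at runtime, the 512-bit variant
+ * below is additionally gated on X86_CPU_FEATURE_ZMM, which cpu_features.c
+ * leaves clear on CPU models known to downclock, so those CPUs fall
+ * through to this 256-bit variant.)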
+ */ +# define crc32_x86_vpclmulqdq_avx512_vl256 crc32_x86_vpclmulqdq_avx512_vl256 +# define SUFFIX _vpclmulqdq_avx512_vl256 +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# define VL 32 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "x86/crc32_pclmul_template.h" + +/* VPCLMULQDQ/AVX512 implementation with 512-bit vectors */ +# define crc32_x86_vpclmulqdq_avx512_vl512 crc32_x86_vpclmulqdq_avx512_vl512 +# define SUFFIX _vpclmulqdq_avx512_vl512 +# define ATTRIBUTES _target_attribute("vpclmulqdq,pclmul,avx512vl") +# define VL 64 +# define FOLD_LESSTHAN16BYTES 1 +# define USE_TERNARYLOGIC 1 +# include "x86/crc32_pclmul_template.h" +#endif + +static inline crc32_func_t +arch_select_crc32_func(void) +{ + const u32 features MAYBE_UNUSED = get_x86_cpu_features(); + +#ifdef crc32_x86_vpclmulqdq_avx512_vl512 + if ((features & X86_CPU_FEATURE_ZMM) && + HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512_vl512; +#endif +#ifdef crc32_x86_vpclmulqdq_avx512_vl256 + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX512F(features) && HAVE_AVX512VL(features)) + return crc32_x86_vpclmulqdq_avx512_vl256; +#endif +#ifdef crc32_x86_vpclmulqdq_avx2 + if (HAVE_VPCLMULQDQ(features) && HAVE_PCLMULQDQ(features) && + HAVE_AVX2(features)) + return crc32_x86_vpclmulqdq_avx2; +#endif +#ifdef crc32_x86_pclmulqdq_avx + if (HAVE_PCLMULQDQ(features) && HAVE_AVX(features)) + return crc32_x86_pclmulqdq_avx; +#endif +#ifdef crc32_x86_pclmulqdq + if (HAVE_PCLMULQDQ(features)) + return crc32_x86_pclmulqdq; +#endif + return NULL; +} +#define arch_select_crc32_func arch_select_crc32_func + +#endif /* LIB_X86_CRC32_IMPL_H */ diff --git a/Sources/DEFLATE/x86/crc32_pclmul_template.h b/Sources/DEFLATE/x86/crc32_pclmul_template.h new file mode 100644 index 00000000..4257d449 --- /dev/null +++ b/Sources/DEFLATE/x86/crc32_pclmul_template.h @@ -0,0 +1,487 @@ +/* + * x86/crc32_pclmul_template.h - gzip CRC-32 with PCLMULQDQ instructions + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +/* + * This file is a "template" for instantiating PCLMULQDQ-based crc32_x86 + * functions. The "parameters" are: + * + * SUFFIX: + * Name suffix to append to all instantiated functions. + * ATTRIBUTES: + * Target function attributes to use. 
Must satisfy the dependencies of the + * other parameters as follows: + * VL=16 && FOLD_LESSTHAN16BYTES=0: at least pclmul + * VL=16 && FOLD_LESSTHAN16BYTES=1: at least pclmul,sse4.1 + * VL=32 && USE_TERNARYLOGIC=0: at least vpclmulqdq,pclmul,avx2 + * VL=32 && USE_TERNARYLOGIC=1: at least vpclmulqdq,pclmul,avx512vl + * VL=64: at least vpclmulqdq,pclmul,avx512vl + * VL: + * Vector length in bytes. Supported values are 16, 32, and 64. + * FOLD_LESSTHAN16BYTES: + * Use vector instructions to handle any partial blocks at the beginning + * and end, instead of falling back to scalar instructions for those parts. + * USE_TERNARYLOGIC: + * Use the vpternlog instruction to do three-argument XORs. + * + * The overall algorithm used is CRC folding with carryless multiplication + * instructions. Note that the x86 crc32 instruction cannot be used, as it is + * for a different polynomial, not the gzip one. For an explanation of CRC + * folding with carryless multiplication instructions, see + * scripts/gen_crc32_multipliers.c and the following paper: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf + * + * The original pclmulqdq instruction does one 64x64 to 128-bit carryless + * multiplication. The VPCLMULQDQ feature added instructions that do two + * parallel 64x64 to 128-bit carryless multiplications in combination with AVX + * or AVX512VL, or four in combination with AVX512F. + */ + +#undef fold_vec128 +static forceinline ATTRIBUTES __m128i +ADD_SUFFIX(fold_vec128)(__m128i src, __m128i dst, __m128i multipliers) +{ + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x00)); + dst = _mm_xor_si128(dst, _mm_clmulepi64_si128(src, multipliers, 0x11)); + return dst; +} +#define fold_vec128 ADD_SUFFIX(fold_vec128) + +#if VL >= 32 +#undef fold_vec256 +static forceinline ATTRIBUTES __m256i +ADD_SUFFIX(fold_vec256)(__m256i src, __m256i dst, __m256i multipliers) +{ +#if USE_TERNARYLOGIC + return _mm256_ternarylogic_epi32( + _mm256_clmulepi64_epi128(src, multipliers, 0x00), + _mm256_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +#else + return _mm256_xor_si256( + _mm256_xor_si256(dst, + _mm256_clmulepi64_epi128(src, multipliers, 0x00)), + _mm256_clmulepi64_epi128(src, multipliers, 0x11)); +#endif +} +#define fold_vec256 ADD_SUFFIX(fold_vec256) +#endif /* VL >= 32 */ + +#if VL >= 64 +#undef fold_vec512 +static forceinline ATTRIBUTES __m512i +ADD_SUFFIX(fold_vec512)(__m512i src, __m512i dst, __m512i multipliers) +{ + return _mm512_ternarylogic_epi32( + _mm512_clmulepi64_epi128(src, multipliers, 0x00), + _mm512_clmulepi64_epi128(src, multipliers, 0x11), + dst, + 0x96); +} +#define fold_vec512 ADD_SUFFIX(fold_vec512) +#endif /* VL >= 64 */ + +#if VL == 16 +# define vec_t __m128i +# define fold_vec fold_vec128 +# define VLOAD_UNALIGNED(p) _mm_loadu_si128((const void *)(p)) +# define VXOR(a, b) _mm_xor_si128((a), (b)) +# define M128I_TO_VEC(a) a +# define MULTS_8V _mm_set_epi64x(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_4V _mm_set_epi64x(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_2V _mm_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG) +# define MULTS_1V _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG) +#elif VL == 32 +# define vec_t __m256i +# define fold_vec fold_vec256 +# define VLOAD_UNALIGNED(p) _mm256_loadu_si256((const void *)(p)) +# define VXOR(a, b) _mm256_xor_si256((a), (b)) +# define M128I_TO_VEC(a) 
_mm256_castsi128_si256(a) +# define MULTS(a, b) _mm256_set_epi64x(a, b, a, b) +# define MULTS_8V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_4V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_2V MULTS(CRC32_X479_MODG, CRC32_X543_MODG) +# define MULTS_1V MULTS(CRC32_X223_MODG, CRC32_X287_MODG) +#elif VL == 64 +# define vec_t __m512i +# define fold_vec fold_vec512 +# define VLOAD_UNALIGNED(p) _mm512_loadu_si512((const void *)(p)) +# define VXOR(a, b) _mm512_xor_si512((a), (b)) +# define M128I_TO_VEC(a) _mm512_castsi128_si512(a) +# define MULTS(a, b) _mm512_set_epi64(a, b, a, b, a, b, a, b) +# define MULTS_8V MULTS(CRC32_X4063_MODG, CRC32_X4127_MODG) +# define MULTS_4V MULTS(CRC32_X2015_MODG, CRC32_X2079_MODG) +# define MULTS_2V MULTS(CRC32_X991_MODG, CRC32_X1055_MODG) +# define MULTS_1V MULTS(CRC32_X479_MODG, CRC32_X543_MODG) +#else +# error "unsupported vector length" +#endif + +#if FOLD_LESSTHAN16BYTES +/* + * Given 'x' containing a 16-byte polynomial, and a pointer 'p' that points to + * the next '1 <= len <= 15' data bytes, rearrange the concatenation of 'x' and + * the data into vectors x0 and x1 that contain 'len' bytes and 16 bytes, + * respectively. Then fold x0 into x1 and return the result. + * Assumes that 'p + len - 16' is in-bounds. + */ +#undef fold_lessthan16bytes +static forceinline ATTRIBUTES __m128i +ADD_SUFFIX(fold_lessthan16bytes)(__m128i x, const u8 *p, size_t len, + __m128i /* __v2du */ multipliers_128b) +{ + /* + * pshufb(x, shift_tab[len..len+15]) left shifts x by 16-len bytes. + * pshufb(x, shift_tab[len+16..len+31]) right shifts x by len bytes. + */ + static const u8 shift_tab[48] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + }; + __m128i lshift = _mm_loadu_si128((const void *)&shift_tab[len]); + __m128i rshift = _mm_loadu_si128((const void *)&shift_tab[len + 16]); + __m128i x0, x1; + + /* x0 = x left-shifted by '16 - len' bytes */ + x0 = _mm_shuffle_epi8(x, lshift); + + /* + * x1 = the last '16 - len' bytes from x (i.e. x right-shifted by 'len' + * bytes) followed by the remaining data. + */ + x1 = _mm_blendv_epi8(_mm_shuffle_epi8(x, rshift), + _mm_loadu_si128((const void *)(p + len - 16)), + /* msb 0/1 of each byte selects byte from arg1/2 */ + rshift); + + return fold_vec128(x0, x1, multipliers_128b); +} +#define fold_lessthan16bytes ADD_SUFFIX(fold_lessthan16bytes) +#endif /* FOLD_LESSTHAN16BYTES */ + +static u32 ATTRIBUTES +ADD_SUFFIX(crc32_x86)(u32 crc, const u8 *p, size_t len) +{ + const vec_t multipliers_8v = MULTS_8V; /* 8 vecs */ + const vec_t multipliers_4v = MULTS_4V; /* 4 vecs */ + const vec_t multipliers_2v = MULTS_2V; /* 2 vecs */ + const vec_t multipliers_1v = MULTS_1V; /* 1 vecs */ + const __m128i /* __v2du */ multipliers_128b = + _mm_set_epi64x(CRC32_X95_MODG, CRC32_X159_MODG); + const __m128i /* __v2du */ final_multiplier = + _mm_set_epi64x(0, CRC32_X63_MODG); + const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF); + const __m128i /* __v2du */ barrett_reduction_constants = + _mm_set_epi64x(CRC32_BARRETT_CONSTANT_2, + CRC32_BARRETT_CONSTANT_1); + vec_t v0, v1, v2, v3, v4, v5, v6, v7; + __m128i x0, x1; + + /* + * There are two overall code paths. 
The first path supports all + * lengths, but is intended for short lengths; it uses unaligned loads + * and does at most 4-way folds. The second path only supports longer + * lengths, aligns the pointer in order to do aligned loads, and does up + * to 8-way folds. The length check below decides which path to take. + */ + if (len < 64*VL) { + if (len < VL) + return crc32_slice1(crc, p, len); + + v0 = VXOR(VLOAD_UNALIGNED(p), + M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + p += VL; + + if (len >= 4*VL) { + v1 = VLOAD_UNALIGNED(p + 0*VL); + v2 = VLOAD_UNALIGNED(p + 1*VL); + v3 = VLOAD_UNALIGNED(p + 2*VL); + p += 3*VL; + while (len >= 8*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_4v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_4v); + v2 = fold_vec(v2, VLOAD_UNALIGNED(p + 2*VL), + multipliers_4v); + v3 = fold_vec(v3, VLOAD_UNALIGNED(p + 3*VL), + multipliers_4v); + p += 4*VL; + len -= 4*VL; + } + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p + 0*VL), + multipliers_2v); + v1 = fold_vec(v1, VLOAD_UNALIGNED(p + 1*VL), + multipliers_2v); + p += 2*VL; + } + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + } + } else { + if (len >= 2*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + if (len >= 3*VL) { + v0 = fold_vec(v0, VLOAD_UNALIGNED(p), + multipliers_1v); + p += VL; + } + } + } + } else { + size_t align = -(uintptr_t)p & (VL-1); + const vec_t *vp; + + /* Align p to the next VL-byte boundary. */ + if (align == 0) { + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + } else { + len -= align; + #if FOLD_LESSTHAN16BYTES + x0 = _mm_xor_si128(_mm_loadu_si128((const void *)p), + _mm_cvtsi32_si128(crc)); + p += 16; + if (align & 15) { + x0 = fold_lessthan16bytes(x0, p, align & 15, + multipliers_128b); + p += align & 15; + align &= ~15; + } + while (align >= 16) { + x0 = fold_vec128(x0, *(const __m128i *)p, + multipliers_128b); + p += 16; + align -= 16; + } + v0 = M128I_TO_VEC(x0); + # if VL == 32 + v0 = _mm256_inserti128_si256(v0, *(const __m128i *)p, 1); + p += 16; + # elif VL == 64 + v0 = _mm512_inserti32x4(v0, *(const __m128i *)p, 1); + p += 16; + v0 = _mm512_inserti64x4(v0, *(const __m256i *)p, 1); + p += 32; + # endif + vp = (const vec_t *)p; + #else + crc = crc32_slice1(crc, p, align); + p += align; + vp = (const vec_t *)p; + v0 = VXOR(*vp++, M128I_TO_VEC(_mm_cvtsi32_si128(crc))); + #endif + } + v1 = *vp++; + v2 = *vp++; + v3 = *vp++; + v4 = *vp++; + v5 = *vp++; + v6 = *vp++; + v7 = *vp++; + do { + v0 = fold_vec(v0, *vp++, multipliers_8v); + v1 = fold_vec(v1, *vp++, multipliers_8v); + v2 = fold_vec(v2, *vp++, multipliers_8v); + v3 = fold_vec(v3, *vp++, multipliers_8v); + v4 = fold_vec(v4, *vp++, multipliers_8v); + v5 = fold_vec(v5, *vp++, multipliers_8v); + v6 = fold_vec(v6, *vp++, multipliers_8v); + v7 = fold_vec(v7, *vp++, multipliers_8v); + len -= 8*VL; + } while (len >= 16*VL); + + /* + * Reduce v0-v7 (length 8*VL bytes) to v0 (length VL bytes) + * and fold in any VL-byte data segments that remain. 
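/*
 * Illustrative sketch, not part of the patch: how the 48-byte shift_tab
 * used by fold_lessthan16bytes() above turns pshufb into a variable byte
 * shift.  Loading the control mask from &shift_tab[len] left-shifts a
 * vector by '16 - len' bytes, and loading it from &shift_tab[len + 16]
 * right-shifts it by 'len' bytes, because control bytes with the high bit
 * set (0xff) zero the corresponding output byte.  Standalone example;
 * build with 'cc -mssse3'.
 */
#include <immintrin.h>
#include <stdio.h>

static const unsigned char shift_tab[48] = {
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};

static void print_vec(const char *label, __m128i v)
{
	unsigned char out[16];
	int i;

	_mm_storeu_si128((__m128i *)out, v);
	printf("%-16s", label);
	for (i = 0; i < 16; i++)
		printf(" %02x", out[i]);
	printf("\n");
}

int main(void)
{
	unsigned char bytes[16];
	size_t len = 5;	/* pretend 5 data bytes remain */
	__m128i x, lshift, rshift;
	int i;

	for (i = 0; i < 16; i++)
		bytes[i] = 0x10 + i;	/* 10 11 12 ... 1f */
	x = _mm_loadu_si128((const __m128i *)bytes);

	lshift = _mm_loadu_si128((const __m128i *)&shift_tab[len]);
	rshift = _mm_loadu_si128((const __m128i *)&shift_tab[len + 16]);

	print_vec("x:", x);
	/* 11 zero bytes, then 10 11 12 13 14 */
	print_vec("x << (16-len):", _mm_shuffle_epi8(x, lshift));
	/* 15 16 ... 1f, then 5 zero bytes */
	print_vec("x >> len:", _mm_shuffle_epi8(x, rshift));
	return 0;
}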
+ */ + v0 = fold_vec(v0, v4, multipliers_4v); + v1 = fold_vec(v1, v5, multipliers_4v); + v2 = fold_vec(v2, v6, multipliers_4v); + v3 = fold_vec(v3, v7, multipliers_4v); + if (len & (4*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_4v); + v1 = fold_vec(v1, *vp++, multipliers_4v); + v2 = fold_vec(v2, *vp++, multipliers_4v); + v3 = fold_vec(v3, *vp++, multipliers_4v); + } + v0 = fold_vec(v0, v2, multipliers_2v); + v1 = fold_vec(v1, v3, multipliers_2v); + if (len & (2*VL)) { + v0 = fold_vec(v0, *vp++, multipliers_2v); + v1 = fold_vec(v1, *vp++, multipliers_2v); + } + v0 = fold_vec(v0, v1, multipliers_1v); + if (len & VL) + v0 = fold_vec(v0, *vp++, multipliers_1v); + p = (const u8 *)vp; + } + + /* + * Reduce v0 (length VL bytes) to x0 (length 16 bytes) + * and fold in any 16-byte data segments that remain. + */ +#if VL == 16 + x0 = v0; +#else + { +# if VL == 32 + __m256i y0 = v0; +# else + const __m256i multipliers_256b = + _mm256_set_epi64x(CRC32_X223_MODG, CRC32_X287_MODG, + CRC32_X223_MODG, CRC32_X287_MODG); + __m256i y0 = fold_vec256(_mm512_extracti64x4_epi64(v0, 0), + _mm512_extracti64x4_epi64(v0, 1), + multipliers_256b); + if (len & 32) { + y0 = fold_vec256(y0, _mm256_loadu_si256((const void *)p), + multipliers_256b); + p += 32; + } +# endif + x0 = fold_vec128(_mm256_extracti128_si256(y0, 0), + _mm256_extracti128_si256(y0, 1), + multipliers_128b); + } + if (len & 16) { + x0 = fold_vec128(x0, _mm_loadu_si128((const void *)p), + multipliers_128b); + p += 16; + } +#endif + len &= 15; + + /* + * If fold_lessthan16bytes() is available, handle any remainder + * of 1 to 15 bytes now, before reducing to 32 bits. + */ +#if FOLD_LESSTHAN16BYTES + if (len) + x0 = fold_lessthan16bytes(x0, p, len, multipliers_128b); +#endif + + /* + * Fold 128 => 96 bits. This also implicitly appends 32 zero bits, + * which is equivalent to multiplying by x^32. This is needed because + * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x). + */ + x0 = _mm_xor_si128(_mm_srli_si128(x0, 8), + _mm_clmulepi64_si128(x0, multipliers_128b, 0x10)); + + /* Fold 96 => 64 bits. */ + x0 = _mm_xor_si128(_mm_srli_si128(x0, 4), + _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), + final_multiplier, 0x00)); + + /* + * Reduce 64 => 32 bits using Barrett reduction. + * + * Let M(x) = A(x)*x^32 + B(x) be the remaining message. The goal is to + * compute R(x) = M(x) mod G(x). Since degree(B(x)) < degree(G(x)): + * + * R(x) = (A(x)*x^32 + B(x)) mod G(x) + * = (A(x)*x^32) mod G(x) + B(x) + * + * Then, by the Division Algorithm there exists a unique q(x) such that: + * + * A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x) + * + * Since the left-hand side is of maximum degree 31, the right-hand side + * must be too. This implies that we can apply 'mod x^32' to the + * right-hand side without changing its value: + * + * (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32 + * + * Note that '+' is equivalent to '-' in polynomials over GF(2). + * + * We also know that: + * + * / A(x)*x^32 \ + * q(x) = floor ( --------- ) + * \ G(x) / + * + * To compute this efficiently, we can multiply the top and bottom by + * x^32 and move the division by G(x) to the top: + * + * / A(x) * floor(x^64 / G(x)) \ + * q(x) = floor ( ------------------------- ) + * \ x^32 / + * + * Note that floor(x^64 / G(x)) is a constant. 
+ * + * So finally we have: + * + * / A(x) * floor(x^64 / G(x)) \ + * R(x) = B(x) + G(x)*floor ( ------------------------- ) + * \ x^32 / + */ + x1 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32), + barrett_reduction_constants, 0x00); + x1 = _mm_clmulepi64_si128(_mm_and_si128(x1, mask32), + barrett_reduction_constants, 0x10); + x0 = _mm_xor_si128(x0, x1); +#if FOLD_LESSTHAN16BYTES + crc = _mm_extract_epi32(x0, 1); +#else + crc = _mm_cvtsi128_si32(_mm_shuffle_epi32(x0, 0x01)); + /* Process up to 15 bytes left over at the end. */ + crc = crc32_slice1(crc, p, len); +#endif + return crc; +} + +#undef vec_t +#undef fold_vec +#undef VLOAD_UNALIGNED +#undef VXOR +#undef M128I_TO_VEC +#undef MULTS +#undef MULTS_8V +#undef MULTS_4V +#undef MULTS_2V +#undef MULTS_1V + +#undef SUFFIX +#undef ATTRIBUTES +#undef VL +#undef FOLD_LESSTHAN16BYTES +#undef USE_TERNARYLOGIC diff --git a/Sources/DEFLATE/x86/decompress_impl.h b/Sources/DEFLATE/x86/decompress_impl.h new file mode 100644 index 00000000..85ca920a --- /dev/null +++ b/Sources/DEFLATE/x86/decompress_impl.h @@ -0,0 +1,57 @@ +#ifndef LIB_X86_DECOMPRESS_IMPL_H +#define LIB_X86_DECOMPRESS_IMPL_H + +#include "x86/cpu_features.h" + +/* + * BMI2 optimized decompression function. + * + * With gcc and clang we just compile the whole function with + * __attribute__((target("bmi2"))), and the compiler uses bmi2 automatically. + * + * With MSVC, there is no target function attribute, but it's still possible to + * use bmi2 intrinsics explicitly. Currently we mostly don't, but there's a + * case in which we do (see below), so we at least take advantage of that. + * However, MSVC from VS2017 (toolset v141) apparently miscompiles the _bzhi_*() + * intrinsics. It seems to be fixed in VS2022. Hence, use MSVC_PREREQ(1930). + */ +#if defined(__GNUC__) || defined(__clang__) || MSVC_PREREQ(1930) +# define deflate_decompress_bmi2 deflate_decompress_bmi2 +# define FUNCNAME deflate_decompress_bmi2 +# define ATTRIBUTES _target_attribute("bmi2") + /* + * Even with __attribute__((target("bmi2"))), gcc doesn't reliably use the + * bzhi instruction for 'word & BITMASK(count)'. So use the bzhi intrinsic + * explicitly. EXTRACT_VARBITS() is equivalent to 'word & BITMASK(count)'; + * EXTRACT_VARBITS8() is equivalent to 'word & BITMASK((u8)count)'. + * Nevertheless, their implementation using the bzhi intrinsic is identical, + * as the bzhi instruction truncates the count to 8 bits implicitly. 
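/*
 * Hedged sketch, not part of the patch: what the _bzhi_*() intrinsics
 * referred to above compute.  _bzhi_u64(word, count) clears all bits of
 * 'word' at positions >= count, i.e. it matches 'word & BITMASK(count)',
 * and the hardware only looks at the low 8 bits of 'count'.  Standalone
 * example for a 64-bit target; build with 'cc -mbmi2'.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t word = 0x123456789abcdef0ull;
	unsigned count;

	for (count = 0; count <= 64; count += 16) {
		/* Portable equivalent of masking off the high bits. */
		uint64_t mask = (count >= 64) ? ~0ull : (1ull << count) - 1;

		printf("count=%2u  bzhi=%016llx  mask&=%016llx\n", count,
		       (unsigned long long)_bzhi_u64(word, count),
		       (unsigned long long)(word & mask));
	}
	return 0;
}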
+ */ +# ifndef __clang__ +# ifdef ARCH_X86_64 +# define EXTRACT_VARBITS(word, count) _bzhi_u64((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u64((word), (count)) +# else +# define EXTRACT_VARBITS(word, count) _bzhi_u32((word), (count)) +# define EXTRACT_VARBITS8(word, count) _bzhi_u32((word), (count)) +# endif +# endif +# include "../decompress_template.h" +#endif + +#if defined(deflate_decompress_bmi2) && HAVE_BMI2_NATIVE +#define DEFAULT_IMPL deflate_decompress_bmi2 +#else +static inline decompress_func_t +arch_select_decompress_func(void) +{ +#ifdef deflate_decompress_bmi2 + if (HAVE_BMI2(get_x86_cpu_features())) + return deflate_decompress_bmi2; +#endif + return NULL; +} +#define arch_select_decompress_func arch_select_decompress_func +#endif + +#endif /* LIB_X86_DECOMPRESS_IMPL_H */ \ No newline at end of file diff --git a/Sources/DEFLATE/x86/matchfinder_impl.h b/Sources/DEFLATE/x86/matchfinder_impl.h new file mode 100644 index 00000000..080a7492 --- /dev/null +++ b/Sources/DEFLATE/x86/matchfinder_impl.h @@ -0,0 +1,122 @@ +/* + * x86/matchfinder_impl.h - x86 implementations of matchfinder functions + * + * Copyright 2016 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
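/*
 * Hedged sketch, not part of the patch: the runtime-dispatch pattern that
 * arch_select_decompress_func() and arch_select_crc32_func() plug into.
 * The first call runs a dispatcher that probes the CPU once, caches the
 * chosen implementation in a function pointer, and every later call goes
 * straight to that implementation.  This standalone version uses
 * GCC/Clang's __builtin_cpu_supports() in place of the library's
 * get_x86_cpu_features() helper.
 */
#include <stdio.h>

typedef int (*impl_func_t)(int x);

static int impl_generic(int x)
{
	return 2 * x;
}

__attribute__((target("bmi2")))
static int impl_bmi2(int x)
{
	return 2 * x;	/* a real implementation would use BMI2 here */
}

static int dispatch(int x);

/* Starts out pointing at the dispatcher, then gets overwritten. */
static impl_func_t chosen_impl = dispatch;

static int dispatch(int x)
{
	impl_func_t f = impl_generic;

	if (__builtin_cpu_supports("bmi2"))
		f = impl_bmi2;
	chosen_impl = f;	/* later calls skip the CPU probe */
	return f(x);
}

int main(void)
{
	printf("%d %d\n", chosen_impl(21), chosen_impl(21));
	return 0;
}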
+ */ + +#ifndef LIB_X86_MATCHFINDER_IMPL_H +#define LIB_X86_MATCHFINDER_IMPL_H + +#include "x86/cpu_features.h" + +#ifdef __AVX2__ +static forceinline void +matchfinder_init_avx2(mf_pos_t *data, size_t size) +{ + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_avx2 + +static forceinline void +matchfinder_rebase_avx2(mf_pos_t *data, size_t size) +{ + __m256i *p = (__m256i *)data; + __m256i v = _mm256_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm256_adds_epi16(p[0], v); + p[1] = _mm256_adds_epi16(p[1], v); + p[2] = _mm256_adds_epi16(p[2], v); + p[3] = _mm256_adds_epi16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_avx2 + +#elif HAVE_SSE2_NATIVE +static forceinline void +matchfinder_init_sse2(mf_pos_t *data, size_t size) +{ + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16(MATCHFINDER_INITVAL); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + p[0] = v; + p[1] = v; + p[2] = v; + p[3] = v; + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_init matchfinder_init_sse2 + +static forceinline void +matchfinder_rebase_sse2(mf_pos_t *data, size_t size) +{ + __m128i *p = (__m128i *)data; + __m128i v = _mm_set1_epi16((u16)-MATCHFINDER_WINDOW_SIZE); + + STATIC_ASSERT(MATCHFINDER_MEM_ALIGNMENT % sizeof(*p) == 0); + STATIC_ASSERT(MATCHFINDER_SIZE_ALIGNMENT % (4 * sizeof(*p)) == 0); + STATIC_ASSERT(sizeof(mf_pos_t) == 2); + + do { + /* PADDSW: Add Packed Signed Integers With Signed Saturation */ + p[0] = _mm_adds_epi16(p[0], v); + p[1] = _mm_adds_epi16(p[1], v); + p[2] = _mm_adds_epi16(p[2], v); + p[3] = _mm_adds_epi16(p[3], v); + p += 4; + size -= 4 * sizeof(*p); + } while (size != 0); +} +#define matchfinder_rebase matchfinder_rebase_sse2 +#endif /* HAVE_SSE2_NATIVE */ + +#endif /* LIB_X86_MATCHFINDER_IMPL_H */ \ No newline at end of file diff --git a/Sources/DEFLATE/zlib_compress.c b/Sources/DEFLATE/zlib_compress.c index 12d43602..ecf38d8b 100644 --- a/Sources/DEFLATE/zlib_compress.c +++ b/Sources/DEFLATE/zlib_compress.c @@ -30,53 +30,53 @@ LIBDEFLATEAPI size_t libdeflate_zlib_compress(struct libdeflate_compressor *c, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail) { - u8 *out_next = out; - u16 hdr; - unsigned compression_level; - unsigned level_hint; - size_t deflate_size; - - if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) - return 0; - - /* 2 byte header: CMF and FLG */ - hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); - compression_level = libdeflate_get_compression_level(c); - if (compression_level < 2) - level_hint = ZLIB_FASTEST_COMPRESSION; - else if (compression_level < 6) - level_hint = ZLIB_FAST_COMPRESSION; - else if 
(compression_level < 8) - level_hint = ZLIB_DEFAULT_COMPRESSION; - else - level_hint = ZLIB_SLOWEST_COMPRESSION; - hdr |= level_hint << 6; - hdr |= 31 - (hdr % 31); - - put_unaligned_be16(hdr, out_next); - out_next += 2; - - /* Compressed data */ - deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, - out_nbytes_avail - ZLIB_MIN_OVERHEAD); - if (deflate_size == 0) - return 0; - out_next += deflate_size; - - /* ADLER32 */ - put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); - out_next += 4; - - return out_next - (u8 *)out; + u8 *out_next = out; + u16 hdr; + unsigned compression_level; + unsigned level_hint; + size_t deflate_size; + + if (out_nbytes_avail <= ZLIB_MIN_OVERHEAD) + return 0; + + /* 2 byte header: CMF and FLG */ + hdr = (ZLIB_CM_DEFLATE << 8) | (ZLIB_CINFO_32K_WINDOW << 12); + compression_level = libdeflate_get_compression_level(c); + if (compression_level < 2) + level_hint = ZLIB_FASTEST_COMPRESSION; + else if (compression_level < 6) + level_hint = ZLIB_FAST_COMPRESSION; + else if (compression_level < 8) + level_hint = ZLIB_DEFAULT_COMPRESSION; + else + level_hint = ZLIB_SLOWEST_COMPRESSION; + hdr |= level_hint << 6; + hdr |= 31 - (hdr % 31); + + put_unaligned_be16(hdr, out_next); + out_next += 2; + + /* Compressed data */ + deflate_size = libdeflate_deflate_compress(c, in, in_nbytes, out_next, + out_nbytes_avail - ZLIB_MIN_OVERHEAD); + if (deflate_size == 0) + return 0; + out_next += deflate_size; + + /* ADLER32 */ + put_unaligned_be32(libdeflate_adler32(1, in, in_nbytes), out_next); + out_next += 4; + + return out_next - (u8 *)out; } LIBDEFLATEAPI size_t libdeflate_zlib_compress_bound(struct libdeflate_compressor *c, - size_t in_nbytes) + size_t in_nbytes) { - return ZLIB_MIN_OVERHEAD + - libdeflate_deflate_compress_bound(c, in_nbytes); + return ZLIB_MIN_OVERHEAD + + libdeflate_deflate_compress_bound(c, in_nbytes); } diff --git a/Sources/DEFLATE/zlib_constants.h b/Sources/DEFLATE/zlib_constants.h index f304310c..7b6b42a1 100644 --- a/Sources/DEFLATE/zlib_constants.h +++ b/Sources/DEFLATE/zlib_constants.h @@ -5,17 +5,17 @@ #ifndef LIB_ZLIB_CONSTANTS_H #define LIB_ZLIB_CONSTANTS_H -#define ZLIB_MIN_HEADER_SIZE 2 -#define ZLIB_FOOTER_SIZE 4 -#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) +#define ZLIB_MIN_HEADER_SIZE 2 +#define ZLIB_FOOTER_SIZE 4 +#define ZLIB_MIN_OVERHEAD (ZLIB_MIN_HEADER_SIZE + ZLIB_FOOTER_SIZE) -#define ZLIB_CM_DEFLATE 8 +#define ZLIB_CM_DEFLATE 8 -#define ZLIB_CINFO_32K_WINDOW 7 +#define ZLIB_CINFO_32K_WINDOW 7 -#define ZLIB_FASTEST_COMPRESSION 0 -#define ZLIB_FAST_COMPRESSION 1 -#define ZLIB_DEFAULT_COMPRESSION 2 -#define ZLIB_SLOWEST_COMPRESSION 3 +#define ZLIB_FASTEST_COMPRESSION 0 +#define ZLIB_FAST_COMPRESSION 1 +#define ZLIB_DEFAULT_COMPRESSION 2 +#define ZLIB_SLOWEST_COMPRESSION 3 #endif /* LIB_ZLIB_CONSTANTS_H */ diff --git a/Sources/DEFLATE/zlib_decompress.c b/Sources/DEFLATE/zlib_decompress.c index f5e43eae..526f2706 100644 --- a/Sources/DEFLATE/zlib_decompress.c +++ b/Sources/DEFLATE/zlib_decompress.c @@ -30,75 +30,75 @@ LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress_ex(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_in_nbytes_ret, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_in_nbytes_ret, + size_t *actual_out_nbytes_ret) { - const u8 *in_next = in; - const u8 * const in_end = in_next + in_nbytes; - u16 hdr; - size_t 
actual_in_nbytes; - size_t actual_out_nbytes; - enum libdeflate_result result; - - if (in_nbytes < ZLIB_MIN_OVERHEAD) - return LIBDEFLATE_BAD_DATA; - - /* 2 byte header: CMF and FLG */ - hdr = get_unaligned_be16(in_next); - in_next += 2; - - /* FCHECK */ - if ((hdr % 31) != 0) - return LIBDEFLATE_BAD_DATA; - - /* CM */ - if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) - return LIBDEFLATE_BAD_DATA; - - /* CINFO */ - if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) - return LIBDEFLATE_BAD_DATA; - - /* FDICT */ - if ((hdr >> 5) & 1) - return LIBDEFLATE_BAD_DATA; - - /* Compressed data */ - result = libdeflate_deflate_decompress_ex(d, in_next, - in_end - ZLIB_FOOTER_SIZE - in_next, - out, out_nbytes_avail, - &actual_in_nbytes, actual_out_nbytes_ret); - if (result != LIBDEFLATE_SUCCESS) - return result; - - if (actual_out_nbytes_ret) - actual_out_nbytes = *actual_out_nbytes_ret; - else - actual_out_nbytes = out_nbytes_avail; - - in_next += actual_in_nbytes; - - /* ADLER32 */ - if (libdeflate_adler32(1, out, actual_out_nbytes) != - get_unaligned_be32(in_next)) - return LIBDEFLATE_BAD_DATA; - in_next += 4; - - if (actual_in_nbytes_ret) - *actual_in_nbytes_ret = in_next - (u8 *)in; - - return LIBDEFLATE_SUCCESS; + const u8 *in_next = in; + const u8 * const in_end = in_next + in_nbytes; + u16 hdr; + size_t actual_in_nbytes; + size_t actual_out_nbytes; + enum libdeflate_result result; + + if (in_nbytes < ZLIB_MIN_OVERHEAD) + return LIBDEFLATE_BAD_DATA; + + /* 2 byte header: CMF and FLG */ + hdr = get_unaligned_be16(in_next); + in_next += 2; + + /* FCHECK */ + if ((hdr % 31) != 0) + return LIBDEFLATE_BAD_DATA; + + /* CM */ + if (((hdr >> 8) & 0xF) != ZLIB_CM_DEFLATE) + return LIBDEFLATE_BAD_DATA; + + /* CINFO */ + if ((hdr >> 12) > ZLIB_CINFO_32K_WINDOW) + return LIBDEFLATE_BAD_DATA; + + /* FDICT */ + if ((hdr >> 5) & 1) + return LIBDEFLATE_BAD_DATA; + + /* Compressed data */ + result = libdeflate_deflate_decompress_ex(d, in_next, + in_end - ZLIB_FOOTER_SIZE - in_next, + out, out_nbytes_avail, + &actual_in_nbytes, actual_out_nbytes_ret); + if (result != LIBDEFLATE_SUCCESS) + return result; + + if (actual_out_nbytes_ret) + actual_out_nbytes = *actual_out_nbytes_ret; + else + actual_out_nbytes = out_nbytes_avail; + + in_next += actual_in_nbytes; + + /* ADLER32 */ + if (libdeflate_adler32(1, out, actual_out_nbytes) != + get_unaligned_be32(in_next)) + return LIBDEFLATE_BAD_DATA; + in_next += 4; + + if (actual_in_nbytes_ret) + *actual_in_nbytes_ret = in_next - (u8 *)in; + + return LIBDEFLATE_SUCCESS; } LIBDEFLATEAPI enum libdeflate_result libdeflate_zlib_decompress(struct libdeflate_decompressor *d, - const void *in, size_t in_nbytes, - void *out, size_t out_nbytes_avail, - size_t *actual_out_nbytes_ret) + const void *in, size_t in_nbytes, + void *out, size_t out_nbytes_avail, + size_t *actual_out_nbytes_ret) { - return libdeflate_zlib_decompress_ex(d, in, in_nbytes, - out, out_nbytes_avail, - NULL, actual_out_nbytes_ret); + return libdeflate_zlib_decompress_ex(d, in, in_nbytes, + out, out_nbytes_avail, + NULL, actual_out_nbytes_ret); }
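/*
 * Hedged sketch, not part of the patch: how the 2-byte zlib header built
 * by libdeflate_zlib_compress() and checked by
 * libdeflate_zlib_decompress_ex() fits together.  CMF carries the
 * compression method (8 = DEFLATE) and window size code (7 = 32 KiB);
 * FLG carries the level hint plus an FCHECK value chosen so the 16-bit
 * header is a multiple of 31.  Standalone example.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const unsigned ZLIB_CM_DEFLATE = 8;
	const unsigned ZLIB_CINFO_32K_WINDOW = 7;
	unsigned level_hint;

	for (level_hint = 0; level_hint <= 3; level_hint++) {
		uint16_t hdr = (ZLIB_CM_DEFLATE << 8) |
			       (ZLIB_CINFO_32K_WINDOW << 12);

		hdr |= level_hint << 6;		/* FLEVEL */
		hdr |= 31 - (hdr % 31);		/* FCHECK */

		/* Prints the familiar 78 01 / 78 5e / 78 9c / 78 da pairs. */
		printf("level_hint=%u  CMF=%02x FLG=%02x  hdr %% 31 = %u\n",
		       level_hint, (unsigned)(hdr >> 8),
		       (unsigned)(hdr & 0xff), (unsigned)(hdr % 31));
	}
	return 0;
}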