Fix apple cross-platform compatibility.
* Targeting generic devices no longer causes build failures; generic
  devices are used to create the build archives required for Apple
  App Store submissions.
* This prefers simplicity on the user's end, so that they are free to
  ship metaversekit/swiftusd within their own macOS/iOS/visionOS apps
  with no overhead.
* This temporarily removes gzip support from libdeflate on Apple
  platforms, since generic destinations remove the ability to target
  specific ARM architecture features (see the sketch below).
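For context on that last point: CRC-accelerated code paths are typically compiled only when the build target guarantees the relevant ARM extension, and a generic archive destination may not make that guarantee. A hypothetical sketch of the pattern (the function names are illustrative, not libdeflate's actual code; the __crc32b intrinsic and the __ARM_FEATURE_CRC32 macro are standard ACLE):

#include <stddef.h>
#include <stdint.h>

#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
#include <arm_acle.h>
/* Fast path: only compiles when the translation unit is built for a CPU
 * with the CRC extension (e.g. -march=armv8-a+crc), which a generic
 * device destination may not allow us to assume. */
static uint32_t crc32_byte(uint32_t crc, uint8_t b)
{
    return __crc32b(crc, b);
}
#else
/* Portable fallback: bitwise, reflected CRC-32 (polynomial 0xEDB88320). */
static uint32_t crc32_byte(uint32_t crc, uint8_t b)
{
    int k;

    crc ^= b;
    for (k = 0; k < 8; k++)
        crc = (crc >> 1) ^ (0xEDB88320 & -(crc & 1));
    return crc;
}
#endif

uint32_t crc32_buf(uint32_t crc, const uint8_t *p, size_t n)
{
    crc = ~crc;
    while (n--)
        crc = crc32_byte(crc, *p++);
    return ~crc;
}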
furby-tm committed Mar 14, 2024
1 parent 56ea383 commit 1307787
Showing 41 changed files with 9,870 additions and 7,887 deletions.
10 changes: 10 additions & 0 deletions Package.swift
@@ -223,6 +223,16 @@ let package = Package(

.target(
  name: "DEFLATE",
+ exclude: [
+   // We get better cross-platform compatibility if we remove gzip
+   // support for now. gzip builds fine for macOS and even on iOS --
+   // it is only when a user chooses to build an archive for one of
+   // the generic devices that building gzip becomes more complicated.
+   "crc32.c",
+   "gzip_compress.c",
+   "gzip_decompress.c",
+ ],
  publicHeadersPath: "include",
  cxxSettings: [
    .headerSearchPath("."),
134 changes: 83 additions & 51 deletions Sources/DEFLATE/adler32.c
@@ -35,63 +35,95 @@
 * of s2 overflowing when it is represented as an unsigned 32-bit integer. This
 * value was computed using the following Python script:
 *
 *     divisor = 65521
 *     count = 0
 *     s1 = divisor - 1
 *     s2 = divisor - 1
 *     while True:
 *         s1 += 0xFF
 *         s2 += s1
 *         if s2 > 0xFFFFFFFF:
 *             break
 *         count += 1
 *     print(count)
 *
 * Note that to get the correct worst-case value, we must assume that every byte
 * has value 0xFF and that s1 and s2 started with the highest possible values
 * modulo the divisor.
 */
#define MAX_CHUNK_LEN 5552

+/*
+ * Update the Adler-32 values s1 and s2 using n bytes from p, update p to p + n,
+ * update n to 0, and reduce s1 and s2 mod DIVISOR. It is assumed that neither
+ * s1 nor s2 can overflow before the reduction at the end, i.e. n plus any bytes
+ * already processed after the last reduction must not exceed MAX_CHUNK_LEN.
+ *
+ * This uses only portable C code. This is used as a fallback when a vectorized
+ * implementation of Adler-32 (e.g. AVX2) is unavailable on the platform.
+ *
+ * Some of the vectorized implementations also use this to handle the end of the
+ * data when the data isn't evenly divisible by the length the vectorized code
+ * works on. To avoid compiler errors about target-specific option mismatches
+ * when this is used in that way, this is a macro rather than a function.
+ *
+ * Although this is unvectorized, this does include an optimization where the
+ * main loop processes four bytes at a time using a strategy similar to that
+ * used by vectorized implementations. This provides increased instruction-
+ * level parallelism compared to the traditional 's1 += *p++; s2 += s1;'.
+ */
+#define ADLER32_CHUNK(s1, s2, p, n) \
+do { \
+    if (n >= 4) { \
+        u32 s1_sum = 0; \
+        u32 byte_0_sum = 0; \
+        u32 byte_1_sum = 0; \
+        u32 byte_2_sum = 0; \
+        u32 byte_3_sum = 0; \
+ \
+        do { \
+            s1_sum += s1; \
+            s1 += p[0] + p[1] + p[2] + p[3]; \
+            byte_0_sum += p[0]; \
+            byte_1_sum += p[1]; \
+            byte_2_sum += p[2]; \
+            byte_3_sum += p[3]; \
+            p += 4; \
+            n -= 4; \
+        } while (n >= 4); \
+        s2 += (4 * (s1_sum + byte_0_sum)) + (3 * byte_1_sum) + \
+              (2 * byte_2_sum) + byte_3_sum; \
+    } \
+    for (; n; n--, p++) { \
+        s1 += *p; \
+        s2 += s1; \
+    } \
+    s1 %= DIVISOR; \
+    s2 %= DIVISOR; \
+} while (0)

static u32 MAYBE_UNUSED
adler32_generic(u32 adler, const u8 *p, size_t len)
{
-    u32 s1 = adler & 0xFFFF;
-    u32 s2 = adler >> 16;
-    const u8 * const end = p + len;
-
-    while (p != end) {
-        size_t chunk_len = MIN(end - p, MAX_CHUNK_LEN);
-        const u8 *chunk_end = p + chunk_len;
-        size_t num_unrolled_iterations = chunk_len / 4;
-
-        while (num_unrolled_iterations--) {
-            s1 += *p++;
-            s2 += s1;
-            s1 += *p++;
-            s2 += s1;
-            s1 += *p++;
-            s2 += s1;
-            s1 += *p++;
-            s2 += s1;
-        }
-        while (p != chunk_end) {
-            s1 += *p++;
-            s2 += s1;
-        }
-        s1 %= DIVISOR;
-        s2 %= DIVISOR;
-    }
-
-    return (s2 << 16) | s1;
+    u32 s1 = adler & 0xFFFF;
+    u32 s2 = adler >> 16;
+
+    while (len) {
+        size_t n = MIN(len, MAX_CHUNK_LEN & ~3);
+
+        len -= n;
+        ADLER32_CHUNK(s1, s2, p, n);
+    }
+
+    return (s2 << 16) | s1;
}

/* Include architecture-specific implementation(s) if available. */
#undef DEFAULT_IMPL
#undef arch_select_adler32_func
typedef u32 (*adler32_func_t)(u32 adler, const u8 *p, size_t len);
#if defined(ARCH_ARM32) || defined(ARCH_ARM64)
# include "adler32_impl.h"
# include "arm/adler32_impl.h"
#elif defined(ARCH_X86_32) || defined(ARCH_X86_64)
# include "x86/adler32_impl.h"
#endif
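The worst-case chunk length quoted in the comment above can be cross-checked by translating its Python script directly to C. A standalone sketch (not part of the commit), using 64-bit arithmetic in place of Python's unbounded integers:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint64_t divisor = 65521;
    uint64_t s1 = divisor - 1; /* highest possible value mod the divisor */
    uint64_t s2 = divisor - 1;
    unsigned count = 0;

    for (;;) {
        s1 += 0xFF; /* every byte takes the worst-case value */
        s2 += s1;
        if (s2 > 0xFFFFFFFF) /* s2 no longer fits in 32 bits */
            break;
        count++;
    }
    printf("%u\n", count); /* prints 5552, i.e. MAX_CHUNK_LEN */
    return 0;
}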
@@ -108,13 +140,13 @@ static volatile adler32_func_t adler32_impl = dispatch_adler32;
/* Choose the best implementation at runtime. */
static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
{
-    adler32_func_t f = arch_select_adler32_func();
-
-    if (f == NULL)
-        f = DEFAULT_IMPL;
-
-    adler32_impl = f;
-    return f(adler, p, len);
+    adler32_func_t f = arch_select_adler32_func();
+    if (f == NULL)
+        f = DEFAULT_IMPL;
+    adler32_impl = f;
+    return f(adler, p, len);
}
#else
/* The best implementation is statically known, so call it directly. */
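The grouped s2 update in the ADLER32_CHUNK macro above follows from expanding four steps of the scalar recurrence: consuming bytes b0..b3 from state (s1, s2) adds exactly 4*s1 + 4*b0 + 3*b1 + 2*b2 + b3 to s2. A standalone sketch (not part of the commit; the helper names are made up) that checks the grouped form against the byte-at-a-time loop:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define DIVISOR 65521

/* Reference: the classic byte-at-a-time recurrence. */
static void update_naive(uint32_t *s1, uint32_t *s2, const uint8_t *p, size_t n)
{
    for (; n; n--, p++) {
        *s1 += *p;
        *s2 += *s1;
    }
    *s1 %= DIVISOR;
    *s2 %= DIVISOR;
}

/* Grouped form, mirroring ADLER32_CHUNK's main loop. */
static void update_grouped(uint32_t *s1, uint32_t *s2, const uint8_t *p, size_t n)
{
    uint32_t s1_sum = 0, b0 = 0, b1 = 0, b2 = 0, b3 = 0;

    while (n >= 4) {
        s1_sum += *s1; /* s1 at the start of each group */
        *s1 += p[0] + p[1] + p[2] + p[3];
        b0 += p[0];
        b1 += p[1];
        b2 += p[2];
        b3 += p[3];
        p += 4;
        n -= 4;
    }
    /* Each group contributes 4*s1_start + 4*b0 + 3*b1 + 2*b2 + b3 to s2;
     * the leftover bytes' s2 increments depend only on s1, so the order
     * of the additions does not matter. */
    *s2 += (4 * (s1_sum + b0)) + (3 * b1) + (2 * b2) + b3;
    for (; n; n--, p++) {
        *s1 += *p;
        *s2 += *s1;
    }
    *s1 %= DIVISOR;
    *s2 %= DIVISOR;
}

int main(void)
{
    uint8_t buf[37];
    uint32_t ns1 = 1, ns2 = 0, gs1 = 1, gs2 = 0; /* Adler-32 start state */
    size_t i;

    for (i = 0; i < sizeof(buf); i++)
        buf[i] = (uint8_t)(i * 17 + 3);
    update_naive(&ns1, &ns2, buf, sizeof(buf));
    update_grouped(&gs1, &gs2, buf, sizeof(buf));
    assert(ns1 == gs1 && ns2 == gs2);
    return 0;
}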
@@ -124,7 +156,7 @@ static u32 dispatch_adler32(u32 adler, const u8 *p, size_t len)
LIBDEFLATEAPI u32
libdeflate_adler32(u32 adler, const void *buffer, size_t len)
{
    if (buffer == NULL) /* Return initial value. */
        return 1;
    return adler32_impl(adler, buffer, len);
}
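Since each call folds new bytes into the running value, a checksum can be computed incrementally across split buffers. A usage sketch (assumes linking against the library and that the declaration is available via libdeflate.h, as in upstream libdeflate):

#include <stddef.h>
#include <stdint.h>
#include "libdeflate.h"

uint32_t adler32_of_two_parts(const void *a, size_t alen,
                              const void *b, size_t blen)
{
    /* Passing a NULL buffer returns the initial value, 1 (see above). */
    uint32_t adler = libdeflate_adler32(0, NULL, 0);

    adler = libdeflate_adler32(adler, a, alen);
    adler = libdeflate_adler32(adler, b, blen);
    return adler;
}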