Skip to content

Commit

Permalink
Add alder32 SIMD ARM support
Browse files Browse the repository at this point in the history
Add SIMD NEON implementation of the adler32 checksum.

Inflate speed increased by ~10% with the Silesia corpus for ARMv8. Tested with
a modified zpipe.c to run inflate, deflate for a stream of size 100M.

Based on the adler32-simd patch from Noel Gordon for the chromium fork of zlib.
17bbb3d73c84 ("zlib adler_simd.c")
  • Loading branch information
janaknat authored and vkrasnov committed Aug 20, 2020
1 parent 372bcd1 commit 109d175
Show file tree
Hide file tree
Showing 5 changed files with 292 additions and 4 deletions.
6 changes: 4 additions & 2 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -52,10 +52,10 @@ mandir = ${prefix}/share/man
man3dir = ${mandir}/man3
pkgconfigdir = ${libdir}/pkgconfig

OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
OBJZ = adler32.o adler32_simd.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o

PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
PIC_OBJZ = adler32.lo adler32_simd.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo

ifneq ($(findstring -DHAS_PCLMUL, $(CFLAGS)),)
Expand Down Expand Up @@ -272,6 +272,7 @@ depend:
# DO NOT DELETE THIS LINE -- make depend depends on it.

adler32.o zutil.o: zutil.h zlib.h zconf.h
adler32_simd.o: zlib.h
gzclose.o gzlib.o gzread.o gzwrite.o: zlib.h zconf.h gzguts.h
compress.o example.o minigzip.o uncompr.o: zlib.h zconf.h
crc32.o: zutil.h zlib.h zconf.h crc32.h
Expand All @@ -282,6 +283,7 @@ inftrees.o: zutil.h zlib.h zconf.h inftrees.h
trees.o: deflate.h zutil.h zlib.h zconf.h trees.h

adler32.lo zutil.lo: zutil.h zlib.h zconf.h
adler32_simd.o: zlib.h
gzclose.lo gzlib.lo gzread.lo gzwrite.lo: zlib.h zconf.h gzguts.h
compress.lo example.lo minigzip.lo uncompr.lo: zlib.h zconf.h
crc32.lo: zutil.h zlib.h zconf.h crc32.h
Expand Down
9 changes: 9 additions & 0 deletions adler32.c
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ local uLong adler32_combine_ OF((uLong adler1, uLong adler2, z_off64_t len2));
# define MOD63(a) a %= BASE
#endif

#if defined(ADLER32_SIMD_NEON)
#include "adler32_simd.h"
#endif

/* ========================================================================= */
uLong ZEXPORT adler32(adler, buf, len)
uLong adler;
Expand All @@ -70,6 +74,11 @@ uLong ZEXPORT adler32(adler, buf, len)
unsigned long sum2;
unsigned n;

#if defined(ADLER32_SIMD_NEON)
if (buf && len >= 64)
return adler32_simd_(adler, buf, len);
#endif

/* split Adler-32 into component sums */
sum2 = (adler >> 16) & 0xffff;
adler &= 0xffff;
Expand Down
240 changes: 240 additions & 0 deletions adler32_simd.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
/* adler32_simd.c
*
* (C) 1995-2013 Jean-loup Gailly and Mark Adler
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* Jean-loup Gailly Mark Adler
* [email protected] [email protected]
*
* Copyright 2017 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is
* the sum of N input data bytes D1 ... DN,
*
* A = A0 + D1 + D2 + ... + DN
*
* where A0 is the initial value.
*
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD,
* for example) and accumulating the byte sums can use SSE shuffle-adds (see
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has
* similar instructions.
*
* The adler32 B value (aka s2) sums the A values from each step:
*
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or
*
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN
*
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD):
*
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1].
*
* Adjacent blocks of 32 input bytes can be iterated with the expressions to
* compute the adler32 s1 s2 of M >> 32 input bytes [1].
*
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually
* overflow the precision of their integer representation (bad). However, s1
* and s2 also need to be computed modulo the adler BASE value (reduced). If
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow
* a uint32_t type (the NMAX constraint) [2].
*
* [1] the iterative equations for s2 contain constant factors; these can be
* hoisted from the n-blocks do loop of the SIMD code.
*
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates
* of the adler s1 s2 of uint32_t type (see adler32.c).
*/

#include "adler32_simd.h"

/* Definitions from adler32.c: largest prime smaller than 65536 */
#define BASE 65521U
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */
#define NMAX 5552

#if defined(ADLER32_SIMD_NEON)

#include <arm_neon.h>

uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */
uint32_t adler,
const unsigned char *buf,
unsigned long len)
{
/*
* Split Adler-32 into component sums.
*/
uint32_t s1 = adler & 0xffff;
uint32_t s2 = adler >> 16;

/*
* Serially compute s1 & s2, until the data is 16-byte aligned.
*/
if ((uintptr_t)buf & 15) {
while ((uintptr_t)buf & 15) {
s2 += (s1 += *buf++);
--len;
}

if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}

/*
* Process the data in blocks.
*/
const unsigned BLOCK_SIZE = 1 << 5;

unsigned long blocks = len / BLOCK_SIZE;
len -= blocks * BLOCK_SIZE;

while (blocks)
{
unsigned n = NMAX / BLOCK_SIZE; /* The NMAX constraint. */
if (n > blocks)
n = blocks;
blocks -= n;

/*
* Process n blocks of data. At most NMAX data bytes can be
* processed before s2 must be reduced modulo BASE.
*/
uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };

uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
uint16x8_t v_column_sum_4 = vdupq_n_u16(0);

do {
/*
* Load 32 input bytes.
*/
const uint8x16_t bytes1 = vld1q_u8((uint8_t*)(buf));
const uint8x16_t bytes2 = vld1q_u8((uint8_t*)(buf + 16));

/*
* Add previous block byte sum to v_s2.
*/
v_s2 = vaddq_u32(v_s2, v_s1);

/*
* Horizontally add the bytes for s1.
*/
v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));

/*
* Vertically add the bytes for s2.
*/
v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));

buf += BLOCK_SIZE;

} while (--n);

v_s2 = vshlq_n_u32(v_s2, 5);

/*
* Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
*/
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
(uint16x4_t) { 32, 31, 30, 29 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
(uint16x4_t) { 28, 27, 26, 25 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
(uint16x4_t) { 24, 23, 22, 21 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
(uint16x4_t) { 20, 19, 18, 17 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
(uint16x4_t) { 16, 15, 14, 13 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
(uint16x4_t) { 12, 11, 10, 9 });
v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
(uint16x4_t) { 8, 7, 6, 5 });
v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
(uint16x4_t) { 4, 3, 2, 1 });

/*
* Sum epi32 ints v_s1(s2) and accumulate in s1(s2).
*/
uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
uint32x2_t s1s2 = vpadd_u32(sum1, sum2);

s1 += vget_lane_u32(s1s2, 0);
s2 += vget_lane_u32(s1s2, 1);

/*
* Reduce.
*/
s1 %= BASE;
s2 %= BASE;
}

/*
* Handle leftover data.
*/
if (len) {
if (len >= 16) {
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);
s2 += (s1 += *buf++);

len -= 16;
}

while (len--) {
s2 += (s1 += *buf++);
}

if (s1 >= BASE)
s1 -= BASE;
s2 %= BASE;
}

/*
* Return the recombined sums.
*/
return s1 | (s2 << 16);
}

#endif /* ADLER32_SIMD_NEON */
37 changes: 37 additions & 0 deletions adler32_simd.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/* adler32_simd.h
*
* (C) 1995-2013 Jean-loup Gailly and Mark Adler
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* Jean-loup Gailly Mark Adler
* [email protected] [email protected]
*
* Copyright 2017 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/

#include <stdint.h>

#include "zconf.h"
#include "zutil.h"

uint32_t ZLIB_INTERNAL adler32_simd_(
uint32_t adler,
const unsigned char *buf,
unsigned long len);
4 changes: 2 additions & 2 deletions configure
Original file line number Diff line number Diff line change
Expand Up @@ -795,8 +795,8 @@ void foo(void) {
EOF

if try $CC -march=armv8-a+crc $CFLAGS $test.c -c $test; then
CFLAGS="-march=armv8-a+crc $CFLAGS"
SFLAGS="-march=armv8-a+crc $SFLAGS"
CFLAGS="-march=armv8-a+crc -DADLER32_SIMD_NEON $CFLAGS"
SFLAGS="-march=armv8-a+crc -DADLER32_SIMD_NEON $SFLAGS"
echo "Checking for CRC and NEON support ... Yes" | tee -a configure.log
else
echo "Checking for CRC and NEON support ... No" | tee -a configure.log
Expand Down

0 comments on commit 109d175

Please sign in to comment.