forked from cloudflare/zlib
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add SIMD NEON implementation of the adler32 checksum. Inflate speed increased by ~10% with the Silesia corpus for ARMv8. Tested with a modified zpipe.c to run inflate, deflate for a stream of size 100M. Based on the adler32-simd patch from Noel Gordon for the chromium fork of zlib. 17bbb3d73c84 ("zlib adler_simd.c")
- Loading branch information
Showing
5 changed files
with
292 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,240 @@ | ||
/* adler32_simd.c | ||
* | ||
* (C) 1995-2013 Jean-loup Gailly and Mark Adler | ||
* | ||
* This software is provided 'as-is', without any express or implied | ||
* warranty. In no event will the authors be held liable for any damages | ||
* arising from the use of this software. | ||
* | ||
* Permission is granted to anyone to use this software for any purpose, | ||
* including commercial applications, and to alter it and redistribute it | ||
* freely, subject to the following restrictions: | ||
* | ||
* 1. The origin of this software must not be misrepresented; you must not | ||
* claim that you wrote the original software. If you use this software | ||
* in a product, an acknowledgment in the product documentation would be | ||
* appreciated but is not required. | ||
* 2. Altered source versions must be plainly marked as such, and must not be | ||
* misrepresented as being the original software. | ||
* 3. This notice may not be removed or altered from any source distribution. | ||
* | ||
* Jean-loup Gailly Mark Adler | ||
* jloup@gzip.org madler@alumni.caltech.edu | ||
* | ||
* Copyright 2017 The Chromium Authors. All rights reserved. | ||
* Use of this source code is governed by a BSD-style license that can be | ||
* found in the Chromium source repository LICENSE file. | ||
* | ||
* Per http://en.wikipedia.org/wiki/Adler-32 the adler32 A value (aka s1) is | ||
* the sum of N input data bytes D1 ... DN, | ||
* | ||
* A = A0 + D1 + D2 + ... + DN | ||
* | ||
* where A0 is the initial value. | ||
* | ||
* SSE2 _mm_sad_epu8() can be used for byte sums (see http://bit.ly/2wpUOeD, | ||
* for example) and accumulating the byte sums can use SSE shuffle-adds (see | ||
* the "Integer" section of http://bit.ly/2erPT8t for details). Arm NEON has | ||
* similar instructions. | ||
* | ||
* The adler32 B value (aka s2) sums the A values from each step: | ||
* | ||
* B0 + (A0 + D1) + (A0 + D1 + D2) + ... + (A0 + D1 + D2 + ... + DN) or | ||
* | ||
* B0 + N.A0 + N.D1 + (N-1).D2 + (N-2).D3 + ... + (N-(N-1)).DN | ||
* | ||
* B0 being the initial value. For 32 bytes (ideal for garden-variety SIMD): | ||
* | ||
* B = B0 + 32.A0 + [D1 D2 D3 ... D32] x [32 31 30 ... 1]. | ||
* | ||
* Adjacent blocks of 32 input bytes can be iterated with the expressions to | ||
* compute the adler32 s1 s2 of M >> 32 input bytes [1]. | ||
* | ||
* As M grows, the s1 s2 sums grow. If left unchecked, they would eventually | ||
* overflow the precision of their integer representation (bad). However, s1 | ||
* and s2 also need to be computed modulo the adler BASE value (reduced). If | ||
* at most NMAX bytes are processed before a reduce, s1 s2 _cannot_ overflow | ||
* a uint32_t type (the NMAX constraint) [2]. | ||
* | ||
* [1] the iterative equations for s2 contain constant factors; these can be | ||
* hoisted from the n-blocks do loop of the SIMD code. | ||
* | ||
* [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates | ||
* of the adler s1 s2 of uint32_t type (see adler32.c). | ||
*/ | ||
|
||
#include "adler32_simd.h" | ||
|
||
/* Definitions from adler32.c: largest prime smaller than 65536 */ | ||
#define BASE 65521U | ||
/* NMAX is the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 */ | ||
#define NMAX 5552 | ||
|
||
#if defined(ADLER32_SIMD_NEON) | ||
|
||
#include <arm_neon.h> | ||
|
||
/*
 * NEON implementation of the Adler-32 update step.
 *
 * adler  running checksum; s1 is taken from the low 16 bits and s2 from
 *        the high 16 bits.
 * buf    input bytes; any alignment is accepted.
 * len    number of bytes available at buf.
 *
 * Returns the recombined sums, s1 | (s2 << 16).
 */
uint32_t ZLIB_INTERNAL adler32_simd_(  /* NEON */
    uint32_t adler,
    const unsigned char *buf,
    unsigned long len)
{
    /*
     * Split Adler-32 into component sums.
     */
    uint32_t s1 = adler & 0xffff;
    uint32_t s2 = adler >> 16;

    /*
     * Serially compute s1 & s2 until buf is 16-byte aligned. The loop is
     * also bounded by len: without that bound, an unaligned buffer shorter
     * than the distance to the next 16-byte boundary would be read past its
     * end and the unsigned len would wrap around to a huge value.
     */
    if ((uintptr_t)buf & 15) {
        while (len && ((uintptr_t)buf & 15)) {
            s2 += (s1 += *buf++);
            --len;
        }

        /* At most 15 bytes were added, so one subtraction reduces s1. */
        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Process the data in blocks of 32 bytes; len keeps only the tail.
     */
    const unsigned BLOCK_SIZE = 1 << 5;

    unsigned long blocks = len / BLOCK_SIZE;
    len -= blocks * BLOCK_SIZE;

    while (blocks) {
        unsigned n = NMAX / BLOCK_SIZE;  /* The NMAX constraint. */
        if (n > blocks)
            n = (unsigned)blocks;
        blocks -= n;

        /*
         * Process n blocks of data. At most NMAX data bytes can be
         * processed before s2 must be reduced modulo BASE.
         *
         * v_s2 starts with s1 * n in one lane: after the shift by 5 below
         * this becomes the 32 * n * s1 contribution of the incoming s1.
         */
        uint32x4_t v_s2 = (uint32x4_t) { 0, 0, 0, s1 * n };
        uint32x4_t v_s1 = (uint32x4_t) { 0, 0, 0, 0 };

        /*
         * Per-byte-position sums across the n blocks. With
         * n <= NMAX / 32 = 173, each sum is at most 173 * 255 = 44115 and
         * therefore fits in 16 bits.
         */
        uint16x8_t v_column_sum_1 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_2 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_3 = vdupq_n_u16(0);
        uint16x8_t v_column_sum_4 = vdupq_n_u16(0);

        do {
            /*
             * Load 32 input bytes.
             */
            const uint8x16_t bytes1 = vld1q_u8((const uint8_t *)buf);
            const uint8x16_t bytes2 = vld1q_u8((const uint8_t *)(buf + 16));

            /*
             * Add previous block byte sum to v_s2.
             */
            v_s2 = vaddq_u32(v_s2, v_s1);

            /*
             * Horizontally add the bytes for s1.
             */
            v_s1 = vpadalq_u16(v_s1, vpadalq_u8(vpaddlq_u8(bytes1), bytes2));

            /*
             * Vertically add the bytes for s2.
             */
            v_column_sum_1 = vaddw_u8(v_column_sum_1, vget_low_u8 (bytes1));
            v_column_sum_2 = vaddw_u8(v_column_sum_2, vget_high_u8(bytes1));
            v_column_sum_3 = vaddw_u8(v_column_sum_3, vget_low_u8 (bytes2));
            v_column_sum_4 = vaddw_u8(v_column_sum_4, vget_high_u8(bytes2));

            buf += BLOCK_SIZE;

        } while (--n);

        /*
         * Every accumulated s1 term counts once per byte of a block.
         */
        v_s2 = vshlq_n_u32(v_s2, 5);

        /*
         * Multiply-add bytes by [ 32, 31, 30, ... ] for s2.
         */
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_1),
                         (uint16x4_t) { 32, 31, 30, 29 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_1),
                         (uint16x4_t) { 28, 27, 26, 25 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_2),
                         (uint16x4_t) { 24, 23, 22, 21 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_2),
                         (uint16x4_t) { 20, 19, 18, 17 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_3),
                         (uint16x4_t) { 16, 15, 14, 13 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_3),
                         (uint16x4_t) { 12, 11, 10,  9 });
        v_s2 = vmlal_u16(v_s2, vget_low_u16 (v_column_sum_4),
                         (uint16x4_t) {  8,  7,  6,  5 });
        v_s2 = vmlal_u16(v_s2, vget_high_u16(v_column_sum_4),
                         (uint16x4_t) {  4,  3,  2,  1 });

        /*
         * Horizontally sum the 32-bit lanes of v_s1 and v_s2: after the
         * pairwise adds, lane 0 of s1s2 is the total of v_s1 and lane 1
         * is the total of v_s2.
         */
        uint32x2_t sum1 = vpadd_u32(vget_low_u32(v_s1), vget_high_u32(v_s1));
        uint32x2_t sum2 = vpadd_u32(vget_low_u32(v_s2), vget_high_u32(v_s2));
        uint32x2_t s1s2 = vpadd_u32(sum1, sum2);

        s1 += vget_lane_u32(s1s2, 0);
        s2 += vget_lane_u32(s1s2, 1);

        /*
         * Reduce.
         */
        s1 %= BASE;
        s2 %= BASE;
    }

    /*
     * Handle leftover data (fewer than BLOCK_SIZE bytes).
     */
    if (len) {
        if (len >= 16) {
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);
            s2 += (s1 += *buf++);

            len -= 16;
        }

        while (len--) {
            s2 += (s1 += *buf++);
        }

        /* At most 31 bytes were added, so one subtraction reduces s1. */
        if (s1 >= BASE)
            s1 -= BASE;
        s2 %= BASE;
    }

    /*
     * Return the recombined sums.
     */
    return s1 | (s2 << 16);
}
|
||
#endif /* ADLER32_SIMD_NEON */ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* adler32_simd.h | ||
* | ||
* (C) 1995-2013 Jean-loup Gailly and Mark Adler | ||
* | ||
* This software is provided 'as-is', without any express or implied | ||
* warranty. In no event will the authors be held liable for any damages | ||
* arising from the use of this software. | ||
* | ||
* Permission is granted to anyone to use this software for any purpose, | ||
* including commercial applications, and to alter it and redistribute it | ||
* freely, subject to the following restrictions: | ||
* | ||
* 1. The origin of this software must not be misrepresented; you must not | ||
* claim that you wrote the original software. If you use this software | ||
* in a product, an acknowledgment in the product documentation would be | ||
* appreciated but is not required. | ||
* 2. Altered source versions must be plainly marked as such, and must not be | ||
* misrepresented as being the original software. | ||
* 3. This notice may not be removed or altered from any source distribution. | ||
* | ||
* Jean-loup Gailly Mark Adler | ||
* jloup@gzip.org madler@alumni.caltech.edu | ||
* | ||
* Copyright 2017 The Chromium Authors. All rights reserved. | ||
* Use of this source code is governed by a BSD-style license that can be | ||
* found in the Chromium source repository LICENSE file. | ||
*/ | ||
|
||
#include <stdint.h> | ||
|
||
#include "zconf.h" | ||
#include "zutil.h" | ||
|
||
/*
 * Updates the running Adler-32 value `adler` with the `len` bytes at `buf`
 * and returns the new value, packed as s1 | (s2 << 16): s1 occupies the low
 * 16 bits and s2 the high 16 bits, matching how `adler` is unpacked on entry.
 */
uint32_t ZLIB_INTERNAL adler32_simd_( | ||
uint32_t adler, | ||
const unsigned char *buf, | ||
unsigned long len); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters