From 3af2d51c43af09b98d53cffd69e625dee8f6c6e6 Mon Sep 17 00:00:00 2001
From: Folkert de Vries
Date: Thu, 19 Sep 2024 14:25:30 +0200
Subject: [PATCH] remove adler32 COPY variant (memcpy is faster)

---
 zlib-rs/src/adler32.rs         | 10 ++--
 zlib-rs/src/adler32/avx2.rs    | 89 ++++++----------------------------
 zlib-rs/src/adler32/generic.rs | 22 ---------
 3 files changed, 17 insertions(+), 104 deletions(-)

diff --git a/zlib-rs/src/adler32.rs b/zlib-rs/src/adler32.rs
index ca78898b..08d326ee 100644
--- a/zlib-rs/src/adler32.rs
+++ b/zlib-rs/src/adler32.rs
@@ -23,14 +23,10 @@ pub fn adler32(start_checksum: u32, data: &[u8]) -> u32 {
 pub fn adler32_fold_copy(start_checksum: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
     debug_assert!(dst.len() >= src.len(), "{} < {}", dst.len(), src.len());
 
-    #[cfg(target_arch = "x86_64")]
-    if crate::cpu_features::is_enabled_avx2() {
-        return avx2::adler32_fold_copy_avx2(start_checksum, dst, src);
-    }
-
-    let adler = adler32(start_checksum, src);
+    // integrating the memcpy into the adler32 function did not have any benefits, and in fact was
+    // a bit slower for very small chunk sizes.
     dst[..src.len()].copy_from_slice(slice_to_uninit(src));
-    adler
+    adler32(start_checksum, src)
 }
 
 pub fn adler32_combine(adler1: u32, adler2: u32, len2: u64) -> u32 {
diff --git a/zlib-rs/src/adler32/avx2.rs b/zlib-rs/src/adler32/avx2.rs
index 6bcf5a88..d0de2f63 100644
--- a/zlib-rs/src/adler32/avx2.rs
+++ b/zlib-rs/src/adler32/avx2.rs
@@ -1,15 +1,12 @@
-use core::{
-    arch::x86_64::{
-        __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256,
-        _mm256_madd_epi16, _mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8,
-        _mm256_slli_epi32, _mm256_storeu_si256, _mm256_zextsi128_si256, _mm_add_epi32,
-        _mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32, _mm_unpackhi_epi64,
-    },
-    mem::MaybeUninit,
+use core::arch::x86_64::{
+    __m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256, _mm256_madd_epi16,
+    _mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8, _mm256_slli_epi32,
+    _mm256_zextsi128_si256, _mm_add_epi32, _mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32,
+    _mm_unpackhi_epi64,
 };
 
 use crate::adler32::{
-    generic::{adler32_copy_len_16, adler32_len_16, adler32_len_64},
+    generic::{adler32_len_16, adler32_len_64},
     BASE, NMAX,
 };
@@ -63,20 +60,11 @@ unsafe fn partial_hsum256(x: __m256i) -> u32 {
 
 pub fn adler32_avx2(adler: u32, src: &[u8]) -> u32 {
     assert!(crate::cpu_features::is_enabled_avx2());
-    unsafe { adler32_avx2_help::<false>(adler, &mut [], src) }
-}
-
-pub fn adler32_fold_copy_avx2(adler: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
-    assert!(crate::cpu_features::is_enabled_avx2());
-    unsafe { adler32_avx2_help::<true>(adler, dst, src) }
+    unsafe { adler32_avx2_help(adler, src) }
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn adler32_avx2_help<const COPY: bool>(
-    adler: u32,
-    mut dst: &mut [MaybeUninit<u8>],
-    src: &[u8],
-) -> u32 {
+unsafe fn adler32_avx2_help(adler: u32, src: &[u8]) -> u32 {
     if src.is_empty() {
         return adler;
     }
@@ -87,21 +75,9 @@ unsafe fn adler32_avx2_help(
     let mut adler0 = adler & 0xffff;
 
     let adler = if before.len() < 16 {
-        if COPY {
-            let adler = adler32_copy_len_16(adler0, dst, before, adler1);
-            dst = &mut dst[before.len()..];
-            adler
-        } else {
-            adler32_len_16(adler0, before, adler1)
-        }
+        adler32_len_16(adler0, before, adler1)
     } else if before.len() < 32 {
-        if COPY {
-            let adler = adler32_copy_len_16(adler0, dst, before, adler1);
-            dst = &mut dst[before.len()..];
-            adler
-        } else {
-            adler32_len_64(adler0, before, adler1)
-        }
+        adler32_len_64(adler0, before, adler1)
     } else {
         adler
     };
@@ -111,25 +87,14 @@ unsafe fn adler32_avx2_help(
 
     // use largest step possible (without causing overflow)
     for chunk in middle.chunks(NMAX as usize / 32) {
-        (adler0, adler1) = unsafe { helper_32_bytes::<COPY>(adler0, adler1, dst, chunk) };
-        if COPY {
-            dst = &mut dst[32 * chunk.len()..];
-        }
+        (adler0, adler1) = unsafe { helper_32_bytes(adler0, adler1, chunk) };
     }
 
     if !after.is_empty() {
         if after.len() < 16 {
-            if COPY {
-                return adler32_copy_len_16(adler0, dst, after, adler1);
-            } else {
-                return adler32_len_16(adler0, after, adler1);
-            }
+            return adler32_len_16(adler0, after, adler1);
         } else if after.len() < 32 {
-            if COPY {
-                return adler32_copy_len_16(adler0, dst, after, adler1);
-            } else {
-                return adler32_len_64(adler0, after, adler1);
-            }
+            return adler32_len_64(adler0, after, adler1);
         } else {
             unreachable!()
         }
@@ -139,26 +104,14 @@ unsafe fn adler32_avx2_help(
 }
 
 #[target_feature(enable = "avx2")]
-unsafe fn helper_32_bytes<const COPY: bool>(
-    mut adler0: u32,
-    mut adler1: u32,
-    dst: &mut [MaybeUninit<u8>],
-    src: &[__m256i],
-) -> (u32, u32) {
+unsafe fn helper_32_bytes(mut adler0: u32, mut adler1: u32, src: &[__m256i]) -> (u32, u32) {
     let mut vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0 as i32));
     let mut vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1 as i32));
 
     let mut vs1_0 = vs1;
     let mut vs3 = ZERO;
 
-    let mut out_chunks = dst.chunks_exact_mut(32);
-
     for vbuf in src.iter().copied() {
-        if COPY {
-            let out_chunk = out_chunks.next().unwrap();
-            _mm256_storeu_si256(out_chunk.as_mut_ptr() as *mut __m256i, vbuf);
-        }
-
         let vs1_sad = _mm256_sad_epu8(vbuf, ZERO); // Sum of abs diff, resulting in 2 x int32's
 
         vs1 = _mm256_add_epi32(vs1, vs1_sad);
@@ -240,18 +193,4 @@ mod test {
     unsafe fn slice_assume_init(slice: &[MaybeUninit<u8>]) -> &[u8] {
         &*(slice as *const [MaybeUninit<u8>] as *const [u8])
     }
-
-    #[test]
-    fn fold_copy_copies() {
-        let src: Vec<_> = (0..128).map(|x| x as u8).collect();
-        let mut dst = [MaybeUninit::new(0); 128];
-
-        for (i, _) in src.iter().enumerate() {
-            dst.fill(MaybeUninit::new(0));
-
-            adler32_fold_copy_avx2(1, &mut dst[..i], &src[..i]);
-
-            assert_eq!(&src[..i], unsafe { slice_assume_init(&dst[..i]) })
-        }
-    }
 }
diff --git a/zlib-rs/src/adler32/generic.rs b/zlib-rs/src/adler32/generic.rs
index 2500c38f..f34f539c 100644
--- a/zlib-rs/src/adler32/generic.rs
+++ b/zlib-rs/src/adler32/generic.rs
@@ -1,5 +1,3 @@
-use core::mem::MaybeUninit;
-
 use super::{BASE, NMAX};
 
 const UNROLL_MORE: bool = true;
@@ -100,26 +98,6 @@ pub(crate) fn adler32_len_16(mut adler: u32, buf: &[u8], mut sum2: u32) -> u32 {
     adler | (sum2 << 16)
 }
 
-#[cfg_attr(not(target_arch = "x86_64"), allow(unused))]
-pub(crate) fn adler32_copy_len_16(
-    mut adler: u32,
-    dst: &mut [MaybeUninit<u8>],
-    src: &[u8],
-    mut sum2: u32,
-) -> u32 {
-    for (source, destination) in src.iter().zip(dst.iter_mut()) {
-        let v = *source;
-        *destination = MaybeUninit::new(v);
-        adler += v as u32;
-        sum2 += adler;
-    }
-
-    adler %= BASE;
-    sum2 %= BASE; /* only added so many BASE's */
-    /* return recombined sums */
-    adler | (sum2 << 16)
-}
-
 pub(crate) fn adler32_len_64(mut adler: u32, buf: &[u8], mut sum2: u32) -> u32 {
     const N: usize = if UNROLL_MORE { 16 } else { 8 };
     let mut it = buf.chunks_exact(N);
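
With the COPY variant gone, adler32_fold_copy reduces to a plain memcpy followed
by a checksum pass over the source. For reference, a minimal self-contained
sketch of that shape (not the crate's code: the scalar adler32 loop below
stands in for zlib-rs's SIMD-dispatched implementation, and the sketch skips
the crate's slice_to_uninit helper in favor of a simple copy loop):

    use core::mem::MaybeUninit;

    const BASE: u32 = 65521; // largest prime below 2^16

    // naive scalar Adler-32 over `data`, seeded with `start`
    fn adler32(start: u32, data: &[u8]) -> u32 {
        let (mut a, mut b) = (start & 0xffff, start >> 16);
        for &byte in data {
            a = (a + byte as u32) % BASE;
            b = (b + a) % BASE;
        }
        (b << 16) | a
    }

    // copy first, then checksum: per the commit message, fusing the copy into
    // the checksum loop was no faster, and a bit slower for small inputs
    fn adler32_fold_copy(start: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
        debug_assert!(dst.len() >= src.len());
        for (d, &s) in dst.iter_mut().zip(src) {
            *d = MaybeUninit::new(s);
        }
        adler32(start, src)
    }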