Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

remove adler32 COPY variant (memcpy is faster) #189

Merged
merged 1 commit into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 3 additions & 7 deletions zlib-rs/src/adler32.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,10 @@ pub fn adler32(start_checksum: u32, data: &[u8]) -> u32 {
pub fn adler32_fold_copy(start_checksum: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
debug_assert!(dst.len() >= src.len(), "{} < {}", dst.len(), src.len());

#[cfg(target_arch = "x86_64")]
if crate::cpu_features::is_enabled_avx2() {
return avx2::adler32_fold_copy_avx2(start_checksum, dst, src);
}

let adler = adler32(start_checksum, src);
// integrating the memcpy into the adler32 function did not have any benefits, and in fact was
// a bit slower for very small chunk sizes.
dst[..src.len()].copy_from_slice(slice_to_uninit(src));
adler
adler32(start_checksum, src)
}

pub fn adler32_combine(adler1: u32, adler2: u32, len2: u64) -> u32 {
Expand Down
89 changes: 14 additions & 75 deletions zlib-rs/src/adler32/avx2.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
use core::{
arch::x86_64::{
__m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256,
_mm256_madd_epi16, _mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8,
_mm256_slli_epi32, _mm256_storeu_si256, _mm256_zextsi128_si256, _mm_add_epi32,
_mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32, _mm_unpackhi_epi64,
},
mem::MaybeUninit,
use core::arch::x86_64::{
__m256i, _mm256_add_epi32, _mm256_castsi256_si128, _mm256_extracti128_si256, _mm256_madd_epi16,
_mm256_maddubs_epi16, _mm256_permutevar8x32_epi32, _mm256_sad_epu8, _mm256_slli_epi32,
_mm256_zextsi128_si256, _mm_add_epi32, _mm_cvtsi128_si32, _mm_cvtsi32_si128, _mm_shuffle_epi32,
_mm_unpackhi_epi64,
};

use crate::adler32::{
generic::{adler32_copy_len_16, adler32_len_16, adler32_len_64},
generic::{adler32_len_16, adler32_len_64},
BASE, NMAX,
};

Expand Down Expand Up @@ -63,20 +60,11 @@ unsafe fn partial_hsum256(x: __m256i) -> u32 {

pub fn adler32_avx2(adler: u32, src: &[u8]) -> u32 {
assert!(crate::cpu_features::is_enabled_avx2());
unsafe { adler32_avx2_help::<false>(adler, &mut [], src) }
}

pub fn adler32_fold_copy_avx2(adler: u32, dst: &mut [MaybeUninit<u8>], src: &[u8]) -> u32 {
assert!(crate::cpu_features::is_enabled_avx2());
unsafe { adler32_avx2_help::<true>(adler, dst, src) }
unsafe { adler32_avx2_help(adler, src) }
}

#[target_feature(enable = "avx2")]
unsafe fn adler32_avx2_help<const COPY: bool>(
adler: u32,
mut dst: &mut [MaybeUninit<u8>],
src: &[u8],
) -> u32 {
unsafe fn adler32_avx2_help(adler: u32, src: &[u8]) -> u32 {
if src.is_empty() {
return adler;
}
Expand All @@ -87,21 +75,9 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
let mut adler0 = adler & 0xffff;

let adler = if before.len() < 16 {
if COPY {
let adler = adler32_copy_len_16(adler0, dst, before, adler1);
dst = &mut dst[before.len()..];
adler
} else {
adler32_len_16(adler0, before, adler1)
}
adler32_len_16(adler0, before, adler1)
} else if before.len() < 32 {
if COPY {
let adler = adler32_copy_len_16(adler0, dst, before, adler1);
dst = &mut dst[before.len()..];
adler
} else {
adler32_len_64(adler0, before, adler1)
}
adler32_len_64(adler0, before, adler1)
} else {
adler
};
Expand All @@ -111,25 +87,14 @@ unsafe fn adler32_avx2_help<const COPY: bool>(

// use largest step possible (without causing overflow)
for chunk in middle.chunks(NMAX as usize / 32) {
(adler0, adler1) = unsafe { helper_32_bytes::<COPY>(adler0, adler1, dst, chunk) };
if COPY {
dst = &mut dst[32 * chunk.len()..];
}
(adler0, adler1) = unsafe { helper_32_bytes(adler0, adler1, chunk) };
}

if !after.is_empty() {
if after.len() < 16 {
if COPY {
return adler32_copy_len_16(adler0, dst, after, adler1);
} else {
return adler32_len_16(adler0, after, adler1);
}
return adler32_len_16(adler0, after, adler1);
} else if after.len() < 32 {
if COPY {
return adler32_copy_len_16(adler0, dst, after, adler1);
} else {
return adler32_len_64(adler0, after, adler1);
}
return adler32_len_64(adler0, after, adler1);
} else {
unreachable!()
}
Expand All @@ -139,26 +104,14 @@ unsafe fn adler32_avx2_help<const COPY: bool>(
}

#[target_feature(enable = "avx2")]
unsafe fn helper_32_bytes<const COPY: bool>(
mut adler0: u32,
mut adler1: u32,
dst: &mut [MaybeUninit<u8>],
src: &[__m256i],
) -> (u32, u32) {
unsafe fn helper_32_bytes(mut adler0: u32, mut adler1: u32, src: &[__m256i]) -> (u32, u32) {
let mut vs1 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler0 as i32));
let mut vs2 = _mm256_zextsi128_si256(_mm_cvtsi32_si128(adler1 as i32));

let mut vs1_0 = vs1;
let mut vs3 = ZERO;

let mut out_chunks = dst.chunks_exact_mut(32);

for vbuf in src.iter().copied() {
if COPY {
let out_chunk = out_chunks.next().unwrap();
_mm256_storeu_si256(out_chunk.as_mut_ptr() as *mut __m256i, vbuf);
}

let vs1_sad = _mm256_sad_epu8(vbuf, ZERO); // Sum of abs diff, resulting in 2 x int32's

vs1 = _mm256_add_epi32(vs1, vs1_sad);
Expand Down Expand Up @@ -240,18 +193,4 @@ mod test {
unsafe fn slice_assume_init(slice: &[MaybeUninit<u8>]) -> &[u8] {
&*(slice as *const [MaybeUninit<u8>] as *const [u8])
}

#[test]
fn fold_copy_copies() {
let src: Vec<_> = (0..128).map(|x| x as u8).collect();
let mut dst = [MaybeUninit::new(0); 128];

for (i, _) in src.iter().enumerate() {
dst.fill(MaybeUninit::new(0));

adler32_fold_copy_avx2(1, &mut dst[..i], &src[..i]);

assert_eq!(&src[..i], unsafe { slice_assume_init(&dst[..i]) })
}
}
}
22 changes: 0 additions & 22 deletions zlib-rs/src/adler32/generic.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
use core::mem::MaybeUninit;

use super::{BASE, NMAX};

const UNROLL_MORE: bool = true;
Expand Down Expand Up @@ -100,26 +98,6 @@ pub(crate) fn adler32_len_16(mut adler: u32, buf: &[u8], mut sum2: u32) -> u32 {
adler | (sum2 << 16)
}

#[cfg_attr(not(target_arch = "x86_64"), allow(unused))]
pub(crate) fn adler32_copy_len_16(
mut adler: u32,
dst: &mut [MaybeUninit<u8>],
src: &[u8],
mut sum2: u32,
) -> u32 {
for (source, destination) in src.iter().zip(dst.iter_mut()) {
let v = *source;
*destination = MaybeUninit::new(v);
adler += v as u32;
sum2 += adler;
}

adler %= BASE;
sum2 %= BASE; /* only added so many BASE's */
/* return recombined sums */
adler | (sum2 << 16)
}

pub(crate) fn adler32_len_64(mut adler: u32, buf: &[u8], mut sum2: u32) -> u32 {
const N: usize = if UNROLL_MORE { 16 } else { 8 };
let mut it = buf.chunks_exact(N);
Expand Down
Loading