Added AVX512 support for SkRasterPipeline_opts.h #151

Open · wants to merge 2 commits into base: main
2 changes: 1 addition & 1 deletion src/core/SkRasterPipelineOpContexts.h
@@ -19,7 +19,7 @@ namespace SkSL { class TraceHook; }
// by stages that have no lowp implementation. They can therefore use the (smaller) highp value to
// save memory in the arena.
inline static constexpr int SkRasterPipeline_kMaxStride = 16;
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 8;
inline static constexpr int SkRasterPipeline_kMaxStride_highp = 16;
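// With AVX-512, highp stages also run 16 lanes wide, so the highp stride now matches kMaxStride.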

// How much space to allocate for each MemoryCtx scratch buffer, as part of tail-pixel handling.
inline static constexpr size_t SkRasterPipeline_MaxScratchPerPatch =
275 changes: 269 additions & 6 deletions src/opts/SkRasterPipeline_opts.h
@@ -59,6 +59,8 @@ using NoCtx = const void*;
#define JUMPER_IS_SCALAR
#elif defined(SK_ARM_HAS_NEON)
#define JUMPER_IS_NEON
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
#define JUMPER_IS_AVX512
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
#define JUMPER_IS_HSW
#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
@@ -291,6 +293,223 @@ namespace SK_OPTS_NS {
SI void store4(float* ptr, F r, F g, F b, F a) {
vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
}
#elif defined(JUMPER_IS_AVX512)
template <typename T> using V = T __attribute__((ext_vector_type(16)));
using F = V<float >;
using I32 = V< int32_t>;
using U64 = V<uint64_t>;
using U32 = V<uint32_t>;
using U16 = V<uint16_t>;
using U8 = V<uint8_t >;

SI F mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }
SI F min(F a, F b) { return _mm512_min_ps(a,b); }
SI I32 min(I32 a, I32 b) { return _mm512_min_epi32(a,b); }
SI U32 min(U32 a, U32 b) { return _mm512_min_epu32(a,b); }
SI F max(F a, F b) { return _mm512_max_ps(a,b); }
SI I32 max(I32 a, I32 b) { return _mm512_max_epi32(a,b); }
SI U32 max(U32 a, U32 b) { return _mm512_max_epu32(a,b); }
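// v and -v differ only in the sign bit, so AND-ing them clears the sign and leaves |v|.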
SI F abs_ (F v) { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
SI I32 abs_ (I32 v) { return _mm512_abs_epi32(v); }
SI F floor_(F v) { return _mm512_floor_ps(v); }
SI F ceil_(F v) { return _mm512_ceil_ps(v); }
SI F rcp_approx(F v) { return _mm512_rcp14_ps (v); }
SI F rsqrt_approx (F v) { return _mm512_rsqrt14_ps(v); }
SI F sqrt_ (F v) { return _mm512_sqrt_ps (v); }
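// One Newton-Raphson refinement of the 14-bit rcp estimate: e' = e*(2 - v*e).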
SI F rcp_precise (F v) {
F e = rcp_approx(v);
return _mm512_fnmadd_ps(v, e, _mm512_set1_ps(2.0f)) * e;
}
SI U32 round(F v) { return _mm512_cvtps_epi32(v); }
SI U32 round(F v, F scale) { return _mm512_cvtps_epi32(v*scale); }
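// _mm256_packus_epi32 packs within each 128-bit lane, so a final 64-bit permute (0xD8 == 216)
// restores element order.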
SI U16 pack(U32 v) {
__m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256(v),
_mm512_extracti64x4_epi64(v, 1));
return _mm256_permutex_epi64(rst, 216);
}
SI U8 pack(U16 v) {
__m256i rst = _mm256_packus_epi16(v, v);
return _mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
}
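// Ternary-logic immediate 0xCA (202) computes (c & t) | (~c & e), i.e. a bitwise select on the mask.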
SI F if_then_else(I32 c, F t, F e) {
return _mm512_castsi512_ps(_mm512_ternarylogic_epi64(c, _mm512_castps_si512(t),
_mm512_castps_si512(e), 202));
}
SI bool any(I32 c) {
__mmask16 mask32 = _mm512_test_epi32_mask(c, c);
return mask32 != 0;
}
SI bool all(I32 c) {
__mmask16 mask32 = _mm512_test_epi32_mask(c, c);
return mask32 == 0xffff;
}
template <typename T>
SI V<T> gather(const T* p, U32 ix) {
return{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]],
p[ix[8]], p[ix[9]], p[ix[10]], p[ix[11]],
p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]],
};
}
SI F gather(const float* p, U32 ix) { return _mm512_i32gather_ps(ix, p, 4); }
SI U32 gather(const uint32_t* p, U32 ix) { return _mm512_i32gather_epi32(ix, p, 4); }
SI U64 gather(const uint64_t* p, U32 ix) {
__m512i parts[] = {
_mm512_i32gather_epi64(_mm512_castsi512_si256(ix), p, 8),
_mm512_i32gather_epi64(_mm512_extracti32x8_epi32(ix, 1), p, 8),
};
return sk_bit_cast<U64>(parts);
}
template <typename V, typename S>
SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
V before = gather(dst, ix);
V after = if_then_else(mask, src, before);
dst[ix[0]] = after[0];
dst[ix[1]] = after[1];
dst[ix[2]] = after[2];
dst[ix[3]] = after[3];
dst[ix[4]] = after[4];
dst[ix[5]] = after[5];
dst[ix[6]] = after[6];
dst[ix[7]] = after[7];
dst[ix[8]] = after[8];
dst[ix[9]] = after[9];
dst[ix[10]] = after[10];
dst[ix[11]] = after[11];
dst[ix[12]] = after[12];
dst[ix[13]] = after[13];
dst[ix[14]] = after[14];
dst[ix[15]] = after[15];
}

SI void load2(const uint16_t* ptr, U16* r, U16* g) {
U16 _01234567 = _mm256_loadu_si256(((__m256i*)ptr) + 0);
U16 _89abcdef = _mm256_loadu_si256(((__m256i*)ptr) + 1);

*r = _mm256_permute4x64_epi64(_mm256_packs_epi32(
         _mm256_srai_epi32(_mm256_slli_epi32(_01234567, 16), 16),
         _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
*g = _mm256_permute4x64_epi64(_mm256_packs_epi32(
         _mm256_srai_epi32(_01234567, 16),
         _mm256_srai_epi32(_89abcdef, 16)), 216);
}
SI void store2(uint16_t* ptr, U16 r, U16 g) {
auto _01234567 = _mm256_unpacklo_epi16(r, g);
auto _89abcdef = _mm256_unpackhi_epi16(r, g);
__m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
_89abcdef, 1);
__m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
_01234567 = _mm512_castsi512_si256(aa);
_89abcdef = _mm512_extracti64x4_epi64(aa, 1);

_mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
_mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
}

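// Each pixel is four 16-bit channels, so every 64-bit lane of a 512-bit load holds one whole pixel;
// shift each lane and truncate to 16 bits to peel off r, g, b, and a.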
SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
__m512i _01234567 = _mm512_loadu_si512((__m512i*)ptr);
__m512i _89abcdef = _mm512_loadu_si512((__m512i*)(ptr+32));

*r = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_01234567,
_mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
_89abcdef, _mm512_set1_epi64(0xFFFF))));
*g = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
_01234567, 16), _mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(
_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 16), _mm512_set1_epi64(0xFFFF))));
*b = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
_01234567, 32), _mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(
_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 32), _mm512_set1_epi64(0xFFFF))));
*a = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
_01234567, 48), _mm512_set1_epi64(0xFFFF))), _mm512_cvtepi64_epi16(
_mm512_and_si512(_mm512_srli_epi64(_89abcdef, 48), _mm512_set1_epi64(0xFFFF))));
}
SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
auto rg012389ab = _mm256_unpacklo_epi16(r, g),
rg4567cdef = _mm256_unpackhi_epi16(r, g),
ba012389ab = _mm256_unpacklo_epi16(b, a),
ba4567cdef = _mm256_unpackhi_epi16(b, a);

auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
_23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
_45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
_67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);

auto _ab23 = _mm256_permutex_epi64(_23ab, 78);
auto _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0);
auto _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78);
auto _ef67 = _mm256_permutex_epi64(_67ef, 78);
auto _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0);
auto _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);

_mm256_storeu_si256((__m256i*)ptr, _0123);
_mm256_storeu_si256((__m256i*)ptr + 1, _4567);
_mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
_mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
}

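// Load pixels {0,4,8,c}, {1,5,9,d}, {2,6,a,e} and {3,7,b,f} into four registers, then transpose
// with unpacks so each result register holds one channel for all 16 pixels.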
SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
F _048c, _159d, _26ae, _37bf;

_048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr) );
_048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
_048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
_048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
_159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4) );
_159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
_159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
_159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
_26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8) );
_26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
_26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
_26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
_37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12) );
_37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
_37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
_37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);

F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);

*r = _mm512_unpacklo_ps(rg02468acf, rg13579bde);
*g = _mm512_unpackhi_ps(rg02468acf, rg13579bde);
*b = _mm512_unpacklo_ps(ba02468acf, ba13579bde);
*a = _mm512_unpackhi_ps(ba02468acf, ba13579bde);
}

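// Re-interleave the four channel registers back into pixel order (r0 g0 b0 a0, r1 g1 b1 a1, ...),
// 16 pixels per pass.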
SI void store4(float* ptr, F r, F g, F b, F a) {
F rg014589cd = _mm512_unpacklo_ps(r, g),
rg2367abef = _mm512_unpackhi_ps(r, g),
ba014589cd = _mm512_unpacklo_ps(b, a),
ba2367abef = _mm512_unpackhi_ps(b, a);

F _048c = _mm512_unpacklo_pd(rg014589cd, ba014589cd), // r0 g0 b0 a0 4 8 c
_26ae = _mm512_unpacklo_pd(rg2367abef, ba2367abef), // r2 g2 b2 a2 6 a e
_159d = _mm512_unpackhi_pd(rg014589cd, ba014589cd), // r1 g1 b1 a1 5 9 d
_37bf = _mm512_unpackhi_pd(rg2367abef, ba2367abef); // r3 g3 b3 a3 7 b f


F _ae26 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _26ae),
_bf37 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _37bf),
_8c04 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _048c),
_9d15 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _159d),

_0426 = _mm512_permutex2var_pd(_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _ae26),
_1537 = _mm512_permutex2var_pd(_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _bf37),
_5173 = _mm512_permutex_pd(_1537, 176),
_0123 = _mm512_permutex2var_pd(_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _5173),
_5476 = _mm512_permutex2var_pd(_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _0426),
_4567 = _mm512_permutex_pd(_5476, 176),
_8cae = _mm512_permutex2var_pd(_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _26ae),
_9dbf = _mm512_permutex2var_pd(_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _37bf),
_d9fb = _mm512_permutex_pd(_9dbf, 176),
_89ab = _mm512_permutex2var_pd(_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _d9fb),
_dcfe = _mm512_permutex2var_pd(_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _8cae),
_cdef = _mm512_permutex_pd(_dcfe, 176);
_mm512_storeu_ps(ptr+0, _0123);
_mm512_storeu_ps(ptr+16, _4567);
_mm512_storeu_ps(ptr+32, _89ab);
_mm512_storeu_ps(ptr+48, _cdef);
}

#elif defined(JUMPER_IS_HSW)
// These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -631,6 +850,12 @@ template <typename T> using V = T __attribute__((ext_vector_type(4)));
SI U32 trunc_(F v) { return (U32)v; }
SI U32 expand(U16 v) { return (U32)v; }
SI U32 expand(U8 v) { return (U32)v; }
#elif defined(JUMPER_IS_AVX512)
SI F cast (U32 v) { return _mm512_cvtepu32_ps(v); }
SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
SI U32 trunc_(F v) { return (U32)__builtin_convertvector( v, I32); }
SI U32 expand(U16 v) { return _mm512_cvtepu16_epi32(v); }
SI U32 expand(U8 v) { return _mm512_cvtepu8_epi32(v); }
#else
SI F cast (U32 v) { return __builtin_convertvector((I32)v, F); }
SI F cast64(U64 v) { return __builtin_convertvector( v, F); }
@@ -692,6 +917,9 @@ SI F from_half(U16 h) {
&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
return vcvt_f32_f16(h);

#elif defined(JUMPER_IS_AVX512)
return _mm512_cvtph_ps(h);

#elif defined(JUMPER_IS_HSW)
return _mm256_cvtph_ps(h);

@@ -713,6 +941,9 @@ SI U16 to_half(F f) {
&& !defined(SK_BUILD_FOR_GOOGLE3) // Temporary workaround for some Google3 builds.
return vcvt_f16_f32(f);

#elif defined(JUMPER_IS_AVX512)
return _mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

#elif defined(JUMPER_IS_HSW)
return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

@@ -4162,7 +4393,7 @@ namespace lowp {

#else // We are compiling vector code with Clang... let's make some lowp stages!

#if defined(JUMPER_IS_HSW)
#if defined(JUMPER_IS_AVX512) || defined(JUMPER_IS_HSW)
using U8 = uint8_t __attribute__((ext_vector_type(16)));
using U16 = uint16_t __attribute__((ext_vector_type(16)));
using I16 = int16_t __attribute__((ext_vector_type(16)));
@@ -4440,7 +4671,10 @@ SI U32 trunc_(F x) { return (U32)cast<I32>(x); }

// Use approximate instructions and one Newton-Raphson step to calculate 1/x.
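// If e approximates 1/x with relative error d, the update e*(2 - x*e) leaves error d^2,
// which is why a single refinement step is enough here.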
SI F rcp_precise(F x) {
#if defined(JUMPER_IS_HSW)
#if defined(JUMPER_IS_AVX512)
F e = _mm512_rcp14_ps(x);
return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
#elif defined(JUMPER_IS_HSW)
__m256 lo,hi;
split(x, &lo,&hi);
return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
Expand All @@ -4457,7 +4691,9 @@ SI F rcp_precise(F x) {
#endif
}
SI F sqrt_(F x) {
#if defined(JUMPER_IS_HSW)
#if defined(JUMPER_IS_AVX512)
return _mm512_sqrt_ps(x);
#elif defined(JUMPER_IS_HSW)
__m256 lo,hi;
split(x, &lo,&hi);
return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
@@ -4492,6 +4728,8 @@ SI F floor_(F x) {
float32x4_t lo,hi;
split(x, &lo,&hi);
return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
#elif defined(JUMPER_IS_AVX512)
return _mm512_floor_ps(x);
#elif defined(JUMPER_IS_HSW)
__m256 lo,hi;
split(x, &lo,&hi);
Expand All @@ -4512,7 +4750,9 @@ SI F floor_(F x) {
// The result is a number on [-1, 1).
// Note: on neon this is a saturating multiply while the others are not.
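// On x86 this maps to pmulhrsw, which computes (a*b + 0x4000) >> 15 per 16-bit lane,
// i.e. a Q15 fixed-point multiply rounded to nearest.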
SI I16 scaled_mult(I16 a, I16 b) {
#if defined(JUMPER_IS_HSW)
#if defined(JUMPER_IS_AVX512)
return _mm256_mulhrs_epi16(a, b);
#elif defined(JUMPER_IS_HSW)
return _mm256_mulhrs_epi16(a, b);
#elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
return _mm_mulhrs_epi16(a, b);
@@ -4786,7 +5026,25 @@ SI void store(T* ptr, V v) {
memcpy(ptr, &v, sizeof(v));
}

#if defined(JUMPER_IS_HSW)
#if defined(JUMPER_IS_AVX512)
template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
}

template<>
F gather(const float* ptr, U32 ix) {
return _mm512_i32gather_ps(ix, ptr, 4);
}

template<>
U32 gather(const uint32_t* ptr, U32 ix) {
return _mm512_i32gather_epi32(ix, ptr, 4);
}
#elif defined(JUMPER_IS_HSW)
template <typename V, typename T>
SI V gather(const T* ptr, U32 ix) {
return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
@@ -4824,7 +5082,12 @@ SI void store(T* ptr, V v) {
// ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
#if defined(JUMPER_IS_HSW)
#if defined(JUMPER_IS_AVX512)
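// As in the HSW path below, pre-swizzle the 128-bit lanes so the in-lane packus_epi32 in
// cast_U16() leaves the channels in pixel order.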
rgba = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), rgba);
auto cast_U16 = [](U32 v) -> U16 {
return _mm256_packus_epi32(_mm512_castsi512_si256(v), _mm512_extracti64x4_epi64(v, 1));
};
#elif defined(JUMPER_IS_HSW)
// Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
__m256i _01,_23;
split(rgba, &_01, &_23);