From c3cc0eaabde5157c3af0836b0ee62e9be2d6be8f Mon Sep 17 00:00:00 2001
From: pingladd <86369275+pingladd@users.noreply.github.com>
Date: Wed, 15 Nov 2023 23:06:18 +0000
Subject: [PATCH 1/2] Added AVX512 support for SkRasterPipeline_opts.h

Hi,

This patch adds AVX512 support for certain functions in both the highp
and lowp pipelines. Testing and verification were done in the Pdfium
repository, where the change passes pdfium_embeddertests.exe.
Performance-wise, the AVX512 code path shows a significant improvement
over the existing SSE and AVX2 paths; the gain was confirmed by testing
with PDF files from the resources folder of the Pdfium repository.

This is an imported pull request from
https://github.com/google/skia/pull/149

GitOrigin-RevId: 3dfeb3bb6ba398f57bf8e110ba1d7384f67575cc
Change-Id: I91f95a69d914ed57707239b7d2257a6c8f0c3ffa
---
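Note (placed below the "---" marker, so it is not part of the commit message): the new highp
if_then_else() is built on _mm512_ternarylogic_epi64 with immediate 202 (0xCA), which encodes
the bitwise select (c & t) | (~c & e). A minimal scalar sketch of that same operation, for
illustration only (bit_select is a hypothetical helper, not part of Skia or this patch):

    #include <cstdint>

    // Bitwise select: for each bit, take t where c is 1, otherwise e.
    // 0xCA is the ternary-logic truth table for exactly this function.
    static inline uint64_t bit_select(uint64_t c, uint64_t t, uint64_t e) {
        return (c & t) | (~c & e);
    }

With an all-ones or all-zeros mask in each lane, this is how the AVX512 path picks between the
two inputs without branching.
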
 src/core/SkRasterPipelineOpContexts.h |   2 +-
 src/opts/SkRasterPipeline_opts.h      | 285 +++++++++++++++++++++++++-
 2 files changed, 280 insertions(+), 7 deletions(-)

diff --git a/src/core/SkRasterPipelineOpContexts.h b/src/core/SkRasterPipelineOpContexts.h
index a098ac3a40aa..ab3cad6a5117 100644
--- a/src/core/SkRasterPipelineOpContexts.h
+++ b/src/core/SkRasterPipelineOpContexts.h
@@ -19,7 +19,7 @@ namespace SkSL { class TraceHook; }
 // by stages that have no lowp implementation. They can therefore use the (smaller) highp value to
 // save memory in the arena.
 inline static constexpr int SkRasterPipeline_kMaxStride = 16;
-inline static constexpr int SkRasterPipeline_kMaxStride_highp = 8;
+inline static constexpr int SkRasterPipeline_kMaxStride_highp = 16;

 // How much space to allocate for each MemoryCtx scratch buffer, as part of tail-pixel handling.
 inline static constexpr size_t SkRasterPipeline_MaxScratchPerPatch =

diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 72fccb842dad..94024a79b029 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -59,6 +59,8 @@ using NoCtx = const void*;
     #define JUMPER_IS_SCALAR
 #elif defined(SK_ARM_HAS_NEON)
     #define JUMPER_IS_NEON
+#elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_SKX
+    #define JUMPER_IS_AVX512
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX2
     #define JUMPER_IS_HSW
 #elif SK_CPU_SSE_LEVEL >= SK_CPU_SSE_LEVEL_AVX
@@ -291,6 +293,224 @@ namespace SK_OPTS_NS {
     SI void store4(float* ptr, F r, F g, F b, F a) {
         vst4q_f32(ptr, (float32x4x4_t{{r,g,b,a}}));
     }

+#elif defined(JUMPER_IS_AVX512)
+template <typename T> using V = T __attribute__((ext_vector_type(16)));
+    using F   = V<float   >;
+    using I32 = V< int32_t>;
+    using U64 = V<uint64_t>;
+    using U32 = V<uint32_t>;
+    using U16 = V<uint16_t>;
+    using U8  = V<uint8_t >;
+
+    SI F   mad(F f, F m, F a) { return _mm512_fmadd_ps(f, m, a); }
+    SI F   min(F a, F b)     { return _mm512_min_ps(a,b);    }
+    SI I32 min(I32 a, I32 b) { return _mm512_min_epi32(a,b); }
+    SI U32 min(U32 a, U32 b) { return _mm512_min_epu32(a,b); }
+    SI F   max(F a, F b)     { return _mm512_max_ps(a,b);    }
+    SI I32 max(I32 a, I32 b) { return _mm512_max_epi32(a,b); }
+    SI U32 max(U32 a, U32 b) { return _mm512_max_epu32(a,b); }
+    SI F   abs_ (F v)   { return _mm512_and_ps(v, _mm512_sub_ps(_mm512_setzero(), v)); }
+    SI I32 abs_ (I32 v) { return _mm512_abs_epi32(v); }
+    SI F   floor_(F v)  { return _mm512_floor_ps(v); }
+    SI F   ceil_(F v)   { return _mm512_ceil_ps(v); }
+    SI F   rcp_approx(F v)    { return _mm512_rcp14_ps (v); }
+    SI F   rsqrt_approx (F v) { return _mm512_rsqrt14_ps(v); }
+    SI F   sqrt_ (F v)        { return _mm512_sqrt_ps (v); }
+    SI F rcp_precise (F v) {
+        F e = rcp_approx(v);
+        return _mm512_fnmadd_ps(v, e,
+                                _mm512_set1_ps(2.0f)) * e;
+    }
+    SI U32 round(F v)          { return _mm512_cvtps_epi32(v); }
+    SI U32 round(F v, F scale) { return _mm512_cvtps_epi32(v*scale); }
+    SI U16 pack(U32 v) {
+        __m256i rst = _mm256_packus_epi32(_mm512_castsi512_si256(v),
+                                          _mm512_extracti64x4_epi64(v, 1));
+        return _mm256_permutex_epi64(rst, 216);
+    }
+    SI U8 pack(U16 v) {
+        __m256i rst = _mm256_packus_epi16(v, v);
+        return _mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
+    }
+    SI F if_then_else(I32 c, F t, F e) {
+        return _mm512_castsi512_ps(_mm512_ternarylogic_epi64(c, _mm512_castps_si512(t),
+                                                              _mm512_castps_si512(e), 202));
+    }
+    SI bool any(I32 c) {
+        __mmask16 mask32 = _mm512_test_epi32_mask(c, c);
+        return mask32 != 0;
+    }
+    SI bool all(I32 c) {
+        __mmask16 mask32 = _mm512_test_epi32_mask(c, c);
+        return mask32 == 0xffff;
+    }
+    template <typename T>
+    SI V<T> gather(const T* p, U32 ix) {
+        return{ p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
+                p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]],
+                p[ix[8]], p[ix[9]], p[ix[10]], p[ix[11]],
+                p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]],
+        };
+    }
+    SI F   gather(const float* p, U32 ix)    { return _mm512_i32gather_ps(ix, p, 4); }
+    SI U32 gather(const uint32_t* p, U32 ix) { return _mm512_i32gather_epi32(ix, p, 4); }
+    SI U64 gather(const uint64_t* p, U32 ix) {
+        __m512i parts[] = {
+            _mm512_i32gather_epi64(_mm512_castsi512_si256(ix), p, 8),
+            _mm512_i32gather_epi64(_mm512_extracti32x8_epi32(ix, 1), p, 8),
+        };
+        return sk_bit_cast<U64>(parts);
+    }
+    template <typename V, typename S>
+    SI void scatter_masked(V src, S* dst, U32 ix, I32 mask) {
+        V before = gather(dst, ix);
+        V after = if_then_else(mask, src, before);
+        dst[ix[0]] = after[0];
+        dst[ix[1]] = after[1];
+        dst[ix[2]] = after[2];
+        dst[ix[3]] = after[3];
+        dst[ix[4]] = after[4];
+        dst[ix[5]] = after[5];
+        dst[ix[6]] = after[6];
+        dst[ix[7]] = after[7];
+        dst[ix[8]] = after[8];
+        dst[ix[9]] = after[9];
+        dst[ix[10]] = after[10];
+        dst[ix[11]] = after[11];
+        dst[ix[12]] = after[12];
+        dst[ix[13]] = after[13];
+        dst[ix[14]] = after[14];
+        dst[ix[15]] = after[15];
+    }
+
+    SI void load2(const uint16_t* ptr, U16* r, U16* g) {
+        U16 _01234567, _89abcdef;
+        _01234567 = _mm256_loadu_si256(((__m256i*)ptr) + 0);
+        _89abcdef = _mm256_loadu_si256(((__m256i*)ptr) + 1);
+
+        *r = _mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
+                (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
+        *g = _mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_01234567, 16),
+                _mm256_srai_epi32(_89abcdef, 16)), 216);
+    }
+    SI void store2(uint16_t* ptr, U16 r, U16 g) {
+        auto _01234567 = _mm256_unpacklo_epi16(r, g);
+        auto _89abcdef = _mm256_unpackhi_epi16(r, g);
+        __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567), _89abcdef, 1);
+        __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
+        _01234567 = _mm512_castsi512_si256(aa);
+        _89abcdef = _mm512_extracti64x4_epi64(aa, 1);
+
+        _mm256_storeu_si256((__m256i*)ptr + 0, _01234567);
+        _mm256_storeu_si256((__m256i*)ptr + 1, _89abcdef);
+    }
+
+    SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
+        __m512i _01234567, _89abcdef;
+        _01234567 = _mm512_loadu_si512((__m512i*)ptr);
+        _89abcdef = _mm512_loadu_si512((__m512i*)(ptr+32));
+
+        *r = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_01234567, _mm512_set1_epi64
+                (0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(_89abcdef, _mm512_set1_epi64(0xFF))));
+        *g = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 16),
+                _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
+                _mm512_srli_epi64(_89abcdef, 16), _mm512_set1_epi64(0xFF))));
+        *b = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 32),
+                _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
+                _mm512_srli_epi64(_89abcdef, 32), _mm512_set1_epi64(0xFF))));
+        *a = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 48),
+                _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
+                _mm512_srli_epi64(_89abcdef, 48), _mm512_set1_epi64(0xFF))));
+    }
+    SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
+        auto rg012389ab = _mm256_unpacklo_epi16(r, g),
+             rg4567cdef = _mm256_unpackhi_epi16(r, g),
+             ba012389ab = _mm256_unpacklo_epi16(b, a),
+             ba4567cdef = _mm256_unpackhi_epi16(b, a);
+
+        auto _0189 = _mm256_unpacklo_epi32(rg012389ab, ba012389ab),
+             _23ab = _mm256_unpackhi_epi32(rg012389ab, ba012389ab),
+             _45cd = _mm256_unpacklo_epi32(rg4567cdef, ba4567cdef),
+             _67ef = _mm256_unpackhi_epi32(rg4567cdef, ba4567cdef);
+
+        auto _ab23 = _mm256_permutex_epi64(_23ab, 78);
+        auto _0123 = _mm256_blend_epi32(_0189, _ab23, 0xf0);
+        auto _89ab = _mm256_permutex_epi64(_mm256_blend_epi32(_0189, _ab23, 0x0f), 78);
+        auto _ef67 = _mm256_permutex_epi64(_67ef, 78);
+        auto _4567 = _mm256_blend_epi32(_45cd, _ef67, 0xf0);
+        auto _cdef = _mm256_permutex_epi64(_mm256_blend_epi32(_45cd, _ef67, 0x0f), 78);
+
+        _mm256_storeu_si256((__m256i*)ptr, _0123);
+        _mm256_storeu_si256((__m256i*)ptr + 1, _4567);
+        _mm256_storeu_si256((__m256i*)ptr + 2, _89ab);
+        _mm256_storeu_si256((__m256i*)ptr + 3, _cdef);
+    }
+
+    SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
+        F _048c, _159d, _26ae, _37bf;
+        _048c = _159d = _26ae = _37bf = 0;
+
+        _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr)    );
+        _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
+        _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
+        _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
+        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+ 4), 0);
+        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
+        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
+        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
+        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+ 8), 0);
+        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
+        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
+        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
+        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+12), 0);
+        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
+        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
+        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);
+
+        F rg02468acf = _mm512_unpacklo_ps(_048c, _26ae),
+          ba02468acf = _mm512_unpackhi_ps(_048c, _26ae),
+          rg13579bde = _mm512_unpacklo_ps(_159d, _37bf),
+          ba13579bde = _mm512_unpackhi_ps(_159d, _37bf);
+
+        *r = _mm512_unpacklo_ps(rg02468acf, rg13579bde);
+        *g = _mm512_unpackhi_ps(rg02468acf, rg13579bde);
+        *b = _mm512_unpacklo_ps(ba02468acf, ba13579bde);
+        *a = _mm512_unpackhi_ps(ba02468acf, ba13579bde);
+    }
+
+    SI void store4(float* ptr, F r, F g, F b, F a) {
+        F rg014589cd = _mm512_unpacklo_ps(r, g),
+          rg2367abef = _mm512_unpackhi_ps(r, g),
+          ba014589cd = _mm512_unpacklo_ps(b, a),
+          ba2367abef = _mm512_unpackhi_ps(b, a);
+
+        F _048c = _mm512_unpacklo_pd(rg014589cd, ba014589cd),  // r0 g0 b0 a0 4 8 c
+          _26ae = _mm512_unpacklo_pd(rg2367abef, ba2367abef),  // r2 g2 b2 a2 6 a e
+          _159d = _mm512_unpackhi_pd(rg014589cd, ba014589cd),  // r1 g1 b1 a1 5 9 d
+          _37bf = _mm512_unpackhi_pd(rg2367abef, ba2367abef);  // r3 g3 b3 a3 7 b f
+
+
+        F _ae26 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _26ae),
+          _bf37 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _37bf),
+          _8c04 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _048c),
+          _9d15 = _mm512_permutexvar_pd(_mm512_setr_epi64(4,5,6,7,0,1,2,3), _159d),
+
+          _0426 = _mm512_permutex2var_pd(_048c, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _ae26),
+          _1537 = _mm512_permutex2var_pd(_159d, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _bf37),
+          _5173 = _mm512_permutex_pd(_1537, 176),
+          _0123 = _mm512_permutex2var_pd(_0426, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _5173),
+          _5476 = _mm512_permutex2var_pd(_5173, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _0426),
+          _4567 = _mm512_permutex_pd(_5476, 176),
+          _8cae = _mm512_permutex2var_pd(_8c04, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _26ae),
+          _9dbf = _mm512_permutex2var_pd(_9d15, _mm512_setr_epi64(0,1,2,3,12,13,14,15), _37bf),
+          _d9fb = _mm512_permutex_pd(_9dbf, 176),
+          _89ab = _mm512_permutex2var_pd(_8cae, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _d9fb),
+          _dcfe = _mm512_permutex2var_pd(_d9fb, _mm512_setr_epi64(0,1,10,11,4,5,14,15), _8cae),
+          _cdef = _mm512_permutex_pd(_dcfe, 176);
+        _mm512_storeu_ps(ptr+0, _0123);
+        _mm512_storeu_ps(ptr+16, _4567);
+        _mm512_storeu_ps(ptr+32, _89ab);
+        _mm512_storeu_ps(ptr+48, _cdef);
+    }
+
 #elif defined(JUMPER_IS_HSW)
 // These are __m256 and __m256i, but friendlier and strongly-typed.
@@ -631,6 +851,12 @@ template <typename T> using V = T __attribute__((ext_vector_type(4)));
     SI U32 trunc_(F v) { return (U32)v; }
     SI U32 expand(U16 v) { return (U32)v; }
     SI U32 expand(U8 v) { return (U32)v; }
+#elif defined (JUMPER_IS_AVX512)
+    SI F   cast  (U32 v) { return _mm512_cvtepu32_ps(v); }
+    SI F   cast64(U64 v) { return __builtin_convertvector( v, F); }
+    SI U32 trunc_(F v)   { return (U32)__builtin_convertvector( v, I32); }
+    SI U32 expand(U16 v) { return _mm512_cvtepu16_epi32(v); }
+    SI U32 expand(U8 v)  { return _mm512_cvtepu8_epi32(v); }
 #else
     SI F   cast  (U32 v)   { return __builtin_convertvector((I32)v,   F); }
     SI F   cast64(U64 v)   { return __builtin_convertvector( v,   F); }
@@ -692,6 +918,9 @@ SI F from_half(U16 h) {
         && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
         return vcvt_f32_f16(h);

+#elif defined(JUMPER_IS_AVX512)
+    return _mm512_cvtph_ps(h);
+
 #elif defined(JUMPER_IS_HSW)
     return _mm256_cvtph_ps(h);

@@ -713,6 +942,9 @@ SI U16 to_half(F f) {
         && !defined(SK_BUILD_FOR_GOOGLE3)  // Temporary workaround for some Google3 builds.
         return vcvt_f16_f32(f);

+#elif defined(JUMPER_IS_AVX512)
+    return _mm512_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);
+
 #elif defined(JUMPER_IS_HSW)
     return _mm256_cvtps_ph(f, _MM_FROUND_CUR_DIRECTION);

@@ -4162,7 +4394,16 @@ namespace lowp {

 #else  // We are compiling vector code with Clang... let's make some lowp stages!

-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
+    using U16 = uint16_t __attribute__((ext_vector_type(16)));
+    using I16 = int16_t  __attribute__((ext_vector_type(16)));
+    using I32 = int32_t  __attribute__((ext_vector_type(16)));
+    using U32 = uint32_t __attribute__((ext_vector_type(16)));
+    using I64 = int64_t  __attribute__((ext_vector_type(16)));
+    using U64 = uint64_t __attribute__((ext_vector_type(16)));
+    using F   = float    __attribute__((ext_vector_type(16)));
+#elif defined(JUMPER_IS_HSW)
     using U8  = uint8_t  __attribute__((ext_vector_type(16)));
     using U16 = uint16_t __attribute__((ext_vector_type(16)));
     using I16 = int16_t  __attribute__((ext_vector_type(16)));
@@ -4440,7 +4681,10 @@ SI U32 trunc_(F x) { return (U32)cast(x); }

 // Use approximate instructions and one Newton-Raphson step to calculate 1/x.
 SI F rcp_precise(F x) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    F e = _mm512_rcp14_ps(x);
+    return _mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e;
+#elif defined(JUMPER_IS_HSW)
     __m256 lo,hi;
     split(x, &lo,&hi);
     return join<F>(SK_OPTS_NS::rcp_precise(lo), SK_OPTS_NS::rcp_precise(hi));
@@ -4457,7 +4701,9 @@ SI F rcp_precise(F x) {
 #endif
 }
 SI F sqrt_(F x) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    return _mm512_sqrt_ps(x);
+#elif defined(JUMPER_IS_HSW)
     __m256 lo,hi;
     split(x, &lo,&hi);
     return join<F>(_mm256_sqrt_ps(lo), _mm256_sqrt_ps(hi));
@@ -4492,6 +4738,8 @@ SI F floor_(F x) {
     float32x4_t lo,hi;
     split(x, &lo,&hi);
     return join<F>(vrndmq_f32(lo), vrndmq_f32(hi));
+#elif defined(JUMPER_IS_AVX512)
+    return _mm512_floor_ps(x);
 #elif defined(JUMPER_IS_HSW)
     __m256 lo,hi;
     split(x, &lo,&hi);
@@ -4512,7 +4760,9 @@ SI F floor_(F x) {
 // The result is a number on [-1, 1).
 // Note: on neon this is a saturating multiply while the others are not.
 SI I16 scaled_mult(I16 a, I16 b) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    return _mm256_mulhrs_epi16(a, b);
+#elif defined(JUMPER_IS_HSW)
     return _mm256_mulhrs_epi16(a, b);
 #elif defined(JUMPER_IS_SSE41) || defined(JUMPER_IS_AVX)
     return _mm_mulhrs_epi16(a, b);
@@ -4786,7 +5036,25 @@ SI void store(T* ptr, V v) {
     memcpy(ptr, &v, sizeof(v));
 }

-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    template <typename V, typename T>
+    SI V gather(const T* ptr, U32 ix) {
+        return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
+                  ptr[ix[ 4]], ptr[ix[ 5]], ptr[ix[ 6]], ptr[ix[ 7]],
+                  ptr[ix[ 8]], ptr[ix[ 9]], ptr[ix[10]], ptr[ix[11]],
+                  ptr[ix[12]], ptr[ix[13]], ptr[ix[14]], ptr[ix[15]], };
+    }
+
+    template<>
+    F gather(const float* ptr, U32 ix) {
+        return _mm512_i32gather_ps(ix, ptr, 4);
+    }
+
+    template<>
+    U32 gather(const uint32_t* ptr, U32 ix) {
+        return _mm512_i32gather_epi32(ix, ptr, 4);
+    }
+#elif defined(JUMPER_IS_HSW)
     template <typename V, typename T>
     SI V gather(const T* ptr, U32 ix) {
         return V{ ptr[ix[ 0]], ptr[ix[ 1]], ptr[ix[ 2]], ptr[ix[ 3]],
@@ -4824,7 +5092,12 @@ SI void store(T* ptr, V v) {
 // ~~~~~~ 32-bit memory loads and stores ~~~~~~ //

 SI void from_8888(U32 rgba, U16* r, U16* g, U16* b, U16* a) {
-#if defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512)
+    rgba = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), rgba);
+    auto cast_U16 = [](U32 v) -> U16 {
+        return _mm256_packus_epi32(_mm512_castsi512_si256(v), _mm512_extracti64x4_epi64(v, 1));
+    };
+#elif defined(JUMPER_IS_HSW)
     // Swap the middle 128-bit lanes to make _mm256_packus_epi32() in cast_U16() work out nicely.
     __m256i _01,_23;
     split(rgba, &_01, &_23);

From 0bfde00c6857c7edc9f41e3b5b2453264d762e24 Mon Sep 17 00:00:00 2001
From: pingladd <86369275+pingladd@users.noreply.github.com>
Date: Tue, 12 Dec 2023 10:40:56 -0600
Subject: [PATCH 2/2] Added AVX512 support for SkRasterPipeline_opts.h

Hi,

This patch adds AVX512 support for certain functions in both the highp
and lowp pipelines. Testing and verification were done in the Pdfium
repository, where the change passes pdfium_embeddertests.exe.
Performance-wise, the AVX512 code path shows a significant improvement
over the existing SSE and AVX2 paths; the gain was confirmed by testing
with PDF files from the resources folder of the Pdfium repository.

This is an imported pull request from
https://github.com/google/skia/pull/149

GitOrigin-RevId: 3dfeb3bb6ba398f57bf8e110ba1d7384f67575cc
Change-Id: I91f95a69d914ed57707239b7d2257a6c8f0c3ffa

This is an imported pull request from
https://github.com/google/skia/pull/151

GitOrigin-RevId: 02db57ee1678343e5dcb0a3d85efd412eeeab844
Change-Id: Ia674977e3c1a083938bbfda1e9d785595896cb88
---
 src/opts/SkRasterPipeline_opts.h | 68 ++++++++++++++------------------
 1 file changed, 29 insertions(+), 39 deletions(-)

diff --git a/src/opts/SkRasterPipeline_opts.h b/src/opts/SkRasterPipeline_opts.h
index 94024a79b029..2cc661196a4b 100644
--- a/src/opts/SkRasterPipeline_opts.h
+++ b/src/opts/SkRasterPipeline_opts.h
@@ -332,7 +332,7 @@ template <typename T> using V = T __attribute__((ext_vector_type(16)));
         return _mm256_castsi256_si128(_mm256_permute4x64_epi64(rst, 8));
     }
     SI F if_then_else(I32 c, F t, F e) {
-        return _mm512_castsi512_ps(_mm512_ternarylogic_epi64(c, _mm512_castps_si512(t), 
+        return _mm512_castsi512_ps(_mm512_ternarylogic_epi64(c, _mm512_castps_si512(t),
                                                               _mm512_castps_si512(e), 202));
     }
     SI bool any(I32 c) {
@@ -345,11 +345,11 @@ template <typename T> using V = T __attribute__((ext_vector_type(16)));
     }
     template <typename T>
    SI V<T> gather(const T* p, U32 ix) {
-        return{ p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
-                p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]],
-                p[ix[8]], p[ix[9]], p[ix[10]], p[ix[11]],
-                p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]],
-        };
+        return{p[ix[0]], p[ix[1]], p[ix[2]], p[ix[3]],
+               p[ix[4]], p[ix[5]], p[ix[6]], p[ix[7]],
+               p[ix[8]], p[ix[9]], p[ix[10]], p[ix[11]],
+               p[ix[12]], p[ix[13]], p[ix[14]], p[ix[15]],
+        };
     }
     SI F   gather(const float* p, U32 ix)    { return _mm512_i32gather_ps(ix, p, 4); }
     SI U32 gather(const uint32_t* p, U32 ix) { return _mm512_i32gather_epi32(ix, p, 4); }
@@ -383,9 +383,8 @@ template <typename T> using V = T __attribute__((ext_vector_type(16)));
     }

     SI void load2(const uint16_t* ptr, U16* r, U16* g) {
-        U16 _01234567, _89abcdef;
-        _01234567 = _mm256_loadu_si256(((__m256i*)ptr) + 0);
-        _89abcdef = _mm256_loadu_si256(((__m256i*)ptr) + 1);
+        U16 _01234567 = _mm256_loadu_si256(((__m256i*)ptr) + 0);
+        U16 _89abcdef = _mm256_loadu_si256(((__m256i*)ptr) + 1);

         *r = _mm256_permute4x64_epi64(_mm256_packs_epi32(_mm256_srai_epi32(_mm256_slli_epi32
                 (_01234567, 16), 16), _mm256_srai_epi32(_mm256_slli_epi32(_89abcdef, 16), 16)), 216);
@@ -395,7 +394,8 @@ template <typename T> using V = T __attribute__((ext_vector_type(16)));
     SI void store2(uint16_t* ptr, U16 r, U16 g) {
         auto _01234567 = _mm256_unpacklo_epi16(r, g);
         auto _89abcdef = _mm256_unpackhi_epi16(r, g);
-        __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567), _89abcdef, 1);
+        __m512i combinedVector = _mm512_inserti64x4(_mm512_castsi256_si512(_01234567),
+                                                    _89abcdef, 1);
         __m512i aa = _mm512_permutexvar_epi64(_mm512_setr_epi64(0,1,4,5,2,3,6,7), combinedVector);
         _01234567 = _mm512_castsi512_si256(aa);
         _89abcdef = _mm512_extracti64x4_epi64(aa, 1);
@@ -405,21 +405,21 @@ template <typename T> using V = T __attribute__((ext_vector_type(16)));
     }

     SI void load4(const uint16_t* ptr, U16* r, U16* g, U16* b, U16* a) {
-        __m512i _01234567, _89abcdef;
-        _01234567 = _mm512_loadu_si512((__m512i*)ptr);
-        _89abcdef = _mm512_loadu_si512((__m512i*)(ptr+32));
-
-        *r = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_01234567, _mm512_set1_epi64
-                (0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(_89abcdef, _mm512_set1_epi64(0xFF))));
-        *g = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 16),
-                _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
-                _mm512_srli_epi64(_89abcdef, 16), _mm512_set1_epi64(0xFF))));
-        *b = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 32),
-                _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
-                _mm512_srli_epi64(_89abcdef, 32), _mm512_set1_epi64(0xFF))));
-        *a = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(_01234567, 48),
-                _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
-                _mm512_srli_epi64(_89abcdef, 48), _mm512_set1_epi64(0xFF))));
+        __m512i _01234567 = _mm512_loadu_si512((__m512i*)ptr);
+        __m512i _89abcdef = _mm512_loadu_si512((__m512i*)(ptr+32));
+
+        *r = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_01234567,
+                _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(_mm512_and_si512(
+                _89abcdef, _mm512_set1_epi64(0xFF))));
+        *g = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
+                _01234567, 16), _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(
+                _mm512_and_si512(_mm512_srli_epi64(_89abcdef, 16), _mm512_set1_epi64(0xFF))));
+        *b = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
+                _01234567, 32), _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(
+                _mm512_and_si512(_mm512_srli_epi64(_89abcdef, 32), _mm512_set1_epi64(0xFF))));
+        *a = _mm256_setr_m128i(_mm512_cvtepi64_epi16(_mm512_and_si512(_mm512_srli_epi64(
+                _01234567, 48), _mm512_set1_epi64(0xFF))), _mm512_cvtepi64_epi16(
+                _mm512_and_si512(_mm512_srli_epi64(_89abcdef, 48), _mm512_set1_epi64(0xFF))));
     }
     SI void store4(uint16_t* ptr, U16 r, U16 g, U16 b, U16 a) {
         auto rg012389ab = _mm256_unpacklo_epi16(r, g),
@@ -447,21 +447,20 @@ template <typename T> using V = T __attribute__((ext_vector_type(16)));

     SI void load4(const float* ptr, F* r, F* g, F* b, F* a) {
         F _048c, _159d, _26ae, _37bf;
-        _048c = _159d = _26ae = _37bf = 0;

         _048c = _mm512_castps128_ps512(_mm_loadu_ps(ptr)    );
         _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+16), 1);
         _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+32), 2);
         _048c = _mm512_insertf32x4(_048c, _mm_loadu_ps(ptr+48), 3);
-        _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+ 4), 0);
+        _159d = _mm512_castps128_ps512(_mm_loadu_ps(ptr+4)  );
         _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+20), 1);
         _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+36), 2);
         _159d = _mm512_insertf32x4(_159d, _mm_loadu_ps(ptr+52), 3);
-        _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+ 8), 0);
+        _26ae = _mm512_castps128_ps512(_mm_loadu_ps(ptr+8)  );
         _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+24), 1);
         _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+40), 2);
         _26ae = _mm512_insertf32x4(_26ae, _mm_loadu_ps(ptr+56), 3);
-        _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+12), 0);
+        _37bf = _mm512_castps128_ps512(_mm_loadu_ps(ptr+12) );
         _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+28), 1);
         _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+44), 2);
         _37bf = _mm512_insertf32x4(_37bf, _mm_loadu_ps(ptr+60), 3);
@@ -4394,16 +4393,7 @@ namespace lowp {

 #else  // We are compiling vector code with Clang... let's make some lowp stages!

-#if defined(JUMPER_IS_AVX512)
-    using U8  = uint8_t  __attribute__((ext_vector_type(16)));
-    using U16 = uint16_t __attribute__((ext_vector_type(16)));
-    using I16 = int16_t  __attribute__((ext_vector_type(16)));
-    using I32 = int32_t  __attribute__((ext_vector_type(16)));
-    using U32 = uint32_t __attribute__((ext_vector_type(16)));
-    using I64 = int64_t  __attribute__((ext_vector_type(16)));
-    using U64 = uint64_t __attribute__((ext_vector_type(16)));
-    using F   = float    __attribute__((ext_vector_type(16)));
-#elif defined(JUMPER_IS_HSW)
+#if defined(JUMPER_IS_AVX512) || defined(JUMPER_IS_HSW)
     using U8  = uint8_t  __attribute__((ext_vector_type(16)));
     using U16 = uint16_t __attribute__((ext_vector_type(16)));
     using I16 = int16_t  __attribute__((ext_vector_type(16)));
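Note on both AVX512 rcp_precise() implementations in patch 1: they refine the hardware estimate
from _mm512_rcp14_ps with one Newton-Raphson step, e' = e * (2 - x*e), spelled as
_mm512_fnmadd_ps(x, e, _mm512_set1_ps(2.0f)) * e. A minimal scalar C++ sketch of that step, for
illustration only (rcp_refine is a hypothetical name, not a Skia function):

    // One Newton-Raphson refinement of an approximate reciprocal.
    // Given e ~= 1/x, returns e * (2 - x*e), roughly doubling the
    // number of correct bits in the estimate.
    inline float rcp_refine(float x, float e) {
        return e * (2.0f - x * e);
    }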