Skip to content

Commit

Permalink
Merge pull request #30 from xinhuang/sqr
Browse files Browse the repository at this point in the history
implementation of v?Sqr
  • Loading branch information
xianyi authored Dec 6, 2016
2 parents 4c9919f + 5ef9a2f commit 2cb7cc5
Show file tree
Hide file tree
Showing 21 changed files with 389 additions and 4 deletions.
3 changes: 3 additions & 0 deletions include/openvml.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ OPENVML_EXPORT void OpenVML_FUNCNAME(vdSub)(const VML_INT n, const double * a, c
OPENVML_EXPORT void OpenVML_FUNCNAME(vcSub)(const VML_INT n, const float * a, const float * b, float * y);
OPENVML_EXPORT void OpenVML_FUNCNAME(vzSub)(const VML_INT n, const double * a, const double * b, double * y);

/*
 * Vector square, single-precision (vsSqr) and double-precision (vdSqr).
 *   n : number of elements
 *   a : input vector (read only)
 *   y : output vector
 * NOTE(review): presumably y[i] = a[i] * a[i] for i in [0, n); confirm
 * against interface/sqr.c and the sqr kernels.
 */
OPENVML_EXPORT void OpenVML_FUNCNAME(vsSqr)(const VML_INT n, const float * a, float * y);
OPENVML_EXPORT void OpenVML_FUNCNAME(vdSqr)(const VML_INT n, const double * a, double * y);

OPENVML_EXPORT void OpenVML_FUNCNAME(vsPow)(const VML_INT n, const float * a, const float * b, float * y);
OPENVML_EXPORT void OpenVML_FUNCNAME(vdPow)(const VML_INT n, const double * a, const double * b, double * y);

Expand Down
2 changes: 2 additions & 0 deletions include/openvml_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ void OpenVML_FUNCNAME(dsub_k)(VMLLONG n, double * a, double * b, double * y, dou
void OpenVML_FUNCNAME(csub_k)(VMLLONG n, float * a, float * b, float * y, float * z, float * other_params);
void OpenVML_FUNCNAME(zsub_k)(VMLLONG n, double * a, double * b, double * y, double * z, double * other_params);

/*
 * Element-wise square kernels (single and double precision).
 * They share the common kernel signature (n, a, b, y, z, other_params).
 * The double-precision kernel must be named dsqr_k: that is the symbol the
 * DSQR_K dispatch macro expands to, and it follows the naming of the
 * neighbouring kernels (dsub_k, dpow_k).
 */
void OpenVML_FUNCNAME(ssqr_k)(VMLLONG n, float * a, float * b, float * y, float * z, float * other_params);
void OpenVML_FUNCNAME(dsqr_k)(VMLLONG n, double * a, double * b, double * y, double * z, double * other_params);

void OpenVML_FUNCNAME(spow_k)(VMLLONG n, float * a, float * b, float * y, float * z, float * other_params);
void OpenVML_FUNCNAME(dpow_k)(VMLLONG n, double * a, double * b, double * y, double * z, double * other_params);
Expand Down
5 changes: 5 additions & 0 deletions include/openvml_macros.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@
#define CSUB_K OpenVML_FUNCNAME(csub_k)
#define ZSUB_K OpenVML_FUNCNAME(zsub_k)

/* Square kernels: SSQR_K is the single-precision symbol, DSQR_K the
 * double-precision one. */
#define SSQR_K OpenVML_FUNCNAME(ssqr_k)
#define DSQR_K OpenVML_FUNCNAME(dsqr_k)

#define SPOW_K OpenVML_FUNCNAME(spow_k)
#define DPOW_K OpenVML_FUNCNAME(dpow_k)
#define CPOW_K OpenVML_FUNCNAME(cpow_k)
Expand Down Expand Up @@ -114,6 +117,7 @@
#ifndef DOUBLE
#define ADD_K SADD_K
#define SUB_K SSUB_K
#define SQR_K SSQR_K
#define POW_K SPOW_K
#define POWX_K SPOWX_K
#define EXP_K SEXP_K
Expand All @@ -134,6 +138,7 @@
#else
#define ADD_K DADD_K
#define SUB_K DSUB_K
#define SQR_K DSQR_K
#define POW_K DPOW_K
#define POWX_K DPOWX_K
#define EXP_K DEXP_K
Expand Down
3 changes: 3 additions & 0 deletions include/openvml_reference.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ OPENVML_EXPORT void OpenVML_FUNCNAME_REF(vdSub)(const VML_INT n, const double *
OPENVML_EXPORT void OpenVML_FUNCNAME_REF(vcSub)(const VML_INT n, const float * a, const float * b, float * y);
OPENVML_EXPORT void OpenVML_FUNCNAME_REF(vzSub)(const VML_INT n, const double * a, const double * b, double * y);

/*
 * Reference declarations of the vector square routines, with the same
 * parameter list as the non-reference vsSqr/vdSqr (n, input a, output y).
 * NOTE(review): the reference implementation is not visible here; presumably
 * it computes y[i] = a[i] * a[i] -- confirm.
 */
OPENVML_EXPORT void OpenVML_FUNCNAME_REF(vsSqr)(const VML_INT n, const float * a, float * y);
OPENVML_EXPORT void OpenVML_FUNCNAME_REF(vdSqr)(const VML_INT n, const double * a, double * y);

OPENVML_EXPORT void OpenVML_FUNCNAME_REF(vsPow)(const VML_INT n, const float * a, const float * b, float * y);
OPENVML_EXPORT void OpenVML_FUNCNAME_REF(vdPow)(const VML_INT n, const double * a, const double * b, double * y);

Expand Down
4 changes: 2 additions & 2 deletions interface/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set(OpenVML_LIBSRC_C "")
set(OpenVML_LIBSRC_Z "")

set(REAL_INTERFACE_LIST
add sub
add sub sqr
pow powx pow2o3 pow3o2 exp expm1
tanh
log10 ln log1p
Expand Down Expand Up @@ -108,4 +108,4 @@ Endforeach(INTERFACE)

add_library(openvml_interface_core OBJECT ${OpenVML_LIBSRC_S} ${OpenVML_LIBSRC_D} ${OpenVML_LIBSRC_C} ${OpenVML_LIBSRC_Z} ${OpenVML_LIBSRC_OTHER})

target_compile_definitions(openvml_interface_core PUBLIC openvml_EXPORTS)
target_compile_definitions(openvml_interface_core PUBLIC openvml_EXPORTS)
39 changes: 39 additions & 0 deletions interface/sqr.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/* * Copyright (c) 2014, 2015 Zhang Xianyi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <openvml.h>
#include <openvml_driver.h>
#include <openvml_kernel.h>


/*
 * Interface-level entry point of the vector square routine.
 *
 *   n : element count; the call is a no-op when n <= 0.
 *   a : input vector; the call is a no-op when NULL.
 *   y : output vector; the call is a no-op when NULL.
 *
 * A valid request is forwarded to EXEC_VML with the SQR_K kernel. The
 * remaining pointer arguments are passed as NULL (presumably the kernel's
 * b, z and other_params slots -- confirm against EXEC_VML).
 */
void CNAME(const VML_INT n, const VML_FLOAT * a, VML_FLOAT * y) {

    /* Nothing to do for an empty request or a missing buffer. */
    if (n <= 0 || a == NULL || y == NULL) {
        return;
    }

    EXEC_VML(0, SQR_K, n, (VML_FLOAT*)a, NULL, y, NULL, NULL);

}
2 changes: 1 addition & 1 deletion kernel/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ set(OpenVML_LIBSRC_C "")
set(OpenVML_LIBSRC_Z "")

#s,d
set(KERNEL_LIST add sub pow powx exp expm1 tanh log10 ln log1p floor
set(KERNEL_LIST add sub sqr pow powx exp expm1 tanh log10 ln log1p floor
sin cos sincos tan asin acos atan atan2)

#c,z
Expand Down
3 changes: 3 additions & 0 deletions kernel/aarch64/Kernel_generic.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ set(sub_D_KERNEL_SOURCE generic/sub_kernel.c)
set(sub_C_KERNEL_SOURCE generic/sub_kernel.c)
set(sub_Z_KERNEL_SOURCE generic/sub_kernel.c)

set(sqr_S_KERNEL_SOURCE generic/sqr_kernel.c)
set(sqr_D_KERNEL_SOURCE generic/sqr_kernel.c)

set(pow_S_KERNEL_SOURCE generic/pow_kernel.c)
set(pow_D_KERNEL_SOURCE generic/pow_kernel.c)

Expand Down
3 changes: 3 additions & 0 deletions kernel/arm/Kernel_generic.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ set(sub_D_KERNEL_SOURCE generic/sub_kernel.c)
set(sub_C_KERNEL_SOURCE generic/sub_kernel.c)
set(sub_Z_KERNEL_SOURCE generic/sub_kernel.c)

set(sqr_S_KERNEL_SOURCE generic/sqr_kernel.c)
set(sqr_D_KERNEL_SOURCE generic/sqr_kernel.c)

set(pow_S_KERNEL_SOURCE generic/pow_kernel.c)
set(pow_D_KERNEL_SOURCE generic/pow_kernel.c)

Expand Down
3 changes: 3 additions & 0 deletions kernel/generic/Kernel_generic.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ set(sub_D_KERNEL_SOURCE ${OpenVML_ARCH}/sub_kernel.c)
set(sub_C_KERNEL_SOURCE ${OpenVML_ARCH}/sub_kernel.c)
set(sub_Z_KERNEL_SOURCE ${OpenVML_ARCH}/sub_kernel.c)

set(sqr_S_KERNEL_SOURCE ${OpenVML_ARCH}/sqr_kernel.c)
set(sqr_D_KERNEL_SOURCE ${OpenVML_ARCH}/sqr_kernel.c)

set(pow_S_KERNEL_SOURCE ${OpenVML_ARCH}/pow_kernel.c)
set(pow_D_KERNEL_SOURCE ${OpenVML_ARCH}/pow_kernel.c)
#set(pow_C_KERNEL_SOURCE ${OpenVML_ARCH}/pow_kernel.c)
Expand Down
33 changes: 33 additions & 0 deletions kernel/generic/sqr_kernel.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* * Copyright (c) 2014, 2015 Zhang Xianyi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "openvml_kernel.h"

/*
 * Generic element-wise square kernel: y[i] = a[i] * a[i] for every index
 * i in [0, COMPSIZE*n).  The b, z and other_params arguments are not used.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    VMLLONG idx = 0;

    while (idx < COMPSIZE*n) {
        const VML_FLOAT value = a[idx];
        y[idx] = value * value;
        idx++;
    }
}
3 changes: 3 additions & 0 deletions kernel/x86_64/Kernel_generic.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ set(sub_D_KERNEL_SOURCE generic/sub_kernel.c)
set(sub_C_KERNEL_SOURCE generic/sub_kernel.c)
set(sub_Z_KERNEL_SOURCE generic/sub_kernel.c)

set(sqr_S_KERNEL_SOURCE generic/sqr_kernel.c)
set(sqr_D_KERNEL_SOURCE generic/sqr_kernel.c)

set(pow_S_KERNEL_SOURCE generic/pow_kernel.c)
set(pow_D_KERNEL_SOURCE generic/pow_kernel.c)

Expand Down
3 changes: 3 additions & 0 deletions kernel/x86_64/Kernel_haswell.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ set(sub_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
set(sub_C_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
set(sub_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)

set(sqr_S_KERNEL_SOURCE ${OpenVML_ARCH}/ssqr_kernel_avx.c)
set(sqr_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsqr_kernel_avx.c)

set(pow_S_KERNEL_SOURCE ${OpenVML_ARCH}/spow_kernel_avx.c)
set(pow_D_KERNEL_SOURCE ${OpenVML_ARCH}/dpow_kernel_avx.c)

Expand Down
3 changes: 3 additions & 0 deletions kernel/x86_64/Kernel_sandybridge.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@ set(sub_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)
set(sub_C_KERNEL_SOURCE ${OpenVML_ARCH}/ssub_kernel_avx.c)
set(sub_Z_KERNEL_SOURCE ${OpenVML_ARCH}/dsub_kernel_avx.c)

set(sqr_S_KERNEL_SOURCE ${OpenVML_ARCH}/ssqr_kernel_avx.c)
set(sqr_D_KERNEL_SOURCE ${OpenVML_ARCH}/dsqr_kernel_avx.c)

set(pow_S_KERNEL_SOURCE ${OpenVML_ARCH}/spow_kernel_avx.c)
set(pow_D_KERNEL_SOURCE ${OpenVML_ARCH}/dpow_kernel_avx.c)

Expand Down
78 changes: 78 additions & 0 deletions kernel/x86_64/dsqr_kernel_avx.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
/* * Copyright (c) 2014, 2015 Zhang Xianyi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "openvml_kernel.h"

#include <immintrin.h>

/*
 * AVX element-wise square kernel for double-precision data:
 * y[i] = a[i] * a[i] for every index i in [0, COMPSIZE*n).
 *
 * The bulk of the data is processed in chunks of 32 elements (eight 256-bit
 * vectors holding four doubles each) with unaligned loads and stores. The
 * fewer-than-32 elements left over are squared one at a time.
 * The b, z and other_params arguments are not used.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    VMLLONG chunks = (COMPSIZE*n) >> 5;   /* full 32-element chunks        */
    VMLLONG tail = (COMPSIZE*n) & 0x1f;   /* elements left after the chunks */
    int k;

    for (; chunks > 0; chunks--) {
        /* Read the whole chunk before writing any of it. */
        __m256d v0 = _mm256_loadu_pd(a);
        __m256d v1 = _mm256_loadu_pd(a + 4);
        __m256d v2 = _mm256_loadu_pd(a + 8);
        __m256d v3 = _mm256_loadu_pd(a + 12);
        __m256d v4 = _mm256_loadu_pd(a + 16);
        __m256d v5 = _mm256_loadu_pd(a + 20);
        __m256d v6 = _mm256_loadu_pd(a + 24);
        __m256d v7 = _mm256_loadu_pd(a + 28);

        /* Square each vector in place. */
        v0 = _mm256_mul_pd(v0, v0);
        v1 = _mm256_mul_pd(v1, v1);
        v2 = _mm256_mul_pd(v2, v2);
        v3 = _mm256_mul_pd(v3, v3);
        v4 = _mm256_mul_pd(v4, v4);
        v5 = _mm256_mul_pd(v5, v5);
        v6 = _mm256_mul_pd(v6, v6);
        v7 = _mm256_mul_pd(v7, v7);

        _mm256_storeu_pd(y, v0);
        _mm256_storeu_pd(y + 4, v1);
        _mm256_storeu_pd(y + 8, v2);
        _mm256_storeu_pd(y + 12, v3);
        _mm256_storeu_pd(y + 16, v4);
        _mm256_storeu_pd(y + 20, v5);
        _mm256_storeu_pd(y + 24, v6);
        _mm256_storeu_pd(y + 28, v7);

        a += 32;
        y += 32;
    }

    /* Scalar clean-up of the remaining elements. */
    for (k = 0; k < tail; k++) {
        y[k] = a[k] * a[k];
    }
}

33 changes: 33 additions & 0 deletions kernel/x86_64/sqr_kernel.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/* * Copyright (c) 2014, 2015 Zhang Xianyi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "openvml_kernel.h"

/*
 * Plain C element-wise square kernel: writes a[i] * a[i] to y[i] for every
 * index i in [0, COMPSIZE*n).  The b, z and other_params arguments are
 * not used.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    VMLLONG pos;

    for (pos = 0; pos < COMPSIZE*n; ++pos) {
        const VML_FLOAT x = a[pos];
        y[pos] = x * x;
    }
}
64 changes: 64 additions & 0 deletions kernel/x86_64/ssqr_kernel_avx.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/* * Copyright (c) 2014, 2015 Zhang Xianyi
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice, this
* list of conditions and the following disclaimer in the documentation and/or
* other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "openvml_kernel.h"

#include <immintrin.h>

/*
 * AVX element-wise square kernel for single-precision data:
 * y[i] = a[i] * a[i] for every index i in [0, COMPSIZE*n).
 *
 * The bulk of the data is processed in chunks of 32 elements (four 256-bit
 * vectors holding eight floats each) with unaligned loads and stores. The
 * fewer-than-32 elements left over are squared one at a time.
 * The b, z and other_params arguments are not used.
 */
void KERNEL_NAME(VMLLONG n, VML_FLOAT * a, VML_FLOAT * b, VML_FLOAT * y, VML_FLOAT * z, VML_FLOAT * other_params) {
    VMLLONG chunks = (COMPSIZE*n) >> 5;   /* full 32-element chunks        */
    VMLLONG tail = (COMPSIZE*n) & 0x1f;   /* elements left after the chunks */
    int k;

    for (; chunks > 0; chunks--) {
        /* Read the whole chunk before writing any of it. */
        __m256 v0 = _mm256_loadu_ps(a);
        __m256 v1 = _mm256_loadu_ps(a + 8);
        __m256 v2 = _mm256_loadu_ps(a + 16);
        __m256 v3 = _mm256_loadu_ps(a + 24);

        /* Square each vector in place. */
        v0 = _mm256_mul_ps(v0, v0);
        v1 = _mm256_mul_ps(v1, v1);
        v2 = _mm256_mul_ps(v2, v2);
        v3 = _mm256_mul_ps(v3, v3);

        _mm256_storeu_ps(y, v0);
        _mm256_storeu_ps(y + 8, v1);
        _mm256_storeu_ps(y + 16, v2);
        _mm256_storeu_ps(y + 24, v3);

        a += 32;
        y += 32;
    }

    /* Scalar clean-up of the remaining elements. */
    for (k = 0; k < tail; k++) {
        y[k] = a[k] * a[k];
    }
}

Loading

0 comments on commit 2cb7cc5

Please sign in to comment.