Skip to content

Examples of auto vectorizable codes

Naoki Shibata edited this page Feb 7, 2018 · 37 revisions

All source codes in this page are in public domain. These codes are provided for checking how compiler vecotorizes codes with various compiler options. Try compiling them at Compiler Explorer.

  • Automatically vectorized by clang or gcc
  • Generates calls to a vectorized math library

Do not forget to specify -fno-math-errno and -fno-trapping-math options.

SRGB to linear image conversion

Just a simple example.

#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) float in[N][N], out[N][N];

static float srgb2linear_pix(float c) {
  float r = pow((c + 0.055) / (1 + 0.055), 2.4);
  return c < 0.04045 ? (c * (1.0 / 12.92)) : r;
}

void srgb2linear(void) {
  for (int y = 0; y < N; y++) {
    for (int x = 0; x < N; x++) {
      out[y][x] = srgb2linear_pix(in[y][x]);
    }
  }
}

Finding roots of cubic equations

More complicated than the previous one. Conditional selection from two values.

#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) double in[4][N], out[3][N];

typedef struct { double x, y, z; } double3;

static double3 execute(double a, double b, double c, double d) {
  double s1 = b*(1/a), s2 = c*(1/a), s3 = d*(1/a);
  double p = s2 - 1/3.0*s1*s1;
  double q = s3 - 1/3.0*s1*s2 + 2/27.0*s1*s1*s1;
  double z = q*q + 4/27.0*p*p*p;

  double w = cbrt((-q + sqrt(z)) * 0.5) + cbrt((-q - sqrt(z)) * 0.5) - 1/3.0*s1;

  double th = acos(0.5*(3.0/p)*q*sqrt(-(3.0/p)));
  double w0 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*0/3)-1/3.0*s1;
  double w1 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*1/3)-1/3.0*s1;
  double w2 = 2 * sqrt(-1.0/3*p) * cos(1.0/3.0*th - 2*M_PI*2/3)-1/3.0*s1;

  double3 ret = { NAN, NAN, NAN };
  if (z >= 0) {
    ret.x = w;
  } else {
    ret.x = w0; ret.y = w1; ret.z = w2;
  }

  return ret;
}

void cardanoN(void) {
  for (int i = 0; i < N; i++) {
    double3 r = execute(in[0][i], in[1][i], in[2][i], in[3][i]);
    out[0][i] = r.x;
    out[1][i] = r.y;
    out[2][i] = r.z;
  }
}

Generating Dini's surface

The compiled code may call sincos.

#include <math.h>

typedef struct { double x, y, z; } double3;

#define N 256
__attribute__ ((__aligned__(64))) double3 out[N][N];

static double3 dini(double a, double b, double u, double v) {
  double3 ret;
  ret.x = a * cos(u) * sin(v);
  ret.y = a * sin(u) * sin(v);
  ret.z = a * (cos(v) + log(tan(v * 0.5))) + b * u;
  return ret;
}

void diniSurface(double a, double b) {
  for (int i = 0; i < N; i++) {
    for (int j = 0; j < N; j++) {
      double u = 4.0 * M_PI * i / N;
      double v = 2.0 * j / N;
      out[i][j] = dini(a, b, u, v);
    }
  }
}

Factorial approximation formula by Peter Luschny

Calls to pow may be removed.

#include <math.h>

#define N 256
__attribute__ ((__aligned__(64))) double in[N], out[N];

// Factorial approximation formula by Peter Luschny
#define c0 (1.0 / 24.0)
#define c1 (3.0 / 80.0)
#define c2 (18029.0 / 45360.0)
#define c3 (6272051.0 / 14869008.0)

static double lus(double x) {
  x += 0.5;
  double p = (pow(x, 5)+(c3+c2+c1)*pow(x, 3)+c1*c3*x) /
    (pow(x,4)+(c3+c2+c1+c0)*pow(x,2)+(c1+c0)*c3+c0*c2);
  return 0.5*log(2*M_PI) + x * (log(p)-1);
}

void factorialN() {
  for (int i = 0; i < N; i++) {
    out[i] = lus(in[i]);
  }
}
Clone this wiki locally