Skip to content

Commit

Permalink
[softmax] basic implementation of softmax based on RVV (#5)
Browse files Browse the repository at this point in the history
* Softmax bench prototyping
* Adding baseline benchmark
* Prototype of RVV implementation
* Introducing fully vectorized implementation
* Minor optimizations to full RVV scheme
* Intrinsics and dead code cleanup
* Implementing multi-size array benchmark
* Adding accuracy benchmark for scalar expf implementation
* Refactoring benchmarks to execute every test on multiple input sets
* Making golden model more stable (subtraction of max input)
* Integrating max offset subtraction in quick_dirty_vector_expf
* Introducing POLY_DEGREE macro to configure quick_dirty_expf polynomial degree (max: 6)
* Extending exponential benchmark
* Adding README.md file
* Adding missing fesetround call
  • Loading branch information
nibrunie authored Feb 12, 2024
1 parent b2e7911 commit 78f1d34
Show file tree
Hide file tree
Showing 7 changed files with 801 additions and 0 deletions.
48 changes: 48 additions & 0 deletions src/softmax/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Makefile for RISC-V In a Nutshell course (RVVIAN) examples

# RISC-V C Compiler
# available options (the version used must support RVV intrinsics)
# clang/llvm
RISCVCLANG=clang --target=riscv64
# RISCVCC=clang --target=riscv64
# GNU Compiler Collection (GCC)
RISCVCC=riscv64-unknown-elf-gcc

EXTRA_CFLAGS?=-DCOUNT_INSTRET

# architectural parameters for the simulation
# width of vector registers (VLEN)
VLEN?=128

# path to proxy-kernel (pk)
PK_PATH=/opt/riscv/riscv64-unknown-elf/bin/pk64

# SIMULATOR
# Available options in the Docker (uncomment one)
SIMULATOR=spike --isa=rv64gcv_zicntr_zihpm --varch=vlen:$(VLEN),elen:64 $(PK_PATH)
# SIMULATOR=qemu-riscv64 -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on

INCLUDE_DIR ?= /opt/riscv/riscv64-unknown-elf/include/

# Object file built from softmax_baseline.c, compiled with $(RISCVCLANG).
softmax_baseline.o: softmax_baseline.c
$(RISCVCLANG) $(EXTRA_CFLAGS) -I./ -I$(INCLUDE_DIR) -O2 -march=rv64gcv -c -o $@ $^

# Object file built from softmax_rvv.c, compiled with $(RISCVCLANG) using the same flags.
softmax_rvv.o: softmax_rvv.c
$(RISCVCLANG) $(EXTRA_CFLAGS) -I./ -I$(INCLUDE_DIR) -O2 -march=rv64gcv -c -o $@ $^

# Softmax benchmark executable: bench_softmax.c is compiled and linked with
# both object files by $(RISCVCC); -lm pulls in the math library.
bench_softmax: bench_softmax.c softmax_baseline.o softmax_rvv.o
$(RISCVCC) $(EXTRA_CFLAGS) -I./ -O2 -march=rv64gcv $^ -lm -o $@

# Exponential accuracy benchmark executable, built from bench_expf.c and the
# same two object files.
bench_exp: bench_expf.c softmax_baseline.o softmax_rvv.o
$(RISCVCC) $(EXTRA_CFLAGS) -I./ -O2 -march=rv64gcv $^ -lm -o $@

# Run the softmax benchmark under $(SIMULATOR).
sim_bench_softmax: bench_softmax
$(SIMULATOR) $^

# Run the exponential accuracy benchmark under $(SIMULATOR).
sim_bench_exp: bench_exp
$(SIMULATOR) $^

# Remove every build artifact: object files and both benchmark executables
# (bench_softmax and bench_exp).
clean:
	rm -f *.o bench_softmax bench_exp

# Targets that do not produce a file of the same name.
.PHONY: sim_bench_softmax sim_bench_exp clean
43 changes: 43 additions & 0 deletions src/softmax/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
# Softmax

The examples in this directory implement multiple versions of a softmax activation function on a 1D array.
These examples are designed to illustrate multiple ways the RISC-V Vector Extension (RVV) can
be used to perform such an operation.

# Running example in a docker container

All the examples in this directory can be built and executed using a docker container built
from https://github.com/nibrunie/rvv-examples/blob/main/riscv-toolchain.Dockerfile.
You can follow the instructions in https://github.com/nibrunie/rvv-examples/blob/main/README.md for more
information.

# How to build and execute softmax benchmarks

Once inside a proper build/execution environment, you can change the current directory to
be `src/softmax` and execute a simple `make` command to build and run the examples:

```
make clean
make sim_bench_softmax EXTRA_CFLAGS="-DVERBOSE"
```

It is possible to modify the build configuration with the `EXTRA_CFLAGS` environment variable,
for example to count cycles rather than retired instructions you can run:
```
make clean
make sim_bench_softmax EXTRA_CFLAGS="-DVERBOSE -DCOUNT_CYCLE"
```

You can also reduce the verbosity level by removing `-DVERBOSE` and you can change the number of tests executed for each benchmark with `-DNUM_TESTS=<num-of-tests>`.

# How to build and execute exponential accuracy benchmarks

This directory contains an accuracy test bench for various scalar implementations of the exponential function.

Once inside a proper build/execution environment, you can change the current directory to
be `src/softmax` and execute a simple `make` command to build and run the examples:

```
make clean
make sim_bench_exp
```
70 changes: 70 additions & 0 deletions src/softmax/bench_expf.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/* Scalar exponential approximation; only declared here, defined in another
 * translation unit. */
float quick_dirty_expf(float x);

/* Array-based exponential approximation writing n results from src into dst;
 * only declared here, defined in another translation unit.
 * NOTE(review): max_x is presumably an offset subtracted from every input
 * before exponentiation -- confirm against the definition. */
float quick_dirty_vector_expf(float* dst, float* src, float max_x, size_t n);

/** Scalar adapter around quick_dirty_vector_expf.
 *
 * Replicates x into a 4-element input buffer, runs the vector routine on it
 * with max_x = 0, and returns the last element of the output buffer. This
 * gives the vector routine the float(float) shape expected by the accuracy
 * benchmark.
 *
 * @param x input value
 * @return the value written by quick_dirty_vector_expf for the last element
 */
float quick_dirty_expf_from_vector(float x) {
    enum { LANES = 4 };
    float inputs[LANES];
    float outputs[LANES] = {0};

    for (int lane = 0; lane < LANES; ++lane) {
        inputs[lane] = x;
    }

    quick_dirty_vector_expf(outputs, inputs, 0.f, LANES);
    return outputs[LANES - 1];
}

/** Function type shared by every exponential implementation under test:
 *  takes one float and returns one float. */
typedef float (exp_implementation_t)(float);

/** Descriptor for one exponential implementation in the accuracy benchmark. */
typedef struct {
exp_implementation_t* func; // implementation evaluated by main()
double max_rel_error; // largest relative error recorded so far by main()
float max_rel_error_input; // input value for which max_rel_error was recorded
char label[100]; // name printed in the final report
} bench_exp_t;

/** Accuracy benchmark for the exponential implementations.
 *
 * Draws `steps` pseudo-random inputs in [start, stop] (fixed seed, so runs are
 * reproducible), evaluates every registered implementation on each input and
 * compares it with the double-precision exp() result. The signed relative
 * error of every evaluation is printed (one line per input, one column per
 * implementation); a per-implementation summary of the worst case follows.
 *
 * The worst case is tracked on the magnitude of the relative error so that
 * under-estimations (negative errors) are taken into account as well.
 *
 * @return 0
 */
int main(void) {
    srand(17);
    const float start = -2.0f, stop = 2.0f;
    const int steps = 128;

    bench_exp_t benchmarks[] = {
        (bench_exp_t) {.func = expf,                         .label="baseline expf"},
        (bench_exp_t) {.func = quick_dirty_expf,             .label="quick_dirty_expf"},
        (bench_exp_t) {.func = quick_dirty_expf_from_vector, .label="quick_dirty_vector_expf"},
    };
    const size_t bench_count = sizeof(benchmarks) / sizeof(benchmarks[0]);

    // reset the worst-case trackers
    for (size_t benchId = 0; benchId < bench_count; benchId++) {
        benchmarks[benchId].max_rel_error = 0.;
        benchmarks[benchId].max_rel_error_input = 0.f;
    }

    for (int i = 0; i < steps; ++i) {
        float x = start + (stop - start) * rand() / (float) RAND_MAX;
        double golden = exp(x);

        for (size_t benchId = 0; benchId < bench_count; benchId++) {
            float result = benchmarks[benchId].func(x);
            double rel_error = (result - golden) / golden;
            // compare magnitudes: a negative error is as bad as a positive one
            double rel_error_magnitude = fabs(rel_error);
            if (rel_error_magnitude > benchmarks[benchId].max_rel_error) {
                benchmarks[benchId].max_rel_error = rel_error_magnitude;
                benchmarks[benchId].max_rel_error_input = x;
            }
            printf("%.5e ", rel_error);
        }
        printf("\n");
    }

    // summary: worst relative error and the input that triggered it
    for (size_t benchId = 0; benchId < bench_count; benchId++) {
        float worst_input = benchmarks[benchId].max_rel_error_input;
        printf("%s: max relative error is %.5e, it is reached for expf(%a)=%a vs exp(%a)=%a\n",
               benchmarks[benchId].label, benchmarks[benchId].max_rel_error,
               worst_input, benchmarks[benchId].func(worst_input),
               worst_input, exp(worst_input));
    }

    return 0;
}
144 changes: 144 additions & 0 deletions src/softmax/bench_softmax.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
// file: bench_softmax.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
#include <bench_softmax_utils.h>


/** Declaring various softmax implementation benchmarks **/
// Each benchmark is only declared here (defined in another translation unit).
// All share one signature: main() passes the output array (dst), the input
// array (src), the double-precision reference result (golden) and the element
// count (n), and receives a softmax_bench_result_t for that single run.
softmax_bench_result_t softmax_baseline_fp32_bench(float* dst, float* src, double* golden, size_t n);

softmax_bench_result_t softmax_rvv_norm_fp32_bench(float* dst, float* src, double* golden, size_t n);

softmax_bench_result_t softmax_rvv_fp32_bench(float* dst, float* src, double* golden, size_t n);

softmax_bench_result_t softmax_stable_rvv_fp32_bench(float* dst, float* src, double* golden, size_t n);

softmax_bench_result_t softmax_scalar_quick_dirty_expf_fp32_bench(float* dst, float* src, double* golden, size_t n);

/** Function type matching every benchmark declared above. */
typedef softmax_bench_result_t (softmax_bench_func_t)(float* dst, float* src, double* golden, size_t n);

/** Descriptor structure for softmax benchmark */
typedef struct {
softmax_bench_func_t* bench; // benchmark function to run
softmax_bench_result_t result; // result accumulated by main() over the runs of this benchmark
char label[100]; // name printed when results are displayed
} softmax_bench_t;


// Reference softmax used by main() to fill the double-precision golden array
// from the float input array; defined in another translation unit.
extern void softmax_golden_fp32_fp64(double* dst, float* src, size_t n);

// Number of random input sets evaluated for each array size.
// Can be overridden at build time (e.g. -DNUM_TESTS=<num-of-tests>).
#ifndef NUM_TESTS
#define NUM_TESTS 100
#endif


/** Softmax benchmark driver.
 *
 * For every array size in testSizes, NUM_TESTS random input sets are drawn.
 * For each input set the golden result is computed, then every registered
 * benchmark is run on that same input and its result is accumulated. Once all
 * input sets for a size are processed, one report per benchmark is printed
 * (detailed when VERBOSE is defined, condensed comma-separated otherwise).
 *
 * @return 0 on success, EXIT_FAILURE when a buffer allocation fails
 */
int main(void) {
    softmax_bench_t benchmarks[] = {
        (softmax_bench_t){.bench = softmax_baseline_fp32_bench,                .label="baseline n-element softmax"},
        (softmax_bench_t){.bench = softmax_scalar_quick_dirty_expf_fp32_bench, .label="scalar quick_dirty_expf n-element softmax"},
        (softmax_bench_t){.bench = softmax_rvv_norm_fp32_bench,                .label="rvv-based (norm only) n-element softmax"},
        (softmax_bench_t){.bench = softmax_rvv_fp32_bench,                     .label="rvv-based n-element softmax"},
        (softmax_bench_t){.bench = softmax_stable_rvv_fp32_bench,              .label="rvv-based n-element stable softmax"},
    };
    const size_t benchCount = sizeof(benchmarks) / sizeof(benchmarks[0]);

    size_t testSizes[] = {4, 16, 17, 32, 33, 128, 129, 511, 512, 1024, 2048};
    const size_t testCount = sizeof(testSizes) / sizeof(testSizes[0]);

    for (size_t testId = 0; testId < testCount; testId++)
    {
        size_t n = testSizes[testId];
#       ifdef VERBOSE
        printf("--------------------------------------------------------------------------------\n");
        printf("--------------------------------------------------------------------------------\n");
        printf("Benchmarking softmax on a %zu-element array.\n", n);
#       endif
        float* src = malloc(n * sizeof(*src));
        float* dst = malloc(n * sizeof(*dst));
        double* golden = malloc(n * sizeof(*golden));
        // explicit check: unlike assert(), this is not compiled out by NDEBUG
        if (!src || !dst || !golden) {
            fprintf(stderr, "failed to allocate buffers for a %zu-element array\n", n);
            free(src);
            free(dst);
            free(golden);
            return EXIT_FAILURE;
        }

        // reset benchmark results (every accumulated field, so that nothing
        // leaks from one array size into the next)
        for (size_t benchId = 0; benchId < benchCount; benchId++)
        {
            benchmarks[benchId].result.max_abs_error = 0.;
            benchmarks[benchId].result.max_rel_error = 0.;
            benchmarks[benchId].result.mean_rel_error = 0.;
            benchmarks[benchId].result.perf_count = 0;
            benchmarks[benchId].result.error_norm2 = 0.;
        }

        const float RAND_LOWER_BOUND = -2.f;
        const float RAND_RANGE = 4.f;

        for (int j = 0; j < NUM_TESTS; ++j) {
            // random initialization of the input arrays
            for (size_t i = 0; i < n; ++i) {
                src[i] = RAND_RANGE * rand() / (float) RAND_MAX + RAND_LOWER_BOUND;
            }

            // computing golden value
            softmax_golden_fp32_fp64(golden, src, n);

#           ifdef VERY_VERBOSE
            printf("source matrix:\n");
            array_dump_fp32(src, n);
            printf("golden result:\n");
            array_dump_fp64(golden, n);
#           endif // VERY_VERBOSE

            // softmax benchmarks. iterating over all existing implementation for this given input set
            for (size_t benchId = 0; benchId < benchCount; benchId++)
            {
                // resetting the whole array in-between experiments
                // (sizeof(dst) would only cover the size of the pointer)
                memset(dst, 0, n * sizeof(*dst));

                softmax_bench_result_t local_result = benchmarks[benchId].bench(dst, src, golden, n);

                benchmarks[benchId].result = accumulate_bench_result(benchmarks[benchId].result, local_result);

#               ifdef VERY_VERBOSE
                printf("%s result:\n", benchmarks[benchId].label);
                array_dump_fp32(dst, n);
#               endif // VERY_VERBOSE
            }
        }

        // display results
        for (size_t benchId = 0; benchId < benchCount; benchId++)
        {
            softmax_bench_result_t bench_result = benchmarks[benchId].result;
            // averaging the accumulated values over the input sets
            bench_result.perf_count = bench_result.perf_count / NUM_TESTS;
            bench_result.mean_rel_error = bench_result.mean_rel_error / NUM_TESTS;
            bench_result.error_norm2 = sqrt(bench_result.error_norm2);

#           ifdef VERBOSE
            printf("--------------------------------------------------------------------------------\n");
            printf("%s used %lu " PERF_METRIC "(s) to evaluate softmax on a %zu-element array.\n",
                   benchmarks[benchId].label, bench_result.perf_count, n);
            printf(" " PERF_METRIC " per elements: %.3f\n", (double) bench_result.perf_count / n);
            printf(" element(s) per " PERF_METRIC ": %.3f\n", (double) n / bench_result.perf_count);
            printf(" max absolute error: %.4a\n", bench_result.max_abs_error);
            printf(" max relative error: %.4a\n", bench_result.max_rel_error);
            printf(" mean relative error: %.4a\n", bench_result.mean_rel_error);
            printf(" error norm 2: %.4a\n", bench_result.error_norm2);
#           else
            // condensed display
            printf("%s, %zu, %lu, %.3e, %.3e, %.3e %.3e\n",
                   benchmarks[benchId].label, n, bench_result.perf_count,
                   bench_result.max_abs_error, bench_result.max_rel_error, bench_result.error_norm2, bench_result.mean_rel_error);
#           endif
        }

        free(src);
        free(dst);
        free(golden);
    }

    return 0;
}
69 changes: 69 additions & 0 deletions src/softmax/bench_softmax_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include <stddef.h>
#include <stdio.h>
#include <math.h>

/** Generic function type for a binary32/float softmax implementation:
 *  takes an output array (dst), an input array (src) and an element count (n),
 *  and returns nothing. */
typedef void(softmax_func_t)(float* dst, float* src, size_t n);

/** Read the performance counter chosen at compile time.
 *
 * Selection is done through macros:
 * - COUNT_INSTRET selects the instret counter, i.e. the number of retired
 *   (executed) instructions; it takes precedence when both macros are defined
 *   and is also the default when neither is defined.
 * - COUNT_CYCLE (without COUNT_INSTRET) selects the cycle counter.
 *
 * As a side effect this function body defines PERF_METRIC, a string literal
 * naming the selected metric ("instruction" or "cycle").
 *
 * @return current value of the selected counter
 */
static unsigned long read_perf_counter(void)
{
    unsigned long ticks;
#if defined(COUNT_CYCLE) && !defined(COUNT_INSTRET)
#define PERF_METRIC "cycle"
    asm volatile ("rdcycle %0" : "=r" (ticks));
#else
    // explicit COUNT_INSTRET request, or no request at all (default)
#define PERF_METRIC "instruction"
    asm volatile ("rdinstret %0" : "=r" (ticks));
#endif
    return ticks;
}

/** Performance and accuracy figures for a softmax benchmark (either a single
 *  run or an aggregate built with accumulate_bench_result). */
typedef struct {
    unsigned long perf_count; /**< performance counter value (summed when accumulated) */
    double max_abs_error;     /**< maximum absolute error */
    double max_rel_error;     /**< maximum relative error */
    double mean_rel_error;    /**< mean relative error (summed when accumulated) */
    double error_norm2;       /**< when accumulated: sum of the squared max_rel_error of each run */
} softmax_bench_result_t;

/** Merge the result of one run into a running aggregate.
 *
 * - max_abs_error / max_rel_error: the larger of both values is kept
 * - perf_count / mean_rel_error: the new values are added to the aggregate
 * - error_norm2: the square of the new max_rel_error is added to the aggregate
 *
 * @param res aggregate accumulated so far
 * @param new_result result of the latest run
 * @return the updated aggregate (both arguments are passed by value)
 */
static softmax_bench_result_t accumulate_bench_result(softmax_bench_result_t res, softmax_bench_result_t new_result) {
    softmax_bench_result_t merged = res;

    merged.max_abs_error = (new_result.max_abs_error > res.max_abs_error) ? new_result.max_abs_error
                                                                          : res.max_abs_error;
    merged.max_rel_error = (new_result.max_rel_error > res.max_rel_error) ? new_result.max_rel_error
                                                                          : res.max_rel_error;

    merged.perf_count += new_result.perf_count;
    merged.error_norm2 += new_result.max_rel_error * new_result.max_rel_error;
    merged.mean_rel_error += new_result.mean_rel_error;

    return merged;
}


/** Print the n elements of a binary32 array on a single line.
 *
 * Each element is written in hexadecimal floating-point notation (%a),
 * surrounded by spaces; the line is terminated by a newline.
 *
 * @param array elements to print
 * @param n number of elements
 */
static void array_dump_fp32(float *array, size_t n)
{
    const float *cursor = array;
    for (size_t remaining = n; remaining > 0; --remaining) {
        printf(" %a ", *cursor);
        ++cursor;
    }
    printf("\n");
}

/** Print the n elements of a binary64 array on a single line.
 *
 * Each element is written in hexadecimal floating-point notation (%a),
 * surrounded by spaces; the line is terminated by a newline.
 *
 * @param array elements to print
 * @param n number of elements
 */
static void array_dump_fp64(double *array, size_t n)
{
    const double *cursor = array;
    for (size_t remaining = n; remaining > 0; --remaining) {
        printf(" %a ", *cursor);
        ++cursor;
    }
    printf("\n");
}
Loading

0 comments on commit 78f1d34

Please sign in to comment.