-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[softmax] basic implementation of softmax based on RVV (#5)
* Softmax bench prototyping * Adding baseline benchmark * Prototype of RVV implementation * Introducing fully vectorized implementation * Minor optimizations to full RVV scheme * Intrinsics and dead code cleanup * Implementing multi-size array benchmark * Adding accuracy benchmark for scalar expf implementation * Refactoring benchmarks to execute every test on multiple input sets * Making golden model more stable (subtraction of max input) * Integrating max offset subtraction in quick_dirty_vector_expf * Introducing POLY_DEGREE macro to configure quick_dirty_expf polynomial degree (max: 6) * Extending exponential benchmark * Adding README.md file * Adding missing fesetround call
- Loading branch information
Showing
7 changed files
with
801 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
# Makefile for RISC-V In a Nutshell course (RVVIAN) examples
#
# Main targets:
#   bench_softmax / sim_bench_softmax : build / simulate the softmax benchmark
#   bench_exp     / sim_bench_exp     : build / simulate the expf accuracy bench
#   clean                             : remove objects and benchmark binaries

# RISC-V C Compiler
# available options (the version used must support RVV intrinsics)
# clang/llvm
RISCVCLANG=clang --target=riscv64
# RISCVCC=clang --target=riscv64
# GNU Compiler Collection (GCC)
RISCVCC=riscv64-unknown-elf-gcc

# extra flags forwarded to every compilation (selects the perf counter by default)
EXTRA_CFLAGS?=-DCOUNT_INSTRET

# architectural parameters for the simulation
# width of vector registers (VLEN)
VLEN?=128

# path to proxy-kernel (pk)
PK_PATH=/opt/riscv/riscv64-unknown-elf/bin/pk64

# SIMULATOR
# Available options in the Docker (uncomment one)
SIMULATOR=spike --isa=rv64gcv_zicntr_zihpm --varch=vlen:$(VLEN),elen:64 $(PK_PATH)
# SIMULATOR=qemu-riscv64 -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on

INCLUDE_DIR ?= /opt/riscv/riscv64-unknown-elf/include/

softmax_baseline.o: softmax_baseline.c
	$(RISCVCLANG) $(EXTRA_CFLAGS) -I./ -I$(INCLUDE_DIR) -O2 -march=rv64gcv -c -o $@ $^

softmax_rvv.o: softmax_rvv.c
	$(RISCVCLANG) $(EXTRA_CFLAGS) -I./ -I$(INCLUDE_DIR) -O2 -march=rv64gcv -c -o $@ $^

bench_softmax: bench_softmax.c softmax_baseline.o softmax_rvv.o
	$(RISCVCC) $(EXTRA_CFLAGS) -I./ -O2 -march=rv64gcv $^ -lm -o $@

bench_exp: bench_expf.c softmax_baseline.o softmax_rvv.o
	$(RISCVCC) $(EXTRA_CFLAGS) -I./ -O2 -march=rv64gcv $^ -lm -o $@

sim_bench_softmax: bench_softmax
	$(SIMULATOR) $^

sim_bench_exp: bench_exp
	$(SIMULATOR) $^

# remove every generated artifact, including both benchmark binaries
clean:
	rm -f *.o bench_softmax bench_exp

# sim_* and clean never produce a file named after the target
.PHONY: sim_bench_softmax sim_bench_exp clean
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
# Softmax | ||
|
||
The examples in this directory implement multiple versions of a softmax activation function on a 1D array. | ||
Those examples are designed to illustrate multiple ways the RISC-V Vector Extension (RVV) can | ||
be used to perform such an operation. | ||
|
||
# Running example in a docker container | ||
|
||
All the examples in this directory can be built and executed using a docker container built | ||
from https://github.com/nibrunie/rvv-examples/blob/main/riscv-toolchain.Dockerfile. | ||
You can follow the instructions in https://github.com/nibrunie/rvv-examples/blob/main/README.md for more | ||
information. | ||
|
||
# How to build and execute softmax benchmarks | ||
|
||
Once inside a proper build/execution environment, you can change the current directory to | ||
be `src/softmax` and execute a simple `make` command to build and run the examples: | ||
|
||
``` | ||
make clean | ||
make sim_bench_softmax EXTRA_CFLAGS="-DVERBOSE" | ||
``` | ||
|
||
It is possible to modify the build configuration with the `EXTRA_CFLAGS` environment variable, | ||
for example to count cycles rather than retired instructions you can run: | ||
``` | ||
make clean | ||
make sim_bench_softmax EXTRA_CFLAGS="-DVERBOSE -DCOUNT_CYCLE" | ||
``` | ||
|
||
You can also reduce the verbosity level by removing `-DVERBOSE`, and you can change the number of tests executed for each benchmark with `-DNUM_TESTS=<num-of-tests>`. | ||
|
||
# How to build and execute exponential accuracy benchmarks | ||
|
||
This directory contains an accuracy test bench for various scalar implementations of the exponential function. | ||
|
||
Once inside a proper build/execution environment, you can change the current directory to | ||
be `src/softmax` and execute a simple `make` command to build and run the examples: | ||
|
||
``` | ||
make clean | ||
make sim_bench_exp | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
#include <math.h> | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <time.h> | ||
|
||
/** Scalar exponential approximation (not defined in this file) */
float quick_dirty_expf(float x);

/** Array exponential approximation over n elements (not defined in this file) */
float quick_dirty_vector_expf(float *dst, float *src, float max_x, size_t n);

/**
 * Scalar adapter around quick_dirty_vector_expf.
 *
 * Replicates x into a 4-element input array, calls the array routine with
 * max_x = 0 and n = 4, and returns the last element of the output array.
 */
float quick_dirty_expf_from_vector(float x) {
    enum { LANE_COUNT = 4 };
    float inputs[LANE_COUNT];
    float outputs[LANE_COUNT] = {0};

    for (int lane = 0; lane < LANE_COUNT; ++lane) {
        inputs[lane] = x;
    }

    quick_dirty_vector_expf(outputs, inputs, 0.f, LANE_COUNT);
    return outputs[LANE_COUNT - 1];
}
|
||
/** Function type of a scalar exponential implementation (float in, float out) */
typedef float exp_implementation_t(float);

/** Descriptor of one exponential implementation evaluated by the accuracy bench */
typedef struct {
    exp_implementation_t *func;   // implementation under test
    double max_rel_error;         // largest relative error recorded so far
    float max_rel_error_input;    // input for which max_rel_error was recorded
    char label[100];              // name printed in the final report
} bench_exp_t;
|
||
int main(void) { | ||
srand(17); | ||
float start = -2.0f, stop=2.0f; | ||
int steps = 128; | ||
int i; | ||
double max_error[2] = {0.}; | ||
double max_error_input[2] = {0.}; | ||
|
||
bench_exp_t benchmarks[] = { | ||
(bench_exp_t) {.func = expf, .label="baseline expf"}, | ||
(bench_exp_t) {.func = quick_dirty_expf, .label="quick_dirty_expf"}, | ||
(bench_exp_t) {.func = quick_dirty_expf_from_vector, .label="quick_dirty_vector_expf"}, | ||
}; | ||
|
||
int benchId; | ||
for (benchId = 0; benchId < sizeof(benchmarks) / sizeof(bench_exp_t); benchId ++) { | ||
benchmarks[benchId].max_rel_error = 0.f; | ||
benchmarks[benchId].max_rel_error_input = 0.f; | ||
} | ||
|
||
|
||
for (i = 0; i < steps; ++i) { | ||
float x = start + (stop - start) * rand() / (float) RAND_MAX; | ||
double golden = exp(x); | ||
|
||
for (benchId = 0; benchId < sizeof(benchmarks) / sizeof(bench_exp_t); benchId ++) { | ||
float result = benchmarks[benchId].func(x); | ||
double rel_error = (result - golden) / golden; | ||
if (rel_error > benchmarks[benchId].max_rel_error) { | ||
benchmarks[benchId].max_rel_error = rel_error; | ||
benchmarks[benchId].max_rel_error_input = x; | ||
} | ||
printf("%.5e ", rel_error); | ||
} | ||
printf("\n"); | ||
} | ||
for (benchId = 0; benchId < sizeof(benchmarks) / sizeof(bench_exp_t); benchId ++) { | ||
printf("%s: max relative error is %.5e, it is reached for expf(%a)=%a vs exp(%a)=%a\n", | ||
benchmarks[benchId].label, benchmarks[benchId].max_rel_error, | ||
benchmarks[benchId].max_rel_error_input, benchmarks[benchId].func(benchmarks[benchId].max_rel_error_input), | ||
benchmarks[benchId].max_rel_error_input, exp(benchmarks[benchId].max_rel_error_input)); | ||
} | ||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
// file: bench_softmax.c | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <string.h> | ||
#include <stdio.h> | ||
#include <assert.h> | ||
#include <bench_softmax_utils.h> | ||
|
||
|
||
/** Declaring various softmax implementation benchmarks **/ | ||
softmax_bench_result_t softmax_baseline_fp32_bench(float* dst, float* src, double* golden, size_t n); | ||
|
||
softmax_bench_result_t softmax_rvv_norm_fp32_bench(float* dst, float* src, double* golden, size_t n); | ||
|
||
softmax_bench_result_t softmax_rvv_fp32_bench(float* dst, float* src, double* golden, size_t n); | ||
|
||
softmax_bench_result_t softmax_stable_rvv_fp32_bench(float* dst, float* src, double* golden, size_t n); | ||
|
||
softmax_bench_result_t softmax_scalar_quick_dirty_expf_fp32_bench(float* dst, float* src, double* golden, size_t n); | ||
|
||
typedef softmax_bench_result_t (softmax_bench_func_t)(float* dst, float* src, double* golden, size_t n); | ||
|
||
/** Descriptor structure for softmax benchmark */ | ||
typedef struct { | ||
softmax_bench_func_t* bench; | ||
softmax_bench_result_t result; | ||
char label[100]; | ||
} softmax_bench_t; | ||
|
||
|
||
extern void softmax_golden_fp32_fp64(double* dst, float* src, size_t n); | ||
|
||
#ifndef NUM_TESTS | ||
#define NUM_TESTS 100 | ||
#endif | ||
|
||
|
||
int main(void) { | ||
int i; | ||
softmax_bench_t benchmarks[] = { | ||
(softmax_bench_t){.bench = softmax_baseline_fp32_bench, .label="baseline n-element softmax"}, | ||
(softmax_bench_t){.bench = softmax_scalar_quick_dirty_expf_fp32_bench, .label="scalar quick_dirty_expf n-element softmax"}, | ||
(softmax_bench_t){.bench = softmax_rvv_norm_fp32_bench, .label="rvv-based (norm only) n-element softmax"}, | ||
(softmax_bench_t){.bench = softmax_rvv_fp32_bench, .label="rvv-based n-element softmax"}, | ||
(softmax_bench_t){.bench = softmax_stable_rvv_fp32_bench, .label="rvv-based n-element stable softmax"}, | ||
}; | ||
|
||
size_t testSizes[] = {4, 16, 17, 32, 33, 128, 129, 511, 512, 1024, 2048}; | ||
for (size_t testId = 0; testId < sizeof(testSizes) / sizeof(size_t); testId++) | ||
{ | ||
size_t n = testSizes[testId]; | ||
# ifdef VERBOSE | ||
printf("--------------------------------------------------------------------------------\n"); | ||
printf("--------------------------------------------------------------------------------\n"); | ||
printf("Benchmarking softmax on a %d-element array.\n", n); | ||
# endif | ||
float* src = malloc(n * sizeof(float)); | ||
float* dst = malloc(n * sizeof(float)); | ||
double* golden = malloc(n * sizeof(double)); | ||
assert(src); | ||
assert(dst); | ||
assert(golden); | ||
|
||
// reset benchmark results | ||
for (unsigned benchId=0; benchId < sizeof(benchmarks) / sizeof(softmax_bench_t); benchId++) | ||
{ | ||
benchmarks[benchId].result.max_abs_error = 0.; | ||
benchmarks[benchId].result.max_rel_error = 0.; | ||
benchmarks[benchId].result.perf_count = 0; | ||
benchmarks[benchId].result.error_norm2 = 0.f; | ||
} | ||
|
||
int j; | ||
const float RAND_LOWER_BOUND = -2.f; | ||
const float RAND_RANGE = 4.f; | ||
|
||
for (j = 0; j < NUM_TESTS; ++j) { | ||
// random initialization of the input arrays | ||
for (i = 0; i < n; ++i) { | ||
src[i] = RAND_RANGE * rand() / (float) RAND_MAX + RAND_LOWER_BOUND; | ||
} | ||
|
||
// computing golden value | ||
softmax_golden_fp32_fp64(golden, src, n); | ||
|
||
# ifdef VERY_VERBOSE | ||
printf("source matrix:\n"); | ||
array_dump_fp32(src, n); | ||
printf("golden result:\n"); | ||
array_dump_fp64(golden, n); | ||
# endif // VERY_VERBOSE | ||
|
||
// softmax benchmarks. iterating over all existing implementation for this given input set | ||
for (unsigned benchId=0; benchId < sizeof(benchmarks) / sizeof(softmax_bench_t); benchId++) | ||
{ | ||
memset(dst, 0, sizeof(dst)); // resetting array in-between experiments | ||
|
||
softmax_bench_result_t local_result = benchmarks[benchId].bench(dst, src, golden, n); | ||
|
||
benchmarks[benchId].result = accumulate_bench_result(benchmarks[benchId].result, local_result); | ||
|
||
# ifdef VERY_VERBOSE | ||
printf("%s result:\n", benchmarks[benchId].label); | ||
array_dump_fp32(dst, n); | ||
# endif // VERY_VERBOSE | ||
|
||
} | ||
} | ||
|
||
// display results | ||
for (unsigned benchId=0; benchId < sizeof(benchmarks) / sizeof(softmax_bench_t); benchId++) | ||
{ | ||
softmax_bench_result_t bench_result = benchmarks[benchId].result; | ||
bench_result.perf_count = bench_result.perf_count / NUM_TESTS; | ||
bench_result.mean_rel_error = bench_result.mean_rel_error / NUM_TESTS; | ||
bench_result.error_norm2 = sqrt(bench_result.error_norm2); | ||
|
||
|
||
# ifdef VERBOSE | ||
printf("--------------------------------------------------------------------------------\n"); | ||
printf("%s used %d " PERF_METRIC "(s) to evaluate softmax on a %d-element array.\n", | ||
benchmarks[benchId].label, bench_result.perf_count, n); | ||
printf(" " PERF_METRIC " per elements: %.3f\n", (double) bench_result.perf_count / n); | ||
printf(" element(s) per " PERF_METRIC ": %.3f\n", (double) n / bench_result.perf_count); | ||
printf(" max absolute error: %.4a\n", bench_result.max_abs_error); | ||
printf(" max relative error: %.4a\n", bench_result.max_rel_error); | ||
printf(" mean relative error: %.4a\n", bench_result.mean_rel_error); | ||
printf(" error norm 2: %.4a\n", bench_result.error_norm2); | ||
# else | ||
// condensed display | ||
printf("%s, %d, %d, %.3e, %.3e, %.3e %.3e\n", | ||
benchmarks[benchId].label, n, bench_result.perf_count, | ||
bench_result.max_abs_error, bench_result.max_rel_error, bench_result.error_norm2, bench_result.mean_rel_error); | ||
# endif | ||
} | ||
|
||
free(src); | ||
free(dst); | ||
free(golden); | ||
} | ||
|
||
|
||
return 0; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
#include <stddef.h> | ||
#include <stdio.h> | ||
#include <math.h> | ||
|
||
/** Function type of a binary32/float softmax implementation taking (dst, src, n) */
typedef void softmax_func_t(float *dst, float *src, size_t n);
|
||
/* Performance counter selection:
 *  - defining COUNT_INSTRET selects the instret counter (it takes precedence
 *    when COUNT_CYCLE is defined as well)
 *  - defining COUNT_CYCLE selects the cycle counter
 *  - instret is also the default when neither macro is defined
 * PERF_METRIC is the counter name used in benchmark reports.
 */
#if defined(COUNT_CYCLE) && !defined(COUNT_INSTRET)
#define PERF_METRIC "cycle"
#define PERF_COUNTER_READ_INSN "rdcycle"
#else
#define PERF_METRIC "instruction"
#define PERF_COUNTER_READ_INSN "rdinstret"
#endif

/** return the value of selected perf counter
 *
 * The counter is read with the rdinstret or rdcycle instruction, depending on
 * the COUNT_INSTRET / COUNT_CYCLE macros.
 * The instret counter counts the number of retired (executed) instructions.
 *
 * @return current value of the selected counter
 */
static unsigned long read_perf_counter(void)
{
    unsigned long counter_value;
    // __asm__ rather than asm: the plain keyword is not available when the
    // compiler runs in a strict ISO mode such as -std=c11
    __asm__ volatile (PERF_COUNTER_READ_INSN " %0" : "=r" (counter_value));
    return counter_value;
}
|
||
/** Result record of a softmax benchmark run */
typedef struct {
    unsigned long perf_count;  // performance counter value
    double max_abs_error;      // maximum absolute error
    double max_rel_error;      // maximum relative error
    double mean_rel_error;     // mean relative error
    double error_norm2;        // error norm 2
} softmax_bench_result_t;

/** Merge the result of a new run into an accumulated result
 *
 * Maxima are kept as maxima; perf_count and mean_rel_error are summed;
 * error_norm2 is increased by the square of the new run's max_rel_error.
 *
 * @param res accumulated result so far
 * @param new_result result of the latest run
 * @return updated accumulated result
 */
static softmax_bench_result_t accumulate_bench_result(softmax_bench_result_t res, softmax_bench_result_t new_result) {
    const double squared_max_rel = new_result.max_rel_error * new_result.max_rel_error;

    softmax_bench_result_t merged = res;
    merged.max_abs_error = (new_result.max_abs_error > res.max_abs_error) ? new_result.max_abs_error
                                                                          : res.max_abs_error;
    merged.max_rel_error = (new_result.max_rel_error > res.max_rel_error) ? new_result.max_rel_error
                                                                          : res.max_rel_error;
    merged.perf_count = res.perf_count + new_result.perf_count;
    merged.error_norm2 = res.error_norm2 + squared_max_rel;
    merged.mean_rel_error = res.mean_rel_error + new_result.mean_rel_error;

    return merged;
}
|
||
|
||
/** Print the n elements of a binary32 array in hexadecimal floating-point
 *  notation on a single line, followed by a newline */
static void array_dump_fp32(float *array, size_t n)
{
    size_t idx = 0;
    while (idx < n) {
        printf(" %a ", array[idx]);
        idx++;
    }
    printf("\n");
}
|
||
/** Print the n elements of a binary64 array in hexadecimal floating-point
 *  notation on a single line, followed by a newline */
static void array_dump_fp64(double *array, size_t n)
{
    size_t idx = 0;
    while (idx < n) {
        printf(" %a ", array[idx]);
        idx++;
    }
    printf("\n");
}
Oops, something went wrong.