[softmax] basic implementation of softmax based on RVV (#5)

* Softmax bench prototyping * Adding baseline benchmark * Prototype of RVV implementation * Introducing fully vectorized implementation * Minor optimizations to full RVV scheme * Intrinsics and dead code cleanup * Implementing multi-size array benchmark * Adding accuracy benchmark for scalar expf implementation * Refactoring benchmarks to execute every test on multiple input sets * Making golden model more stable (subtraction of max input) * Integrating max offset subtraction in quick_dirty_vector_expf * Introducing POLY_DEGREE macro to configure quick_dirty_expf polynomial degree (max: 6) * Extending exponential benchmark * Adding README.md file * Adding missing fesetround call
nibrunie · Feb 12, 2024 · 78f1d34 · 78f1d34
1 parent b2e7911
commit 78f1d34
Show file tree

Hide file tree

Showing 7 changed files with 801 additions and 0 deletions.
diff --git a/src/softmax/Makefile b/src/softmax/Makefile
@@ -0,0 +1,48 @@
+# Makefile for RISC-V In a Nutshell course (RVVIAN) examples
+
+# RISC-V C Compiler
+# available options (the version used must support RVV intrinsics)
+# clang/llvm (used below to compile the softmax object files)
+RISCVCLANG=clang  --target=riscv64
+# RISCVCC=clang  --target=riscv64
+# GNU Compiler Collection (GCC) (used below to build and link the benchmark executables)
+RISCVCC=riscv64-unknown-elf-gcc 
+
+# extra compiler flags; selects the performance counter (instret by default)
+EXTRA_CFLAGS?=-DCOUNT_INSTRET
+
+# architectural parameters for the simulation
+# width of vector registers (VLEN)
+VLEN?=128
+
+# path to proxy-kernel (pk)
+PK_PATH=/opt/riscv/riscv64-unknown-elf/bin/pk64 
+
+# SIMULATOR
+# Available options in the Docker (uncomment one)
+SIMULATOR=spike --isa=rv64gcv_zicntr_zihpm --varch=vlen:$(VLEN),elen:64 $(PK_PATH)
+# SIMULATOR=qemu-riscv64 -cpu rv64,v=on,vext_spec=v1.0,vlen=128,rvv_ta_all_1s=on
+
+INCLUDE_DIR ?= /opt/riscv/riscv64-unknown-elf/include/
+
+softmax_baseline.o: softmax_baseline.c
+	$(RISCVCLANG) $(EXTRA_CFLAGS) -I./ -I$(INCLUDE_DIR)  -O2 -march=rv64gcv -c -o $@ $^
+
+softmax_rvv.o: softmax_rvv.c
+	$(RISCVCLANG) $(EXTRA_CFLAGS) -I./ -I$(INCLUDE_DIR)  -O2 -march=rv64gcv -c -o $@ $^
+
+bench_softmax: bench_softmax.c softmax_baseline.o softmax_rvv.o
+	 $(RISCVCC) $(EXTRA_CFLAGS) -I./ -O2 -march=rv64gcv $^ -lm -o $@
+
+bench_exp: bench_expf.c softmax_baseline.o softmax_rvv.o
+	 $(RISCVCC) $(EXTRA_CFLAGS) -I./ -O2 -march=rv64gcv $^ -lm -o $@
+
+sim_bench_softmax: bench_softmax
+	$(SIMULATOR) $^
+
+sim_bench_exp: bench_exp
+	$(SIMULATOR) $^
+
+# remove every build product, including both benchmark executables
+clean:
+	rm -f *.o bench_softmax bench_exp
+
+# targets that do not produce a file of the same name
+.PHONY: sim_bench_softmax sim_bench_exp clean
diff --git a/src/softmax/README.md b/src/softmax/README.md
@@ -0,0 +1,43 @@
+# Softmax
+
+The examples in this directory implement multiple versions of a softmax activation function on a 1D array.
+Those examples are designed to illustrate multiple ways the RISC-V Vector Extension (RVV) can
+be used to perform such an operation.
+
+# Running example in a docker container
+
+All the examples in this directory can be built and executed using a docker container built
+from https://github.com/nibrunie/rvv-examples/blob/main/riscv-toolchain.Dockerfile.
+You can follow the instructions in https://github.com/nibrunie/rvv-examples/blob/main/README.md for more 
+information.
+
+# How to build and execute softmax benchmarks
+
+Once inside a proper build/execution environment, you can change the current directory to
+be `src/softmax` and execute a simple `make` command to build and run the examples:
+
+```
+make clean
+make sim_bench_softmax EXTRA_CFLAGS="-DVERBOSE"
+```
+
+It is possible to modify the build configuration with the `EXTRA_CFLAGS` environment variable,
+for example to count cycles rather than retired instructions you can run:
+```
+make clean
+make sim_bench_softmax EXTRA_CFLAGS="-DVERBOSE -DCOUNT_CYCLE"
+```
+
+You can also reduce the verbosity level by removing `-DVERBOSE`, and you can change the number of tests executed for each benchmark with `-DNUM_TESTS=<num-of-tests>`.
+
+# How to build and execute exponential accuracy benchmarks
+
+This directory contains an accuracy test bench for various scalar implementations of the exponential function.
+
+Once inside a proper build/execution environment, you can change the current directory to
+be `src/softmax` and execute a simple `make` command to build and run the examples:
+
+```
+make clean
+make sim_bench_exp 
+```
diff --git a/src/softmax/bench_expf.c b/src/softmax/bench_expf.c
@@ -0,0 +1,70 @@
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+
+/** Scalar exponential implementation under test; only declared here. */
+float quick_dirty_expf(float x);
+
+/** Array exponential implementation under test; only declared here.
+ *  NOTE(review): max_x is presumably an offset subtracted from every input - confirm against the definition.
+ */
+float quick_dirty_vector_expf(float* dst, float* src, float max_x, size_t n);
+
+/** Adapter exposing quick_dirty_vector_expf through a scalar float(float) signature.
+ *
+ *  The input is replicated into a 4-element array, the array routine is called
+ *  with max_x = 0 and the last element of the output array is returned.
+ *
+ *  @param x input value
+ *  @return last element written by quick_dirty_vector_expf for the replicated input
+ */
+float quick_dirty_expf_from_vector(float x) {
+    enum { LANES = 4 };
+    float inputs[LANES];
+    float outputs[LANES] = {0};
+    int lane;
+
+    // every lane receives the same value
+    for (lane = 0; lane < LANES; ++lane) {
+        inputs[lane] = x;
+    }
+
+    quick_dirty_vector_expf(outputs, inputs, 0.f, LANES);
+    return outputs[LANES - 1];
+}
+
+typedef float (exp_implementation_t)(float); // function type of every exponential under test: float in, float out
+
+typedef struct { // one entry of the accuracy benchmark table built in main()
+    exp_implementation_t* func; // implementation evaluated on each sampled input
+    double max_rel_error; // largest relative error recorded by main() for this implementation
+    float max_rel_error_input; // input value for which max_rel_error was recorded
+    char label[100]; // name printed in the final report
+} bench_exp_t;
+
+/** Accuracy benchmark for the exponential implementations.
+ *
+ *  Draws `steps` pseudo-random inputs in [start, stop], evaluates every
+ *  implementation of the benchmark table on each of them and compares the
+ *  result against the double precision exp(). The signed relative error of
+ *  every evaluation is printed (one line per input, one column per
+ *  implementation), followed by a report of the largest relative error
+ *  magnitude reached by each implementation.
+ *
+ *  @return 0
+ */
+int main(void) {
+    srand(17); // fixed seed: the sampled inputs are identical from run to run
+    const float start = -2.0f, stop = 2.0f;
+    const int steps = 128;
+
+    bench_exp_t benchmarks[] = {
+        (bench_exp_t) {.func = expf,             .label="baseline expf"},
+        (bench_exp_t) {.func = quick_dirty_expf, .label="quick_dirty_expf"},
+        (bench_exp_t) {.func = quick_dirty_expf_from_vector, .label="quick_dirty_vector_expf"},
+    };
+    const size_t benchCount = sizeof(benchmarks) / sizeof(benchmarks[0]);
+
+    for (size_t benchId = 0; benchId < benchCount; benchId++) {
+        benchmarks[benchId].max_rel_error = 0.;
+        benchmarks[benchId].max_rel_error_input = 0.f;
+    }
+
+    for (int i = 0; i < steps; ++i) {
+        float x = start + (stop - start) * rand() / (float) RAND_MAX;
+        double golden = exp(x);
+
+        for (size_t benchId = 0; benchId < benchCount; benchId++) {
+            float result = benchmarks[benchId].func(x);
+            double rel_error = (result - golden) / golden;
+            // the maximum is tracked on the magnitude: a signed comparison would
+            // silently ignore every result smaller than the golden value
+            double rel_error_magnitude = fabs(rel_error);
+            if (rel_error_magnitude > benchmarks[benchId].max_rel_error) {
+                benchmarks[benchId].max_rel_error = rel_error_magnitude;
+                benchmarks[benchId].max_rel_error_input = x;
+            }
+            printf("%.5e ", rel_error);
+        }
+        printf("\n");
+    }
+
+    // final report: worst case of each implementation and the input that triggers it
+    for (size_t benchId = 0; benchId < benchCount; benchId++) {
+        printf("%s:             max relative error is %.5e, it is reached for expf(%a)=%a vs exp(%a)=%a\n",
+               benchmarks[benchId].label, benchmarks[benchId].max_rel_error,
+               benchmarks[benchId].max_rel_error_input, benchmarks[benchId].func(benchmarks[benchId].max_rel_error_input),
+               benchmarks[benchId].max_rel_error_input, exp(benchmarks[benchId].max_rel_error_input));
+    }
+
+    return 0;
+}
diff --git a/src/softmax/bench_softmax.c b/src/softmax/bench_softmax.c
@@ -0,0 +1,144 @@
+// file: bench_softmax.c
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+#include <bench_softmax_utils.h>
+
+
+/** Declaring the softmax benchmark entry points registered in the table of main(); none of them is defined in this file **/
+softmax_bench_result_t softmax_baseline_fp32_bench(float* dst, float* src, double* golden, size_t n);
+
+softmax_bench_result_t softmax_rvv_norm_fp32_bench(float* dst, float* src, double* golden, size_t n); 
+
+softmax_bench_result_t softmax_rvv_fp32_bench(float* dst, float* src, double* golden, size_t n); 
+
+softmax_bench_result_t softmax_stable_rvv_fp32_bench(float* dst, float* src, double* golden, size_t n);
+
+softmax_bench_result_t softmax_scalar_quick_dirty_expf_fp32_bench(float* dst, float* src, double* golden, size_t n);
+
+typedef softmax_bench_result_t (softmax_bench_func_t)(float* dst, float* src, double* golden, size_t n); // signature shared by all the entry points above
+
+/** Descriptor structure for softmax benchmark: one entry of the table iterated by main() */
+typedef struct {
+    softmax_bench_func_t* bench; // benchmark entry point called once per input set
+    softmax_bench_result_t result; // results accumulated by main() over the input sets of one array size
+    char label[100]; // name printed next to the results
+} softmax_bench_t;
+
+
+extern void softmax_golden_fp32_fp64(double* dst, float* src, size_t n); // main() uses its output as the `golden` argument of every benchmark
+
+#ifndef NUM_TESTS
+#define NUM_TESTS 100 // default number of random input sets per array size
+#endif
+
+
+/** Softmax benchmark driver.
+ *
+ *  For each array size of testSizes, NUM_TESTS random input sets are generated
+ *  in [RAND_LOWER_BOUND, RAND_LOWER_BOUND + RAND_RANGE]. The golden result is
+ *  computed for each set with softmax_golden_fp32_fp64 and every benchmark of
+ *  the table is executed on it. Results are accumulated per benchmark with
+ *  accumulate_bench_result, then averaged and displayed once per array size
+ *  (detailed display when VERBOSE is defined, one condensed line otherwise).
+ *
+ *  @return 0
+ */
+int main(void) {
+    softmax_bench_t benchmarks[] = {
+        (softmax_bench_t){.bench = softmax_baseline_fp32_bench,                .label="baseline n-element softmax"},
+        (softmax_bench_t){.bench = softmax_scalar_quick_dirty_expf_fp32_bench, .label="scalar quick_dirty_expf n-element softmax"},
+        (softmax_bench_t){.bench = softmax_rvv_norm_fp32_bench,                .label="rvv-based (norm only) n-element softmax"},
+        (softmax_bench_t){.bench = softmax_rvv_fp32_bench,                     .label="rvv-based n-element softmax"},
+        (softmax_bench_t){.bench = softmax_stable_rvv_fp32_bench,              .label="rvv-based n-element stable softmax"},
+    };
+    const size_t benchCount = sizeof(benchmarks) / sizeof(benchmarks[0]);
+
+    size_t testSizes[] = {4, 16, 17, 32, 33, 128, 129, 511, 512, 1024, 2048};
+    for (size_t testId = 0; testId < sizeof(testSizes) / sizeof(testSizes[0]); testId++)
+    {
+        size_t n = testSizes[testId];
+#       ifdef VERBOSE 
+        printf("--------------------------------------------------------------------------------\n");
+        printf("--------------------------------------------------------------------------------\n");
+        printf("Benchmarking softmax on a %zu-element array.\n", n);
+#       endif
+        float* src = malloc(n * sizeof *src);
+        float* dst = malloc(n * sizeof *dst);
+        double* golden = malloc(n * sizeof *golden);
+        assert(src);
+        assert(dst);
+        assert(golden);
+
+        // reset benchmark results (every accumulated field, so that nothing
+        // leaks from one array size into the next one)
+        for (size_t benchId = 0; benchId < benchCount; benchId++)
+        {
+            benchmarks[benchId].result.max_abs_error = 0.;
+            benchmarks[benchId].result.max_rel_error = 0.;
+            benchmarks[benchId].result.mean_rel_error = 0.;
+            benchmarks[benchId].result.perf_count = 0;
+            benchmarks[benchId].result.error_norm2 = 0.;
+        }
+
+        const float RAND_LOWER_BOUND = -2.f;
+        const float RAND_RANGE = 4.f;
+
+        for (int j = 0; j < NUM_TESTS; ++j) {
+            // random initialization of the input arrays
+            for (size_t i = 0; i < n; ++i) {
+                src[i] = RAND_RANGE * rand() / (float) RAND_MAX + RAND_LOWER_BOUND;
+            }
+
+            // computing golden value
+            softmax_golden_fp32_fp64(golden, src, n);
+
+#           ifdef VERY_VERBOSE
+            printf("source matrix:\n");
+            array_dump_fp32(src, n);
+            printf("golden result:\n");
+            array_dump_fp64(golden, n);
+#           endif // VERY_VERBOSE
+
+            // softmax benchmarks. iterating over all existing implementation for this given input set
+            for (size_t benchId = 0; benchId < benchCount; benchId++)
+            {
+                // resetting the whole array in-between experiments
+                // (dst is a pointer: sizeof(dst) would only cover the pointer itself)
+                memset(dst, 0, n * sizeof *dst);
+
+                softmax_bench_result_t local_result = benchmarks[benchId].bench(dst, src, golden, n);
+
+                benchmarks[benchId].result = accumulate_bench_result(benchmarks[benchId].result, local_result);
+
+#               ifdef VERY_VERBOSE
+                printf("%s result:\n", benchmarks[benchId].label);
+                array_dump_fp32(dst, n);
+#               endif // VERY_VERBOSE
+
+            }
+        }
+
+        // display results
+        for (size_t benchId = 0; benchId < benchCount; benchId++)
+        {
+            softmax_bench_result_t bench_result = benchmarks[benchId].result;
+            // sums become per-input-set averages, the sum of squares becomes a norm
+            bench_result.perf_count = bench_result.perf_count / NUM_TESTS;
+            bench_result.mean_rel_error = bench_result.mean_rel_error / NUM_TESTS;
+            bench_result.error_norm2 = sqrt(bench_result.error_norm2);
+
+
+#           ifdef VERBOSE 
+            printf("--------------------------------------------------------------------------------\n");
+            printf("%s used %lu " PERF_METRIC "(s) to evaluate softmax on a %zu-element array.\n",
+                benchmarks[benchId].label, bench_result.perf_count, n);
+            printf(" " PERF_METRIC " per elements:    %.3f\n", (double) bench_result.perf_count / n);
+            printf("  element(s) per " PERF_METRIC ": %.3f\n", (double) n / bench_result.perf_count);
+            printf("  max absolute error:  %.4a\n", bench_result.max_abs_error);
+            printf("  max relative error:  %.4a\n", bench_result.max_rel_error);
+            printf("  mean relative error: %.4a\n", bench_result.mean_rel_error);
+            printf("  error norm 2:       %.4a\n", bench_result.error_norm2);
+#           else
+            // condensed display
+            printf("%s, %zu, %lu, %.3e, %.3e, %.3e %.3e\n", 
+                   benchmarks[benchId].label, n, bench_result.perf_count,
+                   bench_result.max_abs_error, bench_result.max_rel_error, bench_result.error_norm2, bench_result.mean_rel_error);
+#           endif
+        }
+
+        free(src);
+        free(dst);
+        free(golden);
+    }
+
+
+    return 0;
+}
diff --git a/src/softmax/bench_softmax_utils.h b/src/softmax/bench_softmax_utils.h
@@ -0,0 +1,69 @@
+#include <stddef.h>
+#include <stdio.h>
+#include <math.h>
+
+/** generic type for a binary32/float softmax implementation (presumably n inputs read from src, n outputs written to dst - confirm against the implementations) */
+typedef void(softmax_func_t)(float* dst, float* src, size_t n);
+
+/** Read the performance counter selected at build time.
+ *
+ * The selection is made with a macro, which also defines PERF_METRIC (the
+ * counter name used in the reports):
+ * - COUNT_INSTRET, or no selection at all: instret counter, i.e. the number
+ *   of retired (executed) instructions, PERF_METRIC is "instruction";
+ * - COUNT_CYCLE alone: cycle counter, PERF_METRIC is "cycle".
+ * COUNT_INSTRET wins when both macros are defined.
+ *
+ * @return current value of the selected counter
+*/
+static unsigned long read_perf_counter(void)
+{
+#if defined(COUNT_CYCLE) && !defined(COUNT_INSTRET)
+#define PERF_METRIC "cycle"
+  unsigned long cycles;
+  asm volatile ("rdcycle %0" : "=r" (cycles));
+  return cycles;
+#else
+  // explicit COUNT_INSTRET request and default case share the same counter
+#define PERF_METRIC "instruction"
+  unsigned long retired;
+  asm volatile ("rdinstret %0" : "=r" (retired));
+  return retired;
+#endif
+}
+
+typedef struct { // result of a softmax benchmark, also used as accumulator by accumulate_bench_result
+    unsigned long perf_count; // perf counter measurement (summed when accumulated); presumably a read_perf_counter() difference - confirm in the benchmarks
+    double max_abs_error; // maximum absolute error (running maximum when accumulated)
+    double max_rel_error; // maximum relative error (running maximum when accumulated)
+    double mean_rel_error; // mean relative error (summed when accumulated)
+    double error_norm2; // when accumulated: sum of the squares of the max_rel_error of each added result
+} softmax_bench_result_t;
+
+/** Fold the result of one benchmark run into an accumulator.
+ *
+ *  - max_abs_error and max_rel_error keep the larger of both values;
+ *  - perf_count and mean_rel_error are summed;
+ *  - error_norm2 is increased by the square of new_result.max_rel_error.
+ *
+ *  @param res accumulator (received by value)
+ *  @param new_result result to add to the accumulator
+ *  @return updated accumulator
+ */
+static softmax_bench_result_t accumulate_bench_result(softmax_bench_result_t res, softmax_bench_result_t new_result) {
+  const double run_rel_error = new_result.max_rel_error;
+
+  // summed metrics
+  res.perf_count = res.perf_count + new_result.perf_count;
+  res.mean_rel_error = res.mean_rel_error + new_result.mean_rel_error;
+  res.error_norm2 = res.error_norm2 + run_rel_error * run_rel_error;
+
+  // running maxima
+  res.max_abs_error = (new_result.max_abs_error > res.max_abs_error) ? new_result.max_abs_error : res.max_abs_error;
+  res.max_rel_error = (run_rel_error > res.max_rel_error) ? run_rel_error : res.max_rel_error;
+
+  return res;
+}
+
+
+/** Print the n elements of a binary32 array on a single line of stdout,
+ *  each in hexadecimal floating-point notation (%a) surrounded by spaces.
+ *
+ *  @param array elements to print
+ *  @param n number of elements to print
+ */
+static void array_dump_fp32(float *array, size_t n)
+{
+    size_t idx = 0;
+    while (idx < n) {
+        printf(" %a ", array[idx]);
+        idx += 1;
+    }
+    printf("\n");
+}
+
+/** Print the n elements of a binary64 array on a single line of stdout,
+ *  each in hexadecimal floating-point notation (%a) surrounded by spaces.
+ *
+ *  @param array elements to print
+ *  @param n number of elements to print
+ */
+static void array_dump_fp64(double *array, size_t n)
+{
+    size_t idx = 0;
+    while (idx < n) {
+        printf(" %a ", array[idx]);
+        idx += 1;
+    }
+    printf("\n");
+}