diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..5c75f7645 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,46 @@ +--- +Checks: > + bugprone-*, + cppcoreguidelines-*, + performance-*, + readability-*, + clang-analyzer-*, + misc-unused-parameters, + + -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, + -cppcoreguidelines-pro-type-vararg, + -cppcoreguidelines-avoid-non-const-global-variables, + +WarningsAsErrors: '' +HeaderFileExtensions: + - '' + - h + - hh + - hpp + - hxx +ImplementationFileExtensions: + - c + - cc + - cpp + - cxx +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +FormatStyle: file +CheckOptions: + cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU' + google-readability-namespace-comments.ShortNamespaceLines: '10' + cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;' + llvm-else-after-return.WarnOnUnfixable: 'false' + cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false' + google-readability-namespace-comments.SpacesBeforeComments: '2' + cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true' + google-readability-braces-around-statements.ShortStatementLines: '1' + google-readability-function-size.StatementThreshold: '800' + llvm-qualified-auto.AddConstToQualified: 'false' + llvm-else-after-return.WarnOnConditionVariables: 'false' + cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false' + cppcoreguidelines-avoid-do-while.IgnoreMacros: 'true' +SystemHeaders: false +...
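The config above enables the bugprone-*, cppcoreguidelines-*, performance-*, readability-* and clang-analyzer-* groups tree-wide; findings that are intentional are silenced inline with NOLINT markers, as later hunks in this patch do (main_wrapper.c, entrypoints.cpp). A minimal sketch of that pattern, using a hypothetical helper that is not part of this patch:

    // Hypothetical example of suppressing a single finding from the config above.
    #include <cstdint>

    typedef void (*init_func)(void);

    static int32_t count_entries(init_func *start, init_func *end) {
      // The pointer difference is a ptrdiff_t; the implicit conversion to
      // int32_t would trip *-narrowing-conversions, so it is suppressed here.
      // NOLINTNEXTLINE(*-narrowing-conversions)
      int32_t len = end - start;
      return len;
    }
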
+ diff --git a/.clangd b/.clangd new file mode 100644 index 000000000..a6158c2f4 --- /dev/null +++ b/.clangd @@ -0,0 +1,29 @@ +CompileFlags: + Add: [ + -std=c++17, + -D__riscv, + -DNUM_CORES, + -DNUM_CORES_PER_TILE, + -DNUM_GROUPS, + -DSTACK_SIZE, + -DXQUEUE_SIZE, + -DSEQ_MEM_SIZE, + -DBANKING_FACTOR, + -DETL_CHECK_PUSH_POP, + -DETL_LOG_ERRORS, + -DETL_VERBOSE_ERRORS + ] + +--- + +If: + PathMatch: [.*/runtime/.*/.*] +CompileFlags: + Add: [-xc++, -I..] + +--- + +If: + PathMatch: [.*/tests/.*/.*] +CompileFlags: + Add: [-xc++, -I.., -I../../runtime/] diff --git a/CHANGELOG.md b/CHANGELOG.md index a645480ef..cea01b64d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Add `apb` dependency of version 0.2.4 - Add support for the `FENCE` instruction - Add support for DRAMsys5.0 co-simulation +- Add KMP runtime +- Add new runtime testing infrastructure +- Add automated benchmark running and plotting +- Add clangd and clang-tidy configs +- Add Banshee config +- Add OpenMP sections and teams tests ### Changes - Add physical feasible TeraPool configuration with SubGroup hierarchy. @@ -33,12 +39,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Update `common_verification` to 0.2.3 - Update `register_interface` to 0.4.3 - Updated Halide to version 15 +- Include SP in Snitch traces to aid in building better tracevis +- Make the `tracevis` target depend on the `trace` target and use `-cf` +- Rename the `omp` runtime directory to `gomp` +- Link with `lld` when using clang ### Fixed - Fix type issue in `snitch_addr_demux` - Properly disable the debugging CSRs in ASIC implementations - Fix a bug in the DMA's distributed midend - Fix bugs in radix2, radix4by2 parallelization and loading of data for radix4 CFFT +- Fix building verilator on newer clang and gcc versions +- Fix tracing targets missing some harts +- Fix clang warnings about the .comment section ## 0.6.0 - 2023-01-09 diff --git a/Makefile b/Makefile index 720e628b2..668513222 100644 --- a/Makefile +++ b/Makefile @@ -90,7 +90,7 @@ tc-llvm: -DCMAKE_INSTALL_PREFIX=$(LLVM_INSTALL_DIR) \ -DCMAKE_CXX_COMPILER=$(CXX) \ -DCMAKE_C_COMPILER=$(CC) \ - -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_ENABLE_PROJECTS="clang;lld" \ -DLLVM_TARGETS_TO_BUILD="RISCV;host" \ -DLLVM_BUILD_DOCS="0" \ -DLLVM_ENABLE_BINDINGS="0" \ @@ -146,7 +146,7 @@ $(BENDER_INSTALL_DIR)/bender: verilator: $(VERILATOR_INSTALL_DIR)/bin/verilator $(VERILATOR_INSTALL_DIR)/bin/verilator: toolchain/verilator Makefile cd $<; unset VERILATOR_ROOT; \ - autoconf && CC=$(CLANG_CC) CXX=$(CLANG_CXX) CXXFLAGS=$(CLANG_CXXFLAGS) LDFLAGS=$(CLANG_LDFLAGS) ./configure --prefix=$(VERILATOR_INSTALL_DIR) $(VERILATOR_CI) && \ + autoconf && CC=$(CLANG_CC) CXX=$(CLANG_CXX) CXXFLAGS="$(CLANG_CXXFLAGS) -include memory" LDFLAGS=$(CLANG_LDFLAGS) ./configure --prefix=$(VERILATOR_INSTALL_DIR) $(VERILATOR_CI) && \ make -j4 && make install # Update and patch hardware dependencies for MemPool diff --git a/banshee-config.yaml b/banshee-config.yaml new file mode 100644 index 000000000..b9e776dc3 --- /dev/null +++ b/banshee-config.yaml @@ -0,0 +1,53 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +--- +address: + scratch_reg: 0x40000000 + wakeup_reg: 0x40000004 + tcdm_start: 0x40000008 + tcdm_end: 0x4000000C + nr_cores: 0x40000010 + uart: 0xC0000000 + # Not supported in MemPool + barrier_reg: + start: 0x50000000 + offset: 0x100000 + cluster_base_hartid: 0x50000001 + cluster_num: 0x50000002 + cluster_id: 0x50000003 + cl_clint: 0x40000060 + clint: 0xFFFF0000 +memory: + tcdm: + start: 0x0 + size: 0x100000 + offset: 0x100000 + latency: 5 + dram: + start: 0x80000000 + size: 0x01000000 + offset: 0x0 + latency: 10 + periphs: + start: 0x40000000 + size: 0x20000 + offset: 0x0 + latency: 5 + callbacks: + - name: zero-memory + size: 0x10000 + - name: mempool-dma + size: 0x1C +inst_latency: + mul: 3 + mulh: 3 + mulhsu: 3 + mulhu: 3 + div: 3 + divu: 3 + rem: 3 + remu: 3 +ssr: + num_dm: 3 diff --git a/config/minpool-no-xpulp.mk b/config/minpool-no-xpulp.mk new file mode 100644 index 000000000..2d056fb4d --- /dev/null +++ b/config/minpool-no-xpulp.mk @@ -0,0 +1,14 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Matheus Cavalcante, ETH Zurich + +############### +## MinPool ## +############### + +include $(MEMPOOL_DIR)/config/minpool.mk + +# Disable Xpulpimg +xpulpimg ?= 0 diff --git a/hardware/Makefile b/hardware/Makefile index 6de18eb4f..0b9f41383 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -15,6 +15,9 @@ TOOLCHAIN_DIR := $(abspath $(ROOT_DIR)/../toolchain) config_mk = $(abspath $(ROOT_DIR)/../config/config.mk) include $(config_mk) +# Banshee config +banshee_config = $(MEMPOOL_DIR)/banshee-config.yaml + # build path buildpath ?= build resultpath ?= results @@ -319,6 +322,13 @@ verilate: $(VERILATOR_EXE) $(buildpath) Makefile # Avoid capturing the return status when running the load-throughput analysis if [ $(tg) -ne 1 ]; then ./scripts/return_status.sh $(buildpath)/transcript; fi +################ +# Banshee # +################ + +banshee: + banshee --configuration $(banshee_config) --num-clusters 1 --num-cores $(num_cores) -l $(preload) + ############# # Lint # ############# @@ -340,11 +350,13 @@ spyglass/tmp/files: $(bender) trace_env += NUM_CORES=$(num_cores) trace_env += SEQ_MEM_SIZE=$(seq_mem_size) +.PHONY: benchmark trace log pre_trace post_trace tracevis + benchmark: log simcvcs # Call `make` again to get variable extension with all traces result_dir=$(result_dir) $(MAKE) trace -trace: pre_trace $(trace) post_trace +trace: post_trace log: mkdir -p "$(result_dir)" @@ -361,20 +373,20 @@ log: pre_trace: rm -rf $(tracepath) -post_trace: +post_trace: $(trace) mkdir -p "$(result_dir)" cp $(buildpath)/transcript "$(result_dir)/" | true cp $(traceresult) "$(result_dir)" cp $(trace) "$(result_dir)" $(python) $(ROOT_DIR)/scripts/gen_avg.py --folder "$(result_dir)" | tee $(result_dir)/avg.txt -$(buildpath)/%.trace: $(buildpath)/%.dasm +$(buildpath)/%.trace: $(buildpath)/%.dasm pre_trace mkdir -p $(tracepath) $(INSTALL_DIR)/riscv-isa-sim/bin/spike-dasm < $< > $(tracepath)/$* $(trace_env) $(python) $(ROOT_DIR)/scripts/gen_trace.py -p --csv $(traceresult) $(tracepath)/$* > $@ -tracevis: - $(MEMPOOL_DIR)/scripts/tracevis.py $(preload) $(buildpath)/*.trace -o $(buildpath)/tracevis.json +tracevis: $(trace) + $(MEMPOOL_DIR)/scripts/tracevis.py -cf $(preload) $(buildpath)/*.trace -o $(buildpath)/tracevis.json ############################ # Unit tests simulation # diff --git a/hardware/deps/snitch/src/snitch.sv 
b/hardware/deps/snitch/src/snitch.sv index 32c58ea0e..50a2ca34a 100644 --- a/hardware/deps/snitch/src/snitch.sv +++ b/hardware/deps/snitch/src/snitch.sv @@ -2415,7 +2415,7 @@ module snitch assign sp_new_value = gpr_wdata[i]; always_ff @(posedge clk_i or posedge rst_i) begin if (!rst_i && gpr_we[i] && gpr_waddr[i] == SP && csr_stack_limit_q != 32'hFFFF_FFFF && ($signed(sp_new_value) < $signed(csr_stack_limit_q))) begin - $warning("[Stackoverflow: Core %0d] Set SP to 0x%08h, limit is 0x%08h", hart_id_i, sp_new_value, csr_stack_limit_q); + $warning("[Stackoverflow: Core %0d, PC: %0d] Set SP to 0x%08h, limit is 0x%08h", hart_id_i, inst_addr_o, sp_new_value, csr_stack_limit_q); end end end diff --git a/hardware/scripts/gen_trace.py b/hardware/scripts/gen_trace.py index 1763f9151..0b091d9b8 100755 --- a/hardware/scripts/gen_trace.py +++ b/hardware/scripts/gen_trace.py @@ -28,9 +28,10 @@ GENERAL_WARN = ('WARNING: Inconsistent final state; performance metrics may ' 'be inaccurate. Is this trace complete?\n') -TRACE_IN_REGEX = r'(\d+)\s+(\d+)\s+(0x[0-9A-Fa-fz]+)\s+([^#;]*)(\s*#;\s*(.*))?' +TRACE_IN_REGEX = (r'(\d+)\s+(\d+)\s+(0x[0-9A-Fa-fz]+)\s+(0x[0-9A-Fa-fz]+)\s+' + r'([^#;]*)(\s*#;\s*(.*))?') -TRACE_OUT_FMT = '{:>8} {:>8} {:>10} {:<30}' +TRACE_OUT_FMT = '{:>8} {:>8} {:>8} {:>10} {:<30}' # -------------------- Tracer configuration -------------------- @@ -304,7 +305,7 @@ def annotate_insn( match = re.search(TRACE_IN_REGEX, line.strip('\n')) if match is None: raise ValueError('Not a valid trace line:\n{}'.format(line)) - time_str, cycle_str, pc_str, insn, _, extras_str = match.groups() + time_str, cycle_str, pc_str, sp_str, insn, _, extras_str = match.groups() time_info = (int(time_str), int(cycle_str)) show_time_info = (dupl_time_info or time_info != last_time_info) time_info_strs = tuple((str(elem) if show_time_info else '') @@ -333,12 +334,13 @@ def annotate_insn( else: prev_wfi_time = 0 return ((TRACE_OUT_FMT + ' #; {}').format(*time_info_strs, - pc_str, insn, annot), + pc_str, sp_str, insn, annot), time_info, prev_wfi_time, retired_reg, empty) # Vanilla trace else: - return TRACE_OUT_FMT.format( - *time_info_strs, pc_str, insn), time_info, 0, retired_reg, False + return (TRACE_OUT_FMT.format( + *time_info_strs, pc_str, sp_str, insn), time_info, 0, retired_reg, + False) # -------------------- Performance metrics -------------------- diff --git a/hardware/src/mempool_cc.sv b/hardware/src/mempool_cc.sv index 96134460e..1e3579380 100644 --- a/hardware/src/mempool_cc.sv +++ b/hardware/src/mempool_cc.sv @@ -406,8 +406,8 @@ module mempool_cc extras_fpu = $sformatf("%s}", extras_fpu); $timeformat(-9, 0, "", 10); - $sformat(trace_entry, "%t %8d 0x%h DASM(%h) #; %s\n", - $time, cycle, i_snitch.pc_q, i_snitch.inst_data_i, extras_str); + $sformat(trace_entry, "%t %8d 0x%h 0x%h DASM(%h) #; %s\n", + $time, cycle, i_snitch.pc_q, i_snitch.i_snitch_regfile.mem[2], i_snitch.inst_data_i, extras_str); $fwrite(f, trace_entry); end diff --git a/scripts/license-checker.hjson b/scripts/license-checker.hjson index eea7ec365..52f2ee8de 100644 --- a/scripts/license-checker.hjson +++ b/scripts/license-checker.hjson @@ -16,7 +16,7 @@ 'software/runtime/printf*' 'software/runtime/encoding.h' 'software/runtime/mempool_dma_frontend.h' - 'software/runtime/omp/libgomp.h' + 'software/runtime/gomp/libgomp.h' 'software/riscv-tests/*' 'hardware/deps/*' 'hardware/tb/dpi/elfloader.cpp' diff --git a/scripts/tracevis.py b/scripts/tracevis.py index 66f1e2968..a379b5295 100755 --- a/scripts/tracevis.py +++ b/scripts/tracevis.py @@ 
-45,10 +45,12 @@ # 1 -> cycle # 2 -> privilege level (RTL) / hartid (banshee) # 3 -> pc (hex with 0x prefix) -# 4 -> instruction -# 5 -> args (RTL) / empty (banshee) -# 6 -> comment (RTL) / instruction arguments (banshee) -RTL_REGEX = r' *(\d+) +(\d+) +([3M1S0U]?) *(0x[0-9a-f]+) ([.\w]+) +(.+)#; (.*)' +# 4 -> sp (hex with 0x prefix) +# 5 -> instruction +# 6 -> args (RTL) / empty (banshee) +# 7 -> comment (RTL) / instruction arguments (banshee) +RTL_REGEX = (r' *(\d+) +(\d+) +([3M1S0U]?) *(0x[0-9a-f]+) *' + r'(0x[0-9a-f]+) ([.\w]+) +(.+)#; (.*)') BANSHEE_REGEX = r' *(\d+) (\d+) (\d+) ([0-9a-f]+) *.+ +.+# ([\w\.]*)( +)(.*)' # regex matches a line of instruction retired by the accelerator @@ -63,12 +65,14 @@ @lru_cache(maxsize=1024) def addr2line_cache(addr): - cmd = f'{addr2line} -e {elf} -f -a -i {addr:x}' + cmd = f'{addr2line} -C -e {elf} -f -a -i {addr:x}' return os.popen(cmd).read().split('\n') functions = [] -prev_func = "" +function_stack = [] +prev_sp = 0 +prev_func = None prev_ts = 0 start_benchmark = 0 @@ -108,14 +112,17 @@ def trace_instruction( f'"ts": {time}, ' f'"dur": {duration}, ' f'"pid": {pid}, ' - f'"tid": {functions.index(name)}, ' + f'"tid": {pid}, ' f'"args": {frame_args}' f'}},\n') output_file.write(frame) -def trace_function(name, pid, time, cyc, file): +def trace_function(name, pid, time, cyc, file, instr, sp): + global prev_sp + global prev_func + # Assemble values for json # Doc: # https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview @@ -129,7 +136,55 @@ def trace_function(name, pid, time, cyc, file): arg_cycles = cyc arg_coords = file - if prev_func != "": + sp = int(sp, base=16) + + if prev_func is None or name != prev_func: + if prev_func is not None and prev_func not in function_stack: + end_time = time if time > prev_ts else prev_ts + 1 + output_file.write( + f'{{' + f'"name": "{prev_func}", ' + f'"cat": "{cat}", ' + f'"ph": "E", ' + f'"ts": {end_time}, ' + f'"pid": {pid}, ' + f'"tid": {pid}' + f'}},\n' + ) + + # print(f'Stackless function {prev_func} ended') + + if name not in function_stack: + output_file.write(f'{{' + f'"name": "{name}", ' + f'"cat": "{cat}", ' + f'"ph": "B", ' + f'"ts": {time}, ' + f'"pid": {pid}, ' + f'"tid": {pid}, ' + f'"args": {{"time": "{arg_cycles}", ' + f' "Origin": "{arg_coords}"}}' + f'}},\n') + + # print(f'Begin {name}') + else: + pass + # print(f'Function {name} already in stack') + + elif sp < prev_sp: + function_stack.append(name) + # print(f'Pushed {name} to stack') + # print(f'Function stack: {function_stack}') + # print() + + elif sp > prev_sp and len(function_stack) > 0: + # pop prev function + prev_func = function_stack.pop() + + # print(f'Popped {prev_func} from stack') + # print(f'Function stack: {function_stack}') + # print() + end_time = time if time > prev_ts else prev_ts + 1 output_file.write( f'{{' @@ -138,25 +193,15 @@ def trace_function(name, pid, time, cyc, file): f'"ph": "E", ' f'"ts": {end_time}, ' f'"pid": {pid}, ' - f'"tid": {functions.index(prev_func)}' + f'"tid": {pid}' f'}},\n' ) - frame = (f'{{' - f'"name": "{name}", ' - f'"cat": "{cat}", ' - f'"ph": "B", ' - f'"ts": {time}, ' - f'"pid": {pid}, ' - f'"tid": {functions.index(name)}, ' - f'"args": {{"time": "{arg_cycles}", "Origin": "{arg_coords}"}}' - f'}},\n') - - output_file.write(frame) + prev_sp = sp + prev_func = name def flush(buf, hartid): - global prev_func global prev_ts global start_benchmark global output_file @@ -169,18 +214,18 @@ def flush(buf, hartid): a2ls += addr2line_cache(int(addr, base=16))[:-1] else: 
a2ls = os.popen( - f'{addr2line} -e {elf} -f -a -i {" ".join(pcs)}' + f'{addr2line} -C -e {elf} -f -a -i {" ".join(pcs)}' ).read().split('\n')[:-1] for i in range(len(buf)-1): - (time, cyc, priv, pc, instr, args, cmt) = buf.pop(0) + (time, cyc, priv, pc, sp, instr, args, cmt) = buf.pop(0) if use_time: next_time = int(buf[0][0]) time = int(time) else: next_time = int(buf[0][1]) - time = int(cyc) + time = float(cyc) / 1000 # Have lookahead time to this instruction? next_time = lah[time] if time in lah else next_time @@ -220,11 +265,8 @@ def flush(buf, hartid): functions.append(func) if compress_function: - if func != prev_func: - trace_function(name=func, pid=int(hartid), - time=time, cyc=cyc, file=file) - prev_func = func - prev_ts = time + trace_function(name=func, pid=int(hartid), + time=time, cyc=cyc, file=file, instr=instr, sp=sp) else: trace_instruction(name=func, pid=int(hartid), @@ -243,9 +285,9 @@ def parse_line(line, hartid): # print(line) match = re_line.match(line) if match: - (time, cyc, priv, pc, instr, args, cmt) = tuple( + (time, cyc, priv, pc, sp, instr, args, cmt) = tuple( [match.group(i+1).strip() for i in range(re_line.groups)]) - buf.append((time, cyc, priv, pc, instr, args, cmt)) + buf.append((time, cyc, priv, pc, sp, instr, args, cmt)) last_time, last_cyc = time, cyc if len(buf) > 10: @@ -263,7 +305,7 @@ def offload_lookahead(lines): for line in lines: match = re_line.match(line) if match: - (time, cyc, priv, pc, instr, args, cmt) = tuple( + (time, cyc, priv, pc, sp, instr, args, cmt) = tuple( [match.group(i+1).strip() for i in range(re_line.groups)]) time = int(time) if use_time else int(cyc) @@ -387,7 +429,6 @@ def offload_lookahead(lines): output_file.write('{"traceEvents": [\n') for filename in traces: - prev_func = "" prev_ts = 0 start_benchmark = 0 hartid = 0 @@ -425,31 +466,6 @@ def offload_lookahead(lines): print(f'=> Parsed {lines-fails} of {lines} lines', file=sys.stderr) - # Terminate last function all at end of file - if compress_function: - output_file.write( - f'{{' - f'"name": "{prev_func}", ' - f'"cat": "function", ' - f'"ph": "E", ' - f'"ts": {prev_ts+1}, ' - f'"pid": {hartid}, ' - f'"tid": {functions.index(prev_func)}' - f'}},\n' - ) - - # Write Metadata - for func in functions: - output_file.write( - f'{{' - f'"name": "thread_name", ' - f'"ph": "M", ' - f'"pid": {hartid}, ' - f'"tid": {functions.index(func)}, ' - f'"args": {{"name" : "{func}"}}' - f'}},\n' - ) - for i in range(hartid + 1): output_file.write( f'{{' diff --git a/software/apps/omp/Makefile b/software/apps/omp/Makefile index cece16052..efc9f44bc 100644 --- a/software/apps/omp/Makefile +++ b/software/apps/omp/Makefile @@ -9,12 +9,15 @@ SOFTWARE_DIR := $(abspath $(ROOT_DIR)/../..) APPS_DIR := $(ROOT_DIR) BIN_DIR := $(abspath $(SOFTWARE_DIR)/bin/$(subst $(SOFTWARE_DIR),,$(APPS_DIR))) RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) +COMPILER ?= llvm # OpenMP runtime -OMP_DIR ?= $(RUNTIME_DIR)/omp RISCV_CCFLAGS += -fopenmp -DNTHREADS=$(num_cores) RISCV_CCFLAGS += -I$(OMP_DIR) +# Wrap main function +RISCV_LDFLAGS += -Wl,-wrap,main + # This will overwrite the ROOT_DIR variable from the included makefile include $(RUNTIME_DIR)/runtime.mk diff --git a/software/apps/omp/barrier_benchmark/main.c b/software/apps/omp/barrier_benchmark/main.c new file mode 100644 index 000000000..02cab5771 --- /dev/null +++ b/software/apps/omp/barrier_benchmark/main.c @@ -0,0 +1,43 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "baremetal/mempool_conv2d_i32p.h" +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define MAX_BARRIERS 16 + +int main() { +#pragma omp parallel + { + unsigned int counter = 0; + unsigned int cycles = 0; + unsigned int start_time = 0; + + for (int i = 1; i < MAX_BARRIERS + 1; i++) { + + start_time = mempool_get_timer(); + mempool_start_benchmark(); + + for (int j = 0; j < i; j++) { +#pragma omp barrier + counter++; + } + + mempool_stop_benchmark(); + cycles = mempool_get_timer() - start_time; + +#pragma omp single + printf("%d barriers: %d cycles\n", i, cycles); + } + } + + return 0; +} diff --git a/software/apps/omp/barrier_conv/main.c b/software/apps/omp/barrier_conv/main.c index 5b0548057..47544704d 100644 --- a/software/apps/omp/barrier_conv/main.c +++ b/software/apps/omp/barrier_conv/main.c @@ -7,7 +7,7 @@ #include "baremetal/mempool_conv2d_i32p.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -149,37 +149,40 @@ void conv_gomp_barrier(uint32_t core_id, uint32_t num_cores) { } int main() { - mempool_timer_t cycles, start_time; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - - if (core_id == 0) { - printf("Start Barrier Benchmark\n"); - } +#pragma omp parallel + { + mempool_timer_t cycles, start_time; + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + if (core_id == 0) { + printf("Start Barrier Benchmark\n"); + } #pragma omp barrier - start_time = mempool_get_timer(); - mempool_start_benchmark(); - conv_mempool_barrier(core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer(); - - if (core_id == 0) { - printf("Mempool barrier cycles: %d\n", cycles - start_time); - } + start_time = mempool_get_timer(); + mempool_start_benchmark(); + conv_mempool_barrier(core_id, num_cores); + mempool_stop_benchmark(); + cycles = mempool_get_timer(); + + if (core_id == 0) { + printf("Mempool barrier cycles: %d\n", cycles - start_time); + } - mempool_barrier(num_cores); + mempool_barrier(num_cores); - start_time = mempool_get_timer(); - mempool_start_benchmark(); - conv_gomp_barrier(core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer(); + start_time = mempool_get_timer(); + mempool_start_benchmark(); + conv_gomp_barrier(core_id, num_cores); + mempool_stop_benchmark(); + cycles = mempool_get_timer(); - if (core_id == 0) { - printf("GOMP barrier cycles: %d\n", cycles - start_time); - } + if (core_id == 0) { + printf("GOMP barrier cycles: %d\n", cycles - start_time); + } #pragma omp barrier + } return 0; } diff --git a/software/apps/omp/critical_benchmark/main.c b/software/apps/omp/critical_benchmark/main.c index b13287abf..54d9fb2c1 100644 --- a/software/apps/omp/critical_benchmark/main.c +++ b/software/apps/omp/critical_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -68,35 +68,33 @@ void omp_parallel_critical() { } int main() { - uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + uint32_t core_id = mempool_get_core_id(); - if (core_id == 0) { - printf("Initialize\n"); - *lock 
= 0; - result = 0; - } + // Initialize synchronization variables + mempool_barrier_init(core_id); - mempool_barrier(num_cores); - parallel_critical_manual(); - mempool_barrier(num_cores); + if (core_id == 0) { + printf("Initialize\n"); + *lock = 0; + result = 0; + } - result = 0; + mempool_barrier(num_cores); + parallel_critical_manual(); + mempool_barrier(num_cores); + + result = 0; + } /* OPENMP IMPLEMENTATION */ - if (core_id == 0) { - mempool_wait(4 * num_cores); - omp_parallel_critical(); - mempool_wait(100 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + mempool_wait(4 * num_cores); + omp_parallel_critical(); + mempool_wait(100 * num_cores); + return 0; } diff --git a/software/apps/omp/master_benchmark/main.c b/software/apps/omp/master_benchmark/main.c index 81768c8aa..8308feea3 100644 --- a/software/apps/omp/master_benchmark/main.c +++ b/software/apps/omp/master_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -74,36 +74,33 @@ void omp_parallel_master() { } int main() { - uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + uint32_t core_id = mempool_get_core_id(); - // #ifdef VERBOSE - if (core_id == 0) { - printf("Initialize\n"); - *checkfirst = 0; - result = 0; - } + // Initialize synchronization variables + mempool_barrier_init(core_id); - mempool_barrier(num_cores); - parallel_master_manual(); - mempool_barrier(num_cores); + // #ifdef VERBOSE + if (core_id == 0) { + printf("Initialize\n"); + *checkfirst = 0; + result = 0; + } - result = 0; + mempool_barrier(num_cores); + parallel_master_manual(); + mempool_barrier(num_cores); + + result = 0; + } /* OPENMP IMPLEMENTATION */ - if (core_id == 0) { - mempool_wait(1 * num_cores); - omp_parallel_master(); - mempool_wait(4 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + mempool_wait(1 * num_cores); + omp_parallel_master(); + mempool_wait(4 * num_cores); return 0; } diff --git a/software/apps/omp/omp_overhead/main.c b/software/apps/omp/omp_overhead/main.c index 8622c25ba..1e7c6a405 100644 --- a/software/apps/omp/omp_overhead/main.c +++ b/software/apps/omp/omp_overhead/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -14,12 +14,14 @@ #define N 16 #define M 4 -void work2(unsigned long num) { +uint32_t work2(unsigned long num) { uint32_t i; uint32_t cnt = 0; for (i = 0; i < num; i++) cnt += i; + + return cnt; } void sequential() { @@ -78,52 +80,60 @@ void section_parallel() { } } -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t cycles; +#define REPETITIONS 10 - mempool_barrier_init(core_id); - - if (core_id == 0) { - - printf("Sequential Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - sequential(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Sequential Duration: %d\n", cycles); - - printf("Static Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - static_parallel(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Static Duration: %d\n", cycles); - - printf("Dynamic Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - 
dynamic_parallel(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Dynamic Duration: %d\n", cycles); - - printf("Section Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - section_parallel(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Section Duration: %d\n", cycles); - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); +void startup_time() { + + uint32_t duration = 0; + + for (int i = 0; i < REPETITIONS; i++) { + uint32_t time = mempool_get_timer(); +#pragma omp parallel + { +#pragma omp single + duration += mempool_get_timer() - time; } } + printf("Startup time duration: %d\n", duration / REPETITIONS); +} + +int main() { + uint32_t cycles; + + printf("Sequential Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + sequential(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Sequential Duration: %d\n", cycles); + + printf("Static Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + static_parallel(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Static Duration: %d\n", cycles); + + printf("Dynamic Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + dynamic_parallel(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Dynamic Duration: %d\n", cycles); + + printf("Section Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + section_parallel(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Section Duration: %d\n", cycles); + + startup_time(); + return 0; } diff --git a/software/apps/omp/omp_parallel_for_benchmark/main.c b/software/apps/omp/omp_parallel_for_benchmark/main.c index bc4de3d07..21b4054fc 100644 --- a/software/apps/omp/omp_parallel_for_benchmark/main.c +++ b/software/apps/omp/omp_parallel_for_benchmark/main.c @@ -8,7 +8,7 @@ #include "baremetal/mempool_matmul_i32p.h" #include "baremetal/mempool_matmul_i32s.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "omp/mempool_matmul_i32.h" #include "printf.h" #include "runtime.h" @@ -86,107 +86,58 @@ void print_matrix(int32_t const *matrix, uint32_t num_rows, } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); mempool_timer_t cycles; int error; - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + // Initialize Matrices + init_matrix(a, M, N, A_a, A_b, A_c, (uint32_t)omp_get_thread_num(), + (uint32_t)omp_get_num_threads()); + init_matrix(b, N, P, B_a, B_b, B_c, (uint32_t)omp_get_thread_num(), + (uint32_t)omp_get_num_threads()); + } - // Initialize Matrices - init_matrix(a, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(b, N, P, B_a, B_b, B_c, core_id, num_cores); + printf("Start sequential\n"); - mempool_barrier(num_cores); + mempool_wait(1000); cycles = mempool_get_timer(); mempool_start_benchmark(); - mat_mul_parallel(a, b, c, M, N, P, core_id, num_cores); + // mat_mul_sequential(a, b, c, M, N, P); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - mempool_barrier(num_cores); - - // Check result - if (core_id == 0) { - printf("Manual Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - } else { - mempool_wait(M * P * 12); + 
printf("Sequqntial Duration: %d\n", cycles); + error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); + if (error != 0) { + printf("Error code %d\n", error); + printf("c[%d]=%d\n", error, c[error]); } - mempool_barrier(num_cores); + + printf("Start openMP\n"); cycles = mempool_get_timer(); mempool_start_benchmark(); - mat_mul_unrolled_parallel(a, b, c, M, N, P, core_id, num_cores); + mat_mul_parallel_omp(a, b, c, M, N, P); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - mempool_barrier(num_cores); - - // Check result - if (core_id == 0) { - printf("Manual unrolled Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - } else { - mempool_wait(M * P * 12); + printf("OpenMP Parallel Duration: %d\n", cycles); + error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); + if (error != 0) { + printf("Error code %d\n", error); + printf("c[%d]=%d\n", error, c[error]); } - mempool_barrier(num_cores); - - if (core_id == 0) { - - mempool_wait(1000); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - mat_mul_sequential(a, b, c, M, N, P); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Sequqntial Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - - printf("Start openMP\n"); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - mat_mul_parallel_omp(a, b, c, M, N, P); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("OpenMP Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - cycles = mempool_get_timer(); - mempool_start_benchmark(); - mat_mul_unrolled_parallel_omp(a, b, c, M, N, P); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("OpenMP Unrolled Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } + cycles = mempool_get_timer(); + mempool_start_benchmark(); + mat_mul_unrolled_parallel_omp(a, b, c, M, N, P); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("OpenMP Unrolled Parallel Duration: %d\n", cycles); + error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); + if (error != 0) { + printf("Error code %d\n", error); + printf("c[%d]=%d\n", error, c[error]); } return 0; diff --git a/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c b/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c index 42b9640a9..5deff879c 100644 --- a/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c +++ b/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c @@ -7,7 +7,7 @@ #include "data.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -65,44 +65,29 @@ void spmv_dynamic(int *y, int *data, int *colidx, int *rowb, int *rowe, int *x, } int main() { - uint32_t core_id = mempool_get_core_id(); + mempool_timer_t cycles; + int n = 512; - mempool_barrier_init(core_id); + cycles = 
mempool_get_timer(); + mempool_start_benchmark(); + spmv(y, nnz, col, rowb, rowe, x, n); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Sequqntial Duration: %d\n", cycles); - if (core_id == 0) { + cycles = mempool_get_timer(); + mempool_start_benchmark(); + spmv_static(y, nnz, col, rowb, rowe, x, n); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Static Duration: %d\n", cycles); - mempool_wait(1000); - - mempool_timer_t cycles; - int n = 512; - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - spmv(y, nnz, col, rowb, rowe, x, n); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Sequqntial Duration: %d\n", cycles); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - spmv_static(y, nnz, col, rowb, rowe, x, n); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Static Duration: %d\n", cycles); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - spmv_dynamic(y, nnz, col, rowb, rowe, x, n); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Dynamic Duration: %d\n", cycles); - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + cycles = mempool_get_timer(); + mempool_start_benchmark(); + spmv_dynamic(y, nnz, col, rowb, rowe, x, n); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Dynamic Duration: %d\n", cycles); return 0; } diff --git a/software/apps/omp/reduction_benchmark/main.c b/software/apps/omp/reduction_benchmark/main.c index 1106c633c..ffa44e539 100644 --- a/software/apps/omp/reduction_benchmark/main.c +++ b/software/apps/omp/reduction_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -144,164 +144,160 @@ int32_t dot_product_omp_dynamic(int32_t const *__restrict__ A, } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); mempool_timer_t cycles; + uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + uint32_t core_id = mempool_get_core_id(); -#ifdef VERBOSE - if (core_id == 0) { - printf("Initialize\n"); - } -#endif - - // Initialize Matrices - init_vector(a, M, A_a, A_b, core_id, num_cores); - init_vector(b, M, B_a, B_b, core_id, num_cores); + // Initialize synchronization variables + mempool_barrier_init(core_id); #ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - // print_vector(a, M); - // print_vector(b, M); - } + if (core_id == 0) { + printf("Initialize\n"); + } #endif - mempool_barrier(num_cores); - int32_t result, correct_result; - - if (core_id == 0) { - mempool_wait(4 * num_cores); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - result = dot_product_sequential(a, b, M); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - } + // Initialize Matrices + init_vector(a, M, A_a, A_b, core_id, num_cores); + init_vector(b, M, B_a, B_b, core_id, num_cores); #ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - printf("Sequential Result: %d\n", result); - printf("Sequential Duration: %d\n", cycles); - if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { - printf("Sequential Result is %d instead of %d\n", result, correct_result); - } else { - printf("Result is correct!\n"); + 
mempool_barrier(num_cores); + if (core_id == 0) { + // print_vector(a, M); + // print_vector(b, M); } - } #endif - mempool_barrier(num_cores); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - result = dot_product_parallel1(a, b, c, M, core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; + mempool_barrier(num_cores); + int32_t result, correct_result; -#ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - printf("Manual Parallel1 Result: %d\n", result); - printf("Manual Parallel1 Duration: %d\n", cycles); - if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { - printf("Manual Parallel1 Result is %d instead of %d\n", result, - correct_result); - } else { - printf("Result is correct!\n"); + if (core_id == 0) { + mempool_wait(4 * num_cores); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + result = dot_product_sequential(a, b, M); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; } - } -#endif - mempool_barrier(num_cores); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - result = dot_product_parallel2(a, b, c, M, core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; #ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - printf("Manual Parallel2 Result: %d\n", result); - printf("Manual Parallel2 Duration: %d\n", cycles); - if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { - printf("Manual Parallel2 Result is %d instead of %d\n", result, - correct_result); - } else { - printf("Result is correct!\n"); + mempool_barrier(num_cores); + if (core_id == 0) { + printf("Sequential Result: %d\n", result); + printf("Sequential Duration: %d\n", cycles); + if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("Sequential Result is %d instead of %d\n", result, + correct_result); + } else { + printf("Result is correct!\n"); + } } - } #endif - mempool_barrier(num_cores); - - /* OPENMP IMPLEMENTATION */ - int32_t omp_result; - - if (core_id == 0) { - mempool_wait(4 * num_cores); + mempool_barrier(num_cores); cycles = mempool_get_timer(); mempool_start_benchmark(); - omp_result = dot_product_omp_static(a, b, M); + result = dot_product_parallel1(a, b, c, M, core_id, num_cores); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - printf("OMP Static Result: %d\n", omp_result); - printf("OMP Static Duration: %d\n", cycles); - if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, - &correct_result)) { - printf("OMP Static Result is %d instead of %d\n", omp_result, - correct_result); - } else { - printf("Result is correct!\n"); +#ifdef VERBOSE + mempool_barrier(num_cores); + if (core_id == 0) { + printf("Manual Parallel1 Result: %d\n", result); + printf("Manual Parallel1 Duration: %d\n", cycles); + if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("Manual Parallel1 Result is %d instead of %d\n", result, + correct_result); + } else { + printf("Result is correct!\n"); + } } - - mempool_wait(4 * num_cores); +#endif + mempool_barrier(num_cores); cycles = mempool_get_timer(); mempool_start_benchmark(); - omp_result = dot_product_omp_dynamic(a, b, M, 4); + result = dot_product_parallel2(a, b, c, M, core_id, num_cores); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - printf("OMP Dynamic(4) Result: %d\n", omp_result); - printf("OMP Dynamic(4) Duration: %d\n", cycles); - if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, 
B_b, - &correct_result)) { - printf("OMP Dynamic(4) Result is %d instead of %d\n", omp_result, - correct_result); - } else { - printf("Result is correct!\n"); +#ifdef VERBOSE + mempool_barrier(num_cores); + if (core_id == 0) { + printf("Manual Parallel2 Result: %d\n", result); + printf("Manual Parallel2 Duration: %d\n", cycles); + if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("Manual Parallel2 Result is %d instead of %d\n", result, + correct_result); + } else { + printf("Result is correct!\n"); + } } +#endif + mempool_barrier(num_cores); + } - mempool_wait(4 * num_cores); + /* OPENMP IMPLEMENTATION */ + int32_t omp_result; + int32_t correct_result; - cycles = mempool_get_timer(); - mempool_start_benchmark(); - omp_result = dot_product_omp_dynamic(a, b, M, 10); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; + mempool_wait(4 * num_cores); - printf("OMP Dynamic(10) Result: %d\n", omp_result); - printf("OMP Dynamic(10) Duration: %d\n", cycles); - if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, - &correct_result)) { - printf("OMP Dynamic(10) Result is %d instead of %d\n", omp_result, - correct_result); - } else { - printf("Result is correct!\n"); - } + cycles = mempool_get_timer(); + mempool_start_benchmark(); + omp_result = dot_product_omp_static(a, b, M); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + + printf("OMP Static Result: %d\n", omp_result); + printf("OMP Static Duration: %d\n", cycles); + if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("OMP Static Result is %d instead of %d\n", omp_result, + correct_result); + } else { + printf("Result is correct!\n"); + } - mempool_wait(4 * num_cores); + mempool_wait(4 * num_cores); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + omp_result = dot_product_omp_dynamic(a, b, M, 4); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + + printf("OMP Dynamic(4) Result: %d\n", omp_result); + printf("OMP Dynamic(4) Duration: %d\n", cycles); + if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("OMP Dynamic(4) Result is %d instead of %d\n", omp_result, + correct_result); } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } + printf("Result is correct!\n"); } + + mempool_wait(4 * num_cores); + + cycles = mempool_get_timer(); + mempool_start_benchmark(); + omp_result = dot_product_omp_dynamic(a, b, M, 10); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + + printf("OMP Dynamic(10) Result: %d\n", omp_result); + printf("OMP Dynamic(10) Duration: %d\n", cycles); + if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("OMP Dynamic(10) Result is %d instead of %d\n", omp_result, + correct_result); + } else { + printf("Result is correct!\n"); + } + + mempool_wait(4 * num_cores); + return 0; } diff --git a/software/apps/omp/reduction_benchmark2/main.c b/software/apps/omp/reduction_benchmark2/main.c new file mode 100644 index 000000000..dadafcba8 --- /dev/null +++ b/software/apps/omp/reduction_benchmark2/main.c @@ -0,0 +1,64 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +int32_t dot_product_omp_static(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, + uint32_t num_elements) { + uint32_t i; + int32_t dotp = 0; +#pragma omp parallel for reduction(+ : dotp) + for (i = 0; i < num_elements; i++) { + dotp += A[i] * B[i]; + } + return dotp; +} + +int32_t dot_product_omp_dynamic(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, + uint32_t num_elements) { + uint32_t i; + int32_t dotp = 0; + // printf("num_elements %d\n", num_elements); +#pragma omp parallel for schedule(dynamic) reduction(+ : dotp) + for (i = 0; i < num_elements; i++) { + dotp += A[i] * B[i]; + } + return dotp; +} + +int main() { + uint32_t num_cores = mempool_get_core_count(); + mempool_timer_t cycles; + + mempool_wait(4 * num_cores); + + for (unsigned int i = 1; i <= 8192; i *= 2) { + int32_t *a = simple_malloc(i * sizeof(int32_t)); + cycles = mempool_get_timer(); + dot_product_omp_static(a, a, i); + cycles = mempool_get_timer() - cycles; + simple_free(a); + printf("Static duration with %d elements: %d\n", i, cycles); + } + + for (unsigned int i = 1; i <= 8192; i *= 2) { + int32_t *a = simple_malloc(i * sizeof(int32_t)); + cycles = mempool_get_timer(); + dot_product_omp_dynamic(a, a, i); + cycles = mempool_get_timer() - cycles; + simple_free(a); + printf("Dynamic duration with %d elements: %d\n", i, cycles); + } + + return 0; +} diff --git a/software/apps/omp/single_benchmark/main.c b/software/apps/omp/single_benchmark/main.c index 523a02f19..2f6d191a9 100644 --- a/software/apps/omp/single_benchmark/main.c +++ b/software/apps/omp/single_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -22,9 +22,8 @@ void work1() { } void parallel_single_manual() { - uint32_t core_id; + uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); - core_id = mempool_get_core_id(); work1(); @@ -71,37 +70,28 @@ void omp_parallel_single() { int main() { uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); - - // #ifdef VERBOSE - if (core_id == 0) { - printf("Initialize\n"); - *checkfirst = 0; - result = 0; - } +#pragma omp parallel + { + // #ifdef VERBOSE + if (core_id == 0) { + printf("Initialize\n"); + *checkfirst = 0; + result = 0; + } - mempool_barrier(num_cores); +#pragma omp barrier - parallel_single_manual(); + parallel_single_manual(); - mempool_barrier(num_cores); +#pragma omp barrier - result = 0; + result = 0; + } /* OPENMP IMPLEMENTATION */ - if (core_id == 0) { - omp_parallel_single(); + omp_parallel_single(); - mempool_wait(4 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } return 0; } diff --git a/software/apps/omp/test/main.c b/software/apps/omp/test/main.c new file mode 100644 index 000000000..5fc7f996a --- /dev/null +++ b/software/apps/omp/test/main.c @@ -0,0 +1,20 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "omp.h" +#include "printf.h" +#include "runtime.h" + +int main(){ +#pragma omp parallel + { +#pragma omp critical + {printf("First critical\n"); +} + +#pragma omp critical +{ printf("Second critical\n"); } +} +} +; diff --git a/software/runtime/omp/barrier.c b/software/runtime/gomp/barrier.c similarity index 100% rename from software/runtime/omp/barrier.c rename to software/runtime/gomp/barrier.c diff --git a/software/runtime/omp/critical.c b/software/runtime/gomp/critical.c similarity index 100% rename from software/runtime/omp/critical.c rename to software/runtime/gomp/critical.c diff --git a/software/runtime/omp/libgomp.h b/software/runtime/gomp/libgomp.h similarity index 97% rename from software/runtime/omp/libgomp.h rename to software/runtime/gomp/libgomp.h index 383e19a42..cfa86cfaa 100644 --- a/software/runtime/omp/libgomp.h +++ b/software/runtime/gomp/libgomp.h @@ -115,6 +115,6 @@ typedef struct { omp_lock_t atomic_lock; } work_t; -extern event_t event; -extern work_t works; +extern event_t event __attribute__((section(".l1"))); +extern work_t works __attribute__((section(".l1"))); #endif /* __LIBGOMP_H__ */ diff --git a/software/runtime/omp/loop.c b/software/runtime/gomp/loop.c similarity index 100% rename from software/runtime/omp/loop.c rename to software/runtime/gomp/loop.c diff --git a/software/runtime/gomp/main_wrapper.c b/software/runtime/gomp/main_wrapper.c new file mode 100644 index 000000000..a98a7c419 --- /dev/null +++ b/software/runtime/gomp/main_wrapper.c @@ -0,0 +1,43 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "libgomp.h" +#include "runtime.h" + +typedef void (*init_func)(void); +extern init_func __init_array_start[]; +extern init_func __init_array_end[]; + +static inline void initGlobals() { + // NOLINTNEXTLINE(*-narrowing-conversions) + int32_t len = __init_array_end - __init_array_start; + for (int32_t i = 0; i < len; i++) { + + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + __init_array_start[i](); + } +} + +int __real_main(); + +int __wrap_main() { + const mempool_id_t core_id = mempool_get_core_id(); + + mempool_barrier_init(core_id); + mempool_init(core_id); + + if (core_id == 0) { + initGlobals(); + __real_main(); + + printf("Program done\n"); + } else { + while (1) { + mempool_wfi(); + run_task(core_id); + } + } + + return 0; +} diff --git a/software/runtime/omp/omp-lock.h b/software/runtime/gomp/omp-lock.h similarity index 100% rename from software/runtime/omp/omp-lock.h rename to software/runtime/gomp/omp-lock.h diff --git a/software/runtime/omp/omp.h b/software/runtime/gomp/omp.h similarity index 94% rename from software/runtime/omp/omp.h rename to software/runtime/gomp/omp.h index de04877ab..554e3d691 100644 --- a/software/runtime/omp/omp.h +++ b/software/runtime/gomp/omp.h @@ -7,6 +7,8 @@ #ifndef __OMP_H__ #define __OMP_H__ +#include "stdint.h" + /* parallel.c */ extern uint32_t omp_get_num_threads(void); extern uint32_t omp_get_thread_num(void); diff --git a/software/runtime/omp/parallel.c b/software/runtime/gomp/parallel.c similarity index 100% rename from software/runtime/omp/parallel.c rename to software/runtime/gomp/parallel.c diff --git a/software/runtime/omp/sections.c b/software/runtime/gomp/sections.c similarity index 100% rename from software/runtime/omp/sections.c rename to software/runtime/gomp/sections.c diff --git 
a/software/runtime/omp/single.c b/software/runtime/gomp/single.c similarity index 100% rename from software/runtime/omp/single.c rename to software/runtime/gomp/single.c diff --git a/software/runtime/omp/work.c b/software/runtime/gomp/work.c similarity index 100% rename from software/runtime/omp/work.c rename to software/runtime/gomp/work.c diff --git a/software/runtime/kmp/barrier.cpp b/software/runtime/kmp/barrier.cpp new file mode 100644 index 000000000..32328aad7 --- /dev/null +++ b/software/runtime/kmp/barrier.cpp @@ -0,0 +1,13 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.hpp" + +namespace kmp { +Barrier::Barrier(kmp_int32 numThreads) + : barrier(0), generation(0), numThreads(numThreads) {} + +Barrier::~Barrier() { DEBUG_PRINT("Destroying barrier at %p\n", this); } + +}; // namespace kmp diff --git a/software/runtime/kmp/barrier.hpp b/software/runtime/kmp/barrier.hpp new file mode 100644 index 000000000..3b4c7d086 --- /dev/null +++ b/software/runtime/kmp/barrier.hpp @@ -0,0 +1,93 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "kmp/omp.h" +#include "kmp/types.h" +#include "kmp/util.hpp" + +#include +#include +#include + +extern "C" { +#include "runtime.h" +} + +namespace kmp { + +namespace runtime { +extern kmp_int32 numTeams; +} + +class Barrier { +public: + Barrier(Barrier &&) = delete; + Barrier &operator=(Barrier &&) = delete; + Barrier(kmp_int32 numThreads); + Barrier(const Barrier &) = delete; + Barrier &operator=(const Barrier &) = delete; + + ~Barrier(); + + inline void wait() { + if (runtime::numTeams == 1) { + DEBUG_PRINT("Entering wfi barrier at %p\n", this); + // WFI barrier + + // Increment the barrier counter + if ((numThreads - 1) == barrier.fetch_add(1, std::memory_order_relaxed)) { + DEBUG_PRINT("Barrier done at %p\n", this); + barrier.store(0, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + wake_up_all(); + } + + // Some threads have not reached the barrier --> Let's wait + // Clear the wake-up trigger for the last core reaching the barrier as + // well + mempool_wfi(); + DEBUG_PRINT("Exiting wfi barrier at %p\n", this); + + } else { + // Spin generation barrier + kmp_int32 gen = generation; + + DEBUG_PRINT("Entering spin barrier at %p, gen %d\n", this, gen); + + // Increment the barrier counter + if ((numThreads - 1) == barrier.fetch_add(1, std::memory_order_relaxed)) { + DEBUG_PRINT("Barrier done at %p\n", this); + barrier.store(0, std::memory_order_relaxed); + generation.fetch_add(1, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + } + + while (gen == generation.load(std::memory_order_relaxed)) { + // Spin + } + + DEBUG_PRINT("Exiting spin barrier at %p, gen %d\n", this, gen); + } + }; + + inline void setNumThreads(int32_t numThreads) { + if (barrier != 0) { + DEBUG_PRINT("Cannot change the number of threads while the barrier is " + "active: %p, %d\n", + this, barrier.load()); + } + assert(barrier == 0 && + "Cannot change the number of threads while the barrier is active"); + + this->numThreads = numThreads; + } + +private: + std::atomic barrier; + std::atomic generation; + int32_t numThreads; +}; +}; // namespace kmp diff --git a/software/runtime/kmp/cppsupport.cpp 
b/software/runtime/kmp/cppsupport.cpp new file mode 100644 index 000000000..e390c1122 --- /dev/null +++ b/software/runtime/kmp/cppsupport.cpp @@ -0,0 +1,69 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "kmp/util.hpp" + +extern "C" { +#include "alloc.h" +} + +extern void (*_eoc)(void); + +kmp::Mutex allocLock __attribute__((section(".l1"))); + +void *operator new(size_t size) { + std::lock_guard lock(allocLock); + void *ptr = simple_malloc(size); + return ptr; +} + +void operator delete(void *ptr) noexcept { + std::lock_guard lock(allocLock); + return simple_free(ptr); +} + +void *operator new[](size_t size) { return operator new(size); } + +void operator delete[](void *ptr) noexcept { return operator delete(ptr); } + +namespace std { +void __throw_bad_alloc() { + printf("Bad alloc\n"); + abort(); +} + +void __throw_length_error(const char *msg) { + printf("Length error: %s\n", msg); + abort(); +} + +void __throw_bad_optional_access() { + printf("Bad optional access\n"); + abort(); +} +} // namespace std + +extern "C" void abort() { + printf("Aborting\n"); + while (true) { + asm("j _eoc"); + } +} + +extern "C" int __cxa_atexit(void (*func)(void *), void *arg, void *dso_handle) { + (void)func; + (void)arg; + (void)dso_handle; + return 0; +} + +extern "C" void __assert_func(const char *file, int line, const char *func, + const char *failedexpr) { + printf("Assertion failed: %s, file %s, line %d, function %s\n", failedexpr, + file, line, func); + abort(); +} diff --git a/software/runtime/kmp/entrypoints.cpp b/software/runtime/kmp/entrypoints.cpp new file mode 100644 index 000000000..b5a251dfe --- /dev/null +++ b/software/runtime/kmp/entrypoints.cpp @@ -0,0 +1,237 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/runtime.hpp" +#include "kmp/team.hpp" +#include "kmp/types.h" + +using kmp::Mutex; + +extern "C" { +#include "runtime.h" + +void __kmpc_barrier(ident_t * /*loc*/, kmp_int32 global_tid) { + kmp::runtime::getThread(global_tid).getCurrentTeam()->getBarrier().wait(); +}; + +// Parallel +void __kmpc_fork_call(ident_t * /*loc*/, kmp_int32 argc, kmpc_micro microtask, + ...) 
{ + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) + va_list args; + va_start(args, microtask); + kmp::Task kmpMicrotask(microtask, reinterpret_cast(args), argc); + kmp::runtime::getCurrentThread().forkCall(kmpMicrotask); + va_end(args); + // NOLINTEND(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) +}; + +// Static loops +void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_int32 *plastiter, kmp_int32 *plower, + kmp_int32 *pupper, kmp_int32 *pstride, + kmp_int32 incr, kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->forStaticInit( + loc, gtid, static_cast(schedtype), plastiter, plower, + pupper, pstride, incr, chunk); +}; + +void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedtype, kmp_uint32 *plastiter, + kmp_uint32 *plower, kmp_uint32 *pupper, + kmp_int32 *pstride, kmp_int32 incr, + kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->forStaticInit( + loc, gtid, static_cast(schedtype), plastiter, plower, + pupper, pstride, incr, chunk); +}; + +void __kmpc_for_static_init_8(ident_t * /*loc*/, kmp_int32 /*gtid*/, + kmp_int32 /*schedtype*/, + kmp_int64 * /*plastiter*/, kmp_int64 * /*plower*/, + kmp_int64 * /*pupper*/, kmp_int64 * /*pstride*/, + kmp_int64 /*incr*/, kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +}; + +void __kmpc_for_static_init_8u(ident_t * /*loc*/, kmp_int32 /*gtid*/, + kmp_int32 /*schedtype*/, + kmp_uint64 * /*plastiter*/, + kmp_uint64 * /*plower*/, kmp_uint64 * /*pupper*/, + kmp_int64 * /*pstride*/, kmp_int64 /*incr*/, + kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +}; + +void __kmpc_for_static_fini(ident_t * /*loc*/, kmp_int32 /*global_tid*/){}; + +// Dynamic loops +void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_int32 lower, kmp_int32 upper, kmp_int32 incr, + kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchInit( + loc, gtid, + static_cast(SCHEDULE_WITHOUT_MODIFIERS(schedtype)), lower, + upper, incr, chunk); +} + +void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_uint32 lower, kmp_uint32 upper, kmp_int32 incr, + kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchInit( + loc, gtid, + static_cast(SCHEDULE_WITHOUT_MODIFIERS(schedtype)), lower, + upper, incr, chunk); +} + +void __kmpc_dispatch_init_8(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_sched_type /*schedtype*/, kmp_int64 /*lower*/, + kmp_int64 /*upper*/, kmp_int64 /*incr*/, + kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +} + +void __kmpc_dispatch_init_8u(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_sched_type /*schedtype*/, kmp_uint64 /*lower*/, + kmp_uint64 /*upper*/, kmp_int64 /*incr*/, + kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +} + +int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *plastiter, + kmp_int32 *plower, kmp_int32 *pupper, + kmp_int32 *pstride) { + return static_cast( + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchNext( + loc, gtid, plastiter, plower, pupper, pstride)); +} + +int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *plastiter, + kmp_uint32 *plower, kmp_uint32 *pupper, + kmp_int32 *pstride) { + return static_cast( + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchNext( + loc, gtid, 
plastiter, plower, pupper, pstride)); +} + +int __kmpc_dispatch_next_8(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_int64 * /*plastiter*/, kmp_int64 * /*plower*/, + kmp_int64 * /*pupper*/, kmp_int64 * /*pstride*/) { + assert(false && "Unsupported loop index type"); + return 0; +} + +int __kmpc_dispatch_next_8u(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_int64 * /*plastiter*/, kmp_uint64 * /*plower*/, + kmp_uint64 * /*pupper*/, kmp_int64 * /*pstride*/) { + assert(false && "Unsupported loop index type"); + return 0; +} + +void __kmpc_push_num_threads(ident_t * /*loc*/, kmp_int32 global_tid, + kmp_int32 num_threads) { + kmp::runtime::getThread(global_tid).requestNumThreads(num_threads); +}; + +// Critical sections +void __kmpc_critical(ident_t * /*unused*/, kmp_int32 /*gtid*/, + kmp_critical_name *crit) { + static_assert(sizeof(kmp::Mutex) <= sizeof(kmp_critical_name)); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + kmp::Mutex *mutex = reinterpret_cast(*crit); + mutex->lock(); +}; + +void __kmpc_end_critical(ident_t * /*unused*/, kmp_int32 /*gtid*/, + kmp_critical_name *crit) { + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + Mutex *mutex = reinterpret_cast(*crit); + mutex->unlock(); +}; + +// Master +kmp_int32 __kmpc_master(ident_t * /*loc*/, kmp_int32 gtid) { + return static_cast(kmp::runtime::getThread(gtid).getTid() == 0); +}; + +void __kmpc_end_master(ident_t * /*loc*/, kmp_int32 /*gtid*/){/* NOOP */}; + +// Single (same as master for now) +kmp_int32 __kmpc_single(ident_t * /*loc*/, kmp_int32 gtid) { + return static_cast(kmp::runtime::getThread(gtid).getTid() == 0); +}; + +void __kmpc_end_single(ident_t * /*loc*/, kmp_int32 /*gtid*/){/* NOOP */}; + +// Copyprivate +void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, + void *cpy_data, void (*cpy_func)(void *, void *), + kmp_int32 didit) { + kmp::runtime::getThread(gtid).getCurrentTeam()->copyPrivate( + loc, gtid, cpy_size, cpy_data, cpy_func, didit); +}; + +// Reduction +kmp_int32 __kmpc_reduce_nowait(ident_t * /*loc*/, kmp_int32 /*global_tid*/, + kmp_int32 /*num_vars*/, size_t /*reduce_size*/, + void * /*reduce_data*/, + void (* /*reduce_func*/)(void *lhs_data, + void *rhs_data), + kmp_critical_name * /*lck*/) { + return 2; // Atomic reduction +} + +void __kmpc_end_reduce_nowait(ident_t * /*loc*/, kmp_int32 /*global_tid*/, + kmp_critical_name * /*lck*/) { + /* NOOP */ +} + +kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, + size_t reduce_size, void *reduce_data, + void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck) { + + return __kmpc_reduce_nowait(loc, global_tid, num_vars, reduce_size, + reduce_data, reduce_func, lck); +} + +void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name * /*lck*/) { + return __kmpc_barrier(loc, global_tid); +} + +// Teams +void __kmpc_fork_teams(ident_t * /*loc*/, kmp_int32 argc, kmpc_micro microtask, + ...) 
{ + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) + va_list args; + va_start(args, microtask); + kmp::Task kmpMicrotask(microtask, reinterpret_cast(args), argc); + kmp::runtime::getCurrentThread().forkTeams(kmpMicrotask); + va_end(args); + // NOLINTEND(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) +} + +void __kmpc_push_num_teams(ident_t * /*loc*/, kmp_int32 /*global_tid*/, + kmp_int32 num_teams, kmp_int32 num_threads) { + DEBUG_PRINT("num_teams: %d, num_threads: %d\n", num_teams, num_threads); + if (num_teams > 0) { + kmp::runtime::requestedNumTeams = std::min(num_teams, NUM_CORES / 2); + } + + if (num_threads > 0) { + kmp::runtime::requestedThreadLimit = num_threads; + } +} + +kmp_int32 __kmpc_global_thread_num(ident_t * /*loc*/) { + return static_cast(mempool_get_core_id()); +}; +} diff --git a/software/runtime/kmp/main_wrapper.cpp b/software/runtime/kmp/main_wrapper.cpp new file mode 100644 index 000000000..22c1a0859 --- /dev/null +++ b/software/runtime/kmp/main_wrapper.cpp @@ -0,0 +1,59 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/runtime.hpp" + +extern "C" { +#include "runtime.h" +} + +// https://etherealwake.com/2021/09/crt-startup/ +typedef void (*init_func)(void); +extern init_func __init_array_start[]; +extern init_func __init_array_end[]; + +static inline void initGlobals() { + // NOLINTNEXTLINE(*-narrowing-conversions) + int32_t len = __init_array_end - __init_array_start; + for (int32_t i = 0; i < len; i++) { + + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + __init_array_start[i](); + } +} + +extern "C" int __real_main(); + +bool initLock = true; + +extern "C" int __wrap_main() { + const mempool_id_t core_id = mempool_get_core_id(); + if (core_id == 0) { + DEBUG_PRINT("Running OpenMP program on %d cores\n", + mempool_get_core_count()); + + // Init heap allocators + mempool_init(0); + + // Call C++ global constructors + initGlobals(); + + initLock = false; + + DEBUG_PRINT("Init done\n"); + + // Run the program + __real_main(); + + printf("Program done\n"); + } else { + while (initLock) { + // Wait for initialization to finish + } + + kmp::runtime::runThread(static_cast(core_id)); + } + + return 0; +} diff --git a/software/runtime/kmp/omp.cpp b/software/runtime/kmp/omp.cpp new file mode 100644 index 000000000..3f5d04064 --- /dev/null +++ b/software/runtime/kmp/omp.cpp @@ -0,0 +1,22 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "omp.h" +#include "kmp/runtime.hpp" +#include "kmp/team.hpp" + +void not_implemented(void) { printf("Not implemented\n"); } + +int omp_get_num_threads(void) { + return kmp::runtime::getCurrentThread().getCurrentTeam()->getNumThreads(); +} + +int omp_get_thread_num(void) { + return kmp::runtime::getCurrentThread().getTid(); +}; + +int omp_get_num_teams(void) { return kmp::runtime::numTeams; } +int omp_get_team_num(void) { + return kmp::runtime::getCurrentThread().getCurrentTeam()->getTeamId(); +} diff --git a/software/runtime/kmp/omp.h b/software/runtime/kmp/omp.h new file mode 100644 index 000000000..4c6abcf79 --- /dev/null +++ b/software/runtime/kmp/omp.h @@ -0,0 +1,21 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +/* Standard public APIs */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +extern int omp_get_num_threads(void); +extern int omp_get_thread_num(void); + +extern int omp_get_num_teams(void); +extern int omp_get_team_num(void); + +#ifdef __cplusplus +} +#endif diff --git a/software/runtime/kmp/runtime.cpp b/software/runtime/kmp/runtime.cpp new file mode 100644 index 000000000..be79ae434 --- /dev/null +++ b/software/runtime/kmp/runtime.cpp @@ -0,0 +1,34 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/team.hpp" +#include "kmp/types.h" + +#include <array> +#include <utility> + +namespace kmp { + +namespace runtime { + +template <kmp_int32... Is> +constexpr std::array<Thread, sizeof...(Is)> +sequencetoArray(std::integer_sequence<kmp_int32, Is...> /*unused*/) { + return {{Is...}}; +} + +std::array<Thread, NUM_CORES> threads = + sequencetoArray(std::make_integer_sequence<kmp_int32, NUM_CORES>{}); + +Team defaultTeam(0, 0); + +std::optional<kmp_int32> requestedNumTeams; +std::optional<kmp_int32> requestedThreadLimit; +kmp_int32 numTeams = 1; + +Barrier teamsBarrier(NUM_GROUPS); + +} // namespace runtime + +} // namespace kmp diff --git a/software/runtime/kmp/runtime.hpp b/software/runtime/kmp/runtime.hpp new file mode 100644 index 000000000..eca4eb814 --- /dev/null +++ b/software/runtime/kmp/runtime.hpp @@ -0,0 +1,42 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "kmp/barrier.hpp" +#include "kmp/thread.hpp" +#include "kmp/types.h" + +// NOLINTNEXTLINE(bugprone-reserved-identifier) +extern void __assert_func(const char *file, int line, const char *func, + const char *failedexpr); + +namespace kmp { + +namespace runtime { + +extern std::array<Thread, NUM_CORES> threads __attribute__((section(".l1"))); + +extern Team defaultTeam __attribute__((section(".l1"))); + +extern std::optional<kmp_int32> requestedNumTeams; +extern std::optional<kmp_int32> requestedThreadLimit; + +extern Barrier teamsBarrier __attribute__((section(".l1"))); + +static inline void runThread(kmp_int32 core_id) { + threads[static_cast<size_t>(core_id)].run(); +}; + +static inline Thread &getThread(kmp_int32 gtid) { + return threads[static_cast<size_t>(gtid)]; +}; + +static inline Thread &getCurrentThread() { + return threads[mempool_get_core_id()]; +}; + +} // namespace runtime + +} // namespace kmp diff --git a/software/runtime/kmp/task.cpp b/software/runtime/kmp/task.cpp new file mode 100644 index 000000000..6edbe8521 --- /dev/null +++ b/software/runtime/kmp/task.cpp @@ -0,0 +1,92 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/task.hpp" +#include "kmp/runtime.hpp" + +extern "C" { +#include "runtime.h" +} + +namespace kmp { +Task::Task(kmpc_micro func, void **args, kmp_int32 argc) + : func(func), argc(argc), args(args) { + + assert(argc <= MAX_ARGS && "Unsupported number of microtask arguments"); + + DEBUG_PRINT("Microtask constructor\n"); +}; + +void Task::run(kmp_int32 gtid, kmp_int32 tid) const { + // There does not seem to be a better way to do this without custom passes or + // ASM + switch (argc) { + default: + return; + + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic,*-magic-numbers) + case 0: + func(&gtid, &tid); + break; + case 1: + func(&gtid, &tid, args[0]); + break; + case 2: + func(&gtid, &tid, args[0], args[1]); + break; + case 3: + func(&gtid, &tid, args[0], args[1], args[2]); + break; + case 4: + func(&gtid, &tid, args[0], args[1], args[2], args[3]); + break; + case 5: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4]); + break; + case 6: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5]); + break; + case 7: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6]); + break; + case 8: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7]); + break; + case 9: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8]); + break; + case 10: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9]); + break; + case 11: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10]); + break; + case 12: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10], args[11]); + break; + case 13: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10], args[11], args[12]); + break; + case 14: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10], args[11], args[12], + args[13]); + break; + case 15: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7],
args[8], args[9], args[10], args[11], args[12], + args[13], args[14]); + break; + } + // NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic,*-magic-numbers) +}; + +} // namespace kmp diff --git a/software/runtime/kmp/task.hpp b/software/runtime/kmp/task.hpp new file mode 100644 index 000000000..e5a9b03af --- /dev/null +++ b/software/runtime/kmp/task.hpp @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "types.h" + +#define MAX_ARGS 15 + +namespace kmp { + +class Task { +public: + Task(kmpc_micro func, void **args, kmp_int32 argc); + + void run(kmp_int32 gtid, kmp_int32 tid) const; + +private: + kmpc_micro func; + kmp_int32 argc; + void **args; +}; + +}; // namespace kmp diff --git a/software/runtime/kmp/team.hpp b/software/runtime/kmp/team.hpp new file mode 100644 index 000000000..fd656e4aa --- /dev/null +++ b/software/runtime/kmp/team.hpp @@ -0,0 +1,318 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "kmp/barrier.hpp" +#include "kmp/runtime.hpp" +#include "kmp/types.h" +#include "kmp/util.hpp" +#include "printf.h" + +namespace kmp { + +class Thread; +class Task; +class Barrier; + +class Team { + + struct DynamicSchedule { + kmp_uint32 lowerNext = 0; + kmp_uint32 upper = 0; + kmp_uint32 chunk = 0; // Chunk size assumed to be positive + kmp_int32 incr = 0; + kmp_int32 stride = 0; + + bool valid = false; + kmp_int32 numDone = 0; + + Mutex mutex; + }; + +public: + Team(const Team &) = delete; + Team(Team &&) = delete; + Team &operator=(const Team &) = delete; + Team &operator=(Team &&) = delete; + + inline Team(kmp_int32 masterGtid, kmp_int32 teamId) + : masterGtid(masterGtid), teamId(teamId), barrier(numThreads), + implicitTask(nullptr, nullptr, 0) {} + + inline ~Team() { + for (kmp_int32 i = masterGtid + 1; i < masterGtid + numThreads; i++) { + while (runtime::getThread(i).isRunning()) { + // Wait for thread to finish + DEBUG_PRINT("Waiting for thread %d to finish\n", i); + } + } + } + + inline Barrier &getBarrier() { return barrier; } + + inline const Task &getImplicitTask() const { return implicitTask; } + + inline void setImplicitTask(Task task) { implicitTask = task; } + + inline auto getNumThreads() const { return numThreads; } + + inline void setNumThreads(kmp_int32 numThreads) { + if (teamId == runtime::numTeams - 1) { + // Last team gets the remaining threads + numThreads = std::min(numThreads, NUM_CORES - masterGtid); + } else { + // Limit thread number + numThreads = std::min(numThreads, NUM_CORES / runtime::numTeams); + } + + DEBUG_PRINT("Team %d has %d threads\n", teamId, numThreads); + + this->numThreads = numThreads; + this->barrier.setNumThreads(numThreads); + } + + inline auto setCopyPrivateData(void *data) { copyPrivateData = data; } + + inline auto getCopyPrivateData() const { return copyPrivateData; } + + inline void run() { + if (runtime::numTeams > 1) { + for (kmp_int32 i = masterGtid + 1; i < masterGtid + numThreads; i++) { + auto &thread = runtime::getThread(i); + thread.setTid(i - masterGtid); + + if (i != masterGtid) { + thread.setCurrentTeam(this); + thread.wakeUp(); + } + } + } else { + for (kmp_int32 i = masterGtid + 1; i < masterGtid + numThreads; i++) { + auto &thread = runtime::getThread(i); + 
thread.setCurrentTeam(this); + } + + wake_up_all(); + mempool_wfi(); + } + } + + inline auto getTeamId() const { return teamId; } + + /** + * @brief Schedule a static for loop. See + * https://github.com/llvm/llvm-project/blob/f28c006a5895fc0e329fe15fead81e37457cb1d1/clang/lib/CodeGen/CGStmtOpenMP.cpp#L2900 + * + * @tparam T Loop index type + * @param loc Source code location + * @param gtid Global thread ID + * @param schedtype Scheduling type + * @param plastiter Pointer to last iteration flag, true if the current thread + * executes the last iteration, false otherwise + * @param plower Pointer to lower bound for this thread + * @param pupper Pointer to upper bound for this thread + * @param pstride Pointer to stride for this thread + * @param chunk Chunk size + */ + template ::type, + typename UnsignedT = typename std::make_unsigned::type> + void forStaticInit(ident_t * /*loc*/, kmp_int32 gtid, + kmp_sched_type schedtype, T *plastiter, T *plower, + T *pupper, SignedT *pstride, SignedT incr, + SignedT chunk) const { + + assert(incr == 1 && "Loop increment is not 1"); + + switch (schedtype) { + case kmp_sch_static: { + + // Calculate chunk size + // https://stackoverflow.com/a/14878734 + chunk = static_cast(*pupper - *plower + 1) / numThreads + + (static_cast(*pupper - *plower + 1) % numThreads != 0); + + // Fall through to static chunked + } + case kmp_sch_static_chunked: { + assert(incr != 0 && "Loop increment must be non-zero"); + assert(chunk > 0 && "Chunk size is not positive"); + assert((static_cast(chunk) <= *pupper - *plower + 1) && + "Chunk size is greater than loop size"); + + kmp_int32 tid = runtime::getThread(gtid).getTid(); + + SignedT numChunks = + (static_cast(*pupper - *plower) + chunk) / chunk; + + SignedT span = incr * chunk; + *pstride = span * static_cast(numThreads); + *plower = *plower + static_cast(tid) * static_cast(span); + *pupper = *plower + static_cast(span - incr); + *plastiter = (tid == (numChunks - 1) % numThreads); + + break; + } + + // Distribute (teams) + case kmp_distribute_static: { + + // Calculate chunk size + // https://stackoverflow.com/a/14878734 + chunk = + static_cast(*pupper - *plower + 1) / runtime::numTeams + + (static_cast(*pupper - *plower + 1) % runtime::numTeams != + 0); + + // Fall through to static chunked + } + case kmp_distribute_static_chunked: { + assert(incr != 0 && "Loop increment must be non-zero"); + assert(chunk > 0 && "Chunk size is not positive"); + assert((static_cast(chunk) <= *pupper - *plower + 1) && + "Chunk size is greater than loop size"); + + SignedT numChunks = + (static_cast(*pupper - *plower) + chunk) / chunk; + + SignedT span = incr * chunk; + *pstride = span * static_cast(runtime::numTeams); + *plower = *plower + static_cast(teamId) * static_cast(span); + *pupper = *plower + static_cast(span - incr); + *plastiter = (teamId == (numChunks - 1) % runtime::numTeams); + + break; + } + default: { + assert(false && "Unsupported scheduling type"); + break; + } + } + } + + template ::type, + typename UnsignedT = typename std::make_unsigned::type> + void dispatchInit(ident_t * /*loc*/, kmp_int32 /*gtid*/, + kmp_sched_type schedtype, T lower, T upper, SignedT incr, + SignedT chunk) { + + assert(incr == 1 && "Loop increment is not 1"); + assert(chunk > 0 && "Chunk size is not positive"); + assert((static_cast(chunk) <= upper - lower + 1) && + "Chunk size is greater than loop size"); + + DEBUG_PRINT("Dispatch init\n"); + + DEBUG_PRINT("Got dynamic schedule\n"); + + switch (schedtype) { + case kmp_sch_dynamic_chunked: { + 
std::lock_guard lock(dynamicSchedule.mutex); + + if (dynamicSchedule.valid) { + DEBUG_PRINT("Dynamic schedule is already valid\n"); + return; + } + + SignedT span = incr * chunk; + + dynamicSchedule.lowerNext = static_cast(lower); + dynamicSchedule.upper = static_cast(upper); + dynamicSchedule.chunk = static_cast(chunk); + dynamicSchedule.incr = incr; + dynamicSchedule.stride = span * static_cast(numThreads); + + DEBUG_PRINT( + "Dynamic schedule: lowerNext=%d, upper=%d, chunk=%d, incr=%d, " + "stride=%d, addr: %p\n", + dynamicSchedule.lowerNext, dynamicSchedule.upper, + dynamicSchedule.chunk, dynamicSchedule.incr, dynamicSchedule.stride, + &dynamicSchedule); + + dynamicSchedule.valid = true; + break; + } + default: { + printf("Unsupported scheduling type: %d\n", schedtype); + assert(false && "Unsupported scheduling type"); + break; + } + }; + } + + template ::type> + bool dispatchNext(ident_t * /*loc*/, kmp_int32 /*gtid*/, SignedT *plastiter, + T *plower, T *pupper, SignedT *pstride) { + + DEBUG_PRINT("Dispatch next\n"); + + std::lock_guard lock(dynamicSchedule.mutex); + assert(dynamicSchedule.valid && "Dynamic schedule is not valid"); + + if (dynamicSchedule.lowerNext > dynamicSchedule.upper) { + DEBUG_PRINT("Dynamic loop done\n"); + if (++dynamicSchedule.numDone == numThreads) { + dynamicSchedule.valid = false; + dynamicSchedule.numDone = 0; + } + + return false; + } + + *plower = static_cast(dynamicSchedule.lowerNext); + + dynamicSchedule.lowerNext += dynamicSchedule.chunk; + if (dynamicSchedule.lowerNext > dynamicSchedule.upper) { + *pupper = static_cast(dynamicSchedule.upper); + *plastiter = true; + } else { + *pupper = static_cast(dynamicSchedule.lowerNext - 1); + *plastiter = false; + } + + *pstride = dynamicSchedule.stride; + + return true; + }; + + inline void copyPrivate(ident_t * /*loc*/, kmp_int32 gtid, + size_t /*cpy_size*/, void *cpy_data, + void (*cpy_func)(void *, void *), kmp_int32 didit) { + (void)gtid; + + if (didit != 0) { + copyPrivateData = cpy_data; + DEBUG_PRINT("Thread %d set copyprivate data to %p\n", gtid, cpy_data); + } + + barrier.wait(); + + if (didit == 0) { + DEBUG_PRINT("Thread %d copying copyprivate data from %p to %p\n", gtid, + copyPrivateData, cpy_data); + cpy_func(cpy_data, copyPrivateData); + } + + barrier.wait(); + }; + +private: + kmp_int32 masterGtid = 0; + kmp_int32 teamId = 0; + kmp_int32 numThreads = 1; + + Barrier barrier; + + DynamicSchedule dynamicSchedule; + + void *copyPrivateData = nullptr; + + Task implicitTask; +}; + +} // namespace kmp diff --git a/software/runtime/kmp/thread.cpp b/software/runtime/kmp/thread.cpp new file mode 100644 index 000000000..9f803728a --- /dev/null +++ b/software/runtime/kmp/thread.cpp @@ -0,0 +1,123 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/thread.hpp" +#include "kmp/team.hpp" +#include "kmp/util.hpp" + +extern "C" { +#include "runtime.h" +} + +namespace kmp { + +Thread::Thread(kmp_int32 gtid) : Thread(gtid, gtid) {} + +Thread::Thread(kmp_int32 gtid, kmp_int32 tid) + : gtid(gtid), tid(tid), + currentTeam(gtid == 0 ? 
&runtime::defaultTeam : nullptr){}; +; + +void Thread::run() { + while (true) { + DEBUG_PRINT("Thread %d went to sleep\n", gtid); + mempool_wfi(); + std::lock_guard lock(running); + + DEBUG_PRINT("Thread %d woke up\n", gtid); + + if (currentTeam != nullptr && !teamsRegion.has_value()) { + + (*currentTeam).getImplicitTask().run(gtid, tid); + DEBUG_PRINT("Done running task\n"); + + Team *prevTeam = currentTeam; + currentTeam = nullptr; + tid = gtid; + + (*prevTeam).getBarrier().wait(); + + } else if (teamsRegion.has_value()) { + teamsRegion->run(gtid, tid); + DEBUG_PRINT("Done running teams region\n"); + + teamsRegion.reset(); + + delete currentTeam; + currentTeam = nullptr; + tid = gtid; + + runtime::teamsBarrier.wait(); + + } else { + DEBUG_PRINT("Thread %d woke up to no work. currentTeam: %p, " + "teamsRegion.has_value(): %d\n", + gtid, currentTeam, teamsRegion.has_value()); + } + } +}; + +void Thread::forkCall(Task parallelRegion) { + kmp_int32 numThreads = this->requestedNumThreads.value_or(NUM_CORES); + this->requestedNumThreads.reset(); + + DEBUG_PRINT("Forking call with %d threads\n", numThreads); + + Team *team = currentTeam; + + // Setup + team->setNumThreads(numThreads); + team->setImplicitTask(parallelRegion); + + // Run on all threads + team->run(); + parallelRegion.run(gtid, tid); + + DEBUG_PRINT("Done running task\n"); + DEBUG_PRINT("Fork call done\n"); + + team->getBarrier().wait(); +}; + +void Thread::forkTeams(Task teamsRegion) { + runtime::numTeams = runtime::requestedNumTeams.value_or(NUM_GROUPS); + runtime::teamsBarrier.setNumThreads(runtime::numTeams); + runtime::requestedNumTeams.reset(); + + DEBUG_PRINT("Forking call with %d teams\n", runtime::numTeams); + + kmp_int32 coresPerTeam = NUM_CORES / runtime::numTeams; + + for (kmp_int32 i = 1; i < runtime::numTeams; i++) { + kmp_int32 coreId = i * coresPerTeam; + + Thread &thread = runtime::getThread(coreId); + + thread.setCurrentTeam(new Team(coreId, i)); + thread.setTeamsRegion(teamsRegion); + thread.setTid(0); + + if (runtime::requestedThreadLimit) { + thread.requestNumThreads(runtime::requestedThreadLimit.value()); + } + + thread.wakeUp(); + } + + this->setTeamsRegion(teamsRegion); + if (runtime::requestedThreadLimit) { + this->requestNumThreads(runtime::requestedThreadLimit.value()); + } + + teamsRegion.run(gtid, tid); + this->teamsRegion.reset(); + + DEBUG_PRINT("Fork teams done\n"); + + runtime::teamsBarrier.wait(); + + runtime::numTeams = 1; +}; + +} // namespace kmp diff --git a/software/runtime/kmp/thread.hpp b/software/runtime/kmp/thread.hpp new file mode 100644 index 000000000..401c8612a --- /dev/null +++ b/software/runtime/kmp/thread.hpp @@ -0,0 +1,80 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "kmp/task.hpp" +#include "kmp/types.h" +#include "kmp/util.hpp" +#include "runtime.h" + +namespace kmp { + +// Forward declaration +class Team; + +class Thread { + +public: + Thread(kmp_int32 gtid); + Thread(kmp_int32 gtid, kmp_int32 tid); + + Thread(const Thread &other) = delete; + Thread(Thread &&) = delete; + Thread &operator=(const Thread &) = delete; + Thread &operator=(Thread &&) = delete; + + ~Thread() = default; + + void run(); + + inline void wakeUp() { + std::lock_guard lock(running); + DEBUG_PRINT("Waking up thread %d\n", gtid); + wake_up(static_cast(gtid)); + }; + + inline Team *getCurrentTeam() { return currentTeam; }; + + inline void setCurrentTeam(Team *team) { + DEBUG_PRINT("Setting current team for %d: %p\n", this->gtid, team); + currentTeam = team; + }; + + inline void setTeamsRegion(Task teamsRegion) { + this->teamsRegion = teamsRegion; + }; + + inline auto getGtid() const { return gtid; }; + + inline auto getTid() const { return tid; }; + + inline void setTid(kmp_int32 tid) { this->tid = tid; }; + + inline bool isRunning() { return running.isLocked(); }; + + inline void requestNumThreads(kmp_int32 numThreads) { + this->requestedNumThreads = numThreads; + } + + void forkCall(Task parallelRegion); + + void forkTeams(Task teamsRegion); + +private: + kmp_int32 gtid; + kmp_int32 tid; + + Mutex running; + + Team *currentTeam; + + // Contains task if this is the initial thread (master) of the teams region + std::optional teamsRegion; + + std::optional requestedNumThreads; +}; +}; // namespace kmp diff --git a/software/runtime/kmp/types.h b/software/runtime/kmp/types.h new file mode 100644 index 000000000..ea4c2019f --- /dev/null +++ b/software/runtime/kmp/types.h @@ -0,0 +1,140 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +typedef uint32_t kmp_uint32; +typedef int32_t kmp_int32; +typedef uint64_t kmp_uint64; +typedef int64_t kmp_int64; + +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} ident_t; + +// NOLINTNEXTLINE(cppcoreguidelines-*,readability-*) +typedef kmp_int32 kmp_critical_name[8]; + +typedef void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...); +typedef void (*kmpc_micro_bound)(kmp_int32 *bound_tid, kmp_int32 *bound_nth, + ...); + +enum kmp_sched_type : kmp_int32 { + kmp_sch_lower = 32, /**< lower bound for unordered values */ + kmp_sch_static_chunked = 33, + kmp_sch_static = 34, /**< static unspecialized */ + kmp_sch_dynamic_chunked = 35, + kmp_sch_guided_chunked = 36, /**< guided unspecialized */ + kmp_sch_runtime = 37, + kmp_sch_auto = 38, /**< auto */ + kmp_sch_trapezoidal = 39, + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_static_greedy = 40, + kmp_sch_static_balanced = 41, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_guided_iterative_chunked = 42, + kmp_sch_guided_analytical_chunked = 43, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_static_steal = 44, + + /* static with chunk adjustment (e.g., simd) */ + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, /**< guided with chunk adjustment */ + kmp_sch_runtime_simd = 47, /**< runtime with chunk adjustment */ + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_upper, /**< upper bound for unordered values */ + + kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */ + kmp_ord_static_chunked = 65, + kmp_ord_static = 66, /**< ordered static unspecialized */ + kmp_ord_dynamic_chunked = 67, + kmp_ord_guided_chunked = 68, + kmp_ord_runtime = 69, + kmp_ord_auto = 70, /**< ordered auto */ + kmp_ord_trapezoidal = 71, + kmp_ord_upper, /**< upper bound for ordered values */ + + /* Schedules for Distribute construct */ + kmp_distribute_static_chunked = 91, /**< distribute static chunked */ + kmp_distribute_static = 92, /**< distribute static unspecialized */ + + /* For the "nomerge" versions, kmp_dispatch_next*() will always return a + single iteration/chunk, even if the loop is serialized. For the schedule + types listed above, the entire iteration vector is returned if the loop is + serialized. This doesn't work for gcc/gcomp sections. 
*/ + kmp_nm_lower = 160, /**< lower bound for nomerge values */ + + kmp_nm_static_chunked = + (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower), + kmp_nm_static = 162, /**< static unspecialized */ + kmp_nm_dynamic_chunked = 163, + kmp_nm_guided_chunked = 164, /**< guided unspecialized */ + kmp_nm_runtime = 165, + kmp_nm_auto = 166, /**< auto */ + kmp_nm_trapezoidal = 167, + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_nm_static_greedy = 168, + kmp_nm_static_balanced = 169, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_nm_guided_iterative_chunked = 170, + kmp_nm_guided_analytical_chunked = 171, + kmp_nm_static_steal = + 172, /* accessible only through OMP_SCHEDULE environment variable */ + + kmp_nm_ord_static_chunked = 193, + kmp_nm_ord_static = 194, /**< ordered static unspecialized */ + kmp_nm_ord_dynamic_chunked = 195, + kmp_nm_ord_guided_chunked = 196, + kmp_nm_ord_runtime = 197, + kmp_nm_ord_auto = 198, /**< auto */ + kmp_nm_ord_trapezoidal = 199, + kmp_nm_upper, /**< upper bound for nomerge values */ + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. Since + we need to distinguish the three possible cases (no modifier, monotonic + modifier, nonmonotonic modifier), we need separate bits for each modifier. + The absence of monotonic does not imply nonmonotonic, especially since 4.5 + says that the behaviour of the "no modifier" case is implementation defined + in 4.5, but will become "nonmonotonic" in 5.0. + + Since we're passing a full 32 bit value, we can use a couple of high bits + for these flags; out of paranoia we avoid the sign bit. + + These modifiers can be or-ed into non-static schedules by the compiler to + pass the additional information. They will be stripped early in the + processing in __kmp_dispatch_init when setting up schedules, so most of the + code won't ever see schedules with these bits set. */ + kmp_sch_modifier_monotonic = + (1 << 29), /**< Set if the monotonic schedule modifier was present */ + kmp_sch_modifier_nonmonotonic = + (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */ + +// NOLINTBEGIN(cppcoreguidelines-macro-usage) +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (kmp_sched_type)( \ + (s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0) +#define SCHEDULE_GET_MODIFIERS(s) \ + ((kmp_sched_type)((s) & (kmp_sch_modifier_nonmonotonic | \ + kmp_sch_modifier_monotonic))) +#define SCHEDULE_SET_MODIFIERS(s, m) \ + ((s) = (kmp_sched_type)((kmp_int32)(s) | (kmp_int32)(m))) +#define SCHEDULE_NONMONOTONIC 0 +#define SCHEDULE_MONOTONIC 1 + // NOLINTEND(cppcoreguidelines-macro-usage) + // + kmp_sch_default = kmp_sch_static /**< default scheduling algorithm */ +}; diff --git a/software/runtime/kmp/util.hpp b/software/runtime/kmp/util.hpp new file mode 100644 index 000000000..4ca2baa34 --- /dev/null +++ b/software/runtime/kmp/util.hpp @@ -0,0 +1,196 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include "printf.h" + +extern "C" { +#include "alloc.h" +} + +namespace kmp { +class Mutex; +} + +extern kmp::Mutex allocLock; + +namespace kmp { + +#ifndef NDEBUG +#define DEBUG_PRINT(...) printf(__VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +class Mutex { +public: + inline void lock() { + while (locked.exchange(true, std::memory_order_acquire)) { + } + } + + inline bool tryLock() { + return !locked.exchange(true, std::memory_order_acquire); + } + + inline bool isLocked() { return locked.load(std::memory_order_acquire); } + + inline void unlock() { locked.store(false, std::memory_order_release); } + +private: + std::atomic locked = false; +}; + +template class SharedPointer { +public: + SharedPointer() : refCount(nullptr), ptr(nullptr) {} + + explicit SharedPointer(T *ptr) + : refCount(new std::atomic(1)), ptr(ptr) {} + + SharedPointer(const SharedPointer &other) + : refCount(other.refCount), ptr(other.ptr) { + (*refCount)++; + } + + SharedPointer(SharedPointer &&other) noexcept + : refCount(other.refCount), ptr(other.ptr) { + other.ptr = nullptr; + other.refCount = nullptr; + } + + SharedPointer &operator=(SharedPointer &&other) noexcept { + if (this != &other) { + std::swap(ptr, other.ptr); + std::swap(refCount, other.refCount); + } + return *this; + } + + SharedPointer &operator=(const SharedPointer &other) { + if (this != &other) { + ptr = other.ptr; + refCount = other.refCount; + (*refCount)++; + } + return *this; + } + + ~SharedPointer() { + if (refCount == nullptr) { + return; + } + + if (--(*refCount) == 0) { + delete ptr; + delete refCount; + } + } + + T *get() { return ptr; } + const T *get() const { return ptr; } + + T *operator->() { return ptr; } + const T *operator->() const { return ptr; } + + T &operator*() { return *ptr; } + const T &operator*() const { return *ptr; } + + inline void reset() { + DEBUG_PRINT("Resetting shared pointer\n"); + if (refCount == nullptr) { + return; + } + + if (--(*refCount) == 0) { + delete ptr; + delete refCount; + } + + refCount = nullptr; + ptr = nullptr; + } + +private: + std::atomic *refCount; + T *ptr; +}; + +template class UniquePointer { +public: + explicit UniquePointer(T *ptr) : ptr(ptr) {} + + // Move constructor + UniquePointer(UniquePointer &&other) noexcept : ptr(other.ptr) { + other.ptr = nullptr; + } + + // Move assignment + UniquePointer &operator=(UniquePointer &&other) noexcept { + if (this != &other) { + delete ptr; + ptr = other.ptr; + other.ptr = nullptr; + } + return *this; + } + + // Disallow copy constructor and assignment + UniquePointer(const UniquePointer &other) = delete; + UniquePointer &operator=(const UniquePointer &other) = delete; + + ~UniquePointer() { delete ptr; } + + T *get() { return ptr; } + const T *get() const { return ptr; } + + T *operator->() { return ptr; } + const T *operator->() const { return ptr; } + + T &operator*() { return *ptr; } + const T &operator*() const { return *ptr; } + +private: + T *ptr; +}; + +template struct Allocator { + typedef T value_type; + + Allocator() = default; + + template + constexpr Allocator(const Allocator & /*unused*/) noexcept {} + + [[nodiscard]] T *allocate(std::size_t n) { + assert(n <= std::numeric_limits::max() / sizeof(T)); + std::lock_guard lock(allocLock); + + DEBUG_PRINT("Allocating %lu bytes\n", n * sizeof(T)); + if (auto ptr = static_cast(simple_malloc(n * sizeof(T)))) { + DEBUG_PRINT("Allocated %lu bytes at %p\n", n * sizeof(T), ptr); + return 
ptr; + } + + assert(false && "Allocation failed"); + return nullptr; + } + + void deallocate(T *ptr, std::size_t n) noexcept { + (void)n; + + std::lock_guard lock(allocLock); + + simple_free(ptr); + DEBUG_PRINT("Deallocated %lu bytes at %p\n", n * sizeof(T), ptr); + } +}; + +} // namespace kmp diff --git a/software/runtime/link.ld b/software/runtime/link.ld index e03fc7ae1..d710ebe62 100644 --- a/software/runtime/link.ld +++ b/software/runtime/link.ld @@ -16,11 +16,18 @@ SECTIONS { . = __seq_end; } > l1 + /* BSS on L1 */ + .bss : { + __bss_start = .; + *(.bss) + *(.sbss .sbss2 .sbss2.* .gnu.linkonce.sb2.*); + __bss_end = .; + } > l1 + /* Interleaved region on L1 */ .l1 (NOLOAD): { *(.l1_prio) *(.l1) - *(.bss) __l1_alloc_base = ALIGN(0x10); __heap_start = .; } > l1 @@ -35,6 +42,14 @@ SECTIONS { _etext = .; } > l2 + /* Init array on L2 */ + .init_array : { + HIDDEN (__init_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*))) + KEEP (*(.init_array)) + HIDDEN (__init_array_end = .); + } > l2 + /* RO Data on L2 */ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) @@ -51,20 +66,16 @@ SECTIONS { . = ALIGN(0x10); *(.data) } > l2 + .sdata2 : { *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) } > l2 + .sdata : { __global_pointer$ = . + 0x800; *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*) *(.sdata .sdata.* .gnu.linkonce.s.*) - } > l2 - - .bss : { - __bss_start = .; - *(.bss) - *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*); - __bss_end = .; + . = ALIGN(0x10); } > l2 .l2 : { @@ -75,5 +86,5 @@ SECTIONS { .comment : { *(.comment) - } > l2 + } } diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 4abdbd682..a1714bdb1 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -140,6 +140,7 @@ static inline void wake_up_tile(uint32_t group_id, uint32_t tile_mask) { case 0: wake_up_tile_g0_reg = tile_mask; break; + case 1: wake_up_tile_g1_reg = tile_mask; break; diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 69d309158..3c5ad5958 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -20,9 +20,11 @@ LLVM_INSTALL_DIR ?= $(INSTALL_DIR)/llvm # HALIDE_INSTALL_DIR ?= $(INSTALL_DIR)/halide # HALIDE_INCLUDE ?= $(HALIDE_INSTALL_DIR)/include # HALIDE_LIB ?= $(HALIDE_INSTALL_DIR)/lib -OMP_DIR ?= $(ROOT_DIR)/omp +GOMP_DIR ?= $(ROOT_DIR)/gomp +KMP_DIR ?= $(ROOT_DIR)/kmp KERNELS_DIR ?= $(abspath $(ROOT_DIR)/../kernels) DATA_DIR ?= $(abspath $(ROOT_DIR)/../data) +EXT_DIR ?= $(abspath $(ROOT_DIR)/../ext) COMPILER ?= gcc XPULPIMG ?= $(xpulpimg) @@ -42,7 +44,7 @@ ifeq ($(COMPILER),gcc) # Define __XPULPIMG if the extension is active DEFINES += -D__XPULPIMG else - RISCV_ARCH_AS ?= rv$(RISCV_ARCH)ima + RISCV_ARCH ?= rv$(RISCV_XLEN)ima RISCV_ARCH_AS ?= $(RISCV_ARCH)Xpulpv2 endif # GCC Toolchain @@ -50,7 +52,8 @@ ifeq ($(COMPILER),gcc) RISCV_CC ?= $(RISCV_PREFIX)gcc RISCV_CXX ?= $(RISCV_PREFIX)g++ RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump - + # OMP runtime + OMP_DIR ?= $(GOMP_DIR) else # Use LLVM by default @@ -77,7 +80,8 @@ else RISCV_CC ?= $(LLVM_INSTALL_DIR)/bin/clang RISCV_CXX ?= $(LLVM_INSTALL_DIR)/bin/clang++ RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump - + # OMP runtime + OMP_DIR ?= $(KMP_DIR) endif RISCV_OBJCOPY ?= $(RISCV_PREFIX)objcopy RISCV_AS ?= $(RISCV_PREFIX)as @@ -109,12 +113,16 @@ ifdef terapool DEFINES += -DNUM_TILES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)/$(num_sub_groups_per_group)}') endif +ifdef NDEBUG + DEFINES += -DNDEBUG +endif + 
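+# Illustrative usage (assumed invocation, mirroring what scripts/run_benchmarks.py sets up): +#   COMPILER=llvm NDEBUG=1 make -C software/apps/omp all   # KMP runtime, DEBUG_PRINT/asserts compiled out +#   COMPILER=gcc make -C software/apps/omp all             # GOMP runtime, debug output kept +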
# Specify cross compilation target. This can be omitted if LLVM is built with riscv as default target RISCV_LLVM_TARGET ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RISCV_TARGET) --gcc-toolchain=$(GCC_INSTALL_DIR) RISCV_WARNINGS += -Wunused-variable -Wconversion -Wall -Wextra # -Werror RISCV_FLAGS_COMMON_TESTS ?= -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -I$(ROOT_DIR) -I$(KERNELS_DIR) -I$(DATA_DIR) -static -RISCV_FLAGS_COMMON ?= $(RISCV_FLAGS_COMMON_TESTS) -g -std=gnu99 -O3 -fno-builtin-memcpy -fno-builtin-memset -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS) +RISCV_FLAGS_COMMON ?= $(RISCV_FLAGS_COMMON_TESTS) -gdwarf-4 -O3 -fno-builtin-memcpy -fno-builtin-memset -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS) RISCV_FLAGS_GCC ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS) -mtune=mempool -fno-tree-loop-distribute-patterns # -falign-loops=32 -falign-jumps=32 RISCV_FLAGS_LLVM ?= -mcmodel=small -mcpu=mempool-rv32 -mllvm -misched-topdown -menable-experimental-extensions # Enable soft-divsqrt when the hardware is not supported. @@ -124,16 +132,16 @@ ifeq ($(xDivSqrt), 0) endif ifeq ($(COMPILER),gcc) - RISCV_CCFLAGS += $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) - RISCV_CXXFLAGS += $(RISCV_CCFLAGS) + RISCV_CCFLAGS += $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) -std=gnu99 + RISCV_CXXFLAGS += $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) -std=c++17 RISCV_LDFLAGS += -static -nostartfiles -lm -lgcc $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) -L$(ROOT_DIR) RISCV_OBJDUMP_FLAGS += --disassembler-option="march=$(RISCV_ARCH_AS)" # For unit tests RISCV_CCFLAGS_TESTS ?= $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON_TESTS) -fvisibility=hidden -nostdlib $(RISCV_LDFLAGS) else - RISCV_CCFLAGS += $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_LLVM) $(RISCV_FLAGS_COMMON) - RISCV_CXXFLAGS += $(RISCV_CCFLAGS) - RISCV_LDFLAGS += -static -nostartfiles -lm -lgcc -mcmodel=small $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_COMMON) -L$(ROOT_DIR) + RISCV_CCFLAGS += $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_LLVM) $(RISCV_FLAGS_COMMON) -std=gnu99 + RISCV_CXXFLAGS += $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_LLVM) $(RISCV_FLAGS_COMMON) -std=c++17 -fno-exceptions -fno-threadsafe-statics + RISCV_LDFLAGS += -static -nostartfiles -nostdlib -lgcc -lm -fuse-ld=lld -mcmodel=small $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_COMMON) -L$(ROOT_DIR) RISCV_OBJDUMP_FLAGS += --mcpu=mempool-rv32 ifeq ($(xDivSqrt), 0) RISCV_OBJDUMP_FLAGS += --mattr=+m,+a,+nofdiv,+xpulpmacsi,+xpulppostmod,+xpulpvect,+xpulpvectshufflepack,+zfinx @@ -154,7 +162,7 @@ RUNTIME += $(ROOT_DIR)/serial.c.o RUNTIME += $(ROOT_DIR)/string.c.o RUNTIME += $(ROOT_DIR)/synchronization.c.o -OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c")) +OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c" -o -name "*.cpp")) .INTERMEDIATE: $(RUNTIME) $(OMP_RUNTIME) $(LINKER_SCRIPT) # Disable builtin rules diff --git a/software/runtime/testing.h b/software/runtime/testing.h new file mode 100644 index 000000000..324bd788c --- /dev/null +++ b/software/runtime/testing.h @@ -0,0 +1,116 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Inspired by https://jera.com/techinfo/jtns/jtn002 + +#pragma once + +#include "printf.h" + +#define MAX_TESTS 20 + +typedef struct { + const char *name; + void (*func)(char const **out_error_message); +} test_t; + +test_t tests[MAX_TESTS]; // NOLINT + +int tests_run = 0; // NOLINT(*-global-variables) +int tests_failed = 0; // NOLINT(*-global-variables) +int num_tests = 0; // NOLINT(*-global-variables) + +#ifndef NDEBUG +#define DEBUG_PRINT(...) printf(__VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +// NOLINTNEXTLINE +#define STRINGIFY(x) #x +// NOLINTNEXTLINE +#define TOSTRING(x) STRINGIFY(x) +#define LINE_STRING TOSTRING(__LINE__) + +// NOLINTNEXTLINE +#define ASSERT_TRUE(condition, error_message) \ + do { \ + if (!(condition)) { \ + *out_error_message = (error_message); \ + return; \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define ASSERT_EQ(left, right) \ + do { \ + if (!((left) == (right))) { \ + *out_error_message = \ + #left " is not equal to " #right ", " __FILE__ ":" LINE_STRING; \ + return; \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define ASSERT_NEQ(left, right) \ + do { \ + if (!((left) != (right))) { \ + *out_error_message = \ + #left " is equal to " #right ", " __FILE__ ":" LINE_STRING; \ + return; \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define EXPECT_TRUE(condition, error_message) \ + do { \ + if (!(condition)) \ + printf("\t[CHECK FAILED]: %s, %s:%d\n", (error_message), __FILE__, \ + __LINE__); \ + } while (0) + +// NOLINTNEXTLINE +#define RUN_TEST(test) \ + do { \ + printf("\n[RUNNING]: %s \n", (test).name); \ + const char *message = NULL; \ + ((test).func)(&message); \ + tests_run++; \ + if (message != NULL) { \ + tests_failed++; \ + printf("\t[ASSERTION FAILED]: %s\n", message); \ + printf("[FAIL]: %s\n", (test).name); \ + } else { \ + printf("[SUCCESS]: %s\n", (test).name); \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define RUN_ALL_TESTS() \ + do { \ + for (int i = 0; i < num_tests; i++) { \ + RUN_TEST(tests[i]); \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define PRINT_TEST_RESULTS() \ + do { \ + printf("Ran %d tests\n", tests_run); \ + printf("Failed %d tests\n", tests_failed); \ + } while (0) + +// NOLINTNEXTLINE +#define TEST(testname) \ + void testname(char const **out_error_message); \ + __attribute__((constructor)) void add_##testname##_to_array(void) { \ + if (num_tests < MAX_TESTS) { \ + tests[num_tests].name = #testname; \ + tests[num_tests].func = testname; \ + num_tests++; \ + } else { \ + printf("Too many tests added, max is %d\n", MAX_TESTS); \ + } \ + } \ + inline void testname(char const **out_error_message) diff --git a/software/scripts/plot_benchmarks.py b/software/scripts/plot_benchmarks.py new file mode 100644 index 000000000..cd2324656 --- /dev/null +++ b/software/scripts/plot_benchmarks.py @@ -0,0 +1,152 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import textwrap +import subprocess +import re +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +HARDWARE_DIR = "../../hardware" +APPS_DIR = "../apps" +OMP_APPS_DIR = APPS_DIR + "/omp" +UART_REGEX = re.compile(r"\[UART\] (.*): (\d+)") +GIT_COMMIT_HASH = subprocess.check_output( + ["git", "describe", "--always", "--dirty"]).strip().decode("utf-8") +OUTPUT = f'results/{GIT_COMMIT_HASH}/results.csv' + + +def plot_speedup(df): + # Separate LLVM and GCC data + llvm_data = df[df['compiler'] == 'llvm'] + gcc_data = df[df['compiler'] == 'gcc'] + + # Merge LLVM and GCC data on 'app' and 'name' + joined_data = pd.merge(llvm_data, gcc_data, on=[ + 'app', 'name'], suffixes=('_llvm', '_gcc')) + + # Calculate the speedup (GCC cycles / LLVM cycles) + joined_data['speedup'] = joined_data['cycles_gcc'] / \ + joined_data['cycles_llvm'] + + # Get unique applications + apps = joined_data['app'].unique() + + # Create bar plots for each application showing speedup + for app in apps: + # Filter data for the current app + app_data = joined_data[joined_data['app'] == app] + + # Unique test names + test_names = app_data['name'] + test_names = ['\n'.join(textwrap.wrap(name, width=10)) + for name in test_names] + + # Speedup values + speedup_values = app_data['speedup'] + + # Bar positions and bar width + bar_width = 0.4 # Width of the bars + x = np.arange(len(test_names)) # Position for bars + + # Create the bar plot + plt.figure() + + # Plot speedup values + bars = plt.bar(x, speedup_values, width=bar_width, + color='green', label='LLVM Speedup') + + # Add value labels on top of each bar + for bar, value in zip(bars, speedup_values): + height = max(1, bar.get_height()) + _, top = plt.ylim() + plt.ylim(top=max(top, height + 0.3)) + plt.text(bar.get_x() + bar.get_width() / 2, height + + 0.05, f'{value:.2f}', ha='center', va='bottom') + + # Set x-axis labels, title, etc. 
+ plt.ylabel('GCC/LLVM Ratio (Speedup)') + plt.title(f'Speedup of LLVM Against GCC for {app}') + plt.xticks(x, test_names, rotation=45, ha='right') + plt.axhline(y=1, color='red', linestyle='--', + linewidth=1, label='Baseline (GCC)') + plt.legend() + plt.tight_layout() # Adjust layout for readability + plt.savefig(f'results/{GIT_COMMIT_HASH}/{app}_speedup.png') + # plt.show() # Display plot + + +def plot_cycles(df): + # Get unique applications + apps = df['app'].unique() + + # Create bar plots for each application showing raw cycles for LLVM and GCC + for app in apps: + # Filter data for the current app + app_data = df[df['app'] == app] + + # Unique test names + test_names = app_data['name'].unique() + + # Initialize arrays for LLVM and GCC cycles + llvm_cycles = [] + gcc_cycles = [] + + # Iterate over test names and align the cycles + for test in test_names: + llvm_cycle = app_data[(app_data['name'] == test) & ( + app_data['compiler'] == 'llvm')]['cycles'] + gcc_cycle = app_data[(app_data['name'] == test) & ( + app_data['compiler'] == 'gcc')]['cycles'] + + # Add cycles only if both GCC and LLVM data are available for the + # test + if not llvm_cycle.empty and not gcc_cycle.empty: + llvm_cycles.append(llvm_cycle.iloc[0]) + gcc_cycles.append(gcc_cycle.iloc[0]) + elif not llvm_cycle.empty: + llvm_cycles.append(llvm_cycle.iloc[0]) + gcc_cycles.append(0) # Add a placeholder 0 for GCC + elif not gcc_cycle.empty: + gcc_cycles.append(gcc_cycle.iloc[0]) + llvm_cycles.append(0) # Add a placeholder 0 for LLVM + + # Bar width and x-positions with closer spacing + bar_width = 0.4 # Width of the bars + x = np.arange(len(test_names)) # Position for bars + + # Create the bar plot + plt.figure(figsize=(10, 6)) + + # Plot LLVM cycles if available + if llvm_cycles: + plt.bar(x, llvm_cycles, width=bar_width, label='LLVM') + + # Plot GCC cycles if available + if gcc_cycles: + plt.bar(x + bar_width, gcc_cycles, width=bar_width, label='GCC') + + # Set x-axis labels, title, etc. + plt.ylabel('Cycles') + plt.title(f'Cycles Comparison for {app}') + plt.xticks(x + bar_width / 2, test_names, rotation=45, ha='right') + plt.legend() + plt.tight_layout() # Adjust layout for readability + plt.savefig(f'results/{GIT_COMMIT_HASH}/{app}_cycles.png') + # plt.show() # Display plot + + +def main(): + df = pd.read_csv(OUTPUT) + + if ("dirty" in GIT_COMMIT_HASH): + print("WARNING: The current commit is dirty.") + + plot_speedup(df) + plot_cycles(df) + + +if __name__ == '__main__': + main() diff --git a/software/scripts/run_benchmarks.py b/software/scripts/run_benchmarks.py new file mode 100644 index 000000000..24c9cc15d --- /dev/null +++ b/software/scripts/run_benchmarks.py @@ -0,0 +1,81 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import os +import re +import pandas as pd +import runner +from pprint import pp + +APPS_DIR = "../apps" +OMP_APPS_DIR = APPS_DIR + "/omp" +UART_REGEX = re.compile(r"\[UART\] ((?!.*\bresult\b).*): (\d+)", re.IGNORECASE) +GIT_COMMIT_HASH = subprocess.check_output( + ["git", "describe", "--always", "--dirty"]).strip().decode("utf-8") +OUTPUT = f"results/{GIT_COMMIT_HASH}/results.csv" + +results = pd.DataFrame(columns=["app", "name", "compiler", "cycles"]) + + +def compileAll(dir, env): + return subprocess.run(["make", "-C", dir, "all"], env=env).returncode == 0 + + +def runAll(dir, args, env): + global results + compiler = env["COMPILER"] + + for app in os.listdir(dir): + try: + if (os.path.isfile(os.path.join(dir, app)) or app.startswith(".")): + continue + + app_dir = f"{os.path.basename(dir)}/{app}" + + (res, reason, output) = runner.run( + app_dir, args, env, lambda x: None) + if not res: + print(f"{app} did not run successfully") + print(reason) + + matches = UART_REGEX.findall(output) + for match in matches: + results = pd.concat([results, pd.DataFrame( + [{"app": app, "name": + match[0], "compiler": + compiler, "cycles": + int(match[1])}])]) + + pp(results) + print() + results.to_csv(OUTPUT, index=False) + + except KeyboardInterrupt: + continue + + +def main(): + parser = runner.get_arg_parser() + args = parser.parse_args() + + env = os.environ + + env["config"] = args.config + if not args.debug: + env["NDEBUG"] = "1" + + os.makedirs(f'results/{GIT_COMMIT_HASH}', exist_ok=True) + + for compiler in (["gcc", "llvm"] if args.compiler is None else + [args.compiler]): + env["COMPILER"] = compiler + if compileAll(OMP_APPS_DIR, env): + runAll(OMP_APPS_DIR, args, env) + else: + print(f"Failed to compile with {compiler}") + + +if __name__ == '__main__': + main() diff --git a/software/scripts/run_tests.py b/software/scripts/run_tests.py new file mode 100644 index 000000000..4221e13dc --- /dev/null +++ b/software/scripts/run_tests.py @@ -0,0 +1,108 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import os +import re +import fnmatch +import runner + +DIR = os.path.dirname(os.path.realpath(__file__)) +SOFTWARE_DIR = os.path.join(DIR, "../") +TESTS_DIR = os.path.join(SOFTWARE_DIR, "tests") +BIN_DIR = os.path.join(DIR, "../bin") +TESTS_BIN_DIR = os.path.join(BIN_DIR, "tests") + +RED = "\033[91m" +GREEN = "\033[92m" +YELLOW = "\033[93m" +RESET = "\033[0m" + + +def parse_line(line, stats): + running_re = re.compile(r"\[RUNNING\]:\s+(.*)") + success_re = re.compile(r"\[SUCCESS\]:\s+(.*)") + fail_re = re.compile(r"\[FAIL\]:\s+(.*)") + failures_re = re.compile(r"\[\w+ FAILED\]:\s+(.*)") + + if m := running_re.search(line): + stats["num_tests"] += 1 + print(f"{YELLOW}[RUNNING]{RESET}: {m.group(1)}") + elif m := success_re.search(line): + print(f"{GREEN}[SUCCESS]{RESET}: {m.group(1)}") + stats["num_success"] += 1 + elif m := fail_re.search(line): + print(f"{RED}[FAIL]{RESET}: {m.group(1)}") + elif m := failures_re.search(line): + print(f" {RED}[FAIL]{RESET}: {m.group(1)}") + + +def print_results(stats): + color = (GREEN if stats["num_success"] > 0 and + stats["num_success"] == stats["num_tests"] else RED) + print( + f'{color}' + f'[RESULT]{RESET}: ' + f'{stats["num_success"]}/{stats["num_tests"]} tests passed' + ) + + +def main(): + parser = runner.get_arg_parser() + parser.add_argument( + "tests", type=str, nargs="+", help="Tests to run (glob matching)" + ) + parser.add_argument( + "-r", + "--repetitions", + type=int, + default=10, + help="Test repetitions (not all tests use this)", + ) + + args = parser.parse_args() + + matching_tests = [] + for test in args.tests: + for root, dirs, _ in os.walk(TESTS_DIR): + for d in dirs: + full_path = os.path.relpath(os.path.join(root, d), TESTS_DIR) + if fnmatch.fnmatch(full_path, test): + matching_tests.append(full_path) + + if not matching_tests: + print("No tests found matching the pattern") + return + + print(f"Running tests: {matching_tests}") + + env = os.environ + env["REPETITIONS"] = str(args.repetitions) + + for test in sorted(set(matching_tests)): + print() + + testpath = os.path.join(TESTS_DIR, test) + + if args.compiler and not runner.compile(testpath, args, env): + continue + + if not os.path.exists(os.path.join(TESTS_BIN_DIR, test)): + print(f"Test {test} not found") + continue + + stats = {"num_tests": 0, "num_success": 0} + + res, reason, out = runner.run( + test, args, env, + lambda line: parse_line(line, stats)) + + if not res: + print(f"{RED}[FAIL]{RESET}: {reason}") + stats["num_tests"] = "?" + + print_results(stats) + + +if __name__ == "__main__": + main() diff --git a/software/scripts/runner.py b/software/scripts/runner.py new file mode 100644 index 000000000..a624cbfe7 --- /dev/null +++ b/software/scripts/runner.py @@ -0,0 +1,208 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import subprocess +import threading +import queue +import time +import signal + +DIR = os.path.dirname(os.path.realpath(__file__)) +HARDWARE_DIR = os.path.join(DIR, "../../hardware") + + +# https://stackoverflow.com/a/4791612 +def kill_proc(proc): + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) + + +def enqueue_output(out, queue): + try: + for line in iter(out.readline, ""): + queue.put(line) + except Exception as e: + print(e) + pass + + +def compile(prog, args, env_extra): + if not os.path.exists(prog): + print(f"{prog} not found") + return False + + env = os.environ + env["config"] = args.config + env["COMPILER"] = args.compiler + env |= env_extra if env_extra else {} + + if not args.debug: + env["NDEBUG"] = "1" + + dir, progname = os.path.split(prog) + + print(f"Compiling {progname}") + comp = subprocess.run( + ["make", "-C", dir, progname], + env=env, + capture_output=True, + ) + + if comp.returncode != 0: + print(f"Failed to compile {progname}") + if args.verbose: + print(comp.stdout.decode("utf-8")) + print(comp.stderr.decode("utf-8")) + return False + + return True + + +def run(prog, args, env_extra, line_callback): + if args.simulator == "verilator": + args.simulator = "verilate" + + env = os.environ + env["config"] = args.config + env["app"] = prog + env |= env_extra if env_extra else {} + + output = "" + timer = None + + print(f"Running {prog}") + + try: + # https://stackoverflow.com/a/76624958 + proc = subprocess.Popen( + ["make", "-C", HARDWARE_DIR, args.simulator], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + errors="replace", + preexec_fn=os.setsid + ) + + # Start a timer to kill the process if it takes too long + if args.timeout > 0: + timer = threading.Timer(args.timeout, kill_proc, [proc]) + timer.start() + + stdout_queue = queue.Queue() + stderr_queue = queue.Queue() + + stdout_thread = threading.Thread( + target=enqueue_output, args=(proc.stdout, stdout_queue) + ) + stderr_thread = threading.Thread( + target=enqueue_output, args=(proc.stderr, stderr_queue) + ) + + stdout_thread.daemon = True + stderr_thread.daemon = True + + stdout_thread.start() + stderr_thread.start() + + while True: + time.sleep(0.1) + + while not stdout_queue.empty() or not stderr_queue.empty(): + try: + stdout_line = stdout_queue.get_nowait() + except queue.Empty: + stdout_line = None + + try: + stderr_line = stderr_queue.get_nowait() + except queue.Empty: + stderr_line = None + + for line in [stdout_line, stderr_line]: + if line: + output += line + line_callback(line) + + if args.verbose: + print(line, end="") + + if ("Error 117") in line: + reason = ("Banshee called " + "the police, most likely a deadlock " + "(all threads called wfi)") + kill_proc(proc) + return (False, reason, output) + + if "Stackoverflow" in line: + kill_proc(proc) + return (False, "Stackoverflow", output) + + if (args.simulator == "banshee" and + "Program done" in line): + kill_proc(proc) + break + + if ( + proc.poll() is not None + and stdout_queue.empty() + and stderr_queue.empty() + ): + break + + if proc.returncode is not None and proc.returncode > 0: + return (False, "Non-zero return code", output) + elif timer is not None and not timer.is_alive(): + return (False, "Timeout", output) + else: + return (True, "Success", output) + + except Exception as e: + return (False, str(e), output) + + finally: + if proc.poll() is None: + kill_proc(proc) + if timer: + timer.cancel() + + +def get_arg_parser(): + parser 
= argparse.ArgumentParser() + parser.add_argument( + "-t", + "--timeout", + type=int, + default=180, + help="Timeout in seconds (set to 0 to disable)", + ) + parser.add_argument( + "-s", "--simulator", type=str, default="verilator", + help="Simulator to use" + ) + parser.add_argument( + "-c", + "--config", + type=str, + default="minpool-no-xpulp", + help="Mempool configuration", + ) + parser.add_argument( + "--compiler", + type=str, + choices=["gcc", "llvm"], + help="Compiler", + ) + parser.add_argument( + "--verbose", action="store_true", default=False, + help="Print verbose output" + ) + parser.add_argument( + "--debug", action="store_true", default=False, + help="Compile in debug mode" + ) + + return parser diff --git a/software/tests/omp/Makefile b/software/tests/omp/Makefile index 73a61326f..c9641293e 100644 --- a/software/tests/omp/Makefile +++ b/software/tests/omp/Makefile @@ -9,12 +9,19 @@ SOFTWARE_DIR := $(abspath $(ROOT_DIR)/../..) TESTS_DIR := $(ROOT_DIR) BIN_DIR := $(abspath $(SOFTWARE_DIR)/bin/$(subst $(SOFTWARE_DIR),,$(TESTS_DIR))) RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) +COMPILER ?= llvm # OpenMP runtime -OMP_DIR ?= $(RUNTIME_DIR)/omp RISCV_CCFLAGS += -fopenmp -DNTHREADS=$(num_cores) RISCV_CCFLAGS += -I$(OMP_DIR) +# Wrap main function +RISCV_LDFLAGS += -Wl,-wrap,main + +# Test repetitions +REPETITIONS ?= 10 +RISCV_CCFLAGS += -DREPETITIONS=$(REPETITIONS) + # This will overwrite the ROOT_DIR variable from the included makefile include $(RUNTIME_DIR)/runtime.mk diff --git a/software/tests/omp/atomic/main.c b/software/tests/omp/atomic/main.c index a1d63cf25..9e872c08e 100644 --- a/software/tests/omp/atomic/main.c +++ b/software/tests/omp/atomic/main.c @@ -5,303 +5,315 @@ #include #include +#include "../../runtime/testing.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif + #define MAX_FACTOR 10 #define KNOWN_PRODUCT 3628800 /* 10! 
*/ #define LOOPCOUNT 100 /* Number of iterations to slit amongst threads */ -int test_omp_atomic() { - int sum; - int diff; - int product; - int x; - int *logics; - int bit_and = 1; - int bit_or = 0; - int exclusiv_bit_or = 0; - int j; - int known_sum; - int known_diff; - int known_product; - int result = 0; - int logicsArray[LOOPCOUNT]; - logics = logicsArray; - - sum = 0; - diff = 0; - product = 1; - -// sum of integers test +int logics[LOOPCOUNT]; + +TEST(sum_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int sum = 0; + #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 1; i <= LOOPCOUNT; i++) { + for (i = 1; i <= LOOPCOUNT; i++) { #pragma omp atomic - sum += i; + sum += i; + } } + + int known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + ASSERT_EQ(known_sum, sum); } - known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; - if (known_sum != sum) { - printf("Error in sum with integers: Result was %d instead of %d.\n", sum, - known_sum); - result++; - } +} + +TEST(difference_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int diff = 0; -// difference of integers test #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; i++) { + for (i = 0; i < LOOPCOUNT; i++) { #pragma omp atomic - diff -= i; + diff -= i; + } } + + int known_diff = ((LOOPCOUNT - 1) * LOOPCOUNT) / 2 * -1; + printf("known_diff: %d, diff: %d\n", known_diff, diff); + ASSERT_EQ(known_diff, diff); } - known_diff = ((LOOPCOUNT - 1) * LOOPCOUNT) / 2 * -1; - if (diff != known_diff) { - printf("Error in difference with integers: Result was %d instead of 0.\n", - diff); - result++; - } +} + +TEST(product_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int product = 1; -// product of integers test #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 1; i <= MAX_FACTOR; i++) { + for (i = 1; i <= MAX_FACTOR; i++) { #pragma omp atomic - product *= i; + product *= i; + } } + + ASSERT_EQ(KNOWN_PRODUCT, product); } - known_product = KNOWN_PRODUCT; - if (known_product != product) { - printf("Error in product with integers: Result was %d instead of %d\n", - product, known_product); - result++; - } +} + +TEST(division_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int product = KNOWN_PRODUCT; - // division of integers test - product = KNOWN_PRODUCT; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 1; i <= MAX_FACTOR; ++i) { + for (i = 1; i <= MAX_FACTOR; ++i) { #pragma omp atomic - product /= i; + product /= i; + } } + + ASSERT_EQ(1, product); } - if (product != 1) { - printf("Error in product division with integers: Result was %d" - " instead of 1\n", - product); - result++; - } +} - // ++ test - x = 0; +TEST(atomic_increment) { + for (int count = 0; count < REPETITIONS; count++) { + int x = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - x++; + x++; + } } + + ASSERT_EQ(LOOPCOUNT, x); } - if (x != LOOPCOUNT) { - result++; - printf("Error in ++\n"); - } +} + +TEST(atomic_decrement) { + int x = 0; -// -- test + for (int count = 0; count < REPETITIONS; count++) { + + x = LOOPCOUNT; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - x--; + x--; + } } - } - if (x != 0) { - result++; - printf("Error in --\n"); - } - // bit-and test part 1 - for (j 
= 0; j < LOOPCOUNT; ++j) { - logics[j] = 1; + ASSERT_EQ(0, x); } - bit_and = 1; +} + +TEST(atomic_bit_and_1) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_and = 1; + + for (int j = 0; j < LOOPCOUNT; ++j) { + logics[j] = 1; + } + bit_and = 1; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_and &= logics[i]; + bit_and &= logics[i]; + } } + + ASSERT_EQ(1, bit_and); } - if (!bit_and) { - result++; - printf("Error in BIT AND part 1\n"); - } +} + +TEST(atomic_bit_and_2) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_and = 1; - // bit-and test part 2 - bit_and = 1; - logics[LOOPCOUNT / 2] = 0; + bit_and = 1; + logics[LOOPCOUNT / 2] = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_and &= logics[i]; + bit_and &= logics[i]; + } } - } - if (bit_and) { - result++; - printf("Error in BIT AND part 2\n"); - } - // bit-or test part 1 - for (j = 0; j < LOOPCOUNT; j++) { - logics[j] = 0; + ASSERT_EQ(0, bit_and); } - bit_or = 0; +} + +TEST(atomic_bit_or_1) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_or = 1; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + + bit_or = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_or |= logics[i]; + bit_or |= logics[i]; + } } + + ASSERT_EQ(0, bit_or); } - if (bit_or) { - result++; - printf("Error in BIT OR part 1\n"); - } +} + +TEST(atomic_bit_or_2) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_or = 1; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + + bit_or = 0; + logics[LOOPCOUNT / 2] = 1; - // bit-or test part 2 - bit_or = 0; - logics[LOOPCOUNT / 2] = 1; #pragma omp parallel - { + { - int i; + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_or |= logics[i]; + bit_or |= logics[i]; + } } + ASSERT_EQ(1, bit_or); } - if (!bit_or) { - result++; - printf("Error in BIT OR part 2\n"); - } +} + +TEST(atomix_bit_xor_1) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int exclusiv_bit_or = 0; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } - // bit-xor test part 1 - for (j = 0; j < LOOPCOUNT; j++) { - logics[j] = 0; - } - exclusiv_bit_or = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - exclusiv_bit_or ^= logics[i]; + exclusiv_bit_or ^= logics[i]; + } } + + ASSERT_EQ(0, exclusiv_bit_or); } - if (exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 1\n"); - } +} + +TEST(atomic_bit_xor_2) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int exclusiv_bit_or = 0; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + + exclusiv_bit_or = 0; + logics[LOOPCOUNT / 2] = 1; - // bit-xor test part 2 - exclusiv_bit_or = 0; - logics[LOOPCOUNT / 2] = 1; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - exclusiv_bit_or 
^= logics[i]; + exclusiv_bit_or ^= logics[i]; + } } + + ASSERT_EQ(1, exclusiv_bit_or); } - if (!exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 2\n"); - } +} +// +TEST(atomic_left_shift) { + for (int count = 0; count < REPETITIONS; count++) { + int x = 1; - // left shift test - x = 1; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < 10; ++i) { + for (i = 0; i < 10; ++i) { #pragma omp atomic - x <<= 1; + x <<= 1; + } } + + ASSERT_EQ(1024, x); } - if (x != 1024) { - result++; - printf("Error in <<\n"); - x = 1024; - } +} + +TEST(atomic_right_shift) { + for (int count = 0; count < REPETITIONS; count++) { + int x = 1024; -// right shift test #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < 10; ++i) { + for (i = 0; i < 10; ++i) { #pragma omp atomic - x >>= 1; + x >>= 1; + } } - } - if (x != 1) { - result++; - printf("Error in >>\n"); - } - return (result); + ASSERT_EQ(1, x); + } } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - int i; - int num_failed = 0; - - mempool_wait(4 * num_cores); - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - num_failed = test_omp_atomic(); - printf("num_failed: %d\n", num_failed); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - return num_failed; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/barrier_test1/main.c b/software/tests/omp/barrier_test1/main.c index 7c2f1542f..49cdf4934 100644 --- a/software/tests/omp/barrier_test1/main.c +++ b/software/tests/omp/barrier_test1/main.c @@ -6,66 +6,45 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" + +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -#define REPETITIONS 10 /* Number of times to run each test */ #define SLEEPTIME 1000 -uint32_t test_omp_barrier() { +TEST(test_omp_barrier) { uint32_t result1; uint32_t result2; result1 = 0; result2 = 0; + for (int i = 0; i < REPETITIONS; i++) { #pragma omp parallel - { - uint32_t rank; - rank = omp_get_thread_num(); - if (rank == 1) { - printf("waiting...\n"); - mempool_wait(((double)SLEEPTIME) / - REPETITIONS); // give 1 sec to whole test - printf("waited.\n"); - result2 = 3; - } + { + uint32_t rank; + rank = omp_get_thread_num(); + if (rank == 1) { + mempool_wait(((uint32_t)(double)SLEEPTIME / + REPETITIONS)); // give 1 sec to whole test + result2 = 3; + } #pragma omp barrier - if (rank == 2) { - printf("result2: %d\n", result2); - result1 = result2; - printf("result1: %d\n", result1); + if (rank == 2) { + result1 = result2; + } } + ASSERT_EQ(result1, 3); } - return (result1 == 3); } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i; - uint32_t num_failed = 0; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - if (!test_omp_barrier()) { - num_failed++; - } - printf("test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - printf("num_failed: %d\n", num_failed); - mempool_wait(4 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git 
a/software/tests/omp/barrier_test2/main.c b/software/tests/omp/barrier_test2/main.c deleted file mode 100644 index ac1f3b7e7..000000000 --- a/software/tests/omp/barrier_test2/main.c +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include "encoding.h" -#include "libgomp.h" -#include "printf.h" -#include "runtime.h" -#include "synchronization.h" - -#define REPETITIONS 10 /* Number of times to run each test */ -#define SLEEPTIME 1000 - -uint32_t test_omp_barrier(uint32_t num_cores) { - uint32_t result1; - uint32_t result2; - result1 = 0; - result2 = 0; - -#pragma omp parallel - { - uint32_t rank; - rank = omp_get_thread_num(); - if (rank == 1) { - printf("waiting...\n"); - mempool_wait(((double)SLEEPTIME) / - REPETITIONS); // give 1 sec to whole test - printf("waited.\n"); - result2 = 3; - } - mempool_barrier(num_cores); - - if (rank == 2) { - printf("result2: %d\n", result2); - result1 = result2; - printf("result1: %d\n", result1); - } - } - return (result1 == 3); -} - -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i; - uint32_t num_failed = 0; - - mempool_barrier_init(core_id); - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - if (!test_omp_barrier(num_cores)) { - num_failed++; - } - printf("test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - printf("num_failed: %d\n", num_failed); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; -} diff --git a/software/tests/omp/critical/main.c b/software/tests/omp/critical/main.c index 4ffba2c7a..ef8b38da7 100644 --- a/software/tests/omp/critical/main.c +++ b/software/tests/omp/critical/main.c @@ -6,66 +6,45 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -int test_omp_critical() { - int sum; - int known_sum, mysum; +TEST(test_omp_critical) { int num_cores = (int)mempool_get_core_count(); - sum = 0; -#pragma omp parallel - { - mysum = 0; - int i; + for (int r = 0; r < REPETITIONS; r++) { + int sum1 = 0; + int sum2 = 0; -#pragma omp single +#pragma omp parallel { - for (i = 0; i < 100; i++) - mysum = mysum + i; - printf("Single\n"); - } +#pragma omp critical + { + sum1 += 1; + sum2 += 2; + } #pragma omp critical - { - sum = mysum + sum; - // printf("Sum: %d, thread_id: %d\n",sum,omp_get_thread_num()); + { + sum1 += 1; + sum2 += 2; + } } + + ASSERT_EQ(sum1, 2 * num_cores); + ASSERT_EQ(sum2, 2 * sum1); + ASSERT_EQ(sum2, 4 * num_cores); } - known_sum = 99 * 100 / 2 * num_cores; - return (known_sum == sum); } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i; - uint32_t num_failed = 0; - - mempool_wait(2 * num_cores); - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - if (!test_omp_critical()) { - num_failed++; - } - printf("num_failed: %d\n", num_failed); - } - printf("Master Thread end\n\n\n"); - printf("num_failed: %d\n", num_failed); - } else { - while (1) { - 
mempool_wfi(); - run_task(core_id); - } - } - + RUN_ALL_TESTS(); return 0; } diff --git a/software/tests/omp/master/main.c b/software/tests/omp/master/main.c index ac763ac8e..8f15bc31b 100644 --- a/software/tests/omp/master/main.c +++ b/software/tests/omp/master/main.c @@ -6,54 +6,47 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -int test_omp_master() { - uint32_t nthreads; - int32_t executing_thread; - // counts up the number of wrong thread no. for the master thread. (Must be 0) - uint32_t tid = 0; - nthreads = 0; - executing_thread = -1; +TEST(test_omp_master) { + for (int i = 0; i < REPETITIONS; i++) { + + uint32_t nthreads; + int32_t executing_thread; + // counts up the number of wrong thread no. for the master thread. (Must be + // 0) + uint32_t tid = 0; + nthreads = 0; + executing_thread = -1; #pragma omp parallel - { -#pragma omp master { - printf("Master Thread executes\n\n\n"); - tid = omp_get_thread_num(); - nthreads++; - executing_thread = (int32_t)omp_get_thread_num(); - } /* end of master*/ - } /* end of parallel*/ - return ((nthreads == 1) && (executing_thread == 0) && (tid == 0)); +#pragma omp master + { + printf("Master Thread executes\n\n\n"); + tid = omp_get_thread_num(); + nthreads++; + executing_thread = (int32_t)omp_get_thread_num(); + } /* end of master*/ + } /* end of parallel*/ + + ASSERT_EQ(1, nthreads); + ASSERT_EQ(0, executing_thread); + ASSERT_EQ(0, tid); + } } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t i; - uint32_t num_failed = 0; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - if (!test_omp_master()) { - num_failed++; - } - } - printf("Master Thread end\n\n\n"); - printf("num_failed:%d\n", num_failed); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_parallel/main.c b/software/tests/omp/omp_parallel/main.c index d88a138ee..cb5ec10dd 100644 --- a/software/tests/omp/omp_parallel/main.c +++ b/software/tests/omp/omp_parallel/main.c @@ -6,37 +6,50 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -extern volatile uint32_t tcdm_start_address_reg; -extern volatile uint32_t tcdm_end_address_reg; +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -int main() { - uint32_t core_id = mempool_get_core_id(); - - mempool_barrier_init(core_id); - - if (core_id == 0) { +TEST(test_omp_parallel_8) { + for (int i = 0; i < REPETITIONS; i++) { printf("Master Thread: Parallel start\n"); - mempool_wait(1000); + uint32_t nthreads = 0; + #pragma omp parallel num_threads(8) - { printf("%d\n", omp_get_num_threads()); } + { + nthreads = omp_get_num_threads(); + printf("%d\n", omp_get_num_threads()); + } printf("Master Thread: Parallel end\n\n\n"); + ASSERT_EQ(8, nthreads); + } +} + +TEST(test_omp_parallel) { + for (int i = 0; i < REPETITIONS; i++) { + printf("Master Thread: Parallel start\n"); + uint32_t nthreads = 0; printf("Master Thread: Parallel start\n"); - mempool_wait(1000); #pragma omp parallel - { printf("%d\n", omp_get_num_threads()); } - printf("Master Thread: 
Parallel end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); + { + nthreads = omp_get_num_threads(); + printf("%d\n", omp_get_num_threads()); } + printf("Master Thread: Parallel end\n\n\n"); + ASSERT_EQ(NUM_CORES, nthreads); } +} + +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_parallel_for/main.c b/software/tests/omp/omp_parallel_for/main.c index 56d400a72..cfc526b96 100644 --- a/software/tests/omp/omp_parallel_for/main.c +++ b/software/tests/omp/omp_parallel_for/main.c @@ -6,169 +6,166 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define TEST_THREAD +int buf_1[64]; +uint32_t buf_2[NUM_CORES]; -void gcc_omp_parallel_for_schedule_static(void) { - int buf[64], *p; +TEST(omp_parallel_for_schedule_static) { uint32_t i; - int result = 0; - memset(buf, '\0', sizeof(buf)); + int *p; + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for - for (i = 0; i < omp_get_num_threads(); i++) { - if (omp_get_thread_num() != i) { - printf("Error: for loop is not executed in parallel\n"); - result += 1; - } + for (int i = 0; i < (int)omp_get_num_threads(); i++) { + buf_1[i] = i; } + for (int i = 0; i < NUM_CORES; i++) { + ASSERT_EQ(buf_1[i], i); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) private(p) - for (p = &buf[10]; p < &buf[54]; p++) + for (p = &buf_1[10]; p < &buf_1[54]; p++) { *p = 5; - for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 1 at gcc schedule static\n"); - result += 1; - } + } - memset(buf, '\0', sizeof(buf)); + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 5 * (i >= 10 && i < 54)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[3]; p <= &buf[63]; p += 2) + for (p = &buf_1[3]; p <= &buf_1[63]; p += 2) { p[-2] = 6; - for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 2 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 6 * ((i & 1) && i <= 61)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[16]; p < &buf[51]; p = 4 + p) + for (p = &buf_1[16]; p < &buf_1[51]; p = 4 + p) { p[2] = 7; - for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 3 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[16]; p <= &buf[40]; p = p + 4ULL) + for (p = &buf_1[16]; p <= &buf_1[40]; p = p + 4U) { p[2] = -7; - for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 4 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[53]; p > &buf[9]; --p) + for (p = &buf_1[53]; p > &buf_1[9]; --p) { *p = 5; - for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 5 at gcc schedule static\n"); - result += 1; - } - memset(buf, 
'\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 5 * (i >= 10 && i < 54)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[63]; p >= &buf[3]; p -= 2) + for (p = &buf_1[63]; p >= &buf_1[3]; p -= 2) { p[-2] = 6; - for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 6 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 6 * ((i & 1) && i <= 61)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[48]; p > &buf[15]; p = -4 + p) + for (p = &buf_1[48]; p > &buf_1[15]; p = -4 + p) { p[2] = 7; - for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 7 at at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[40]; p >= &buf[16]; p = p - 4ULL) + for (p = &buf_1[40]; p >= &buf_1[16]; p = p - 4U) { p[2] = -7; - for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 8 at gcc schedule static\n"); - result += 1; - } + } - if (result == 0) { - printf("All test passed\n"); - } else { - printf("Failed %d tests\n", result); + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); } } -void gcc_omp_parallel_for_schedule_static_thread(void) { - printf("Testing: schedule default chunk size\n"); +TEST(parallel_for_schedule_static_thread) { + + memset(buf_2, '\0', sizeof(buf_2)); #pragma omp parallel for num_threads(4) schedule(static) for (int i = 0; i < 10; i++) { - printf("%d\n", omp_get_thread_num()); + buf_2[i] = omp_get_thread_num(); + } + + uint32_t chunkSize = (10 + 4 - 1) / 4; // ceil(10/4) + for (uint32_t i = 0; i < 10; i++) { + ASSERT_EQ(buf_2[i], (i / chunkSize) % 4); } - printf("Testing: schedule chunk size 2\n"); + memset(buf_2, '\0', sizeof(buf_2)); #pragma omp parallel for num_threads(4) schedule(static, 2) for (int i = 0; i < 10; i++) { - printf("%d\n", omp_get_thread_num()); + buf_2[i] = omp_get_thread_num(); } - printf("Testing: private\n"); - int A = 9; + for (uint32_t i = 0; i < 10; i++) { + ASSERT_EQ(buf_2[i], (i / 2) % 4); + } + + uint32_t A = 9; #pragma omp parallel for num_threads(4) schedule(static) private(A) - for (int i = 0; i < 4; i++) { + for (uint32_t i = 0; i < 4; i++) { A = i; - printf("%d\n", A); } - printf("A %d\n", A); - printf("Testing: First private\n"); + ASSERT_EQ(A, 9); + A = 9; + memset(buf_2, '\0', sizeof(buf_2)); #pragma omp parallel for num_threads(4) schedule(static) firstprivate(A) - for (int i = 0; i < 4; i++) { - printf("%d\n", A); + for (uint32_t i = 0; i < 4; i++) { + buf_2[i] = A; A = i; } - printf("A %d\n", A); - printf("Testing: Last private\n"); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(buf_2[i], 9); + } + ASSERT_EQ(A, 9); + A = 9; #pragma omp parallel for num_threads(4) schedule(static) lastprivate(A) - for (int i = 0; i < 4; i++) { + for (uint32_t i = 0; i < 4; i++) { A = i; } - printf("A %d\n", A); + + ASSERT_EQ(A, 3); } int main() { - uint32_t core_id = mempool_get_core_id(); - - mempool_barrier_init(core_id); - - if (core_id == 0) { - -/////////////////////////////////////////////////////////// -////////////////////// test 
/////////////////////////// -/////////////////////////////////////////////////////////// -#ifdef TEST_THREAD - gcc_omp_parallel_for_schedule_static_thread(); -#else - gcc_omp_parallel_for_schedule_static(); -#endif - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_parallel_for_dynamic/main.c b/software/tests/omp/omp_parallel_for_dynamic/main.c index 8011cdcb2..d56a743d1 100644 --- a/software/tests/omp/omp_parallel_for_dynamic/main.c +++ b/software/tests/omp/omp_parallel_for_dynamic/main.c @@ -6,159 +6,96 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -void work(int num) { - int i; - volatile int cnt = 0; +int buf[64]; - for (i = 0; i < num; i++) { - cnt += i; - } -} - -void gcc_omp_parallel_for_schedule_dynamic(void) { - int buf[64]; +TEST(gcc_omp_parallel_for_schedule_dynamic) { int i, j; - int result = 0; + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 10; j < 54; j++) buf[j] = 5; for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 1 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 5 * (i >= 10 && i < 54)); + + DEBUG_PRINT("First\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 3; j <= 63; j += 2) buf[j - 2] = 6; for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 2 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 6 * ((i & 1) && i <= 61)); + + DEBUG_PRINT("Second\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 16; j < 51; j += 4) buf[j + 2] = 7; for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 3 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + + DEBUG_PRINT("Third\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 16; j <= 40; j += 4) buf[j + 2] = -7; for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 4 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); + + DEBUG_PRINT("Fourth\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) - for (j = 53; j > 9; --j) + for (j = 53; j > 9; --j) { + DEBUG_PRINT("%d\n", j); buf[j] = 5; + } + for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 5 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 5 * (i >= 10 && i < 54)); + + DEBUG_PRINT("Fifth\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 63; j >= 3; j -= 2) buf[j - 2] = 6; for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 6 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 6 * ((i & 1) && i <= 61)); + + DEBUG_PRINT("Sixth\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 48; j > 15; j -= 4) buf[j + 2] = 7; for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 7 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + + DEBUG_PRINT("Seventh\n"); + memset(buf, 
'\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 40; j >= 16; j -= 4) buf[j + 2] = -7; for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 8 at gcc schedule dynamic\n"); - result += 1; - } - if (result == 0) { - printf("All tests passed\n"); - } else { - printf("Failed %d tests", result); - } -} + ASSERT_EQ(buf[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); -// void gcc_omp_parallel_for_schedule_dynamic_thread_test(void){ -// printf("Testing: schedule chunk size 1\n"); -// #pragma omp parallel for num_threads(4) schedule(dynamic) -// for (int k = 0; k < 10; k++){ -// printf("%d\n", omp_get_thread_num()); -// } - -// printf("Testing: schedule chunk size 2\n"); -// #pragma omp parallel for num_threads(4) schedule(dynamic,3) -// for (int k = 0; k < 10; k++){ -// printf("%d\n", omp_get_thread_num()); -// } -// } + DEBUG_PRINT("Eighth\n"); +} int main() { - uint32_t core_id = mempool_get_core_id(); - - mempool_barrier_init(core_id); - - if (core_id == 0) { - - mempool_wait(1000); - - /////////////////////////////////////////////////////////// - ////////////////////// test /////////////////////////// - /////////////////////////////////////////////////////////// - - gcc_omp_parallel_for_schedule_dynamic(); - - /////////////////////////////////////////////////////////// - ///////////////////// Benchmark /////////////////////// - /////////////////////////////////////////////////////////// - // uint32_t time; - // mempool_start_benchmark(); - // #pragma omp parallel for num_threads(2) schedule(dynamic,2) - // for(int i = 0; i < 6400; i++){ - // work(10); - // } - // mempool_stop_benchmark(); - // time = mempool_get_timer(); - // printf("Parallel Time %d\n",time); - - // mempool_start_benchmark(); - // for(int i = 0; i < 6400; i++){ - // work(10); - // } - // mempool_stop_benchmark(); - // time = mempool_get_timer(); - // printf("Sequential Time %d\n",time); - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_test/main.c b/software/tests/omp/omp_test/main.c index 6cde22a88..c5f65bc89 100644 --- a/software/tests/omp/omp_test/main.c +++ b/software/tests/omp/omp_test/main.c @@ -6,120 +6,109 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 1 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -void work1() { - int sum = 0; - for (int i = 0; i < 100; i++) { - sum++; - } -} - -int test_omp_parallel_for() { - int sum = 0; +TEST(test_omp_parallel_for) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; + for (int i = 0; i <= 100; i++) { + sum += i; + } } + + ASSERT_EQ(sum, 5050); } - return sum; } -int test_omp_parallel_for_dynamic() { - int sum = 0; +TEST(test_omp_parallel_for_dynamic) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for schedule(dynamic, 16) reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; + for (int i = 0; i <= 100; i++) { + sum += i; + } } + + ASSERT_EQ(sum, 5050); } - return sum; } -int test_omp_parallel_for_dynamic_static() { - int sum = 0; 
+TEST(test_omp_parallel_for_dynamic_static) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for schedule(dynamic, 16) reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; - } + for (int i = 0; i <= 100; i++) { + sum += i; + } + +#pragma omp single + sum = 0; - sum = 0; #pragma omp for schedule(static) reduction(+ : sum) - for (int i = -100; i <= 0; i++) { - sum += i; + for (int i = -100; i <= 0; i++) { + sum += i; + } } + + printf("sum: %d\n", sum); + ASSERT_EQ(sum, -5050); } - return sum; } -int test_omp_many() { - int sum = 0; +TEST(test_omp_many) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; + int master_sum, single_sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for schedule(dynamic, 16) reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; - } + for (int i = 0; i <= 100; i++) { + sum += i; + } #pragma omp barrier #pragma omp master - { - printf("first sum: %d\n", sum); - sum = 0; - } + { master_sum = sum; } #pragma omp barrier #pragma omp for schedule(static) reduction(+ : sum) - for (int i = -10; i <= 0; i++) { - sum += i; - } + for (int i = -10; i <= 0; i++) { + sum += i; + } #pragma omp barrier #pragma omp single - { printf("second sum: %d\n", sum); } + { single_sum = sum; } + } + + ASSERT_EQ(master_sum, 5050); + ASSERT_EQ(single_sum, 4995); } - return sum; } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t i; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("Test: %d\n", i); - printf("For loop-sum is: %d\n", test_omp_parallel_for()); - printf("For loop dynamic-sum is: %d\n", test_omp_parallel_for_dynamic()); - printf("For loop dynamic-static-sum is: %d\n", - test_omp_parallel_for_dynamic_static()); - printf("Test many omp-sum is: %d\n", test_omp_many()); - printf("Test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/reduction/main.c b/software/tests/omp/reduction/main.c index 15afb6a49..196e1bdaf 100644 --- a/software/tests/omp/reduction/main.c +++ b/software/tests/omp/reduction/main.c @@ -7,228 +7,238 @@ #include "baremetal/mempool_conv2d_i32p.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif #define MAX_FACTOR 10 #define KNOWN_PRODUCT 3628800 /* 10! 
*/ -#define LOOPCOUNT 100 /* Number of iterations to slit amongst threads */ - -int test_omp_parallel_for_reduction() { - int sum; - int known_sum; - int diff; - int product; - int known_product; - int logic_and; - int logic_or; - int bit_and; - int bit_or; - int exclusiv_bit_or; - int logics[LOOPCOUNT]; - int i; - int result; - - sum = 0; - result = 0; - product = 1; - logic_and = 1; - logic_or = 0; - bit_and = 1; - bit_or = 0; - exclusiv_bit_or = 0; - - /* Tests for integers */ - known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; -#pragma omp parallel for schedule(static, 1) private(i) reduction(+ : sum) - for (i = 1; i <= LOOPCOUNT; i++) { - sum = sum + i; - } - if (known_sum != sum) { - result++; - printf("Error in sum with integers: Result was %d" - " instead of %d\n", - sum, known_sum); - } +#define LOOPCOUNT 100 /* Number of iterations to split amongst threads */ - diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; -#pragma omp parallel for schedule(static, 1) private(i) reduction(- : diff) - for (i = 1; i <= LOOPCOUNT; ++i) { - diff = diff - i; - } - if (diff != 0) { - result++; - printf("Error in difference with integers: Result was %d" - " instead of 0.\n", - diff); - } +uint32_t logics[LOOPCOUNT]; -/* Tests for integers */ -#pragma omp parallel for schedule(static, 1) private(i) reduction(* : product) - for (i = 1; i <= MAX_FACTOR; i++) { - product *= i; - } - known_product = KNOWN_PRODUCT; - if (known_product != product) { - result++; - printf("Error in Product with integers: Result was %d" - " instead of %d\n\n", - product, known_product); - } +TEST(test_omp_parallel_for_sum) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int sum = 0; + int known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; - /* Tests for logic AND */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 1; - } +#pragma omp parallel for schedule(static, 1) reduction(+ : sum) + for (int i = 1; i <= LOOPCOUNT; i++) { + sum += i; + } -#pragma omp parallel for private(i) schedule(static,1) reduction(&&:logic_and) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_and = (logic_and && logics[i]); - } - if (!logic_and) { - result++; - printf("Error in logic AND part 1.\n"); + ASSERT_EQ(sum, known_sum); } +} - logic_and = 1; - logics[LOOPCOUNT / 2] = 0; +TEST(test_omp_parallel_for_diff) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; -#pragma omp parallel for schedule(static,1) private(i) reduction(&&:logic_and) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_and = logic_and && logics[i]; - } - if (logic_and) { - result++; - printf("Error in logic AND part 2.\n"); - } +#pragma omp parallel for schedule(static, 1) reduction(- : diff) + for (int i = 1; i <= LOOPCOUNT; ++i) { + diff -= i; + } - /* Tests for logic OR */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 0; + ASSERT_EQ(diff, 0); } +} -#pragma omp parallel for schedule(static, 1) private(i) reduction(|| : logic_or) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_or = logic_or || logics[i]; - } - if (logic_or) { - result++; - printf("Error in logic OR part 1.\n"); - } - logic_or = 0; - logics[LOOPCOUNT / 2] = 1; +TEST(test_omp_parallel_for_product) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int product = 1; + int known_product = KNOWN_PRODUCT; -#pragma omp parallel for schedule(static, 1) private(i) reduction(|| : logic_or) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_or = logic_or || logics[i]; - } - if (!logic_or) { - result++; - printf("Error in logic OR part 2.\n"); - } +#pragma omp parallel for schedule(static, 1) reduction(* : 
product) + for (int i = 1; i <= MAX_FACTOR; i++) { + product *= i; + } - /* Tests for bitwise AND */ - for (i = 0; i < LOOPCOUNT; ++i) { - logics[i] = 1; + ASSERT_EQ(product, known_product); } +} -#pragma omp parallel for schedule(static, 1) private(i) reduction(& : bit_and) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_and = (bit_and & logics[i]); - } - if (!bit_and) { - result++; - printf("Error in BIT AND part 1.\n"); - } +TEST(test_omp_parallel_for_logic_and_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_and = 1; + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 1; + } - bit_and = 1; - logics[LOOPCOUNT / 2] = 0; +#pragma omp parallel for schedule(static, 1) reduction(&& : logic_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_and = logic_and && logics[i]; + } -#pragma omp parallel for schedule(static, 1) private(i) reduction(& : bit_and) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_and = bit_and & logics[i]; - } - if (bit_and) { - result++; - printf("Error in BIT AND part 2.\n"); + ASSERT_EQ(logic_and, 1); } +} - /* Tests for bitwise OR */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 0; - } +TEST(test_omp_parallel_for_logic_and_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_and = 1; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 1; + } + logics[LOOPCOUNT / 2] = 0; -#pragma omp parallel for schedule(static, 1) private(i) reduction(| : bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_or = bit_or | logics[i]; - } - if (bit_or) { - result++; - printf("Error in BIT OR part 1\n"); +#pragma omp parallel for schedule(static, 1) reduction(&& : logic_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_and = logic_and && logics[i]; + } + + ASSERT_EQ(logic_and, 0); } - bit_or = 0; - logics[LOOPCOUNT / 2] = 1; +} -#pragma omp parallel for schedule(static, 1) private(i) reduction(| : bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_or = bit_or | logics[i]; +TEST(test_omp_parallel_for_logic_or_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + +#pragma omp parallel for schedule(static, 1) reduction(|| : logic_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_or = logic_or || logics[i]; + } + + ASSERT_EQ(logic_or, 0); } - if (!bit_or) { - result++; - printf("Error in BIT OR part 2\n"); +} + +TEST(test_omp_parallel_for_logic_or_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + logics[LOOPCOUNT / 2] = 1; + +#pragma omp parallel for schedule(static, 1) reduction(|| : logic_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_or = logic_or || logics[i]; + } + + ASSERT_EQ(logic_or, 1); } +} + +TEST(test_omp_parallel_for_bit_and_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_and = 1; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; ++i) { + logics[i] = 1; + } + +#pragma omp parallel for schedule(static, 1) reduction(& : bit_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_and = bit_and & logics[i]; + } - /* Tests for bitwise XOR */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 0; + ASSERT_EQ(bit_and, 1); } +} + +TEST(test_omp_parallel_for_bit_and_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_and = 1; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; ++i) { + logics[i] = 1; + } + 
logics[LOOPCOUNT / 2] = 0; + +#pragma omp parallel for schedule(static, 1) reduction(& : bit_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_and = bit_and & logics[i]; + } -#pragma omp parallel for schedule(static,1) private(i) reduction(^:exclusiv_bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + ASSERT_EQ(bit_and, 0); } - if (exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 1\n"); +} + +TEST(test_omp_parallel_for_bit_or_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + +#pragma omp parallel for schedule(static, 1) reduction(| : bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_or = bit_or | logics[i]; + } + + ASSERT_EQ(bit_or, 0); } +} - exclusiv_bit_or = 0; - logics[LOOPCOUNT / 2] = 1; +TEST(test_omp_parallel_for_bit_or_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + logics[LOOPCOUNT / 2] = 1; -#pragma omp parallel for schedule(static,1) private(i) reduction(^:exclusiv_bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; +#pragma omp parallel for schedule(static, 1) reduction(| : bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_or = bit_or | logics[i]; + } + + ASSERT_EQ(bit_or, 1); } - if (!exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 2\n"); +} + +TEST(test_omp_parallel_for_exclusiv_bit_or_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t exclusiv_bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + +#pragma omp parallel for schedule(static, 1) reduction(^ : exclusiv_bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + + ASSERT_EQ(exclusiv_bit_or, 0); } - return (result); } -int main() { - uint32_t core_id = mempool_get_core_id(); - int i; - int num_failed = 0; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - num_failed = test_omp_parallel_for_reduction(); - printf("num_failed: %d\n", num_failed); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); +TEST(test_omp_parallel_for_exclusiv_bit_or_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t exclusiv_bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; } + logics[LOOPCOUNT / 2] = 1; + +#pragma omp parallel for schedule(static, 1) reduction(^ : exclusiv_bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + + ASSERT_EQ(exclusiv_bit_or, 1); } +} - return num_failed; +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/sections/main.c b/software/tests/omp/sections/main.c new file mode 100644 index 000000000..bf2285402 --- /dev/null +++ b/software/tests/omp/sections/main.c @@ -0,0 +1,39 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
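One detail worth noting in the reduction tests above (and in the atomic tests earlier in this patch): `logics` is an array of 4-byte elements, so `memset(logics, 0, LOOPCOUNT)` clears only LOOPCOUNT bytes, i.e. the first quarter of the array. The tests still behave correctly because every element is re-initialized by the explicit for loops that follow, but if the memset is meant to clear the whole array, `sizeof` expresses that directly. A minimal sketch of the intended form:

#include <stdint.h>
#include <string.h>

#define LOOPCOUNT 100
static uint32_t logics[LOOPCOUNT];

static void clear_logics(void) {
  // memset(logics, 0, LOOPCOUNT) would zero only 100 bytes (25 elements);
  // sizeof(logics) covers all LOOPCOUNT * sizeof(uint32_t) bytes.
  memset(logics, 0, sizeof(logics));
}
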
+// SPDX-License-Identifier: Apache-2.0 + +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "testing.h" + +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif + +TEST(test_omp_parallel_sections) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t result = 0; + uint32_t section_1 = 0; + uint32_t section_2 = 0; + +#pragma omp parallel sections + { + +#pragma omp section + { section_1 = omp_get_thread_num(); } + +#pragma omp section + { section_2 = omp_get_thread_num(); } + } + + ASSERT_NEQ(section_1, section_2); + } +} + +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); +} diff --git a/software/tests/omp/single/main.c b/software/tests/omp/single/main.c index 9ab31b845..e97461d87 100644 --- a/software/tests/omp/single/main.c +++ b/software/tests/omp/single/main.c @@ -2,120 +2,75 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -#include -#include - #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ - -void work1() { - int sum = 0; - for (int i = 0; i < 100; i++) { - sum++; - } -} +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -uint32_t test_omp_parallel_single() { - uint32_t result; - result = 0; - uint32_t core_id; +TEST(test_omp_parallel_single) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t result = 0; #pragma omp parallel shared(result) - { - core_id = mempool_get_core_id(); - - work1(); - if (core_id == 0) { - work1(); - } + { #pragma omp single - { result = 100; } - - work1(); - if (core_id == 0) { - work1(); + { result += 100; } } -#pragma omp single - { - if (result == 100) - result = core_id; - } + ASSERT_EQ(result, 100); } - return result; } -uint32_t test_omp_for_single() { - uint32_t sum = 0; +TEST(test_omp_for_single) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t sum = 0; #pragma omp parallel shared(sum) - { -#pragma omp single { - for (uint32_t i = 0; i <= 100; i++) { - sum += i; +#pragma omp single + { + for (uint32_t i = 0; i <= 100; i++) { + sum += i; + } } - } #pragma omp single - { - if (sum == 100 * 101 / 2) - sum = 1; + { + if (sum == 100 * 101 / 2) + sum = 1; + } } + + ASSERT_EQ(sum, 1); } - return sum; } -uint32_t test_omp_single_copyprivate() { - uint32_t result; - result = 0; - -#pragma omp parallel firstprivate(result) - { - uint32_t core_id = mempool_get_core_id(); +TEST(test_omp_single_copyprivate) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t result = 0; + uint32_t outerResult = 0; - work1(); - if (core_id == 0) { - work1(); - } +#pragma omp parallel private(result) + { #pragma omp single copyprivate(result) - { result = 100; } + { result = 100; } - work1(); - if (core_id == 5) { - result *= 2; - printf("Core 5 result: %d\n", result); +#pragma omp single + { outerResult = result; } } + + ASSERT_EQ(outerResult, 100); } - return result; } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t i; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("Test: %d\n", i); - printf("Single core_id: %d\n", test_omp_parallel_single()); - printf("For loop-sum is t/f: %d\n", test_omp_for_single()); - printf("Copyprivate: %d\n", test_omp_single_copyprivate()); - 
printf("Test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/teams/main.c b/software/tests/omp/teams/main.c new file mode 100644 index 000000000..6d82c557d --- /dev/null +++ b/software/tests/omp/teams/main.c @@ -0,0 +1,91 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "testing.h" + +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif + +TEST(test_teams_distribute) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int num_teams = 0; + int team_num[12]; + +#pragma omp teams distribute num_teams(4) + for (int i = 0; i < 12; i++) { + team_num[i] = omp_get_team_num(); + + if (omp_get_team_num() == 0) { + num_teams = omp_get_num_teams(); + } + } + + for (int i = 0; i < 12; i++) { + ASSERT_EQ(team_num[i], i / 3); + } + + ASSERT_EQ(num_teams, 4); + } +} + +TEST(test_teams_reduce) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int sum[4]; + +#pragma omp teams num_teams(4) + { + int local_sum = 0; + +#pragma omp parallel for reduction(+ : local_sum) + for (int i = 0; i < 32; i++) { + local_sum += 1; + } + + sum[omp_get_team_num()] = local_sum; + } + + for (int i = 0; i < 4; i++) { + ASSERT_EQ(sum[i], 32); + } + } +} + +TEST(test_teams_barrier) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int results[2]; +#pragma omp teams num_teams(2) + { + uint32_t team_num = omp_get_team_num(); + int result = 0; + +#pragma omp parallel + { + uint32_t rank = omp_get_thread_num(); + if (rank == 1) { + mempool_wait(1000); // give 1 sec to whole test + result = 3; + } +#pragma omp barrier + + if (rank == 2) { + results[team_num] = result; + } + } + } + + ASSERT_EQ(results[0], 3); + ASSERT_EQ(results[1], 3); + } +} + +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); +}