diff --git a/.clang-tidy b/.clang-tidy new file mode 100644 index 000000000..5c75f7645 --- /dev/null +++ b/.clang-tidy @@ -0,0 +1,46 @@ +--- +Checks: > + bugprone-*, + cppcoreguidelines-*, + performance-*, + readability-*, + clang-analyzer-*, + misc-unused-parameters, + + -bugprone-easily-swappable-parameters, + -bugprone-reserved-identifier, + -cppcoreguidelines-pro-type-vararg, + -cppcoreguidelines-avoid-non-const-global-variables, + +WarningsAsErrors: '' +HeaderFileExtensions: + - '' + - h + - hh + - hpp + - hxx +ImplementationFileExtensions: + - c + - cc + - cpp + - cxx +HeaderFilterRegex: '' +AnalyzeTemporaryDtors: false +FormatStyle: file +CheckOptions: + cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU' + google-readability-namespace-comments.ShortNamespaceLines: '10' + cert-err33-c.CheckedFunctions: '::aligned_alloc;::asctime_s;::at_quick_exit;::atexit;::bsearch;::bsearch_s;::btowc;::c16rtomb;::c32rtomb;::calloc;::clock;::cnd_broadcast;::cnd_init;::cnd_signal;::cnd_timedwait;::cnd_wait;::ctime_s;::fclose;::fflush;::fgetc;::fgetpos;::fgets;::fgetwc;::fopen;::fopen_s;::fprintf;::fprintf_s;::fputc;::fputs;::fputwc;::fputws;::fread;::freopen;::freopen_s;::fscanf;::fscanf_s;::fseek;::fsetpos;::ftell;::fwprintf;::fwprintf_s;::fwrite;::fwscanf;::fwscanf_s;::getc;::getchar;::getenv;::getenv_s;::gets_s;::getwc;::getwchar;::gmtime;::gmtime_s;::localtime;::localtime_s;::malloc;::mbrtoc16;::mbrtoc32;::mbsrtowcs;::mbsrtowcs_s;::mbstowcs;::mbstowcs_s;::memchr;::mktime;::mtx_init;::mtx_lock;::mtx_timedlock;::mtx_trylock;::mtx_unlock;::printf_s;::putc;::putwc;::raise;::realloc;::remove;::rename;::scanf;::scanf_s;::setlocale;::setvbuf;::signal;::snprintf;::snprintf_s;::sprintf;::sprintf_s;::sscanf;::sscanf_s;::strchr;::strerror_s;::strftime;::strpbrk;::strrchr;::strstr;::strtod;::strtof;::strtoimax;::strtok;::strtok_s;::strtol;::strtold;::strtoll;::strtoul;::strtoull;::strtoumax;::strxfrm;::swprintf;::swprintf_s;::swscanf;::swscanf_s;::thrd_create;::thrd_detach;::thrd_join;::thrd_sleep;::time;::timespec_get;::tmpfile;::tmpfile_s;::tmpnam;::tmpnam_s;::tss_create;::tss_get;::tss_set;::ungetc;::ungetwc;::vfprintf;::vfprintf_s;::vfscanf;::vfscanf_s;::vfwprintf;::vfwprintf_s;::vfwscanf;::vfwscanf_s;::vprintf_s;::vscanf;::vscanf_s;::vsnprintf;::vsnprintf_s;::vsprintf;::vsprintf_s;::vsscanf;::vsscanf_s;::vswprintf;::vswprintf_s;::vswscanf;::vswscanf_s;::vwprintf_s;::vwscanf;::vwscanf_s;::wcrtomb;::wcschr;::wcsftime;::wcspbrk;::wcsrchr;::wcsrtombs;::wcsrtombs_s;::wcsstr;::wcstod;::wcstof;::wcstoimax;::wcstok;::wcstok_s;::wcstol;::wcstold;::wcstoll;::wcstombs;::wcstombs_s;::wcstoul;::wcstoull;::wcstoumax;::wcsxfrm;::wctob;::wctrans;::wctype;::wmemchr;::wprintf_s;::wscanf;::wscanf_s;' + llvm-else-after-return.WarnOnUnfixable: 'false' + cert-str34-c.DiagnoseSignedUnsignedCharComparisons: 'false' + google-readability-namespace-comments.SpacesBeforeComments: '2' + cppcoreguidelines-non-private-member-variables-in-classes.IgnoreClassesWithAllMemberVariablesBeingPublic: 'true' + google-readability-braces-around-statements.ShortStatementLines: '1' + google-readability-function-size.StatementThreshold: '800' + llvm-qualified-auto.AddConstToQualified: 'false' + llvm-else-after-return.WarnOnConditionVariables: 'false' + cert-oop54-cpp.WarnOnlyIfThisHasSuspiciousField: 'false' + cppcoreguidelines-avoid-do-while.IgnoreMacros: 'true' +SystemHeaders: false +...
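The config above enables the bugprone-*, cppcoreguidelines-*, performance-*, readability-* and clang-analyzer-* groups tree-wide; findings that are intentional are silenced inline with NOLINT markers, as later hunks in this patch do (main_wrapper.c, entrypoints.cpp). A minimal sketch of that pattern, using a hypothetical helper that is not part of this patch:

    // Hypothetical example of suppressing a single finding from the config above.
    #include <cstdint>

    typedef void (*init_func)(void);

    static int32_t count_entries(init_func *start, init_func *end) {
      // The pointer difference is a ptrdiff_t; the implicit conversion to
      // int32_t would trip *-narrowing-conversions, so it is suppressed here.
      // NOLINTNEXTLINE(*-narrowing-conversions)
      int32_t len = end - start;
      return len;
    }
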
+ diff --git a/.clangd b/.clangd new file mode 100644 index 000000000..a6158c2f4 --- /dev/null +++ b/.clangd @@ -0,0 +1,29 @@ +CompileFlags: + Add: [ + -std=c++17, + -D__riscv, + -DNUM_CORES, + -DNUM_CORES_PER_TILE, + -DNUM_GROUPS, + -DSTACK_SIZE, + -DXQUEUE_SIZE, + -DSEQ_MEM_SIZE, + -DBANKING_FACTOR, + -DETL_CHECK_PUSH_POP, + -DETL_LOG_ERRORS, + -DETL_VERBOSE_ERRORS + ] + +--- + +If: + PathMatch: [.*/runtime/.*/.*] +CompileFlags: + Add: [-xc++, -I..] + +--- + +If: + PathMatch: [.*/tests/.*/.*] +CompileFlags: + Add: [-xc++, -I.., -I../../runtime/] diff --git a/CHANGELOG.md b/CHANGELOG.md index a645480ef..cea01b64d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,12 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Add `apb` dependency of version 0.2.4 - Add support for the `FENCE` instruction - Add support for DRAMsys5.0 co-simulation +- Add KMP runtime +- Add new runtime testing infrastructure +- Add automated benchmark running and plotting +- Add clangd and clang-tidy configs +- Add Banshee config +- Add OpenMP sections and teams tests ### Changes - Add physical feasible TeraPool configuration with SubGroup hierarchy. @@ -33,12 +39,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. - Update `common_verification` to 0.2.3 - Update `register_interface` to 0.4.3 - Updated Halide to version 15 +- Include SP in Snitch traces to aid in building better tracevis +- Make the `tracevis` target depend on the `trace` target and use `-cf` +- Rename the `omp` runtime directory to `gomp` +- Link with `lld` when using clang ### Fixed - Fix type issue in `snitch_addr_demux` - Properly disable the debugging CSRs in ASIC implementations - Fix a bug in the DMA's distributed midend - Fix bugs in radix2, radix4by2 parallelization and loading of data for radix4 CFFT +- Fix building verilator on newer clang and gcc versions +- Fix tracing targets missing some harts +- Fix clang warnings about the .comment section ## 0.6.0 - 2023-01-09 diff --git a/Makefile b/Makefile index 720e628b2..668513222 100644 --- a/Makefile +++ b/Makefile @@ -90,7 +90,7 @@ tc-llvm: -DCMAKE_INSTALL_PREFIX=$(LLVM_INSTALL_DIR) \ -DCMAKE_CXX_COMPILER=$(CXX) \ -DCMAKE_C_COMPILER=$(CC) \ - -DLLVM_ENABLE_PROJECTS="clang" \ + -DLLVM_ENABLE_PROJECTS="clang;lld" \ -DLLVM_TARGETS_TO_BUILD="RISCV;host" \ -DLLVM_BUILD_DOCS="0" \ -DLLVM_ENABLE_BINDINGS="0" \ @@ -146,7 +146,7 @@ $(BENDER_INSTALL_DIR)/bender: verilator: $(VERILATOR_INSTALL_DIR)/bin/verilator $(VERILATOR_INSTALL_DIR)/bin/verilator: toolchain/verilator Makefile cd $<; unset VERILATOR_ROOT; \ - autoconf && CC=$(CLANG_CC) CXX=$(CLANG_CXX) CXXFLAGS=$(CLANG_CXXFLAGS) LDFLAGS=$(CLANG_LDFLAGS) ./configure --prefix=$(VERILATOR_INSTALL_DIR) $(VERILATOR_CI) && \ + autoconf && CC=$(CLANG_CC) CXX=$(CLANG_CXX) CXXFLAGS="$(CLANG_CXXFLAGS) -include memory" LDFLAGS=$(CLANG_LDFLAGS) ./configure --prefix=$(VERILATOR_INSTALL_DIR) $(VERILATOR_CI) && \ make -j4 && make install # Update and patch hardware dependencies for MemPool diff --git a/banshee-config.yaml b/banshee-config.yaml new file mode 100644 index 000000000..b9e776dc3 --- /dev/null +++ b/banshee-config.yaml @@ -0,0 +1,53 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +--- +address: + scratch_reg: 0x40000000 + wakeup_reg: 0x40000004 + tcdm_start: 0x40000008 + tcdm_end: 0x4000000C + nr_cores: 0x40000010 + uart: 0xC0000000 + # Not supported in MemPool + barrier_reg: + start: 0x50000000 + offset: 0x100000 + cluster_base_hartid: 0x50000001 + cluster_num: 0x50000002 + cluster_id: 0x50000003 + cl_clint: 0x40000060 + clint: 0xFFFF0000 +memory: + tcdm: + start: 0x0 + size: 0x100000 + offset: 0x100000 + latency: 5 + dram: + start: 0x80000000 + size: 0x01000000 + offset: 0x0 + latency: 10 + periphs: + start: 0x40000000 + size: 0x20000 + offset: 0x0 + latency: 5 + callbacks: + - name: zero-memory + size: 0x10000 + - name: mempool-dma + size: 0x1C +inst_latency: + mul: 3 + mulh: 3 + mulhsu: 3 + mulhu: 3 + div: 3 + divu: 3 + rem: 3 + remu: 3 +ssr: + num_dm: 3 diff --git a/config/minpool-no-xpulp.mk b/config/minpool-no-xpulp.mk new file mode 100644 index 000000000..2d056fb4d --- /dev/null +++ b/config/minpool-no-xpulp.mk @@ -0,0 +1,14 @@ +# Copyright 2021 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. +# SPDX-License-Identifier: Apache-2.0 + +# Author: Matheus Cavalcante, ETH Zurich + +############### +## MinPool ## +############### + +include $(MEMPOOL_DIR)/config/minpool.mk + +# Disable Xpulpimg +xpulpimg ?= 0 diff --git a/hardware/Makefile b/hardware/Makefile index 6de18eb4f..0b9f41383 100644 --- a/hardware/Makefile +++ b/hardware/Makefile @@ -15,6 +15,9 @@ TOOLCHAIN_DIR := $(abspath $(ROOT_DIR)/../toolchain) config_mk = $(abspath $(ROOT_DIR)/../config/config.mk) include $(config_mk) +# Banshee config +banshee_config = $(MEMPOOL_DIR)/banshee-config.yaml + # build path buildpath ?= build resultpath ?= results @@ -319,6 +322,13 @@ verilate: $(VERILATOR_EXE) $(buildpath) Makefile # Avoid capturing the return status when running the load-throughput analysis if [ $(tg) -ne 1 ]; then ./scripts/return_status.sh $(buildpath)/transcript; fi +################ +# Banshee # +################ + +banshee: + banshee --configuration $(banshee_config) --num-clusters 1 --num-cores $(num_cores) -l $(preload) + ############# # Lint # ############# @@ -340,11 +350,13 @@ spyglass/tmp/files: $(bender) trace_env += NUM_CORES=$(num_cores) trace_env += SEQ_MEM_SIZE=$(seq_mem_size) +.PHONY: benchmark trace log pre_trace post_trace tracevis + benchmark: log simcvcs # Call `make` again to get variable extension with all traces result_dir=$(result_dir) $(MAKE) trace -trace: pre_trace $(trace) post_trace +trace: post_trace log: mkdir -p "$(result_dir)" @@ -361,20 +373,20 @@ log: pre_trace: rm -rf $(tracepath) -post_trace: +post_trace: $(trace) mkdir -p "$(result_dir)" cp $(buildpath)/transcript "$(result_dir)/" | true cp $(traceresult) "$(result_dir)" cp $(trace) "$(result_dir)" $(python) $(ROOT_DIR)/scripts/gen_avg.py --folder "$(result_dir)" | tee $(result_dir)/avg.txt -$(buildpath)/%.trace: $(buildpath)/%.dasm +$(buildpath)/%.trace: $(buildpath)/%.dasm pre_trace mkdir -p $(tracepath) $(INSTALL_DIR)/riscv-isa-sim/bin/spike-dasm < $< > $(tracepath)/$* $(trace_env) $(python) $(ROOT_DIR)/scripts/gen_trace.py -p --csv $(traceresult) $(tracepath)/$* > $@ -tracevis: - $(MEMPOOL_DIR)/scripts/tracevis.py $(preload) $(buildpath)/*.trace -o $(buildpath)/tracevis.json +tracevis: $(trace) + $(MEMPOOL_DIR)/scripts/tracevis.py -cf $(preload) $(buildpath)/*.trace -o $(buildpath)/tracevis.json ############################ # Unit tests simulation # diff --git a/hardware/deps/snitch/src/snitch.sv 
b/hardware/deps/snitch/src/snitch.sv index 32c58ea0e..50a2ca34a 100644 --- a/hardware/deps/snitch/src/snitch.sv +++ b/hardware/deps/snitch/src/snitch.sv @@ -2415,7 +2415,7 @@ module snitch assign sp_new_value = gpr_wdata[i]; always_ff @(posedge clk_i or posedge rst_i) begin if (!rst_i && gpr_we[i] && gpr_waddr[i] == SP && csr_stack_limit_q != 32'hFFFF_FFFF && ($signed(sp_new_value) < $signed(csr_stack_limit_q))) begin - $warning("[Stackoverflow: Core %0d] Set SP to 0x%08h, limit is 0x%08h", hart_id_i, sp_new_value, csr_stack_limit_q); + $warning("[Stackoverflow: Core %0d, PC: %0d] Set SP to 0x%08h, limit is 0x%08h", hart_id_i, inst_addr_o, sp_new_value, csr_stack_limit_q); end end end diff --git a/hardware/scripts/gen_trace.py b/hardware/scripts/gen_trace.py index 1763f9151..0b091d9b8 100755 --- a/hardware/scripts/gen_trace.py +++ b/hardware/scripts/gen_trace.py @@ -28,9 +28,10 @@ GENERAL_WARN = ('WARNING: Inconsistent final state; performance metrics may ' 'be inaccurate. Is this trace complete?\n') -TRACE_IN_REGEX = r'(\d+)\s+(\d+)\s+(0x[0-9A-Fa-fz]+)\s+([^#;]*)(\s*#;\s*(.*))?' +TRACE_IN_REGEX = (r'(\d+)\s+(\d+)\s+(0x[0-9A-Fa-fz]+)\s+(0x[0-9A-Fa-fz]+)\s+' + r'([^#;]*)(\s*#;\s*(.*))?') -TRACE_OUT_FMT = '{:>8} {:>8} {:>10} {:<30}' +TRACE_OUT_FMT = '{:>8} {:>8} {:>8} {:>10} {:<30}' # -------------------- Tracer configuration -------------------- @@ -304,7 +305,7 @@ def annotate_insn( match = re.search(TRACE_IN_REGEX, line.strip('\n')) if match is None: raise ValueError('Not a valid trace line:\n{}'.format(line)) - time_str, cycle_str, pc_str, insn, _, extras_str = match.groups() + time_str, cycle_str, pc_str, sp_str, insn, _, extras_str = match.groups() time_info = (int(time_str), int(cycle_str)) show_time_info = (dupl_time_info or time_info != last_time_info) time_info_strs = tuple((str(elem) if show_time_info else '') @@ -333,12 +334,13 @@ def annotate_insn( else: prev_wfi_time = 0 return ((TRACE_OUT_FMT + ' #; {}').format(*time_info_strs, - pc_str, insn, annot), + pc_str, sp_str, insn, annot), time_info, prev_wfi_time, retired_reg, empty) # Vanilla trace else: - return TRACE_OUT_FMT.format( - *time_info_strs, pc_str, insn), time_info, 0, retired_reg, False + return (TRACE_OUT_FMT.format( + *time_info_strs, pc_str, sp_str, insn), time_info, 0, retired_reg, + False) # -------------------- Performance metrics -------------------- diff --git a/hardware/src/mempool_cc.sv b/hardware/src/mempool_cc.sv index 96134460e..1e3579380 100644 --- a/hardware/src/mempool_cc.sv +++ b/hardware/src/mempool_cc.sv @@ -406,8 +406,8 @@ module mempool_cc extras_fpu = $sformatf("%s}", extras_fpu); $timeformat(-9, 0, "", 10); - $sformat(trace_entry, "%t %8d 0x%h DASM(%h) #; %s\n", - $time, cycle, i_snitch.pc_q, i_snitch.inst_data_i, extras_str); + $sformat(trace_entry, "%t %8d 0x%h 0x%h DASM(%h) #; %s\n", + $time, cycle, i_snitch.pc_q, i_snitch.i_snitch_regfile.mem[2], i_snitch.inst_data_i, extras_str); $fwrite(f, trace_entry); end diff --git a/scripts/license-checker.hjson b/scripts/license-checker.hjson index eea7ec365..52f2ee8de 100644 --- a/scripts/license-checker.hjson +++ b/scripts/license-checker.hjson @@ -16,7 +16,7 @@ 'software/runtime/printf*' 'software/runtime/encoding.h' 'software/runtime/mempool_dma_frontend.h' - 'software/runtime/omp/libgomp.h' + 'software/runtime/gomp/libgomp.h' 'software/riscv-tests/*' 'hardware/deps/*' 'hardware/tb/dpi/elfloader.cpp' diff --git a/scripts/tracevis.py b/scripts/tracevis.py index 66f1e2968..a379b5295 100755 --- a/scripts/tracevis.py +++ b/scripts/tracevis.py @@ 
-45,10 +45,12 @@ # 1 -> cycle # 2 -> privilege level (RTL) / hartid (banshee) # 3 -> pc (hex with 0x prefix) -# 4 -> instruction -# 5 -> args (RTL) / empty (banshee) -# 6 -> comment (RTL) / instruction arguments (banshee) -RTL_REGEX = r' *(\d+) +(\d+) +([3M1S0U]?) *(0x[0-9a-f]+) ([.\w]+) +(.+)#; (.*)' +# 4 -> sp (hex with 0x prefix) +# 5 -> instruction +# 6 -> args (RTL) / empty (banshee) +# 7 -> comment (RTL) / instruction arguments (banshee) +RTL_REGEX = (r' *(\d+) +(\d+) +([3M1S0U]?) *(0x[0-9a-f]+) *' + r'(0x[0-9a-f]+) ([.\w]+) +(.+)#; (.*)') BANSHEE_REGEX = r' *(\d+) (\d+) (\d+) ([0-9a-f]+) *.+ +.+# ([\w\.]*)( +)(.*)' # regex matches a line of instruction retired by the accelerator @@ -63,12 +65,14 @@ @lru_cache(maxsize=1024) def addr2line_cache(addr): - cmd = f'{addr2line} -e {elf} -f -a -i {addr:x}' + cmd = f'{addr2line} -C -e {elf} -f -a -i {addr:x}' return os.popen(cmd).read().split('\n') functions = [] -prev_func = "" +function_stack = [] +prev_sp = 0 +prev_func = None prev_ts = 0 start_benchmark = 0 @@ -108,14 +112,17 @@ def trace_instruction( f'"ts": {time}, ' f'"dur": {duration}, ' f'"pid": {pid}, ' - f'"tid": {functions.index(name)}, ' + f'"tid": {pid}, ' f'"args": {frame_args}' f'}},\n') output_file.write(frame) -def trace_function(name, pid, time, cyc, file): +def trace_function(name, pid, time, cyc, file, instr, sp): + global prev_sp + global prev_func + # Assemble values for json # Doc: # https://docs.google.com/document/d/1CvAClvFfyA5R-PhYUmn5OOQtYMH4h6I0nSsKchNAySU/preview @@ -129,7 +136,55 @@ def trace_function(name, pid, time, cyc, file): arg_cycles = cyc arg_coords = file - if prev_func != "": + sp = int(sp, base=16) + + if prev_func is None or name != prev_func: + if prev_func is not None and prev_func not in function_stack: + end_time = time if time > prev_ts else prev_ts + 1 + output_file.write( + f'{{' + f'"name": "{prev_func}", ' + f'"cat": "{cat}", ' + f'"ph": "E", ' + f'"ts": {end_time}, ' + f'"pid": {pid}, ' + f'"tid": {pid}' + f'}},\n' + ) + + # print(f'Stackless function {prev_func} ended') + + if name not in function_stack: + output_file.write(f'{{' + f'"name": "{name}", ' + f'"cat": "{cat}", ' + f'"ph": "B", ' + f'"ts": {time}, ' + f'"pid": {pid}, ' + f'"tid": {pid}, ' + f'"args": {{"time": "{arg_cycles}", ' + f' "Origin": "{arg_coords}"}}' + f'}},\n') + + # print(f'Begin {name}') + else: + pass + # print(f'Function {name} already in stack') + + elif sp < prev_sp: + function_stack.append(name) + # print(f'Pushed {name} to stack') + # print(f'Function stack: {function_stack}') + # print() + + elif sp > prev_sp and len(function_stack) > 0: + # pop prev function + prev_func = function_stack.pop() + + # print(f'Popped {prev_func} from stack') + # print(f'Function stack: {function_stack}') + # print() + end_time = time if time > prev_ts else prev_ts + 1 output_file.write( f'{{' @@ -138,25 +193,15 @@ def trace_function(name, pid, time, cyc, file): f'"ph": "E", ' f'"ts": {end_time}, ' f'"pid": {pid}, ' - f'"tid": {functions.index(prev_func)}' + f'"tid": {pid}' f'}},\n' ) - frame = (f'{{' - f'"name": "{name}", ' - f'"cat": "{cat}", ' - f'"ph": "B", ' - f'"ts": {time}, ' - f'"pid": {pid}, ' - f'"tid": {functions.index(name)}, ' - f'"args": {{"time": "{arg_cycles}", "Origin": "{arg_coords}"}}' - f'}},\n') - - output_file.write(frame) + prev_sp = sp + prev_func = name def flush(buf, hartid): - global prev_func global prev_ts global start_benchmark global output_file @@ -169,18 +214,18 @@ def flush(buf, hartid): a2ls += addr2line_cache(int(addr, base=16))[:-1] else: 
a2ls = os.popen( - f'{addr2line} -e {elf} -f -a -i {" ".join(pcs)}' + f'{addr2line} -C -e {elf} -f -a -i {" ".join(pcs)}' ).read().split('\n')[:-1] for i in range(len(buf)-1): - (time, cyc, priv, pc, instr, args, cmt) = buf.pop(0) + (time, cyc, priv, pc, sp, instr, args, cmt) = buf.pop(0) if use_time: next_time = int(buf[0][0]) time = int(time) else: next_time = int(buf[0][1]) - time = int(cyc) + time = float(cyc) / 1000 # Have lookahead time to this instruction? next_time = lah[time] if time in lah else next_time @@ -220,11 +265,8 @@ def flush(buf, hartid): functions.append(func) if compress_function: - if func != prev_func: - trace_function(name=func, pid=int(hartid), - time=time, cyc=cyc, file=file) - prev_func = func - prev_ts = time + trace_function(name=func, pid=int(hartid), + time=time, cyc=cyc, file=file, instr=instr, sp=sp) else: trace_instruction(name=func, pid=int(hartid), @@ -243,9 +285,9 @@ def parse_line(line, hartid): # print(line) match = re_line.match(line) if match: - (time, cyc, priv, pc, instr, args, cmt) = tuple( + (time, cyc, priv, pc, sp, instr, args, cmt) = tuple( [match.group(i+1).strip() for i in range(re_line.groups)]) - buf.append((time, cyc, priv, pc, instr, args, cmt)) + buf.append((time, cyc, priv, pc, sp, instr, args, cmt)) last_time, last_cyc = time, cyc if len(buf) > 10: @@ -263,7 +305,7 @@ def offload_lookahead(lines): for line in lines: match = re_line.match(line) if match: - (time, cyc, priv, pc, instr, args, cmt) = tuple( + (time, cyc, priv, pc, sp, instr, args, cmt) = tuple( [match.group(i+1).strip() for i in range(re_line.groups)]) time = int(time) if use_time else int(cyc) @@ -387,7 +429,6 @@ def offload_lookahead(lines): output_file.write('{"traceEvents": [\n') for filename in traces: - prev_func = "" prev_ts = 0 start_benchmark = 0 hartid = 0 @@ -425,31 +466,6 @@ def offload_lookahead(lines): print(f'=> Parsed {lines-fails} of {lines} lines', file=sys.stderr) - # Terminate last function all at end of file - if compress_function: - output_file.write( - f'{{' - f'"name": "{prev_func}", ' - f'"cat": "function", ' - f'"ph": "E", ' - f'"ts": {prev_ts+1}, ' - f'"pid": {hartid}, ' - f'"tid": {functions.index(prev_func)}' - f'}},\n' - ) - - # Write Metadata - for func in functions: - output_file.write( - f'{{' - f'"name": "thread_name", ' - f'"ph": "M", ' - f'"pid": {hartid}, ' - f'"tid": {functions.index(func)}, ' - f'"args": {{"name" : "{func}"}}' - f'}},\n' - ) - for i in range(hartid + 1): output_file.write( f'{{' diff --git a/software/apps/omp/Makefile b/software/apps/omp/Makefile index cece16052..efc9f44bc 100644 --- a/software/apps/omp/Makefile +++ b/software/apps/omp/Makefile @@ -9,12 +9,15 @@ SOFTWARE_DIR := $(abspath $(ROOT_DIR)/../..) APPS_DIR := $(ROOT_DIR) BIN_DIR := $(abspath $(SOFTWARE_DIR)/bin/$(subst $(SOFTWARE_DIR),,$(APPS_DIR))) RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) +COMPILER ?= llvm # OpenMP runtime -OMP_DIR ?= $(RUNTIME_DIR)/omp RISCV_CCFLAGS += -fopenmp -DNTHREADS=$(num_cores) RISCV_CCFLAGS += -I$(OMP_DIR) +# Wrap main function +RISCV_LDFLAGS += -Wl,-wrap,main + # This will overwrite the ROOT_DIR variable from the included makefile include $(RUNTIME_DIR)/runtime.mk diff --git a/software/apps/omp/barrier_benchmark/main.c b/software/apps/omp/barrier_benchmark/main.c new file mode 100644 index 000000000..02cab5771 --- /dev/null +++ b/software/apps/omp/barrier_benchmark/main.c @@ -0,0 +1,43 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "baremetal/mempool_conv2d_i32p.h" +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +#define MAX_BARRIERS 16 + +int main() { +#pragma omp parallel + { + unsigned int counter = 0; + unsigned int cycles = 0; + unsigned int start_time = 0; + + for (int i = 1; i < MAX_BARRIERS + 1; i++) { + + start_time = mempool_get_timer(); + mempool_start_benchmark(); + + for (int j = 0; j < i; j++) { +#pragma omp barrier + counter++; + } + + mempool_stop_benchmark(); + cycles = mempool_get_timer() - start_time; + +#pragma omp single + printf("%d barriers: %d cycles\n", i, cycles); + } + } + + return 0; +} diff --git a/software/apps/omp/barrier_conv/main.c b/software/apps/omp/barrier_conv/main.c index 5b0548057..47544704d 100644 --- a/software/apps/omp/barrier_conv/main.c +++ b/software/apps/omp/barrier_conv/main.c @@ -7,7 +7,7 @@ #include "baremetal/mempool_conv2d_i32p.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -149,37 +149,40 @@ void conv_gomp_barrier(uint32_t core_id, uint32_t num_cores) { } int main() { - mempool_timer_t cycles, start_time; - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - - if (core_id == 0) { - printf("Start Barrier Benchmark\n"); - } +#pragma omp parallel + { + mempool_timer_t cycles, start_time; + uint32_t core_id = mempool_get_core_id(); + uint32_t num_cores = mempool_get_core_count(); + + if (core_id == 0) { + printf("Start Barrier Benchmark\n"); + } #pragma omp barrier - start_time = mempool_get_timer(); - mempool_start_benchmark(); - conv_mempool_barrier(core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer(); - - if (core_id == 0) { - printf("Mempool barrier cycles: %d\n", cycles - start_time); - } + start_time = mempool_get_timer(); + mempool_start_benchmark(); + conv_mempool_barrier(core_id, num_cores); + mempool_stop_benchmark(); + cycles = mempool_get_timer(); + + if (core_id == 0) { + printf("Mempool barrier cycles: %d\n", cycles - start_time); + } - mempool_barrier(num_cores); + mempool_barrier(num_cores); - start_time = mempool_get_timer(); - mempool_start_benchmark(); - conv_gomp_barrier(core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer(); + start_time = mempool_get_timer(); + mempool_start_benchmark(); + conv_gomp_barrier(core_id, num_cores); + mempool_stop_benchmark(); + cycles = mempool_get_timer(); - if (core_id == 0) { - printf("GOMP barrier cycles: %d\n", cycles - start_time); - } + if (core_id == 0) { + printf("GOMP barrier cycles: %d\n", cycles - start_time); + } #pragma omp barrier + } return 0; } diff --git a/software/apps/omp/critical_benchmark/main.c b/software/apps/omp/critical_benchmark/main.c index b13287abf..54d9fb2c1 100644 --- a/software/apps/omp/critical_benchmark/main.c +++ b/software/apps/omp/critical_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -68,35 +68,33 @@ void omp_parallel_critical() { } int main() { - uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + uint32_t core_id = mempool_get_core_id(); - if (core_id == 0) { - printf("Initialize\n"); - *lock 
= 0; - result = 0; - } + // Initialize synchronization variables + mempool_barrier_init(core_id); - mempool_barrier(num_cores); - parallel_critical_manual(); - mempool_barrier(num_cores); + if (core_id == 0) { + printf("Initialize\n"); + *lock = 0; + result = 0; + } - result = 0; + mempool_barrier(num_cores); + parallel_critical_manual(); + mempool_barrier(num_cores); + + result = 0; + } /* OPENMP IMPLEMENTATION */ - if (core_id == 0) { - mempool_wait(4 * num_cores); - omp_parallel_critical(); - mempool_wait(100 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + mempool_wait(4 * num_cores); + omp_parallel_critical(); + mempool_wait(100 * num_cores); + return 0; } diff --git a/software/apps/omp/master_benchmark/main.c b/software/apps/omp/master_benchmark/main.c index 81768c8aa..8308feea3 100644 --- a/software/apps/omp/master_benchmark/main.c +++ b/software/apps/omp/master_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -74,36 +74,33 @@ void omp_parallel_master() { } int main() { - uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + uint32_t core_id = mempool_get_core_id(); - // #ifdef VERBOSE - if (core_id == 0) { - printf("Initialize\n"); - *checkfirst = 0; - result = 0; - } + // Initialize synchronization variables + mempool_barrier_init(core_id); - mempool_barrier(num_cores); - parallel_master_manual(); - mempool_barrier(num_cores); + // #ifdef VERBOSE + if (core_id == 0) { + printf("Initialize\n"); + *checkfirst = 0; + result = 0; + } - result = 0; + mempool_barrier(num_cores); + parallel_master_manual(); + mempool_barrier(num_cores); + + result = 0; + } /* OPENMP IMPLEMENTATION */ - if (core_id == 0) { - mempool_wait(1 * num_cores); - omp_parallel_master(); - mempool_wait(4 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + mempool_wait(1 * num_cores); + omp_parallel_master(); + mempool_wait(4 * num_cores); return 0; } diff --git a/software/apps/omp/omp_overhead/main.c b/software/apps/omp/omp_overhead/main.c index 8622c25ba..1e7c6a405 100644 --- a/software/apps/omp/omp_overhead/main.c +++ b/software/apps/omp/omp_overhead/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -14,12 +14,14 @@ #define N 16 #define M 4 -void work2(unsigned long num) { +uint32_t work2(unsigned long num) { uint32_t i; uint32_t cnt = 0; for (i = 0; i < num; i++) cnt += i; + + return cnt; } void sequential() { @@ -78,52 +80,60 @@ void section_parallel() { } } -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t cycles; +#define REPETITIONS 10 - mempool_barrier_init(core_id); - - if (core_id == 0) { - - printf("Sequential Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - sequential(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Sequential Duration: %d\n", cycles); - - printf("Static Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - static_parallel(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Static Duration: %d\n", cycles); - - printf("Dynamic Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - 
dynamic_parallel(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Dynamic Duration: %d\n", cycles); - - printf("Section Start\n"); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - section_parallel(); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Section Duration: %d\n", cycles); - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); +void startup_time() { + + uint32_t duration = 0; + + for (int i = 0; i < REPETITIONS; i++) { + uint32_t time = mempool_get_timer(); +#pragma omp parallel + { +#pragma omp single + duration += mempool_get_timer() - time; } } + printf("Startup time duration: %d\n", duration / REPETITIONS); +} + +int main() { + uint32_t cycles; + + printf("Sequential Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + sequential(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Sequential Duration: %d\n", cycles); + + printf("Static Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + static_parallel(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Static Duration: %d\n", cycles); + + printf("Dynamic Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + dynamic_parallel(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Dynamic Duration: %d\n", cycles); + + printf("Section Start\n"); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + section_parallel(); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Section Duration: %d\n", cycles); + + startup_time(); + return 0; } diff --git a/software/apps/omp/omp_parallel_for_benchmark/main.c b/software/apps/omp/omp_parallel_for_benchmark/main.c index bc4de3d07..21b4054fc 100644 --- a/software/apps/omp/omp_parallel_for_benchmark/main.c +++ b/software/apps/omp/omp_parallel_for_benchmark/main.c @@ -8,7 +8,7 @@ #include "baremetal/mempool_matmul_i32p.h" #include "baremetal/mempool_matmul_i32s.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "omp/mempool_matmul_i32.h" #include "printf.h" #include "runtime.h" @@ -86,107 +86,58 @@ void print_matrix(int32_t const *matrix, uint32_t num_rows, } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); mempool_timer_t cycles; int error; - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + // Initialize Matrices + init_matrix(a, M, N, A_a, A_b, A_c, (uint32_t)omp_get_thread_num(), + (uint32_t)omp_get_num_threads()); + init_matrix(b, N, P, B_a, B_b, B_c, (uint32_t)omp_get_thread_num(), + (uint32_t)omp_get_num_threads()); + } - // Initialize Matrices - init_matrix(a, M, N, A_a, A_b, A_c, core_id, num_cores); - init_matrix(b, N, P, B_a, B_b, B_c, core_id, num_cores); + printf("Start sequential\n"); - mempool_barrier(num_cores); + mempool_wait(1000); cycles = mempool_get_timer(); mempool_start_benchmark(); - mat_mul_parallel(a, b, c, M, N, P, core_id, num_cores); + // mat_mul_sequential(a, b, c, M, N, P); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - mempool_barrier(num_cores); - - // Check result - if (core_id == 0) { - printf("Manual Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - } else { - mempool_wait(M * P * 12); + 
printf("Sequqntial Duration: %d\n", cycles); + error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); + if (error != 0) { + printf("Error code %d\n", error); + printf("c[%d]=%d\n", error, c[error]); } - mempool_barrier(num_cores); + + printf("Start openMP\n"); cycles = mempool_get_timer(); mempool_start_benchmark(); - mat_mul_unrolled_parallel(a, b, c, M, N, P, core_id, num_cores); + mat_mul_parallel_omp(a, b, c, M, N, P); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - mempool_barrier(num_cores); - - // Check result - if (core_id == 0) { - printf("Manual unrolled Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - } else { - mempool_wait(M * P * 12); + printf("OpenMP Parallel Duration: %d\n", cycles); + error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); + if (error != 0) { + printf("Error code %d\n", error); + printf("c[%d]=%d\n", error, c[error]); } - mempool_barrier(num_cores); - - if (core_id == 0) { - - mempool_wait(1000); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - mat_mul_sequential(a, b, c, M, N, P); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Sequqntial Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - - printf("Start openMP\n"); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - mat_mul_parallel_omp(a, b, c, M, N, P); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("OpenMP Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - cycles = mempool_get_timer(); - mempool_start_benchmark(); - mat_mul_unrolled_parallel_omp(a, b, c, M, N, P); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("OpenMP Unrolled Parallel Duration: %d\n", cycles); - error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); - if (error != 0) { - printf("Error code %d\n", error); - printf("c[%d]=%d\n", error, c[error]); - } - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } + cycles = mempool_get_timer(); + mempool_start_benchmark(); + mat_mul_unrolled_parallel_omp(a, b, c, M, N, P); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("OpenMP Unrolled Parallel Duration: %d\n", cycles); + error = verify_matrix(c, M, P, A_a, A_b, A_c, B_a, B_b, B_c); + if (error != 0) { + printf("Error code %d\n", error); + printf("c[%d]=%d\n", error, c[error]); } return 0; diff --git a/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c b/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c index 42b9640a9..5deff879c 100644 --- a/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c +++ b/software/apps/omp/omp_parallel_for_dynamic_benchmark/main.c @@ -7,7 +7,7 @@ #include "data.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -65,44 +65,29 @@ void spmv_dynamic(int *y, int *data, int *colidx, int *rowb, int *rowe, int *x, } int main() { - uint32_t core_id = mempool_get_core_id(); + mempool_timer_t cycles; + int n = 512; - mempool_barrier_init(core_id); + cycles = 
mempool_get_timer(); + mempool_start_benchmark(); + spmv(y, nnz, col, rowb, rowe, x, n); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Sequqntial Duration: %d\n", cycles); - if (core_id == 0) { + cycles = mempool_get_timer(); + mempool_start_benchmark(); + spmv_static(y, nnz, col, rowb, rowe, x, n); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Static Duration: %d\n", cycles); - mempool_wait(1000); - - mempool_timer_t cycles; - int n = 512; - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - spmv(y, nnz, col, rowb, rowe, x, n); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Sequqntial Duration: %d\n", cycles); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - spmv_static(y, nnz, col, rowb, rowe, x, n); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Static Duration: %d\n", cycles); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - spmv_dynamic(y, nnz, col, rowb, rowe, x, n); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - printf("Dynamic Duration: %d\n", cycles); - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + cycles = mempool_get_timer(); + mempool_start_benchmark(); + spmv_dynamic(y, nnz, col, rowb, rowe, x, n); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + printf("Dynamic Duration: %d\n", cycles); return 0; } diff --git a/software/apps/omp/reduction_benchmark/main.c b/software/apps/omp/reduction_benchmark/main.c index 1106c633c..ffa44e539 100644 --- a/software/apps/omp/reduction_benchmark/main.c +++ b/software/apps/omp/reduction_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -144,164 +144,160 @@ int32_t dot_product_omp_dynamic(int32_t const *__restrict__ A, } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); mempool_timer_t cycles; + uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); +#pragma omp parallel + { + uint32_t core_id = mempool_get_core_id(); -#ifdef VERBOSE - if (core_id == 0) { - printf("Initialize\n"); - } -#endif - - // Initialize Matrices - init_vector(a, M, A_a, A_b, core_id, num_cores); - init_vector(b, M, B_a, B_b, core_id, num_cores); + // Initialize synchronization variables + mempool_barrier_init(core_id); #ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - // print_vector(a, M); - // print_vector(b, M); - } + if (core_id == 0) { + printf("Initialize\n"); + } #endif - mempool_barrier(num_cores); - int32_t result, correct_result; - - if (core_id == 0) { - mempool_wait(4 * num_cores); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - result = dot_product_sequential(a, b, M); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; - } + // Initialize Matrices + init_vector(a, M, A_a, A_b, core_id, num_cores); + init_vector(b, M, B_a, B_b, core_id, num_cores); #ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - printf("Sequential Result: %d\n", result); - printf("Sequential Duration: %d\n", cycles); - if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { - printf("Sequential Result is %d instead of %d\n", result, correct_result); - } else { - printf("Result is correct!\n"); + 
mempool_barrier(num_cores); + if (core_id == 0) { + // print_vector(a, M); + // print_vector(b, M); } - } #endif - mempool_barrier(num_cores); - cycles = mempool_get_timer(); - mempool_start_benchmark(); - result = dot_product_parallel1(a, b, c, M, core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; + mempool_barrier(num_cores); + int32_t result, correct_result; -#ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - printf("Manual Parallel1 Result: %d\n", result); - printf("Manual Parallel1 Duration: %d\n", cycles); - if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { - printf("Manual Parallel1 Result is %d instead of %d\n", result, - correct_result); - } else { - printf("Result is correct!\n"); + if (core_id == 0) { + mempool_wait(4 * num_cores); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + result = dot_product_sequential(a, b, M); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; } - } -#endif - mempool_barrier(num_cores); - - cycles = mempool_get_timer(); - mempool_start_benchmark(); - result = dot_product_parallel2(a, b, c, M, core_id, num_cores); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; #ifdef VERBOSE - mempool_barrier(num_cores); - if (core_id == 0) { - printf("Manual Parallel2 Result: %d\n", result); - printf("Manual Parallel2 Duration: %d\n", cycles); - if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { - printf("Manual Parallel2 Result is %d instead of %d\n", result, - correct_result); - } else { - printf("Result is correct!\n"); + mempool_barrier(num_cores); + if (core_id == 0) { + printf("Sequential Result: %d\n", result); + printf("Sequential Duration: %d\n", cycles); + if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("Sequential Result is %d instead of %d\n", result, + correct_result); + } else { + printf("Result is correct!\n"); + } } - } #endif - mempool_barrier(num_cores); - - /* OPENMP IMPLEMENTATION */ - int32_t omp_result; - - if (core_id == 0) { - mempool_wait(4 * num_cores); + mempool_barrier(num_cores); cycles = mempool_get_timer(); mempool_start_benchmark(); - omp_result = dot_product_omp_static(a, b, M); + result = dot_product_parallel1(a, b, c, M, core_id, num_cores); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - printf("OMP Static Result: %d\n", omp_result); - printf("OMP Static Duration: %d\n", cycles); - if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, - &correct_result)) { - printf("OMP Static Result is %d instead of %d\n", omp_result, - correct_result); - } else { - printf("Result is correct!\n"); +#ifdef VERBOSE + mempool_barrier(num_cores); + if (core_id == 0) { + printf("Manual Parallel1 Result: %d\n", result); + printf("Manual Parallel1 Duration: %d\n", cycles); + if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("Manual Parallel1 Result is %d instead of %d\n", result, + correct_result); + } else { + printf("Result is correct!\n"); + } } - - mempool_wait(4 * num_cores); +#endif + mempool_barrier(num_cores); cycles = mempool_get_timer(); mempool_start_benchmark(); - omp_result = dot_product_omp_dynamic(a, b, M, 4); + result = dot_product_parallel2(a, b, c, M, core_id, num_cores); mempool_stop_benchmark(); cycles = mempool_get_timer() - cycles; - printf("OMP Dynamic(4) Result: %d\n", omp_result); - printf("OMP Dynamic(4) Duration: %d\n", cycles); - if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, 
B_b, - &correct_result)) { - printf("OMP Dynamic(4) Result is %d instead of %d\n", omp_result, - correct_result); - } else { - printf("Result is correct!\n"); +#ifdef VERBOSE + mempool_barrier(num_cores); + if (core_id == 0) { + printf("Manual Parallel2 Result: %d\n", result); + printf("Manual Parallel2 Duration: %d\n", cycles); + if (!verify_dotproduct(result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("Manual Parallel2 Result is %d instead of %d\n", result, + correct_result); + } else { + printf("Result is correct!\n"); + } } +#endif + mempool_barrier(num_cores); + } - mempool_wait(4 * num_cores); + /* OPENMP IMPLEMENTATION */ + int32_t omp_result; + int32_t correct_result; - cycles = mempool_get_timer(); - mempool_start_benchmark(); - omp_result = dot_product_omp_dynamic(a, b, M, 10); - mempool_stop_benchmark(); - cycles = mempool_get_timer() - cycles; + mempool_wait(4 * num_cores); - printf("OMP Dynamic(10) Result: %d\n", omp_result); - printf("OMP Dynamic(10) Duration: %d\n", cycles); - if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, - &correct_result)) { - printf("OMP Dynamic(10) Result is %d instead of %d\n", omp_result, - correct_result); - } else { - printf("Result is correct!\n"); - } + cycles = mempool_get_timer(); + mempool_start_benchmark(); + omp_result = dot_product_omp_static(a, b, M); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + + printf("OMP Static Result: %d\n", omp_result); + printf("OMP Static Duration: %d\n", cycles); + if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("OMP Static Result is %d instead of %d\n", omp_result, + correct_result); + } else { + printf("Result is correct!\n"); + } - mempool_wait(4 * num_cores); + mempool_wait(4 * num_cores); + cycles = mempool_get_timer(); + mempool_start_benchmark(); + omp_result = dot_product_omp_dynamic(a, b, M, 4); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + + printf("OMP Dynamic(4) Result: %d\n", omp_result); + printf("OMP Dynamic(4) Duration: %d\n", cycles); + if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("OMP Dynamic(4) Result is %d instead of %d\n", omp_result, + correct_result); } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } + printf("Result is correct!\n"); } + + mempool_wait(4 * num_cores); + + cycles = mempool_get_timer(); + mempool_start_benchmark(); + omp_result = dot_product_omp_dynamic(a, b, M, 10); + mempool_stop_benchmark(); + cycles = mempool_get_timer() - cycles; + + printf("OMP Dynamic(10) Result: %d\n", omp_result); + printf("OMP Dynamic(10) Duration: %d\n", cycles); + if (!verify_dotproduct(omp_result, M, A_a, A_b, B_a, B_b, &correct_result)) { + printf("OMP Dynamic(10) Result is %d instead of %d\n", omp_result, + correct_result); + } else { + printf("Result is correct!\n"); + } + + mempool_wait(4 * num_cores); + return 0; } diff --git a/software/apps/omp/reduction_benchmark2/main.c b/software/apps/omp/reduction_benchmark2/main.c new file mode 100644 index 000000000..dadafcba8 --- /dev/null +++ b/software/apps/omp/reduction_benchmark2/main.c @@ -0,0 +1,64 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" + +int32_t dot_product_omp_static(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, + uint32_t num_elements) { + uint32_t i; + int32_t dotp = 0; +#pragma omp parallel for reduction(+ : dotp) + for (i = 0; i < num_elements; i++) { + dotp += A[i] * B[i]; + } + return dotp; +} + +int32_t dot_product_omp_dynamic(int32_t const *__restrict__ A, + int32_t const *__restrict__ B, + uint32_t num_elements) { + uint32_t i; + int32_t dotp = 0; + // printf("num_elements %d\n", num_elements); +#pragma omp parallel for schedule(dynamic) reduction(+ : dotp) + for (i = 0; i < num_elements; i++) { + dotp += A[i] * B[i]; + } + return dotp; +} + +int main() { + uint32_t num_cores = mempool_get_core_count(); + mempool_timer_t cycles; + + mempool_wait(4 * num_cores); + + for (unsigned int i = 1; i <= 8192; i *= 2) { + int32_t *a = simple_malloc(i * sizeof(int32_t)); + cycles = mempool_get_timer(); + dot_product_omp_static(a, a, i); + cycles = mempool_get_timer() - cycles; + simple_free(a); + printf("Static duration with %d elements: %d\n", i, cycles); + } + + for (unsigned int i = 1; i <= 8192; i *= 2) { + int32_t *a = simple_malloc(i * sizeof(int32_t)); + cycles = mempool_get_timer(); + dot_product_omp_dynamic(a, a, i); + cycles = mempool_get_timer() - cycles; + simple_free(a); + printf("Dynamic duration with %d elements: %d\n", i, cycles); + } + + return 0; +} diff --git a/software/apps/omp/single_benchmark/main.c b/software/apps/omp/single_benchmark/main.c index 523a02f19..2f6d191a9 100644 --- a/software/apps/omp/single_benchmark/main.c +++ b/software/apps/omp/single_benchmark/main.c @@ -6,7 +6,7 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" @@ -22,9 +22,8 @@ void work1() { } void parallel_single_manual() { - uint32_t core_id; + uint32_t core_id = mempool_get_core_id(); uint32_t num_cores = mempool_get_core_count(); - core_id = mempool_get_core_id(); work1(); @@ -71,37 +70,28 @@ void omp_parallel_single() { int main() { uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - // Initialize synchronization variables - mempool_barrier_init(core_id); - - // #ifdef VERBOSE - if (core_id == 0) { - printf("Initialize\n"); - *checkfirst = 0; - result = 0; - } +#pragma omp parallel + { + // #ifdef VERBOSE + if (core_id == 0) { + printf("Initialize\n"); + *checkfirst = 0; + result = 0; + } - mempool_barrier(num_cores); +#pragma omp barrier - parallel_single_manual(); + parallel_single_manual(); - mempool_barrier(num_cores); +#pragma omp barrier - result = 0; + result = 0; + } /* OPENMP IMPLEMENTATION */ - if (core_id == 0) { - omp_parallel_single(); + omp_parallel_single(); - mempool_wait(4 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } return 0; } diff --git a/software/apps/omp/test/main.c b/software/apps/omp/test/main.c new file mode 100644 index 000000000..5fc7f996a --- /dev/null +++ b/software/apps/omp/test/main.c @@ -0,0 +1,20 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "omp.h" +#include "printf.h" +#include "runtime.h" + +int main(){ +#pragma omp parallel + { +#pragma omp critical + {printf("First critical\n"); +} + +#pragma omp critical +{ printf("Second critical\n"); } +} +} +; diff --git a/software/runtime/omp/barrier.c b/software/runtime/gomp/barrier.c similarity index 100% rename from software/runtime/omp/barrier.c rename to software/runtime/gomp/barrier.c diff --git a/software/runtime/omp/critical.c b/software/runtime/gomp/critical.c similarity index 100% rename from software/runtime/omp/critical.c rename to software/runtime/gomp/critical.c diff --git a/software/runtime/omp/libgomp.h b/software/runtime/gomp/libgomp.h similarity index 97% rename from software/runtime/omp/libgomp.h rename to software/runtime/gomp/libgomp.h index 383e19a42..cfa86cfaa 100644 --- a/software/runtime/omp/libgomp.h +++ b/software/runtime/gomp/libgomp.h @@ -115,6 +115,6 @@ typedef struct { omp_lock_t atomic_lock; } work_t; -extern event_t event; -extern work_t works; +extern event_t event __attribute__((section(".l1"))); +extern work_t works __attribute__((section(".l1"))); #endif /* __LIBGOMP_H__ */ diff --git a/software/runtime/omp/loop.c b/software/runtime/gomp/loop.c similarity index 100% rename from software/runtime/omp/loop.c rename to software/runtime/gomp/loop.c diff --git a/software/runtime/gomp/main_wrapper.c b/software/runtime/gomp/main_wrapper.c new file mode 100644 index 000000000..a98a7c419 --- /dev/null +++ b/software/runtime/gomp/main_wrapper.c @@ -0,0 +1,43 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "libgomp.h" +#include "runtime.h" + +typedef void (*init_func)(void); +extern init_func __init_array_start[]; +extern init_func __init_array_end[]; + +static inline void initGlobals() { + // NOLINTNEXTLINE(*-narrowing-conversions) + int32_t len = __init_array_end - __init_array_start; + for (int32_t i = 0; i < len; i++) { + + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + __init_array_start[i](); + } +} + +int __real_main(); + +int __wrap_main() { + const mempool_id_t core_id = mempool_get_core_id(); + + mempool_barrier_init(core_id); + mempool_init(core_id); + + if (core_id == 0) { + initGlobals(); + __real_main(); + + printf("Program done\n"); + } else { + while (1) { + mempool_wfi(); + run_task(core_id); + } + } + + return 0; +} diff --git a/software/runtime/omp/omp-lock.h b/software/runtime/gomp/omp-lock.h similarity index 100% rename from software/runtime/omp/omp-lock.h rename to software/runtime/gomp/omp-lock.h diff --git a/software/runtime/omp/omp.h b/software/runtime/gomp/omp.h similarity index 94% rename from software/runtime/omp/omp.h rename to software/runtime/gomp/omp.h index de04877ab..554e3d691 100644 --- a/software/runtime/omp/omp.h +++ b/software/runtime/gomp/omp.h @@ -7,6 +7,8 @@ #ifndef __OMP_H__ #define __OMP_H__ +#include "stdint.h" + /* parallel.c */ extern uint32_t omp_get_num_threads(void); extern uint32_t omp_get_thread_num(void); diff --git a/software/runtime/omp/parallel.c b/software/runtime/gomp/parallel.c similarity index 100% rename from software/runtime/omp/parallel.c rename to software/runtime/gomp/parallel.c diff --git a/software/runtime/omp/sections.c b/software/runtime/gomp/sections.c similarity index 100% rename from software/runtime/omp/sections.c rename to software/runtime/gomp/sections.c diff --git 
a/software/runtime/omp/single.c b/software/runtime/gomp/single.c similarity index 100% rename from software/runtime/omp/single.c rename to software/runtime/gomp/single.c diff --git a/software/runtime/omp/work.c b/software/runtime/gomp/work.c similarity index 100% rename from software/runtime/omp/work.c rename to software/runtime/gomp/work.c diff --git a/software/runtime/kmp/barrier.cpp b/software/runtime/kmp/barrier.cpp new file mode 100644 index 000000000..32328aad7 --- /dev/null +++ b/software/runtime/kmp/barrier.cpp @@ -0,0 +1,13 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "barrier.hpp" + +namespace kmp { +Barrier::Barrier(kmp_int32 numThreads) + : barrier(0), generation(0), numThreads(numThreads) {} + +Barrier::~Barrier() { DEBUG_PRINT("Destroying barrier at %p\n", this); } + +}; // namespace kmp diff --git a/software/runtime/kmp/barrier.hpp b/software/runtime/kmp/barrier.hpp new file mode 100644 index 000000000..3b4c7d086 --- /dev/null +++ b/software/runtime/kmp/barrier.hpp @@ -0,0 +1,93 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "kmp/omp.h" +#include "kmp/types.h" +#include "kmp/util.hpp" + +#include +#include +#include + +extern "C" { +#include "runtime.h" +} + +namespace kmp { + +namespace runtime { +extern kmp_int32 numTeams; +} + +class Barrier { +public: + Barrier(Barrier &&) = delete; + Barrier &operator=(Barrier &&) = delete; + Barrier(kmp_int32 numThreads); + Barrier(const Barrier &) = delete; + Barrier &operator=(const Barrier &) = delete; + + ~Barrier(); + + inline void wait() { + if (runtime::numTeams == 1) { + DEBUG_PRINT("Entering wfi barrier at %p\n", this); + // WFI barrier + + // Increment the barrier counter + if ((numThreads - 1) == barrier.fetch_add(1, std::memory_order_relaxed)) { + DEBUG_PRINT("Barrier done at %p\n", this); + barrier.store(0, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + wake_up_all(); + } + + // Some threads have not reached the barrier --> Let's wait + // Clear the wake-up trigger for the last core reaching the barrier as + // well + mempool_wfi(); + DEBUG_PRINT("Exiting wfi barrier at %p\n", this); + + } else { + // Spin generation barrier + kmp_int32 gen = generation; + + DEBUG_PRINT("Entering spin barrier at %p, gen %d\n", this, gen); + + // Increment the barrier counter + if ((numThreads - 1) == barrier.fetch_add(1, std::memory_order_relaxed)) { + DEBUG_PRINT("Barrier done at %p\n", this); + barrier.store(0, std::memory_order_relaxed); + generation.fetch_add(1, std::memory_order_relaxed); + std::atomic_thread_fence(std::memory_order_seq_cst); + } + + while (gen == generation.load(std::memory_order_relaxed)) { + // Spin + } + + DEBUG_PRINT("Exiting spin barrier at %p, gen %d\n", this, gen); + } + }; + + inline void setNumThreads(int32_t numThreads) { + if (barrier != 0) { + DEBUG_PRINT("Cannot change the number of threads while the barrier is " + "active: %p, %d\n", + this, barrier.load()); + } + assert(barrier == 0 && + "Cannot change the number of threads while the barrier is active"); + + this->numThreads = numThreads; + } + +private: + std::atomic barrier; + std::atomic generation; + int32_t numThreads; +}; +}; // namespace kmp diff --git a/software/runtime/kmp/cppsupport.cpp 
b/software/runtime/kmp/cppsupport.cpp new file mode 100644 index 000000000..e390c1122 --- /dev/null +++ b/software/runtime/kmp/cppsupport.cpp @@ -0,0 +1,69 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#include "kmp/util.hpp" + +extern "C" { +#include "alloc.h" +} + +extern void (*_eoc)(void); + +kmp::Mutex allocLock __attribute__((section(".l1"))); + +void *operator new(size_t size) { + std::lock_guard lock(allocLock); + void *ptr = simple_malloc(size); + return ptr; +} + +void operator delete(void *ptr) noexcept { + std::lock_guard lock(allocLock); + return simple_free(ptr); +} + +void *operator new[](size_t size) { return operator new(size); } + +void operator delete[](void *ptr) noexcept { return operator delete(ptr); } + +namespace std { +void __throw_bad_alloc() { + printf("Bad alloc\n"); + abort(); +} + +void __throw_length_error(const char *msg) { + printf("Length error: %s\n", msg); + abort(); +} + +void __throw_bad_optional_access() { + printf("Bad optional access\n"); + abort(); +} +} // namespace std + +extern "C" void abort() { + printf("Aborting\n"); + while (true) { + asm("j _eoc"); + } +} + +extern "C" int __cxa_atexit(void (*func)(void *), void *arg, void *dso_handle) { + (void)func; + (void)arg; + (void)dso_handle; + return 0; +} + +extern "C" void __assert_func(const char *file, int line, const char *func, + const char *failedexpr) { + printf("Assertion failed: %s, file %s, line %d, function %s\n", failedexpr, + file, line, func); + abort(); +} diff --git a/software/runtime/kmp/entrypoints.cpp b/software/runtime/kmp/entrypoints.cpp new file mode 100644 index 000000000..b5a251dfe --- /dev/null +++ b/software/runtime/kmp/entrypoints.cpp @@ -0,0 +1,237 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/runtime.hpp" +#include "kmp/team.hpp" +#include "kmp/types.h" + +using kmp::Mutex; + +extern "C" { +#include "runtime.h" + +void __kmpc_barrier(ident_t * /*loc*/, kmp_int32 global_tid) { + kmp::runtime::getThread(global_tid).getCurrentTeam()->getBarrier().wait(); +}; + +// Parallel +void __kmpc_fork_call(ident_t * /*loc*/, kmp_int32 argc, kmpc_micro microtask, + ...) 
{ + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) + va_list args; + va_start(args, microtask); + kmp::Task kmpMicrotask(microtask, reinterpret_cast(args), argc); + kmp::runtime::getCurrentThread().forkCall(kmpMicrotask); + va_end(args); + // NOLINTEND(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) +}; + +// Static loops +void __kmpc_for_static_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_int32 *plastiter, kmp_int32 *plower, + kmp_int32 *pupper, kmp_int32 *pstride, + kmp_int32 incr, kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->forStaticInit( + loc, gtid, static_cast(schedtype), plastiter, plower, + pupper, pstride, incr, chunk); +}; + +void __kmpc_for_static_init_4u(ident_t *loc, kmp_int32 gtid, + kmp_int32 schedtype, kmp_uint32 *plastiter, + kmp_uint32 *plower, kmp_uint32 *pupper, + kmp_int32 *pstride, kmp_int32 incr, + kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->forStaticInit( + loc, gtid, static_cast(schedtype), plastiter, plower, + pupper, pstride, incr, chunk); +}; + +void __kmpc_for_static_init_8(ident_t * /*loc*/, kmp_int32 /*gtid*/, + kmp_int32 /*schedtype*/, + kmp_int64 * /*plastiter*/, kmp_int64 * /*plower*/, + kmp_int64 * /*pupper*/, kmp_int64 * /*pstride*/, + kmp_int64 /*incr*/, kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +}; + +void __kmpc_for_static_init_8u(ident_t * /*loc*/, kmp_int32 /*gtid*/, + kmp_int32 /*schedtype*/, + kmp_uint64 * /*plastiter*/, + kmp_uint64 * /*plower*/, kmp_uint64 * /*pupper*/, + kmp_int64 * /*pstride*/, kmp_int64 /*incr*/, + kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +}; + +void __kmpc_for_static_fini(ident_t * /*loc*/, kmp_int32 /*global_tid*/){}; + +// Dynamic loops +void __kmpc_dispatch_init_4(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_int32 lower, kmp_int32 upper, kmp_int32 incr, + kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchInit( + loc, gtid, + static_cast(SCHEDULE_WITHOUT_MODIFIERS(schedtype)), lower, + upper, incr, chunk); +} + +void __kmpc_dispatch_init_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 schedtype, + kmp_uint32 lower, kmp_uint32 upper, kmp_int32 incr, + kmp_int32 chunk) { + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchInit( + loc, gtid, + static_cast(SCHEDULE_WITHOUT_MODIFIERS(schedtype)), lower, + upper, incr, chunk); +} + +void __kmpc_dispatch_init_8(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_sched_type /*schedtype*/, kmp_int64 /*lower*/, + kmp_int64 /*upper*/, kmp_int64 /*incr*/, + kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +} + +void __kmpc_dispatch_init_8u(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_sched_type /*schedtype*/, kmp_uint64 /*lower*/, + kmp_uint64 /*upper*/, kmp_int64 /*incr*/, + kmp_int64 /*chunk*/) { + assert(false && "Unsupported loop index type"); +} + +int __kmpc_dispatch_next_4(ident_t *loc, kmp_int32 gtid, kmp_int32 *plastiter, + kmp_int32 *plower, kmp_int32 *pupper, + kmp_int32 *pstride) { + return static_cast( + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchNext( + loc, gtid, plastiter, plower, pupper, pstride)); +} + +int __kmpc_dispatch_next_4u(ident_t *loc, kmp_int32 gtid, kmp_int32 *plastiter, + kmp_uint32 *plower, kmp_uint32 *pupper, + kmp_int32 *pstride) { + return static_cast( + kmp::runtime::getThread(gtid).getCurrentTeam()->dispatchNext( + loc, gtid, 
plastiter, plower, pupper, pstride)); +} + +int __kmpc_dispatch_next_8(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_int64 * /*plastiter*/, kmp_int64 * /*plower*/, + kmp_int64 * /*pupper*/, kmp_int64 * /*pstride*/) { + assert(false && "Unsupported loop index type"); + return 0; +} + +int __kmpc_dispatch_next_8u(ident_t * /*loc*/, kmp_int64 /*gtid*/, + kmp_int64 * /*plastiter*/, kmp_uint64 * /*plower*/, + kmp_uint64 * /*pupper*/, kmp_int64 * /*pstride*/) { + assert(false && "Unsupported loop index type"); + return 0; +} + +void __kmpc_push_num_threads(ident_t * /*loc*/, kmp_int32 global_tid, + kmp_int32 num_threads) { + kmp::runtime::getThread(global_tid).requestNumThreads(num_threads); +}; + +// Critical sections +void __kmpc_critical(ident_t * /*unused*/, kmp_int32 /*gtid*/, + kmp_critical_name *crit) { + static_assert(sizeof(kmp::Mutex) <= sizeof(kmp_critical_name)); + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + kmp::Mutex *mutex = reinterpret_cast(*crit); + mutex->lock(); +}; + +void __kmpc_end_critical(ident_t * /*unused*/, kmp_int32 /*gtid*/, + kmp_critical_name *crit) { + + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast) + Mutex *mutex = reinterpret_cast(*crit); + mutex->unlock(); +}; + +// Master +kmp_int32 __kmpc_master(ident_t * /*loc*/, kmp_int32 gtid) { + return static_cast(kmp::runtime::getThread(gtid).getTid() == 0); +}; + +void __kmpc_end_master(ident_t * /*loc*/, kmp_int32 /*gtid*/){/* NOOP */}; + +// Single (same as master for now) +kmp_int32 __kmpc_single(ident_t * /*loc*/, kmp_int32 gtid) { + return static_cast(kmp::runtime::getThread(gtid).getTid() == 0); +}; + +void __kmpc_end_single(ident_t * /*loc*/, kmp_int32 /*gtid*/){/* NOOP */}; + +// Copyprivate +void __kmpc_copyprivate(ident_t *loc, kmp_int32 gtid, size_t cpy_size, + void *cpy_data, void (*cpy_func)(void *, void *), + kmp_int32 didit) { + kmp::runtime::getThread(gtid).getCurrentTeam()->copyPrivate( + loc, gtid, cpy_size, cpy_data, cpy_func, didit); +}; + +// Reduction +kmp_int32 __kmpc_reduce_nowait(ident_t * /*loc*/, kmp_int32 /*global_tid*/, + kmp_int32 /*num_vars*/, size_t /*reduce_size*/, + void * /*reduce_data*/, + void (* /*reduce_func*/)(void *lhs_data, + void *rhs_data), + kmp_critical_name * /*lck*/) { + return 2; // Atomic reduction +} + +void __kmpc_end_reduce_nowait(ident_t * /*loc*/, kmp_int32 /*global_tid*/, + kmp_critical_name * /*lck*/) { + /* NOOP */ +} + +kmp_int32 __kmpc_reduce(ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, + size_t reduce_size, void *reduce_data, + void (*reduce_func)(void *lhs_data, void *rhs_data), + kmp_critical_name *lck) { + + return __kmpc_reduce_nowait(loc, global_tid, num_vars, reduce_size, + reduce_data, reduce_func, lck); +} + +void __kmpc_end_reduce(ident_t *loc, kmp_int32 global_tid, + kmp_critical_name * /*lck*/) { + return __kmpc_barrier(loc, global_tid); +} + +// Teams +void __kmpc_fork_teams(ident_t * /*loc*/, kmp_int32 argc, kmpc_micro microtask, + ...) 
{ + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) + va_list args; + va_start(args, microtask); + kmp::Task kmpMicrotask(microtask, reinterpret_cast(args), argc); + kmp::runtime::getCurrentThread().forkTeams(kmpMicrotask); + va_end(args); + // NOLINTEND(cppcoreguidelines-pro-bounds-array-to-pointer-decay, + // cppcoreguidelines-pro-type-reinterpret-cast) +} + +void __kmpc_push_num_teams(ident_t * /*loc*/, kmp_int32 /*global_tid*/, + kmp_int32 num_teams, kmp_int32 num_threads) { + DEBUG_PRINT("num_teams: %d, num_threads: %d\n", num_teams, num_threads); + if (num_teams > 0) { + kmp::runtime::requestedNumTeams = std::min(num_teams, NUM_CORES / 2); + } + + if (num_threads > 0) { + kmp::runtime::requestedThreadLimit = num_threads; + } +} + +kmp_int32 __kmpc_global_thread_num(ident_t * /*loc*/) { + return static_cast(mempool_get_core_id()); +}; +} diff --git a/software/runtime/kmp/main_wrapper.cpp b/software/runtime/kmp/main_wrapper.cpp new file mode 100644 index 000000000..22c1a0859 --- /dev/null +++ b/software/runtime/kmp/main_wrapper.cpp @@ -0,0 +1,59 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/runtime.hpp" + +extern "C" { +#include "runtime.h" +} + +// https://etherealwake.com/2021/09/crt-startup/ +typedef void (*init_func)(void); +extern init_func __init_array_start[]; +extern init_func __init_array_end[]; + +static inline void initGlobals() { + // NOLINTNEXTLINE(*-narrowing-conversions) + int32_t len = __init_array_end - __init_array_start; + for (int32_t i = 0; i < len; i++) { + + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) + __init_array_start[i](); + } +} + +extern "C" int __real_main(); + +bool initLock = true; + +extern "C" int __wrap_main() { + const mempool_id_t core_id = mempool_get_core_id(); + if (core_id == 0) { + DEBUG_PRINT("Running OpenMP program on %d cores\n", + mempool_get_core_count()); + + // Init heap allocators + mempool_init(0); + + // Call C++ global constructors + initGlobals(); + + initLock = false; + + DEBUG_PRINT("Init done\n"); + + // Run the program + __real_main(); + + printf("Program done\n"); + } else { + while (initLock) { + // Wait for initialization to finish + } + + kmp::runtime::runThread(static_cast(core_id)); + } + + return 0; +} diff --git a/software/runtime/kmp/omp.cpp b/software/runtime/kmp/omp.cpp new file mode 100644 index 000000000..3f5d04064 --- /dev/null +++ b/software/runtime/kmp/omp.cpp @@ -0,0 +1,22 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#include "omp.h" +#include "kmp/runtime.hpp" +#include "kmp/team.hpp" + +void not_implemented(void) { printf("Not implemented\n"); } + +int omp_get_num_threads(void) { + return kmp::runtime::getCurrentThread().getCurrentTeam()->getNumThreads(); +} + +int omp_get_thread_num(void) { + return kmp::runtime::getCurrentThread().getTid(); +}; + +int omp_get_num_teams(void) { return kmp::runtime::numTeams; } +int omp_get_team_num(void) { + return kmp::runtime::getCurrentThread().getCurrentTeam()->getTeamId(); +} diff --git a/software/runtime/kmp/omp.h b/software/runtime/kmp/omp.h new file mode 100644 index 000000000..4c6abcf79 --- /dev/null +++ b/software/runtime/kmp/omp.h @@ -0,0 +1,21 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +/* Standard public APIs */ + +#pragma once + +#ifdef __cplusplus +extern "C" { +#endif + +extern int omp_get_num_threads(void); +extern int omp_get_thread_num(void); + +extern int omp_get_num_teams(void); +extern int omp_get_team_num(void); + +#ifdef __cplusplus +} +#endif diff --git a/software/runtime/kmp/runtime.cpp b/software/runtime/kmp/runtime.cpp new file mode 100644 index 000000000..be79ae434 --- /dev/null +++ b/software/runtime/kmp/runtime.cpp @@ -0,0 +1,34 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/team.hpp" +#include "kmp/types.h" + +#include <array> +#include <utility> + +namespace kmp { + +namespace runtime { + +template <kmp_int32... Is> +constexpr std::array<Thread, sizeof...(Is)> +sequencetoArray(std::integer_sequence<kmp_int32, Is...> /*unused*/) { + return {{Is...}}; +} + +std::array<Thread, NUM_CORES> threads = + sequencetoArray(std::make_integer_sequence<kmp_int32, NUM_CORES>{}); + +Team defaultTeam(0, 0); + +std::optional<kmp_int32> requestedNumTeams; +std::optional<kmp_int32> requestedThreadLimit; +kmp_int32 numTeams = 1; + +Barrier teamsBarrier(NUM_GROUPS); + +} // namespace runtime + +} // namespace kmp diff --git a/software/runtime/kmp/runtime.hpp b/software/runtime/kmp/runtime.hpp new file mode 100644 index 000000000..eca4eb814 --- /dev/null +++ b/software/runtime/kmp/runtime.hpp @@ -0,0 +1,42 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details.
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "kmp/barrier.hpp" +#include "kmp/thread.hpp" +#include "kmp/types.h" + +// NOLINTNEXTLINE(bugprone-reserved-identifier) +extern void __assert_func(const char *file, int line, const char *func, + const char *failedexpr); + +namespace kmp { + +namespace runtime { + +extern std::array<Thread, NUM_CORES> threads __attribute__((section(".l1"))); + +extern Team defaultTeam __attribute__((section(".l1"))); + +extern std::optional<kmp_int32> requestedNumTeams; +extern std::optional<kmp_int32> requestedThreadLimit; + +extern Barrier teamsBarrier __attribute__((section(".l1"))); + +static inline void runThread(kmp_int32 core_id) { + threads[static_cast<size_t>(core_id)].run(); +}; + +static inline Thread &getThread(kmp_int32 gtid) { + return threads[static_cast<size_t>(gtid)]; +}; + +static inline Thread &getCurrentThread() { + return threads[mempool_get_core_id()]; +}; + +} // namespace runtime + +} // namespace kmp diff --git a/software/runtime/kmp/task.cpp b/software/runtime/kmp/task.cpp new file mode 100644 index 000000000..6edbe8521 --- /dev/null +++ b/software/runtime/kmp/task.cpp @@ -0,0 +1,92 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/task.hpp" +#include "kmp/runtime.hpp" + +extern "C" { +#include "runtime.h" +} + +namespace kmp { +Task::Task(kmpc_micro func, void **args, kmp_int32 argc) + : func(func), argc(argc), args(args) { + + assert(argc <= MAX_ARGS && "Unsupported number of microtask arguments"); + + DEBUG_PRINT("Microtask constructor\n"); +}; + +void Task::run(kmp_int32 gtid, kmp_int32 tid) const { + // There does not seem to be a better way to do this without custom passes or + // ASM + switch (argc) { + default: + return; + + // NOLINTBEGIN(cppcoreguidelines-pro-bounds-pointer-arithmetic,*-magic-numbers) + case 0: + func(&gtid, &tid); + break; + case 1: + func(&gtid, &tid, args[0]); + break; + case 2: + func(&gtid, &tid, args[0], args[1]); + break; + case 3: + func(&gtid, &tid, args[0], args[1], args[2]); + break; + case 4: + func(&gtid, &tid, args[0], args[1], args[2], args[3]); + break; + case 5: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4]); + break; + case 6: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5]); + break; + case 7: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6]); + break; + case 8: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7]); + break; + case 9: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8]); + break; + case 10: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9]); + break; + case 11: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10]); + break; + case 12: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10], args[11]); + break; + case 13: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10], args[11], args[12]); + break; + case 14: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7], args[8], args[9], args[10], args[11], args[12], + args[13]); + break; + case 15: + func(&gtid, &tid, args[0], args[1], args[2], args[3], args[4], args[5], + args[6], args[7],
args[8], args[9], args[10], args[11], args[12], + args[13], args[14]); + break; + } + // NOLINTEND(cppcoreguidelines-pro-bounds-pointer-arithmetic,*-magic-numbers) +}; + +} // namespace kmp diff --git a/software/runtime/kmp/task.hpp b/software/runtime/kmp/task.hpp new file mode 100644 index 000000000..e5a9b03af --- /dev/null +++ b/software/runtime/kmp/task.hpp @@ -0,0 +1,25 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include "types.h" + +#define MAX_ARGS 15 + +namespace kmp { + +class Task { +public: + Task(kmpc_micro func, void **args, kmp_int32 argc); + + void run(kmp_int32 gtid, kmp_int32 tid) const; + +private: + kmpc_micro func; + kmp_int32 argc; + void **args; +}; + +}; // namespace kmp diff --git a/software/runtime/kmp/team.hpp b/software/runtime/kmp/team.hpp new file mode 100644 index 000000000..fd656e4aa --- /dev/null +++ b/software/runtime/kmp/team.hpp @@ -0,0 +1,318 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include + +#include "kmp/barrier.hpp" +#include "kmp/runtime.hpp" +#include "kmp/types.h" +#include "kmp/util.hpp" +#include "printf.h" + +namespace kmp { + +class Thread; +class Task; +class Barrier; + +class Team { + + struct DynamicSchedule { + kmp_uint32 lowerNext = 0; + kmp_uint32 upper = 0; + kmp_uint32 chunk = 0; // Chunk size assumed to be positive + kmp_int32 incr = 0; + kmp_int32 stride = 0; + + bool valid = false; + kmp_int32 numDone = 0; + + Mutex mutex; + }; + +public: + Team(const Team &) = delete; + Team(Team &&) = delete; + Team &operator=(const Team &) = delete; + Team &operator=(Team &&) = delete; + + inline Team(kmp_int32 masterGtid, kmp_int32 teamId) + : masterGtid(masterGtid), teamId(teamId), barrier(numThreads), + implicitTask(nullptr, nullptr, 0) {} + + inline ~Team() { + for (kmp_int32 i = masterGtid + 1; i < masterGtid + numThreads; i++) { + while (runtime::getThread(i).isRunning()) { + // Wait for thread to finish + DEBUG_PRINT("Waiting for thread %d to finish\n", i); + } + } + } + + inline Barrier &getBarrier() { return barrier; } + + inline const Task &getImplicitTask() const { return implicitTask; } + + inline void setImplicitTask(Task task) { implicitTask = task; } + + inline auto getNumThreads() const { return numThreads; } + + inline void setNumThreads(kmp_int32 numThreads) { + if (teamId == runtime::numTeams - 1) { + // Last team gets the remaining threads + numThreads = std::min(numThreads, NUM_CORES - masterGtid); + } else { + // Limit thread number + numThreads = std::min(numThreads, NUM_CORES / runtime::numTeams); + } + + DEBUG_PRINT("Team %d has %d threads\n", teamId, numThreads); + + this->numThreads = numThreads; + this->barrier.setNumThreads(numThreads); + } + + inline auto setCopyPrivateData(void *data) { copyPrivateData = data; } + + inline auto getCopyPrivateData() const { return copyPrivateData; } + + inline void run() { + if (runtime::numTeams > 1) { + for (kmp_int32 i = masterGtid + 1; i < masterGtid + numThreads; i++) { + auto &thread = runtime::getThread(i); + thread.setTid(i - masterGtid); + + if (i != masterGtid) { + thread.setCurrentTeam(this); + thread.wakeUp(); + } + } + } else { + for (kmp_int32 i = masterGtid + 1; i < masterGtid + numThreads; i++) { + auto &thread = runtime::getThread(i); + 
thread.setCurrentTeam(this); + } + + wake_up_all(); + mempool_wfi(); + } + } + + inline auto getTeamId() const { return teamId; } + + /** + * @brief Schedule a static for loop. See + * https://github.com/llvm/llvm-project/blob/f28c006a5895fc0e329fe15fead81e37457cb1d1/clang/lib/CodeGen/CGStmtOpenMP.cpp#L2900 + * + * @tparam T Loop index type + * @param loc Source code location + * @param gtid Global thread ID + * @param schedtype Scheduling type + * @param plastiter Pointer to last iteration flag, true if the current thread + * executes the last iteration, false otherwise + * @param plower Pointer to lower bound for this thread + * @param pupper Pointer to upper bound for this thread + * @param pstride Pointer to stride for this thread + * @param chunk Chunk size + */ + template ::type, + typename UnsignedT = typename std::make_unsigned::type> + void forStaticInit(ident_t * /*loc*/, kmp_int32 gtid, + kmp_sched_type schedtype, T *plastiter, T *plower, + T *pupper, SignedT *pstride, SignedT incr, + SignedT chunk) const { + + assert(incr == 1 && "Loop increment is not 1"); + + switch (schedtype) { + case kmp_sch_static: { + + // Calculate chunk size + // https://stackoverflow.com/a/14878734 + chunk = static_cast(*pupper - *plower + 1) / numThreads + + (static_cast(*pupper - *plower + 1) % numThreads != 0); + + // Fall through to static chunked + } + case kmp_sch_static_chunked: { + assert(incr != 0 && "Loop increment must be non-zero"); + assert(chunk > 0 && "Chunk size is not positive"); + assert((static_cast(chunk) <= *pupper - *plower + 1) && + "Chunk size is greater than loop size"); + + kmp_int32 tid = runtime::getThread(gtid).getTid(); + + SignedT numChunks = + (static_cast(*pupper - *plower) + chunk) / chunk; + + SignedT span = incr * chunk; + *pstride = span * static_cast(numThreads); + *plower = *plower + static_cast(tid) * static_cast(span); + *pupper = *plower + static_cast(span - incr); + *plastiter = (tid == (numChunks - 1) % numThreads); + + break; + } + + // Distribute (teams) + case kmp_distribute_static: { + + // Calculate chunk size + // https://stackoverflow.com/a/14878734 + chunk = + static_cast(*pupper - *plower + 1) / runtime::numTeams + + (static_cast(*pupper - *plower + 1) % runtime::numTeams != + 0); + + // Fall through to static chunked + } + case kmp_distribute_static_chunked: { + assert(incr != 0 && "Loop increment must be non-zero"); + assert(chunk > 0 && "Chunk size is not positive"); + assert((static_cast(chunk) <= *pupper - *plower + 1) && + "Chunk size is greater than loop size"); + + SignedT numChunks = + (static_cast(*pupper - *plower) + chunk) / chunk; + + SignedT span = incr * chunk; + *pstride = span * static_cast(runtime::numTeams); + *plower = *plower + static_cast(teamId) * static_cast(span); + *pupper = *plower + static_cast(span - incr); + *plastiter = (teamId == (numChunks - 1) % runtime::numTeams); + + break; + } + default: { + assert(false && "Unsupported scheduling type"); + break; + } + } + } + + template ::type, + typename UnsignedT = typename std::make_unsigned::type> + void dispatchInit(ident_t * /*loc*/, kmp_int32 /*gtid*/, + kmp_sched_type schedtype, T lower, T upper, SignedT incr, + SignedT chunk) { + + assert(incr == 1 && "Loop increment is not 1"); + assert(chunk > 0 && "Chunk size is not positive"); + assert((static_cast(chunk) <= upper - lower + 1) && + "Chunk size is greater than loop size"); + + DEBUG_PRINT("Dispatch init\n"); + + DEBUG_PRINT("Got dynamic schedule\n"); + + switch (schedtype) { + case kmp_sch_dynamic_chunked: { + 
std::lock_guard lock(dynamicSchedule.mutex); + + if (dynamicSchedule.valid) { + DEBUG_PRINT("Dynamic schedule is already valid\n"); + return; + } + + SignedT span = incr * chunk; + + dynamicSchedule.lowerNext = static_cast(lower); + dynamicSchedule.upper = static_cast(upper); + dynamicSchedule.chunk = static_cast(chunk); + dynamicSchedule.incr = incr; + dynamicSchedule.stride = span * static_cast(numThreads); + + DEBUG_PRINT( + "Dynamic schedule: lowerNext=%d, upper=%d, chunk=%d, incr=%d, " + "stride=%d, addr: %p\n", + dynamicSchedule.lowerNext, dynamicSchedule.upper, + dynamicSchedule.chunk, dynamicSchedule.incr, dynamicSchedule.stride, + &dynamicSchedule); + + dynamicSchedule.valid = true; + break; + } + default: { + printf("Unsupported scheduling type: %d\n", schedtype); + assert(false && "Unsupported scheduling type"); + break; + } + }; + } + + template ::type> + bool dispatchNext(ident_t * /*loc*/, kmp_int32 /*gtid*/, SignedT *plastiter, + T *plower, T *pupper, SignedT *pstride) { + + DEBUG_PRINT("Dispatch next\n"); + + std::lock_guard lock(dynamicSchedule.mutex); + assert(dynamicSchedule.valid && "Dynamic schedule is not valid"); + + if (dynamicSchedule.lowerNext > dynamicSchedule.upper) { + DEBUG_PRINT("Dynamic loop done\n"); + if (++dynamicSchedule.numDone == numThreads) { + dynamicSchedule.valid = false; + dynamicSchedule.numDone = 0; + } + + return false; + } + + *plower = static_cast(dynamicSchedule.lowerNext); + + dynamicSchedule.lowerNext += dynamicSchedule.chunk; + if (dynamicSchedule.lowerNext > dynamicSchedule.upper) { + *pupper = static_cast(dynamicSchedule.upper); + *plastiter = true; + } else { + *pupper = static_cast(dynamicSchedule.lowerNext - 1); + *plastiter = false; + } + + *pstride = dynamicSchedule.stride; + + return true; + }; + + inline void copyPrivate(ident_t * /*loc*/, kmp_int32 gtid, + size_t /*cpy_size*/, void *cpy_data, + void (*cpy_func)(void *, void *), kmp_int32 didit) { + (void)gtid; + + if (didit != 0) { + copyPrivateData = cpy_data; + DEBUG_PRINT("Thread %d set copyprivate data to %p\n", gtid, cpy_data); + } + + barrier.wait(); + + if (didit == 0) { + DEBUG_PRINT("Thread %d copying copyprivate data from %p to %p\n", gtid, + copyPrivateData, cpy_data); + cpy_func(cpy_data, copyPrivateData); + } + + barrier.wait(); + }; + +private: + kmp_int32 masterGtid = 0; + kmp_int32 teamId = 0; + kmp_int32 numThreads = 1; + + Barrier barrier; + + DynamicSchedule dynamicSchedule; + + void *copyPrivateData = nullptr; + + Task implicitTask; +}; + +} // namespace kmp diff --git a/software/runtime/kmp/thread.cpp b/software/runtime/kmp/thread.cpp new file mode 100644 index 000000000..9f803728a --- /dev/null +++ b/software/runtime/kmp/thread.cpp @@ -0,0 +1,123 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "kmp/thread.hpp" +#include "kmp/team.hpp" +#include "kmp/util.hpp" + +extern "C" { +#include "runtime.h" +} + +namespace kmp { + +Thread::Thread(kmp_int32 gtid) : Thread(gtid, gtid) {} + +Thread::Thread(kmp_int32 gtid, kmp_int32 tid) + : gtid(gtid), tid(tid), + currentTeam(gtid == 0 ? 
&runtime::defaultTeam : nullptr){}; +; + +void Thread::run() { + while (true) { + DEBUG_PRINT("Thread %d went to sleep\n", gtid); + mempool_wfi(); + std::lock_guard lock(running); + + DEBUG_PRINT("Thread %d woke up\n", gtid); + + if (currentTeam != nullptr && !teamsRegion.has_value()) { + + (*currentTeam).getImplicitTask().run(gtid, tid); + DEBUG_PRINT("Done running task\n"); + + Team *prevTeam = currentTeam; + currentTeam = nullptr; + tid = gtid; + + (*prevTeam).getBarrier().wait(); + + } else if (teamsRegion.has_value()) { + teamsRegion->run(gtid, tid); + DEBUG_PRINT("Done running teams region\n"); + + teamsRegion.reset(); + + delete currentTeam; + currentTeam = nullptr; + tid = gtid; + + runtime::teamsBarrier.wait(); + + } else { + DEBUG_PRINT("Thread %d woke up to no work. currentTeam: %p, " + "teamsRegion.has_value(): %d\n", + gtid, currentTeam, teamsRegion.has_value()); + } + } +}; + +void Thread::forkCall(Task parallelRegion) { + kmp_int32 numThreads = this->requestedNumThreads.value_or(NUM_CORES); + this->requestedNumThreads.reset(); + + DEBUG_PRINT("Forking call with %d threads\n", numThreads); + + Team *team = currentTeam; + + // Setup + team->setNumThreads(numThreads); + team->setImplicitTask(parallelRegion); + + // Run on all threads + team->run(); + parallelRegion.run(gtid, tid); + + DEBUG_PRINT("Done running task\n"); + DEBUG_PRINT("Fork call done\n"); + + team->getBarrier().wait(); +}; + +void Thread::forkTeams(Task teamsRegion) { + runtime::numTeams = runtime::requestedNumTeams.value_or(NUM_GROUPS); + runtime::teamsBarrier.setNumThreads(runtime::numTeams); + runtime::requestedNumTeams.reset(); + + DEBUG_PRINT("Forking call with %d teams\n", runtime::numTeams); + + kmp_int32 coresPerTeam = NUM_CORES / runtime::numTeams; + + for (kmp_int32 i = 1; i < runtime::numTeams; i++) { + kmp_int32 coreId = i * coresPerTeam; + + Thread &thread = runtime::getThread(coreId); + + thread.setCurrentTeam(new Team(coreId, i)); + thread.setTeamsRegion(teamsRegion); + thread.setTid(0); + + if (runtime::requestedThreadLimit) { + thread.requestNumThreads(runtime::requestedThreadLimit.value()); + } + + thread.wakeUp(); + } + + this->setTeamsRegion(teamsRegion); + if (runtime::requestedThreadLimit) { + this->requestNumThreads(runtime::requestedThreadLimit.value()); + } + + teamsRegion.run(gtid, tid); + this->teamsRegion.reset(); + + DEBUG_PRINT("Fork teams done\n"); + + runtime::teamsBarrier.wait(); + + runtime::numTeams = 1; +}; + +} // namespace kmp diff --git a/software/runtime/kmp/thread.hpp b/software/runtime/kmp/thread.hpp new file mode 100644 index 000000000..401c8612a --- /dev/null +++ b/software/runtime/kmp/thread.hpp @@ -0,0 +1,80 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +#include "kmp/task.hpp" +#include "kmp/types.h" +#include "kmp/util.hpp" +#include "runtime.h" + +namespace kmp { + +// Forward declaration +class Team; + +class Thread { + +public: + Thread(kmp_int32 gtid); + Thread(kmp_int32 gtid, kmp_int32 tid); + + Thread(const Thread &other) = delete; + Thread(Thread &&) = delete; + Thread &operator=(const Thread &) = delete; + Thread &operator=(Thread &&) = delete; + + ~Thread() = default; + + void run(); + + inline void wakeUp() { + std::lock_guard lock(running); + DEBUG_PRINT("Waking up thread %d\n", gtid); + wake_up(static_cast(gtid)); + }; + + inline Team *getCurrentTeam() { return currentTeam; }; + + inline void setCurrentTeam(Team *team) { + DEBUG_PRINT("Setting current team for %d: %p\n", this->gtid, team); + currentTeam = team; + }; + + inline void setTeamsRegion(Task teamsRegion) { + this->teamsRegion = teamsRegion; + }; + + inline auto getGtid() const { return gtid; }; + + inline auto getTid() const { return tid; }; + + inline void setTid(kmp_int32 tid) { this->tid = tid; }; + + inline bool isRunning() { return running.isLocked(); }; + + inline void requestNumThreads(kmp_int32 numThreads) { + this->requestedNumThreads = numThreads; + } + + void forkCall(Task parallelRegion); + + void forkTeams(Task teamsRegion); + +private: + kmp_int32 gtid; + kmp_int32 tid; + + Mutex running; + + Team *currentTeam; + + // Contains task if this is the initial thread (master) of the teams region + std::optional teamsRegion; + + std::optional requestedNumThreads; +}; +}; // namespace kmp diff --git a/software/runtime/kmp/types.h b/software/runtime/kmp/types.h new file mode 100644 index 000000000..ea4c2019f --- /dev/null +++ b/software/runtime/kmp/types.h @@ -0,0 +1,140 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include + +typedef uint32_t kmp_uint32; +typedef int32_t kmp_int32; +typedef uint64_t kmp_uint64; +typedef int64_t kmp_int64; + +typedef struct { + int reserved_1; + int flags; + int reserved_2; + int reserved_3; + char *psource; +} ident_t; + +// NOLINTNEXTLINE(cppcoreguidelines-*,readability-*) +typedef kmp_int32 kmp_critical_name[8]; + +typedef void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...); +typedef void (*kmpc_micro_bound)(kmp_int32 *bound_tid, kmp_int32 *bound_nth, + ...); + +enum kmp_sched_type : kmp_int32 { + kmp_sch_lower = 32, /**< lower bound for unordered values */ + kmp_sch_static_chunked = 33, + kmp_sch_static = 34, /**< static unspecialized */ + kmp_sch_dynamic_chunked = 35, + kmp_sch_guided_chunked = 36, /**< guided unspecialized */ + kmp_sch_runtime = 37, + kmp_sch_auto = 38, /**< auto */ + kmp_sch_trapezoidal = 39, + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_static_greedy = 40, + kmp_sch_static_balanced = 41, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_guided_iterative_chunked = 42, + kmp_sch_guided_analytical_chunked = 43, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_static_steal = 44, + + /* static with chunk adjustment (e.g., simd) */ + kmp_sch_static_balanced_chunked = 45, + kmp_sch_guided_simd = 46, /**< guided with chunk adjustment */ + kmp_sch_runtime_simd = 47, /**< runtime with chunk adjustment */ + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_sch_upper, /**< upper bound for unordered values */ + + kmp_ord_lower = 64, /**< lower bound for ordered values, must be power of 2 */ + kmp_ord_static_chunked = 65, + kmp_ord_static = 66, /**< ordered static unspecialized */ + kmp_ord_dynamic_chunked = 67, + kmp_ord_guided_chunked = 68, + kmp_ord_runtime = 69, + kmp_ord_auto = 70, /**< ordered auto */ + kmp_ord_trapezoidal = 71, + kmp_ord_upper, /**< upper bound for ordered values */ + + /* Schedules for Distribute construct */ + kmp_distribute_static_chunked = 91, /**< distribute static chunked */ + kmp_distribute_static = 92, /**< distribute static unspecialized */ + + /* For the "nomerge" versions, kmp_dispatch_next*() will always return a + single iteration/chunk, even if the loop is serialized. For the schedule + types listed above, the entire iteration vector is returned if the loop is + serialized. This doesn't work for gcc/gcomp sections. 
*/ + kmp_nm_lower = 160, /**< lower bound for nomerge values */ + + kmp_nm_static_chunked = + (kmp_sch_static_chunked - kmp_sch_lower + kmp_nm_lower), + kmp_nm_static = 162, /**< static unspecialized */ + kmp_nm_dynamic_chunked = 163, + kmp_nm_guided_chunked = 164, /**< guided unspecialized */ + kmp_nm_runtime = 165, + kmp_nm_auto = 166, /**< auto */ + kmp_nm_trapezoidal = 167, + + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_nm_static_greedy = 168, + kmp_nm_static_balanced = 169, + /* accessible only through KMP_SCHEDULE environment variable */ + kmp_nm_guided_iterative_chunked = 170, + kmp_nm_guided_analytical_chunked = 171, + kmp_nm_static_steal = + 172, /* accessible only through OMP_SCHEDULE environment variable */ + + kmp_nm_ord_static_chunked = 193, + kmp_nm_ord_static = 194, /**< ordered static unspecialized */ + kmp_nm_ord_dynamic_chunked = 195, + kmp_nm_ord_guided_chunked = 196, + kmp_nm_ord_runtime = 197, + kmp_nm_ord_auto = 198, /**< auto */ + kmp_nm_ord_trapezoidal = 199, + kmp_nm_upper, /**< upper bound for nomerge values */ + + /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. Since + we need to distinguish the three possible cases (no modifier, monotonic + modifier, nonmonotonic modifier), we need separate bits for each modifier. + The absence of monotonic does not imply nonmonotonic, especially since 4.5 + says that the behaviour of the "no modifier" case is implementation defined + in 4.5, but will become "nonmonotonic" in 5.0. + + Since we're passing a full 32 bit value, we can use a couple of high bits + for these flags; out of paranoia we avoid the sign bit. + + These modifiers can be or-ed into non-static schedules by the compiler to + pass the additional information. They will be stripped early in the + processing in __kmp_dispatch_init when setting up schedules, so most of the + code won't ever see schedules with these bits set. */ + kmp_sch_modifier_monotonic = + (1 << 29), /**< Set if the monotonic schedule modifier was present */ + kmp_sch_modifier_nonmonotonic = + (1 << 30), /**< Set if the nonmonotonic schedule modifier was present */ + +// NOLINTBEGIN(cppcoreguidelines-macro-usage) +#define SCHEDULE_WITHOUT_MODIFIERS(s) \ + (kmp_sched_type)( \ + (s) & ~(kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) +#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sch_modifier_monotonic) != 0) +#define SCHEDULE_HAS_NONMONOTONIC(s) (((s)&kmp_sch_modifier_nonmonotonic) != 0) +#define SCHEDULE_HAS_NO_MODIFIERS(s) \ + (((s) & (kmp_sch_modifier_nonmonotonic | kmp_sch_modifier_monotonic)) == 0) +#define SCHEDULE_GET_MODIFIERS(s) \ + ((kmp_sched_type)((s) & (kmp_sch_modifier_nonmonotonic | \ + kmp_sch_modifier_monotonic))) +#define SCHEDULE_SET_MODIFIERS(s, m) \ + ((s) = (kmp_sched_type)((kmp_int32)(s) | (kmp_int32)(m))) +#define SCHEDULE_NONMONOTONIC 0 +#define SCHEDULE_MONOTONIC 1 + // NOLINTEND(cppcoreguidelines-macro-usage) + // + kmp_sch_default = kmp_sch_static /**< default scheduling algorithm */ +}; diff --git a/software/runtime/kmp/util.hpp b/software/runtime/kmp/util.hpp new file mode 100644 index 000000000..4ca2baa34 --- /dev/null +++ b/software/runtime/kmp/util.hpp @@ -0,0 +1,196 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include +#include +#include + +#include "printf.h" + +extern "C" { +#include "alloc.h" +} + +namespace kmp { +class Mutex; +} + +extern kmp::Mutex allocLock; + +namespace kmp { + +#ifndef NDEBUG +#define DEBUG_PRINT(...) printf(__VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +class Mutex { +public: + inline void lock() { + while (locked.exchange(true, std::memory_order_acquire)) { + } + } + + inline bool tryLock() { + return !locked.exchange(true, std::memory_order_acquire); + } + + inline bool isLocked() { return locked.load(std::memory_order_acquire); } + + inline void unlock() { locked.store(false, std::memory_order_release); } + +private: + std::atomic locked = false; +}; + +template class SharedPointer { +public: + SharedPointer() : refCount(nullptr), ptr(nullptr) {} + + explicit SharedPointer(T *ptr) + : refCount(new std::atomic(1)), ptr(ptr) {} + + SharedPointer(const SharedPointer &other) + : refCount(other.refCount), ptr(other.ptr) { + (*refCount)++; + } + + SharedPointer(SharedPointer &&other) noexcept + : refCount(other.refCount), ptr(other.ptr) { + other.ptr = nullptr; + other.refCount = nullptr; + } + + SharedPointer &operator=(SharedPointer &&other) noexcept { + if (this != &other) { + std::swap(ptr, other.ptr); + std::swap(refCount, other.refCount); + } + return *this; + } + + SharedPointer &operator=(const SharedPointer &other) { + if (this != &other) { + ptr = other.ptr; + refCount = other.refCount; + (*refCount)++; + } + return *this; + } + + ~SharedPointer() { + if (refCount == nullptr) { + return; + } + + if (--(*refCount) == 0) { + delete ptr; + delete refCount; + } + } + + T *get() { return ptr; } + const T *get() const { return ptr; } + + T *operator->() { return ptr; } + const T *operator->() const { return ptr; } + + T &operator*() { return *ptr; } + const T &operator*() const { return *ptr; } + + inline void reset() { + DEBUG_PRINT("Resetting shared pointer\n"); + if (refCount == nullptr) { + return; + } + + if (--(*refCount) == 0) { + delete ptr; + delete refCount; + } + + refCount = nullptr; + ptr = nullptr; + } + +private: + std::atomic *refCount; + T *ptr; +}; + +template class UniquePointer { +public: + explicit UniquePointer(T *ptr) : ptr(ptr) {} + + // Move constructor + UniquePointer(UniquePointer &&other) noexcept : ptr(other.ptr) { + other.ptr = nullptr; + } + + // Move assignment + UniquePointer &operator=(UniquePointer &&other) noexcept { + if (this != &other) { + delete ptr; + ptr = other.ptr; + other.ptr = nullptr; + } + return *this; + } + + // Disallow copy constructor and assignment + UniquePointer(const UniquePointer &other) = delete; + UniquePointer &operator=(const UniquePointer &other) = delete; + + ~UniquePointer() { delete ptr; } + + T *get() { return ptr; } + const T *get() const { return ptr; } + + T *operator->() { return ptr; } + const T *operator->() const { return ptr; } + + T &operator*() { return *ptr; } + const T &operator*() const { return *ptr; } + +private: + T *ptr; +}; + +template struct Allocator { + typedef T value_type; + + Allocator() = default; + + template + constexpr Allocator(const Allocator & /*unused*/) noexcept {} + + [[nodiscard]] T *allocate(std::size_t n) { + assert(n <= std::numeric_limits::max() / sizeof(T)); + std::lock_guard lock(allocLock); + + DEBUG_PRINT("Allocating %lu bytes\n", n * sizeof(T)); + if (auto ptr = static_cast(simple_malloc(n * sizeof(T)))) { + DEBUG_PRINT("Allocated %lu bytes at %p\n", n * sizeof(T), ptr); + return 
ptr; + } + + assert(false && "Allocation failed"); + return nullptr; + } + + void deallocate(T *ptr, std::size_t n) noexcept { + (void)n; + + std::lock_guard lock(allocLock); + + simple_free(ptr); + DEBUG_PRINT("Deallocated %lu bytes at %p\n", n * sizeof(T), ptr); + } +}; + +} // namespace kmp diff --git a/software/runtime/link.ld b/software/runtime/link.ld index e03fc7ae1..d710ebe62 100644 --- a/software/runtime/link.ld +++ b/software/runtime/link.ld @@ -16,11 +16,18 @@ SECTIONS { . = __seq_end; } > l1 + /* BSS on L1 */ + .bss : { + __bss_start = .; + *(.bss) + *(.sbss .sbss2 .sbss2.* .gnu.linkonce.sb2.*); + __bss_end = .; + } > l1 + /* Interleaved region on L1 */ .l1 (NOLOAD): { *(.l1_prio) *(.l1) - *(.bss) __l1_alloc_base = ALIGN(0x10); __heap_start = .; } > l1 @@ -35,6 +42,14 @@ SECTIONS { _etext = .; } > l2 + /* Init array on L2 */ + .init_array : { + HIDDEN (__init_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*))) + KEEP (*(.init_array)) + HIDDEN (__init_array_end = .); + } > l2 + /* RO Data on L2 */ .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) @@ -51,20 +66,16 @@ SECTIONS { . = ALIGN(0x10); *(.data) } > l2 + .sdata2 : { *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) } > l2 + .sdata : { __global_pointer$ = . + 0x800; *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*) *(.sdata .sdata.* .gnu.linkonce.s.*) - } > l2 - - .bss : { - __bss_start = .; - *(.bss) - *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*); - __bss_end = .; + . = ALIGN(0x10); } > l2 .l2 : { @@ -75,5 +86,5 @@ SECTIONS { .comment : { *(.comment) - } > l2 + } } diff --git a/software/runtime/runtime.h b/software/runtime/runtime.h index 4abdbd682..a1714bdb1 100644 --- a/software/runtime/runtime.h +++ b/software/runtime/runtime.h @@ -140,6 +140,7 @@ static inline void wake_up_tile(uint32_t group_id, uint32_t tile_mask) { case 0: wake_up_tile_g0_reg = tile_mask; break; + case 1: wake_up_tile_g1_reg = tile_mask; break; diff --git a/software/runtime/runtime.mk b/software/runtime/runtime.mk index 69d309158..3c5ad5958 100644 --- a/software/runtime/runtime.mk +++ b/software/runtime/runtime.mk @@ -20,9 +20,11 @@ LLVM_INSTALL_DIR ?= $(INSTALL_DIR)/llvm # HALIDE_INSTALL_DIR ?= $(INSTALL_DIR)/halide # HALIDE_INCLUDE ?= $(HALIDE_INSTALL_DIR)/include # HALIDE_LIB ?= $(HALIDE_INSTALL_DIR)/lib -OMP_DIR ?= $(ROOT_DIR)/omp +GOMP_DIR ?= $(ROOT_DIR)/gomp +KMP_DIR ?= $(ROOT_DIR)/kmp KERNELS_DIR ?= $(abspath $(ROOT_DIR)/../kernels) DATA_DIR ?= $(abspath $(ROOT_DIR)/../data) +EXT_DIR ?= $(abspath $(ROOT_DIR)/../ext) COMPILER ?= gcc XPULPIMG ?= $(xpulpimg) @@ -42,7 +44,7 @@ ifeq ($(COMPILER),gcc) # Define __XPULPIMG if the extension is active DEFINES += -D__XPULPIMG else - RISCV_ARCH_AS ?= rv$(RISCV_ARCH)ima + RISCV_ARCH ?= rv$(RISCV_XLEN)ima RISCV_ARCH_AS ?= $(RISCV_ARCH)Xpulpv2 endif # GCC Toolchain @@ -50,7 +52,8 @@ ifeq ($(COMPILER),gcc) RISCV_CC ?= $(RISCV_PREFIX)gcc RISCV_CXX ?= $(RISCV_PREFIX)g++ RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump - + # OMP runtime + OMP_DIR ?= $(GOMP_DIR) else # Use LLVM by default @@ -77,7 +80,8 @@ else RISCV_CC ?= $(LLVM_INSTALL_DIR)/bin/clang RISCV_CXX ?= $(LLVM_INSTALL_DIR)/bin/clang++ RISCV_OBJDUMP ?= $(RISCV_PREFIX)objdump - + # OMP runtime + OMP_DIR ?= $(KMP_DIR) endif RISCV_OBJCOPY ?= $(RISCV_PREFIX)objcopy RISCV_AS ?= $(RISCV_PREFIX)as @@ -109,12 +113,16 @@ ifdef terapool DEFINES += -DNUM_TILES_PER_SUB_GROUP=$(shell awk 'BEGIN{print ($(num_cores)/$(num_groups))/$(num_cores_per_tile)/$(num_sub_groups_per_group)}') endif +ifdef NDEBUG + DEFINES += -DNDEBUG +endif + 
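+# Illustrative usage (assumed invocation, mirroring what scripts/run_benchmarks.py sets up): +#   COMPILER=llvm NDEBUG=1 make -C software/apps/omp all   # KMP runtime, DEBUG_PRINT/asserts compiled out +#   COMPILER=gcc make -C software/apps/omp all             # GOMP runtime, debug output kept +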
# Specify cross compilation target. This can be omitted if LLVM is built with riscv as default target RISCV_LLVM_TARGET ?= --target=$(RISCV_TARGET) --sysroot=$(GCC_INSTALL_DIR)/$(RISCV_TARGET) --gcc-toolchain=$(GCC_INSTALL_DIR) RISCV_WARNINGS += -Wunused-variable -Wconversion -Wall -Wextra # -Werror RISCV_FLAGS_COMMON_TESTS ?= -march=$(RISCV_ARCH) -mabi=$(RISCV_ABI) -I$(ROOT_DIR) -I$(KERNELS_DIR) -I$(DATA_DIR) -static -RISCV_FLAGS_COMMON ?= $(RISCV_FLAGS_COMMON_TESTS) -g -std=gnu99 -O3 -fno-builtin-memcpy -fno-builtin-memset -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS) +RISCV_FLAGS_COMMON ?= $(RISCV_FLAGS_COMMON_TESTS) -gdwarf-4 -O3 -fno-builtin-memcpy -fno-builtin-memset -ffast-math -fno-common -fno-builtin-printf $(DEFINES) $(RISCV_WARNINGS) RISCV_FLAGS_GCC ?= -mcmodel=medany -Wa,-march=$(RISCV_ARCH_AS) -mtune=mempool -fno-tree-loop-distribute-patterns # -falign-loops=32 -falign-jumps=32 RISCV_FLAGS_LLVM ?= -mcmodel=small -mcpu=mempool-rv32 -mllvm -misched-topdown -menable-experimental-extensions # Enable soft-divsqrt when the hardware is not supported. @@ -124,16 +132,16 @@ ifeq ($(xDivSqrt), 0) endif ifeq ($(COMPILER),gcc) - RISCV_CCFLAGS += $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) - RISCV_CXXFLAGS += $(RISCV_CCFLAGS) + RISCV_CCFLAGS += $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) -std=gnu99 + RISCV_CXXFLAGS += $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) -std=c++17 RISCV_LDFLAGS += -static -nostartfiles -lm -lgcc $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON) -L$(ROOT_DIR) RISCV_OBJDUMP_FLAGS += --disassembler-option="march=$(RISCV_ARCH_AS)" # For unit tests RISCV_CCFLAGS_TESTS ?= $(RISCV_FLAGS_GCC) $(RISCV_FLAGS_COMMON_TESTS) -fvisibility=hidden -nostdlib $(RISCV_LDFLAGS) else - RISCV_CCFLAGS += $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_LLVM) $(RISCV_FLAGS_COMMON) - RISCV_CXXFLAGS += $(RISCV_CCFLAGS) - RISCV_LDFLAGS += -static -nostartfiles -lm -lgcc -mcmodel=small $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_COMMON) -L$(ROOT_DIR) + RISCV_CCFLAGS += $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_LLVM) $(RISCV_FLAGS_COMMON) -std=gnu99 + RISCV_CXXFLAGS += $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_LLVM) $(RISCV_FLAGS_COMMON) -std=c++17 -fno-exceptions -fno-threadsafe-statics + RISCV_LDFLAGS += -static -nostartfiles -nostdlib -lgcc -lm -fuse-ld=lld -mcmodel=small $(RISCV_LLVM_TARGET) $(RISCV_FLAGS_COMMON) -L$(ROOT_DIR) RISCV_OBJDUMP_FLAGS += --mcpu=mempool-rv32 ifeq ($(xDivSqrt), 0) RISCV_OBJDUMP_FLAGS += --mattr=+m,+a,+nofdiv,+xpulpmacsi,+xpulppostmod,+xpulpvect,+xpulpvectshufflepack,+zfinx @@ -154,7 +162,7 @@ RUNTIME += $(ROOT_DIR)/serial.c.o RUNTIME += $(ROOT_DIR)/string.c.o RUNTIME += $(ROOT_DIR)/synchronization.c.o -OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c")) +OMP_RUNTIME := $(addsuffix .o,$(shell find $(OMP_DIR) -name "*.c" -o -name "*.cpp")) .INTERMEDIATE: $(RUNTIME) $(OMP_RUNTIME) $(LINKER_SCRIPT) # Disable builtin rules diff --git a/software/runtime/testing.h b/software/runtime/testing.h new file mode 100644 index 000000000..324bd788c --- /dev/null +++ b/software/runtime/testing.h @@ -0,0 +1,116 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+// SPDX-License-Identifier: Apache-2.0 + +// Inspired by https://jera.com/techinfo/jtns/jtn002 + +#pragma once + +#include "printf.h" + +#define MAX_TESTS 20 + +typedef struct { + const char *name; + void (*func)(char const **out_error_message); +} test_t; + +test_t tests[MAX_TESTS]; // NOLINT + +int tests_run = 0; // NOLINT(*-global-variables) +int tests_failed = 0; // NOLINT(*-global-variables) +int num_tests = 0; // NOLINT(*-global-variables) + +#ifndef NDEBUG +#define DEBUG_PRINT(...) printf(__VA_ARGS__) +#else +#define DEBUG_PRINT(...) +#endif + +// NOLINTNEXTLINE +#define STRINGIFY(x) #x +// NOLINTNEXTLINE +#define TOSTRING(x) STRINGIFY(x) +#define LINE_STRING TOSTRING(__LINE__) + +// NOLINTNEXTLINE +#define ASSERT_TRUE(condition, error_message) \ + do { \ + if (!(condition)) { \ + *out_error_message = (error_message); \ + return; \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define ASSERT_EQ(left, right) \ + do { \ + if (!((left) == (right))) { \ + *out_error_message = \ + #left " is not equal to " #right ", " __FILE__ ":" LINE_STRING; \ + return; \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define ASSERT_NEQ(left, right) \ + do { \ + if (!((left) != (right))) { \ + *out_error_message = \ + #left " is equal to " #right ", " __FILE__ ":" LINE_STRING; \ + return; \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define EXPECT_TRUE(condition, error_message) \ + do { \ + if (!(condition)) \ + printf("\t[CHECK FAILED]: %s, %s:%d\n", (error_message), __FILE__, \ + __LINE__); \ + } while (0) + +// NOLINTNEXTLINE +#define RUN_TEST(test) \ + do { \ + printf("\n[RUNNING]: %s \n", (test).name); \ + const char *message = NULL; \ + ((test).func)(&message); \ + tests_run++; \ + if (message != NULL) { \ + tests_failed++; \ + printf("\t[ASSERTION FAILED]: %s\n", message); \ + printf("[FAIL]: %s\n", (test).name); \ + } else { \ + printf("[SUCCESS]: %s\n", (test).name); \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define RUN_ALL_TESTS() \ + do { \ + for (int i = 0; i < num_tests; i++) { \ + RUN_TEST(tests[i]); \ + } \ + } while (0) + +// NOLINTNEXTLINE +#define PRINT_TEST_RESULTS() \ + do { \ + printf("Ran %d tests\n", tests_run); \ + printf("Failed %d tests\n", tests_failed); \ + } while (0) + +// NOLINTNEXTLINE +#define TEST(testname) \ + void testname(char const **out_error_message); \ + __attribute__((constructor)) void add_##testname##_to_array(void) { \ + if (num_tests < MAX_TESTS) { \ + tests[num_tests].name = #testname; \ + tests[num_tests].func = testname; \ + num_tests++; \ + } else { \ + printf("Too many tests added, max is %d\n", MAX_TESTS); \ + } \ + } \ + inline void testname(char const **out_error_message) diff --git a/software/scripts/plot_benchmarks.py b/software/scripts/plot_benchmarks.py new file mode 100644 index 000000000..cd2324656 --- /dev/null +++ b/software/scripts/plot_benchmarks.py @@ -0,0 +1,152 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import textwrap +import subprocess +import re +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +HARDWARE_DIR = "../../hardware" +APPS_DIR = "../apps" +OMP_APPS_DIR = APPS_DIR + "/omp" +UART_REGEX = re.compile(r"\[UART\] (.*): (\d+)") +GIT_COMMIT_HASH = subprocess.check_output( + ["git", "describe", "--always", "--dirty"]).strip().decode("utf-8") +OUTPUT = f'results/{GIT_COMMIT_HASH}/results.csv' + + +def plot_speedup(df): + # Separate LLVM and GCC data + llvm_data = df[df['compiler'] == 'llvm'] + gcc_data = df[df['compiler'] == 'gcc'] + + # Merge LLVM and GCC data on 'app' and 'name' + joined_data = pd.merge(llvm_data, gcc_data, on=[ + 'app', 'name'], suffixes=('_llvm', '_gcc')) + + # Calculate the speedup (GCC cycles / LLVM cycles) + joined_data['speedup'] = joined_data['cycles_gcc'] / \ + joined_data['cycles_llvm'] + + # Get unique applications + apps = joined_data['app'].unique() + + # Create bar plots for each application showing speedup + for app in apps: + # Filter data for the current app + app_data = joined_data[joined_data['app'] == app] + + # Unique test names + test_names = app_data['name'] + test_names = ['\n'.join(textwrap.wrap(name, width=10)) + for name in test_names] + + # Speedup values + speedup_values = app_data['speedup'] + + # Bar positions and bar width + bar_width = 0.4 # Width of the bars + x = np.arange(len(test_names)) # Position for bars + + # Create the bar plot + plt.figure() + + # Plot speedup values + bars = plt.bar(x, speedup_values, width=bar_width, + color='green', label='LLVM Speedup') + + # Add value labels on top of each bar + for bar, value in zip(bars, speedup_values): + height = max(1, bar.get_height()) + _, top = plt.ylim() + plt.ylim(top=max(top, height + 0.3)) + plt.text(bar.get_x() + bar.get_width() / 2, height + + 0.05, f'{value:.2f}', ha='center', va='bottom') + + # Set x-axis labels, title, etc. 
+ plt.ylabel('GCC/LLVM Ratio (Speedup)') + plt.title(f'Speedup of LLVM Against GCC for {app}') + plt.xticks(x, test_names, rotation=45, ha='right') + plt.axhline(y=1, color='red', linestyle='--', + linewidth=1, label='Baseline (GCC)') + plt.legend() + plt.tight_layout() # Adjust layout for readability + plt.savefig(f'results/{GIT_COMMIT_HASH}/{app}_speedup.png') + # plt.show() # Display plot + + +def plot_cycles(df): + # Get unique applications + apps = df['app'].unique() + + # Create bar plots for each application showing raw cycles for LLVM and GCC + for app in apps: + # Filter data for the current app + app_data = df[df['app'] == app] + + # Unique test names + test_names = app_data['name'].unique() + + # Initialize arrays for LLVM and GCC cycles + llvm_cycles = [] + gcc_cycles = [] + + # Iterate over test names and align the cycles + for test in test_names: + llvm_cycle = app_data[(app_data['name'] == test) & ( + app_data['compiler'] == 'llvm')]['cycles'] + gcc_cycle = app_data[(app_data['name'] == test) & ( + app_data['compiler'] == 'gcc')]['cycles'] + + # Add cycles only if both GCC and LLVM data are available for the + # test + if not llvm_cycle.empty and not gcc_cycle.empty: + llvm_cycles.append(llvm_cycle.iloc[0]) + gcc_cycles.append(gcc_cycle.iloc[0]) + elif not llvm_cycle.empty: + llvm_cycles.append(llvm_cycle.iloc[0]) + gcc_cycles.append(0) # Add a placeholder 0 for GCC + elif not gcc_cycle.empty: + gcc_cycles.append(gcc_cycle.iloc[0]) + llvm_cycles.append(0) # Add a placeholder 0 for LLVM + + # Bar width and x-positions with closer spacing + bar_width = 0.4 # Width of the bars + x = np.arange(len(test_names)) # Position for bars + + # Create the bar plot + plt.figure(figsize=(10, 6)) + + # Plot LLVM cycles if available + if llvm_cycles: + plt.bar(x, llvm_cycles, width=bar_width, label='LLVM') + + # Plot GCC cycles if available + if gcc_cycles: + plt.bar(x + bar_width, gcc_cycles, width=bar_width, label='GCC') + + # Set x-axis labels, title, etc. + plt.ylabel('Cycles') + plt.title(f'Cycles Comparison for {app}') + plt.xticks(x + bar_width / 2, test_names, rotation=45, ha='right') + plt.legend() + plt.tight_layout() # Adjust layout for readability + plt.savefig(f'results/{GIT_COMMIT_HASH}/{app}_cycles.png') + # plt.show() # Display plot + + +def main(): + df = pd.read_csv(OUTPUT) + + if ("dirty" in GIT_COMMIT_HASH): + print("WARNING: The current commit is dirty.") + + plot_speedup(df) + plot_cycles(df) + + +if __name__ == '__main__': + main() diff --git a/software/scripts/run_benchmarks.py b/software/scripts/run_benchmarks.py new file mode 100644 index 000000000..24c9cc15d --- /dev/null +++ b/software/scripts/run_benchmarks.py @@ -0,0 +1,81 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import subprocess +import os +import re +import pandas as pd +import runner +from pprint import pp + +APPS_DIR = "../apps" +OMP_APPS_DIR = APPS_DIR + "/omp" +UART_REGEX = re.compile(r"\[UART\] ((?!.*\bresult\b).*): (\d+)", re.IGNORECASE) +GIT_COMMIT_HASH = subprocess.check_output( + ["git", "describe", "--always", "--dirty"]).strip().decode("utf-8") +OUTPUT = f"results/{GIT_COMMIT_HASH}/results.csv" + +results = pd.DataFrame(columns=["app", "name", "compiler", "cycles"]) + + +def compileAll(dir, env): + return subprocess.run(["make", "-C", dir, "all"], env=env).returncode == 0 + + +def runAll(dir, args, env): + global results + compiler = env["COMPILER"] + + for app in os.listdir(dir): + try: + if (os.path.isfile(os.path.join(dir, app)) or app.startswith(".")): + continue + + app_dir = f"{os.path.basename(dir)}/{app}" + + (res, reason, output) = runner.run( + app_dir, args, env, lambda x: None) + if not res: + print(f"{app} did not run successfully") + print(reason) + + matches = UART_REGEX.findall(output) + for match in matches: + results = pd.concat([results, pd.DataFrame( + [{"app": app, "name": + match[0], "compiler": + compiler, "cycles": + int(match[1])}])]) + + pp(results) + print() + results.to_csv(OUTPUT, index=False) + + except KeyboardInterrupt: + continue + + +def main(): + parser = runner.get_arg_parser() + args = parser.parse_args() + + env = os.environ + + env["config"] = args.config + if not args.debug: + env["NDEBUG"] = "1" + + os.makedirs(f'results/{GIT_COMMIT_HASH}', exist_ok=True) + + for compiler in (["gcc", "llvm"] if args.compiler is None else + [args.compiler]): + env["COMPILER"] = compiler + if compileAll(OMP_APPS_DIR, env): + runAll(OMP_APPS_DIR, args, env) + else: + print(f"Failed to compile with {compiler}") + + +if __name__ == '__main__': + main() diff --git a/software/scripts/run_tests.py b/software/scripts/run_tests.py new file mode 100644 index 000000000..4221e13dc --- /dev/null +++ b/software/scripts/run_tests.py @@ -0,0 +1,108 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import os +import re +import fnmatch +import runner + +DIR = os.path.dirname(os.path.realpath(__file__)) +SOFTWARE_DIR = os.path.join(DIR, "../") +TESTS_DIR = os.path.join(SOFTWARE_DIR, "tests") +BIN_DIR = os.path.join(DIR, "../bin") +TESTS_BIN_DIR = os.path.join(BIN_DIR, "tests") + +RED = "\033[91m" +GREEN = "\033[92m" +YELLOW = "\033[93m" +RESET = "\033[0m" + + +def parse_line(line, stats): + running_re = re.compile(r"\[RUNNING\]:\s+(.*)") + success_re = re.compile(r"\[SUCCESS\]:\s+(.*)") + fail_re = re.compile(r"\[FAIL\]:\s+(.*)") + failures_re = re.compile(r"\[\w+ FAILED\]:\s+(.*)") + + if m := running_re.search(line): + stats["num_tests"] += 1 + print(f"{YELLOW}[RUNNING]{RESET}: {m.group(1)}") + elif m := success_re.search(line): + print(f"{GREEN}[SUCCESS]{RESET}: {m.group(1)}") + stats["num_success"] += 1 + elif m := fail_re.search(line): + print(f"{RED}[FAIL]{RESET}: {m.group(1)}") + elif m := failures_re.search(line): + print(f" {RED}[FAIL]{RESET}: {m.group(1)}") + + +def print_results(stats): + color = (GREEN if stats["num_success"] > 0 and + stats["num_success"] == stats["num_tests"] else RED) + print( + f'{color}' + f'[RESULT]{RESET}: ' + f'{stats["num_success"]}/{stats["num_tests"]} tests passed' + ) + + +def main(): + parser = runner.get_arg_parser() + parser.add_argument( + "tests", type=str, nargs="+", help="Tests to run (glob matching)" + ) + parser.add_argument( + "-r", + "--repetitions", + type=int, + default=10, + help="Test repetitions (not all tests use this)", + ) + + args = parser.parse_args() + + matching_tests = [] + for test in args.tests: + for root, dirs, _ in os.walk(TESTS_DIR): + for d in dirs: + full_path = os.path.relpath(os.path.join(root, d), TESTS_DIR) + if fnmatch.fnmatch(full_path, test): + matching_tests.append(full_path) + + if not matching_tests: + print("No tests found matching the pattern") + return + + print(f"Running tests: {matching_tests}") + + env = os.environ + env["REPETITIONS"] = str(args.repetitions) + + for test in sorted(set(matching_tests)): + print() + + testpath = os.path.join(TESTS_DIR, test) + + if args.compiler and not runner.compile(testpath, args, env): + continue + + if not os.path.exists(os.path.join(TESTS_BIN_DIR, test)): + print(f"Test {test} not found") + continue + + stats = {"num_tests": 0, "num_success": 0} + + res, reason, out = runner.run( + test, args, env, + lambda line: parse_line(line, stats)) + + if not res: + print(f"{RED}[FAIL]{RESET}: {reason}") + stats["num_tests"] = "?" + + print_results(stats) + + +if __name__ == "__main__": + main() diff --git a/software/scripts/runner.py b/software/scripts/runner.py new file mode 100644 index 000000000..a624cbfe7 --- /dev/null +++ b/software/scripts/runner.py @@ -0,0 +1,208 @@ +# Copyright 2024 ETH Zurich and University of Bologna. +# Licensed under the Apache License, Version 2.0, see LICENSE for details. 
+# SPDX-License-Identifier: Apache-2.0 + +import argparse +import os +import subprocess +import threading +import queue +import time +import signal + +DIR = os.path.dirname(os.path.realpath(__file__)) +HARDWARE_DIR = os.path.join(DIR, "../../hardware") + + +# https://stackoverflow.com/a/4791612 +def kill_proc(proc): + os.killpg(os.getpgid(proc.pid), signal.SIGKILL) + + +def enqueue_output(out, queue): + try: + for line in iter(out.readline, ""): + queue.put(line) + except Exception as e: + print(e) + pass + + +def compile(prog, args, env_extra): + if not os.path.exists(prog): + print(f"{prog} not found") + return False + + env = os.environ + env["config"] = args.config + env["COMPILER"] = args.compiler + env |= env_extra if env_extra else {} + + if not args.debug: + env["NDEBUG"] = "1" + + dir, progname = os.path.split(prog) + + print(f"Compiling {progname}") + comp = subprocess.run( + ["make", "-C", dir, progname], + env=env, + capture_output=True, + ) + + if comp.returncode != 0: + print(f"Failed to compile {progname}") + if args.verbose: + print(comp.stdout.decode("utf-8")) + print(comp.stderr.decode("utf-8")) + return False + + return True + + +def run(prog, args, env_extra, line_callback): + if args.simulator == "verilator": + args.simulator = "verilate" + + env = os.environ + env["config"] = args.config + env["app"] = prog + env |= env_extra if env_extra else {} + + output = "" + timer = None + + print(f"Running {prog}") + + try: + # https://stackoverflow.com/a/76624958 + proc = subprocess.Popen( + ["make", "-C", HARDWARE_DIR, args.simulator], + env=env, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + bufsize=1, + errors="replace", + preexec_fn=os.setsid + ) + + # Start a timer to kill the process if it takes too long + if args.timeout > 0: + timer = threading.Timer(args.timeout, kill_proc, [proc]) + timer.start() + + stdout_queue = queue.Queue() + stderr_queue = queue.Queue() + + stdout_thread = threading.Thread( + target=enqueue_output, args=(proc.stdout, stdout_queue) + ) + stderr_thread = threading.Thread( + target=enqueue_output, args=(proc.stderr, stderr_queue) + ) + + stdout_thread.daemon = True + stderr_thread.daemon = True + + stdout_thread.start() + stderr_thread.start() + + while True: + time.sleep(0.1) + + while not stdout_queue.empty() or not stderr_queue.empty(): + try: + stdout_line = stdout_queue.get_nowait() + except queue.Empty: + stdout_line = None + + try: + stderr_line = stderr_queue.get_nowait() + except queue.Empty: + stderr_line = None + + for line in [stdout_line, stderr_line]: + if line: + output += line + line_callback(line) + + if args.verbose: + print(line, end="") + + if ("Error 117") in line: + reason = ("Banshee called " + "the police, most likely a deadlock " + "(all threads called wfi)") + kill_proc(proc) + return (False, reason, output) + + if "Stackoverflow" in line: + kill_proc(proc) + return (False, "Stackoverflow", output) + + if (args.simulator == "banshee" and + "Program done" in line): + kill_proc(proc) + break + + if ( + proc.poll() is not None + and stdout_queue.empty() + and stderr_queue.empty() + ): + break + + if proc.returncode is not None and proc.returncode > 0: + return (False, "Non-zero return code", output) + elif timer is not None and not timer.is_alive(): + return (False, "Timeout", output) + else: + return (True, "Success", output) + + except Exception as e: + return (False, str(e), output) + + finally: + if proc.poll() is None: + kill_proc(proc) + if timer: + timer.cancel() + + +def get_arg_parser(): + parser 
= argparse.ArgumentParser() + parser.add_argument( + "-t", + "--timeout", + type=int, + default=180, + help="Timeout in seconds (set to 0 to disable)", + ) + parser.add_argument( + "-s", "--simulator", type=str, default="verilator", + help="Simulator to use" + ) + parser.add_argument( + "-c", + "--config", + type=str, + default="minpool-no-xpulp", + help="Mempool configuration", + ) + parser.add_argument( + "--compiler", + type=str, + choices=["gcc", "llvm"], + help="Compiler", + ) + parser.add_argument( + "--verbose", action="store_true", default=False, + help="Print verbose output" + ) + parser.add_argument( + "--debug", action="store_true", default=False, + help="Compile in debug mode" + ) + + return parser diff --git a/software/tests/omp/Makefile b/software/tests/omp/Makefile index 73a61326f..c9641293e 100644 --- a/software/tests/omp/Makefile +++ b/software/tests/omp/Makefile @@ -9,12 +9,19 @@ SOFTWARE_DIR := $(abspath $(ROOT_DIR)/../..) TESTS_DIR := $(ROOT_DIR) BIN_DIR := $(abspath $(SOFTWARE_DIR)/bin/$(subst $(SOFTWARE_DIR),,$(TESTS_DIR))) RUNTIME_DIR := $(abspath $(SOFTWARE_DIR)/runtime) +COMPILER ?= llvm # OpenMP runtime -OMP_DIR ?= $(RUNTIME_DIR)/omp RISCV_CCFLAGS += -fopenmp -DNTHREADS=$(num_cores) RISCV_CCFLAGS += -I$(OMP_DIR) +# Wrap main function +RISCV_LDFLAGS += -Wl,-wrap,main + +# Test repetitions +REPETITIONS ?= 10 +RISCV_CCFLAGS += -DREPETITIONS=$(REPETITIONS) + # This will overwrite the ROOT_DIR variable from the included makefile include $(RUNTIME_DIR)/runtime.mk diff --git a/software/tests/omp/atomic/main.c b/software/tests/omp/atomic/main.c index a1d63cf25..9e872c08e 100644 --- a/software/tests/omp/atomic/main.c +++ b/software/tests/omp/atomic/main.c @@ -5,303 +5,315 @@ #include #include +#include "../../runtime/testing.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif + #define MAX_FACTOR 10 #define KNOWN_PRODUCT 3628800 /* 10! 
*/ #define LOOPCOUNT 100 /* Number of iterations to slit amongst threads */ -int test_omp_atomic() { - int sum; - int diff; - int product; - int x; - int *logics; - int bit_and = 1; - int bit_or = 0; - int exclusiv_bit_or = 0; - int j; - int known_sum; - int known_diff; - int known_product; - int result = 0; - int logicsArray[LOOPCOUNT]; - logics = logicsArray; - - sum = 0; - diff = 0; - product = 1; - -// sum of integers test +int logics[LOOPCOUNT]; + +TEST(sum_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int sum = 0; + #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 1; i <= LOOPCOUNT; i++) { + for (i = 1; i <= LOOPCOUNT; i++) { #pragma omp atomic - sum += i; + sum += i; + } } + + int known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; + ASSERT_EQ(known_sum, sum); } - known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; - if (known_sum != sum) { - printf("Error in sum with integers: Result was %d instead of %d.\n", sum, - known_sum); - result++; - } +} + +TEST(difference_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int diff = 0; -// difference of integers test #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; i++) { + for (i = 0; i < LOOPCOUNT; i++) { #pragma omp atomic - diff -= i; + diff -= i; + } } + + int known_diff = ((LOOPCOUNT - 1) * LOOPCOUNT) / 2 * -1; + printf("known_diff: %d, diff: %d\n", known_diff, diff); + ASSERT_EQ(known_diff, diff); } - known_diff = ((LOOPCOUNT - 1) * LOOPCOUNT) / 2 * -1; - if (diff != known_diff) { - printf("Error in difference with integers: Result was %d instead of 0.\n", - diff); - result++; - } +} + +TEST(product_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int product = 1; -// product of integers test #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 1; i <= MAX_FACTOR; i++) { + for (i = 1; i <= MAX_FACTOR; i++) { #pragma omp atomic - product *= i; + product *= i; + } } + + ASSERT_EQ(KNOWN_PRODUCT, product); } - known_product = KNOWN_PRODUCT; - if (known_product != product) { - printf("Error in product with integers: Result was %d instead of %d\n", - product, known_product); - result++; - } +} + +TEST(division_of_integers) { + for (int count = 0; count < REPETITIONS; count++) { + int product = KNOWN_PRODUCT; - // division of integers test - product = KNOWN_PRODUCT; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 1; i <= MAX_FACTOR; ++i) { + for (i = 1; i <= MAX_FACTOR; ++i) { #pragma omp atomic - product /= i; + product /= i; + } } + + ASSERT_EQ(1, product); } - if (product != 1) { - printf("Error in product division with integers: Result was %d" - " instead of 1\n", - product); - result++; - } +} - // ++ test - x = 0; +TEST(atomic_increment) { + for (int count = 0; count < REPETITIONS; count++) { + int x = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - x++; + x++; + } } + + ASSERT_EQ(LOOPCOUNT, x); } - if (x != LOOPCOUNT) { - result++; - printf("Error in ++\n"); - } +} + +TEST(atomic_decrement) { + int x = 0; -// -- test + for (int count = 0; count < REPETITIONS; count++) { + + x = LOOPCOUNT; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - x--; + x--; + } } - } - if (x != 0) { - result++; - printf("Error in --\n"); - } - // bit-and test part 1 - for (j 
= 0; j < LOOPCOUNT; ++j) { - logics[j] = 1; + ASSERT_EQ(0, x); } - bit_and = 1; +} + +TEST(atomic_bit_and_1) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_and = 1; + + for (int j = 0; j < LOOPCOUNT; ++j) { + logics[j] = 1; + } + bit_and = 1; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_and &= logics[i]; + bit_and &= logics[i]; + } } + + ASSERT_EQ(1, bit_and); } - if (!bit_and) { - result++; - printf("Error in BIT AND part 1\n"); - } +} + +TEST(atomic_bit_and_2) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_and = 1; - // bit-and test part 2 - bit_and = 1; - logics[LOOPCOUNT / 2] = 0; + bit_and = 1; + logics[LOOPCOUNT / 2] = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_and &= logics[i]; + bit_and &= logics[i]; + } } - } - if (bit_and) { - result++; - printf("Error in BIT AND part 2\n"); - } - // bit-or test part 1 - for (j = 0; j < LOOPCOUNT; j++) { - logics[j] = 0; + ASSERT_EQ(0, bit_and); } - bit_or = 0; +} + +TEST(atomic_bit_or_1) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_or = 1; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + + bit_or = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_or |= logics[i]; + bit_or |= logics[i]; + } } + + ASSERT_EQ(0, bit_or); } - if (bit_or) { - result++; - printf("Error in BIT OR part 1\n"); - } +} + +TEST(atomic_bit_or_2) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int bit_or = 1; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + + bit_or = 0; + logics[LOOPCOUNT / 2] = 1; - // bit-or test part 2 - bit_or = 0; - logics[LOOPCOUNT / 2] = 1; #pragma omp parallel - { + { - int i; + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - bit_or |= logics[i]; + bit_or |= logics[i]; + } } + ASSERT_EQ(1, bit_or); } - if (!bit_or) { - result++; - printf("Error in BIT OR part 2\n"); - } +} + +TEST(atomix_bit_xor_1) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int exclusiv_bit_or = 0; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } - // bit-xor test part 1 - for (j = 0; j < LOOPCOUNT; j++) { - logics[j] = 0; - } - exclusiv_bit_or = 0; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - exclusiv_bit_or ^= logics[i]; + exclusiv_bit_or ^= logics[i]; + } } + + ASSERT_EQ(0, exclusiv_bit_or); } - if (exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 1\n"); - } +} + +TEST(atomic_bit_xor_2) { + for (int count = 0; count < REPETITIONS; count++) { + memset(logics, 0, LOOPCOUNT); + int exclusiv_bit_or = 0; + + for (int j = 0; j < LOOPCOUNT; j++) { + logics[j] = 0; + } + + exclusiv_bit_or = 0; + logics[LOOPCOUNT / 2] = 1; - // bit-xor test part 2 - exclusiv_bit_or = 0; - logics[LOOPCOUNT / 2] = 1; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < LOOPCOUNT; ++i) { + for (i = 0; i < LOOPCOUNT; ++i) { #pragma omp atomic - exclusiv_bit_or 
^= logics[i]; + exclusiv_bit_or ^= logics[i]; + } } + + ASSERT_EQ(1, exclusiv_bit_or); } - if (!exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 2\n"); - } +} +// +TEST(atomic_left_shift) { + for (int count = 0; count < REPETITIONS; count++) { + int x = 1; - // left shift test - x = 1; #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < 10; ++i) { + for (i = 0; i < 10; ++i) { #pragma omp atomic - x <<= 1; + x <<= 1; + } } + + ASSERT_EQ(1024, x); } - if (x != 1024) { - result++; - printf("Error in <<\n"); - x = 1024; - } +} + +TEST(atomic_right_shift) { + for (int count = 0; count < REPETITIONS; count++) { + int x = 1024; -// right shift test #pragma omp parallel - { - int i; + { + int i; #pragma omp for - for (i = 0; i < 10; ++i) { + for (i = 0; i < 10; ++i) { #pragma omp atomic - x >>= 1; + x >>= 1; + } } - } - if (x != 1) { - result++; - printf("Error in >>\n"); - } - return (result); + ASSERT_EQ(1, x); + } } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - int i; - int num_failed = 0; - - mempool_wait(4 * num_cores); - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - num_failed = test_omp_atomic(); - printf("num_failed: %d\n", num_failed); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - return num_failed; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/barrier_test1/main.c b/software/tests/omp/barrier_test1/main.c index 7c2f1542f..49cdf4934 100644 --- a/software/tests/omp/barrier_test1/main.c +++ b/software/tests/omp/barrier_test1/main.c @@ -6,66 +6,45 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" + +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -#define REPETITIONS 10 /* Number of times to run each test */ #define SLEEPTIME 1000 -uint32_t test_omp_barrier() { +TEST(test_omp_barrier) { uint32_t result1; uint32_t result2; result1 = 0; result2 = 0; + for (int i = 0; i < REPETITIONS; i++) { #pragma omp parallel - { - uint32_t rank; - rank = omp_get_thread_num(); - if (rank == 1) { - printf("waiting...\n"); - mempool_wait(((double)SLEEPTIME) / - REPETITIONS); // give 1 sec to whole test - printf("waited.\n"); - result2 = 3; - } + { + uint32_t rank; + rank = omp_get_thread_num(); + if (rank == 1) { + mempool_wait(((uint32_t)(double)SLEEPTIME / + REPETITIONS)); // give 1 sec to whole test + result2 = 3; + } #pragma omp barrier - if (rank == 2) { - printf("result2: %d\n", result2); - result1 = result2; - printf("result1: %d\n", result1); + if (rank == 2) { + result1 = result2; + } } + ASSERT_EQ(result1, 3); } - return (result1 == 3); } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i; - uint32_t num_failed = 0; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - if (!test_omp_barrier()) { - num_failed++; - } - printf("test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - printf("num_failed: %d\n", num_failed); - mempool_wait(4 * num_cores); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git 
a/software/tests/omp/barrier_test2/main.c b/software/tests/omp/barrier_test2/main.c deleted file mode 100644 index ac1f3b7e7..000000000 --- a/software/tests/omp/barrier_test2/main.c +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright 2022 ETH Zurich and University of Bologna. -// Licensed under the Apache License, Version 2.0, see LICENSE for details. -// SPDX-License-Identifier: Apache-2.0 - -#include -#include - -#include "encoding.h" -#include "libgomp.h" -#include "printf.h" -#include "runtime.h" -#include "synchronization.h" - -#define REPETITIONS 10 /* Number of times to run each test */ -#define SLEEPTIME 1000 - -uint32_t test_omp_barrier(uint32_t num_cores) { - uint32_t result1; - uint32_t result2; - result1 = 0; - result2 = 0; - -#pragma omp parallel - { - uint32_t rank; - rank = omp_get_thread_num(); - if (rank == 1) { - printf("waiting...\n"); - mempool_wait(((double)SLEEPTIME) / - REPETITIONS); // give 1 sec to whole test - printf("waited.\n"); - result2 = 3; - } - mempool_barrier(num_cores); - - if (rank == 2) { - printf("result2: %d\n", result2); - result1 = result2; - printf("result1: %d\n", result1); - } - } - return (result1 == 3); -} - -int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i; - uint32_t num_failed = 0; - - mempool_barrier_init(core_id); - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - if (!test_omp_barrier(num_cores)) { - num_failed++; - } - printf("test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - printf("num_failed: %d\n", num_failed); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; -} diff --git a/software/tests/omp/critical/main.c b/software/tests/omp/critical/main.c index 4ffba2c7a..ef8b38da7 100644 --- a/software/tests/omp/critical/main.c +++ b/software/tests/omp/critical/main.c @@ -6,66 +6,45 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -int test_omp_critical() { - int sum; - int known_sum, mysum; +TEST(test_omp_critical) { int num_cores = (int)mempool_get_core_count(); - sum = 0; -#pragma omp parallel - { - mysum = 0; - int i; + for (int r = 0; r < REPETITIONS; r++) { + int sum1 = 0; + int sum2 = 0; -#pragma omp single +#pragma omp parallel { - for (i = 0; i < 100; i++) - mysum = mysum + i; - printf("Single\n"); - } +#pragma omp critical + { + sum1 += 1; + sum2 += 2; + } #pragma omp critical - { - sum = mysum + sum; - // printf("Sum: %d, thread_id: %d\n",sum,omp_get_thread_num()); + { + sum1 += 1; + sum2 += 2; + } } + + ASSERT_EQ(sum1, 2 * num_cores); + ASSERT_EQ(sum2, 2 * sum1); + ASSERT_EQ(sum2, 4 * num_cores); } - known_sum = 99 * 100 / 2 * num_cores; - return (known_sum == sum); } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t num_cores = mempool_get_core_count(); - uint32_t i; - uint32_t num_failed = 0; - - mempool_wait(2 * num_cores); - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - if (!test_omp_critical()) { - num_failed++; - } - printf("num_failed: %d\n", num_failed); - } - printf("Master Thread end\n\n\n"); - printf("num_failed: %d\n", num_failed); - } else { - while (1) { - 
mempool_wfi(); - run_task(core_id); - } - } - + RUN_ALL_TESTS(); return 0; } diff --git a/software/tests/omp/master/main.c b/software/tests/omp/master/main.c index ac763ac8e..8f15bc31b 100644 --- a/software/tests/omp/master/main.c +++ b/software/tests/omp/master/main.c @@ -6,54 +6,47 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -int test_omp_master() { - uint32_t nthreads; - int32_t executing_thread; - // counts up the number of wrong thread no. for the master thread. (Must be 0) - uint32_t tid = 0; - nthreads = 0; - executing_thread = -1; +TEST(test_omp_master) { + for (int i = 0; i < REPETITIONS; i++) { + + uint32_t nthreads; + int32_t executing_thread; + // counts up the number of wrong thread no. for the master thread. (Must be + // 0) + uint32_t tid = 0; + nthreads = 0; + executing_thread = -1; #pragma omp parallel - { -#pragma omp master { - printf("Master Thread executes\n\n\n"); - tid = omp_get_thread_num(); - nthreads++; - executing_thread = (int32_t)omp_get_thread_num(); - } /* end of master*/ - } /* end of parallel*/ - return ((nthreads == 1) && (executing_thread == 0) && (tid == 0)); +#pragma omp master + { + printf("Master Thread executes\n\n\n"); + tid = omp_get_thread_num(); + nthreads++; + executing_thread = (int32_t)omp_get_thread_num(); + } /* end of master*/ + } /* end of parallel*/ + + ASSERT_EQ(1, nthreads); + ASSERT_EQ(0, executing_thread); + ASSERT_EQ(0, tid); + } } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t i; - uint32_t num_failed = 0; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - if (!test_omp_master()) { - num_failed++; - } - } - printf("Master Thread end\n\n\n"); - printf("num_failed:%d\n", num_failed); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_parallel/main.c b/software/tests/omp/omp_parallel/main.c index d88a138ee..cb5ec10dd 100644 --- a/software/tests/omp/omp_parallel/main.c +++ b/software/tests/omp/omp_parallel/main.c @@ -6,37 +6,50 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -extern volatile uint32_t tcdm_start_address_reg; -extern volatile uint32_t tcdm_end_address_reg; +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -int main() { - uint32_t core_id = mempool_get_core_id(); - - mempool_barrier_init(core_id); - - if (core_id == 0) { +TEST(test_omp_parallel_8) { + for (int i = 0; i < REPETITIONS; i++) { printf("Master Thread: Parallel start\n"); - mempool_wait(1000); + uint32_t nthreads = 0; + #pragma omp parallel num_threads(8) - { printf("%d\n", omp_get_num_threads()); } + { + nthreads = omp_get_num_threads(); + printf("%d\n", omp_get_num_threads()); + } printf("Master Thread: Parallel end\n\n\n"); + ASSERT_EQ(8, nthreads); + } +} + +TEST(test_omp_parallel) { + for (int i = 0; i < REPETITIONS; i++) { + printf("Master Thread: Parallel start\n"); + uint32_t nthreads = 0; printf("Master Thread: Parallel start\n"); - mempool_wait(1000); #pragma omp parallel - { printf("%d\n", omp_get_num_threads()); } - printf("Master Thread: 
Parallel end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); + { + nthreads = omp_get_num_threads(); + printf("%d\n", omp_get_num_threads()); } + printf("Master Thread: Parallel end\n\n\n"); + ASSERT_EQ(NUM_CORES, nthreads); } +} + +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_parallel_for/main.c b/software/tests/omp/omp_parallel_for/main.c index 56d400a72..cfc526b96 100644 --- a/software/tests/omp/omp_parallel_for/main.c +++ b/software/tests/omp/omp_parallel_for/main.c @@ -6,169 +6,166 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define TEST_THREAD +int buf_1[64]; +uint32_t buf_2[NUM_CORES]; -void gcc_omp_parallel_for_schedule_static(void) { - int buf[64], *p; +TEST(omp_parallel_for_schedule_static) { uint32_t i; - int result = 0; - memset(buf, '\0', sizeof(buf)); + int *p; + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for - for (i = 0; i < omp_get_num_threads(); i++) { - if (omp_get_thread_num() != i) { - printf("Error: for loop is not executed in parallel\n"); - result += 1; - } + for (int i = 0; i < (int)omp_get_num_threads(); i++) { + buf_1[i] = i; } + for (int i = 0; i < NUM_CORES; i++) { + ASSERT_EQ(buf_1[i], i); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) private(p) - for (p = &buf[10]; p < &buf[54]; p++) + for (p = &buf_1[10]; p < &buf_1[54]; p++) { *p = 5; - for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 1 at gcc schedule static\n"); - result += 1; - } + } - memset(buf, '\0', sizeof(buf)); + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 5 * (i >= 10 && i < 54)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[3]; p <= &buf[63]; p += 2) + for (p = &buf_1[3]; p <= &buf_1[63]; p += 2) { p[-2] = 6; - for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 2 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 6 * ((i & 1) && i <= 61)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[16]; p < &buf[51]; p = 4 + p) + for (p = &buf_1[16]; p < &buf_1[51]; p = 4 + p) { p[2] = 7; - for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 3 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[16]; p <= &buf[40]; p = p + 4ULL) + for (p = &buf_1[16]; p <= &buf_1[40]; p = p + 4U) { p[2] = -7; - for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 4 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[53]; p > &buf[9]; --p) + for (p = &buf_1[53]; p > &buf_1[9]; --p) { *p = 5; - for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 5 at gcc schedule static\n"); - result += 1; - } - memset(buf, 
'\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 5 * (i >= 10 && i < 54)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[63]; p >= &buf[3]; p -= 2) + for (p = &buf_1[63]; p >= &buf_1[3]; p -= 2) { p[-2] = 6; - for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 6 at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 6 * ((i & 1) && i <= 61)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[48]; p > &buf[15]; p = -4 + p) + for (p = &buf_1[48]; p > &buf_1[15]; p = -4 + p) { p[2] = 7; - for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 7 at at gcc schedule static\n"); - result += 1; - } - memset(buf, '\0', sizeof(buf)); + } + + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + } + + memset(buf_1, '\0', sizeof(buf_1)); #pragma omp parallel for schedule(static, 3) - for (p = &buf[40]; p >= &buf[16]; p = p - 4ULL) + for (p = &buf_1[40]; p >= &buf_1[16]; p = p - 4U) { p[2] = -7; - for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 8 at gcc schedule static\n"); - result += 1; - } + } - if (result == 0) { - printf("All test passed\n"); - } else { - printf("Failed %d tests\n", result); + for (i = 0; i < 64; i++) { + ASSERT_EQ(buf_1[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); } } -void gcc_omp_parallel_for_schedule_static_thread(void) { - printf("Testing: schedule default chunk size\n"); +TEST(parallel_for_schedule_static_thread) { + + memset(buf_2, '\0', sizeof(buf_2)); #pragma omp parallel for num_threads(4) schedule(static) for (int i = 0; i < 10; i++) { - printf("%d\n", omp_get_thread_num()); + buf_2[i] = omp_get_thread_num(); + } + + uint32_t chunkSize = (10 + 4 - 1) / 4; // ceil(10/4) + for (uint32_t i = 0; i < 10; i++) { + ASSERT_EQ(buf_2[i], (i / chunkSize) % 4); } - printf("Testing: schedule chunk size 2\n"); + memset(buf_2, '\0', sizeof(buf_2)); #pragma omp parallel for num_threads(4) schedule(static, 2) for (int i = 0; i < 10; i++) { - printf("%d\n", omp_get_thread_num()); + buf_2[i] = omp_get_thread_num(); } - printf("Testing: private\n"); - int A = 9; + for (uint32_t i = 0; i < 10; i++) { + ASSERT_EQ(buf_2[i], (i / 2) % 4); + } + + uint32_t A = 9; #pragma omp parallel for num_threads(4) schedule(static) private(A) - for (int i = 0; i < 4; i++) { + for (uint32_t i = 0; i < 4; i++) { A = i; - printf("%d\n", A); } - printf("A %d\n", A); - printf("Testing: First private\n"); + ASSERT_EQ(A, 9); + A = 9; + memset(buf_2, '\0', sizeof(buf_2)); #pragma omp parallel for num_threads(4) schedule(static) firstprivate(A) - for (int i = 0; i < 4; i++) { - printf("%d\n", A); + for (uint32_t i = 0; i < 4; i++) { + buf_2[i] = A; A = i; } - printf("A %d\n", A); - printf("Testing: Last private\n"); + for (int i = 0; i < 4; i++) { + ASSERT_EQ(buf_2[i], 9); + } + ASSERT_EQ(A, 9); + A = 9; #pragma omp parallel for num_threads(4) schedule(static) lastprivate(A) - for (int i = 0; i < 4; i++) { + for (uint32_t i = 0; i < 4; i++) { A = i; } - printf("A %d\n", A); + + ASSERT_EQ(A, 3); } int main() { - uint32_t core_id = mempool_get_core_id(); - - mempool_barrier_init(core_id); - - if (core_id == 0) { - -/////////////////////////////////////////////////////////// -////////////////////// test 
/////////////////////////// -/////////////////////////////////////////////////////////// -#ifdef TEST_THREAD - gcc_omp_parallel_for_schedule_static_thread(); -#else - gcc_omp_parallel_for_schedule_static(); -#endif - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_parallel_for_dynamic/main.c b/software/tests/omp/omp_parallel_for_dynamic/main.c index 8011cdcb2..d56a743d1 100644 --- a/software/tests/omp/omp_parallel_for_dynamic/main.c +++ b/software/tests/omp/omp_parallel_for_dynamic/main.c @@ -6,159 +6,96 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -void work(int num) { - int i; - volatile int cnt = 0; +int buf[64]; - for (i = 0; i < num; i++) { - cnt += i; - } -} - -void gcc_omp_parallel_for_schedule_dynamic(void) { - int buf[64]; +TEST(gcc_omp_parallel_for_schedule_dynamic) { int i, j; - int result = 0; + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 10; j < 54; j++) buf[j] = 5; for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 1 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 5 * (i >= 10 && i < 54)); + + DEBUG_PRINT("First\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 3; j <= 63; j += 2) buf[j - 2] = 6; for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 2 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 6 * ((i & 1) && i <= 61)); + + DEBUG_PRINT("Second\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 16; j < 51; j += 4) buf[j + 2] = 7; for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 3 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + + DEBUG_PRINT("Third\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 16; j <= 40; j += 4) buf[j + 2] = -7; for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 4 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); + + DEBUG_PRINT("Fourth\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) - for (j = 53; j > 9; --j) + for (j = 53; j > 9; --j) { + DEBUG_PRINT("%d\n", j); buf[j] = 5; + } + for (i = 0; i < 64; i++) - if (buf[i] != 5 * (i >= 10 && i < 54)) { - printf("error 5 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 5 * (i >= 10 && i < 54)); + + DEBUG_PRINT("Fifth\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 63; j >= 3; j -= 2) buf[j - 2] = 6; for (i = 0; i < 64; i++) - if (buf[i] != 6 * ((i & 1) && i <= 61)) { - printf("error 6 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 6 * ((i & 1) && i <= 61)); + + DEBUG_PRINT("Sixth\n"); + memset(buf, '\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 48; j > 15; j -= 4) buf[j + 2] = 7; for (i = 0; i < 64; i++) - if (buf[i] != 7 * ((i & 3) == 2 && i >= 18 && i < 53)) { - printf("error 7 at gcc schedule dynamic\n"); - result += 1; - } + ASSERT_EQ(buf[i], 7 * ((i & 3) == 2 && i >= 18 && i < 53)); + + DEBUG_PRINT("Seventh\n"); + memset(buf, 
'\0', sizeof(buf)); #pragma omp parallel for schedule(dynamic, 3) for (j = 40; j >= 16; j -= 4) buf[j + 2] = -7; for (i = 0; i < 64; i++) - if (buf[i] != -7 * ((i & 3) == 2 && i >= 18 && i <= 42)) { - printf("error 8 at gcc schedule dynamic\n"); - result += 1; - } - if (result == 0) { - printf("All tests passed\n"); - } else { - printf("Failed %d tests", result); - } -} + ASSERT_EQ(buf[i], -7 * ((i & 3) == 2 && i >= 18 && i <= 42)); -// void gcc_omp_parallel_for_schedule_dynamic_thread_test(void){ -// printf("Testing: schedule chunk size 1\n"); -// #pragma omp parallel for num_threads(4) schedule(dynamic) -// for (int k = 0; k < 10; k++){ -// printf("%d\n", omp_get_thread_num()); -// } - -// printf("Testing: schedule chunk size 2\n"); -// #pragma omp parallel for num_threads(4) schedule(dynamic,3) -// for (int k = 0; k < 10; k++){ -// printf("%d\n", omp_get_thread_num()); -// } -// } + DEBUG_PRINT("Eighth\n"); +} int main() { - uint32_t core_id = mempool_get_core_id(); - - mempool_barrier_init(core_id); - - if (core_id == 0) { - - mempool_wait(1000); - - /////////////////////////////////////////////////////////// - ////////////////////// test /////////////////////////// - /////////////////////////////////////////////////////////// - - gcc_omp_parallel_for_schedule_dynamic(); - - /////////////////////////////////////////////////////////// - ///////////////////// Benchmark /////////////////////// - /////////////////////////////////////////////////////////// - // uint32_t time; - // mempool_start_benchmark(); - // #pragma omp parallel for num_threads(2) schedule(dynamic,2) - // for(int i = 0; i < 6400; i++){ - // work(10); - // } - // mempool_stop_benchmark(); - // time = mempool_get_timer(); - // printf("Parallel Time %d\n",time); - - // mempool_start_benchmark(); - // for(int i = 0; i < 6400; i++){ - // work(10); - // } - // mempool_stop_benchmark(); - // time = mempool_get_timer(); - // printf("Sequential Time %d\n",time); - - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); return 0; } diff --git a/software/tests/omp/omp_test/main.c b/software/tests/omp/omp_test/main.c index 6cde22a88..c5f65bc89 100644 --- a/software/tests/omp/omp_test/main.c +++ b/software/tests/omp/omp_test/main.c @@ -6,120 +6,109 @@ #include #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 1 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -void work1() { - int sum = 0; - for (int i = 0; i < 100; i++) { - sum++; - } -} - -int test_omp_parallel_for() { - int sum = 0; +TEST(test_omp_parallel_for) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; + for (int i = 0; i <= 100; i++) { + sum += i; + } } + + ASSERT_EQ(sum, 5050); } - return sum; } -int test_omp_parallel_for_dynamic() { - int sum = 0; +TEST(test_omp_parallel_for_dynamic) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for schedule(dynamic, 16) reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; + for (int i = 0; i <= 100; i++) { + sum += i; + } } + + ASSERT_EQ(sum, 5050); } - return sum; } -int test_omp_parallel_for_dynamic_static() { - int sum = 0; 
+TEST(test_omp_parallel_for_dynamic_static) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for schedule(dynamic, 16) reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; - } + for (int i = 0; i <= 100; i++) { + sum += i; + } + +#pragma omp single + sum = 0; - sum = 0; #pragma omp for schedule(static) reduction(+ : sum) - for (int i = -100; i <= 0; i++) { - sum += i; + for (int i = -100; i <= 0; i++) { + sum += i; + } } + + printf("sum: %d\n", sum); + ASSERT_EQ(sum, -5050); } - return sum; } -int test_omp_many() { - int sum = 0; +TEST(test_omp_many) { + for (int i = 0; i < REPETITIONS; i++) { + int sum = 0; + int master_sum, single_sum = 0; #pragma omp parallel shared(sum) - { + { #pragma omp for schedule(dynamic, 16) reduction(+ : sum) - for (int i = 0; i <= 100; i++) { - sum += i; - } + for (int i = 0; i <= 100; i++) { + sum += i; + } #pragma omp barrier #pragma omp master - { - printf("first sum: %d\n", sum); - sum = 0; - } + { master_sum = sum; } #pragma omp barrier #pragma omp for schedule(static) reduction(+ : sum) - for (int i = -10; i <= 0; i++) { - sum += i; - } + for (int i = -10; i <= 0; i++) { + sum += i; + } #pragma omp barrier #pragma omp single - { printf("second sum: %d\n", sum); } + { single_sum = sum; } + } + + ASSERT_EQ(master_sum, 5050); + ASSERT_EQ(single_sum, 4995); } - return sum; } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t i; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("Test: %d\n", i); - printf("For loop-sum is: %d\n", test_omp_parallel_for()); - printf("For loop dynamic-sum is: %d\n", test_omp_parallel_for_dynamic()); - printf("For loop dynamic-static-sum is: %d\n", - test_omp_parallel_for_dynamic_static()); - printf("Test many omp-sum is: %d\n", test_omp_many()); - printf("Test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/reduction/main.c b/software/tests/omp/reduction/main.c index 15afb6a49..196e1bdaf 100644 --- a/software/tests/omp/reduction/main.c +++ b/software/tests/omp/reduction/main.c @@ -7,228 +7,238 @@ #include "baremetal/mempool_conv2d_i32p.h" #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif #define MAX_FACTOR 10 #define KNOWN_PRODUCT 3628800 /* 10! 
*/ -#define LOOPCOUNT 100 /* Number of iterations to slit amongst threads */ - -int test_omp_parallel_for_reduction() { - int sum; - int known_sum; - int diff; - int product; - int known_product; - int logic_and; - int logic_or; - int bit_and; - int bit_or; - int exclusiv_bit_or; - int logics[LOOPCOUNT]; - int i; - int result; - - sum = 0; - result = 0; - product = 1; - logic_and = 1; - logic_or = 0; - bit_and = 1; - bit_or = 0; - exclusiv_bit_or = 0; - - /* Tests for integers */ - known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; -#pragma omp parallel for schedule(static, 1) private(i) reduction(+ : sum) - for (i = 1; i <= LOOPCOUNT; i++) { - sum = sum + i; - } - if (known_sum != sum) { - result++; - printf("Error in sum with integers: Result was %d" - " instead of %d\n", - sum, known_sum); - } +#define LOOPCOUNT 100 /* Number of iterations to split amongst threads */ - diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; -#pragma omp parallel for schedule(static, 1) private(i) reduction(- : diff) - for (i = 1; i <= LOOPCOUNT; ++i) { - diff = diff - i; - } - if (diff != 0) { - result++; - printf("Error in difference with integers: Result was %d" - " instead of 0.\n", - diff); - } +uint32_t logics[LOOPCOUNT]; -/* Tests for integers */ -#pragma omp parallel for schedule(static, 1) private(i) reduction(* : product) - for (i = 1; i <= MAX_FACTOR; i++) { - product *= i; - } - known_product = KNOWN_PRODUCT; - if (known_product != product) { - result++; - printf("Error in Product with integers: Result was %d" - " instead of %d\n\n", - product, known_product); - } +TEST(test_omp_parallel_for_sum) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int sum = 0; + int known_sum = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; - /* Tests for logic AND */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 1; - } +#pragma omp parallel for schedule(static, 1) reduction(+ : sum) + for (int i = 1; i <= LOOPCOUNT; i++) { + sum += i; + } -#pragma omp parallel for private(i) schedule(static,1) reduction(&&:logic_and) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_and = (logic_and && logics[i]); - } - if (!logic_and) { - result++; - printf("Error in logic AND part 1.\n"); + ASSERT_EQ(sum, known_sum); } +} - logic_and = 1; - logics[LOOPCOUNT / 2] = 0; +TEST(test_omp_parallel_for_diff) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int diff = (LOOPCOUNT * (LOOPCOUNT + 1)) / 2; -#pragma omp parallel for schedule(static,1) private(i) reduction(&&:logic_and) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_and = logic_and && logics[i]; - } - if (logic_and) { - result++; - printf("Error in logic AND part 2.\n"); - } +#pragma omp parallel for schedule(static, 1) reduction(- : diff) + for (int i = 1; i <= LOOPCOUNT; ++i) { + diff -= i; + } - /* Tests for logic OR */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 0; + ASSERT_EQ(diff, 0); } +} -#pragma omp parallel for schedule(static, 1) private(i) reduction(|| : logic_or) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_or = logic_or || logics[i]; - } - if (logic_or) { - result++; - printf("Error in logic OR part 1.\n"); - } - logic_or = 0; - logics[LOOPCOUNT / 2] = 1; +TEST(test_omp_parallel_for_product) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int product = 1; + int known_product = KNOWN_PRODUCT; -#pragma omp parallel for schedule(static, 1) private(i) reduction(|| : logic_or) - for (i = 0; i < LOOPCOUNT; ++i) { - logic_or = logic_or || logics[i]; - } - if (!logic_or) { - result++; - printf("Error in logic OR part 2.\n"); - } +#pragma omp parallel for schedule(static, 1) reduction(* : 
product) + for (int i = 1; i <= MAX_FACTOR; i++) { + product *= i; + } - /* Tests for bitwise AND */ - for (i = 0; i < LOOPCOUNT; ++i) { - logics[i] = 1; + ASSERT_EQ(product, known_product); } +} -#pragma omp parallel for schedule(static, 1) private(i) reduction(& : bit_and) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_and = (bit_and & logics[i]); - } - if (!bit_and) { - result++; - printf("Error in BIT AND part 1.\n"); - } +TEST(test_omp_parallel_for_logic_and_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_and = 1; + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 1; + } - bit_and = 1; - logics[LOOPCOUNT / 2] = 0; +#pragma omp parallel for schedule(static, 1) reduction(&& : logic_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_and = logic_and && logics[i]; + } -#pragma omp parallel for schedule(static, 1) private(i) reduction(& : bit_and) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_and = bit_and & logics[i]; - } - if (bit_and) { - result++; - printf("Error in BIT AND part 2.\n"); + ASSERT_EQ(logic_and, 1); } +} - /* Tests for bitwise OR */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 0; - } +TEST(test_omp_parallel_for_logic_and_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_and = 1; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 1; + } + logics[LOOPCOUNT / 2] = 0; -#pragma omp parallel for schedule(static, 1) private(i) reduction(| : bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_or = bit_or | logics[i]; - } - if (bit_or) { - result++; - printf("Error in BIT OR part 1\n"); +#pragma omp parallel for schedule(static, 1) reduction(&& : logic_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_and = logic_and && logics[i]; + } + + ASSERT_EQ(logic_and, 0); } - bit_or = 0; - logics[LOOPCOUNT / 2] = 1; +} -#pragma omp parallel for schedule(static, 1) private(i) reduction(| : bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - bit_or = bit_or | logics[i]; +TEST(test_omp_parallel_for_logic_or_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + +#pragma omp parallel for schedule(static, 1) reduction(|| : logic_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_or = logic_or || logics[i]; + } + + ASSERT_EQ(logic_or, 0); } - if (!bit_or) { - result++; - printf("Error in BIT OR part 2\n"); +} + +TEST(test_omp_parallel_for_logic_or_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int logic_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + logics[LOOPCOUNT / 2] = 1; + +#pragma omp parallel for schedule(static, 1) reduction(|| : logic_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + logic_or = logic_or || logics[i]; + } + + ASSERT_EQ(logic_or, 1); } +} + +TEST(test_omp_parallel_for_bit_and_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_and = 1; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; ++i) { + logics[i] = 1; + } + +#pragma omp parallel for schedule(static, 1) reduction(& : bit_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_and = bit_and & logics[i]; + } - /* Tests for bitwise XOR */ - for (i = 0; i < LOOPCOUNT; i++) { - logics[i] = 0; + ASSERT_EQ(bit_and, 1); } +} + +TEST(test_omp_parallel_for_bit_and_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_and = 1; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; ++i) { + logics[i] = 1; + } + 
logics[LOOPCOUNT / 2] = 0; + +#pragma omp parallel for schedule(static, 1) reduction(& : bit_and) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_and = bit_and & logics[i]; + } -#pragma omp parallel for schedule(static,1) private(i) reduction(^:exclusiv_bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + ASSERT_EQ(bit_and, 0); } - if (exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 1\n"); +} + +TEST(test_omp_parallel_for_bit_or_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + +#pragma omp parallel for schedule(static, 1) reduction(| : bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_or = bit_or | logics[i]; + } + + ASSERT_EQ(bit_or, 0); } +} - exclusiv_bit_or = 0; - logics[LOOPCOUNT / 2] = 1; +TEST(test_omp_parallel_for_bit_or_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + logics[LOOPCOUNT / 2] = 1; -#pragma omp parallel for schedule(static,1) private(i) reduction(^:exclusiv_bit_or) - for (i = 0; i < LOOPCOUNT; ++i) { - exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; +#pragma omp parallel for schedule(static, 1) reduction(| : bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + bit_or = bit_or | logics[i]; + } + + ASSERT_EQ(bit_or, 1); } - if (!exclusiv_bit_or) { - result++; - printf("Error in EXCLUSIV BIT OR part 2\n"); +} + +TEST(test_omp_parallel_for_exclusiv_bit_or_part1) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t exclusiv_bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; + } + +#pragma omp parallel for schedule(static, 1) reduction(^ : exclusiv_bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + + ASSERT_EQ(exclusiv_bit_or, 0); } - return (result); } -int main() { - uint32_t core_id = mempool_get_core_id(); - int i; - int num_failed = 0; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("test: %d\n", i); - num_failed = test_omp_parallel_for_reduction(); - printf("num_failed: %d\n", num_failed); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); +TEST(test_omp_parallel_for_exclusiv_bit_or_part2) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t exclusiv_bit_or = 0; + memset(logics, 0, LOOPCOUNT); + for (int i = 0; i < LOOPCOUNT; i++) { + logics[i] = 0; } + logics[LOOPCOUNT / 2] = 1; + +#pragma omp parallel for schedule(static, 1) reduction(^ : exclusiv_bit_or) + for (int i = 0; i < LOOPCOUNT; ++i) { + exclusiv_bit_or = exclusiv_bit_or ^ logics[i]; + } + + ASSERT_EQ(exclusiv_bit_or, 1); } +} - return num_failed; +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/sections/main.c b/software/tests/omp/sections/main.c new file mode 100644 index 000000000..bf2285402 --- /dev/null +++ b/software/tests/omp/sections/main.c @@ -0,0 +1,39 @@ +// Copyright 2022 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. 
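One detail worth noting in the reduction tests above (and in the atomic tests earlier in this patch): `logics` is an array of 4-byte elements, so `memset(logics, 0, LOOPCOUNT)` clears only LOOPCOUNT bytes, i.e. the first quarter of the array. The tests still behave correctly because every element is re-initialized by the explicit for loops that follow, but if the memset is meant to clear the whole array, `sizeof` expresses that directly. A minimal sketch of the intended form:

#include <stdint.h>
#include <string.h>

#define LOOPCOUNT 100
static uint32_t logics[LOOPCOUNT];

static void clear_logics(void) {
  // memset(logics, 0, LOOPCOUNT) would zero only 100 bytes (25 elements);
  // sizeof(logics) covers all LOOPCOUNT * sizeof(uint32_t) bytes.
  memset(logics, 0, sizeof(logics));
}
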
+// SPDX-License-Identifier: Apache-2.0 + +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "testing.h" + +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif + +TEST(test_omp_parallel_sections) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t result = 0; + uint32_t section_1 = 0; + uint32_t section_2 = 0; + +#pragma omp parallel sections + { + +#pragma omp section + { section_1 = omp_get_thread_num(); } + +#pragma omp section + { section_2 = omp_get_thread_num(); } + } + + ASSERT_NEQ(section_1, section_2); + } +} + +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); +} diff --git a/software/tests/omp/single/main.c b/software/tests/omp/single/main.c index 9ab31b845..e97461d87 100644 --- a/software/tests/omp/single/main.c +++ b/software/tests/omp/single/main.c @@ -2,120 +2,75 @@ // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 -#include -#include - #include "encoding.h" -#include "libgomp.h" +#include "omp.h" #include "printf.h" #include "runtime.h" #include "synchronization.h" +#include "testing.h" -#define REPETITIONS 10 /* Number of times to run each test */ - -void work1() { - int sum = 0; - for (int i = 0; i < 100; i++) { - sum++; - } -} +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif -uint32_t test_omp_parallel_single() { - uint32_t result; - result = 0; - uint32_t core_id; +TEST(test_omp_parallel_single) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t result = 0; #pragma omp parallel shared(result) - { - core_id = mempool_get_core_id(); - - work1(); - if (core_id == 0) { - work1(); - } + { #pragma omp single - { result = 100; } - - work1(); - if (core_id == 0) { - work1(); + { result += 100; } } -#pragma omp single - { - if (result == 100) - result = core_id; - } + ASSERT_EQ(result, 100); } - return result; } -uint32_t test_omp_for_single() { - uint32_t sum = 0; +TEST(test_omp_for_single) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t sum = 0; #pragma omp parallel shared(sum) - { -#pragma omp single { - for (uint32_t i = 0; i <= 100; i++) { - sum += i; +#pragma omp single + { + for (uint32_t i = 0; i <= 100; i++) { + sum += i; + } } - } #pragma omp single - { - if (sum == 100 * 101 / 2) - sum = 1; + { + if (sum == 100 * 101 / 2) + sum = 1; + } } + + ASSERT_EQ(sum, 1); } - return sum; } -uint32_t test_omp_single_copyprivate() { - uint32_t result; - result = 0; - -#pragma omp parallel firstprivate(result) - { - uint32_t core_id = mempool_get_core_id(); +TEST(test_omp_single_copyprivate) { + for (int rep = 0; rep < REPETITIONS; rep++) { + uint32_t result = 0; + uint32_t outerResult = 0; - work1(); - if (core_id == 0) { - work1(); - } +#pragma omp parallel private(result) + { #pragma omp single copyprivate(result) - { result = 100; } + { result = 100; } - work1(); - if (core_id == 5) { - result *= 2; - printf("Core 5 result: %d\n", result); +#pragma omp single + { outerResult = result; } } + + ASSERT_EQ(outerResult, 100); } - return result; } int main() { - uint32_t core_id = mempool_get_core_id(); - uint32_t i; - - if (core_id == 0) { - printf("Master Thread start\n"); - for (i = 0; i < REPETITIONS; i++) { - printf("Test: %d\n", i); - printf("Single core_id: %d\n", test_omp_parallel_single()); - printf("For loop-sum is t/f: %d\n", test_omp_for_single()); - printf("Copyprivate: %d\n", test_omp_single_copyprivate()); - 
printf("Test finished: %d\n", i); - } - printf("Master Thread end\n\n\n"); - } else { - while (1) { - mempool_wfi(); - run_task(core_id); - } - } - - return 0; + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); } diff --git a/software/tests/omp/teams/main.c b/software/tests/omp/teams/main.c new file mode 100644 index 000000000..6d82c557d --- /dev/null +++ b/software/tests/omp/teams/main.c @@ -0,0 +1,91 @@ +// Copyright 2024 ETH Zurich and University of Bologna. +// Licensed under the Apache License, Version 2.0, see LICENSE for details. +// SPDX-License-Identifier: Apache-2.0 + +#include "encoding.h" +#include "omp.h" +#include "printf.h" +#include "runtime.h" +#include "synchronization.h" +#include "testing.h" + +#ifndef REPETITIONS +#define REPETITIONS 100 /* Number of times to run each test */ +#endif + +TEST(test_teams_distribute) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int num_teams = 0; + int team_num[12]; + +#pragma omp teams distribute num_teams(4) + for (int i = 0; i < 12; i++) { + team_num[i] = omp_get_team_num(); + + if (omp_get_team_num() == 0) { + num_teams = omp_get_num_teams(); + } + } + + for (int i = 0; i < 12; i++) { + ASSERT_EQ(team_num[i], i / 3); + } + + ASSERT_EQ(num_teams, 4); + } +} + +TEST(test_teams_reduce) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int sum[4]; + +#pragma omp teams num_teams(4) + { + int local_sum = 0; + +#pragma omp parallel for reduction(+ : local_sum) + for (int i = 0; i < 32; i++) { + local_sum += 1; + } + + sum[omp_get_team_num()] = local_sum; + } + + for (int i = 0; i < 4; i++) { + ASSERT_EQ(sum[i], 32); + } + } +} + +TEST(test_teams_barrier) { + for (int rep = 0; rep < REPETITIONS; rep++) { + int results[2]; +#pragma omp teams num_teams(2) + { + uint32_t team_num = omp_get_team_num(); + int result = 0; + +#pragma omp parallel + { + uint32_t rank = omp_get_thread_num(); + if (rank == 1) { + mempool_wait(1000); // give 1 sec to whole test + result = 3; + } +#pragma omp barrier + + if (rank == 2) { + results[team_num] = result; + } + } + } + + ASSERT_EQ(results[0], 3); + ASSERT_EQ(results[1], 3); + } +} + +int main() { + RUN_ALL_TESTS(); + PRINT_TEST_RESULTS(); +}