From 6193c093c1ad0986a59a1d2750775fac4b91f617 Mon Sep 17 00:00:00 2001 From: David Schneller Date: Sun, 27 Oct 2024 03:03:07 +0100 Subject: [PATCH] Bugfixes --- pspamm/codegen/architectures/arm_sve/generator.py | 9 ++++++--- pspamm/matmul.py | 1 - 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index 305897b..9ba194d 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -273,12 +273,15 @@ def move_register_block(self, addr, comment = cursor.look(cursor_ptr, block_offset, cell_offset) addr.disp += self.precision.size() * load_offset + offset = addr.disp - prev_disp + # count how many elements we have processed between last step and this step - cont_counter = ((addr.disp - prev_disp) // mul_vl) + cont_counter = (offset // mul_vl) larger_max_offset = cont_counter > max_mem_ins_mult + non_dividing_offset = offset % mul_vl != 0 - if larger_max_offset or (prev_overhead and addr.disp > 0): - offset_comment = "disp > {}".format(max_offset) if larger_max_offset else "previous mem. instr. used p0" + if larger_max_offset or (prev_overhead and addr.disp > 0) or non_dividing_offset: + offset_comment = f"disp > {max_offset}" if larger_max_offset else ("disp % VL != 0" if non_dividing_offset else "previous mem. instr. used p0") asm.add(add(addr.disp, additional_regs[0], offset_comment, addr.base)) prev_disp = addr.disp addr.base = additional_regs[0] diff --git a/pspamm/matmul.py b/pspamm/matmul.py index f4252f2..000c14a 100644 --- a/pspamm/matmul.py +++ b/pspamm/matmul.py @@ -325,7 +325,6 @@ def kernelK(asm, Bki, A_ptr, B_ptr): store_block.add(fma(regs[ir, x + ic], self.alpha_reg[1], A_regs_cut[ir, ic], "C = C + alpha * AB", None, pred=pred_m)) store_block.add(self.generator.move_register_block(self.C, C_ptr, Coords(), A_regs_cut, self.v_size, self.additional_regs, None, True, self.prefetching, self.ldc * x)) asm.add(store_block) - else: asm.add(self.generator.move_register_block(self.C, C_ptr, Coords(), regs, self.v_size, self.additional_regs, None, True, self.prefetching))