diff --git a/pspamm/codegen/architectures/arm_sve/generator.py b/pspamm/codegen/architectures/arm_sve/generator.py index d1bca7f..d5007cf 100644 --- a/pspamm/codegen/architectures/arm_sve/generator.py +++ b/pspamm/codegen/architectures/arm_sve/generator.py @@ -379,22 +379,14 @@ def make_microkernel(self, divider = 1 elem128 = 1 vk = bk - preg = 'p7/z' - preg_last = 'p7/z' else: max_offs = 127 divider = 16 elem128 = 16 // self.get_precision().size() vk = -(bk // -elem128) - #if isinstance(B, DenseCursor): - # preg = 'p1/z' if self.has_bk_overhead else 'p7/z' - # preg_last = 'p2/z' if self.has_k_overhead else preg - #else: - # preg = 'p7/z' - # preg_last = 'p7/z' - preg = 'p7/z' - preg_last = 'p7/z' + preg = self.pred_n_trues(elem128, elem128, 'z') + preg_last = preg if bk % elem128 == 0 else self.pred_n_trues(bk % elem128, elem128, 'z') for Vmi in range(Vm): # set to all v_size predicates to true, we want to replicate a B element into a whole vector for bni in range(bn): # inside this n-block @@ -404,7 +396,7 @@ def make_microkernel(self, if B.has_nonzero_cell(B_ptr, to_B_block, to_cell): B_cell_addr, B_comment = B.look(B_ptr, to_B_block, to_cell) if B_regs[bki_reg, bni] not in bs: - p_zeroing = Register_ARM(AsmType.p64x8, preg_last) if bki_reg + 1 == vk else Register_ARM(AsmType.p64x8, preg) + p_zeroing = preg_last if bki_reg + 1 == vk else preg # max_offs is the maximum allowed immediate offset when using ld1rd/ld1rw to broadcast a scalar value if B_cell_addr.disp > max_offs or B_cell_addr.disp % divider != 0: