diff --git a/examples/BabelStream/functor/babel_stream.py b/examples/BabelStream/functor/babel_stream.py index 8041e4ee..4109d709 100644 --- a/examples/BabelStream/functor/babel_stream.py +++ b/examples/BabelStream/functor/babel_stream.py @@ -43,7 +43,7 @@ def dot(self, index: int, acc: pk.Acc[float]): acc += self.a[index] * self.b[index] -if __name__ == "__main__": +def run() -> None: array_size: int = 2**25 # 100000 startA: float = 0.1 startB: float = 0.2 @@ -92,7 +92,7 @@ def dot(self, index: int, acc: pk.Acc[float]): timings[4].append(timer.seconds()) timer.reset() - goldA = startA + goldA = startA goldB = startB goldC = startC @@ -108,9 +108,9 @@ def dot(self, index: int, acc: pk.Acc[float]): errB /= len(w.b) errC = reduce(lambda s, val: s + abs(val - goldC), w.c) errC /= len(w.c) - - # epsi = sys.float_info.epsilon * 100 - epsi = 1e-8 + + # epsi = sys.float_info.epsilon * 100 + epsi = 1e-8 if (errA > epsi): print(f"Validation failed on a[]. Average error {errA}") if (errB > epsi): @@ -143,3 +143,6 @@ def dot(self, index: int, acc: pk.Acc[float]): # bandwidth = 1.0e-9 * (total_bytes / runtime) # print(f"Runtime (seconds): {runtime}") # print(f"Bandwidth (GB/s): {bandwidth}") + +if __name__ == "__main__": + run() diff --git a/examples/BabelStream/standalone/babel_stream.py b/examples/BabelStream/standalone/babel_stream.py index 13c96e3a..9ba6d5a1 100644 --- a/examples/BabelStream/standalone/babel_stream.py +++ b/examples/BabelStream/standalone/babel_stream.py @@ -31,7 +31,7 @@ def dot(index, acc, a_view, b_view): acc += a_view[index] * b_view[index] -if __name__ == "__main__": +def run() -> None: array_size: int = 2**25 # 100000 startA: float = 0.1 startB: float = 0.2 @@ -85,7 +85,7 @@ def dot(index, acc, a_view, b_view): timings[4].append(timer.seconds()) timer.reset() - goldA = startA + goldA = startA goldB = startB goldC = startC @@ -101,9 +101,9 @@ def dot(index, acc, a_view, b_view): errB /= len(b) errC = reduce(lambda s, val: s + abs(val - goldC), c) errC /= len(c) - - # epsi = sys.float_info.epsilon * 100 - epsi = 1e-8 + + # epsi = sys.float_info.epsilon * 100 + epsi = 1e-8 if (errA > epsi): print(f"Validation failed on a[]. Average error {errA}") if (errB > epsi): @@ -136,3 +136,6 @@ def dot(index, acc, a_view, b_view): # bandwidth = 1.0e-9 * (total_bytes / runtime) # print(f"Runtime (seconds): {runtime}") # print(f"Bandwidth (GB/s): {bandwidth}") + +if __name__ == "__main__": + run() diff --git a/examples/BabelStream/workload/babel_stream.py b/examples/BabelStream/workload/babel_stream.py index 97ce320c..30631468 100644 --- a/examples/BabelStream/workload/babel_stream.py +++ b/examples/BabelStream/workload/babel_stream.py @@ -6,7 +6,7 @@ @pk.workload class KokkosStream: - def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float, + def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float, scalar: float, num_times: int): self.array_size: int = ARRAY_SIZE @@ -18,7 +18,7 @@ def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float, self.initB: pk.double = initB self.initC: pk.double = initC self.scalar: pk.double = scalar - self.num_times: int = num_times + self.num_times: int = num_times self.sum: pk.double = 0 self.runtime: float = 0 @@ -48,38 +48,38 @@ def run(self): self.runtime = timer.seconds() - # @pk.callback - # def results(self): - # goldA = self.initA - # goldB = self.initB - # goldC = self.initC - - # for i in range(self.num_times): - # goldC = goldA - # goldB = self.scalar * goldC - # goldC = goldA + goldB - # goldA = goldB + self.scalar * goldC - - # errA = reduce(lambda s, val: s + abs(val - goldA), self.a) - # errA /= len(self.a) - # errB = reduce(lambda s, val: s + abs(val - goldB), self.b) - # errB /= len(self.b) - # errC = reduce(lambda s, val: s + abs(val - goldC), self.c) - # errC /= len(self.c) - - # # epsi = sys.float_info.epsilon * 100 - # epsi = 1e-8 - # if (errA > epsi): - # print(f"Validation failed on a[]. Average error {errA}") - # if (errB > epsi): - # print(f"Validation failed on b[]. Average error {errB}") - # if (errC > epsi): - # print(f"Validation failed on c[]. Average error {errC}") - - # goldSum = goldA * goldB * self.array_size - # errSum = self.sum - goldSum - # if (abs(errSum) > 1e-8): - # print(f"Validation failed on sum. Error {errSum}") + @pk.callback + def results(self): + goldA = self.initA + goldB = self.initB + goldC = self.initC + + for i in range(self.num_times): + goldC = goldA + goldB = self.scalar * goldC + goldC = goldA + goldB + goldA = goldB + self.scalar * goldC + + errA = reduce(lambda s, val: s + abs(val - goldA), self.a) + errA /= len(self.a) + errB = reduce(lambda s, val: s + abs(val - goldB), self.b) + errB /= len(self.b) + errC = reduce(lambda s, val: s + abs(val - goldC), self.c) + errC /= len(self.c) + + # epsi = sys.float_info.epsilon * 100 + epsi = 1e-8 + if (errA > epsi): + print(f"Validation failed on a[]. Average error {errA}") + if (errB > epsi): + print(f"Validation failed on b[]. Average error {errB}") + if (errC > epsi): + print(f"Validation failed on c[]. Average error {errC}") + + goldSum = goldA * goldB * self.array_size + errSum = self.sum - goldSum + if (abs(errSum) > 1e-8): + print(f"Validation failed on sum. Error {errSum}") # total_bytes = 3 * sys.getsizeof(0.0) * self.array_size * num_times; # bandwidth = 1.0e-9 * (total_bytes / self.runtime) @@ -114,7 +114,7 @@ def dot(self, index: int, acc: pk.Acc[float]): acc += self.a[index] * self.b[index] -if __name__ == "__main__": +def run() -> None: array_size: int = 2**25 # 100000 startA: float = 0.1 startB: float = 0.2 @@ -138,3 +138,6 @@ def dot(self, index: int, acc: pk.Acc[float]): pk.set_default_space(space) pk.execute(space, KokkosStream(array_size, startA, startB, startC, startScalar, num_times)) + +if __name__ == "__main__": + run() diff --git a/examples/ParRes/workload/nstream.py b/examples/ParRes/workload/nstream.py index ef6ce4e1..18f62d9e 100644 --- a/examples/ParRes/workload/nstream.py +++ b/examples/ParRes/workload/nstream.py @@ -16,7 +16,7 @@ def __init__(self, iterations, length, offset): self.scalar: float = 3 self.asum: float = 0 - self.nstream_time: float = 0 + self.nstream_time: float = 0 @pk.main def run(self): @@ -66,7 +66,7 @@ def init(self, i: int): self.B[i] = 2 self.C[i] = 2 -if __name__ == "__main__": +def run() -> None: parser = argparse.ArgumentParser() parser.add_argument('iterations', type=int) parser.add_argument('length', type=int) @@ -100,3 +100,5 @@ def init(self, i: int): print("Offset = " , offset) pk.execute(pk.ExecutionSpace.Default, main(iterations, length, offset)) +if __name__ == "__main__": + run() diff --git a/examples/ParRes/workload/stencil.py b/examples/ParRes/workload/stencil.py index a92c4424..0bfd33bc 100644 --- a/examples/ParRes/workload/stencil.py +++ b/examples/ParRes/workload/stencil.py @@ -18,14 +18,14 @@ def __init__(self, iterations, n, tile_size, star, radius): self.out: pk.View2D[pk.double] = pk.View([self.n, self.n], pk.double, layout=pk.Layout.LayoutRight) self.norm: float = 0 - self.stencil_time: float = 0 + self.stencil_time: float = 0 @pk.main def run(self): t: int = tile_size r: int = radius - pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]), + pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]), self.init) pk.fence() @@ -34,7 +34,7 @@ def run(self): for i in range(iterations): if (i == 1): pk.fence() - + if r == 1: # star1 stencil pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star1) @@ -45,8 +45,8 @@ def run(self): # star3 stencil pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star3) - - pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]), + + pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]), self.increment) pk.fence() @@ -55,7 +55,7 @@ def run(self): active_points: int = (n-2*r)*(n-2*r) # verify correctness - self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]), + self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]), self.norm_reduce) pk.fence() self.norm /= active_points @@ -78,7 +78,7 @@ def increment(self, i: int, j: int): @pk.workunit def norm_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]): - acc += abs(self.out[i][j]) + acc += abs(self.out[i][j]) # @pk.callback # def print_result(self): @@ -121,7 +121,7 @@ def star3(self, i: int, j: int): +self.inp[i][j+2] * 0.08333333333333333 \ +self.inp[i][j+3] * 0.05555555555555555 -if __name__ == "__main__": +def run() -> None: parser = argparse.ArgumentParser() parser.add_argument('iterations', type=int) parser.add_argument('n', type=int) @@ -169,9 +169,11 @@ def star3(self, i: int, j: int): n = 2 ** n print("Number of iterations = ", iterations) - print("Grid size = ", n) + print("Grid size = ", n) print("Tile size = ", tile_size) print("Type of stencil = ", "star" if star else "grid") print("Radius of stencil = ", radius) pk.execute(pk.ExecutionSpace.Default, main(iterations, n, tile_size, star, radius)) +if __name__ == "__main__": + run() diff --git a/examples/ParRes/workload/transpose.py b/examples/ParRes/workload/transpose.py index b1e74646..7b57edca 100644 --- a/examples/ParRes/workload/transpose.py +++ b/examples/ParRes/workload/transpose.py @@ -11,19 +11,19 @@ def __init__(self, iterations, order, tile_size, permute): self.iterations: int = iterations self.order: int = order self.tile_size: int = tile_size - self.permute: int = permute + self.permute: int = permute self.A: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight) self.B: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight) self.abserr: float = 0 - self.transpose_time: float = 0 + self.transpose_time: float = 0 self.addit: float = (self.iterations) * (0.5 * (self.iterations - 1)) @pk.main def run(self): pk.parallel_for( - pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init) + pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init) pk.fence() timer = pk.Timer() @@ -39,7 +39,7 @@ def run(self): self.transpose_time = timer.seconds() self.abserr = pk.parallel_reduce( - pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), + pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.abserr_reduce) pk.printf("%f\n", self.abserr) @@ -69,9 +69,9 @@ def abserr_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]): def tranpose(self, i: int, j: int): self.B[i][j] += self.A[j][i] self.A[j][i] += 1 - -if __name__ == "__main__": + +def run() -> None: parser = argparse.ArgumentParser() parser.add_argument('iterations', type=int) parser.add_argument('order', type=int) @@ -112,3 +112,6 @@ def tranpose(self, i: int, j: int): print("Tile size = " , tile_size) print("Permute loops = " , "yes" if permute else "no") pk.execute(pk.ExecutionSpace.Default, main(iterations, order, tile_size, permute)) + +if __name__ == "__main__": + run() diff --git a/examples/kokkos-benchmarks/functor/bytes_and_flops.py b/examples/kokkos-benchmarks/functor/bytes_and_flops.py index b7defa76..7a982e2a 100644 --- a/examples/kokkos-benchmarks/functor/bytes_and_flops.py +++ b/examples/kokkos-benchmarks/functor/bytes_and_flops.py @@ -26,7 +26,7 @@ def benchmark(self, team: pk.TeamMember): n: int = team.league_rank() for r in range(self.R): def team_for(i: int): - a1: pk.double = self.A[n][i][0] + a1: pk.double = self.A[n][i][0] b: pk.double = self.B[n][i][0] a2: pk.double = a1 * 1.3 a3: pk.double = a2 * 1.1 @@ -51,13 +51,13 @@ def team_for(i: int): pk.parallel_for(pk.TeamThreadRange(team, self.K), team_for) -if __name__ == "__main__": +def run() -> None: # example args - # Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0 - # Cache Bound : 2 100000 1024 64 1 1 8 512 0 - # Compute Bound : 2 100000 1024 1 1 8 64 256 0 - # Load Slots Used : 2 20000 256 32 16 8 1 256 0 - # Inefficient Load: 2 20000 256 32 2 8 1 256 0 + # Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0 + # Cache Bound : 2 100000 1024 64 1 1 8 512 0 + # Compute Bound : 2 100000 1024 1 1 8 64 256 0 + # Load Slots Used : 2 20000 256 32 16 8 1 256 0 + # Inefficient Load: 2 20000 256 32 2 8 1 256 0 # NOTE P and U are hard coded to double and 8 because otherwise we would have a lot of duplicates parser = argparse.ArgumentParser() parser.add_argument("P", type=int, help="Precision (1==float, 2==double)") @@ -84,7 +84,7 @@ def team_for(i: int): exit(1) if args.S != 0: print("S must be 0 (shared scratch memory not supported)") - exit(1) + exit(1) space = pk.ExecutionSpace.OpenMP if args.execution_space: @@ -98,7 +98,7 @@ def team_for(i: int): T = args.T S = args.S scalar_size = 8 - + pk.set_default_space(space) r = pk.TeamPolicy(N, T) @@ -113,3 +113,7 @@ def team_for(i: int): print(f"NKRUFTS: {N} {K} {R} {U} {F} {T} {S} Time: {seconds} " + f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds}") print(w.C) + + +if __name__ == "__main__": + run() diff --git a/examples/kokkos-benchmarks/functor/gather.py b/examples/kokkos-benchmarks/functor/gather.py index a06664b9..e4e94ac7 100644 --- a/examples/kokkos-benchmarks/functor/gather.py +++ b/examples/kokkos-benchmarks/functor/gather.py @@ -23,7 +23,7 @@ def __init__(self, N: int, K: int, D: int, R: int, F: int): self.A.fill(1.5) self.B.fill(2.0) - + #TODO use kokkos to init in parallel random.seed(12313) for i in range(N): @@ -60,7 +60,7 @@ def benchmark(self, i: int): self.C[i] = c -if __name__ == "__main__": +def run() -> None: # example args 2 100000 32 512 1000 8 8 # NOTE S and U are hard coded to double and 8 because otherwise we would have a lot of duplicates parser = argparse.ArgumentParser() @@ -87,7 +87,7 @@ def benchmark(self, i: int): space = pk.ExecutionSpace.OpenMP if args.execution_space: space = pk.ExecutionSpace(args.execution_space) - + pk.set_default_space(space) N = args.N @@ -115,3 +115,5 @@ def benchmark(self, i: int): print(f"SNKDRUF: {scalar_size/4} {N} {K} {D} {R} {U} {F} Time: {seconds} " + f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds} GGather/s: {1e-9 * gather_ops / seconds}") +if __name__ == "__main__": + run() diff --git a/examples/kokkos-tutorials/functor/subview.py b/examples/kokkos-tutorials/functor/subview.py index afa29141..79dd1cb6 100644 --- a/examples/kokkos-tutorials/functor/subview.py +++ b/examples/kokkos-tutorials/functor/subview.py @@ -40,7 +40,7 @@ def yAx(self, j: int, acc: pk.Acc[float]): acc += self.y[j] * temp2 -if __name__ == "__main__": +def run() -> None: values: Tuple[int, int, int, int, int, bool] = parse_args() N: int = values[0] M: int = values[1] @@ -75,3 +75,6 @@ def yAx(self, j: int, acc: pk.Acc[float]): print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)") + +if __name__ == "__main__": + run() diff --git a/examples/kokkos-tutorials/standalone/subview.py b/examples/kokkos-tutorials/standalone/subview.py index a1dd4d76..a07e7035 100644 --- a/examples/kokkos-tutorials/standalone/subview.py +++ b/examples/kokkos-tutorials/standalone/subview.py @@ -15,7 +15,7 @@ def yAx(j, acc, cols, y_view, x_view, A_view): acc += y_view[j] * temp2 -if __name__ == "__main__": +def run() -> None: values: Tuple[int, int, int, int, int, bool] = parse_args() N: int = values[0] M: int = values[1] @@ -68,3 +68,6 @@ def yAx(j, acc, cols, y_view, x_view, A_view): print(f"N({N}) M({M}) nrepeat({nrepeat}) problem(MB) time({timer_result}) bandwidth(GB/s)") + +if __name__ == "__main__": + run() diff --git a/examples/kokkos-tutorials/workload/subview.py b/examples/kokkos-tutorials/workload/subview.py index 9a60ebe5..66176d9b 100644 --- a/examples/kokkos-tutorials/workload/subview.py +++ b/examples/kokkos-tutorials/workload/subview.py @@ -63,7 +63,7 @@ def yAx(self, j: int, acc: pk.Acc[float]): acc += self.y[j] * temp2 -if __name__ == "__main__": +def run() -> None: values: Tuple[int, int, int, int, int, bool] = parse_args() N: int = values[0] M: int = values[1] @@ -80,3 +80,6 @@ def yAx(self, j: int, acc: pk.Acc[float]): print(f"Total size S = {N * M} N = {N} M = {M}") pk.execute(pk.get_default_space(), Workload(N, M, nrepeat, fill)) + +if __name__ == "__main__": + run() diff --git a/examples/kokkos/scan_functor.py b/examples/kokkos/scan_functor.py index 07d18877..ed362b17 100644 --- a/examples/kokkos/scan_functor.py +++ b/examples/kokkos/scan_functor.py @@ -15,8 +15,7 @@ def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool): if last_pass: self.A[i] = acc - -if __name__ == "__main__": +def run() -> None: N = 10 w = Workload(N) p = pk.RangePolicy(pk.ExecutionSpace.OpenMP, 0, N) @@ -27,3 +26,6 @@ def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool): timer_result = timer.seconds() print(f"{w.A} total={result} time({timer_result})") + +if __name__ == "__main__": + run() diff --git a/examples/kokkos/scan_standalone.py b/examples/kokkos/scan_standalone.py index 276e5f0f..30a3b9fd 100644 --- a/examples/kokkos/scan_standalone.py +++ b/examples/kokkos/scan_standalone.py @@ -10,7 +10,7 @@ def scan(i, acc, last_pass, view): if last_pass: view[i] = acc -if __name__ == "__main__": +def run() -> None: N = 10 A: pk.View1D[pk.int32] = pk.View([N], pk.int32) @@ -22,3 +22,6 @@ def scan(i, acc, last_pass, view): timer_result = timer.seconds() print(f"{A} total={result} time({timer_result})") + +if __name__ == "__main__": + run() diff --git a/examples/kokkos/scan_workload.py b/examples/kokkos/scan_workload.py index a331426b..3e4e1138 100644 --- a/examples/kokkos/scan_workload.py +++ b/examples/kokkos/scan_workload.py @@ -29,6 +29,8 @@ def scan(self, i: int, acc: pk.Acc[pk.double], last_pass: bool): if last_pass: self.A[i] = acc +def run() -> None: + pk.execute(pk.ExecutionSpace.OpenMP, Workload(10)) if __name__ == "__main__": - pk.execute(pk.ExecutionSpace.OpenMP, Workload(10)) + run() diff --git a/examples/pykokkos/binsort.py b/examples/pykokkos/binsort.py index f9ea59e4..2c2b6646 100644 --- a/examples/pykokkos/binsort.py +++ b/examples/pykokkos/binsort.py @@ -43,7 +43,7 @@ def results(self) -> None: print(f"{self.view[i]} ") -def run(): +def run() -> None: workload = Workload(10) pk.execute(pk.ExecutionSpace.Default, workload) print(workload.view) @@ -51,8 +51,5 @@ def run(): print(workload.bin_offsets) print(workload.bin_count) - if __name__ == "__main__": - pk.kokkos_manager.initialize() run() - pk.kokkos_manager.finalize() diff --git a/examples/pykokkos/streams.py b/examples/pykokkos/streams.py index 4fc29590..d437b308 100644 --- a/examples/pykokkos/streams.py +++ b/examples/pykokkos/streams.py @@ -15,7 +15,7 @@ def print_stream(i, x, id): elif x == 4: pk.printf("Stream 3 GPU %d\n", id) -if __name__ == "__main__": +def run() -> None: space = pk.Cuda # Create streams on GPU 0 (default GPU) @@ -48,3 +48,7 @@ def print_stream(i, x, id): pk.parallel_for(pk.RangePolicy(instance2, 0, 2), print_stream, x=2, id=cp.cuda.runtime.getDevice()) print("Done launching kernels") + + +if __name__ == "__main__": + run() diff --git a/examples/pykokkos/subviews.py b/examples/pykokkos/subviews.py index d3ec7333..77993c83 100644 --- a/examples/pykokkos/subviews.py +++ b/examples/pykokkos/subviews.py @@ -19,6 +19,8 @@ def work(self, i: int): def callback(self) -> None: print(self.view) +def run() -> None: + pk.execute(pk.ExecutionSpace.Default, Workload()) if __name__ == "__main__": - pk.execute(pk.ExecutionSpace.Default, Workload()) + run()