kokkos · NaderAlAwar · Nov 30, 2023 · Nov 29, 2023 · Nov 30, 2023
diff --git a/examples/BabelStream/functor/babel_stream.py b/examples/BabelStream/functor/babel_stream.py
@@ -43,7 +43,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
         acc += self.a[index] * self.b[index]
 
 
-if __name__ == "__main__":
+def run() -> None:
     array_size: int = 2**25 # 100000
     startA: float = 0.1
     startB: float = 0.2
@@ -92,7 +92,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
         timings[4].append(timer.seconds())
         timer.reset()
 
-    goldA = startA 
+    goldA = startA
     goldB = startB
     goldC = startC
 
@@ -108,9 +108,9 @@ def dot(self, index: int, acc: pk.Acc[float]):
     errB /= len(w.b)
     errC = reduce(lambda s, val: s + abs(val - goldC), w.c)
     errC /= len(w.c)
-    
-    # epsi = sys.float_info.epsilon * 100 
-    epsi = 1e-8 
+
+    # epsi = sys.float_info.epsilon * 100
+    epsi = 1e-8
     if (errA > epsi):
         print(f"Validation failed on a[]. Average error {errA}")
     if (errB > epsi):
@@ -143,3 +143,6 @@ def dot(self, index: int, acc: pk.Acc[float]):
     # bandwidth = 1.0e-9 * (total_bytes / runtime)
     # print(f"Runtime (seconds): {runtime}")
     # print(f"Bandwidth (GB/s): {bandwidth}")
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/BabelStream/standalone/babel_stream.py b/examples/BabelStream/standalone/babel_stream.py
@@ -31,7 +31,7 @@ def dot(index, acc, a_view, b_view):
     acc += a_view[index] * b_view[index]
 
 
-if __name__ == "__main__":
+def run() -> None:
     array_size: int = 2**25 # 100000
     startA: float = 0.1
     startB: float = 0.2
@@ -85,7 +85,7 @@ def dot(index, acc, a_view, b_view):
         timings[4].append(timer.seconds())
         timer.reset()
 
-    goldA = startA 
+    goldA = startA
     goldB = startB
     goldC = startC
 
@@ -101,9 +101,9 @@ def dot(index, acc, a_view, b_view):
     errB /= len(b)
     errC = reduce(lambda s, val: s + abs(val - goldC), c)
     errC /= len(c)
-    
-    # epsi = sys.float_info.epsilon * 100 
-    epsi = 1e-8 
+
+    # epsi = sys.float_info.epsilon * 100
+    epsi = 1e-8
     if (errA > epsi):
         print(f"Validation failed on a[]. Average error {errA}")
     if (errB > epsi):
@@ -136,3 +136,6 @@ def dot(index, acc, a_view, b_view):
     # bandwidth = 1.0e-9 * (total_bytes / runtime)
     # print(f"Runtime (seconds): {runtime}")
     # print(f"Bandwidth (GB/s): {bandwidth}")
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/BabelStream/workload/babel_stream.py b/examples/BabelStream/workload/babel_stream.py
@@ -6,7 +6,7 @@
 
 @pk.workload
 class KokkosStream:
-    def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float, 
+    def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
             scalar: float, num_times: int):
         self.array_size: int = ARRAY_SIZE
 
@@ -18,7 +18,7 @@ def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
         self.initB: pk.double = initB
         self.initC: pk.double = initC
         self.scalar: pk.double = scalar
-        self.num_times: int = num_times 
+        self.num_times: int = num_times
         self.sum: pk.double = 0
 
         self.runtime: float = 0
@@ -48,38 +48,38 @@ def run(self):
 
         self.runtime = timer.seconds()
 
-    # @pk.callback
-    # def results(self):
-    #     goldA = self.initA
-    #     goldB = self.initB
-    #     goldC = self.initC
-
-    #     for i in range(self.num_times):
-    #         goldC = goldA
-    #         goldB = self.scalar * goldC
-    #         goldC = goldA + goldB
-    #         goldA = goldB + self.scalar * goldC
-
-    #     errA = reduce(lambda s, val: s + abs(val - goldA), self.a)
-    #     errA /= len(self.a)
-    #     errB = reduce(lambda s, val: s + abs(val - goldB), self.b)
-    #     errB /= len(self.b)
-    #     errC = reduce(lambda s, val: s + abs(val - goldC), self.c)
-    #     errC /= len(self.c)
-        
-    #     # epsi = sys.float_info.epsilon * 100 
-    #     epsi = 1e-8 
-    #     if (errA > epsi):
-    #         print(f"Validation failed on a[]. Average error {errA}")
-    #     if (errB > epsi):
-    #         print(f"Validation failed on b[]. Average error {errB}")
-    #     if (errC > epsi):
-    #         print(f"Validation failed on c[]. Average error {errC}")
-
-    #     goldSum = goldA * goldB * self.array_size
-    #     errSum = self.sum - goldSum
-    #     if (abs(errSum) > 1e-8):
-    #         print(f"Validation failed on sum. Error {errSum}")
+    @pk.callback
+    def results(self):
+        goldA = self.initA
+        goldB = self.initB
+        goldC = self.initC
+
+        for i in range(self.num_times):
+            goldC = goldA
+            goldB = self.scalar * goldC
+            goldC = goldA + goldB
+            goldA = goldB + self.scalar * goldC
+
+        errA = reduce(lambda s, val: s + abs(val - goldA), self.a)
+        errA /= len(self.a)
+        errB = reduce(lambda s, val: s + abs(val - goldB), self.b)
+        errB /= len(self.b)
+        errC = reduce(lambda s, val: s + abs(val - goldC), self.c)
+        errC /= len(self.c)
+
+        # epsi = sys.float_info.epsilon * 100
+        epsi = 1e-8
+        if (errA > epsi):
+            print(f"Validation failed on a[]. Average error {errA}")
+        if (errB > epsi):
+            print(f"Validation failed on b[]. Average error {errB}")
+        if (errC > epsi):
+            print(f"Validation failed on c[]. Average error {errC}")
+
+        goldSum = goldA * goldB * self.array_size
+        errSum = self.sum - goldSum
+        if (abs(errSum) > 1e-8):
+            print(f"Validation failed on sum. Error {errSum}")
 
     #     total_bytes = 3 * sys.getsizeof(0.0) * self.array_size * num_times;
     #     bandwidth = 1.0e-9 * (total_bytes / self.runtime)
@@ -114,7 +114,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
         acc += self.a[index] * self.b[index]
 
 
-if __name__ == "__main__":
+def run() -> None:
     array_size: int = 2**25 # 100000
     startA: float = 0.1
     startB: float = 0.2
@@ -138,3 +138,6 @@ def dot(self, index: int, acc: pk.Acc[float]):
 
     pk.set_default_space(space)
     pk.execute(space, KokkosStream(array_size, startA, startB, startC, startScalar, num_times))
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/ParRes/workload/nstream.py b/examples/ParRes/workload/nstream.py
@@ -16,7 +16,7 @@ def __init__(self, iterations, length, offset):
         self.scalar: float = 3
         self.asum: float = 0
 
-        self.nstream_time: float = 0 
+        self.nstream_time: float = 0
 
     @pk.main
     def run(self):
@@ -66,7 +66,7 @@ def init(self, i: int):
         self.B[i] = 2
         self.C[i] = 2
 
-if __name__ == "__main__":
+def run() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument('iterations', type=int)
     parser.add_argument('length', type=int)
@@ -100,3 +100,5 @@ def init(self, i: int):
     print("Offset               = " , offset)
     pk.execute(pk.ExecutionSpace.Default, main(iterations, length, offset))
 
+if __name__ == "__main__":
+    run()
diff --git a/examples/ParRes/workload/stencil.py b/examples/ParRes/workload/stencil.py
@@ -18,14 +18,14 @@ def __init__(self, iterations, n, tile_size, star, radius):
         self.out: pk.View2D[pk.double] = pk.View([self.n, self.n], pk.double, layout=pk.Layout.LayoutRight)
         self.norm: float = 0
 
-        self.stencil_time: float = 0 
+        self.stencil_time: float = 0
 
     @pk.main
     def run(self):
         t: int = tile_size
         r: int = radius
 
-        pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]), 
+        pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
             self.init)
         pk.fence()
 
@@ -34,7 +34,7 @@ def run(self):
         for i in range(iterations):
             if (i == 1):
                 pk.fence()
-            
+
             if r == 1:
                 # star1 stencil
                 pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star1)
@@ -45,8 +45,8 @@ def run(self):
                 # star3 stencil
                 pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star3)
 
-            
-            pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]), 
+
+            pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
                 self.increment)
 
         pk.fence()
@@ -55,7 +55,7 @@ def run(self):
         active_points: int = (n-2*r)*(n-2*r)
 
         # verify correctness
-        self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]), 
+        self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]),
                 self.norm_reduce)
         pk.fence()
         self.norm /= active_points
@@ -78,7 +78,7 @@ def increment(self, i: int, j: int):
 
     @pk.workunit
     def norm_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]):
-        acc += abs(self.out[i][j]) 
+        acc += abs(self.out[i][j])
 
     # @pk.callback
     # def print_result(self):
@@ -121,7 +121,7 @@ def star3(self, i: int, j: int):
             +self.inp[i][j+2] * 0.08333333333333333 \
             +self.inp[i][j+3] * 0.05555555555555555
 
-if __name__ == "__main__":
+def run() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument('iterations', type=int)
     parser.add_argument('n', type=int)
@@ -169,9 +169,11 @@ def star3(self, i: int, j: int):
 
     n = 2 ** n
     print("Number of iterations = ", iterations)
-    print("Grid size            = ", n) 
+    print("Grid size            = ", n)
     print("Tile size            = ", tile_size)
     print("Type of stencil      = ", "star" if star else "grid")
     print("Radius of stencil    = ", radius)
     pk.execute(pk.ExecutionSpace.Default, main(iterations, n, tile_size, star, radius))
 
+if __name__ == "__main__":
+    run()
diff --git a/examples/ParRes/workload/transpose.py b/examples/ParRes/workload/transpose.py
@@ -11,19 +11,19 @@ def __init__(self, iterations, order, tile_size, permute):
         self.iterations: int = iterations
         self.order: int = order
         self.tile_size: int = tile_size
-        self.permute: int = permute 
+        self.permute: int = permute
 
         self.A: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight)
         self.B: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight)
 
         self.abserr: float = 0
-        self.transpose_time: float = 0 
+        self.transpose_time: float = 0
         self.addit: float = (self.iterations) * (0.5 * (self.iterations - 1))
 
     @pk.main
     def run(self):
         pk.parallel_for(
-            pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init) 
+            pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init)
         pk.fence()
 
         timer = pk.Timer()
@@ -39,7 +39,7 @@ def run(self):
         self.transpose_time = timer.seconds()
 
         self.abserr = pk.parallel_reduce(
-            pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), 
+            pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]),
             self.abserr_reduce)
 
         pk.printf("%f\n", self.abserr)
@@ -69,9 +69,9 @@ def abserr_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]):
     def tranpose(self, i: int, j: int):
         self.B[i][j] += self.A[j][i]
         self.A[j][i] += 1
-
 
-if __name__ == "__main__":
+
+def run() -> None:
     parser = argparse.ArgumentParser()
     parser.add_argument('iterations', type=int)
     parser.add_argument('order', type=int)
@@ -112,3 +112,6 @@ def tranpose(self, i: int, j: int):
     print("Tile size            = " , tile_size)
     print("Permute loops        = " , "yes" if permute else "no")
     pk.execute(pk.ExecutionSpace.Default, main(iterations, order, tile_size, permute))
+
+if __name__ == "__main__":
+    run()
diff --git a/examples/kokkos-benchmarks/functor/bytes_and_flops.py b/examples/kokkos-benchmarks/functor/bytes_and_flops.py
@@ -26,7 +26,7 @@ def benchmark(self, team: pk.TeamMember):
         n: int = team.league_rank()
         for r in range(self.R):
             def team_for(i: int):
-                a1: pk.double = self.A[n][i][0] 
+                a1: pk.double = self.A[n][i][0]
                 b: pk.double = self.B[n][i][0]
                 a2: pk.double = a1 * 1.3
                 a3: pk.double = a2 * 1.1
@@ -51,13 +51,13 @@ def team_for(i: int):
 
             pk.parallel_for(pk.TeamThreadRange(team, self.K), team_for)
 
-if __name__ == "__main__":
+def run() -> None:
     # example args
-    # Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0 
-    # Cache Bound     : 2 100000 1024 64 1 1 8 512 0 
-    # Compute Bound   : 2 100000 1024 1 1 8 64 256 0 
-    # Load Slots Used : 2 20000 256 32 16 8 1 256 0 
-    # Inefficient Load: 2 20000 256 32 2 8 1 256 0 
+    # Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0
+    # Cache Bound     : 2 100000 1024 64 1 1 8 512 0
+    # Compute Bound   : 2 100000 1024 1 1 8 64 256 0
+    # Load Slots Used : 2 20000 256 32 16 8 1 256 0
+    # Inefficient Load: 2 20000 256 32 2 8 1 256 0
     # NOTE P and U are hard coded to double and 8 because otherwise we would have a lot of duplicates
     parser = argparse.ArgumentParser()
     parser.add_argument("P", type=int, help="Precision (1==float, 2==double)")
@@ -84,7 +84,7 @@ def team_for(i: int):
         exit(1)
     if args.S != 0:
         print("S must be 0 (shared scratch memory not supported)")
-        exit(1) 
+        exit(1)
 
     space = pk.ExecutionSpace.OpenMP
     if args.execution_space:
@@ -98,7 +98,7 @@ def team_for(i: int):
     T = args.T
     S = args.S
     scalar_size = 8
-    
+
     pk.set_default_space(space)
 
     r = pk.TeamPolicy(N, T)
@@ -113,3 +113,7 @@ def team_for(i: int):
     print(f"NKRUFTS: {N} {K} {R} {U} {F} {T} {S} Time: {seconds} " +
             f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds}")
     print(w.C)
+
+
+if __name__ == "__main__":
+    run()