Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix examples that fail because Kokkos::finalize called too early #218

Merged
merged 2 commits into from
Nov 30, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions examples/BabelStream/functor/babel_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
acc += self.a[index] * self.b[index]


if __name__ == "__main__":
def run() -> None:
array_size: int = 2**25 # 100000
startA: float = 0.1
startB: float = 0.2
Expand Down Expand Up @@ -92,7 +92,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
timings[4].append(timer.seconds())
timer.reset()

goldA = startA
goldA = startA
goldB = startB
goldC = startC

Expand All @@ -108,9 +108,9 @@ def dot(self, index: int, acc: pk.Acc[float]):
errB /= len(w.b)
errC = reduce(lambda s, val: s + abs(val - goldC), w.c)
errC /= len(w.c)
# epsi = sys.float_info.epsilon * 100
epsi = 1e-8

# epsi = sys.float_info.epsilon * 100
epsi = 1e-8
if (errA > epsi):
print(f"Validation failed on a[]. Average error {errA}")
if (errB > epsi):
Expand Down Expand Up @@ -143,3 +143,6 @@ def dot(self, index: int, acc: pk.Acc[float]):
# bandwidth = 1.0e-9 * (total_bytes / runtime)
# print(f"Runtime (seconds): {runtime}")
# print(f"Bandwidth (GB/s): {bandwidth}")

if __name__ == "__main__":
run()
13 changes: 8 additions & 5 deletions examples/BabelStream/standalone/babel_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ def dot(index, acc, a_view, b_view):
acc += a_view[index] * b_view[index]


if __name__ == "__main__":
def run() -> None:
array_size: int = 2**25 # 100000
startA: float = 0.1
startB: float = 0.2
Expand Down Expand Up @@ -85,7 +85,7 @@ def dot(index, acc, a_view, b_view):
timings[4].append(timer.seconds())
timer.reset()

goldA = startA
goldA = startA
goldB = startB
goldC = startC

Expand All @@ -101,9 +101,9 @@ def dot(index, acc, a_view, b_view):
errB /= len(b)
errC = reduce(lambda s, val: s + abs(val - goldC), c)
errC /= len(c)
# epsi = sys.float_info.epsilon * 100
epsi = 1e-8

# epsi = sys.float_info.epsilon * 100
epsi = 1e-8
if (errA > epsi):
print(f"Validation failed on a[]. Average error {errA}")
if (errB > epsi):
Expand Down Expand Up @@ -136,3 +136,6 @@ def dot(index, acc, a_view, b_view):
# bandwidth = 1.0e-9 * (total_bytes / runtime)
# print(f"Runtime (seconds): {runtime}")
# print(f"Bandwidth (GB/s): {bandwidth}")

if __name__ == "__main__":
run()
73 changes: 38 additions & 35 deletions examples/BabelStream/workload/babel_stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

@pk.workload
class KokkosStream:
def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
scalar: float, num_times: int):
self.array_size: int = ARRAY_SIZE

Expand All @@ -18,7 +18,7 @@ def __init__(self, ARRAY_SIZE: int, initA: float, initB: float, initC: float,
self.initB: pk.double = initB
self.initC: pk.double = initC
self.scalar: pk.double = scalar
self.num_times: int = num_times
self.num_times: int = num_times
self.sum: pk.double = 0

self.runtime: float = 0
Expand Down Expand Up @@ -48,38 +48,38 @@ def run(self):

self.runtime = timer.seconds()

# @pk.callback
# def results(self):
# goldA = self.initA
# goldB = self.initB
# goldC = self.initC

# for i in range(self.num_times):
# goldC = goldA
# goldB = self.scalar * goldC
# goldC = goldA + goldB
# goldA = goldB + self.scalar * goldC

# errA = reduce(lambda s, val: s + abs(val - goldA), self.a)
# errA /= len(self.a)
# errB = reduce(lambda s, val: s + abs(val - goldB), self.b)
# errB /= len(self.b)
# errC = reduce(lambda s, val: s + abs(val - goldC), self.c)
# errC /= len(self.c)
# # epsi = sys.float_info.epsilon * 100
# epsi = 1e-8
# if (errA > epsi):
# print(f"Validation failed on a[]. Average error {errA}")
# if (errB > epsi):
# print(f"Validation failed on b[]. Average error {errB}")
# if (errC > epsi):
# print(f"Validation failed on c[]. Average error {errC}")

# goldSum = goldA * goldB * self.array_size
# errSum = self.sum - goldSum
# if (abs(errSum) > 1e-8):
# print(f"Validation failed on sum. Error {errSum}")
@pk.callback
def results(self):
goldA = self.initA
goldB = self.initB
goldC = self.initC

for i in range(self.num_times):
goldC = goldA
goldB = self.scalar * goldC
goldC = goldA + goldB
goldA = goldB + self.scalar * goldC

errA = reduce(lambda s, val: s + abs(val - goldA), self.a)
errA /= len(self.a)
errB = reduce(lambda s, val: s + abs(val - goldB), self.b)
errB /= len(self.b)
errC = reduce(lambda s, val: s + abs(val - goldC), self.c)
errC /= len(self.c)

# epsi = sys.float_info.epsilon * 100
epsi = 1e-8
if (errA > epsi):
print(f"Validation failed on a[]. Average error {errA}")
if (errB > epsi):
print(f"Validation failed on b[]. Average error {errB}")
if (errC > epsi):
print(f"Validation failed on c[]. Average error {errC}")

goldSum = goldA * goldB * self.array_size
errSum = self.sum - goldSum
if (abs(errSum) > 1e-8):
print(f"Validation failed on sum. Error {errSum}")

# total_bytes = 3 * sys.getsizeof(0.0) * self.array_size * num_times;
# bandwidth = 1.0e-9 * (total_bytes / self.runtime)
Expand Down Expand Up @@ -114,7 +114,7 @@ def dot(self, index: int, acc: pk.Acc[float]):
acc += self.a[index] * self.b[index]


if __name__ == "__main__":
def run() -> None:
array_size: int = 2**25 # 100000
startA: float = 0.1
startB: float = 0.2
Expand All @@ -138,3 +138,6 @@ def dot(self, index: int, acc: pk.Acc[float]):

pk.set_default_space(space)
pk.execute(space, KokkosStream(array_size, startA, startB, startC, startScalar, num_times))

if __name__ == "__main__":
run()
6 changes: 4 additions & 2 deletions examples/ParRes/workload/nstream.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ def __init__(self, iterations, length, offset):
self.scalar: float = 3
self.asum: float = 0

self.nstream_time: float = 0
self.nstream_time: float = 0

@pk.main
def run(self):
Expand Down Expand Up @@ -66,7 +66,7 @@ def init(self, i: int):
self.B[i] = 2
self.C[i] = 2

if __name__ == "__main__":
def run() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('iterations', type=int)
parser.add_argument('length', type=int)
Expand Down Expand Up @@ -100,3 +100,5 @@ def init(self, i: int):
print("Offset = " , offset)
pk.execute(pk.ExecutionSpace.Default, main(iterations, length, offset))

if __name__ == "__main__":
run()
20 changes: 11 additions & 9 deletions examples/ParRes/workload/stencil.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,14 @@ def __init__(self, iterations, n, tile_size, star, radius):
self.out: pk.View2D[pk.double] = pk.View([self.n, self.n], pk.double, layout=pk.Layout.LayoutRight)
self.norm: float = 0

self.stencil_time: float = 0
self.stencil_time: float = 0

@pk.main
def run(self):
t: int = tile_size
r: int = radius

pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
self.init)
pk.fence()

Expand All @@ -34,7 +34,7 @@ def run(self):
for i in range(iterations):
if (i == 1):
pk.fence()

if r == 1:
# star1 stencil
pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star1)
Expand All @@ -45,8 +45,8 @@ def run(self):
# star3 stencil
pk.parallel_for("stencil", pk.MDRangePolicy([r,r], [n-r, n-r], [t, t]), self.star3)

pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),

pk.parallel_for(pk.MDRangePolicy([0,0], [n, n], [t, t]),
self.increment)

pk.fence()
Expand All @@ -55,7 +55,7 @@ def run(self):
active_points: int = (n-2*r)*(n-2*r)

# verify correctness
self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]),
self.norm = pk.parallel_reduce(pk.MDRangePolicy([r, r], [n-r, n-r], [t, t]),
self.norm_reduce)
pk.fence()
self.norm /= active_points
Expand All @@ -78,7 +78,7 @@ def increment(self, i: int, j: int):

@pk.workunit
def norm_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]):
acc += abs(self.out[i][j])
acc += abs(self.out[i][j])

# @pk.callback
# def print_result(self):
Expand Down Expand Up @@ -121,7 +121,7 @@ def star3(self, i: int, j: int):
+self.inp[i][j+2] * 0.08333333333333333 \
+self.inp[i][j+3] * 0.05555555555555555

if __name__ == "__main__":
def run() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('iterations', type=int)
parser.add_argument('n', type=int)
Expand Down Expand Up @@ -169,9 +169,11 @@ def star3(self, i: int, j: int):

n = 2 ** n
print("Number of iterations = ", iterations)
print("Grid size = ", n)
print("Grid size = ", n)
print("Tile size = ", tile_size)
print("Type of stencil = ", "star" if star else "grid")
print("Radius of stencil = ", radius)
pk.execute(pk.ExecutionSpace.Default, main(iterations, n, tile_size, star, radius))

if __name__ == "__main__":
run()
15 changes: 9 additions & 6 deletions examples/ParRes/workload/transpose.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,19 +11,19 @@ def __init__(self, iterations, order, tile_size, permute):
self.iterations: int = iterations
self.order: int = order
self.tile_size: int = tile_size
self.permute: int = permute
self.permute: int = permute

self.A: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight)
self.B: pk.View2D[pk.double] = pk.View([self.order, self.order], pk.double, layout=pk.LayoutRight)

self.abserr: float = 0
self.transpose_time: float = 0
self.transpose_time: float = 0
self.addit: float = (self.iterations) * (0.5 * (self.iterations - 1))

@pk.main
def run(self):
pk.parallel_for(
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init)
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]), self.init)
pk.fence()

timer = pk.Timer()
Expand All @@ -39,7 +39,7 @@ def run(self):
self.transpose_time = timer.seconds()

self.abserr = pk.parallel_reduce(
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]),
pk.MDRangePolicy([0,0], [self.order, self.order], [self.tile_size, self.tile_size]),
self.abserr_reduce)

pk.printf("%f\n", self.abserr)
Expand Down Expand Up @@ -69,9 +69,9 @@ def abserr_reduce(self, i: int, j: int, acc: pk.Acc[pk.double]):
def tranpose(self, i: int, j: int):
self.B[i][j] += self.A[j][i]
self.A[j][i] += 1


if __name__ == "__main__":

def run() -> None:
parser = argparse.ArgumentParser()
parser.add_argument('iterations', type=int)
parser.add_argument('order', type=int)
Expand Down Expand Up @@ -112,3 +112,6 @@ def tranpose(self, i: int, j: int):
print("Tile size = " , tile_size)
print("Permute loops = " , "yes" if permute else "no")
pk.execute(pk.ExecutionSpace.Default, main(iterations, order, tile_size, permute))

if __name__ == "__main__":
run()
22 changes: 13 additions & 9 deletions examples/kokkos-benchmarks/functor/bytes_and_flops.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def benchmark(self, team: pk.TeamMember):
n: int = team.league_rank()
for r in range(self.R):
def team_for(i: int):
a1: pk.double = self.A[n][i][0]
a1: pk.double = self.A[n][i][0]
b: pk.double = self.B[n][i][0]
a2: pk.double = a1 * 1.3
a3: pk.double = a2 * 1.1
Expand All @@ -51,13 +51,13 @@ def team_for(i: int):

pk.parallel_for(pk.TeamThreadRange(team, self.K), team_for)

if __name__ == "__main__":
def run() -> None:
# example args
# Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0
# Cache Bound : 2 100000 1024 64 1 1 8 512 0
# Compute Bound : 2 100000 1024 1 1 8 64 256 0
# Load Slots Used : 2 20000 256 32 16 8 1 256 0
# Inefficient Load: 2 20000 256 32 2 8 1 256 0
# Bandwidth Bound : 2 100000 1024 1 1 1 8 256 0
# Cache Bound : 2 100000 1024 64 1 1 8 512 0
# Compute Bound : 2 100000 1024 1 1 8 64 256 0
# Load Slots Used : 2 20000 256 32 16 8 1 256 0
# Inefficient Load: 2 20000 256 32 2 8 1 256 0
# NOTE P and U are hard coded to double and 8 because otherwise we would have a lot of duplicates
parser = argparse.ArgumentParser()
parser.add_argument("P", type=int, help="Precision (1==float, 2==double)")
Expand All @@ -84,7 +84,7 @@ def team_for(i: int):
exit(1)
if args.S != 0:
print("S must be 0 (shared scratch memory not supported)")
exit(1)
exit(1)

space = pk.ExecutionSpace.OpenMP
if args.execution_space:
Expand All @@ -98,7 +98,7 @@ def team_for(i: int):
T = args.T
S = args.S
scalar_size = 8

pk.set_default_space(space)

r = pk.TeamPolicy(N, T)
Expand All @@ -113,3 +113,7 @@ def team_for(i: int):
print(f"NKRUFTS: {N} {K} {R} {U} {F} {T} {S} Time: {seconds} " +
f"Bandwidth: {1.0 * num_bytes / seconds / (1024**3)} GiB/s GFlop/s: {1e-9 * flops / seconds}")
print(w.C)


if __name__ == "__main__":
run()
Loading
Loading