Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA support #35

Draft
wants to merge 16 commits into
base: vc/legacy
Choose a base branch
from
4 changes: 3 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@ version = "0.3.0"

[deps]
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
FileWatching = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
FunctionWrappers = "069b7b12-0de2-55c6-9aab-29f3d0a68a2e"
Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
Sockets = "6462fe0b-24de-5631-8697-dd941f90decc"
UCX_jll = "16e4e860-d6b8-5056-a518-93e88b6392ae"

[compat]
CEnum = "0.4"
FunctionWrappers = "1.1"
UCX_jll = "1.10"
julia = "1.5"
22 changes: 22 additions & 0 deletions examples/benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Run the benchmarks

## Setup
```
julia --project=examples/benchmarks
pkg> dev .
```

## Running

### MPI benchmarks (TCP)

```
julia --project=examples/benchmarks -e 'ENV["JULIA_MPI_BINARY"]="system"; using Pkg; Pkg.build("MPI"; verbose=true)'
mpiexec --mca btl tcp,self -n 2 julia --project=examples/benchmarks examples/benchmarks/mpi/latency.jl
```



### Legacy benchmarks (UCX)

```
JULIA_PROJECT=$(pwd)/examples/benchmarks julia examples/benchmarks/legacy/latency.jl
```
9 changes: 6 additions & 3 deletions examples/benchmarks/distributed/latency.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ using Distributed

include(joinpath(@__DIR__, "..", "config.jl"))

addprocs(1)

# No-op message sink: receiving the payload on the worker is the measured
# work, so the remote function body does nothing.
@everywhere target(::Any) = nothing
Expand Down Expand Up @@ -42,7 +40,12 @@ function benchmark()
t_start = Base.time_ns()
end

remotecall_wait(target, 2, view(send_buf, 1:size))
GC.@preserve send_buf begin
ptr = pointer(send_buf)
subset = Base.unsafe_wrap(Array, ptr, size)
# avoid view
remotecall_wait(target, 2, subset)
end
end
t_end = Base.time_ns()

Expand Down
64 changes: 64 additions & 0 deletions examples/benchmarks/distributed/latency_cuda.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
@everywhere using CUDA

include(joinpath(@__DIR__, "..", "config.jl"))

# No-op message sink defined on every process: the benchmark measures the
# cost of delivering the argument, so the body does nothing.
@everywhere target(::Any) = nothing

const MAX_MESSAGE_SIZE = 1<<22  # largest payload benchmarked: 4 MiB
# const MAX_MESSAGE_SIZE = 4096
const LARGE_MESSAGE_SIZE = 8192  # above this, use the "large" loop/skip counts

const LAT_LOOP_SMALL = 10000  # timed iterations for small messages
const LAT_SKIP_SMALL = 100  # untimed warmup iterations for small messages
const LAT_LOOP_LARGE = 1000  # timed iterations for large messages
const LAT_SKIP_LARGE = 10  # untimed warmup iterations for large messages

# Write the ASCII byte 'A' (0x41) into the first `size` elements of
# `send_buf`, touching the memory the benchmark is about to transmit.
function touch_data(send_buf, size)
    fill!(view(send_buf, 1:size), 'A' % UInt8)
end

# Measure round-trip latency of `remotecall_wait` to worker 2 for CUDA
# device buffers, doubling the payload from 1 byte up to MAX_MESSAGE_SIZE,
# and write the results to a CSV file. An optional filename suffix can be
# passed via ARGS (e.g. `n1` -> latency_cuda_n1.csv), consistent with
# legacy/latency.jl.
function benchmark()
    t = Table(msg_size = Int[], latency = Float64[], kind = Symbol[])
    send_buf = CuArray{UInt8, 1}(undef, MAX_MESSAGE_SIZE)

    size = 1
    while size <= MAX_MESSAGE_SIZE
        @info "sending" size
        flush(stderr)
        touch_data(send_buf, size)

        # Use fewer timed iterations for large messages to bound total runtime.
        if size > LARGE_MESSAGE_SIZE
            loop = LAT_LOOP_LARGE
            skip = LAT_SKIP_LARGE
        else
            loop = LAT_LOOP_SMALL
            skip = LAT_SKIP_SMALL
        end

        # Iterations -skip:0 are warmup; the clock starts at i == 1.
        # Initialize with UInt64(0) so `t_start` keeps one concrete type
        # (Base.time_ns() returns UInt64) instead of flipping Int -> UInt64.
        t_start = UInt64(0)
        for i in -skip:loop
            if i == 1
                t_start = Base.time_ns()
            end

            remotecall_wait(target, 2, view(send_buf, 1:size))
        end
        t_end = Base.time_ns()

        t_delta = t_end - t_start
        t_op = t_delta / loop  # mean ns per remotecall_wait over the timed loop

        push!(t, (msg_size = size, latency = t_op, kind = :distributed))

        size *= 2
    end

    # Optional filename suffix from the command line, matching
    # legacy/latency.jl and the checked-in latency_cuda_n1/n2.csv files.
    suffix = isempty(ARGS) ? "" : string("_", ARGS[1])
    CSV.write(joinpath(@__DIR__, "latency_cuda$suffix.csv"), t)
end

# Run automatically when executed as a script, but not when this file is
# included from an interactive (REPL) session.
if !isinteractive()
benchmark()
end
24 changes: 24 additions & 0 deletions examples/benchmarks/distributed/latency_cuda_n1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
msg_size,latency,kind
1,194326.9893,distributed
2,258492.2302,distributed
4,163384.0434,distributed
8,181657.0711,distributed
16,161660.3484,distributed
32,170364.3234,distributed
64,183142.3,distributed
128,189219.9591,distributed
256,191429.4955,distributed
512,169848.7063,distributed
1024,178409.7519,distributed
2048,182247.2716,distributed
4096,181372.9255,distributed
8192,186916.8803,distributed
16384,227877.354,distributed
32768,262931.695,distributed
65536,299395.474,distributed
131072,400552.752,distributed
262144,726228.433,distributed
524288,1.123765e6,distributed
1048576,1.941801365e6,distributed
2097152,3.637946629e6,distributed
4194304,8.655691419e6,distributed
24 changes: 24 additions & 0 deletions examples/benchmarks/distributed/latency_cuda_n2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
msg_size,latency,kind
1,225954.7719,distributed
2,317755.7089,distributed
4,216636.6364,distributed
8,215174.4908,distributed
16,215454.6626,distributed
32,258174.5308,distributed
64,216194.8179,distributed
128,231967.9421,distributed
256,225013.8801,distributed
512,236940.8486,distributed
1024,231660.4506,distributed
2048,222922.7825,distributed
4096,232425.0653,distributed
8192,246477.4929,distributed
16384,298064.405,distributed
32768,342866.96,distributed
65536,371927.251,distributed
131072,502949.961,distributed
262144,788521.308,distributed
524288,1.369897892e6,distributed
1048576,2.571240613e6,distributed
2097152,4.916306884e6,distributed
4194304,1.1224723223e7,distributed
24 changes: 24 additions & 0 deletions examples/benchmarks/distributed/latency_n1.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
msg_size,latency,kind
1,96009.0072,distributed
2,97592.2913,distributed
4,104222.7204,distributed
8,96774.2828,distributed
16,106855.5027,distributed
32,113018.8739,distributed
64,109522.7947,distributed
128,110182.2696,distributed
256,159410.2898,distributed
512,128404.4605,distributed
1024,132115.1629,distributed
2048,114721.292,distributed
4096,115280.7431,distributed
8192,110971.0442,distributed
16384,169943.913,distributed
32768,123538.35,distributed
65536,148656.547,distributed
131072,139377.027,distributed
262144,157263.064,distributed
524288,196206.107,distributed
1048576,280161.416,distributed
2097152,450651.949,distributed
4194304,875355.147,distributed
24 changes: 24 additions & 0 deletions examples/benchmarks/distributed/latency_n2.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
msg_size,latency,kind
1,131763.9244,distributed
2,133011.9576,distributed
4,143194.8101,distributed
8,132870.4986,distributed
16,141274.7225,distributed
32,132614.0035,distributed
64,174522.3768,distributed
128,144640.3753,distributed
256,131596.4903,distributed
512,145297.8343,distributed
1024,136729.8732,distributed
2048,135132.3934,distributed
4096,156707.2372,distributed
8192,152365.9384,distributed
16384,196623.648,distributed
32768,188031.68,distributed
65536,228107.612,distributed
131072,269893.838,distributed
262144,387071.053,distributed
524288,617363.447,distributed
1048576,1.06839952e6,distributed
2097152,2.069559307e6,distributed
4194304,3.721738061e6,distributed
2 changes: 2 additions & 0 deletions examples/benchmarks/distributed/setup.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Launch one worker process so the benchmarks can address worker id 2
# via remotecall_wait.
using Distributed
addprocs(1)
75 changes: 75 additions & 0 deletions examples/benchmarks/legacy/latency.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
@everywhere using UCX
UCX.Legacy.wireup()

include(joinpath(@__DIR__, "..", "config.jl"))

# No-op message sink defined on every process: delivery of the argument is
# the measured quantity, so the body does nothing.
@everywhere target(::Any) = nothing

const MAX_MESSAGE_SIZE = 1<<22  # largest payload benchmarked: 4 MiB
# const MAX_MESSAGE_SIZE = 4096
const LARGE_MESSAGE_SIZE = 8192  # above this, use the "large" loop/skip counts

const LAT_LOOP_SMALL = 10000  # timed iterations for small messages
const LAT_SKIP_SMALL = 100  # untimed warmup iterations for small messages
const LAT_LOOP_LARGE = 1000  # timed iterations for large messages
const LAT_SKIP_LARGE = 10  # untimed warmup iterations for large messages

# Populate the first `size` bytes of `send_buf` with the ASCII byte 'A'
# (0x41) so the transmitted region is backed by touched memory.
function touch_data(send_buf, size)
    fill!(view(send_buf, 1:size), 'A' % UInt8)
end

# Measure round-trip latency of `UCX.Legacy.remotecall_wait` to worker 2
# for host buffers, doubling the payload from 1 byte up to MAX_MESSAGE_SIZE,
# and write the results to a CSV file. An optional filename suffix can be
# passed via ARGS (e.g. `n1` -> latency_n1.csv).
function benchmark()
    t = Table(msg_size = Int[], latency = Float64[], kind = Symbol[])
    send_buf = Vector{UInt8}(undef, MAX_MESSAGE_SIZE)

    size = 1
    while size <= MAX_MESSAGE_SIZE
        @info "sending" size
        flush(stderr)
        touch_data(send_buf, size)

        # Use fewer timed iterations for large messages to bound total runtime.
        if size > LARGE_MESSAGE_SIZE
            loop = LAT_LOOP_LARGE
            skip = LAT_SKIP_LARGE
        else
            loop = LAT_LOOP_SMALL
            skip = LAT_SKIP_SMALL
        end

        # Iterations -skip:0 are warmup; the clock starts at i == 1.
        # Initialize with UInt64(0) so `t_start` keeps one concrete type
        # (Base.time_ns() returns UInt64) instead of flipping Int -> UInt64.
        t_start = UInt64(0)
        for i in -skip:loop
            if i == 1
                t_start = Base.time_ns()
            end

            # Send a raw Array wrapping the first `size` bytes rather than a
            # SubArray `view`; GC.@preserve keeps `send_buf` rooted while the
            # unsafe wrapper aliases its memory.
            GC.@preserve send_buf begin
                ptr = pointer(send_buf)
                subset = Base.unsafe_wrap(Array, ptr, size)
                # avoid view
                UCX.Legacy.remotecall_wait(target, 2, subset)
            end
        end
        t_end = Base.time_ns()

        t_delta = t_end - t_start
        t_op = t_delta / loop  # mean ns per call over the timed loop

        # NOTE(review): kind is recorded as :distributed even though this
        # exercises the UCX legacy path — confirm downstream plotting expects it.
        push!(t, (msg_size = size, latency = t_op, kind = :distributed))

        size *= 2
    end

    # Optional filename suffix from the command line (e.g. `n1` -> latency_n1.csv).
    if length(ARGS) > 0
        suffix = string("_", ARGS[1])
    else
        suffix = ""
    end
    CSV.write(joinpath(@__DIR__, "latency$suffix.csv"), t)
end

# Run automatically when executed as a script, but not when this file is
# included from an interactive (REPL) session.
if !isinteractive()
benchmark()
end
68 changes: 68 additions & 0 deletions examples/benchmarks/legacy/latency_cuda.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
@everywhere using UCX
UCX.Legacy.wireup()

@everywhere using CUDA

include(joinpath(@__DIR__, "..", "config.jl"))

# No-op message sink defined on every process: the benchmark times only the
# delivery of the argument, so the remote body does nothing.
@everywhere target(::Any) = nothing

const MIN_MESSAGE_SIZE = 1  # smallest payload benchmarked, in bytes
const MAX_MESSAGE_SIZE = 1<<22  # largest payload benchmarked: 4 MiB
# const MAX_MESSAGE_SIZE = 4096
const LARGE_MESSAGE_SIZE = 8192  # above this, use the "large" loop/skip counts

const LAT_LOOP_SMALL = 10000  # timed iterations for small messages
const LAT_SKIP_SMALL = 100  # untimed warmup iterations for small messages
const LAT_LOOP_LARGE = 1000  # timed iterations for large messages
const LAT_SKIP_LARGE = 10  # untimed warmup iterations for large messages

# Write the ASCII byte 'A' (0x41) into the first `size` elements of
# `send_buf`, touching the region the benchmark is about to transmit.
function touch_data(send_buf, size)
    fill!(view(send_buf, 1:size), 'A' % UInt8)
end

# Measure round-trip latency of `UCX.Legacy.remotecall_wait` to worker 2
# for CUDA device buffers, doubling the payload from MIN_MESSAGE_SIZE up to
# MAX_MESSAGE_SIZE, and write the results to latency_cuda.csv.
function benchmark()
    t = Table(msg_size = Int[], latency = Float64[], kind = Symbol[])
    send_buf = CuArray{UInt8, 1}(undef, MAX_MESSAGE_SIZE)

    size = MIN_MESSAGE_SIZE
    while size <= MAX_MESSAGE_SIZE
        @info "sending" size
        flush(stderr)
        touch_data(send_buf, size)

        # Use fewer timed iterations for large messages to bound total runtime.
        if size > LARGE_MESSAGE_SIZE
            loop = LAT_LOOP_LARGE
            skip = LAT_SKIP_LARGE
        else
            loop = LAT_LOOP_SMALL
            skip = LAT_SKIP_SMALL
        end

        # Iterations -skip:0 are warmup; the clock starts at i == 1.
        # Initialize with UInt64(0) so `t_start` keeps one concrete type
        # (Base.time_ns() returns UInt64) instead of flipping Int -> UInt64.
        t_start = UInt64(0)
        for i in -skip:loop
            if i == 1
                t_start = Base.time_ns()
            end

            UCX.Legacy.remotecall_wait(target, 2, view(send_buf, 1:size))
        end
        t_end = Base.time_ns()

        t_delta = t_end - t_start
        t_op = t_delta / loop  # mean ns per call over the timed loop

        # NOTE(review): kind is recorded as :distributed even though this
        # exercises the UCX legacy path — confirm downstream plotting expects it.
        push!(t, (msg_size = size, latency = t_op, kind = :distributed))

        size *= 2
    end

    CSV.write(joinpath(@__DIR__, "latency_cuda.csv"), t)
end

# Run automatically when executed as a script, but not when this file is
# included from an interactive (REPL) session.
if !isinteractive()
benchmark()
end
Loading