Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

first opencl experiments #798

Draft
wants to merge 17 commits into
base: main
Choose a base branch
from
10 changes: 10 additions & 0 deletions linalg/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ lazy_static = "1.4.0"
log = "0.4.14"
num-traits = "0.2.14"
tract-data = { version = "0.18.3-pre", path = "../data" }
opencl3 = { version = "0.8.1", optional = true }
paste = "1.0.5"
scan_fmt = "0.2.6"

Expand All @@ -44,6 +45,7 @@ proptest = "1.0.0"
# preferred.
no_fp16 = []
default = []
opencl = [ "opencl3" ]

[[bench]]
bench = false
Expand Down Expand Up @@ -97,3 +99,11 @@ harness = false
[[bench]]
name = "x86_64"
harness = false

[[bench]]
name = "opencl"
harness = false

[[bench]]
name = "opencl-gemv"
harness = false
287 changes: 287 additions & 0 deletions linalg/benches/opencl-gemv.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,287 @@
use std::ptr::null_mut;

use criterion::{measurement::Measurement, *};

use opencl3::{
command_queue::{CommandQueue, CL_QUEUE_PROFILING_ENABLE},
context::Context,
device::{Device, CL_DEVICE_TYPE_GPU},
event::Event,
kernel::{ExecuteKernel, Kernel},
memory::{Buffer, CL_MEM_READ_ONLY, CL_MEM_READ_WRITE},
platform::get_platforms,
program::Program,
types::{cl_float, CL_NON_BLOCKING},
};

/// Open an OpenCL context on the first GPU of the first available platform.
fn context() -> Context {
    let platform = get_platforms().unwrap().remove(0);
    let gpu_id = platform.get_devices(CL_DEVICE_TYPE_GPU).unwrap().remove(0);
    let gpu = Device::new(gpu_id);
    Context::from_device(&gpu).expect("Context::from_device failed")
}

/// Create a command queue on the context's default device with profiling
/// enabled, so events expose device-side start/end timestamps.
fn queue(context: &Context) -> CommandQueue {
    let device = context.default_device();
    CommandQueue::create_with_properties(context, device, CL_QUEUE_PROFILING_ENABLE, 0)
        .expect("CommandQueue::create failed")
}

/// Measure the fixed cost of a `finish()` round-trip on an idle queue —
/// a baseline for the per-launch overhead of the other benchmarks.
fn empty(c: &mut Criterion) {
    c.bench_function("empty", |b| {
        let ctx = context();
        let q = queue(&ctx);
        b.iter(|| q.finish().unwrap())
    });
}

/// Benchmark host-to-device transfers of various sizes via a non-blocking
/// `enqueue_write_buffer`, waiting for queue completion each iteration.
fn write_buffer(c: &mut Criterion) {
    let mut g = c.benchmark_group("write_buffer");
    for size in &[64, 256, 1024, 8 * 1024, 32 * 1024, 128 * 1024, 1024 * 1024] {
        // Throughput must be configured *before* the benchmark runs. The
        // original chained `.throughput()` after `bench_with_input`, so the
        // first size reported no throughput and each later size reported the
        // previous size's figure.
        g.throughput(Throughput::Elements(*size as _));
        g.bench_with_input(BenchmarkId::new("write_buf", size.to_string()), size, |b, s| {
            let ctx = context();
            let q = queue(&ctx);
            let v = vec![0f32; *s];
            let mut cl =
                Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, v.len(), null_mut()).unwrap();
            b.iter(move || {
                q.enqueue_write_buffer(&mut cl, CL_NON_BLOCKING, 0, &v, &[]).unwrap();
                q.finish().unwrap();
            });
        });
    }
    g.finish();
}

// OpenCL source for a naive GEMV: one work item per output row of y = A*x.
// `a` is offset assuming a packed 4-row panel layout (see the column sketch
// inside the kernel). NOTE(review): the k-loop then reads the row with
// stride 1 — confirm the host-side packing produces that layout. The `m`
// argument is not referenced inside the kernel body.
static GEMV1: &'static str = "__kernel void gemv1(__global const float *a,__global const float *x,
__global float *y, int m, int n) {
float sum = 0.0f;
int row = get_global_id(0);
// a e
// b f
// c g
// d h
a += (row % 4) + (row / 4) * 4 * n;
for (int k=0 ; k<n ; k++) {
sum += a[k] * x[k];
}
y[row] = sum;
}";

/// Ad-hoc profiler for the GEMV1 kernel (not a criterion benchmark):
/// launches it `iters` times with local work size `l` and derives Gflop/s
/// from the device profiling timestamps. Wire it up through the
/// commented-out `main` at the bottom of this file.
#[allow(dead_code)]
fn profile_gemv1() {
    // let (m, n, l, iters) = (1024, 32, 2, 10000);
    // let (m, n, l, iters) = (1024, 1024, 2, 10000);
    let (m, n, l, iters) = (16, 16, 2, 10000);
    let ctx = context();
    let a = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, m * n, null_mut()).unwrap();
    let x = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, n, null_mut()).unwrap();
    let y = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_WRITE, m, null_mut()).unwrap();

    let queue = queue(&ctx);
    let program = Program::create_and_build_from_source(&ctx, GEMV1, "").unwrap();
    let kernel = Kernel::create(&program, "gemv1").expect("Kernel::create failed");
    let mut ns = 0;
    for _ in 0..iters {
        let mut run = ExecuteKernel::new(&kernel);
        let event = run
            .set_arg(&a)
            .set_arg(&x)
            .set_arg(&y)
            .set_arg(&(m as i32))
            .set_arg(&(n as i32))
            .set_global_work_sizes(&[m])
            // Use the tuple's `l` rather than a hard-coded 2, so the
            // alternative configurations above actually take effect.
            .set_local_work_sizes(&[l])
            .enqueue_nd_range(&queue)
            .unwrap();
        event.wait().unwrap();
        ns += event.profiling_command_end().unwrap() - event.profiling_command_start().unwrap();
    }
    // 2 flops (multiply + add) per matrix element per launch; `ns` is the
    // accumulated device time in nanoseconds, so flops/ns == Gflop/s.
    let gigaflops = (2 * m * n) as f32 * iters as f32 / ns as f32;
    dbg!(gigaflops);
}

/// Benchmark one (m, n, local-size) configuration of the GEMV1 kernel.
/// Enqueues `loops` launches, each chained on the previous launch's event,
/// and blocks only on the last one — amortizing host round-trip overhead
/// over the whole chain. Throughput counts one element per matrix cell per
/// launch.
fn bench_gemv1_bench_one(c: &mut Criterion, name: &str, m: usize, n: usize, loc: usize) {
    let loops = 100;
    c.benchmark_group(format!("gemv1-{}-{}x{}by{}", name, m, n, loc))
        .throughput(Throughput::Elements((m * n * loops) as _))
        .bench_function("loop", |b| {
            let ctx = context();
            let a = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, m * n, null_mut()).unwrap();
            let x = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, n, null_mut()).unwrap();
            let y = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_WRITE, m, null_mut()).unwrap();

            let queue = queue(&ctx);
            let program = Program::create_and_build_from_source(&ctx, GEMV1, "").unwrap();
            let kernel = Kernel::create(&program, "gemv1").expect("Kernel::create failed");
            b.iter(|| {
                let mut event: Option<Event> = None;
                // `_`: only the iteration count matters (the original bound
                // an unused `i`, triggering an unused-variable warning).
                for _ in 0..loops {
                    let mut run = ExecuteKernel::new(&kernel);
                    // Serialize launches by waiting on the previous event.
                    if let Some(e) = event {
                        run.set_wait_event(&e);
                    }
                    event = Some(
                        run.set_arg(&a)
                            .set_arg(&x)
                            .set_arg(&y)
                            .set_arg(&(m as i32))
                            .set_arg(&(n as i32))
                            .set_global_work_sizes(&[m])
                            .set_local_work_sizes(&[loc])
                            .enqueue_nd_range(&queue)
                            .unwrap(),
                    );
                }
                event.unwrap().wait().unwrap();
            });
        });
}

// OpenCL source for a cooperative GEMV: the dot product for each row is
// split across the column dimension of the work group, partial sums land in
// the `work` local buffer (rows*cols floats, indexed work[ii + rows*jj]),
// then a log2(cols) tree reduction with barriers folds them; work item
// jj == 0 writes the result. `a` is addressed column-major (a[row + m*k]).
static GEMV2: &'static str = "
#define ROW_DIM 0
#define COL_DIM 1
__kernel void gemv2(__global const float * a,
__global const float * x,
__global float * y,
__local float * work,
int m, int n)
{
float sum = (float)0;
for (int k=get_global_id(COL_DIM);k<n;k+=get_global_size(COL_DIM)) {
sum += a[get_global_id(ROW_DIM)+m*k] * x[k];
}

int rows = get_local_size(ROW_DIM); // rows in group
int cols = get_local_size(COL_DIM); // initial cols in group
int ii = get_local_id(ROW_DIM); // local row index in group, 0<=ii<rows
int jj = get_local_id(COL_DIM); // block index in column, 0<=jj<cols
work[ii+rows*jj] = sum;
barrier(CLK_LOCAL_MEM_FENCE); // sync group

while ( cols > 1 ) {
cols >>= 1;
if (jj < cols) work[ii+rows*jj] += work[ii+rows*(jj+cols)];
barrier(CLK_LOCAL_MEM_FENCE); // sync group
}

if ( jj == 0 ) y[get_global_id(ROW_DIM)] = work[ii];
}";

/// Benchmark one (m, n, local-size) configuration of the GEMV2 kernel,
/// which reduces partial sums through local memory. `p` work items
/// cooperate on each row's dot product.
fn bench_gemv2_bench_one(c: &mut Criterion, name: &str, m: usize, n: usize, loc: usize) {
    let p = 4;
    c.benchmark_group(format!("gemv2-{}-{}x{}by{}", name, m, n, loc))
        .throughput(Throughput::Elements((m * n) as _))
        .bench_function("loop", |b| {
            let ctx = context();
            let a = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, m * n, null_mut()).unwrap();
            let x = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, n, null_mut()).unwrap();
            let y = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_WRITE, m, null_mut()).unwrap();

            let queue = queue(&ctx);
            let program = Program::create_and_build_from_source(&ctx, GEMV2, "").unwrap();
            let kernel = Kernel::create(&program, "gemv2").expect("Kernel::create failed");
            b.iter(|| {
                let mut run = ExecuteKernel::new(&kernel);
                let event = run
                    .set_arg(&a)
                    .set_arg(&x)
                    .set_arg(&y)
                    // set_arg_local_buffer takes a size in BYTES; the kernel
                    // indexes `work` over loc*p floats, so the original
                    // `loc * p` allocated a buffer 4x too small.
                    .set_arg_local_buffer(loc * p * std::mem::size_of::<cl_float>())
                    .set_arg(&(m as i32))
                    .set_arg(&(n as i32))
                    .set_global_work_sizes(&[m, p])
                    .set_local_work_sizes(&[loc, p])
                    .enqueue_nd_range(&queue)
                    .unwrap();
                event.wait().unwrap();
            });
        });
}

// OpenCL source for a GEMV over A packed in 4x4 tiles so each step loads 16
// consecutive floats (a full cache line) via vload4. The inner l-loop
// repeats the same mad 100x to inflate arithmetic intensity for
// benchmarking (matching the *100*2 throughput factor in the bench below).
//
// Fix: the row offset was written `4 * row % 4`, which C parses as
// `(4 * row) % 4` == 0 (`*` and `%` have equal precedence, left-assoc), so
// every work item read tile row 0. The intended offset is `4 * (row % 4)`.
static GEMV3: &'static str = "__kernel void gemv3(__global const float *a,__global const float *x,
__global float *y, int m, int n) {
// A packed to load 16 values, a full cache line in one go
// a b c d q r s t
// e f g h u v...
// i j k l
// m n o p
// panel len is n*4. panel id is row / 4 -> n * row
float4 sum1 = (float4)(0.0f, 0.0f, 0.0f, 0.0f);
int row = get_global_id(0);
__global const float *pa = &a[4 * n * (row / 4) + 4 * (row % 4)];
for (int k=0 ; k<n/4 ; k++) {
float4 w = vload4(0, &pa[16 * k]);
float4 b = vload4(0, &x[4 * k]);
for (int l = 0; l < 10; l++) {
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
sum1 = mad(w, b, sum1);
}
}
y[row] = sum1.x + sum1.y + sum1.z + sum1.w;
}";

/// Benchmark one (m, n, local-size) configuration of the GEMV3 kernel.
/// Enqueues `loops` independent launches and drains the queue once with
/// `finish()`. The throughput factor 100*2 accounts for the kernel's 100
/// repeated mads (10x10 inner loop) at 2 flops each per element.
fn bench_gemv3_bench_one(c: &mut Criterion, name: &str, m: usize, n: usize, loc: usize) {
    let loops = 100;
    c.benchmark_group(format!("gemv3-{}-{}x{}by{}", name, m, n, loc))
        .throughput(Throughput::Elements((m * n * loops * 100 * 2) as _))
        .bench_function("loop", |b| {
            let ctx = context();
            let a = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, m * n, null_mut()).unwrap();
            let x = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_ONLY, n, null_mut()).unwrap();
            let y = Buffer::<cl_float>::create(&ctx, CL_MEM_READ_WRITE, m, null_mut()).unwrap();

            let q = queue(&ctx);
            let program = Program::create_and_build_from_source(&ctx, GEMV3, "").unwrap();
            let kernel = Kernel::create(&program, "gemv3").expect("Kernel::create failed");
            b.iter(|| {
                // `_`: only the iteration count matters (the original bound
                // an unused `i`, triggering an unused-variable warning).
                for _ in 0..loops {
                    let mut run = ExecuteKernel::new(&kernel);
                    run.set_arg(&a)
                        .set_arg(&x)
                        .set_arg(&y)
                        .set_arg(&(m as i32))
                        .set_arg(&(n as i32))
                        .set_global_work_sizes(&[m])
                        .set_local_work_sizes(&[loc])
                        .enqueue_nd_range(&q)
                        .unwrap();
                }
                q.finish().unwrap();
            });
        });
}

/// Sweep all three GEMV kernels over a grid of matrix sizes and local work
/// sizes.
fn gemv(c: &mut Criterion) {
    let local_sizes = [1, 2, 4, 8, 16];
    let dims = [16, 32, 64, 128, 256, 1024];
    for &loc in &local_sizes {
        for &m in &dims {
            for &n in &dims {
                bench_gemv1_bench_one(c, "gemv1", m, n, loc);
                bench_gemv2_bench_one(c, "gemv2", m, n, loc);
                bench_gemv3_bench_one(c, "gemv3", m, n, loc);
            }
        }
    }
}

// Register the criterion benchmarks. `profile_gemv1` is deliberately not
// listed: it is a raw profiling loop, usable via the alternative `main`
// kept (commented out) below in place of criterion_main!.
criterion_group!(benches, empty, gemv, write_buffer);
criterion_main!(benches);

/*
fn main() {
    profile_gemv1()
}
*/
Loading