Fix Linear layer bias gradient computation; add size checks to CUDA functions #170

Merged: 4 commits, Aug 15, 2022. The diff below shows changes from 2 of the commits.

55 changes: 40 additions & 15 deletions coaster-blas/src/frameworks/cuda/helper.rs
@@ -106,6 +106,7 @@ macro_rules! iblas_copy_for_cuda {
x: &SharedTensor<$t>,
y: &mut SharedTensor<$t>,
) -> Result<(), ::coaster::error::Error> {
assert_eq!(x.desc().size(), y.desc().size());
let n = x.desc().size() as i32;
let x_mem = read!(x, self);
let y_mem = write_only!(y, self);
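
Worth noting: desc().size() here is (assuming coaster's TensorDesc semantics) the total element count, i.e. the product of all dimensions, so the new assert checks that x and y hold the same number of elements rather than the same shape, which is the right invariant for a flat BLAS copy. A tiny standalone sketch:

```rust
// Sketch only, assuming coaster's TensorDesc semantics where size() is
// the product of all dimensions: the new assert compares total element
// counts, not shapes.

fn size(desc: &[usize]) -> usize {
    desc.iter().product()
}

fn main() {
    // Same element count, different shape: the copy still proceeds.
    assert_eq!(size(&[2, 3]), size(&[3, 2]));
    // Different element count: the new assert_eq! panics up front instead
    // of letting the underlying cuBLAS copy run past the smaller buffer.
    assert_ne!(size(&[2, 3]), size(&[2, 2]));
}
```
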
@@ -250,32 +251,56 @@ macro_rules! iblas_gemm_for_cuda {
beta: &SharedTensor<$t>,
c: &mut SharedTensor<$t>,
) -> Result<(), ::coaster::error::Error> {
let c_desc = c.desc().clone();
let alpha_mem = read!(alpha, self);
let beta_mem = read!(beta, self);
let a_mem = read!(a, self);
let b_mem = read!(b, self);
let c_mem = write_only!(c, self);
use Transpose as T;

// Determine the dimensions of all the matrices.
// We always treat the first dimension as the number of rows and all
// the subsequent dimensions combined as the "columns".
let a_0 = a.desc()[0] as i32;
let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
let b_0 = b.desc()[0] as i32;
let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32;
let n = match bt {
Transpose::NoTrans => b_1,
_ => b_0,
};
let (m, k) = match at {
Transpose::NoTrans => (a_0, a_1),
_ => (a_1, a_0),
let c_0 = c.desc()[0] as i32;
let c_1 = c.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;

let (m, n, k) = match (at, bt) {
(T::NoTrans, T::NoTrans) => {
assert_eq!(a_1, b_0);
(a_0, b_1, a_1)
}
(T::NoTrans, T::Trans | T::ConjTrans) => {
assert_eq!(a_1, b_1);
(a_0, b_0, a_1)
}
(T::Trans | T::ConjTrans, T::NoTrans) => {
assert_eq!(a_0, b_0);
(a_1, b_1, a_0)
}
(T::Trans | T::ConjTrans, T::Trans | T::ConjTrans) => {
assert_eq!(a_0, b_1);
(a_1, b_0, a_0)
}
};

// Verify that C dimensions match.
assert_eq!(c_0, m);
assert_eq!(c_1, n);

let lda = a_1;
let ldb = b_1;
let ldc = c_1;

let ctx: &cublas::Context = self.framework().cublas();

let alpha_mem = read!(alpha, self);
let beta_mem = read!(beta, self);
let a_mem = read!(a, self);
let b_mem = read!(b, self);
let c_mem = write_only!(c, self);

// cuBLAS uses column-major matrix format, while SharedTensor is row-major.
// To compute AxB = C, we instead compute BᵀxAᵀ = Cᵀ and treat the transposed
// column-major matrix as a normal (non-transposed) row-major one.
exec!(
gemm,
(*ctx).gemm(
@@ -285,7 +310,7 @@
m,
k,
trans!(alpha_mem, $t),
trans!(b_mem, $t), // matrix a and b are switched to make it work with row-major memory layout.
trans!(b_mem, $t),
ldb,
trans!(a_mem, $t),
lda,
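
The operand swap in the cuBLAS call above (b_mem passed before a_mem) is the standard row-major trick: cuBLAS expects column-major storage, and since (A·B)ᵀ = Bᵀ·Aᵀ, handing it the row-major buffers unchanged with the operands swapped yields C directly in row-major order. The dimension bookkeeping is what the PR tightens; here is a standalone sketch (not the PR code itself) of the new (m, n, k) selection, with each tensor flattened to (first dimension, product of the rest):

```rust
// Standalone sketch mirroring the new match in iblas_gemm_for_cuda.

#[allow(dead_code)]
enum Transpose {
    NoTrans,
    Trans,
    ConjTrans,
}

/// Returns (m, n, k) for C = op(A) * op(B) and checks, as the new
/// asserts do, that the inner dimensions agree.
fn gemm_dims(
    at: Transpose,
    (a_0, a_1): (i32, i32),
    bt: Transpose,
    (b_0, b_1): (i32, i32),
) -> (i32, i32, i32) {
    match (at, bt) {
        (Transpose::NoTrans, Transpose::NoTrans) => {
            assert_eq!(a_1, b_0); // (m x k) * (k x n)
            (a_0, b_1, a_1)
        }
        (Transpose::NoTrans, _) => {
            assert_eq!(a_1, b_1); // (m x k) * (n x k)ᵀ
            (a_0, b_0, a_1)
        }
        (_, Transpose::NoTrans) => {
            assert_eq!(a_0, b_0); // (k x m)ᵀ * (k x n)
            (a_1, b_1, a_0)
        }
        _ => {
            assert_eq!(a_0, b_1); // (k x m)ᵀ * (n x k)ᵀ
            (a_1, b_0, a_0)
        }
    }
}

fn main() {
    // A: 2x3, B: 3x4, no transposes -> C: 2x4, shared dim k = 3.
    let (m, n, k) = gemm_dims(Transpose::NoTrans, (2, 3), Transpose::NoTrans, (3, 4));
    assert_eq!((m, n, k), (2, 4, 3));
    // A stored 3x2 but used transposed (so effectively 2x3), B: 3x4 -> C: 2x4.
    let (m, n, k) = gemm_dims(Transpose::Trans, (3, 2), Transpose::NoTrans, (3, 4));
    assert_eq!((m, n, k), (2, 4, 3));
}
```

With the old code, n and (m, k) were picked independently of each other and C's shape was never checked; the new match asserts the inner dimensions agree and that C is m x n.
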
24 changes: 21 additions & 3 deletions juice/src/layers/common/linear.rs
@@ -35,19 +35,23 @@ pub struct Linear {

one: SharedTensor<f32>,
zero: SharedTensor<f32>,
ones_row: SharedTensor<f32>,
}

impl Linear {
/// Create a Linear layer from a LinearConfig.
pub fn from_config(config: &LinearConfig) -> Linear {
let one = native_scalar(1f32);
let zero = native_scalar(0f32);
let mut ones_row = SharedTensor::new(&vec![1]);
FillerType::fill_constant(&mut ones_row, 1.0);

Linear {
output_size: config.output_size,

one,
zero,
ones_row,
}
}

@@ -83,6 +87,7 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
output_gradient: &mut Vec<ArcLock<SharedTensor<f32>>>,
) {
let input = input_data[0].read().unwrap();
let batch_size = input.desc()[0];
// reshape top
let output_shape = self.calculate_output_shape(input.desc());
output_data[0].write().unwrap().resize(&output_shape).unwrap();
@@ -116,6 +121,10 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
if let Some(weight) = weights_gradient.get(1) {
weight.write().unwrap().resize(&(1, self.output_size)).unwrap();
}

// Reshape the row of 1s used to compute the bias gradient.
self.ones_row.resize(&vec![1, batch_size]).unwrap();
FillerType::fill_constant(&mut self.ones_row, 1.0);
}

fn exact_num_output_blobs(&self) -> Option<usize> {
@@ -215,10 +224,19 @@ impl<B: IBackend + LayerOps<f32>> ComputeParametersGradient<f32, B> for Linear {
.unwrap();

// gradient w.r.t. bias
// Technically, the gradient of vector b of length n to itself is the I_n identity matrix,
// so instead we'll just copy the output_gradient[0] vector into
// The bias gradient is output_gradients[0] summed over the batch dimension,
// which is a left-multiplication by a row of 1s. Since parameters_gradients[1]
// is a row vector, the result (1 x output_size) already has the right shape.
backend
.copy(&output_gradients[0], &mut parameters_gradients[1])
.gemm(
&self.one,
Transpose::NoTrans,
&self.ones_row,
Transpose::NoTrans,
output_gradients[0],
&self.zero,
parameters_gradients[1],
)
.unwrap();
}
}
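
The substantive fix: previously the bias gradient was computed with backend.copy(&output_gradients[0], &mut parameters_gradients[1]), writing a (batch_size x output_size) tensor into a (1 x output_size) buffer. That is only correct for batch_size == 1, and it is exactly the size mismatch the new assert in iblas_copy_for_cuda would now catch. The correct bias gradient sums the output gradients over the batch, which the new GEMM against ones_row performs. A pure-Rust sketch (not Juice code) of that math:

```rust
// Sketch of the bias-gradient math: for a batch of N samples the bias
// gradient is the column-wise sum of the output gradients, i.e.
//   grad_b (1 x out) = ones_row (1 x N) * dy (N x out).

fn bias_gradient(dy: &[Vec<f32>]) -> Vec<f32> {
    let mut grad_b = vec![0.0f32; dy[0].len()];
    // Left-multiplying by a row of ones is exactly a sum over the batch.
    for row in dy {
        for (g, &v) in grad_b.iter_mut().zip(row) {
            *g += v;
        }
    }
    grad_b
}

fn main() {
    // Batch of 3, output size 2.
    let dy = vec![vec![1.0, 2.0], vec![3.0, 4.0], vec![5.0, 6.0]];
    assert_eq!(bias_gradient(&dy), vec![9.0, 12.0]);
}
```
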