From 6952a490de290fe86739d3fa0764cbb1084da6a3 Mon Sep 17 00:00:00 2001
From: Mikhail Balakhno <4531463+hweom@users.noreply.github.com>
Date: Mon, 15 Aug 2022 05:41:53 -0700
Subject: [PATCH] Fix Linear layer bias gradient computation; add size checks
 to CUDA functions (#170)

* Assert the correct tensor sizes in copy() and gemm(); fix related Linear logic

* Check output matrix dims in GEMM; fix corresponding Linear layer logic

* Update coaster-blas/src/frameworks/cuda/helper.rs

Co-authored-by: Bernhard Schuster

Co-authored-by: Mikhail Balakhno <{ID}+{username}@users.noreply.github.com>
Co-authored-by: Bernhard Schuster
---
 coaster-blas/src/frameworks/cuda/helper.rs | 55 ++++++++++++++++------
 juice/src/layers/common/linear.rs          | 24 ++++++++--
 2 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/coaster-blas/src/frameworks/cuda/helper.rs b/coaster-blas/src/frameworks/cuda/helper.rs
index ebda394dd..44260e891 100644
--- a/coaster-blas/src/frameworks/cuda/helper.rs
+++ b/coaster-blas/src/frameworks/cuda/helper.rs
@@ -106,6 +106,7 @@ macro_rules! iblas_copy_for_cuda {
             x: &SharedTensor<$t>,
             y: &mut SharedTensor<$t>,
         ) -> Result<(), ::coaster::error::Error> {
+            assert_eq!(x.desc().size(), y.desc().size());
             let n = x.desc().size() as i32;
             let x_mem = read!(x, self);
             let y_mem = write_only!(y, self);
@@ -250,32 +251,56 @@ macro_rules! iblas_gemm_for_cuda {
             beta: &SharedTensor<$t>,
             c: &mut SharedTensor<$t>,
         ) -> Result<(), ::coaster::error::Error> {
-            let c_desc = c.desc().clone();
-            let alpha_mem = read!(alpha, self);
-            let beta_mem = read!(beta, self);
-            let a_mem = read!(a, self);
-            let b_mem = read!(b, self);
-            let c_mem = write_only!(c, self);
+            use Transpose::{NoTrans, ConjTrans, Trans};
+
+            // Determine the dimensions of all the matrices.
+            // We always treat the first dimension as the number of rows and all
+            // the subsequent dimensions combined as the "columns".
             let a_0 = a.desc()[0] as i32;
             let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
             let b_0 = b.desc()[0] as i32;
             let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
-            let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32;
-            let n = match bt {
-                Transpose::NoTrans => b_1,
-                _ => b_0,
-            };
-            let (m, k) = match at {
-                Transpose::NoTrans => (a_0, a_1),
-                _ => (a_1, a_0),
+            let c_0 = c.desc()[0] as i32;
+            let c_1 = c.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
+
+            let (m, n, k) = match (at, bt) {
+                (NoTrans, NoTrans) => {
+                    assert_eq!(a_1, b_0);
+                    (a_0, b_1, a_1)
+                }
+                (NoTrans, Trans | ConjTrans) => {
+                    assert_eq!(a_1, b_1);
+                    (a_0, b_0, a_1)
+                }
+                (Trans | ConjTrans, NoTrans) => {
+                    assert_eq!(a_0, b_0);
+                    (a_1, b_1, a_0)
+                }
+                (Trans | ConjTrans, Trans | ConjTrans) => {
+                    assert_eq!(a_0, b_1);
+                    (a_1, b_0, a_0)
+                }
             };
+
+            // Verify that the C dimensions match.
+            assert_eq!(c_0, m);
+            assert_eq!(c_1, n);
+
             let lda = a_1;
             let ldb = b_1;
             let ldc = c_1;
 
             let ctx: &cublas::Context = self.framework().cublas();
 
+            let alpha_mem = read!(alpha, self);
+            let beta_mem = read!(beta, self);
+            let a_mem = read!(a, self);
+            let b_mem = read!(b, self);
+            let c_mem = write_only!(c, self);
+
+            // cuBLAS uses column-major matrix format, while SharedTensor is row-major.
+            // To compute AxB = C, we instead compute BᵀxAᵀ = Cᵀ and treat the transposed
+            // column-major matrix as a normal (non-transposed) row-major one.
             exec!(
                 gemm,
                 (*ctx).gemm(
@@ -285,7 +310,7 @@ macro_rules! iblas_gemm_for_cuda {
                     m,
                     k,
                     trans!(alpha_mem, $t),
-                    trans!(b_mem, $t), // matrix a and b are switched to make it work with row-major memory layout.
+                    trans!(b_mem, $t),
                     ldb,
                     trans!(a_mem, $t),
                     lda,
diff --git a/juice/src/layers/common/linear.rs b/juice/src/layers/common/linear.rs
index 7f09bfda4..38d90b993 100644
--- a/juice/src/layers/common/linear.rs
+++ b/juice/src/layers/common/linear.rs
@@ -35,6 +35,7 @@ pub struct Linear {
 
     one: SharedTensor<f32>,
     zero: SharedTensor<f32>,
+    ones_row: SharedTensor<f32>,
 }
 
 impl Linear {
@@ -42,12 +43,15 @@ impl Linear {
     pub fn from_config(config: &LinearConfig) -> Linear {
         let one = native_scalar(1f32);
         let zero = native_scalar(0f32);
+        let mut ones_row = SharedTensor::new(&vec![1]);
+        FillerType::fill_constant(&mut ones_row, 1.0);
 
         Linear {
             output_size: config.output_size,
             one,
             zero,
+            ones_row,
         }
     }
@@ -83,6 +87,7 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
         output_gradient: &mut Vec<ArcLock<SharedTensor<f32>>>,
     ) {
         let input = input_data[0].read().unwrap();
+        let batch_size = input.desc()[0];
         // reshape top
         let output_shape = self.calculate_output_shape(input.desc());
         output_data[0].write().unwrap().resize(&output_shape).unwrap();
@@ -116,6 +121,10 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
         if let Some(weight) = weights_gradient.get(1) {
             weight.write().unwrap().resize(&(1, self.output_size)).unwrap();
         }
+
+        // Reshape the row of 1s which is used to compute the bias gradient.
+        self.ones_row.resize(&vec![1, batch_size]).unwrap();
+        FillerType::fill_constant(&mut self.ones_row, 1.0);
     }
 
     fn exact_num_output_blobs(&self) -> Option<usize> {
@@ -215,10 +224,19 @@ impl<B: IBackend + LayerOps<f32>> ComputeParametersGradient<f32, B> for Linear {
             .unwrap();
 
         // gradient w.r.t bias
-        // Technically, the gradient of vector b of length n to itself is the I_n identity matrix,
-        // so instead we'll just copy the output_gradient[0] vector into
+        // The gradient of the bias b w.r.t. itself is the identity, so the bias gradient is
+        // output_gradients[0] summed over the batch: since parameters_gradients[1] is stored
+        // as a row vector, compute ones_row (1 x batch_size) * output_gradients[0].
         backend
-            .copy(&output_gradients[0], &mut parameters_gradients[1])
+            .gemm(
+                &self.one,
+                Transpose::NoTrans,
+                &self.ones_row,
+                Transpose::NoTrans,
+                output_gradients[0],
+                &self.zero,
+                parameters_gradients[1],
+            )
            .unwrap();
     }
 }
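
Note (editorial sketch, not part of the patch): the block comment added in the GEMM hunk describes the
row-major/column-major swap only in words. The self-contained Rust program below illustrates the same
trick with a naive stand-in for the cuBLAS call; gemm_col_major and every name in it are illustrative
assumptions, not coaster or cuBLAS API.

// gemm_layout_sketch.rs -- illustrative only; assumed names, not coaster/cuBLAS API.

/// Naive column-major GEMM without transpose flags:
/// c (m x n) = a (m x k) * b (k x n), every buffer stored column by column,
/// with leading dimensions lda, ldb, ldc (the row count of each buffer).
fn gemm_col_major(
    m: usize, n: usize, k: usize,
    a: &[f32], lda: usize,
    b: &[f32], ldb: usize,
    c: &mut [f32], ldc: usize,
) {
    for i in 0..m {
        for j in 0..n {
            let mut acc = 0.0;
            for p in 0..k {
                acc += a[i + p * lda] * b[p + j * ldb];
            }
            c[i + j * ldc] = acc;
        }
    }
}

fn main() {
    // Row-major A (2 x 3) and B (3 x 2); we want row-major C (2 x 2) = A * B.
    let a = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
    let b = [7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
    let (m, n, k) = (2usize, 2usize, 3usize);
    let mut c = vec![0.0f32; m * n];

    // The same swap the patch performs when calling cuBLAS: pass B before A and swap
    // m and n. The column-major routine then sees Bᵀ (n x k) times Aᵀ (k x m) and writes
    // Cᵀ in column-major order, which is byte-for-byte the row-major C buffer we want.
    gemm_col_major(n, m, k, &b, n, &a, k, &mut c, n);

    assert_eq!(c, vec![58.0, 64.0, 139.0, 154.0]);
    println!("row-major C = {:?}", c);
}

Running it prints the row-major C buffer [58, 64, 139, 154], exactly what a row-major A*B produces,
even though the inner routine only understands column-major storage.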
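A second sketch, also not part of the patch, shows why the new bias-gradient GEMM in linear.rs is
equivalent to summing output_gradients[0] over the batch dimension: multiplying a 1 x batch_size row
of ones by the batch_size x output_size gradient matrix yields the per-column sums. The matmul helper
and the example numbers are assumptions for illustration only.

// bias_gradient_sketch.rs -- illustrative only; assumed names, not juice API.

/// Naive row-major GEMM: c (m x n) = a (m x k) * b (k x n).
fn matmul(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
    let mut c = vec![0.0; m * n];
    for i in 0..m {
        for j in 0..n {
            for p in 0..k {
                c[i * n + j] += a[i * k + p] * b[p * n + j];
            }
        }
    }
    c
}

fn main() {
    let batch_size = 3;
    let output_size = 2;
    // Output gradient for a batch of 3 examples with 2 outputs each, row-major.
    let output_gradient = [1.0, 10.0, 2.0, 20.0, 3.0, 30.0];

    // Bias gradient via GEMM, as in the patched linear.rs:
    // (1 x batch_size) row of ones times (batch_size x output_size) gradient.
    let ones_row = vec![1.0f32; batch_size];
    let bias_grad = matmul(&ones_row, &output_gradient, 1, batch_size, output_size);

    // Reference: sum the output gradient over the batch dimension.
    let mut expected = vec![0.0f32; output_size];
    for b in 0..batch_size {
        for o in 0..output_size {
            expected[o] += output_gradient[b * output_size + o];
        }
    }

    assert_eq!(bias_grad, expected); // both are [6.0, 60.0]
    println!("bias gradient = {:?}", bias_grad);
}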