From 6952a490de290fe86739d3fa0764cbb1084da6a3 Mon Sep 17 00:00:00 2001
From: Mikhail Balakhno <4531463+hweom@users.noreply.github.com>
Date: Mon, 15 Aug 2022 05:41:53 -0700
Subject: [PATCH] Fix Linear layer bias gradient computation; add size checks
 to CUDA functions (#170)

* Assert the correct tensor sizes in copy() and gemm(); fix related Linear logic

* Check output matrix dims in GEMM; fix corresponding Linear layer logic

* Update coaster-blas/src/frameworks/cuda/helper.rs

Co-authored-by: Bernhard Schuster

Co-authored-by: Mikhail Balakhno <{ID}+{username}@users.noreply.github.com>
Co-authored-by: Bernhard Schuster
---
 coaster-blas/src/frameworks/cuda/helper.rs | 55 ++++++++++++++++------
 juice/src/layers/common/linear.rs          | 24 ++++++++--
 2 files changed, 61 insertions(+), 18 deletions(-)

diff --git a/coaster-blas/src/frameworks/cuda/helper.rs b/coaster-blas/src/frameworks/cuda/helper.rs
index ebda394dd..44260e891 100644
--- a/coaster-blas/src/frameworks/cuda/helper.rs
+++ b/coaster-blas/src/frameworks/cuda/helper.rs
@@ -106,6 +106,7 @@ macro_rules! iblas_copy_for_cuda {
             x: &SharedTensor<$t>,
             y: &mut SharedTensor<$t>,
         ) -> Result<(), ::coaster::error::Error> {
+            assert_eq!(x.desc().size(), y.desc().size());
             let n = x.desc().size() as i32;
             let x_mem = read!(x, self);
             let y_mem = write_only!(y, self);
@@ -250,32 +251,56 @@ macro_rules! iblas_gemm_for_cuda {
             beta: &SharedTensor<$t>,
             c: &mut SharedTensor<$t>,
         ) -> Result<(), ::coaster::error::Error> {
-            let c_desc = c.desc().clone();
-            let alpha_mem = read!(alpha, self);
-            let beta_mem = read!(beta, self);
-            let a_mem = read!(a, self);
-            let b_mem = read!(b, self);
-            let c_mem = write_only!(c, self);
+            use Transpose::{NoTrans, ConjTrans, Trans};
+
+            // Determine the dimensions of all the matrices.
+            // We always treat the first dimension as the number of rows and all
+            // the subsequent dimensions combined as the "columns".
             let a_0 = a.desc()[0] as i32;
             let a_1 = a.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
             let b_0 = b.desc()[0] as i32;
             let b_1 = b.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
-            let c_1 = c_desc.iter().skip(1).fold(1, |prod, i| prod * i) as i32;
-            let n = match bt {
-                Transpose::NoTrans => b_1,
-                _ => b_0,
-            };
-            let (m, k) = match at {
-                Transpose::NoTrans => (a_0, a_1),
-                _ => (a_1, a_0),
+            let c_0 = c.desc()[0] as i32;
+            let c_1 = c.desc().iter().skip(1).fold(1, |prod, i| prod * i) as i32;
+
+            let (m, n, k) = match (at, bt) {
+                (NoTrans, NoTrans) => {
+                    assert_eq!(a_1, b_0);
+                    (a_0, b_1, a_1)
+                }
+                (NoTrans, Trans | ConjTrans) => {
+                    assert_eq!(a_1, b_1);
+                    (a_0, b_0, a_1)
+                }
+                (Trans | ConjTrans, NoTrans) => {
+                    assert_eq!(a_0, b_0);
+                    (a_1, b_1, a_0)
+                }
+                (Trans | ConjTrans, Trans | ConjTrans) => {
+                    assert_eq!(a_0, b_1);
+                    (a_1, b_0, a_0)
+                }
             };
+
+            // Verify that the C dimensions match.
+            assert_eq!(c_0, m);
+            assert_eq!(c_1, n);
+
             let lda = a_1;
             let ldb = b_1;
             let ldc = c_1;
 
             let ctx: &cublas::Context = self.framework().cublas();
 
+            let alpha_mem = read!(alpha, self);
+            let beta_mem = read!(beta, self);
+            let a_mem = read!(a, self);
+            let b_mem = read!(b, self);
+            let c_mem = write_only!(c, self);
+
+            // cuBLAS uses column-major matrix format, while SharedTensor is row-major.
+            // To compute AxB = C, we instead compute BᵀxAᵀ = Cᵀ and treat the transposed
+            // column-major matrix as a normal (non-transposed) row-major one.
             exec!(
                 gemm,
                 (*ctx).gemm(
@@ -285,7 +310,7 @@ macro_rules! iblas_gemm_for_cuda {
                     m,
                     k,
                     trans!(alpha_mem, $t),
-                    trans!(b_mem, $t), // matrix a and b are switched to make it work with row-major memory layout.
+                    trans!(b_mem, $t),
                     ldb,
                     trans!(a_mem, $t),
                     lda,
diff --git a/juice/src/layers/common/linear.rs b/juice/src/layers/common/linear.rs
index 7f09bfda4..38d90b993 100644
--- a/juice/src/layers/common/linear.rs
+++ b/juice/src/layers/common/linear.rs
@@ -35,6 +35,7 @@ pub struct Linear {
 
     one: SharedTensor<f32>,
     zero: SharedTensor<f32>,
+    ones_row: SharedTensor<f32>,
 }
 
 impl Linear {
@@ -42,12 +43,15 @@ impl Linear {
     pub fn from_config(config: &LinearConfig) -> Linear {
         let one = native_scalar(1f32);
         let zero = native_scalar(0f32);
+        let mut ones_row = SharedTensor::new(&vec![1]);
+        FillerType::fill_constant(&mut ones_row, 1.0);
 
         Linear {
             output_size: config.output_size,
             one,
             zero,
+            ones_row,
         }
     }
@@ -83,6 +87,7 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
         output_gradient: &mut Vec<ArcLock<SharedTensor<f32>>>,
     ) {
         let input = input_data[0].read().unwrap();
+        let batch_size = input.desc()[0];
         // reshape top
         let output_shape = self.calculate_output_shape(input.desc());
         output_data[0].write().unwrap().resize(&output_shape).unwrap();
@@ -116,6 +121,10 @@ impl<B: IBackend + LayerOps<f32>> ILayer<B> for Linear {
         if let Some(weight) = weights_gradient.get(1) {
             weight.write().unwrap().resize(&(1, self.output_size)).unwrap();
         }
+
+        // Reshape the row of 1s which is used to compute the bias gradient.
+        self.ones_row.resize(&vec![1, batch_size]).unwrap();
+        FillerType::fill_constant(&mut self.ones_row, 1.0);
     }
 
     fn exact_num_output_blobs(&self) -> Option<usize> {
@@ -215,10 +224,19 @@ impl<B: IBackend + LayerOps<f32>> ComputeParametersGradient<f32, B> for Linear {
             .unwrap();
 
         // gradient w.r.t bias
-        // Technically, the gradient of vector b of length n to itself is the I_n identity matrix,
-        // so instead we'll just copy the output_gradient[0] vector into
+        // The gradient of the bias b w.r.t. itself is the identity, so the bias gradient is
+        // output_gradients[0] summed over the batch: since parameters_gradients[1] is stored
+        // as a row vector, compute ones_row (1 x batch_size) * output_gradients[0].
         backend
-            .copy(&output_gradients[0], &mut parameters_gradients[1])
+            .gemm(
+                &self.one,
+                Transpose::NoTrans,
+                &self.ones_row,
+                Transpose::NoTrans,
+                output_gradients[0],
+                &self.zero,
+                parameters_gradients[1],
+            )
            .unwrap();
     }
 }
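
Note (editorial sketch, not part of the patch): the block comment added in the GEMM hunk describes the
row-major/column-major swap only in words. The self-contained Rust program below illustrates the same
trick with a naive stand-in for the cuBLAS call; gemm_col_major and every name in it are illustrative
assumptions, not coaster or cuBLAS API.

// gemm_layout_sketch.rs -- illustrative only; assumed names, not coaster/cuBLAS API.

/// Naive column-major GEMM without transpose flags:
/// c (m x n) = a (m x k) * b (k x n), every buffer stored column by column,
/// with leading dimensions lda, ldb, ldc (the row count of each buffer).
fn gemm_col_major(
    m: usize, n: usize, k: usize,
    a: &[f32], lda: usize,
    b: &[f32], ldb: usize,
    c: &mut [f32], ldc: usize,
) {
    for i in 0..m {
        for j in 0..n {
            let mut acc = 0.0;
            for p in 0..k {
                acc += a[i + p * lda] * b[p + j * ldb];
            }
            c[i + j * ldc] = acc;
        }
    }
}

fn main() {
    // Row-major A (2 x 3) and B (3 x 2); we want row-major C (2 x 2) = A * B.
    let a = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0];
    let b = [7.0, 8.0, 9.0, 10.0, 11.0, 12.0];
    let (m, n, k) = (2usize, 2usize, 3usize);
    let mut c = vec![0.0f32; m * n];

    // The same swap the patch performs when calling cuBLAS: pass B before A and swap
    // m and n. The column-major routine then sees Bᵀ (n x k) times Aᵀ (k x m) and writes
    // Cᵀ in column-major order, which is byte-for-byte the row-major C buffer we want.
    gemm_col_major(n, m, k, &b, n, &a, k, &mut c, n);

    assert_eq!(c, vec![58.0, 64.0, 139.0, 154.0]);
    println!("row-major C = {:?}", c);
}

Running it prints the row-major C buffer [58, 64, 139, 154], exactly what a row-major A*B produces,
even though the inner routine only understands column-major storage.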
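A second sketch, also not part of the patch, shows why the new bias-gradient GEMM in linear.rs is
equivalent to summing output_gradients[0] over the batch dimension: multiplying a 1 x batch_size row
of ones by the batch_size x output_size gradient matrix yields the per-column sums. The matmul helper
and the example numbers are assumptions for illustration only.

// bias_gradient_sketch.rs -- illustrative only; assumed names, not juice API.

/// Naive row-major GEMM: c (m x n) = a (m x k) * b (k x n).
fn matmul(a: &[f32], b: &[f32], m: usize, k: usize, n: usize) -> Vec<f32> {
    let mut c = vec![0.0; m * n];
    for i in 0..m {
        for j in 0..n {
            for p in 0..k {
                c[i * n + j] += a[i * k + p] * b[p * n + j];
            }
        }
    }
    c
}

fn main() {
    let batch_size = 3;
    let output_size = 2;
    // Output gradient for a batch of 3 examples with 2 outputs each, row-major.
    let output_gradient = [1.0, 10.0, 2.0, 20.0, 3.0, 30.0];

    // Bias gradient via GEMM, as in the patched linear.rs:
    // (1 x batch_size) row of ones times (batch_size x output_size) gradient.
    let ones_row = vec![1.0f32; batch_size];
    let bias_grad = matmul(&ones_row, &output_gradient, 1, batch_size, output_size);

    // Reference: sum the output gradient over the batch dimension.
    let mut expected = vec![0.0f32; output_size];
    for b in 0..batch_size {
        for o in 0..output_size {
            expected[o] += output_gradient[b * output_size + o];
        }
    }

    assert_eq!(bias_grad, expected); // both are [6.0, 60.0]
    println!("bias gradient = {:?}", bias_grad);
}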