Add softmax layers and convert MNIST example (#184)
* Move Convolution workspace into context

* Formatting fixes

* Fixed unit tests

* Partial implementation of the Convolution layer

* Implement the remaining parts for Convolution layer

* Implement dropout and pooling layers

* Fix CUDA tensor descriptor size error and adjust layer testing infra

* Extended debug output for layers with custom Debug impl

* Changed mnist example to the new architecture

* Plumbed the momentum arg in the mnist example

* Implemented softmax and logsoftmax layers

* Remove unnecessary NLL parameter and fix mnist example

* Fix native backend softmax and logsoftmax grad computation

* Changed slicing syntax in native backend softmax functions

Co-authored-by: Mikhail Balakhno <{ID}+{username}@users.noreply.github.com>
hweom and Mikhail Balakhno authored Jan 9, 2023
1 parent c388ebb commit 1a6a820
Showing 10 changed files with 341 additions and 152 deletions.
108 changes: 86 additions & 22 deletions coaster-nn/src/frameworks/native/helper.rs
@@ -307,17 +307,32 @@ macro_rules! impl_ops_softmax_for {
         x: &SharedTensor<$t>,
         result: &mut SharedTensor<$t>,
     ) -> Result<(), Error> {
+        // Input tensor must have at least 2 dimensions.
+        // First dimension is treated as a batch number.
+        assert!(
+            x.desc().size() > 1,
+            "Input tensor for softmax must have at least 2 dimensions, got {:?}",
+            x.desc()
+        );
+
+        let batch_size = x.desc()[0];
+        let item_size = x.desc().iter().skip(1).fold(1, |acc, v| acc * v);
+
         let xs = read!(x, $t, self);
         let rs = write_only!(result, $t, self);

         map1(xs, rs, |v| v.exp())?;

-        let mut sum: $t = 0.0; // iter_arith is not stable yet
-        for r in &*rs {
-            sum += *r;
-        }
-        for r in rs {
-            *r /= sum;
+        for i in 0..batch_size {
+            let batch_item = &mut rs[i * item_size..][..item_size];
+
+            let mut sum: $t = 0.0; // iter_arith is not stable yet
+            for r in &*batch_item {
+                sum += *r;
+            }
+            for r in &mut *batch_item {
+                *r /= sum;
+            }
         }
         Ok(())
     }
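For reference, the new forward pass distilled into a standalone sketch (illustrative only, not part of the commit): the tensor is treated as a flat buffer of batch_size items of item_size values each, and every item is exponentiated and then normalized on its own.

fn softmax_batched(xs: &[f64], batch_size: usize, item_size: usize) -> Vec<f64> {
    assert_eq!(xs.len(), batch_size * item_size);
    // Phase 1: exponentiate everything, mirroring map1(xs, rs, |v| v.exp()).
    let mut rs: Vec<f64> = xs.iter().map(|v| v.exp()).collect();
    // Phase 2: normalize each batch item independently.
    for i in 0..batch_size {
        let item = &mut rs[i * item_size..][..item_size];
        let sum: f64 = item.iter().sum();
        for r in item {
            *r /= sum;
        }
    }
    rs
}

fn main() {
    // Two batch items of three elements each; each item sums to 1.0.
    let out = softmax_batched(&[1.0, 2.0, 3.0, 1.0, 1.0, 1.0], 2, 3);
    println!("{:?}", out);
}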
@@ -329,16 +344,32 @@
         x_diff: &SharedTensor<$t>,
         result_diff: &mut SharedTensor<$t>,
     ) -> Result<(), Error> {
+        let batch_size = x.desc()[0];
+        let item_size = x.desc().iter().skip(1).fold(1, |acc, v| acc * v);
+
         let xs = read!(x, $t, self);
         let dxs = read!(x_diff, $t, self);
         let drs = write_only!(result_diff, $t, self);

-        let mut dot: $t = 0.0;
-        for (t, dt) in xs.iter().zip(dxs.iter()) {
-            dot += t * dt;
+        for i in 0..batch_size {
+            let batch_item_in = &xs[i * item_size..][..item_size];
+            let batch_item_diff_in = &dxs[i * item_size..][..item_size];
+            let batch_item_out = &mut drs[i * item_size..][..item_size];
+
+            let mut dot: $t = 0.0;
+            for (t, dt) in batch_item_in.iter().zip(batch_item_diff_in.iter()) {
+                dot += t * dt;
+            }
+
+            map2(
+                batch_item_in,
+                batch_item_diff_in,
+                batch_item_out,
+                |t, dt| t * (dt - dot),
+            )?;
         }

-        map2(xs, dxs, drs, |t, dt| t * (dt - dot))
+        Ok(())
     }
 }
 };
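The backward pass implements, per batch item, the standard softmax Jacobian-vector product: with y the forward output (which is what `x` must hold here for the formula to be valid, following the usual backward convention) and dy the upstream gradient, dx_i = y_i * (dy_i - sum_j y_j * dy_j). A minimal single-item sketch, purely for illustration:

fn softmax_grad_item(y: &[f64], dy: &[f64]) -> Vec<f64> {
    // dot = sum_j y_j * dy_j, matching `dot` in the diff above.
    let dot: f64 = y.iter().zip(dy).map(|(t, dt)| t * dt).sum();
    y.iter().zip(dy).map(|(t, dt)| t * (dt - dot)).collect()
}

fn main() {
    let y = [0.09003057, 0.24472847, 0.66524096]; // softmax of [1, 2, 3]
    let dx = softmax_grad_item(&y, &[1.0, 0.0, 0.0]);
    // Entries sum to ~0: softmax outputs always sum to 1, so the gradient
    // lies in the plane orthogonal to (1, 1, ..., 1).
    println!("{:?} sum={}", dx, dx.iter().sum::<f64>());
}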
@@ -354,20 +385,37 @@ macro_rules! impl_ops_log_softmax_for {
         x: &SharedTensor<$t>,
         result: &mut SharedTensor<$t>,
     ) -> Result<(), $crate::co::error::Error> {
+        // Input tensor must have at least 2 dimensions.
+        // First dimension is treated as a batch number.
+        assert!(
+            x.desc().size() > 1,
+            "Input tensor for softmax must have at least 2 dimensions, got {:?}",
+            x.desc()
+        );
+
+        let batch_size = x.desc()[0];
+        let item_size = x.desc().iter().skip(1).fold(1, |acc, v| acc * v);
+
         let xs = read!(x, $t, self);
         let rs = write_only!(result, $t, self);

-        let max_x = xs
-            .iter()
-            .fold(::std::$t::NEG_INFINITY, |acc, &t| acc.max(t));
+        for i in 0..batch_size {
+            let batch_item_in = &xs[i * item_size..][..item_size];
+            let batch_item_out = &mut rs[i * item_size..][..item_size];
+            let max_x = batch_item_in
+                .iter()
+                .fold(::std::$t::NEG_INFINITY, |acc, &t| acc.max(t));

-        let mut logsum: $t = 0.0;
-        for t in xs {
-            logsum += (-(max_x - t)).exp();
+            let mut logsum: $t = 0.0;
+            for t in batch_item_in {
+                logsum += (*t - max_x).exp();
+            }
+            logsum = max_x + logsum.ln();
+
+            map1(batch_item_in, batch_item_out, |t| t - logsum)?;
         }
-        logsum = max_x + logsum.ln();
-
-        map1(xs, rs, |t| t - logsum)
+        Ok(())
     }

     fn log_softmax_grad(
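The forward pass applies the log-sum-exp trick per batch item: logsum = max(x) + ln(sum_i exp(x_i - max(x))), so exp() never overflows, and the result is x_i - logsum. (The rewrite from `(-(max_x - t)).exp()` to `(*t - max_x).exp()` is algebraically identical.) A standalone per-item sketch, assuming f64 inputs:

fn log_softmax_item(xs: &[f64]) -> Vec<f64> {
    // Shift by the max so exp() stays in range; the shift cancels below.
    let max_x = xs.iter().fold(f64::NEG_INFINITY, |acc, &t| acc.max(t));
    let logsum = max_x + xs.iter().map(|t| (t - max_x).exp()).sum::<f64>().ln();
    xs.iter().map(|t| t - logsum).collect()
}

fn main() {
    let out = log_softmax_item(&[1.0, 2.0, 3.0]);
    // exp() of each entry recovers the plain softmax probabilities.
    println!("{:?}", out.iter().map(|v| v.exp()).collect::<Vec<_>>());
}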
@@ -376,15 +424,31 @@
         &self,
         x: &SharedTensor<$t>,
         x_diff: &SharedTensor<$t>,
         result_diff: &mut SharedTensor<$t>,
     ) -> Result<(), $crate::co::error::Error> {
+        let batch_size = x.desc()[0];
+        let item_size = x.desc().iter().skip(1).fold(1, |acc, v| acc * v);
+
         let xs = read!(x, $t, self);
         let dxs = read!(x_diff, $t, self);
         let drs = write_only!(result_diff, $t, self);

-        let mut sum: $t = 0.0;
-        for &grad_val in dxs.iter() {
-            sum += grad_val;
+        for i in 0..batch_size {
+            let batch_item_in = &xs[i * item_size..][..item_size];
+            let batch_item_diff_in = &dxs[i * item_size..][..item_size];
+            let batch_item_out = &mut drs[i * item_size..][..item_size];
+
+            let mut sum: $t = 0.0;
+            for &grad_val in batch_item_diff_in.iter() {
+                sum += grad_val;
+            }
+            map2(
+                batch_item_in,
+                batch_item_diff_in,
+                batch_item_out,
+                |t, dt| dt - t.exp() * sum,
+            )?;
         }
-        map2(xs, dxs, drs, |t, dt| dt - t.exp() * sum)
+
+        Ok(())
     }
 }
 };
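Per batch item this computes dx_i = dy_i - exp(x_i) * sum_j dy_j, where `x` holds the log-softmax output (the use of t.exp() implies exp(x) are the softmax probabilities) and dy is the upstream gradient. An illustrative single-item sketch; the NLL-style dy below is an assumption for the example, not taken from the commit:

fn log_softmax_grad_item(x: &[f64], dy: &[f64]) -> Vec<f64> {
    // sum = sum_j dy_j; for an NLL loss with a one-hot target this is -1.
    let sum: f64 = dy.iter().sum();
    x.iter().zip(dy).map(|(t, dt)| dt - t.exp() * sum).collect()
}

fn main() {
    // x = log_softmax([1, 2, 3]); dy = -one_hot(class 2), as an
    // (unaveraged) NLL loss might emit.
    let x = [-2.40760596, -1.40760596, -0.40760596];
    let dy = [0.0, 0.0, -1.0];
    // Result equals softmax(x) - one_hot: the classic cross-entropy gradient.
    println!("{:?}", log_softmax_grad_item(&x, &dy));
}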
8 changes: 7 additions & 1 deletion coaster-nn/src/plugin.rs
@@ -621,7 +621,10 @@ pub trait Convolution<F>: NN<F> {
 /// Provides the functionality for a Backend to support Softmax operations.
 pub trait Softmax<F>: NN<F> {
     /// Computes a [Softmax][softmax] over the input Tensor `x`.
-    /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function
+    /// [softmax]: https://en.wikipedia.org/wiki/Softmax_function
+    /// The tensor must have more than one dimension: N,D1,..., where the first
+    /// dimension N is interpreted as the batch size. The softmax operation is
+    /// applied independently to each batch item over D1,... .
     ///
     /// Saves the result to `result`.
     fn softmax(
@@ -645,6 +648,9 @@ pub trait Softmax<F>: NN<F> {
 /// Provides the functionality for a Backend to support LogSoftmax operations.
 pub trait LogSoftmax<F>: NN<F> {
     /// Computes a logarithmic softmax over the input Tensor `x`.
+    /// The tensor must have more than one dimension: N,D1,..., where the first
+    /// dimension N is interpreted as the batch size. The LogSoftmax operation is
+    /// applied independently to each batch item over D1,... .
     ///
     /// Saves the result to `result`.
     fn log_softmax(
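The shape contract documented here can be made concrete with a small helper (hypothetical, not part of the crate) that mirrors how the native backend computes batch_size and item_size:

// Hypothetical helper, for illustration only: splits a tensor shape into
// the batch layout the softmax docs above describe.
fn batch_layout(desc: &[usize]) -> (usize, usize) {
    assert!(desc.len() >= 2, "softmax input needs at least 2 dimensions");
    let item_size: usize = desc.iter().skip(1).product();
    (desc[0], item_size)
}

fn main() {
    // A batch of 32 ten-way distributions.
    assert_eq!(batch_layout(&[32, 10]), (32, 10));
    // Trailing dimensions are flattened: one batch item of 4 * 1 * 3 = 12
    // values -- the shape the updated softmax tests below use.
    assert_eq!(batch_layout(&[1, 4, 1, 3]), (1, 12));
}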
2 changes: 1 addition & 1 deletion coaster-nn/src/tests/softmax.rs
@@ -6,7 +6,7 @@ use crate::co::prelude::*;
 use crate::plugin::{LogSoftmax, Softmax};
 use crate::tests::{filled_tensor, tensor_assert_eq, tensor_assert_eq_tensor, Epsilon};

-const DIMS: [usize; 3] = [4, 1, 3];
+const DIMS: [usize; 4] = [1, 4, 1, 3];

 const IN: [f64; 12] = [
     -0.3768541784373798341,