diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index a40989bde88..b909ecbf83e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,6 +1,6 @@
 # Contribution rules

-- While we have not yet documented a strict code style convention, please follow the existing conventions in the relevant file, submodule, module, and project when you add new code or when you extend/fix existing functionality. To ensure some degree of style consistency, use `make lint`. It applies [Google Style](https://google.github.io/styleguide/cppguide.html) rules to the following:
+- The DALI Coding Style Guide can be found [here](STYLE_GUIDE.md). We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html) with a few exceptions and additional guidelines for DALI-specific cases; see the DALI Coding Style Guide for details. When no rule applies, follow the conventions already used in the surrounding code. If there is no precedent in our codebase, we are open to discussion. Ensure that your contribution passes the `make lint` check. It applies rules to the following:
   - class, function/method, and variable/field naming
   - comment style
   - indentation
diff --git a/STYLE_GUIDE.md b/STYLE_GUIDE.md
new file mode 100644
index 00000000000..db3402cdca9
--- /dev/null
+++ b/STYLE_GUIDE.md
@@ -0,0 +1,55 @@
+# DALI Coding Style Guide
+
+This document describes the DALI Coding Style Guide. Rules specified here take precedence
+over the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html), which should
+be followed in the remaining cases.
+
+The code should always pass the current `make lint` check.
+
+## Changes compared to Google C++ Style Guide
+
+The Google C++ Style Guide is the default *style* guide. In places where it limits the use of
+common C++ idioms or language features, we deviate from it, as described below.
+
+### C++ Version
+
+DALI uses the C++14 standard, as this is the most recent version supported by CUDA.
+
+### Line length
+
+We use a line length limit of 100 characters.
+
+### Reference arguments
+
+Parameters may be passed by non-const lvalue reference. The [Google rule](https://google.github.io/styleguide/cppguide.html#Reference_Arguments)
+would require a pointer instead; this loses the semantically valid restriction that a null pointer
+cannot be passed and introduces ugly code like `foo(&bar)` or `(*buf)[i]`.
+
+## DALI specific rules
+
+### DALI Kernels argument order
+
+DALI Kernels follow the argument order Outputs, Inputs, Arguments - where Outputs and Inputs are
+expected to be views to Tensors (TensorLists) and Arguments are other inputs.
+
+The same order should be maintained for Kernel template arguments.
+See [the example](dali/kernels/kernel.h) kernel implementation for details.
+
+The order of the arguments follows memcpy semantics (destination before source).
+
+### Documentation
+
+DALI uses Doxygen for C++ code documentation with Javadoc-styled comments:
+
+```
+/**
+ * ... text ...
+ */
+```
+
+
+## Unspecified cases
+
+When the style is left unspecified, please follow the style most commonly used in the current
+codebase. If there is no precedent in the codebase, we are open to discussion, but we hold the
+final word to avoid endless discussions on the matter.
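To show how several of the guidelines above combine in practice - the Javadoc-style Doxygen comment, the outputs-before-inputs-before-arguments order, and a non-const lvalue reference output parameter - here is a minimal, self-contained sketch. `AddBias` is a hypothetical helper used only for illustration, not an actual DALI kernel:

```
#include <cstddef>
#include <vector>

/**
 * @brief Adds `bias` to every element of `in` and stores the result in `out`.
 *
 * The output comes first, followed by the input and the remaining arguments,
 * mirroring memcpy semantics. The output is taken by non-const lvalue
 * reference rather than by pointer.
 *
 * @param out  - output buffer; resized to match `in`
 * @param in   - input data
 * @param bias - value added to every element
 */
void AddBias(std::vector<float> &out, const std::vector<float> &in, float bias) {
  out.resize(in.size());
  for (std::size_t i = 0; i < in.size(); i++)
    out[i] = in[i] + bias;
}
```

Real DALI kernels take tensor views rather than `std::vector`s; see `dali/kernels/kernel.h` for the actual interface.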
\ No newline at end of file diff --git a/dali/kernels/common/block_setup.h b/dali/kernels/common/block_setup.h index fa274d0fff1..199b607aa0d 100644 --- a/dali/kernels/common/block_setup.h +++ b/dali/kernels/common/block_setup.h @@ -52,15 +52,17 @@ std::enable_if_t<(skip >= 0), TensorShape> skip_dim(const TensorShape &s return shape_cat(shape.template first(), shape.template last()); } -/// @brief A utility for calculating block layout for GPU kernels -/// @tparam _ndim number dimensions to take into account while calculating the layout -/// @tparam _channel_dim dimension in which channels are stored; channel dimension does not -/// participate in layout calculation \n -/// In cases where channel dimension can or should participate in layout -/// calaculation, do not specify channel dimenion and treat it as an -/// additional spatial dimension (e.g. for linear operations in CHW layout)\n -/// -1 indicates there are only spatial dimensions, all of which -/// participate in layout calculation. +/** + * @brief A utility for calculating block layout for GPU kernels + * @tparam _ndim number dimensions to take into account while calculating the layout + * @tparam _channel_dim dimension in which channels are stored; channel dimension does not + * participate in layout calculation \n + * In cases where channel dimension can or should participate in layout + * calaculation, do not specify channel dimenion and treat it as an + * additional spatial dimension (e.g. for linear operations in CHW layout)\n + * -1 indicates there are only spatial dimensions, all of which + * participate in layout calculation. + */ template class BlockSetup { public: diff --git a/dali/kernels/common/scatter_gather.h b/dali/kernels/common/scatter_gather.h index d538510e268..e9c4069d82f 100644 --- a/dali/kernels/common/scatter_gather.h +++ b/dali/kernels/common/scatter_gather.h @@ -35,7 +35,9 @@ struct CopyRange { size_t Coalesce(span ranges); } // namespace detail -/// Implements a device-to-device batch copy of multiple sources to multiple destinations +/** + * Implements a device-to-device batch copy of multiple sources to multiple destinations + */ class DLL_PUBLIC ScatterGatherGPU { public: static constexpr size_t kDefaultBlockSize = 64<<10; @@ -62,7 +64,9 @@ class DLL_PUBLIC ScatterGatherGPU { blocks_.clear(); } - /// @brief Adds one copy to the batch + /** + * @brief Adds one copy to the batch + */ void AddCopy(void *dst, const void *src, size_t size) { if (size > 0) { ranges_.push_back({ @@ -73,11 +77,13 @@ class DLL_PUBLIC ScatterGatherGPU { } } - /// @brief Executes the copies - /// @param stream - the cudaStream on which the copies are scheduled - /// @param reset - if true, calls Reset after processing is over - /// @param useMemcpyOnly - if true, all copies are executed using cudaMemcpy; - /// otherwise a batched kernel is used if there are more than 2 ranges + /** + * @brief Executes the copies + * @param stream - the cudaStream on which the copies are scheduled + * @param reset - if true, calls Reset after processing is over + * @param useMemcpyOnly - if true, all copies are executed using cudaMemcpy; + * otherwise a batched kernel is used if there are more than 2 ranges + */ DLL_PUBLIC void Run(cudaStream_t stream, bool reset = true, bool useMemcpyOnly = false); using CopyRange = detail::CopyRange; @@ -85,16 +91,22 @@ class DLL_PUBLIC ScatterGatherGPU { private: std::vector ranges_; - /// @brief Sorts and merges contiguous ranges + /** + * @brief Sorts and merges contiguous ranges + */ void Coalesce() { 
size_t n = detail::Coalesce(make_span(ranges_.data(), ranges_.size())); ranges_.resize(n); } - /// @brief Divides ranges so they don't exceed `max_block_size_` + /** + * @brief Divides ranges so they don't exceed `max_block_size_` + */ void MakeBlocks(); - /// @brief Reserves GPU memory for the description of the blocks. + /** + * @brief Reserves GPU memory for the description of the blocks. + */ void ReserveGPUBlocks(); size_t max_size_per_block_ = kDefaultBlockSize; diff --git a/dali/kernels/context.h b/dali/kernels/context.h index 23e0feec15c..f7baf5200e5 100644 --- a/dali/kernels/context.h +++ b/dali/kernels/context.h @@ -45,21 +45,29 @@ template std::tuple>*...> ToContiguousGPUMem(Scratchpad &scratchpad, cudaStream_t stream, const Collections &... c); -/// @brief Interface for kernels to obtain auxiliary working memory +/** + * @brief Interface for kernels to obtain auxiliary working memory + */ class Scratchpad { public: - /// @brief Allocates `bytes` bytes of memory in `alloc_type`, with specified `alignment` + /** + * @brief Allocates `bytes` bytes of memory in `alloc_type`, with specified `alignment` + */ virtual void *Alloc(AllocType alloc_type, size_t bytes, size_t alignment) = 0; - /// @brief Allocates storage for a Tensor of elements `T` and given `shape` - /// in the memory of type `alloc_type`. + /** + * @brief Allocates storage for a Tensor of elements `T` and given `shape` + * in the memory of type `alloc_type`. + */ template TensorView, T, dim> AllocTensor(TensorShape shape) { return { Allocate(alloc_type, volume(shape)), std::move(shape) }; } - /// @brief Allocates storage for a TensorList of elements `T` and given `shape` - /// in the memory of type `alloc_type`. + /** + * @brief Allocates storage for a TensorList of elements `T` and given `shape` + * in the memory of type `alloc_type`. + */ template TensorListView, T, dim> AllocTensorList(const std::vector> &shape) { @@ -68,8 +76,10 @@ class Scratchpad { return tlv; } - /// @brief Allocates storage for a TensorList of elements `T` and given `shape` - /// in the memory of type `alloc_type`. + /** + * @brief Allocates storage for a TensorList of elements `T` and given `shape` + * in the memory of type `alloc_type`. + */ template TensorListView, T, dim> AllocTensorList(TensorListShape shape) { @@ -78,8 +88,10 @@ class Scratchpad { return tlv; } - /// @brief Allocates memory suitable for storing `count` items of type `T` in the - /// memory of type `alloc_type`. + /** + * @brief Allocates memory suitable for storing `count` items of type `T` in the + * memory of type `alloc_type`. + */ template T *Allocate(AllocType alloc_type, size_t count, size_t alignment = alignof(T)) { return reinterpret_cast(Alloc(alloc_type, count*sizeof(T), alignment)); @@ -138,7 +150,9 @@ struct KernelContext { CPUContext cpu; GPUContext gpu; - /// @brief Caller-provided allocator for temporary data. + /** + * @brief Caller-provided allocator for temporary data. 
+ */ Scratchpad *scratchpad; }; diff --git a/dali/kernels/imgproc/resample/bilinear_impl.cuh b/dali/kernels/imgproc/resample/bilinear_impl.cuh index da37d0f3904..deb1b8fa48b 100644 --- a/dali/kernels/imgproc/resample/bilinear_impl.cuh +++ b/dali/kernels/imgproc/resample/bilinear_impl.cuh @@ -27,11 +27,13 @@ namespace kernels { namespace { -/// @brief Implements horizontal resampling for a custom ROI -/// @param x0 - start column, in output coordinates -/// @param x1 - end column (exclusive), in output coordinates -/// @param y0 - start row -/// @param y1 - end row (exclusive) +/** + * @brief Implements horizontal resampling for a custom ROI + * @param x0 - start column, in output coordinates + * @param x1 - end column (exclusive), in output coordinates + * @param y0 - start row + * @param y1 - end row (exclusive) + */ template __device__ void LinearHorz_Channels( int x0, int x1, int y0, int y1, @@ -94,11 +96,13 @@ __device__ void LinearHorz( )); // NOLINT } -/// @brief Implements vertical resampling for a custom ROI -/// @param x0 - start column, in output coordinates -/// @param x1 - end column (exclusive), in output coordinates -/// @param y0 - start row -/// @param y1 - end row (exclusive) +/** + * @brief Implements vertical resampling for a custom ROI + * @param x0 - start column, in output coordinates + * @param x1 - end column (exclusive), in output coordinates + * @param y0 - start row + * @param y1 - end row (exclusive) + */ template __device__ void LinearVert( int x0, int x1, int y0, int y1, diff --git a/dali/kernels/imgproc/resample/params.h b/dali/kernels/imgproc/resample/params.h index 20737854e39..45b30e768ca 100644 --- a/dali/kernels/imgproc/resample/params.h +++ b/dali/kernels/imgproc/resample/params.h @@ -60,12 +60,16 @@ struct FilterDesc { float radius = 0; }; -/// @brief Resampling parameters for 1 dimension +/** + * @brief Resampling parameters for 1 dimension + */ struct ResamplingParams { FilterDesc min_filter, mag_filter; int output_size = KeepOriginalSize; - /// @brief 1D region of interest + /** + * @brief 1D region of interest + */ struct ROI { ROI() = default; ROI(float start, float end) : use_roi(true), start(start), end(end) {} diff --git a/dali/kernels/imgproc/resample/resampling_impl.cuh b/dali/kernels/imgproc/resample/resampling_impl.cuh index 4801e1a8ba6..9be22522a7e 100644 --- a/dali/kernels/imgproc/resample/resampling_impl.cuh +++ b/dali/kernels/imgproc/resample/resampling_impl.cuh @@ -31,25 +31,27 @@ namespace resample_shared { extern __shared__ float coeffs[]; }; -/// @brief Implements horizontal resampling for a custom ROI -/// @param x0 - start column, in output coordinates -/// @param x1 - end column (exclusive), in output coordinates -/// @param y0 - start row -/// @param y1 - end row (exclusive) -/// @tparam static_channels - number of channels, if known at compile time -/// -/// The function fills the output in block-sized vertical spans. -/// Block horizontal size is warp-aligned. -/// Filter coefficients are pre-calculated for each vertical span to avoid -/// recalculating them for each row, and stored in a shared memory block. -/// -/// The function follows different code paths for static and dynamic number of channels. -/// For the dynamic, the innermost loop goes over filter taps, which eliminates the need -/// for thread-local memory to store intermediate sums. This allows processing arbitrary -/// number of channels. 
-/// For static number of channels, the run-time parameter `channels` is ignored and -/// there's also a local temporary storage for a tap sum for each channel. This is faster, -/// but requires extra registers for the intermediate sums. +/** + * @brief Implements horizontal resampling for a custom ROI + * @param x0 - start column, in output coordinates + * @param x1 - end column (exclusive), in output coordinates + * @param y0 - start row + * @param y1 - end row (exclusive) + * @tparam static_channels - number of channels, if known at compile time + * + * The function fills the output in block-sized vertical spans. + * Block horizontal size is warp-aligned. + * Filter coefficients are pre-calculated for each vertical span to avoid + * recalculating them for each row, and stored in a shared memory block. + * + * The function follows different code paths for static and dynamic number of channels. + * For the dynamic, the innermost loop goes over filter taps, which eliminates the need + * for thread-local memory to store intermediate sums. This allows processing arbitrary + * number of channels. + * For static number of channels, the run-time parameter `channels` is ignored and + * there's also a local temporary storage for a tap sum for each channel. This is faster, + * but requires extra registers for the intermediate sums. + */ template __device__ void ResampleHorz_Channels( int x0, int x1, int y0, int y1, @@ -140,16 +142,18 @@ __device__ void ResampleHorz_Channels( } } -/// @brief Implements vertical resampling for a custom ROI -/// @param x0 - start column, in output coordinates -/// @param x1 - end column (exclusive), in output coordinates -/// @param y0 - start row -/// @param y1 - end row (exclusive) -/// @tparam static_channels - number of channels, if known at compile time -/// -/// The function fills the output in block-sized horizontal spans. -/// Filter coefficients are pre-calculated for each horizontal span to avoid -/// recalculating them for each column, and stored in a shared memory block. +/** + * @brief Implements vertical resampling for a custom ROI + * @param x0 - start column, in output coordinates + * @param x1 - end column (exclusive), in output coordinates + * @param y0 - start row + * @param y1 - end row (exclusive) + * @tparam static_channels - number of channels, if known at compile time + * + * The function fills the output in block-sized horizontal spans. + * Filter coefficients are pre-calculated for each horizontal span to avoid + * recalculating them for each column, and stored in a shared memory block. + */ template __device__ void ResampleVert_Channels( int x0, int x1, int y0, int y1, diff --git a/dali/kernels/imgproc/resample/resampling_impl_cpu.h b/dali/kernels/imgproc/resample/resampling_impl_cpu.h index ba952279b36..26574234583 100644 --- a/dali/kernels/imgproc/resample/resampling_impl_cpu.h +++ b/dali/kernels/imgproc/resample/resampling_impl_cpu.h @@ -173,15 +173,17 @@ inline void ResampleAxis(Surface2D out, Surface2D in, assert(!"Invalid axis index"); } -/// @brief Resamples `in` using Nearest Neighbor interpolation and stores result in `out` -/// @param out - output surface -/// @param in - input surface -/// @param src_x0 - starting X coordinate of input -/// @param src_y0 - starting Y coordinate of input -/// @param scale_x - step of X input coordinate taken for each output pixel -/// @param scale_y - step of Y input coordinate taken for each output row -/// @remarks The function clamps input coordinates to fit in range defined by `in` dimensions. 
-/// Scales can be negative to achieve flipping. +/** + * @brief Resamples `in` using Nearest Neighbor interpolation and stores result in `out` + * @param out - output surface + * @param in - input surface + * @param src_x0 - starting X coordinate of input + * @param src_y0 - starting Y coordinate of input + * @param scale_x - step of X input coordinate taken for each output pixel + * @param scale_y - step of Y input coordinate taken for each output row + * @remarks The function clamps input coordinates to fit in range defined by `in` dimensions. + * Scales can be negative to achieve flipping. + */ template void ResampleNN(Surface2D out, Surface2D in, float src_x0, float src_y0, float scale_x, float scale_y) { diff --git a/dali/kernels/imgproc/resample/resampling_setup.h b/dali/kernels/imgproc/resample/resampling_setup.h index 01d16c1f7e4..f3bfa7432bf 100644 --- a/dali/kernels/imgproc/resample/resampling_setup.h +++ b/dali/kernels/imgproc/resample/resampling_setup.h @@ -25,14 +25,18 @@ namespace dali { namespace kernels { -/// @brief Maps a block (by blockIdx) to a sample. +/** + * @brief Maps a block (by blockIdx) to a sample. + */ struct SampleBlockInfo { int sample, block_in_sample; }; ResamplingFilter GetResamplingFilter(const ResamplingFilters *filters, const FilterDesc ¶ms); -/// @brief Builds and maintains resampling setup +/** + * @brief Builds and maintains resampling setup + */ class SeparableResamplingSetup { public: enum ProcessingOrder : int { @@ -40,8 +44,10 @@ class SeparableResamplingSetup { VertHorz }; - /// Number of blocks per pass may differ depending on - /// the image aspect ratio and block aspect ratio. + /** + * Number of blocks per pass may differ depending on + * the image aspect ratio and block aspect ratio. + */ struct BlockCount { int pass[2]; }; diff --git a/dali/kernels/imgproc/resample/separable.h b/dali/kernels/imgproc/resample/separable.h index efa5267a32a..dfbe6c1c43c 100644 --- a/dali/kernels/imgproc/resample/separable.h +++ b/dali/kernels/imgproc/resample/separable.h @@ -23,7 +23,9 @@ namespace dali { namespace kernels { -/// @brief Defines an interface of a separable resampling filter +/** + * @brief Defines an interface of a separable resampling filter + */ template struct SeparableResamplingFilter { using Input = InListGPU; diff --git a/dali/kernels/imgproc/resample/separable_impl.h b/dali/kernels/imgproc/resample/separable_impl.h index d54f1c851a7..c83da17b175 100644 --- a/dali/kernels/imgproc/resample/separable_impl.h +++ b/dali/kernels/imgproc/resample/separable_impl.h @@ -24,12 +24,14 @@ namespace dali { namespace kernels { -/// @brief Implements a separable resampling filter -/// -/// This implementation can apply differnt resampling filters to each sample. -/// Resampling order is chosen based on input/output shapes and filter type and support. -/// The filter allocates memory only in `Setup` - and even there, it won't reallocate -/// if subsequent calls do not exceed previous number of samples. +/** + * @brief Implements a separable resampling filter + * + * This implementation can apply differnt resampling filters to each sample. + * Resampling order is chosen based on input/output shapes and filter type and support. + * The filter allocates memory only in `Setup` - and even there, it won't reallocate + * if subsequent calls do not exceed previous number of samples. 
+ */ template > struct SeparableResamplingGPUImpl : Interface { @@ -38,13 +40,17 @@ struct SeparableResamplingGPUImpl : Interface { using typename Interface::Output; using SampleDesc = SeparableResamplingSetup::SampleDesc; - /// Generates and stores resampling setup + /** + * Generates and stores resampling setup + */ BatchResamplingSetup setup; using IntermediateElement = float; using Intermediate = OutListGPU; - /// The intermediate tensor list + /** + * The intermediate tensor list + */ Intermediate intermediate; void Initialize(KernelContext &context) { @@ -88,8 +94,10 @@ struct SeparableResamplingGPUImpl : Interface { stream); } - /// @remarks This function shall not allocate memory by ano other means - /// than through `context.scratchpad` + /** + * @remarks This function shall not allocate memory by ano other means + * than through `context.scratchpad` + */ virtual void Run(KernelContext &context, const Output &out, const Input &in, const Params ¶ms) { cudaStream_t stream = context.gpu.stream; diff --git a/dali/kernels/imgproc/surface.h b/dali/kernels/imgproc/surface.h index f9345ff5191..80a570503ed 100644 --- a/dali/kernels/imgproc/surface.h +++ b/dali/kernels/imgproc/surface.h @@ -29,22 +29,26 @@ struct Surface2D { return data[y * row_stride + x * pixel_stride + c * channel_stride]; } - /// @brief Provides implicit _reference_ cast to surface of type const T, - /// if T is not already const - /// - /// @remarks The template magic is a workaround to avoid conversion to self - /// when T is already const + /** + * @brief Provides implicit _reference_ cast to surface of type const T, + * if T is not already const + * + * @remarks The template magic is a workaround to avoid conversion to self + * when T is already const + */ template ::value, const U>> __host__ __device__ operator Surface2D&() { return *reinterpret_cast*>(this); } - /// @brief Provides implicit _reference_ cast to surface of type const T, - /// if T is not already const - /// - /// @remarks The template magic is a workaround to avoid conversion to self - /// when T is already const + /** + * @brief Provides implicit _reference_ cast to surface of type const T, + * if T is not already const + * + * @remarks The template magic is a workaround to avoid conversion to self + * when T is already const + */ template ::value, const U>> __host__ __device__ constexpr operator const Surface2D&() const { diff --git a/dali/kernels/imgproc/warp/mapping_traits.h b/dali/kernels/imgproc/warp/mapping_traits.h index 1df3b8b6eb3..fd4d4730a97 100644 --- a/dali/kernels/imgproc/warp/mapping_traits.h +++ b/dali/kernels/imgproc/warp/mapping_traits.h @@ -39,11 +39,13 @@ struct mapping_params { using type = Mapping; }; -/// @brief This type is passed to the Warp kernel to construct the mapping object -/// -/// The Mapping object can be transient and/or contain additional state. -/// mapping_params_t type is used to distinguish between mapping object and -/// the parameters needed to construct one - by default they are the same type. +/** + * @brief This type is passed to the Warp kernel to construct the mapping object + * + * The Mapping object can be transient and/or contain additional state. + * mapping_params_t type is used to distinguish between mapping object and + * the parameters needed to construct one - by default they are the same type. 
+ */ template using mapping_params_t = typename mapping_params::type; diff --git a/dali/kernels/imgproc/warp_gpu.h b/dali/kernels/imgproc/warp_gpu.h index 195a16e0a7a..db9f8ed2703 100644 --- a/dali/kernels/imgproc/warp_gpu.h +++ b/dali/kernels/imgproc/warp_gpu.h @@ -26,7 +26,9 @@ namespace dali { namespace kernels { -/// @remarks Assume HWC layout +/** + * @remarks Assume HWC layout + */ template class WarpGPU { diff --git a/dali/kernels/kernel.h b/dali/kernels/kernel.h index 38392340f63..d5f64e89087 100644 --- a/dali/kernels/kernel.h +++ b/dali/kernels/kernel.h @@ -27,64 +27,72 @@ namespace dali { -/// @brief Defines the DALI kernel API. See dali::kernels::examples::Kernel for details +/** + * @brief Defines the DALI kernel API. See dali::kernels::examples::Kernel for details + */ namespace kernels { namespace examples { -/// @brief DALI Kernel example -/// -/// This class represents a "concept" of a DALI kernel. -/// A kernel must provide two non-overloaded functions: -/// Run and Setup. -/// -/// Run and Setup functions are expected to accept arguments in strictly specified order: -/// Setup(KernelContext, [inputs], [arguments]) -/// Run(KernelContext, [outputs], [inputs], [arguments]) -/// Additionally, both of these functions accept the same sets of inputs and arguments. -/// -/// The kernel can be run directly or its inputs, outputs and arguments can be tied -/// into tuples and then the kernel be configured and launched using: -/// -/// `dali::kernels::kernel::Setup` -/// -/// `dali::kernels::kernel::Run` -/// -/// Programmer can check whether their type satisfies conditions for being a kernel -/// through instantiating check_kernel. If the type does not meet requirements, -/// static_asserts should produce meaningful diagnostics that will help to rectify the problem. -template +/** + * @brief DALI Kernel example + * + * This class represents a "concept" of a DALI kernel. + * A kernel must provide two non-overloaded functions: + * Run and Setup. + * + * Run and Setup functions are expected to accept arguments in strictly specified order: + * Setup(KernelContext, [inputs], [arguments]) + * Run(KernelContext, [outputs], [inputs], [arguments]) + * Additionally, both of these functions accept the same sets of inputs and arguments. + * + * The kernel can be run directly or its inputs, outputs and arguments can be tied + * into tuples and then the kernel be configured and launched using: + * + * `dali::kernels::kernel::Setup` + * + * `dali::kernels::kernel::Run` + * + * Programmer can check whether their type satisfies conditions for being a kernel + * through instantiating check_kernel. If the type does not meet requirements, + * static_asserts should produce meaningful diagnostics that will help to rectify the problem. + */ +template struct Kernel { - /// @brief Returns kernel output(s) shape(s) and additional memory requirements - /// - /// Setup receives full input tensor lists and any extra arguments that - /// are going to be passed to a subsequent call to Run. - /// - /// @remarks The inputs are provided mainly to inspect their shapes; actually looking at the - /// data may degrade performance severely. - /// - /// @param context - environment of the kernel;, cuda stream, batch info, etc. - /// At the time of call to Setup, its scratch area is undefined. - /// - /// @param in1 - example input, consisting of a list of 3D tensors with element type Input1 - /// @param in2 - example input, consisting of a 4D tensor with element type Input2 - /// @param aux - some extra parameters (e.g. 
convolution kernel, mask) + /** + * @brief Returns kernel output(s) shape(s) and additional memory requirements + * + * Setup receives full input tensor lists and any extra arguments that + * are going to be passed to a subsequent call to Run. + * + * @remarks The inputs are provided mainly to inspect their shapes; actually looking at the + * data may degrade performance severely. + * + * @param context - environment of the kernel;, cuda stream, batch info, etc. + * At the time of call to Setup, its scratch area is undefined. + * + * @param in1 - example input, consisting of a list of 3D tensors with element type Input1 + * @param in2 - example input, consisting of a 4D tensor with element type Input2 + * @param aux - some extra parameters (e.g. convolution kernel, mask) + */ KernelRequirements Setup( KernelContext &context, const InListGPU &in1, const InTensorGPU &in2, const std::vector &aux); - /// @brief Runs the kernel - /// - /// Run processes the inputs and populates the pre-allocated output. Output shape is expected - /// to match that returned by Setup. - /// - /// @param context - environment; provides scratch memory, cuda stream, batch info, etc. - /// Scratch area must satisfy requirements returned by Setup. - /// @param in1 - example input, consisting of a list of 3D tensors with element type Input1 - /// @param in2 - example input, consisting of a 4D tensor with element type Input2 - /// @param aux - some extra parameters (e.g. convolution kernel, mask) + /** + * @brief Runs the kernel + * + * Run processes the inputs and populates the pre-allocated output. Output shape is expected + * to match that returned by Setup. + * + * @param context - environment; provides scratch memory, cuda stream, batch info, etc. + * Scratch area must satisfy requirements returned by Setup. + * @param in1 - example input, consisting of a list of 3D tensors with element type Input1 + * @param in2 - example input, consisting of a 4D tensor with element type Input2 + * @param aux - some extra parameters (e.g. convolution kernel, mask) + */ void Run( KernelContext &context, const OutListGPU &out, @@ -95,7 +103,9 @@ struct Kernel { } // namespace examples -/// @brief A collection of pseudo-methods to operate on Kernel classes/objects +/** + * @brief A collection of pseudo-methods to operate on Kernel classes/objects + */ namespace kernel { // avoid retyping "Kernel" every second word... 
@@ -112,10 +122,12 @@ using args = kernel_args; using Context = KernelContext; using Requirements = KernelRequirements; -/// @brief Gets requirements for given Kernel -/// @param context - execution environment (without scratch memory) -/// @param input - kernel inputs, convertible to kernel_inputs -/// @param args - kernel extra arguments, convertible to kernel_args +/** + * @brief Gets requirements for given Kernel + * @param context - execution environment (without scratch memory) + * @param input - kernel inputs, convertible to kernel_inputs + * @param args - kernel extra arguments, convertible to kernel_args + */ template Requirements Setup( Kernel &instance, @@ -126,11 +138,13 @@ Requirements Setup( return apply_all(std::mem_fn(&Kernel::Setup), instance, context, input, args); } -/// @brief Executes a Kernel on an input set -/// @param context - execution environment (with scratch memory) -/// @param input - kernel inputs, convertible to kernel_inputs -/// @param outputs - kernel outputs, convertible to kernel_outputs -/// @param args - kernel extra arguments, convertible to kernel_args +/** + * @brief Executes a Kernel on an input set + * @param context - execution environment (with scratch memory) + * @param input - kernel inputs, convertible to kernel_inputs + * @param outputs - kernel outputs, convertible to kernel_outputs + * @param args - kernel extra arguments, convertible to kernel_args + */ template void Run( Kernel &instance, diff --git a/dali/kernels/kernel_manager.h b/dali/kernels/kernel_manager.h index 9669d2fc6a6..c84698efcdb 100644 --- a/dali/kernels/kernel_manager.h +++ b/dali/kernels/kernel_manager.h @@ -71,81 +71,99 @@ struct AnyKernelInstance { explicit operator bool() const noexcept { return static_cast(instance); } }; -/// @brief Manages multiple instances of run-time typed kernels -/// -/// KernelManager provides type erasure for kernels whose type is selected at -/// run-time. Kernel manager also carries out mundane tasks of keeping -/// ScratchpadAllocators and reserving memory according to requirements returned -/// by kernel's Setup method. -/// -/// A scratchpad allocator is created per-thread with thread indexing supported -/// explicitly by the caller. +/** + * @brief Manages multiple instances of run-time typed kernels + * + * KernelManager provides type erasure for kernels whose type is selected at + * run-time. Kernel manager also carries out mundane tasks of keeping + * ScratchpadAllocators and reserving memory according to requirements returned + * by kernel's Setup method. + * + * A scratchpad allocator is created per-thread with thread indexing supported + * explicitly by the caller. 
+ */ class DLL_PUBLIC KernelManager { public: static constexpr size_t NumAllocTypes = ScratchpadAllocator::NumAllocTypes; using ScratchSizes = std::array; - /// @brief Creates `num_threads` scratcapads and `num_instances` slots for kernels - /// - /// @param num_threads - number of threads that can concurrently use the kernels in the - /// manager, assuming that each threads uses its unique - /// zero-based inde - /// @param num_instances - number of Kernel instances to be created; typically corresponds - /// to number of samples (for per-sample kernels) or minibatches + /** + * @brief Creates `num_threads` scratcapads and `num_instances` slots for kernels + * + * @param num_threads - number of threads that can concurrently use the kernels in the + * manager, assuming that each threads uses its unique + * zero-based inde + * @param num_instances - number of Kernel instances to be created; typically corresponds + * to number of samples (for per-sample kernels) or minibatches + */ void Resize(size_t num_threads, size_t num_instances); - /// @brief Creates `num_threads` scratcapads and `num_instances` kernels of type Kernel - /// constructed with `args...`. - /// - /// @param num_threads - number of threads that can concurrently use the kernels in the - /// manager, assuming that each threads uses its unique - /// zero-based inde - /// @param num_instances - number of Kernel instances to be created; typically corresponds - /// to number of samples (for per-sample kernels) or minibatches - /// @param args - arguments passed to Kernel's constructor upon creation. - /// @tparam Kernel - type of the kernel to be created + /** + * @brief Creates `num_threads` scratcapads and `num_instances` kernels of type Kernel + * constructed with `args...`. + * + * @param num_threads - number of threads that can concurrently use the kernels in the + * manager, assuming that each threads uses its unique + * zero-based inde + * @param num_instances - number of Kernel instances to be created; typically corresponds + * to number of samples (for per-sample kernels) or minibatches + * @param args - arguments passed to Kernel's constructor upon creation. + * @tparam Kernel - type of the kernel to be created + */ template void Resize(size_t num_threads, size_t num_instances, const Args&... args) { Resize(num_threads, num_instances); Initialize(args...); } - /// @brief Populates the instance slots with instances of a given Kernel - /// - /// @param args - arguments passed to Kernel's constructor upon creation. - /// @tparam Kernel - type of the kernel to be created + /** + * @brief Populates the instance slots with instances of a given Kernel + * + * @param args - arguments passed to Kernel's constructor upon creation. + * @tparam Kernel - type of the kernel to be created + */ template void Initialize(const Args&... args) { for (size_t i = 0; i < NumInstances(); i++) CreateOrGet(i, args...); } - /// @brief Clears kernel instances and scratchpads + /** + * @brief Clears kernel instances and scratchpads + */ void Reset(); - /// @brief Gets or creates a Kernel instance + /** + * @brief Gets or creates a Kernel instance + */ template Kernel &CreateOrGet(int instance_idx, ConstructorArgs &&...args) { return instances[instance_idx].create_or_get(std::forward(args)...); } - /// @brief Gets a Kernel instance - /// - /// If there's no instance for a given index of the type is different, - /// `std::logic_error` is thrown. 
- /// @return A reference to a kernel instance at given index + /** + * @brief Gets a Kernel instance + * + * If there's no instance for a given index of the type is different, + * `std::logic_error` is thrown. + * @return A reference to a kernel instance at given index + */ template Kernel &Get(int instance_idx) { return instances[instance_idx].get(); } - /// @brief Gets a reference to an internally maintained copy of KernelRequirements + /** + * @brief Gets a reference to an internally maintained copy of KernelRequirements + */ KernelRequirements &GetRequirements(int instance_idx) noexcept { return instances[instance_idx].requirements; } - /// @brief Gets a const-reference to an internally maintained copy of KernelRequirements + /** + * @brief Gets a const-reference to an internally maintained copy of KernelRequirements + */ const KernelRequirements &GetRequirements(int instance_idx) const noexcept { return instances[instance_idx].requirements; } @@ -153,23 +171,27 @@ class DLL_PUBLIC KernelManager { size_t NumInstances() const noexcept { return instances.size(); } size_t NumThreads() const noexcept { return scratchpads.size(); } - /// @brief Gets a scratchpad allocator assigned to a given thread. + /** + * @brief Gets a scratchpad allocator assigned to a given thread. + */ ScratchpadAllocator &GetScratchpadAllocator(int thread_idx) { return scratchpads[thread_idx]; } - /// @brief Calls setup on specified kernel instance. - /// - /// @param instance_idx - kernel instance index; typically corresponds - /// to sample index (for per-sample kernels) or minibatch index - /// @param context - context for the kernel - /// * should contain valid CUDA stream for GPU kernels; - /// @param in_args - pack of arguments (inputs, arguments) used in Kernel::Setup - /// @return Reference to internally maintained copy of the kernel requirements. - /// @remarks The copies of KernelRequirements for each instance index are used for allocating - /// scratch memory. While the function returns non-const reference, please note - /// that decreasing scratch sizes calculated by Setup will result in undefined - /// behavior, including memory corruption or illegal access. + /** + * @brief Calls setup on specified kernel instance. + * + * @param instance_idx - kernel instance index; typically corresponds + * to sample index (for per-sample kernels) or minibatch index + * @param context - context for the kernel + * * should contain valid CUDA stream for GPU kernels; + * @param in_args - pack of arguments (inputs, arguments) used in Kernel::Setup + * @return Reference to internally maintained copy of the kernel requirements. + * @remarks The copies of KernelRequirements for each instance index are used for allocating + * scratch memory. While the function returns non-const reference, please note + * that decreasing scratch sizes calculated by Setup will result in undefined + * behavior, including memory corruption or illegal access. + */ template KernelRequirements &Setup(int instance_idx, KernelContext &context, InArgs &&...in_args) { auto &inst = instances[instance_idx]; @@ -180,16 +202,18 @@ class DLL_PUBLIC KernelManager { return inst.requirements; } - /// @brief Calls Run on specified kernel instance using Scratchpad for given thread. 
- /// - /// @param thread_idx - zero-based thread index - /// @param instance_idx - kernel instance index; typically corresponds - /// to sample index (for per-sample kernels) or minibatch index - /// @param context - context for the kernel - /// * should contain valid CUDA stream for GPU kernels; - /// * scratchpad pointer is overriden with a scratchpad - /// created for given thread_idx - /// @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run + /** + * @brief Calls Run on specified kernel instance using Scratchpad for given thread. + * + * @param thread_idx - zero-based thread index + * @param instance_idx - kernel instance index; typically corresponds + * to sample index (for per-sample kernels) or minibatch index + * @param context - context for the kernel + * * should contain valid CUDA stream for GPU kernels; + * * scratchpad pointer is overriden with a scratchpad + * created for given thread_idx + * @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run + */ template void Run(int thread_idx, int instance_idx, KernelContext &context, OutInArgs &&...out_in_args) { assert(static_cast(thread_idx) < scratchpads.size()); @@ -197,17 +221,19 @@ class DLL_PUBLIC KernelManager { Run(sa, instance_idx, context, std::forward(out_in_args)...); } - /// @brief Calls Run on specified kernel instance using Scratchpad for given thread. - /// - /// @param sa - scratchpad allocator; memory will be reserved in it to satisfy - /// instance's requirements - /// @param instance_idx - kernel instance index; typically corresponds - /// to sample index (for per-sample kernels) or minibatch index - /// @param context - context for the kernel - /// * should contain valid CUDA stream for GPU kernels; - /// * scratchpad pointer is overriden with a scratchpad - /// created from `sa` - /// @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run + /** + * @brief Calls Run on specified kernel instance using Scratchpad for given thread. + * + * @param sa - scratchpad allocator; memory will be reserved in it to satisfy + * instance's requirements + * @param instance_idx - kernel instance index; typically corresponds + * to sample index (for per-sample kernels) or minibatch index + * @param context - context for the kernel + * * should contain valid CUDA stream for GPU kernels; + * * scratchpad pointer is overriden with a scratchpad + * created from `sa` + * @param out_in_args - pack of arguments (outputs, inputs, arguments) used in Kernel::Run + */ template void Run(ScratchpadAllocator &sa, int instance_idx, @@ -224,23 +250,29 @@ class DLL_PUBLIC KernelManager { context.scratchpad = old_scratchpad; } - /// @brief Makes sure ScratchpadAllocator can accommodate `sizes` - /// - /// @param sa - scratchpad allocator to reserve - /// @param sizes - requested minimum size - /// - /// The manager maintains a lifetime maximum of sizes requested. - /// If reallocation is necessary, it allocates `sizes` or that maximum - /// whichever is larger. + /** + * @brief Makes sure ScratchpadAllocator can accommodate `sizes` + * + * @param sa - scratchpad allocator to reserve + * @param sizes - requested minimum size + * + * The manager maintains a lifetime maximum of sizes requested. + * If reallocation is necessary, it allocates `sizes` or that maximum + * whichever is larger. 
+ */ auto ReserveScratchpad(ScratchpadAllocator &sa, const ScratchSizes &sizes)-> decltype(sa.GetScratchpad()); - /// @brief Calls ReserveScratchpad on ScratchpadAllocator associated with given thread_idx + /** + * @brief Calls ReserveScratchpad on ScratchpadAllocator associated with given thread_idx + */ inline auto ReserveScratchpad(int thread_idx, const ScratchSizes &sizes) { return ReserveScratchpad(GetScratchpadAllocator(thread_idx), sizes); } - /// @brief Returns maximum scratchpad size seen so far + /** + * @brief Returns maximum scratchpad size seen so far + */ inline ScratchSizes MaxScratchSizes() const { ScratchSizes sizes; for (size_t i = 0; i < sizes.size(); i++) { @@ -249,15 +281,19 @@ class DLL_PUBLIC KernelManager { return sizes; } - /// @brief Reserves scratchpad big enough to accommodate largest scratch area ever seen + /** + * @brief Reserves scratchpad big enough to accommodate largest scratch area ever seen + */ inline auto ReserveMaxScratchpad(int thread_idx) { return ReserveScratchpad(thread_idx, MaxScratchSizes()); } - /// @brief Sets a memory size hint for allocating scratchpad memory - /// - /// All calls to ScratchpadAllocator::Reserve followint this call will request at least - /// bytes memory for given allocation type. + /** + * @brief Sets a memory size hint for allocating scratchpad memory + * + * All calls to ScratchpadAllocator::Reserve followint this call will request at least + * bytes memory for given allocation type. + */ void SetMemoryHint(AllocType type, size_t bytes) { int alloc_idx = static_cast(type); atomic_max(max_scratch_sizes[alloc_idx], bytes); diff --git a/dali/kernels/kernel_req.h b/dali/kernels/kernel_req.h index 663c5cab917..e2dd038e2a1 100644 --- a/dali/kernels/kernel_req.h +++ b/dali/kernels/kernel_req.h @@ -25,16 +25,20 @@ namespace dali { namespace kernels { -/// @brief Represents requirements for kernel to do its job for given inputs and arguments. +/** + * @brief Represents requirements for kernel to do its job for given inputs and arguments. + */ struct KernelRequirements { std::vector> output_shapes; std::array scratch_sizes = {}; - /// @param reuse_scratch - if true, scratch size is taken to be maximum from that for - /// all input sets, otherwise it's the sum - /// @param new_req - requirements for the new input set, to be merged with this one - /// @return - *this, for chaining + /** + * @param reuse_scratch - if true, scratch size is taken to be maximum from that for + * all input sets, otherwise it's the sum + * @param new_req - requirements for the new input set, to be merged with this one + * @return - *this, for chaining + */ KernelRequirements &AddInputSet(const KernelRequirements &new_req, bool reuse_scratch) { auto &r = new_req; @@ -50,19 +54,23 @@ struct KernelRequirements { } }; -/// @brief A utility class for adding scratchpad requirements with proper alignment, -/// assuming bump allocation. +/** + * @brief A utility class for adding scratchpad requirements with proper alignment, + * assuming bump allocation. + */ struct ScratchpadEstimator { - /// @brief Adds a new memory requirement for count instances of T - /// - /// The method includes padding, assuming the add function is called in order of allocations. - /// The resulting allocation size is equal to size of a structure which contains all allocated - /// objects assuming natural alignment. - /// The estimator assumes that scratch buffer implementation will provide memory block based at - /// largest possible alignment boundary. 
- /// - /// @return Total number of bytes required for given allocation method, - /// including this allocation. + /** + * @brief Adds a new memory requirement for count instances of T + * + * The method includes padding, assuming the add function is called in order of allocations. + * The resulting allocation size is equal to size of a structure which contains all allocated + * objects assuming natural alignment. + * The estimator assumes that scratch buffer implementation will provide memory block based at + * largest possible alignment boundary. + * + * @return Total number of bytes required for given allocation method, + * including this allocation. + */ template size_t add(AllocType alloc_type, size_t count, size_t alignment = alignof(T)) { size_t offset = align_up(sizes[(size_t)alloc_type], alignment); diff --git a/dali/kernels/kernel_traits.h b/dali/kernels/kernel_traits.h index a10618c953c..d5ac1985b74 100644 --- a/dali/kernels/kernel_traits.h +++ b/dali/kernels/kernel_traits.h @@ -157,25 +157,31 @@ struct KernelArgs { } // namespace detail -/// @brief Tells what inputs a kernel takes -/// -/// If there's a type `Kernel::Inputs`, then this type is returned. -/// Otherwise, it's a tuple of all `InList` parameters from `Kernel::Run` signature. +/** + * @brief Tells what inputs a kernel takes + * + * If there's a type `Kernel::Inputs`, then this type is returned. + * Otherwise, it's a tuple of all `InList` parameters from `Kernel::Run` signature. + */ template using kernel_inputs = typename detail::KernelInputs::type; -/// @brief Tells what outputs a kernel produces -/// -/// If there's a type `Kernel::Outputs`, then this type is returned. -/// Otherwise, it's a tuple of all `OutList` parameters from `Kernel::Run` signature. +/** + * @brief Tells what outputs a kernel produces + * + * If there's a type `Kernel::Outputs`, then this type is returned. + * Otherwise, it's a tuple of all `OutList` parameters from `Kernel::Run` signature. + */ template using kernel_outputs = typename detail::KernelOutputs::type; -/// @brief Tells what extra arguments a kernel takes -/// -/// If there's a type `Kernel::Args`, then this type is returned. -/// Otherwise returns all parameters to `Kernel::Run` that are neither -/// `InList`, `OutList` or KernelContext. +/** + * @brief Tells what extra arguments a kernel takes + * + * If there's a type `Kernel::Args`, then this type is returned. + * Otherwise returns all parameters to `Kernel::Run` that are neither + * `InList`, `OutList` or KernelContext. + */ template using kernel_args = typename detail::KernelArgs::type; diff --git a/dali/kernels/scratch.h b/dali/kernels/scratch.h index b9d0d7861b4..e32550d4051 100644 --- a/dali/kernels/scratch.h +++ b/dali/kernels/scratch.h @@ -61,7 +61,9 @@ class BumpAllocator { assert(used_ + required <= total_); } - /// @brief Resets the usage counter so the buffer can be reused. + /** + * @brief Resets the usage counter so the buffer can be reused. 
+ */ inline void Clear() { used_ = 0; } @@ -72,7 +74,9 @@ class BumpAllocator { size_t used_ = 0; }; -/// @brief Scratchpad with pre-existing buffers +/** + * @brief Scratchpad with pre-existing buffers + */ struct PreallocatedScratchpad : Scratchpad { PreallocatedScratchpad() = default; @@ -99,41 +103,55 @@ struct PreallocatedScratchpad : Scratchpad { std::array allocs; }; -/// @brief Implements an ever-growing scratchpad +/** + * @brief Implements an ever-growing scratchpad + */ class ScratchpadAllocator { public: static constexpr size_t NumAllocTypes = static_cast(AllocType::Count); - /// @brief Describes scratch memory allocation policy - /// - /// When reserving `size` memory and the existing capacity is `capacity` - /// then the new allocation will be of size: - /// ``` - /// new_capacity = max(size * (1 + Margin), capacity * GrowthRatio) - /// ``` + /** + * @brief Describes scratch memory allocation policy + * + * When reserving `size` memory and the existing capacity is `capacity` + * then the new allocation will be of size: + * ``` + * new_capacity = max(size * (1 + Margin), capacity * GrowthRatio) + * ``` + */ struct AllocPolicy { - /// When reserving more memory than available, current capacity will - /// be multiplied by this value. + /** + * When reserving more memory than available, current capacity will + * be multiplied by this value. + */ float GrowthRatio = 2; - /// When reserving memory, make sure that at least `(1 + Margin) * size` is - /// actually allocated. + /** + * When reserving memory, make sure that at least `(1 + Margin) * size` is + * actually allocated. + */ float Margin = 0.1; }; - /// @brief Returns reference to the current - /// allocation policy for given allocation type. + /** + * @brief Returns reference to the current + * allocation policy for given allocation type. + */ AllocPolicy &Policy(AllocType type) { return buffers_[static_cast(type)].policy; } - /// @brief Returns allocation policy for given allocation type + /** + * @brief Returns allocation policy for given allocation type + */ const AllocPolicy &Policy(AllocType type) const { return buffers_[static_cast(type)].policy; } - /// @brief Releases any storage allocated by calls to `Reserve`. - /// @remarks Scratchpad returned by `GetScratchpad` is invalid after this call. + /** + * @brief Releases any storage allocated by calls to `Reserve`. + * @remarks Scratchpad returned by `GetScratchpad` is invalid after this call. + */ void Free() { for (auto &buffer : buffers_) { buffer.mem.reset(); @@ -142,18 +160,22 @@ class ScratchpadAllocator { } } - /// @brief Reserves memory for all allocation types. - /// - /// See `Reserve(AllocType, size_t)` for details. + /** + * @brief Reserves memory for all allocation types. + * + * See `Reserve(AllocType, size_t)` for details. + */ void Reserve(std::array sizes) { for (size_t idx = 0; idx < NumAllocTypes; idx++) { Reserve(AllocType(idx), sizes[idx]); } } - /// @brief Ensures that at least `sizes` bytes of memory are available in storage `type` - /// @remarks If reallocation happens, any `Scratchpad` returned by `GetScratchpad` - /// is invalidated. + /** + * @brief Ensures that at least `sizes` bytes of memory are available in storage `type` + * @remarks If reallocation happens, any `Scratchpad` returned by `GetScratchpad` + * is invalidated. 
+ */ void Reserve(AllocType type, size_t size) { size_t index = static_cast(type); auto &buf = buffers_[index]; @@ -177,7 +199,9 @@ class ScratchpadAllocator { } } - /// @brief Returns allocator's capacities for all allocation types + /** + * @brief Returns allocator's capacities for all allocation types + */ std::array Capacities() const noexcept { std::array capacities; for (size_t i = 0; i < buffers_.size(); i++) @@ -185,14 +209,18 @@ class ScratchpadAllocator { return capacities; } - /// @brief Returns allocator's capacity for given allocation type + /** + * @brief Returns allocator's capacity for given allocation type + */ size_t Capacity(AllocType type) const noexcept { return buffers_[static_cast(type)].capacity; } - /// @brief Returns a scratchpad. - /// @remarks The returned scratchpad is invalidated by desctruction of this - /// object or by subsequent calls to `Reserve` or `Free`. + /** + * @brief Returns a scratchpad. + * @remarks The returned scratchpad is invalidated by desctruction of this + * object or by subsequent calls to `Reserve` or `Free`. + */ PreallocatedScratchpad GetScratchpad() { PreallocatedScratchpad scratchpad; for (size_t idx = 0; idx < NumAllocTypes; idx++) { diff --git a/dali/kernels/scratch_copy_impl.h b/dali/kernels/scratch_copy_impl.h index 21923b15cb1..a6373c018f0 100644 --- a/dali/kernels/scratch_copy_impl.h +++ b/dali/kernels/scratch_copy_impl.h @@ -29,7 +29,9 @@ namespace detail { inline void copy_to_buffer(char *buffer, const size_t *offsets) {} -/// @brief Copy contents of collections `{ c, tail... }` to pointers stored in `ptrs`. +/** + * @brief Copy contents of collections `{ c, tail... }` to pointers stored in `ptrs`. + */ template void copy_to_buffer(char *buffer, const size_t *offsets, @@ -42,12 +44,14 @@ void copy_to_buffer(char *buffer, inline void GetCollectionOffsets(size_t base, size_t *offsets) { *offsets = base; } -/// @brief Assuming aligned storage in a single buffer, -/// calculates start offsets of collections `{ c, tail... }` -/// @param base - offset of the first element of the first collection `c` -/// @param offsets - the array to store the offsets -/// @param c - collection to be stored at (aligned) `base` -/// @param tail - collections to be stored after `c` +/** + * @brief Assuming aligned storage in a single buffer, + * calculates start offsets of collections `{ c, tail... }` + * @param base - offset of the first element of the first collection `c` + * @param offsets - the array to store the offsets + * @param c - collection to be stored at (aligned) `base` + * @param tail - collections to be stored after `c` + */ template void GetCollectionOffsets(size_t base, size_t *offsets, const Collection &c, @@ -90,7 +94,9 @@ auto variadic_max(T0 t0, T... tail) { } // namespace detail -/// @brief Allocates from scratchpad and copies the collections to the allocated buffer. +/** + * @brief Allocates from scratchpad and copies the collections to the allocated buffer. + */ template std::tuple>*...> ToContiguousHostMem(Scratchpad &scratchpad, const Collections &... c) { @@ -110,9 +116,11 @@ ToContiguousHostMem(Scratchpad &scratchpad, const Collections &... c) { return detail::GetCollectionPtrs(tmp, &offsets[0], c...); } -/// @brief Allocates GPU from scratchpad, copies the collections to a -/// temporary host buffer and then transfers the contents to the GPU in just one -/// `cudaMemcpyAsync`. 
+/** + * @brief Allocates GPU from scratchpad, copies the collections to a + * temporary host buffer and then transfers the contents to the GPU in just one + * `cudaMemcpyAsync`. + */ template std::tuple>*...> ToContiguousGPUMem(Scratchpad &scratchpad, cudaStream_t stream, const Collections &... c) { diff --git a/dali/kernels/tensor_shape.h b/dali/kernels/tensor_shape.h index 87db2e2ddd9..e486494373c 100644 --- a/dali/kernels/tensor_shape.h +++ b/dali/kernels/tensor_shape.h @@ -68,20 +68,24 @@ struct compile_time_size_impl> : std::integral_constant template struct compile_time_size_impl> : std::integral_constant {}; -/// @brief Class representing shape of a Tensor -/// -/// Static shapes do not allocate additional memory as they are backed by static array -/// @tparam ndim Either non-negative integer representing static number of dimensions -/// or DynamicDimensions. +/** + * @brief Class representing shape of a Tensor + * + * Static shapes do not allocate additional memory as they are backed by static array + * @tparam ndim Either non-negative integer representing static number of dimensions + * or DynamicDimensions. + */ template struct TensorShape; template struct compile_time_size_impl> : std::integral_constant {}; -/// @brief Base class for TensorShape containing common code for iterators and operator[] -/// @tparam Container - the data structure in which the sizes are stored -/// @tparam ndim - number of dimensions +/** + * @brief Base class for TensorShape containing common code for iterators and operator[] + * @tparam Container - the data structure in which the sizes are stored + * @tparam ndim - number of dimensions + */ template struct TensorShapeBase { using container_type = Container; @@ -110,10 +114,14 @@ struct TensorShapeBase { DALI_NO_EXEC_CHECK DALI_HOST_DEV const_iterator cend() const noexcept { return shape.cend(); } - /// @brief Returns number of dimensions in this shape + /** + * @brief Returns number of dimensions in this shape + */ DALI_NO_EXEC_CHECK DALI_HOST_DEV size_type size() const noexcept { return shape.size(); } - /// @brief Returns number of dimensions in this shape + /** + * @brief Returns number of dimensions in this shape + */ DALI_NO_EXEC_CHECK DALI_HOST_DEV size_type sample_dim() const noexcept { return shape.size(); } DALI_NO_EXEC_CHECK @@ -127,24 +135,32 @@ struct TensorShapeBase { Container shape; static constexpr int static_ndim = ndim; - /// @brief Returns a static subshape consisting of first other_ndim dimensions (outer dimensions) - /// [1, 2, 3, 4].first<2>() -> [1, 2] + /** + * @brief Returns a static subshape consisting of first other_ndim dimensions (outer dimensions) + * [1, 2, 3, 4].first<2>() -> [1, 2] + */ DALI_NO_EXEC_CHECK template DALI_HOST_DEV TensorShape first() const; - /// @brief Returns a static subshape consisting of last other_ndim dimensions (inner dimensions) - /// [1, 2, 3, 4].last<2>() -> [3, 4] + /** + * @brief Returns a static subshape consisting of last other_ndim dimensions (inner dimensions) + * [1, 2, 3, 4].last<2>() -> [3, 4] + */ DALI_NO_EXEC_CHECK template DALI_HOST_DEV TensorShape last() const; - /// @brief Returns a dynamic subshape consisting of first count dimensions (outer dimensions) - /// [1, 2, 3, 4].first(2) -> [1, 2] + /** + * @brief Returns a dynamic subshape consisting of first count dimensions (outer dimensions) + * [1, 2, 3, 4].first(2) -> [1, 2] + */ TensorShape first(int count) const; - /// @brief Returns a dynamic subshape consisting of last count dimensions (inner dimensions) - /// [1, 2, 3, 
4].last(2) -> [3, 4] + /** + * @brief Returns a dynamic subshape consisting of last count dimensions (inner dimensions) + * [1, 2, 3, 4].last(2) -> [3, 4] + */ TensorShape last(int count) const; protected: @@ -161,7 +177,9 @@ struct TensorShapeBase { using DynamicTensorShapeContainer = SmallVector; -/// @brief Dynamic TensorShape can be constructed from any Static TensorShape +/** + * @brief Dynamic TensorShape can be constructed from any Static TensorShape + */ template <> struct TensorShape : public TensorShapeBase { @@ -207,8 +225,10 @@ struct TensorShape return *this; } - /// @brief Convert to static shape - /// Behaviour is undefined for other_ndim != dim() + /** + * @brief Convert to static shape + * Behaviour is undefined for other_ndim != dim() + */ template TensorShape to_static() const { static_assert(other_ndim != DynamicDimensions, @@ -313,7 +333,9 @@ TensorShape TensorShapeBase::last(int count) return result; } -/// @brief Checks if both shapes have the same number of dimensions and all of them are equal +/** + * @brief Checks if both shapes have the same number of dimensions and all of them are equal + */ DALI_NO_EXEC_CHECK template DALI_HOST_DEV bool operator==(const TensorShape &left, @@ -344,9 +366,11 @@ constexpr int shape_cat_ndim(int left_ndim, int right_ndim) { : left_ndim + right_ndim; } -/// @brief Concatenate shapes -/// @return TensorShape Static shape if both of arguments -/// are static, otherwise dynamic +/** + * @brief Concatenate shapes + * @return TensorShape Static shape if both of the arguments + * are static, otherwise dynamic + */ DALI_NO_EXEC_CHECK template DALI_HOST_DEV @@ -364,7 +388,9 @@ TensorShape shape_cat(const TensorShape DALI_HOST_DEV @@ -378,7 +404,9 @@ TensorShape shape_cat(const TensorShape &left, int64_t right) { return result; } -/// @brief Prepends a scalar to a shape +/** + * @brief Prepends a scalar to a shape + */ DALI_NO_EXEC_CHECK template DALI_HOST_DEV @@ -392,7 +420,9 @@ TensorShape shape_cat(int64_t left, const TensorShape &right) { return result; } -/// @brief Flatten list of shapes into contigous vector +/** + * @brief Flatten list of shapes into contiguous vector + */ template std::enable_if_t> flatten_shapes(const std::vector> &shapes) { @@ -406,8 +436,10 @@ flatten_shapes(const std::vector> &shapes) { return result; } -/// @brief Get the dim from list of shapes that have uniform dimensions. -/// @return 0 if list is empty, otherwise dim of first element +/** + * @brief Get the dim from list of shapes that have uniform dimensions. + * @return 0 if list is empty, otherwise dim of first element + */ template std::enable_if_t>::value || std::is_same>::value, @@ -444,35 +476,49 @@ static std::vector flatten_shapes(std::initializer_list>(shapes)); } -/// @brief List of TensorShapes stored as contigous vector. -/// All shapes have the same number of dimensions -/// -/// @tparam sample_ndim Either non-negative integer representing static number of dimensions -/// or DynamicDimensions. +/** + * @brief List of TensorShapes stored as contiguous vector. + * All shapes have the same number of dimensions + * + * @tparam sample_ndim Either non-negative integer representing static number of dimensions + * or DynamicDimensions.
+ */ template struct TensorListShape; -/// @tparam Derived - actual class of an object (CRTP) -/// @tparam sample_dim - number of dimensions of each sample in the list +/** + * @tparam Derived - actual class of an object (CRTP) + * @tparam sample_dim - number of dimensions of each sample in the list + */ template struct TensorListShapeBase { - /// @brief Returns a static subshape list consisting of first other_ndim dimensions - /// (outer dimensions) for each sample + /** + * @brief Returns a static subshape list consisting of first other_ndim dimensions + * (outer dimensions) for each sample + */ template TensorListShape first() const; - /// @brief Returns a static subshape list consisting of last other_ndim dimensions - /// (inner dimensions) for each sample + /** + * @brief Returns a static subshape list consisting of last other_ndim dimensions + * (inner dimensions) for each sample + */ template TensorListShape last() const; - /// @brief Returns a dynamic subshape list consisting of first count dimensions - /// (outer dimensions) for each sample + /** + * @brief Returns a dynamic subshape list consisting of first count dimensions + * (outer dimensions) for each sample + */ TensorListShape first(int count) const; - /// @brief Returns a dynamic subshape list consisting of last count dimensions - /// (inner dimensions) for each sample + /** + * @brief Returns a dynamic subshape list consisting of last count dimensions + * (inner dimensions) for each sample + */ TensorListShape last(int count) const; - /// @brief Return a span containing the shape of `sample` + /** + * @brief Return a span containing the shape of `sample` + */ span tensor_shape_span(int64_t sample) { @@ -485,10 +531,12 @@ struct TensorListShapeBase { return {&shapes[sample * sample_dim()], span_extent_t(sample_dim())}; } - /// @brief Return the TensorShape for given `sample` - /// - /// @tparam tensor_ndim Should be equal sample_dim() or DynamicDimensions to obtain either static - /// or dynamic TensorShape + /** + * @brief Return the TensorShape for given `sample` + * + * @tparam tensor_ndim Should be equal sample_dim() or DynamicDimensions to obtain either static + * or dynamic TensorShape + */ template TensorShape tensor_shape(int64_t sample) const { static_assert(tensor_ndim == sample_ndim || sample_ndim == DynamicDimensions @@ -510,7 +558,9 @@ struct TensorListShapeBase { set_tensor_shape>(sample, sample_shape); } - /// @brief Set a TensorShape for `sample` + /** + * @brief Set a TensorShape for `sample` + */ template void set_tensor_shape(int64_t sample, const SampleShape &sample_shape) { detail::check_compatible_ndim::value>(); @@ -668,7 +718,9 @@ struct TensorListShape return *this; } - /// @brief Return a dynamic TensorShape for `sample` + /** + * @brief Return a dynamic TensorShape for `sample` + */ TensorShape operator[](int64_t sample) const { return tensor_shape(sample); } @@ -678,10 +730,12 @@ struct TensorListShape int ndim = 0; using Base::shapes; - /// @brief Convert to static TensorListShape - /// - /// Behaviour is undefined for other_ndim != sample_dim() - /// @tparam other_ndim must be equal sample_dim() + /** + * @brief Convert to static TensorListShape + * + * Behaviour is undefined for other_ndim != sample_dim() + * @tparam other_ndim must be equal sample_dim() + */ template TensorListShape to_static() const & { static_assert(other_ndim != DynamicDimensions, @@ -755,7 +809,9 @@ struct TensorListShape : TensorListShapeBase, sampl return *this; } - /// @brief Return a static TensorShape for `sample` + 
/** + * @brief Return a static TensorShape for `sample` + */ TensorShape operator[](int64_t sample) const { TensorShape result; int64_t base = sample_dim() * sample; @@ -877,8 +933,10 @@ bool operator!=(const TensorListShape &left, const TensorListShape void calculate_pointers(std::vector &pointers, T *base, const TensorListShape &tls) { @@ -890,8 +948,10 @@ void calculate_pointers(std::vector &pointers, T *base, } } -/// @brief Calculate pointers for Tensors stored in contigous buffer whose shapes -/// are described by tls. Offsets are calculated as number of elements of each tensor. +/** + * @brief Calculate pointers for Tensors stored in contiguous buffer whose shapes + * are described by tls. Offsets are calculated as number of elements of each tensor. + */ template std::vector calculate_pointers(T *base, const TensorListShape &tls) { std::vector pointers; @@ -899,7 +959,9 @@ std::vector calculate_pointers(T *base, const TensorListShape return pointers; } -/// @brief Checks if all TensorShapes stored in `tls` have the same sizes +/** + * @brief Checks if all TensorShapes stored in `tls` have the same sizes + */ template bool is_uniform(const TensorListShape &tls) { if (!tls.size()) { diff --git a/dali/kernels/tensor_view.h b/dali/kernels/tensor_view.h index e17073678e4..7ec4ee76ff3 100644 --- a/dali/kernels/tensor_view.h +++ b/dali/kernels/tensor_view.h @@ -51,8 +51,10 @@ bool ContainsCoords(const Shape &shape, const Position &pos) { return true; } -/// @brief Calculates flat index of a given element in the tensor -/// @remarks If pos has fewer dimensions than shape, the remaining offsets are assumed to be 0 +/** + * @brief Calculates flat index of a given element in the tensor + * @remarks If pos has fewer dimensions than shape, the remaining offsets are assumed to be 0 + */ template if_array_like CalcOffset(const Shape &shape, const Position &pos) { ptrdiff_t ofs = pos[0]; @@ -69,7 +71,9 @@ if_array_like CalcOffset(const Shape &shape, const Position return ofs; } -/// @brief Calculates the offset to a slice of the tensor +/** + * @brief Calculates the offset to a slice of the tensor + */ template ptrdiff_t CalcOffset(const Shape &shape, const ptrdiff_t &index) { ptrdiff_t ofs = index; @@ -82,11 +86,13 @@ ptrdiff_t CalcOffset(const Shape &shape, const ptrdiff_t &index) { struct EmptyBackendTag {}; -/// @brief Non-owning wrapper for Tensor, containing typed pointer to data and TensorShape -/// -/// @tparam Backend -/// @tparam DataType -/// @tparam ndim either static for ndim >= 0 or DynamicDimensions +/** + * @brief Non-owning wrapper for Tensor, containing typed pointer to data and TensorShape + * + * @tparam Backend + * @tparam DataType + * @tparam ndim either static for ndim >= 0 or DynamicDimensions + */ template struct TensorView; @@ -95,14 +101,18 @@ struct TensorViewBase { using element_type = DataType; int dim() const { return shape.size(); } - /// @brief Utility to calculate pointer to element at given coordinates + /** + * @brief Utility to calculate pointer to element at given coordinates + */ template DataType *operator()(int64_t idx0, Indices &&...
idx) const { return data + CalcOffset(shape, std::array{ idx0, (ptrdiff_t{idx})...}); } - /// @brief Utility to calculate pointer to element at given coordinates + /** + * @brief Utility to calculate pointer to element at given coordinates + */ template DataType *operator()(const Offset &pos) const { return data + CalcOffset(shape, pos); @@ -125,7 +135,9 @@ struct TensorViewBase { TensorViewBase(DataType *data, TensorShape &&shape) : data(data), shape(std::move(shape)) {} }; -/// @brief Dynamic TensorView can be constructed from any Static TensorView +/** + * @brief Dynamic TensorView can be constructed from any Static TensorView + */ template struct TensorView : TensorViewBase { @@ -218,13 +230,15 @@ TensorView TensorViewBase()}; } -/// @brief Non-owning list of Tensors. -/// -/// Contains TensorListShape and pointers to the beginning of each Tensor. -/// For sample `i`, offsets[i] is an offset to first element and offsets[i+1] is an offset to -/// last + 1 element. -/// Shape and pointers are stored in contiguous memory for improved data locality and reduced -/// number of allocations. +/** + * @brief Non-owning list of Tensors. + * + * Contains TensorListShape and pointers to the beginning of each Tensor. + * For sample `i`, offsets[i] is an offset to first element and offsets[i+1] is an offset to + * last + 1 element. + * Shape and pointers are stored in contiguous memory for improved data locality and reduced + * number of allocations. + */ template struct TensorListView; @@ -232,7 +246,9 @@ template struct TensorListViewBase { using element_type = DataType; - /// @brief Return non-owning View to sample at specified index + /** + * @brief Return non-owning View to sample at specified index + */ TensorView operator[](int sample) const { return { tensor_data(sample), tensor_shape(sample) }; } @@ -244,7 +260,9 @@ struct TensorListViewBase { return { data[sample], shape.template tensor_shape(sample)}; } - /// @brief Number of samples + /** + * @brief Number of samples + */ int size() const noexcept { return shape.size(); } int num_samples() const noexcept { return size(); } ptrdiff_t num_elements() const { @@ -322,33 +340,43 @@ struct TensorListViewBase { TensorListViewBase &operator=(const TensorListViewBase &) = default; TensorListViewBase &operator=(TensorListViewBase &&other) = default; - /// @brief Constructs a tensor list without specific memory - /// - /// The shape is copied from `shape` parameter and the `data` - /// vector is resized to num_samples and filled with null pointers. + /** + * @brief Constructs a tensor list without specific memory + * + * The shape is copied from `shape` parameter and the `data` + * vector is resized to num_samples and filled with null pointers. + */ TensorListViewBase(const TensorListShape &shape) // NOLINT : shape(shape) , data(this->num_samples(), nullptr) {} - /// @brief Constructs a tensor list without specific memory - /// - /// The shape is taken from `shape` parameter and the `data` - /// vector is resized to num_samples and filled with null pointers. + /** + * @brief Constructs a tensor list without specific memory + * + * The shape is taken from `shape` parameter and the `data` + * vector is resized to num_samples and filled with null pointers. 
+ */ TensorListViewBase(TensorListShape &&shape) // NOLINT : shape(std::move(shape)) , data(this->num_samples(), nullptr) {} - /// @brief Constructs a tensor list from non-contiguous memory + /** + * @brief Constructs a tensor list from non-contiguous memory + */ TensorListViewBase(const data_pointers_t &data, const TensorListShape &shape) : shape(shape) , data(data) {} - /// @brief Constructs a tensor list from non-contiguous memory + /** + * @brief Constructs a tensor list from non-contiguous memory + */ TensorListViewBase(data_pointers_t &&data, TensorListShape &&shape) : shape(std::move(shape)) , data(std::move(data)) {} - /// @brief Constructs a tensor list from non-contiguous memory + /** + * @brief Constructs a tensor list from non-contiguous memory + */ template TensorListViewBase(const std::vector &data, const TensorListShape &shape) : shape(shape) @@ -356,7 +384,9 @@ struct TensorListViewBase { detail::check_implicit_conversion(); } - /// @brief Constructs a tensor list from non-contiguous memory + /** + * @brief Constructs a tensor list from non-contiguous memory + */ template TensorListViewBase(std::vector &&data, TensorListShape &&shape) : shape(std::move(shape)) @@ -364,21 +394,29 @@ struct TensorListViewBase { detail::check_implicit_conversion(); } - /// @brief Constructs a tensor list from non-contiguous memory + /** + * @brief Constructs a tensor list from non-contiguous memory + */ TensorListViewBase(DataType **data, const TensorListShape &shape) : shape(shape) , data(data, data + this->shape.num_samples()) {} - /// @brief Constructs a tensor list from non-contiguous memory + /** + * @brief Constructs a tensor list from non-contiguous memory + */ TensorListViewBase(DataType **data, TensorListShape &&shape) : shape(std::move(shape)) , data(data, data + this->shape.num_samples()) {} - /// @brief Constructs a tensor list from contiguous memory + /** + * @brief Constructs a tensor list from contiguous memory + */ TensorListViewBase(DataType *data, const TensorListShape &shape) : shape(shape) { calculate_pointers(this->data, data, this->shape); } - /// @brief Constructs a tensor list from contiguous memory + /** + * @brief Constructs a tensor list from contiguous memory + */ TensorListViewBase(DataType *data, TensorListShape &&shape) : shape(std::move(shape)) { calculate_pointers(this->data, data, this->shape); @@ -395,8 +433,8 @@ struct TensorListView TensorListView &operator=(const TensorListView &) = default; TensorListView &operator=(TensorListView &&) = default; - //////////////////////////////////////////////////////////////////////////// - // Construction from contiguous memory + //@{ + /** @brief Construction from contiguous memory */ TensorListView(DataType *data, const std::vector> &shapes) : Base(data, shapes) {} @@ -409,8 +447,10 @@ struct TensorListView TensorListView(DataType *data, TensorListShape &&shape) : Base(data, std::move(shape)) {} - //////////////////////////////////////////////////////////////////////////// - // Construction from non-contiguous memory + //@} + + //@{ + /** @brief Construction from non-contiguous memory */ TensorListView(DataType **data, const std::vector> &shapes) : Base(data, shapes) {} @@ -423,8 +463,10 @@ struct TensorListView TensorListView(DataType **data, TensorListShape &&shape) : Base(data, std::move(shape)) {} - //////////////////////////////////////////////////////////////////////////// - // Implicit conversion + //@} + + //@{ + /** @brief Implicit conversion */ template TensorListView(const TensorListView &other) @@ -437,6 
+479,8 @@ struct TensorListView : Base(std::move(other.data), std::move(other.shape)) { detail::check_implicit_conversion(); } + + //@} }; template @@ -449,8 +493,8 @@ struct TensorListView : TensorListViewBase { TensorListView &operator=(const TensorListView &) = default; TensorListView &operator=(TensorListView &&) = default; - //////////////////////////////////////////////////////////////////////////// - // Construction from contiguous memory + //@{ + /** @brief Construction from contiguous memory */ TensorListView(std::nullptr_t, const std::vector> &shapes) : Base(TensorListShape(shapes)) {} @@ -463,8 +507,10 @@ struct TensorListView : TensorListViewBase { TensorListView(std::nullptr_t, TensorListShape &&shape) : Base(std::move(shape)) {} - //////////////////////////////////////////////////////////////////////////// - // Construction from contiguous memory + //@} + + //@{ + /** @brief Construction from contiguous memory */ TensorListView(DataType *data, const std::vector> &shapes) : Base(data, TensorListShape(shapes)) {} @@ -477,8 +523,10 @@ struct TensorListView : TensorListViewBase { TensorListView(DataType *data, TensorListShape &&shape) : Base(data, std::move(shape)) {} - //////////////////////////////////////////////////////////////////////////// - // Construction from non-contiguous memory + //@} + + //@{ + /** @brief Construction from non-contiguous memory */ TensorListView(DataType **data, const std::vector> &shapes) : Base(data, shapes) {} @@ -491,8 +539,10 @@ struct TensorListView : TensorListViewBase { TensorListView(DataType **data, TensorListShape &&shape) : Base(data, std::move(shape)) {} - //////////////////////////////////////////////////////////////////////////// - // Implicit conversion + //@} + + //@{ + /** @brief Implicit conversion */ template TensorListView(const TensorListView &other) @@ -506,80 +556,102 @@ struct TensorListView : TensorListViewBase { : Base(std::move(other.data), std::move(other.shape)) { detail::check_implicit_conversion(); } + + //@} }; struct StorageCPU; struct StorageGPU; -/// @brief Wraps raw memory as a tensor +/** + * @brief Wraps raw memory as a tensor + */ template TensorView make_tensor(T *data, TensorShape shape) { return { data, std::move(shape) }; } -/// @brief Wraps raw memory as a tensor list +/** + * @brief Wraps raw memory as a tensor list + */ template TensorListView make_tensor_list(T *data, TensorListShape shape) { return { data, std::move(shape) }; } -/// @brief Wraps raw memory as a tensor list +/** + * @brief Wraps raw memory as a tensor list + */ template TensorListView make_tensor_list(T **data, TensorListShape shape) { return { data, std::move(shape) }; } -/// @brief Wraps CPU raw memory as a tensor +/** + * @brief Wraps CPU raw memory as a tensor + */ template TensorView make_tensor_cpu(T *data, TensorShape shape) { return { data, std::move(shape) }; } -/// @brief Wraps contiguous CPU memory as a tensor list +/** + * @brief Wraps contiguous CPU memory as a tensor list + */ template TensorListView make_tensor_list_cpu(T *data, TensorListShape shape) { return { data, std::move(shape) }; } -/// @brief Wraps CPU raw memory as a tensor list +/** + * @brief Wraps CPU raw memory as a tensor list + */ template TensorListView make_tensor_list_cpu(T **data, TensorListShape shape) { return { data, std::move(shape) }; } -/// @brief Wraps GPU raw memory as a tensor +/** + * @brief Wraps GPU raw memory as a tensor + */ template TensorView make_tensor_gpu(T *data, TensorShape shape) { return { data, std::move(shape) }; } -/// @brief 
Wraps contiguous GPU memory as a tensor list +/** + * @brief Wraps contiguous GPU memory as a tensor list + */ template TensorListView make_tensor_list_gpu(T *data, TensorListShape shape) { return { data, std::move(shape) }; } -/// @brief Wraps GPU raw memory as a tensor list +/** + * @brief Wraps GPU raw memory as a tensor list + */ template TensorListView make_tensor_list_gpu(T **data, TensorListShape shape) { return { data, std::move(shape) }; } -/// @{ -/// @brief Get a subtensor by slicing along outermost dimension at position `pos` -/// -/// @details Produces tensor, for which number of dimensions is reduced by 1. -/// Removed dimension is outer-most (e.g. for shape {3,2,4,6} produces {2,4,6}). -/// Data inside the tensor is extracted according to provided index. -/// Data is not copied. -/// -/// Example: -/// tv.data = [[1, 2, 3], [4, 5, 6]] (shape: [2, 3]) -/// oust_dimension(tv, 1) -> [4, 5, 6] (shape: [3]) -/// -/// @param source Source TensorView -/// @param idx Index inside dimension, along which data is extracted -/// @return TensorView with reduced dimensionality +/** + * @{ + * @brief Get a subtensor by slicing along outermost dimension at position `pos` + * + * @details Produces tensor, for which number of dimensions is reduced by 1. + * Removed dimension is outer-most (e.g. for shape {3,2,4,6} produces {2,4,6}). + * Data inside the tensor is extracted according to provided index. + * Data is not copied. + * + * Example: + * tv.data = [[1, 2, 3], [4, 5, 6]] (shape: [2, 3]) + * subtensor(tv, 1) -> [4, 5, 6] (shape: [3]) + * + * @param source Source TensorView + * @param pos Index inside dimension, along which data is extracted + * @return TensorView with reduced dimensionality + */ template TensorView subtensor(TensorView source, int64_t pos) { @@ -596,13 +668,17 @@ subtensor(TensorView source, int64_ DataType *data = source.data + pos * volume(shape); return make_tensor(data, std::move(shape)); } -/// @} - -/// @brief Retrieves a sample range from a tensor list -/// @param input input list -/// @param out_slice output list -/// @param begin index of the first sample to include in the subrange -/// @param end index one past the last sample to include in the subrange +/** + * @} + */ + +/** + * @brief Retrieves a sample range from a tensor list + * @param input input list + * @param out_slice output list + * @param begin index of the first sample to include in the subrange + * @param end index one past the last sample to include in the subrange + */ template void sample_range(TensorListView &out_slice, const TensorListView &input, int begin, int end) { @@ -619,11 +695,13 @@ void sample_range(TensorListView &out_slice, } -/// @brief Retrieves a sample range from a tensor list -/// @param input input list -/// @param begin index of the first sample to include in the subrange -/// @param end index one past the last sample to include in the subrange -/// @return `TensorListView` consisting of samples at indices `begin` to `end` - 1 +/** + * @brief Retrieves a sample range from a tensor list + * @param input input list + * @param begin index of the first sample to include in the subrange + * @param end index one past the last sample to include in the subrange + * @return `TensorListView` consisting of samples at indices `begin` to `end` - 1 + */ template TensorListView sample_range( diff --git a/dali/kernels/test/block_setup_test.cc b/dali/kernels/test/block_setup_test.cc index bde0f10bf48..5e206440a49 100644 --- a/dali/kernels/test/block_setup_test.cc +++
b/dali/kernels/test/block_setup_test.cc @@ -137,10 +137,12 @@ bool operator==(const BlockMap &a, const BlockMap &b) { inline void ValidateBlockMap(const BlockMap<0> &map, const TensorShape<0> &shape) {} -/// @brief Check that the output shape is covered with rectangular grid. -/// -/// The grid cells must be aligned between rows/slices, but don't have to be uniform -/// - typically the last cell will be smaller and that's expected. +/** + * @brief Check that the output shape is covered with rectangular grid. + * + * The grid cells must be aligned between rows/slices, but don't have to be uniform + * - typically the last cell will be smaller and that's expected. + */ template void ValidateBlockMap(const BlockMap &map, const TensorShape &shape) { ASSERT_FALSE(map.inner.empty()); diff --git a/dali/kernels/test/kernel_test.cc b/dali/kernels/test/kernel_test.cc index d33c1629ab9..4fd9ae8725e 100644 --- a/dali/kernels/test/kernel_test.cc +++ b/dali/kernels/test/kernel_test.cc @@ -22,8 +22,8 @@ namespace dali { namespace kernels { -template -using ExampleKernel = examples::Kernel; +template +using ExampleKernel = examples::Kernel; // Neither function present struct Empty { @@ -66,18 +66,18 @@ TEST(KernelAPI, InferIOArgs) { >::value, "Wrong set of inputs detected"); static_assert(std::is_same< - kernel_outputs>, + kernel_outputs>, std::tuple&> >::value, "Wrong set of outputs detected"); static_assert(std::is_same< - kernel_args>, + kernel_args>, std::tuple&> >::value, "Wrong set of arguments detected"); } TEST(KernelAPI, EnforceConcept) { - static_assert(detail::has_unique_member_function_Run>::value, + static_assert(detail::has_unique_member_function_Run>::value, "ExampleKernel has Run function"); static_assert(!detail::has_unique_member_function_Run::value, @@ -85,7 +85,7 @@ TEST(KernelAPI, EnforceConcept) { static_assert(!detail::has_unique_member_function_Run::value, "TwoRuns has two Run functions"); - check_kernel>(); + check_kernel>(); static_assert(!is_kernel::value, "Empty has no Run function and cannot be a kernel"); static_assert(!is_kernel::value, @@ -94,8 +94,8 @@ TEST(KernelAPI, EnforceConcept) { static_assert(!is_kernel::value, "Run has bad parameters"); } -template -KernelRequirements dali::kernels::examples::Kernel::Setup( +template +KernelRequirements dali::kernels::examples::Kernel::Setup( KernelContext &context, const InListGPU &in1, const InTensorGPU &in2, @@ -103,8 +103,8 @@ KernelRequirements dali::kernels::examples::Kernel::Setup( return {}; } -template -void dali::kernels::examples::Kernel::Run(KernelContext &context, +template +void dali::kernels::examples::Kernel::Run(KernelContext &context, const OutListGPU &out, const InListGPU &in1, const InTensorGPU &in2, @@ -117,7 +117,7 @@ TEST(KernelAPI, CallWithTuples) { OutListGPU out; std::vector aux; - examples::Kernel K; + examples::Kernel K; KernelContext context; kernel::Run(K, context, std::tie(out), std::tie(in1, in2), std::tie(aux)); } diff --git a/dali/kernels/test/warp_test/warp_gpu_test.cu b/dali/kernels/test/warp_test/warp_gpu_test.cu index c5d051c6603..f0d2ae78557 100644 --- a/dali/kernels/test/warp_test/warp_gpu_test.cu +++ b/dali/kernels/test/warp_test/warp_gpu_test.cu @@ -125,8 +125,10 @@ TEST(WarpGPU, Affine_Transpose_Single) { WarpGPU_Affine_Transpose(false); } -/// @brief Apply correction of pixel centers and convert the mapping to -/// OpenCV matrix type. +/** + * @brief Apply correction of pixel centers and convert the mapping to + * OpenCV matrix type. 
+ */ inline cv::Matx AffineToCV(const AffineMapping2D &mapping) { vec2 translation = mapping({0.5f, 0.5f}) - vec2(0.5f, 0.5f); mat2x3 tmp = mapping.transform; diff --git a/dali/pipeline/data/tensor_vector.h b/dali/pipeline/data/tensor_vector.h index e34cbfcbcf9..81c180f345d 100644 --- a/dali/pipeline/data/tensor_vector.h +++ b/dali/pipeline/data/tensor_vector.h @@ -168,13 +168,17 @@ class TensorVector { return pinned_; } - /// @brief Reserve as contiguous tensor list internally + /** + * @brief Reserve as contiguous tensor list internally + */ inline void reserve(size_t total_bytes) { state_ = State::contiguous; tl_->reserve(total_bytes); } - /// @brief Reserve as vector of `batch_size` tensors internally + /** + * @brief Reserve as vector of `batch_size` tensors internally + */ inline void reserve(size_t bytes_per_sample, int batch_size) { DALI_ENFORCE(tensors_.empty() || static_cast(tensors_.size()) == batch_size, "Changing the batch size is prohibited. It should be set once."); @@ -189,14 +193,18 @@ class TensorVector { } } - /// @brief If the TensorVector is backed by TensorList (contiguous memory) + /** + * @brief If the TensorVector is backed by TensorList (contiguous memory) + */ bool IsContiguous() { // TODO(klecki): check the views_count as well? return state_ == State::contiguous && views_count_ == size(); } - /// @brief Set the current state if further calls like Resize() or set_type - /// should use TensorList or std::vector as backing memory + /** + * @brief Set the current state if further calls like Resize() or set_type + * should use TensorList or std::vector as backing memory + */ void SetContiguous(bool contiguous) { if (contiguous) { state_ = State::contiguous; diff --git a/dali/pipeline/data/types.h b/dali/pipeline/data/types.h index ea215bc1a13..ab17872d762 100644 --- a/dali/pipeline/data/types.h +++ b/dali/pipeline/data/types.h @@ -123,11 +123,15 @@ enum DALIDataType { template struct id2type_helper; -/// @brief Compile-time mapping from a type to DALIDataType +/** + * @brief Compile-time mapping from a type to DALIDataType + */ template struct type2id; -/// @brief Compile-time mapping from DALIDataType to a type +/** + * @brief Compile-time mapping from DALIDataType to a type + */ template using id2type = typename id2type_helper::type; diff --git a/dali/pipeline/data/views.h b/dali/pipeline/data/views.h index 3c73d58e9f1..3fb1aa989e3 100644 --- a/dali/pipeline/data/views.h +++ b/dali/pipeline/data/views.h @@ -26,7 +26,9 @@ namespace dali { namespace detail { -/// @brief Maps DALI Backend to dali::kernels storage backend. +/** + * @brief Maps DALI Backend to dali::kernels storage backend. + */ template struct storage_tag_map; @@ -54,9 +56,11 @@ void enforce_dim_in_view(const ShapeType &shape) { } // namespace detail -/// @brief Returns an equivalent tensor shape for a dense, uniform tensor list. -/// @return Tensor shape with outermost dimension corresponding to samples. -/// @remarks If the argument is not a dense tensor, an error is raised. +/** + * @brief Returns an equivalent tensor shape for a dense, uniform tensor list. + * @return Tensor shape with outermost dimension corresponding to samples. + * @remarks If the argument is not a dense tensor, an error is raised. 
+ */ template kernels::TensorShape get_tensor_shape(const TensorList &tl) { DALI_ENFORCE(tl.IsDenseTensor(), "Uniform, dense tensor expected"); diff --git a/dali/pipeline/operators/geometric/bb_flip.cu b/dali/pipeline/operators/geometric/bb_flip.cu index 744e13cecf5..3a02565d6be 100644 --- a/dali/pipeline/operators/geometric/bb_flip.cu +++ b/dali/pipeline/operators/geometric/bb_flip.cu @@ -18,19 +18,25 @@ namespace dali { -/// @param output - output bounding boxes -/// @param input - input bounding boxes -/// @param num_boxes - number of bounding boxes in the input -/// @param sample_indices - when using per-sample flip, contains sample indices for each -/// bounding box in the input tensor list -/// @param per_sample_horizontal - per-sample flag indicating whether bounding boxes from -// a given sample should be flipped horizontally; may by NULL -/// @param per_sample_vertical - per-sample flag indicating whether bounding boxes from -// a given sample should be flipped vertically; may be NULL -/// @param global_horizontal - whether to flip horizontally; overriden by -/// per_sample_horizontal, if specified -/// @param global_vertical - whether to flip vertically; overriden by -/// per_sample_vertical, if specified +/** + * @param output - output bounding boxes + * @param input - input bounding boxes + * @param num_boxes - number of bounding boxes in the input + * @param sample_indices - when using per-sample flip, contains sample indices for each + * bounding box in the input tensor list + * @param per_sample_horizontal - per-sample flag indicating whether bounding boxes from + * + * a given sample should be flipped horizontally; may be NULL + * + * @param per_sample_vertical - per-sample flag indicating whether bounding boxes from + * + * a given sample should be flipped vertically; may be NULL + * + * @param global_horizontal - whether to flip horizontally; overridden by + * per_sample_horizontal, if specified + * @param global_vertical - whether to flip vertically; overridden by + * per_sample_vertical, if specified + */ template __global__ void BbFlipKernel(float *output, const float *input, size_t num_boxes, bool global_horizontal, const int *per_sample_horizontal, diff --git a/dali/pipeline/operators/op_schema.h b/dali/pipeline/operators/op_schema.h index 30ee0359816..3d63f5414d8 100644 --- a/dali/pipeline/operators/op_schema.h +++ b/dali/pipeline/operators/op_schema.h @@ -360,8 +360,10 @@ class DLL_PUBLIC OpSchema { DLL_PUBLIC bool HasOptionalArgument(const std::string &name, const bool local_only = false) const; - /// @brief Finds default value for a given argument - /// @return A pair of the defining schema and the value + /** + * @brief Finds default value for a given argument + * @return A pair of the defining schema and the value + */ DLL_PUBLIC std::pair FindDefaultValue(const std::string &arg_name, bool local_only = false, diff --git a/dali/pipeline/operators/resize/resize_base.h b/dali/pipeline/operators/resize/resize_base.h index 4e48111d696..4f9366dd07a 100644 --- a/dali/pipeline/operators/resize/resize_base.h +++ b/dali/pipeline/operators/resize/resize_base.h @@ -32,11 +32,17 @@ class DLL_PUBLIC ResamplingFilterAttr { public: DLL_PUBLIC ResamplingFilterAttr(const OpSpec &spec); - /// Filter used when downscaling + /** + * Filter used when downscaling + */ kernels::FilterDesc min_filter_{ kernels::ResamplingFilterType::Triangular, 0 }; - /// Filter used when upscaling + /** + * Filter used when upscaling + */ kernels::FilterDesc mag_filter_{
kernels::ResamplingFilterType::Linear, 0 }; - /// Initial size, in bytes, of a temporary buffer for resampling. + /** + * Initial size, in bytes, of a temporary buffer for resampling. + */ size_t temp_buffer_hint_ = 0; }; diff --git a/dali/pipeline/operators/transpose/cutt/LRUCache.h b/dali/pipeline/operators/transpose/cutt/LRUCache.h index 98560e7228f..e6ba044bb64 100755 --- a/dali/pipeline/operators/transpose/cutt/LRUCache.h +++ b/dali/pipeline/operators/transpose/cutt/LRUCache.h @@ -1,116 +1,116 @@ -/****************************************************************************** -MIT License - -Copyright (c) 2016 Antti-Pekka Hynninen -Copyright (c) 2016 NVIDIA - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*******************************************************************************/ -// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_LRUCACHE_H -#define DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_LRUCACHE_H - -#include -#include -#include - -using namespace std; - -// -// Simple LRU cache implementation -// -template -class LRUCache { -private: - - struct ValueIterator { - value_type value; - typename list::iterator it; - }; - - // Size of the cache - const size_t capacity; - - // Value that is returned when the key is not found - const value_type null_value; - - // Double linked list of keys. 
Oldest is at the back - list keys; - - // Cache: (hash table) - // key = key - // value = {value, pointer to linked list} - unordered_map cache; - -public: - - LRUCache(const size_t capacity, const value_type null_value) : capacity(capacity), null_value(null_value) {} - - value_type get(key_type key) { - auto it = cache.find(key); - if (it == cache.end()) return null_value; - touch(it); - return it->second.value; - } - - void set(key_type key, value_type value) { - auto it = cache.find(key); - if (it != cache.end()) { - // key found - it->second.value = value; - touch(it); - } else { - // key not found - if (cache.size() == capacity) { - key_type oldest_key = keys.back(); - keys.pop_back(); - cache.erase( cache.find(oldest_key) ); - } - keys.push_front(key); - ValueIterator vi; - vi.value = value; - vi.it = keys.begin(); - pair boo(key, vi); - cache.insert(boo); - } - } - -private: - - void touch(typename unordered_map::iterator it) { - keys.erase(it->second.it); - keys.push_front(it->first); - it->second.it = keys.begin(); - } -}; - -#endif // DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_LRUCACHE_H +/****************************************************************************** +MIT License + +Copyright (c) 2016 Antti-Pekka Hynninen +Copyright (c) 2016 NVIDIA + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*******************************************************************************/ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_LRUCACHE_H +#define DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_LRUCACHE_H + +#include +#include +#include + +using namespace std; + +// +// Simple LRU cache implementation +// +template +class LRUCache { +private: + + struct ValueIterator { + value_type value; + typename list::iterator it; + }; + + // Size of the cache + const size_t capacity; + + // Value that is returned when the key is not found + const value_type null_value; + + // Double linked list of keys. Oldest is at the back + list keys; + + // Cache: (hash table) + // key = key + // value = {value, pointer to linked list} + unordered_map cache; + +public: + + LRUCache(const size_t capacity, const value_type null_value) : capacity(capacity), null_value(null_value) {} + + value_type get(key_type key) { + auto it = cache.find(key); + if (it == cache.end()) return null_value; + touch(it); + return it->second.value; + } + + void set(key_type key, value_type value) { + auto it = cache.find(key); + if (it != cache.end()) { + // key found + it->second.value = value; + touch(it); + } else { + // key not found + if (cache.size() == capacity) { + key_type oldest_key = keys.back(); + keys.pop_back(); + cache.erase( cache.find(oldest_key) ); + } + keys.push_front(key); + ValueIterator vi; + vi.value = value; + vi.it = keys.begin(); + pair boo(key, vi); + cache.insert(boo); + } + } + +private: + + void touch(typename unordered_map::iterator it) { + keys.erase(it->second.it); + keys.push_front(it->first); + it->second.it = keys.begin(); + } +}; + +#endif // DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_LRUCACHE_H diff --git a/dali/pipeline/operators/transpose/cutt/int_vector.h b/dali/pipeline/operators/transpose/cutt/int_vector.h index c409a299196..6cc77ee1a58 100755 --- a/dali/pipeline/operators/transpose/cutt/int_vector.h +++ b/dali/pipeline/operators/transpose/cutt/int_vector.h @@ -1,373 +1,373 @@ -/****************************************************************************** -MIT License - -Copyright (c) 2016 Antti-Pekka Hynninen -Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*******************************************************************************/ -// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifndef DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_VECTOR_H -#define DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_VECTOR_H - -// Intel: Minimum SSE2 required for vectorization. -// SSE can't be used because it does not support integer operations. SSE defaults to scalar - -#if defined(__SSE2__) -// Intel x86 -#include - -#if defined(__AVX__) -#define USE_AVX -const int INT_VECTOR_LEN = 8; - -#if defined(__AVX2__) -// #include -const char INT_VECTOR_TYPE[] = "AVX2"; -#else -const char INT_VECTOR_TYPE[] = "AVX"; -#endif - -#else -#define USE_SSE -const int INT_VECTOR_LEN = 4; -const char INT_VECTOR_TYPE[] = "SSE2"; -#endif - -#elif defined(__ALTIVEC__) // #if defined(__SSE2__) -#define USE_ALTIVEC -// IBM altivec -#include -#undef bool -const int INT_VECTOR_LEN = 4; -const char INT_VECTOR_TYPE[] = "ALTIVEC"; - -#else // #if defined(__SSE2__) -// Nothing -const int INT_VECTOR_LEN = 1; -const char INT_VECTOR_TYPE[] = "SCALAR"; -#endif - -// -// Integer vector class for Intel and IBM CPU platforms -// -class int_vector { -private: - -#if defined(USE_AVX) - __m256i x; -#elif defined(USE_SSE) - __m128i x; -#elif defined(USE_ALTIVEC) - vector signed int x; -#else - int x; -#endif - -public: - - inline int_vector() { - } - - inline int_vector(const int a) { -#if defined(USE_AVX) - x = _mm256_set1_epi32(a); -#elif defined(USE_SSE) - x = _mm_set1_epi32(a); -#elif defined(USE_ALTIVEC) - x = (vector signed int){a, a, a, a}; -#else - x = a; -#endif - } - - inline int_vector(const int a[]) { -#if defined(USE_AVX) - x = _mm256_set_epi32(a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); -#elif defined(USE_SSE) - x = _mm_set_epi32(a[3], a[2], a[1], a[0]); -#elif defined(USE_ALTIVEC) - x = vec_ld(0, a); -#else - x = a[0]; -#endif - } - -#if defined(USE_AVX) - inline int_vector(const __m256i ax) { - x = ax; - } -#elif defined(USE_SSE) - inline int_vector(const __m128i ax) { - x = ax; - } -#elif defined(USE_ALTIVEC) - inline int_vector(const vector signed int ax) { - x = ax; - } -#endif - - // - // Member functions - // - - inline int_vector operator+=(const int_vector a) { -#if defined(USE_AVX) - x = _mm256_add_epi32(x, a.x); -#elif defined(USE_SSE) - x = _mm_add_epi32(x, a.x); -#elif defined(USE_ALTIVEC) - x += a.x; -#else - x += a.x; -#endif - return *this; - } - - inline int_vector operator-=(const int_vector a) { -#if defined(USE_AVX) - x = _mm256_sub_epi32(x, a.x); -#elif defined(USE_SSE) - x = _mm_sub_epi32(x, a.x); -#elif defined(USE_ALTIVEC) - x -= a.x; -#else - x -= a.x; -#endif - return *this; - } - - inline int_vector operator&=(const int_vector a) { -#if defined(USE_AVX) - x = _mm256_and_si256(x, a.x); -#elif defined(USE_SSE) - x = _mm_and_si128(x, a.x); -#elif defined(USE_ALTIVEC) - x &= a.x; -#else - x &= a.x; -#endif - return *this; - } - - inline int_vector operator|=(const int_vector a) { -#if defined(USE_AVX) - x = _mm256_or_si256(x, a.x); -#elif defined(USE_SSE) - x = _mm_or_si128(x, a.x); -#elif defined(USE_ALTIVEC) - x |= a.x; -#else - x |= a.x; -#endif - return *this; - } - - inline int_vector operator~() { -#if defined(USE_AVX) - int_vector fullmask = int_vector(-1); - 
return int_vector( _mm256_andnot_si256(x, fullmask.x) ); -#elif defined(USE_SSE) - int_vector fullmask = int_vector(-1); - return int_vector( _mm_andnot_si128(x, fullmask.x) ); -#elif defined(USE_ALTIVEC) - return int_vector( ~x ); -#else - return ~x; -#endif - } - - // Sign extended shift by a constant. - // Note: 0 <= n <= 31. Otherwise results are unpredictable - inline int_vector operator>>=(const int n) { -#if defined(USE_AVX) - x = _mm256_srai_epi32(x, n); -#elif defined(USE_SSE) - x = _mm_srai_epi32(x, n); -#elif defined(USE_ALTIVEC) - x >>= n; -#else - x >>= n; -#endif - return *this; - } - - // Sign extended shift by a constant - // Note: 0 <= n <= 31. Otherwise results are unpredictable - inline int_vector operator<<=(const int n) { -#if defined(USE_AVX) - x = _mm256_slli_epi32(x, n); -#elif defined(USE_SSE) - x = _mm_slli_epi32(x, n); -#elif defined(USE_ALTIVEC) - x <<= n; -#else - x <<= n; -#endif - return *this; - } - - // Copy contest to int array - void copy(int* a) const { -#if defined(USE_AVX) - _mm256_storeu_si256((__m256i *)a, x); -#elif defined(USE_SSE) - _mm_storeu_si128((__m128i *)a, x); -#elif defined(USE_ALTIVEC) - // void vec_stl (vector signed int, int, int *); - vec_stl(x, 0, a); -#else - a[0] = x; -#endif - } - - // - // Non-member functions - // - - inline friend int_vector operator+(int_vector a, const int_vector b) { - a += b; - return a; - } - - inline friend int_vector operator-(int_vector a, const int_vector b) { - a -= b; - return a; - } - - inline friend int_vector operator&(int_vector a, const int_vector b) { - a &= b; - return a; - } - - inline friend int_vector operator|(int_vector a, const int_vector b) { - a |= b; - return a; - } - - inline friend int_vector operator>>(int_vector a, const int n) { - a >>= n; - return a; - } - - inline friend int_vector operator<<(int_vector a, const int n) { - a <<= n; - return a; - } - - // Returns 0xffffffff = -1 on the vector elements that are equal - inline friend int_vector eq_mask(const int_vector a, const int_vector b) { -#if defined(USE_AVX) - return int_vector(_mm256_cmpeq_epi32(a.x, b.x)); -#elif defined(USE_SSE) - return int_vector(_mm_cmpeq_epi32(a.x, b.x)); -#elif defined(USE_ALTIVEC) - return int_vector(a.x == b.x); -#else - return int_vector((a.x == b.x)*(-1)); -#endif - } - - inline friend int_vector neq_mask(const int_vector a, const int_vector b) { - return ~eq_mask(a, b); - } - - // 0xffffffff => 1 - inline friend int_vector mask_to_bool(const int_vector a) { -#if defined(USE_AVX) - return int_vector(_mm256_srli_epi32(a.x, 31)); -#elif defined(USE_SSE) - return int_vector(_mm_srli_epi32(a.x, 31)); -#elif defined(USE_ALTIVEC) - return int_vector((vector signed int)((vector unsigned int)a.x >> 31)); -#else - return ((unsigned int)a.x >> 31); -#endif - } - - inline friend int_vector operator==(const int_vector a, const int_vector b) { - return mask_to_bool(eq_mask(a, b)); - } - - inline friend int_vector operator!=(const int_vector a, const int_vector b) { - return mask_to_bool(neq_mask(a, b)); - } - - // 1 => 0xffffffff - inline friend int_vector bool_to_mask(const int_vector a) { -#if defined(USE_AVX) - return neq_mask(a, int_vector(0)); -#elif defined(USE_SSE) - return neq_mask(a, int_vector(0)); -#elif defined(USE_ALTIVEC) - return neq_mask(a, int_vector(0)); -#else - return (a ? 
-1 : 0); -#endif - } - - // Implicit type conversion - // Returns true if any of the elements are != 0 - operator bool() const { -#if defined(USE_AVX) - int_vector a = neq_mask(*this, int_vector(0)); - return (_mm256_movemask_epi8(a.x) != 0); -#elif defined(USE_SSE) - int_vector a = neq_mask(*this, int_vector(0)); - return (_mm_movemask_epi8(a.x) != 0); -#elif defined(USE_ALTIVEC) - return vec_any_ne(x, ((const vector signed int){0, 0, 0, 0})); -#else - return x; -#endif - } - - // - // Helper functions - // - void print() { - int vec[INT_VECTOR_LEN]; - this->copy(vec); - for (int i=0;i < INT_VECTOR_LEN;i++) { - printf("%d ", vec[i]); - } - } - -}; - - -#if defined(USE_ALTIVEC) -#undef vector -#undef pixel -#endif - +/****************************************************************************** +MIT License + +Copyright (c) 2016 Antti-Pekka Hynninen +Copyright (c) 2016 Oak Ridge National Laboratory (UT-Batelle) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*******************************************************************************/ +// Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_VECTOR_H +#define DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_VECTOR_H + +// Intel: Minimum SSE2 required for vectorization. +// SSE can't be used because it does not support integer operations. 
SSE defaults to scalar + +#if defined(__SSE2__) +// Intel x86 +#include + +#if defined(__AVX__) +#define USE_AVX +const int INT_VECTOR_LEN = 8; + +#if defined(__AVX2__) +// #include +const char INT_VECTOR_TYPE[] = "AVX2"; +#else +const char INT_VECTOR_TYPE[] = "AVX"; +#endif + +#else +#define USE_SSE +const int INT_VECTOR_LEN = 4; +const char INT_VECTOR_TYPE[] = "SSE2"; +#endif + +#elif defined(__ALTIVEC__) // #if defined(__SSE2__) +#define USE_ALTIVEC +// IBM altivec +#include +#undef bool +const int INT_VECTOR_LEN = 4; +const char INT_VECTOR_TYPE[] = "ALTIVEC"; + +#else // #if defined(__SSE2__) +// Nothing +const int INT_VECTOR_LEN = 1; +const char INT_VECTOR_TYPE[] = "SCALAR"; +#endif + +// +// Integer vector class for Intel and IBM CPU platforms +// +class int_vector { +private: + +#if defined(USE_AVX) + __m256i x; +#elif defined(USE_SSE) + __m128i x; +#elif defined(USE_ALTIVEC) + vector signed int x; +#else + int x; +#endif + +public: + + inline int_vector() { + } + + inline int_vector(const int a) { +#if defined(USE_AVX) + x = _mm256_set1_epi32(a); +#elif defined(USE_SSE) + x = _mm_set1_epi32(a); +#elif defined(USE_ALTIVEC) + x = (vector signed int){a, a, a, a}; +#else + x = a; +#endif + } + + inline int_vector(const int a[]) { +#if defined(USE_AVX) + x = _mm256_set_epi32(a[7], a[6], a[5], a[4], a[3], a[2], a[1], a[0]); +#elif defined(USE_SSE) + x = _mm_set_epi32(a[3], a[2], a[1], a[0]); +#elif defined(USE_ALTIVEC) + x = vec_ld(0, a); +#else + x = a[0]; +#endif + } + +#if defined(USE_AVX) + inline int_vector(const __m256i ax) { + x = ax; + } +#elif defined(USE_SSE) + inline int_vector(const __m128i ax) { + x = ax; + } +#elif defined(USE_ALTIVEC) + inline int_vector(const vector signed int ax) { + x = ax; + } +#endif + + // + // Member functions + // + + inline int_vector operator+=(const int_vector a) { +#if defined(USE_AVX) + x = _mm256_add_epi32(x, a.x); +#elif defined(USE_SSE) + x = _mm_add_epi32(x, a.x); +#elif defined(USE_ALTIVEC) + x += a.x; +#else + x += a.x; +#endif + return *this; + } + + inline int_vector operator-=(const int_vector a) { +#if defined(USE_AVX) + x = _mm256_sub_epi32(x, a.x); +#elif defined(USE_SSE) + x = _mm_sub_epi32(x, a.x); +#elif defined(USE_ALTIVEC) + x -= a.x; +#else + x -= a.x; +#endif + return *this; + } + + inline int_vector operator&=(const int_vector a) { +#if defined(USE_AVX) + x = _mm256_and_si256(x, a.x); +#elif defined(USE_SSE) + x = _mm_and_si128(x, a.x); +#elif defined(USE_ALTIVEC) + x &= a.x; +#else + x &= a.x; +#endif + return *this; + } + + inline int_vector operator|=(const int_vector a) { +#if defined(USE_AVX) + x = _mm256_or_si256(x, a.x); +#elif defined(USE_SSE) + x = _mm_or_si128(x, a.x); +#elif defined(USE_ALTIVEC) + x |= a.x; +#else + x |= a.x; +#endif + return *this; + } + + inline int_vector operator~() { +#if defined(USE_AVX) + int_vector fullmask = int_vector(-1); + return int_vector( _mm256_andnot_si256(x, fullmask.x) ); +#elif defined(USE_SSE) + int_vector fullmask = int_vector(-1); + return int_vector( _mm_andnot_si128(x, fullmask.x) ); +#elif defined(USE_ALTIVEC) + return int_vector( ~x ); +#else + return ~x; +#endif + } + + // Sign extended shift by a constant. + // Note: 0 <= n <= 31. Otherwise results are unpredictable + inline int_vector operator>>=(const int n) { +#if defined(USE_AVX) + x = _mm256_srai_epi32(x, n); +#elif defined(USE_SSE) + x = _mm_srai_epi32(x, n); +#elif defined(USE_ALTIVEC) + x >>= n; +#else + x >>= n; +#endif + return *this; + } + + // Sign extended shift by a constant + // Note: 0 <= n <= 31. 
Otherwise results are unpredictable + inline int_vector operator<<=(const int n) { +#if defined(USE_AVX) + x = _mm256_slli_epi32(x, n); +#elif defined(USE_SSE) + x = _mm_slli_epi32(x, n); +#elif defined(USE_ALTIVEC) + x <<= n; +#else + x <<= n; +#endif + return *this; + } + + // Copy contents to int array + void copy(int* a) const { +#if defined(USE_AVX) + _mm256_storeu_si256((__m256i *)a, x); +#elif defined(USE_SSE) + _mm_storeu_si128((__m128i *)a, x); +#elif defined(USE_ALTIVEC) + // void vec_stl (vector signed int, int, int *); + vec_stl(x, 0, a); +#else + a[0] = x; +#endif + } + + // + // Non-member functions + // + + inline friend int_vector operator+(int_vector a, const int_vector b) { + a += b; + return a; + } + + inline friend int_vector operator-(int_vector a, const int_vector b) { + a -= b; + return a; + } + + inline friend int_vector operator&(int_vector a, const int_vector b) { + a &= b; + return a; + } + + inline friend int_vector operator|(int_vector a, const int_vector b) { + a |= b; + return a; + } + + inline friend int_vector operator>>(int_vector a, const int n) { + a >>= n; + return a; + } + + inline friend int_vector operator<<(int_vector a, const int n) { + a <<= n; + return a; + } + + // Returns 0xffffffff = -1 on the vector elements that are equal + inline friend int_vector eq_mask(const int_vector a, const int_vector b) { +#if defined(USE_AVX) + return int_vector(_mm256_cmpeq_epi32(a.x, b.x)); +#elif defined(USE_SSE) + return int_vector(_mm_cmpeq_epi32(a.x, b.x)); +#elif defined(USE_ALTIVEC) + return int_vector(a.x == b.x); +#else + return int_vector((a.x == b.x)*(-1)); +#endif + } + + inline friend int_vector neq_mask(const int_vector a, const int_vector b) { + return ~eq_mask(a, b); + } + + // 0xffffffff => 1 + inline friend int_vector mask_to_bool(const int_vector a) { +#if defined(USE_AVX) + return int_vector(_mm256_srli_epi32(a.x, 31)); +#elif defined(USE_SSE) + return int_vector(_mm_srli_epi32(a.x, 31)); +#elif defined(USE_ALTIVEC) + return int_vector((vector signed int)((vector unsigned int)a.x >> 31)); +#else + return ((unsigned int)a.x >> 31); +#endif + } + + inline friend int_vector operator==(const int_vector a, const int_vector b) { + return mask_to_bool(eq_mask(a, b)); + } + + inline friend int_vector operator!=(const int_vector a, const int_vector b) { + return mask_to_bool(neq_mask(a, b)); + } + + // 1 => 0xffffffff + inline friend int_vector bool_to_mask(const int_vector a) { +#if defined(USE_AVX) + return neq_mask(a, int_vector(0)); +#elif defined(USE_SSE) + return neq_mask(a, int_vector(0)); +#elif defined(USE_ALTIVEC) + return neq_mask(a, int_vector(0)); +#else + return (a ?
-1 : 0); +#endif + } + + // Implicit type conversion + // Returns true if any of the elements are != 0 + operator bool() const { +#if defined(USE_AVX) + int_vector a = neq_mask(*this, int_vector(0)); + return (_mm256_movemask_epi8(a.x) != 0); +#elif defined(USE_SSE) + int_vector a = neq_mask(*this, int_vector(0)); + return (_mm_movemask_epi8(a.x) != 0); +#elif defined(USE_ALTIVEC) + return vec_any_ne(x, ((const vector signed int){0, 0, 0, 0})); +#else + return x; +#endif + } + + // + // Helper functions + // + void print() { + int vec[INT_VECTOR_LEN]; + this->copy(vec); + for (int i=0;i < INT_VECTOR_LEN;i++) { + printf("%d ", vec[i]); + } + } + +}; + + +#if defined(USE_ALTIVEC) +#undef vector +#undef pixel +#endif + #endif // DALI_PIPELINE_OPERATORS_TRANSPOSE_CUTT_INT_VECTOR_H \ No newline at end of file diff --git a/dali/test/dali_test_bboxes.h b/dali/test/dali_test_bboxes.h index d1fe72b1f4b..3f0ff0737e5 100644 --- a/dali/test/dali_test_bboxes.h +++ b/dali/test/dali_test_bboxes.h @@ -1,180 +1,180 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. -#ifndef DALI_TEST_DALI_TEST_BBOXES_H_ -#define DALI_TEST_DALI_TEST_BBOXES_H_ - -#include -#include -#include -#include - -#include "dali/test/dali_test_single_op.h" - -namespace dali { - -struct SingleParamOpDescr { - SingleParamOpDescr() = default; - SingleParamOpDescr(const char *name, OpArg &&arg, double eps = 0) // NOLINT - : opName(name), opArg(std::move(arg)), epsVal(eps) {} - const char *opName = nullptr; - OpArg opArg; - double epsVal = 0; -}; - -template -class GenericBBoxesTest : public DALISingleOpTest { - protected: - void RunBBoxesCPU(const opDescr &descr, bool ltrb) { - const int batch_size = this->jpegs_.nImages(); - this->SetBatchSize(batch_size); - this->SetNumThreads(1); - - TensorList boxes; - TensorList labels; - this->MakeBBoxesAndLabelsBatch(&boxes, &labels, batch_size, ltrb); - this->SetExternalInputs({{"boxes", &boxes}, {"labels", &labels}}); - - auto pipe = this->GetPipeline(); - - OpSpec spec(descr.opName); - if (descr.opAddImgType) spec = spec.AddArg("image_type", this->ImageType()); - - this->AddOperatorWithOutput(this->AddArguments(&spec, descr.args) - .AddInput("boxes", "cpu") - .AddInput("labels", "cpu") - .AddOutput("output", "cpu") - .AddOutput("output1", "cpu") - .AddOutput("output2", "cpu") - .AddOutput("output3", "cpu")); - - this->SetTestCheckType(this->GetTestCheckType()); - pipe->Build(DALISingleOpTest::outputs_); - pipe->RunCPU(); - pipe->RunGPU(); - - DeviceWorkspace ws; - pipe->Outputs(&ws); - } - - std::vector>> RunSliceGPU( - const vector *>> &inputs) { - const int batch_size = this->jpegs_.nImages(); - this->SetBatchSize(batch_size); - this->SetNumThreads(1); - - this->SetExternalInputs(inputs); - - auto pipe = this->GetPipeline(); - - // Prospective crop - pipe->AddOperator(OpSpec("RandomBBoxCrop") - .AddArg("device", "cpu") - .AddArg("image_type", this->ImageType()) - .AddArg("bytes_per_sample_hint", vector{ 8, 8, 256, 128 }) - .AddInput("boxes", "cpu") - .AddInput("labels", "cpu") - .AddOutput("begin", "cpu") - .AddOutput("crop", "cpu") - .AddOutput("resized_boxes", "cpu") - .AddOutput("filtered_labels", "cpu")); - - // GPU slice - pipe->AddOperator(OpSpec("Slice") - .AddArg("device", "gpu") - .AddArg("image_type", this->ImageType()) - .AddInput("images", "gpu") - .AddInput("begin", "cpu") - .AddInput("crop", "cpu") - .AddOutput("cropped_images", "gpu")); - - this->SetTestCheckType(this->GetTestCheckType()); - pipe->Build({{"cropped_images", "gpu"}, {"resized_boxes", 
"gpu"}}); - pipe->RunCPU(); - pipe->RunGPU(); - - DeviceWorkspace ws; - pipe->Outputs(&ws); - - auto images_cpu = this->CopyToHost(ws.Output(0))[0]; - images_cpu->SetLayout(ws.Output(0).GetLayout()); - - auto boxes_cpu = this->CopyToHost(ws.Output(1))[0]; - boxes_cpu->SetLayout(ws.Output(1).GetLayout()); - - return {images_cpu, boxes_cpu}; - } - - std::vector>> RunSliceCPU( - const vector *>> &inputs) { - const int batch_size = this->jpegs_.nImages(); - this->SetBatchSize(batch_size); - this->SetNumThreads(1); - - this->SetExternalInputs(inputs); - - auto pipe = this->GetPipeline(); - - // Prospective crop - pipe->AddOperator(OpSpec("RandomBBoxCrop") - .AddArg("device", "cpu") - .AddArg("image_type", this->ImageType()) - .AddInput("boxes", "cpu") - .AddInput("labels", "cpu") - .AddOutput("begin", "cpu") - .AddOutput("crop", "cpu") - .AddOutput("resized_boxes", "cpu") - .AddOutput("filtered_labels", "cpu")); - - // GPU slice - pipe->AddOperator(OpSpec("Slice") - .AddArg("device", "cpu") - .AddArg("image_type", this->ImageType()) - .AddInput("images", "cpu") - .AddInput("begin", "cpu") - .AddInput("crop", "cpu") - .AddOutput("cropped_images", "cpu")); - - this->SetTestCheckType(this->GetTestCheckType()); - pipe->Build({{"cropped_images", "cpu"}, {"resized_boxes", "cpu"}}); - pipe->RunCPU(); - pipe->RunGPU(); - - DeviceWorkspace ws; - pipe->Outputs(&ws); - - std::vector>> ret; - ret.push_back(std::make_shared>()); - ret.push_back(std::make_shared>()); - ret[0]->Copy(ws.Output(0), 0); - ret[1]->Copy(ws.Output(1), 0); - return ret; - } - - vector>> Reference( - const vector *> &inputs, - DeviceWorkspace *ws) override { - auto &from = ws->Output(1); - auto reference = this->CopyToHost(from); - reference[0]->SetLayout(from.GetLayout()); - return reference; - } - - uint32_t GetTestCheckType() const override { - return t_checkColorComp + - t_checkElements; // + t_checkAll + t_checkNoAssert; - } - - void RunBBoxesCPU(const SingleParamOpDescr ¶mOp, bool addImgType = false, - bool ltrb = true) { - vector args; - args.push_back(paramOp.opArg); - opDescr finalDesc(paramOp.opName, paramOp.epsVal, addImgType, &args); - RunBBoxesCPU(finalDesc, ltrb); - } - - TensorList images_out; - TensorList boxes_out; -}; - -} // namespace dali - -#endif // DALI_TEST_DALI_TEST_BBOXES_H_ +// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
+#ifndef DALI_TEST_DALI_TEST_BBOXES_H_ +#define DALI_TEST_DALI_TEST_BBOXES_H_ + +#include +#include +#include +#include + +#include "dali/test/dali_test_single_op.h" + +namespace dali { + +struct SingleParamOpDescr { + SingleParamOpDescr() = default; + SingleParamOpDescr(const char *name, OpArg &&arg, double eps = 0) // NOLINT + : opName(name), opArg(std::move(arg)), epsVal(eps) {} + const char *opName = nullptr; + OpArg opArg; + double epsVal = 0; +}; + +template +class GenericBBoxesTest : public DALISingleOpTest { + protected: + void RunBBoxesCPU(const opDescr &descr, bool ltrb) { + const int batch_size = this->jpegs_.nImages(); + this->SetBatchSize(batch_size); + this->SetNumThreads(1); + + TensorList boxes; + TensorList labels; + this->MakeBBoxesAndLabelsBatch(&boxes, &labels, batch_size, ltrb); + this->SetExternalInputs({{"boxes", &boxes}, {"labels", &labels}}); + + auto pipe = this->GetPipeline(); + + OpSpec spec(descr.opName); + if (descr.opAddImgType) spec = spec.AddArg("image_type", this->ImageType()); + + this->AddOperatorWithOutput(this->AddArguments(&spec, descr.args) + .AddInput("boxes", "cpu") + .AddInput("labels", "cpu") + .AddOutput("output", "cpu") + .AddOutput("output1", "cpu") + .AddOutput("output2", "cpu") + .AddOutput("output3", "cpu")); + + this->SetTestCheckType(this->GetTestCheckType()); + pipe->Build(DALISingleOpTest::outputs_); + pipe->RunCPU(); + pipe->RunGPU(); + + DeviceWorkspace ws; + pipe->Outputs(&ws); + } + + std::vector>> RunSliceGPU( + const vector *>> &inputs) { + const int batch_size = this->jpegs_.nImages(); + this->SetBatchSize(batch_size); + this->SetNumThreads(1); + + this->SetExternalInputs(inputs); + + auto pipe = this->GetPipeline(); + + // Prospective crop + pipe->AddOperator(OpSpec("RandomBBoxCrop") + .AddArg("device", "cpu") + .AddArg("image_type", this->ImageType()) + .AddArg("bytes_per_sample_hint", vector{ 8, 8, 256, 128 }) + .AddInput("boxes", "cpu") + .AddInput("labels", "cpu") + .AddOutput("begin", "cpu") + .AddOutput("crop", "cpu") + .AddOutput("resized_boxes", "cpu") + .AddOutput("filtered_labels", "cpu")); + + // GPU slice + pipe->AddOperator(OpSpec("Slice") + .AddArg("device", "gpu") + .AddArg("image_type", this->ImageType()) + .AddInput("images", "gpu") + .AddInput("begin", "cpu") + .AddInput("crop", "cpu") + .AddOutput("cropped_images", "gpu")); + + this->SetTestCheckType(this->GetTestCheckType()); + pipe->Build({{"cropped_images", "gpu"}, {"resized_boxes", "gpu"}}); + pipe->RunCPU(); + pipe->RunGPU(); + + DeviceWorkspace ws; + pipe->Outputs(&ws); + + auto images_cpu = this->CopyToHost(ws.Output(0))[0]; + images_cpu->SetLayout(ws.Output(0).GetLayout()); + + auto boxes_cpu = this->CopyToHost(ws.Output(1))[0]; + boxes_cpu->SetLayout(ws.Output(1).GetLayout()); + + return {images_cpu, boxes_cpu}; + } + + std::vector>> RunSliceCPU( + const vector *>> &inputs) { + const int batch_size = this->jpegs_.nImages(); + this->SetBatchSize(batch_size); + this->SetNumThreads(1); + + this->SetExternalInputs(inputs); + + auto pipe = this->GetPipeline(); + + // Prospective crop + pipe->AddOperator(OpSpec("RandomBBoxCrop") + .AddArg("device", "cpu") + .AddArg("image_type", this->ImageType()) + .AddInput("boxes", "cpu") + .AddInput("labels", "cpu") + .AddOutput("begin", "cpu") + .AddOutput("crop", "cpu") + .AddOutput("resized_boxes", "cpu") + .AddOutput("filtered_labels", "cpu")); + + // GPU slice + pipe->AddOperator(OpSpec("Slice") + .AddArg("device", "cpu") + .AddArg("image_type", this->ImageType()) + .AddInput("images", "cpu") + .AddInput("begin", 
"cpu") + .AddInput("crop", "cpu") + .AddOutput("cropped_images", "cpu")); + + this->SetTestCheckType(this->GetTestCheckType()); + pipe->Build({{"cropped_images", "cpu"}, {"resized_boxes", "cpu"}}); + pipe->RunCPU(); + pipe->RunGPU(); + + DeviceWorkspace ws; + pipe->Outputs(&ws); + + std::vector>> ret; + ret.push_back(std::make_shared>()); + ret.push_back(std::make_shared>()); + ret[0]->Copy(ws.Output(0), 0); + ret[1]->Copy(ws.Output(1), 0); + return ret; + } + + vector>> Reference( + const vector *> &inputs, + DeviceWorkspace *ws) override { + auto &from = ws->Output(1); + auto reference = this->CopyToHost(from); + reference[0]->SetLayout(from.GetLayout()); + return reference; + } + + uint32_t GetTestCheckType() const override { + return t_checkColorComp + + t_checkElements; // + t_checkAll + t_checkNoAssert; + } + + void RunBBoxesCPU(const SingleParamOpDescr ¶mOp, bool addImgType = false, + bool ltrb = true) { + vector args; + args.push_back(paramOp.opArg); + opDescr finalDesc(paramOp.opName, paramOp.epsVal, addImgType, &args); + RunBBoxesCPU(finalDesc, ltrb); + } + + TensorList images_out; + TensorList boxes_out; +}; + +} // namespace dali + +#endif // DALI_TEST_DALI_TEST_BBOXES_H_ diff --git a/dali/test/device_test.h b/dali/test/device_test.h index f5766e77b7e..25f6e14a0c8 100644 --- a/dali/test/device_test.h +++ b/dali/test/device_test.h @@ -137,12 +137,14 @@ __device__ void suite_name##_##test_name##_body( \ #define TEST_KERNEL_NAME(suite_name, test_name) suite_name##_##test_name##_kernel -/// Executes default test case body. -/// @param suite_name - test suite name, as used in DEFINE_TEST_KERNEL -/// @param test_name - test case name, as used in DEFINE_TEST_KERNEL -/// @param grid - CUDA grid size -/// @param block - CUDA block size -/// @param ... - extra parameters passed to the kernel invocation, if any +/** + * Executes default test case body. + * @param suite_name - test suite name, as used in DEFINE_TEST_KERNEL + * @param test_name - test case name, as used in DEFINE_TEST_KERNEL + * @param grid - CUDA grid size + * @param block - CUDA block size + * @param ... - extra parameters passed to the kernel invocation, if any + */ #define DEVICE_TEST_CASE_BODY(suite_name, test_name, grid, block, ...) \ using TestStatus = dali::testing::TestStatus; \ TestStatus *status = nullptr; \ @@ -163,11 +165,13 @@ __device__ void suite_name##_##test_name##_body( \ } \ EXPECT_FALSE(host_status.failed) << "There were errors in device code"; -/// Simple test of a device function -/// @param suite_name GTest's suite name -/// @param test_name GTest's test case name -/// @param grid CUDA grid size -/// @param block CUDA block size +/** + * Simple test of a device function + * @param suite_name GTest's suite name + * @param test_name GTest's test case name + * @param grid CUDA grid size + * @param block CUDA block size + */ #define DEVICE_TEST(suite_name, test_name, grid, block) \ DECLARE_TEST_KERNEL(suite_name, test_name); \ TEST(suite_name, test_name) \ diff --git a/dali/util/user_stream.h b/dali/util/user_stream.h index 81a40a98803..a39c7004f08 100644 --- a/dali/util/user_stream.h +++ b/dali/util/user_stream.h @@ -33,7 +33,9 @@ namespace dali { class DLL_PUBLIC UserStream { public: - /// @brief Gets UserStream instance + /** + * @brief Gets UserStream instance + */ DLL_PUBLIC static UserStream* Get() { std::lock_guard lock(m_); if (us_ == nullptr) { @@ -42,7 +44,9 @@ class DLL_PUBLIC UserStream { return us_; } - /// @brief Obtains cudaStream_t for provided buffer. 
If there is no for given device, -  // new one is created and stored in the internal map +  /** +   * @brief Obtains cudaStream_t for the provided buffer. If there is none for the given device, +   *        a new one is created and stored in the internal map. +   */ DLL_PUBLIC cudaStream_t GetStream(const dali::Buffer &b) { size_t dev = GetDeviceForBuffer(b); @@ -59,14 +63,18 @@ class DLL_PUBLIC UserStream { } } - /// @brief Synchronizes on the device where given buffer b exists + /** + * @brief Synchronizes on the device where given buffer b exists + */ DLL_PUBLIC void WaitForDevice(const dali::Buffer &b) { size_t dev = GetDeviceForBuffer(b); DeviceGuard g(dev); CUDA_CALL(cudaDeviceSynchronize()); } - /// @brief Synchronizes on the the stream where buffer b was created + /** + * @brief Synchronizes on the stream where buffer b was created + */ DLL_PUBLIC void Wait(const dali::Buffer &b) { size_t dev = GetDeviceForBuffer(b); DALI_ENFORCE(streams_.find(dev) != streams_.end(), @@ -75,7 +83,9 @@ class DLL_PUBLIC UserStream { CUDA_CALL(cudaStreamSynchronize(streams_[dev])); } - /// @brief Synchronizes stream connected with the current device + /** + * @brief Synchronizes the stream connected with the current device + */ DLL_PUBLIC void Wait() { int dev; CUDA_CALL(cudaGetDevice(&dev)); @@ -85,7 +95,9 @@ class DLL_PUBLIC UserStream { CUDA_CALL(cudaStreamSynchronize(streams_[dev])); } - /// @brief Synchronizes all tracked streams + /** + * @brief Synchronizes all tracked streams + */ DLL_PUBLIC void WaitAll() { for (const auto &dev_pair : streams_) { DeviceGuard g(dev_pair.first);
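For readers unfamiliar with the `UserStream` API whose comments are converted above, a minimal usage sketch follows. It is illustrative only and not part of the patch: the helper function `ProcessBuffer` is hypothetical, and the `Buffer` template argument (elided in this diff rendering) is assumed to be `GPUBackend`.

```
#include <cuda_runtime.h>
#include "dali/util/user_stream.h"

// Hypothetical helper, not part of DALI: runs work on the stream that
// UserStream associates with the device owning `buf`, then synchronizes it.
void ProcessBuffer(const dali::Buffer<dali::GPUBackend> &buf) {
  dali::UserStream *us = dali::UserStream::Get();   // singleton accessor
  cudaStream_t stream = us->GetStream(buf);         // stream for buf's device
  // ... enqueue kernels or async copies on `stream` here ...
  us->Wait(buf);     // synchronize the stream tied to buf's device
  // us->WaitAll();  // alternatively, synchronize every tracked stream
}
```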