From 6c8ff4e21fc498a48da54fa9191cddfb67b0004a Mon Sep 17 00:00:00 2001
From: fis <ybbs.daans@hotmail.com>
Date: Thu, 26 Jul 2018 04:23:51 +0800
Subject: [PATCH] Re-structure.

* Put KmMatrix under gpu/matrix
* Use shared utils for both kmeans and KmMatrix.
* Rename the namespaces.
---
 CMakeLists.txt                                |  3 +-
 src/common/utils.h                            | 33 ++++++++
 src/gpu/kmeans/KmMatrix/KmConfig.h            | 81 -------------------
 src/gpu/kmeans/KmMatrix/utils.cuh             | 69 ----------------
 src/gpu/kmeans/kmeans_general.h               |  3 +-
 src/gpu/kmeans/kmeans_h2o4gpu.cu              |  6 +-
 src/gpu/kmeans/kmeans_init.cu                 | 32 ++++----
 src/gpu/kmeans/kmeans_init.cuh                | 21 ++---
 src/gpu/kmeans/kmeans_labels.h                | 37 ++-------
 src/gpu/{kmeans => matrix}/KmMatrix/Arith.cu  | 18 +++--
 src/gpu/{kmeans => matrix}/KmMatrix/Arith.hpp | 10 +--
 .../{kmeans => matrix}/KmMatrix/Generator.cuh | 20 ++---
 .../{kmeans => matrix}/KmMatrix/Generator.hpp |  8 +-
 .../KmMatrix/GeneratorKernels.cu              |  8 +-
 .../{kmeans => matrix}/KmMatrix/KmMatrix.cpp  | 26 +++---
 .../{kmeans => matrix}/KmMatrix/KmMatrix.hpp  | 15 ++--
 .../KmMatrix/KmMatrixCuda.cu                  | 15 ++--
 .../KmMatrix/KmMatrixCuda.cuh                 |  8 +-
 .../KmMatrix/KmMatrixProxy.cpp                |  8 +-
 src/gpu/{kmeans => matrix}/KmMatrix/blas.cuh  | 34 ++++----
 .../{kmeans/KmMatrix => utils}/GpuInfo.cuh    | 11 ++-
 src/gpu/utils/utils.cuh                       | 75 ++++++++++++-----
 tests/cpp/gpu/KmMatrix/test_arith.cu          |  6 +-
 tests/cpp/gpu/KmMatrix/test_matrix.cu         | 44 +++++-----
 tests/cpp/gpu/KmMatrix/test_proxy.cu          | 14 ++--
 tests/cpp/gpu/kmeans/test_kmeans_init.cu      | 16 ++--
 26 files changed, 263 insertions(+), 358 deletions(-)
 delete mode 100644 src/gpu/kmeans/KmMatrix/KmConfig.h
 delete mode 100644 src/gpu/kmeans/KmMatrix/utils.cuh
 rename src/gpu/{kmeans => matrix}/KmMatrix/Arith.cu (94%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/Arith.hpp (90%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/Generator.cuh (84%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/Generator.hpp (80%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/GeneratorKernels.cu (95%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrix.cpp (95%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrix.hpp (94%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrixCuda.cu (96%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrixCuda.cuh (95%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/KmMatrixProxy.cpp (95%)
 rename src/gpu/{kmeans => matrix}/KmMatrix/blas.cuh (94%)
 rename src/gpu/{kmeans/KmMatrix => utils}/GpuInfo.cuh (88%)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 70e8dd524..c696cad27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,8 +94,7 @@ if(USE_CUDA)
   FILE(GLOB_RECURSE GPU_SOURCES
     src/*.cu
     src/*.cuh
-    src/gpu/kmeans/KmMatrix/*.cpp
-    src/gpu/kmeans/KmMatrix/*.hpp
+    src/gpu/matrix/*.cpp
     src/common/*.cpp
     src/common/*.h)
 
diff --git a/src/common/utils.h b/src/common/utils.h
index 10c97ac03..a2fb9ef7b 100644
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -4,8 +4,14 @@
  */
 #pragma once
 #include <vector>
+
+#include <iostream>
+#include <sstream>
+
 #include "cblas/cblas.h"
 
+#define USE_CUDA() 1
+
 template<typename T>
 void self_dot(std::vector<T> array_in, int n, int dim,
               std::vector<T>& dots);
@@ -19,3 +25,30 @@ void compute_distances(std::vector<float> data_in,
                        std::vector<float> centroids_in,
                        std::vector<float> &pairwise_distances,
                        int n, int dim, int k);
+
+// Matrix host dev
+#define HG_HOSTDEV       __host__   __device__
+#define HG_DEV           __device__
+#define HG_DEVINLINE     __device__ __forceinline__
+#define HG_HOSTDEVINLINE __host__   __device__      __forceinline__
+
+#define h2o4gpu_error(x) error(x, __FILE__, __LINE__);
+
+inline void error(const char* e, const char* file, int line)
+{
+  std::stringstream ss;
+  ss << e << " - " << file << "(" << line << ")";
+  //throw error_text;
+  std::cerr << ss.str() << std::endl;
+  exit(-1);
+}
+
+#define h2o4gpu_check(condition, msg) check(condition, msg, __FILE__, __LINE__);
+
+inline void check(bool val, const char* e, const char* file, int line)
+{
+  if (!val)
+  {
+    error(e, file, line);
+  }
+}
diff --git a/src/gpu/kmeans/KmMatrix/KmConfig.h b/src/gpu/kmeans/KmMatrix/KmConfig.h
deleted file mode 100644
index fa718a221..000000000
--- a/src/gpu/kmeans/KmMatrix/KmConfig.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*!
- * Copyright 2018 H2O.ai, Inc.
- * License   Apache License Version 2.0 (see LICENSE for details)
- */
-
-#ifndef KM_CONFIG_H_
-#define KM_CONFIG_H_
-
-#define USE_CUDA() 1
-
-#include "stdio.h"
-
-// Matrix host dev
-#define M_HOSTDEV __host__ __device__
-#define M_DEV __device__
-#define M_DEVINLINE __device__ __forceinline__
-#define M_HOSTDEVINLINE __host__ __device__ __forceinline__
-
-#define CUDA_CHECK(cmd) do {                            \
-    cudaError_t e = cmd;                                \
-    if( e != cudaSuccess ) {                            \
-      printf("Cuda failure %s:%d '%s'\n",               \
-             __FILE__,__LINE__,cudaGetErrorString(e));  \
-      fflush( stdout );                                 \
-      exit(EXIT_FAILURE);                               \
-    }                                                   \
-  } while(0)
-
-#define CUBLAS_CHECK(cmd) do {                                          \
-    cublasStatus_t status = cmd;                                        \
-    if ( status != CUBLAS_STATUS_SUCCESS) {                             \
-      const char* errmsg = nullptr;                                     \
-      switch(status) {                                                  \
-        case CUBLAS_STATUS_NOT_INITIALIZED:                             \
-          errmsg = "library not initialized";                           \
-          break;                                                        \
-                                                                        \
-        case CUBLAS_STATUS_ALLOC_FAILED:                                \
-          errmsg = "resource allocation failed";                        \
-          break;                                                        \
-                                                                        \
-        case CUBLAS_STATUS_INVALID_VALUE:                               \
-          errmsg = "an invalid numeric value was used as an argument";  \
-          break;                                                        \
-                                                                        \
-        case CUBLAS_STATUS_ARCH_MISMATCH:                               \
-          errmsg = "an absent device architectural feature is required"; \
-          break;                                                        \
-                                                                        \
-        case CUBLAS_STATUS_MAPPING_ERROR:                               \
-          errmsg = "an access to GPU memory space failed";              \
-          break;                                                        \
-                                                                        \
-        case CUBLAS_STATUS_EXECUTION_FAILED:                            \
-          errmsg = "the GPU program failed to execute";                 \
-          break;                                                        \
-                                                                        \
-        case CUBLAS_STATUS_INTERNAL_ERROR:                              \
-          errmsg = "an internal operation failed";                      \
-          break;                                                        \
-                                                                        \
-        default:                                                        \
-          errmsg = "unknown error";                                     \
-          break;                                                        \
-      }                                                                 \
-      printf("%s", errmsg);                                             \
-    }                                                                   \
-  } while (false)
-
-#define M_ERROR(msg)                                                    \
-  printf("%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__,             \
-         __PRETTY_FUNCTION__);                                          \
-  abort();
-
-#define M_USER_ERROR(msg)                                               \
-  fprintf(stderr,                                                       \
-          "%s\n\t in %s, %u, %s\n", msg, __FILE__, __LINE__,            \
-          __PRETTY_FUNCTION__);                                         \
-  exit(1)
-
-#endif  // KM_CONFIG_H_
diff --git a/src/gpu/kmeans/KmMatrix/utils.cuh b/src/gpu/kmeans/KmMatrix/utils.cuh
deleted file mode 100644
index 606359fda..000000000
--- a/src/gpu/kmeans/KmMatrix/utils.cuh
+++ /dev/null
@@ -1,69 +0,0 @@
-/*!
- * Copyright 2018 H2O.ai, Inc.
- * License   Apache License Version 2.0 (see LICENSE for details)
- */
-
-#ifndef UTILS_CUH_
-#define UTILS_CUH_
-
-#include "GpuInfo.cuh"
-
-namespace H2O4GPU {
-namespace KMeans {
-
-M_DEVINLINE size_t global_thread_idx () {
-  return threadIdx.x + blockIdx.x * blockDim.x;
-}
-
-M_DEVINLINE size_t global_thread_idy () {
-  return threadIdx.y + blockIdx.y * blockDim.y; 
-}
-
-M_DEVINLINE size_t grid_stride_x () {
-  return blockDim.x * gridDim.x;
-}
-
-M_DEVINLINE size_t grid_stride_y () {
-  return blockDim.y * gridDim.y;
-}
-
-template <typename T1, typename T2>
-T1 M_HOSTDEVINLINE div_roundup(const T1 a, const T2 b) {
-  return static_cast<T1>(ceil(static_cast<double>(a) / b));
-}
-
-
-// Work around for shared memory
-// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name
-template <typename T>
-struct KmSharedMem;
-
-template <>
-struct KmSharedMem<float> {
-  __device__ float * ptr() {
-    extern __shared__ __align__(sizeof(float)) float s_float[];
-    return s_float;
-  }
-};
-
-template <>
-struct KmSharedMem<double> {
-  __device__ double * ptr() {
-    extern __shared__ __align__(sizeof(double)) double s_double[];
-    return s_double;
-  }
-};
-
-template <>
-struct KmSharedMem<int> {
-  __device__ int * ptr() {
-    extern __shared__ __align__(sizeof(int)) int s_int[];
-    return s_int;
-  }
-};
-
-
-}  // KMeans
-}  // H2O4GPU
-
-#endif  // UTILS_CUH_
\ No newline at end of file
diff --git a/src/gpu/kmeans/kmeans_general.h b/src/gpu/kmeans/kmeans_general.h
index 9697a48e3..d32dcbce8 100644
--- a/src/gpu/kmeans/kmeans_general.h
+++ b/src/gpu/kmeans/kmeans_general.h
@@ -4,6 +4,7 @@
  */
 #pragma once
 #include "../../common/logger.h"
+#include "../utils/utils.cuh"
 #include "stdio.h"
 #define MAX_NGPUS 16
 
@@ -13,8 +14,6 @@
 
 // TODO(pseudotensor): Avoid throw for python exception handling.  Need to avoid all exit's and return exit code all the way back.
 #define gpuErrchk(ans) { gpu_assert((ans), __FILE__, __LINE__); }
-#define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__);
-#define safe_cublas(ans) throw_on_cublas_error((ans), __FILE__, __LINE__);
 
 #define CUDACHECK(cmd) do {                           \
     cudaError_t e = cmd;                              \
diff --git a/src/gpu/kmeans/kmeans_h2o4gpu.cu b/src/gpu/kmeans/kmeans_h2o4gpu.cu
index a07fd5087..224070503 100644
--- a/src/gpu/kmeans/kmeans_h2o4gpu.cu
+++ b/src/gpu/kmeans/kmeans_h2o4gpu.cu
@@ -774,9 +774,9 @@ int kmeans_fit(int verbose, int seed, int gpu_idtry, int n_gputry,
             thrust::device,
             data[i]->begin(), data[i]->end(), h_init_data.begin());
       }
-      H2O4GPU::KMeans::KmMatrix<T> init_data(h_init_data, rows, cols);
-      H2O4GPU::KMeans::KmMatrix<T> final_centroids_matrix =
-          H2O4GPU::KMeans::KmeansLlInit<T>(seed, 1.5)(init_data, k);
+      h2o4gpu::kMeans::KmMatrix<T> init_data(h_init_data, rows, cols);
+      h2o4gpu::kMeans::KmMatrix<T> final_centroids_matrix =
+          h2o4gpu::kMeans::KmeansLlInit<T>(seed, 1.5)(init_data, k);
       thrust::host_vector<T> final_centroids (final_centroids_matrix.size());
       thrust::copy(
           final_centroids_matrix.dev_ptr(),
diff --git a/src/gpu/kmeans/kmeans_init.cu b/src/gpu/kmeans/kmeans_init.cu
index 3a93cc138..091023a9c 100644
--- a/src/gpu/kmeans/kmeans_init.cu
+++ b/src/gpu/kmeans/kmeans_init.cu
@@ -15,14 +15,16 @@
 
 #include "kmeans_init.cuh"
 
-#include "KmMatrix/KmMatrix.hpp"
-#include "KmMatrix/Arith.hpp"
-#include "KmMatrix/utils.cuh"
-#include "KmMatrix/GpuInfo.cuh"
-#include "KmMatrix/blas.cuh"
+#include "../matrix/KmMatrix/KmMatrix.hpp"
+#include "../matrix/KmMatrix/Arith.hpp"
+#include "../matrix/KmMatrix/blas.cuh"
+#include "../utils/utils.cuh"
+#include "../utils/GpuInfo.cuh"
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace kMeans {
+
+using namespace Matrix;
 
 namespace kernel {
 // X^2 + Y^2, here only calculates the + operation.
@@ -116,7 +118,7 @@ KmMatrix<T> PairWiseDistanceOp<T>::operator()(KmMatrix<T>& _data,
           data_dot_.k_param(),
           centroids_dot_.k_param());
 
-  CUDA_CHECK(cudaGetLastError());
+  safe_cuda(cudaGetLastError());
 
   cublasHandle_t handle = GpuInfo::ins().cublas_handle();
 
@@ -191,7 +193,7 @@ KmMatrix<T> GreedyRecluster<T>::recluster(KmMatrix<T>& _centroids, size_t _k) {
   void *d_temp_storage = NULL;
 
   // determine the temp_storage_bytes
-  CUDA_CHECK(cub::DeviceHistogram::HistogramEven(
+  safe_cuda(cub::DeviceHistogram::HistogramEven(
       d_temp_storage, temp_storage_bytes,
       min_indices.dev_ptr(),
       weights.dev_ptr(),
@@ -200,8 +202,8 @@ KmMatrix<T> GreedyRecluster<T>::recluster(KmMatrix<T>& _centroids, size_t _k) {
       (T)min_indices.rows(),
       (int)_centroids.rows()));
 
-  CUDA_CHECK(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes));
-  CUDA_CHECK(cub::DeviceHistogram::HistogramEven(
+  safe_cuda(cudaMalloc((void**)&d_temp_storage, temp_storage_bytes));
+  safe_cuda(cub::DeviceHistogram::HistogramEven(
       d_temp_storage, temp_storage_bytes,
       min_indices.dev_ptr(),    // d_samples
       weights.dev_ptr(),        // d_histogram
@@ -209,7 +211,7 @@ KmMatrix<T> GreedyRecluster<T>::recluster(KmMatrix<T>& _centroids, size_t _k) {
       (T)0.0,                   // lower_level
       (T)min_indices.rows(),    // upper_level
       (int)_centroids.rows())); // num_samples
-  CUDA_CHECK(cudaFree(d_temp_storage));
+  safe_cuda(cudaFree(d_temp_storage));
 
   // Sort the indices by weights in ascending order, then use those at front
   // as result.
@@ -352,7 +354,7 @@ KmeansLlInit<T, ReclusterPolicy>::operator()(KmMatrix<T>& _data, size_t _k) {
         "k must be less than or equal to the number of data points"
         ", k: %lu, data points: %lu",
         _k, _data.rows());
-    M_USER_ERROR(err_msg);
+    h2o4gpu_error(err_msg);
   }
 
   if (seed_ < 0) {
@@ -433,5 +435,5 @@ INSTANTIATE(int)
 #undef INSTANTIATE
 }
 
-}  // namespace Kmeans
-}  // namespace H2O4GPU
+}  // namespace kMeans
+}  // namespace h2o4gpu
diff --git a/src/gpu/kmeans/kmeans_init.cuh b/src/gpu/kmeans/kmeans_init.cuh
index 6f067fc9f..28a055c2a 100644
--- a/src/gpu/kmeans/kmeans_init.cuh
+++ b/src/gpu/kmeans/kmeans_init.cuh
@@ -9,17 +9,18 @@
 
 #include <memory>
 
-#include "KmMatrix/KmConfig.h"
-#include "KmMatrix/KmMatrix.hpp"
-#include "KmMatrix/utils.cuh"
-#include "KmMatrix/Generator.hpp"
-#include "KmMatrix/Generator.cuh"
-#include "KmMatrix/GpuInfo.cuh"
+#include "../matrix/KmMatrix/KmMatrix.hpp"
+#include "../matrix/KmMatrix/Generator.hpp"
+#include "../matrix/KmMatrix/Generator.cuh"
+#include "../utils/GpuInfo.cuh"
+#include "../utils/utils.cuh"
 
 constexpr double ESP = 1e-8;
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace kMeans {
+
+using namespace Matrix;
 
 namespace detail {
 
@@ -202,7 +203,7 @@ struct KmeansLlInit : public KmeansInitBase<T> {
 
 // FIXME: Make kmeans++ a derived class of KmeansInitBase
 
-}  // namespace Kmeans
-}  // namespace H2O4GPU
+}  // namespace kMeans
+}  // namespace h2o4gpu
 
 #endif  // KMEANS_INIT_H_
\ No newline at end of file
diff --git a/src/gpu/kmeans/kmeans_labels.h b/src/gpu/kmeans/kmeans_labels.h
index ada7f286e..87b7b5ed9 100644
--- a/src/gpu/kmeans/kmeans_labels.h
+++ b/src/gpu/kmeans/kmeans_labels.h
@@ -9,9 +9,13 @@
 #include <sstream>
 #include <cublas_v2.h>
 #include <cfloat>
-#include "kmeans_general.h"
 #include <thrust/fill.h>
 
+#include "kmeans_general.h"
+#include "../utils/utils.cuh"
+
+using namespace h2o4gpu;
+
 inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=true) {
 	if (code != cudaSuccess) {
 		fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
@@ -23,20 +27,6 @@ inline void gpu_assert(cudaError_t code, const char *file, int line, bool abort=
 	}
 }
 
-
-inline cudaError_t throw_on_cuda_error(cudaError_t code, const char *file,
-                                       int line) {
-  if (code != cudaSuccess) {
-    std::stringstream ss;
-    ss << file << "(" << line << ")";
-    std::string file_and_line;
-    ss >> file_and_line;
-    thrust::system_error(code, thrust::cuda_category(), file_and_line);
-  }
-
-  return code;
-}
-
 #ifdef CUBLAS_API_H_
 // cuBLAS API errors
 static const char *cudaGetErrorEnum(cublasStatus_t error)
@@ -72,23 +62,6 @@ static const char *cudaGetErrorEnum(cublasStatus_t error)
 }
 #endif
 
-inline cublasStatus_t throw_on_cublas_error(cublasStatus_t code, const char *file,
-                                       int line) {
-
-
-  if (code != CUBLAS_STATUS_SUCCESS) {
-    fprintf(stderr,"cublas error: %s %s %d\n", cudaGetErrorEnum(code), file, line);
-    std::stringstream ss;
-    ss << file << "(" << line << ")";
-    std::string file_and_line;
-    ss >> file_and_line;
-    thrust::system_error(code, thrust::cuda_category(), file_and_line);
-  }
-
-  return code;
-}
-
-
 extern cudaStream_t cuda_stream[MAX_NGPUS];
 
 template<unsigned int i>
diff --git a/src/gpu/kmeans/KmMatrix/Arith.cu b/src/gpu/matrix/KmMatrix/Arith.cu
similarity index 94%
rename from src/gpu/kmeans/KmMatrix/Arith.cu
rename to src/gpu/matrix/KmMatrix/Arith.cu
index 88bc9d920..2f9c4ccd5 100644
--- a/src/gpu/kmeans/KmMatrix/Arith.cu
+++ b/src/gpu/matrix/KmMatrix/Arith.cu
@@ -1,6 +1,8 @@
 #include "Arith.hpp"
-namespace H2O4GPU {
-namespace KMeans {
+#include "../../utils/GpuInfo.cuh"
+
+namespace h2o4gpu {
+namespace Matrix {
 
 namespace kernel {
 
@@ -121,7 +123,9 @@ KmMatrix<int> ArgMinOp<T>::argmin(KmMatrix<T>& _val, KmMatrixDim _dim) {
     return _res;
   } else {
     // FIXME
-    M_ERROR("Not implemented");
+    h2o4gpu_error("Not implemented");
+    KmMatrix<int> res;
+    return res;
   }
 }
 
@@ -135,7 +139,9 @@ KmMatrix<T> MinOp<T>::min(KmMatrix<T>& _val, KmMatrixDim _dim) {
     return _res;
   } else {
     // FIXME
-    M_ERROR("Not implemented");
+    h2o4gpu_error("Not implemented");
+    KmMatrix<T> res;
+    return res;
   }
 }
 
@@ -159,5 +165,5 @@ INSTANTIATE(double)
 INSTANTIATE(float)
 INSTANTIATE(int)
 
-}      // namespace KMenas
-}      // namespace H204GPU
\ No newline at end of file
+}      // namespace Matrix
+}      // namespace h2o4gpu
\ No newline at end of file
diff --git a/src/gpu/kmeans/KmMatrix/Arith.hpp b/src/gpu/matrix/KmMatrix/Arith.hpp
similarity index 90%
rename from src/gpu/kmeans/KmMatrix/Arith.hpp
rename to src/gpu/matrix/KmMatrix/Arith.hpp
index 3ae493455..7760cd6aa 100644
--- a/src/gpu/kmeans/KmMatrix/Arith.hpp
+++ b/src/gpu/matrix/KmMatrix/Arith.hpp
@@ -3,10 +3,10 @@
 
 #include "KmMatrix.hpp"
 #include "blas.cuh"
-#include "utils.cuh"
+#include "../../utils/utils.cuh"
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 // FIXME: Using struct for operations is just keeping the possibility of
 // creating an unified operations for KmMatrix. For example, let KmMatrix
@@ -52,7 +52,7 @@ struct MinOp {
   KmMatrix<T> min(KmMatrix<T>& _val, KmMatrixDim _dim);
 };
 
-}      // namespace KMenas
-}      // namespace H204GPU
+}      // namespace Matrix
+}      // namespace h2o4gpu
 
 #endif  // M_ARITH_HPP_
diff --git a/src/gpu/kmeans/KmMatrix/Generator.cuh b/src/gpu/matrix/KmMatrix/Generator.cuh
similarity index 84%
rename from src/gpu/kmeans/KmMatrix/Generator.cuh
rename to src/gpu/matrix/KmMatrix/Generator.cuh
index 890729f3c..061edbe96 100644
--- a/src/gpu/kmeans/KmMatrix/Generator.cuh
+++ b/src/gpu/matrix/KmMatrix/Generator.cuh
@@ -10,11 +10,11 @@
 
 #include "Generator.hpp"
 #include "KmMatrix.hpp"
-#include "utils.cuh"
+#include "../../utils/utils.cuh"
 
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 namespace kernel {
 // Split the definition to avoid multiple definition.
@@ -45,9 +45,9 @@ struct UniformGenerator : public GeneratorBase<T> {
     random_numbers_ = KmMatrix<T> (1, size_);
 
     if (dev_states_ != nullptr) {
-      CUDA_CHECK(cudaFree(dev_states_));
+      safe_cuda(cudaFree(dev_states_));
     }
-    CUDA_CHECK(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState)));
+    safe_cuda(cudaMalloc((void **)&dev_states_, size_ * sizeof(curandState)));
     kernel::setup_random_states<<<div_roundup(size_, 256), 256>>>(
         seed_, dev_states_, size_);
   }
@@ -60,7 +60,7 @@ struct UniformGenerator : public GeneratorBase<T> {
 
   UniformGenerator (size_t _size, int _seed) {
     if (_size == 0) {
-      M_ERROR("Zero size for generate is not allowed.");
+      h2o4gpu_error("Zero size for generate is not allowed.");
     }
     initialize(_size);
   }
@@ -70,7 +70,7 @@ struct UniformGenerator : public GeneratorBase<T> {
 
   ~UniformGenerator () {
     if (dev_states_ != nullptr) {
-      CUDA_CHECK(cudaFree(dev_states_));
+      safe_cuda(cudaFree(dev_states_));
     }
   }
 
@@ -87,7 +87,7 @@ struct UniformGenerator : public GeneratorBase<T> {
 
   KmMatrix<T> generate(size_t _size) override {
     if (_size == 0) {
-      M_ERROR("Zero size for generate is not allowed.");
+      h2o4gpu_error("Zero size for generate is not allowed.");
     }
     if (_size != size_) {
       initialize(_size);
@@ -96,5 +96,5 @@ struct UniformGenerator : public GeneratorBase<T> {
   }
 };
   
-}  // H2O4GPU
-}  // KMeans
\ No newline at end of file
+}  // namespace h2o4gpu
+}  // namespace Matrix
\ No newline at end of file
diff --git a/src/gpu/kmeans/KmMatrix/Generator.hpp b/src/gpu/matrix/KmMatrix/Generator.hpp
similarity index 80%
rename from src/gpu/kmeans/KmMatrix/Generator.hpp
rename to src/gpu/matrix/KmMatrix/Generator.hpp
index 7200b85ff..9ef1ae9ac 100644
--- a/src/gpu/kmeans/KmMatrix/Generator.hpp
+++ b/src/gpu/matrix/KmMatrix/Generator.hpp
@@ -8,8 +8,8 @@
 
 #include "KmMatrix.hpp"
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 template <typename T>
 class GeneratorBase {
@@ -18,8 +18,8 @@ class GeneratorBase {
   virtual KmMatrix<T> generate(size_t _size) = 0;
 };
 
-}
-}
+}  // namespace Matrix
+}  // namespace h2o4gpu
 
 
 #endif  // GENERATOR_HPP_
diff --git a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu b/src/gpu/matrix/KmMatrix/GeneratorKernels.cu
similarity index 95%
rename from src/gpu/kmeans/KmMatrix/GeneratorKernels.cu
rename to src/gpu/matrix/KmMatrix/GeneratorKernels.cu
index 28b521f26..e81885f12 100644
--- a/src/gpu/kmeans/KmMatrix/GeneratorKernels.cu
+++ b/src/gpu/matrix/KmMatrix/GeneratorKernels.cu
@@ -6,8 +6,8 @@
 #include <cublas_v2.h>
 #include <curand_kernel.h>
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 namespace kernel {
 
 __global__ void setup_random_states(int _seed, curandState *_state,
@@ -59,5 +59,5 @@ __global__ void generate_uniform_kernel(int *_res,
 }
 
 }  // namespace kernel
-}  // namespace KMeans
-}  // namespace H2O4GPU
\ No newline at end of file
+}  // namespace Matrix
+}  // namespace h2o4gpu
\ No newline at end of file
diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp b/src/gpu/matrix/KmMatrix/KmMatrix.cpp
similarity index 95%
rename from src/gpu/kmeans/KmMatrix/KmMatrix.cpp
rename to src/gpu/matrix/KmMatrix/KmMatrix.cpp
index d350bd7e2..67d3f3207 100644
--- a/src/gpu/kmeans/KmMatrix/KmMatrix.cpp
+++ b/src/gpu/matrix/KmMatrix/KmMatrix.cpp
@@ -4,13 +4,13 @@
  */
 
 #include "KmMatrix.hpp"
-#include "KmConfig.h"
+
 #if USE_CUDA()
 #include "KmMatrixCuda.cuh"
 #endif
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 // ==============================
 // KmMatrixImpl implementation
@@ -222,7 +222,7 @@ KmMatrixProxy<T> KmMatrix<T>::row(size_t idx, bool dev_mem) {
 
 template <typename T>
 KmMatrixProxy<T> KmMatrix<T>::col(size_t idx) {
-  M_ERROR("Not implemented.");
+  h2o4gpu_error("Not implemented.");
   return KmMatrixProxy<T>(*this, 0, 0, 0);
 }
 
@@ -236,14 +236,14 @@ KmMatrix<T> KmMatrix<T>::rows(KmMatrix<T>& _index) {
     }
     res = impls[(int)Backend::CUDADense]->rows(_index);
   } else {
-    M_ERROR("Not implemented.");
+    h2o4gpu_error("Not implemented.");
   }
   return res;
 }
 
 template <typename T>
 KmMatrix<T> KmMatrix<T>::cols(KmMatrix<T>& _index) {
-  M_ERROR("Not implemented.");
+  h2o4gpu_error("Not implemented.");
   KmMatrix<T> res;
   return res;
 }
@@ -254,7 +254,7 @@ bool KmMatrix<T>::operator==(KmMatrix<T>& _rhs) {
     bool res = impls[(int)Backend::CUDADense]->equal(_rhs);
     return res;
   } else {
-    M_ERROR("Not implemented.");
+    h2o4gpu_error("Not implemented.");
     return false;
   }
 }
@@ -266,17 +266,17 @@ KmMatrix<T> KmMatrix<T>::stack(KmMatrix<T> &_second,
 
   if (_dim == KmMatrixDim::ROW) {
     if (cols() != _second.cols()) {
-      M_ERROR("Columns of first is not equal to second.");
+      h2o4gpu_error("Columns of first is not equal to second.");
     }
 
     if (backend_ == Backend::CUDADense) {
       res = impls[(int)Backend::CUDADense]->stack(_second, _dim);
     } else {
-      M_ERROR("Not implemented.");
+      h2o4gpu_error("Not implemented.");
     }
 
   } else {
-    M_ERROR("Not implemented.");
+    h2o4gpu_error("Not implemented.");
   }
 
   return res;
@@ -342,7 +342,7 @@ KmMatrix<T> stack(KmMatrix<T>& _first, KmMatrix<T>& _second,
   template KmMatrix<T> KmMatrix<T>::cols(KmMatrix<T>& _index);          \
   template bool KmMatrix<T>::operator==(KmMatrix<T> &_rhs);             \
   template KmMatrix<T> KmMatrix<T>::stack(KmMatrix<T> &_second,         \
-      H2O4GPU::KMeans::KmMatrixDim _dim);                               \
+                                          KmMatrixDim _dim);            \
   /* Helper functions */                                                \
   template std::ostream& operator<<(std::ostream& os, KmMatrix<T>& m);  \
   template KmMatrix<T> stack(KmMatrix<T>& _first, KmMatrix<T>& _second, \
@@ -354,5 +354,5 @@ INSTANTIATE(double)
 INSTANTIATE(int)
 
 #undef INSTANTIATE
-}  // namespace KMeans
-}  // namepsace H2O4GPU
+}  // namespace Matrix
+}  // namepsace h2o4gpu
diff --git a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp b/src/gpu/matrix/KmMatrix/KmMatrix.hpp
similarity index 94%
rename from src/gpu/kmeans/KmMatrix/KmMatrix.hpp
rename to src/gpu/matrix/KmMatrix/KmMatrix.hpp
index 31ad2dcd4..dbeb1943e 100644
--- a/src/gpu/kmeans/KmMatrix/KmMatrix.hpp
+++ b/src/gpu/matrix/KmMatrix/KmMatrix.hpp
@@ -11,15 +11,14 @@
 #include <memory>
 #include <iomanip>
 #include <stdexcept>
-
-#include "KmConfig.h"
+#include "../../../common/utils.h"
 
 #if USE_CUDA()
 #include "KmMatrixCuda.cuh"
 #endif
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 template <typename T>
 class KmMatrixProxy;
@@ -53,13 +52,13 @@ struct kParam {
     cols = _other.cols;
     ptr = _other.ptr;
   }
-  M_HOSTDEV void operator=(const kParam& _other) {
+  HG_HOSTDEV void operator=(const kParam& _other) {
     rows = _other.rows;
     cols = _other.cols;
     ptr = _other.ptr;
   }
 
-  M_HOSTDEV size_t size() const {
+  HG_HOSTDEV size_t size() const {
     return rows * cols;
   }
 };
@@ -193,7 +192,7 @@ struct KmMatrixSizeError: public std::runtime_error
   {}
 };
 
-}  // namespace KMeans
-}  // namespace H2O4GPU
+}  // namespace Matrix
+}  // namespace h2o4gpu
 
 #endif
diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cu
similarity index 96%
rename from src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu
rename to src/gpu/matrix/KmMatrix/KmMatrixCuda.cu
index a2d9a3c29..6b6052ce1 100644
--- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cu
+++ b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cu
@@ -12,8 +12,8 @@
 #include "KmMatrixCuda.cuh"
 #include "KmMatrix.hpp"
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 template <typename T>
 CudaKmMatrixImpl<T>::CudaKmMatrixImpl(KmMatrix<T> * _par) :
@@ -173,7 +173,7 @@ KmMatrix<T> CudaKmMatrixImpl<T>::stack(KmMatrix<T>& _second,
                                        KmMatrixDim _dim) {
   if (_dim == KmMatrixDim::ROW) {
     if (KmMatrixImpl<T>::matrix_->cols() != _second.cols()) {
-      M_ERROR("Columns of first is not equal to second.");
+      h2o4gpu_error("Columns of first is not equal to second.");
     }
     host_to_device();
 
@@ -195,7 +195,9 @@ KmMatrix<T> CudaKmMatrixImpl<T>::stack(KmMatrix<T>& _second,
     return res;
   } else {
     // FIXME
-    M_ERROR("Not implemented.");
+    h2o4gpu_error("Not implemented.");
+    KmMatrix<T> res;
+    return res;
   }
 }
 
@@ -229,5 +231,6 @@ INSTANTIATE(double)
 INSTANTIATE(int)
 
 #undef INSTANTIATE
-}  // namespace H204GPU
-}  // namespace Array
+
+}  // namespace Matrix
+}  // namespace h2o4gpu
diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh
similarity index 95%
rename from src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh
rename to src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh
index 50500f20e..02f8f6adc 100644
--- a/src/gpu/kmeans/KmMatrix/KmMatrixCuda.cuh
+++ b/src/gpu/matrix/KmMatrix/KmMatrixCuda.cuh
@@ -10,8 +10,8 @@
 #include "thrust/device_vector.h"
 #include <memory>
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 template <typename T>
 class KmMatrix;
@@ -81,7 +81,7 @@ class CudaKmMatrixImpl : public KmMatrixImpl<T> {
   virtual bool on_device() const override;
 };
 
-}  // MkMatrix
-}  // H204GPU
+}  // namespace Matrix
+}  // namespace h2o4gpu
 
 #endif
diff --git a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp b/src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp
similarity index 95%
rename from src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp
rename to src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp
index 0bdfa9ebf..9e1d5ffd9 100644
--- a/src/gpu/kmeans/KmMatrix/KmMatrixProxy.cpp
+++ b/src/gpu/matrix/KmMatrix/KmMatrixProxy.cpp
@@ -5,8 +5,8 @@
 
 #include "KmMatrix.hpp"
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 template <typename T>
 KmMatrixProxy<T>::KmMatrixProxy(KmMatrix<T>& _other,
@@ -64,5 +64,5 @@ INSTANTIATE(float)
 INSTANTIATE(double)
 INSTANTIATE(int)
 
-}
-}
+}  // namespace Matrix
+}  // namespace h2o4gpu
diff --git a/src/gpu/kmeans/KmMatrix/blas.cuh b/src/gpu/matrix/KmMatrix/blas.cuh
similarity index 94%
rename from src/gpu/kmeans/KmMatrix/blas.cuh
rename to src/gpu/matrix/KmMatrix/blas.cuh
index 20cf73151..50b69898b 100644
--- a/src/gpu/kmeans/KmMatrix/blas.cuh
+++ b/src/gpu/matrix/KmMatrix/blas.cuh
@@ -7,12 +7,12 @@
 #define KM_BLAS_CUH_
 
 #include <cublas_v2.h>
-#include "KmConfig.h"
+#include "../../utils/utils.cuh"
 
 // C++ Wrappers for cublas
 
-namespace H2O4GPU {
-namespace KMeans {
+namespace h2o4gpu {
+namespace Matrix {
 
 namespace Blas {
 // LEVEL 1
@@ -20,7 +20,7 @@ inline void axpy(cublasHandle_t handle, int n,
                  const double *alpha,
                  const double *x, int incx,
                  double *y, int incy) {
-  CUBLAS_CHECK(cublasDaxpy(handle, n,
+  safe_cublas(cublasDaxpy(handle, n,
                            alpha,
                            x, incx,
                            y, incy));}
@@ -29,7 +29,7 @@ inline void axpy(cublasHandle_t handle, int n,
                  const float *alpha,
                  const float *x, int incx,
                  float *y, int incy) {
-  CUBLAS_CHECK(cublasSaxpy(handle, n,
+  safe_cublas(cublasSaxpy(handle, n,
                            alpha,
                            x, incx,
                            y, incy));}
@@ -38,7 +38,7 @@ inline void axpy(cublasHandle_t handle, int n,
                  const int *alpha,
                  const int *x, int incx,
                  int *y, int incy) {
-  CUBLAS_CHECK(cublasSaxpy(handle, n,
+  safe_cublas(cublasSaxpy(handle, n,
                            (const float *)alpha,
                            (const float *)x, incx,
                            (float *)y, incy));}
@@ -57,7 +57,7 @@ inline void gemm(cublasHandle_t handle,
                  const float *beta, /* host or device pointer */
                  float *C,
                  int ldc) {
-  CUBLAS_CHECK(cublasSgemm(handle,
+  safe_cublas(cublasSgemm(handle,
                            transa, transb,
                            m, n, k,
                            alpha, /* host or device pointer */
@@ -80,7 +80,7 @@ inline void gemm(cublasHandle_t handle,
                  const double *beta, /* host or device pointer */
                  double *C,
                  int ldc) {
-  CUBLAS_CHECK(cublasDgemm(handle,
+  safe_cublas(cublasDgemm(handle,
                            transa,
                            transb,
                            m,
@@ -109,7 +109,7 @@ inline void gemm(cublasHandle_t handle,
                  const int *beta, /* host or device pointer */
                  int *C,
                  int ldc) {
-  CUBLAS_CHECK(cublasSgemm(handle,
+  safe_cublas(cublasSgemm(handle,
                            transa, transb,
                            m, n, k,
                            (const float*)alpha, /* host or device pointer */
@@ -129,7 +129,7 @@ inline void gemm_batched(cublasHandle_t handle,
                          const double *beta,
                          double          *Carray[], int ldc, 
                          int batchCount) {
-  CUBLAS_CHECK(cublasDgemmBatched(handle,
+  safe_cublas(cublasDgemmBatched(handle,
                                   transa, 
                                   transb,
                                   m, n, k,
@@ -151,7 +151,7 @@ inline void gemm_batched(cublasHandle_t handle,
                          const float *beta,
                          float *Carray[], int ldc, 
                          int batchCount) {
-  CUBLAS_CHECK(cublasSgemmBatched(handle,
+  safe_cublas(cublasSgemmBatched(handle,
                                   transa, 
                                   transb,
                                   m, n, k,
@@ -173,7 +173,7 @@ inline void gemm_batched(cublasHandle_t handle,
                          const int *beta,
                          float *Carray[], int ldc, 
                          int batchCount) {
-  CUBLAS_CHECK(cublasSgemmBatched(handle,
+  safe_cublas(cublasSgemmBatched(handle,
                                   transa, 
                                   transb,
                                   m, n, k,
@@ -196,7 +196,7 @@ inline void gemm_strided_batched(
     const double* beta,
     double* C, int ldC, int strideC,
     int batchCount) {
-  CUBLAS_CHECK(cublasDgemmStridedBatched(handle,
+  safe_cublas(cublasDgemmStridedBatched(handle,
                                          transA, 
                                          transB,
                                          M, N, K,
@@ -221,7 +221,7 @@ inline void gemm_strided_batched(
     const float* beta,
     float* C, int ldC, int strideC,
     int batchCount) {
-  CUBLAS_CHECK(cublasSgemmStridedBatched(handle,
+  safe_cublas(cublasSgemmStridedBatched(handle,
                                          transA, 
                                          transB,
                                          M, N, K,
@@ -246,7 +246,7 @@ inline void gemm_strided_batched(
     const int* beta,
     int* C, int ldC, int strideC,
     int batchCount) {
-  CUBLAS_CHECK(cublasSgemmStridedBatched(handle,
+  safe_cublas(cublasSgemmStridedBatched(handle,
                                          transA, 
                                          transB,
                                          M, N, K,
@@ -262,7 +262,7 @@ inline void gemm_strided_batched(
 }
 
 }  // Blas
-}  // KMeans
-}  // H2O4GPU
+}  // Matrix
+}  // h2o4gpu
 
 #endif  // KM_BLAS_CUH_
\ No newline at end of file
diff --git a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh b/src/gpu/utils/GpuInfo.cuh
similarity index 88%
rename from src/gpu/kmeans/KmMatrix/GpuInfo.cuh
rename to src/gpu/utils/GpuInfo.cuh
index 22fe53e8d..0a57644f7 100644
--- a/src/gpu/kmeans/KmMatrix/GpuInfo.cuh
+++ b/src/gpu/utils/GpuInfo.cuh
@@ -6,13 +6,14 @@
 #ifndef GPU_INFO_HPP_
 #define GPU_INFO_HPP_
 
-#include "KmConfig.h"
+#include "utils.cuh"
 
 #include <cublas_v2.h>
 
 #include <stdlib.h>
 #include <stdio.h>
 
+namespace h2o4gpu {
 // Singleton class storing gpu info.
 // Call GpuInfo::ins() to use the class;
 class GpuInfo {
@@ -23,19 +24,19 @@ class GpuInfo {
 
  public:
   GpuInfo () {
-    CUDA_CHECK(cudaGetDeviceCount(&n_gpu_));
+    safe_cuda(cudaGetDeviceCount(&n_gpu_));
     n_sm_ = (int*) malloc (n_gpu_);
     handles_ = (cublasHandle_t*) malloc (n_gpu_);
 
     for (int i = 0; i < n_gpu_; ++i) {
       cudaDeviceGetAttribute(&n_sm_[i], cudaDevAttrMultiProcessorCount, i);
-      CUBLAS_CHECK(cublasCreate(&handles_[i]));
+      safe_cublas(cublasCreate(&handles_[i]));
     }
   }
   ~GpuInfo () {
     free (n_sm_);
     for (int i = 0; i < n_gpu_; ++i) {
-      CUBLAS_CHECK(cublasDestroy(handles_[i]));
+      safe_cublas(cublasDestroy(handles_[i]));
     }
     free (handles_);
   }
@@ -74,4 +75,6 @@ class GpuInfo {
   }
 };
 
+}      // namespace h2o4gpu
+
 #endif  // GPU_INFO_HPP_
diff --git a/src/gpu/utils/utils.cuh b/src/gpu/utils/utils.cuh
index 3f35f375d..fed9e46e1 100644
--- a/src/gpu/utils/utils.cuh
+++ b/src/gpu/utils/utils.cuh
@@ -9,29 +9,10 @@
 #include <ctime>
 #include <cusparse.h>
 
+#include "../../common/utils.h"
+
 namespace h2o4gpu
 {
-#define h2o4gpu_error(x) error(x, __FILE__, __LINE__);
-
-	inline void error(const char* e, const char* file, int line)
-	{
-		std::stringstream ss;
-		ss << e << " - " << file << "(" << line << ")";
-		//throw error_text;
-		std::cerr << ss.str() << std::endl;
-		exit(-1);
-	}
-
-#define h2o4gpu_check(condition, msg) check(condition, msg, __FILE__, __LINE__);
-
-	inline void check(bool val, const char* e, const char* file, int line)
-	{
-		if (!val)
-		{
-			error(e, file, line);
-		}
-	}
-
 
 #define safe_cuda(ans) throw_on_cuda_error((ans), __FILE__, __LINE__)
 
@@ -366,4 +347,56 @@ namespace h2o4gpu
 			                  return idx * col_size;
 		                  });
 	}
+
+HG_DEVINLINE size_t global_thread_idx () {
+  return threadIdx.x + blockIdx.x * blockDim.x;
+}
+
+HG_DEVINLINE size_t global_thread_idy () {
+  return threadIdx.y + blockIdx.y * blockDim.y; 
+}
+
+HG_DEVINLINE size_t grid_stride_x () {
+  return blockDim.x * gridDim.x;
+}
+
+HG_DEVINLINE size_t grid_stride_y () {
+  return blockDim.y * gridDim.y;
+}
+
+template <typename T1, typename T2>
+T1 HG_HOSTDEVINLINE div_roundup(const T1 a, const T2 b) {
+  return static_cast<T1>(ceil(static_cast<double>(a) / b));
+}
+
+
+// Work around for shared memory
+// https://stackoverflow.com/questions/20497209/getting-cuda-error-declaration-is-incompatible-with-previous-variable-name
+template <typename T>
+struct KernelSharedMem;
+
+template <>
+struct KernelSharedMem<float> {
+  __device__ float * ptr() {
+    extern __shared__ __align__(sizeof(float)) float s_float[];
+    return s_float;
+  }
+};
+
+template <>
+struct KernelSharedMem<double> {
+  __device__ double * ptr() {
+    extern __shared__ __align__(sizeof(double)) double s_double[];
+    return s_double;
+  }
+};
+
+template <>
+struct KernelSharedMem<int> {
+  __device__ int * ptr() {
+    extern __shared__ __align__(sizeof(int)) int s_int[];
+    return s_int;
+  }
+};
+
 }
diff --git a/tests/cpp/gpu/KmMatrix/test_arith.cu b/tests/cpp/gpu/KmMatrix/test_arith.cu
index fda0db018..81bf4101c 100644
--- a/tests/cpp/gpu/KmMatrix/test_arith.cu
+++ b/tests/cpp/gpu/KmMatrix/test_arith.cu
@@ -2,12 +2,12 @@
 #include <thrust/device_vector.h>
 #include <vector>
 
-#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp"
-#include "../../../../src/gpu/kmeans/KmMatrix/Arith.hpp"
+#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp"
+#include "../../../../src/gpu/matrix/KmMatrix/Arith.hpp"
 
 #include <iostream>
 
-using namespace H2O4GPU::KMeans;
+using namespace h2o4gpu::Matrix;
 
 constexpr float esp = 0.001f;
 
diff --git a/tests/cpp/gpu/KmMatrix/test_matrix.cu b/tests/cpp/gpu/KmMatrix/test_matrix.cu
index 3b36b35fe..e1aae97f1 100644
--- a/tests/cpp/gpu/KmMatrix/test_matrix.cu
+++ b/tests/cpp/gpu/KmMatrix/test_matrix.cu
@@ -1,7 +1,7 @@
 #include <gtest/gtest.h>
 #include <thrust/device_vector.h>
 
-#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp"
+#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp"
 #include <cmath>
 
 // r --gtest_filter=KmMatrix.KmMatrixEqual
@@ -10,7 +10,7 @@ TEST(KmMatrix, KmMatrixEqual) {
   for (size_t i = 0; i < 2048 * 1024; ++i) {
     vec[i] = i;
   }
-  H2O4GPU::KMeans::KmMatrix<double> mat (vec, 2048, 1024);
+  h2o4gpu::Matrix::KmMatrix<double> mat (vec, 2048, 1024);
 
   ASSERT_TRUE (mat == mat);
 
@@ -18,7 +18,7 @@ TEST(KmMatrix, KmMatrixEqual) {
   for (size_t i = 0; i < 2048 * 1024; ++i) {
     vec2[i] = i + i;
   }
-  H2O4GPU::KMeans::KmMatrix<double> mat2 (vec2, 2048, 1024);
+  h2o4gpu::Matrix::KmMatrix<double> mat2 (vec2, 2048, 1024);
 
   ASSERT_FALSE(mat == mat2);
 }
@@ -29,9 +29,9 @@ TEST(KmMatrix, KmMatrixAssig) {
     vec[i] = i;
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> mat0 (vec, 2048, 1024);
-  H2O4GPU::KMeans::KmMatrix<double> mat1 = mat0;
-  H2O4GPU::KMeans::KmMatrix<double> mat2;
+  h2o4gpu::Matrix::KmMatrix<double> mat0 (vec, 2048, 1024);
+  h2o4gpu::Matrix::KmMatrix<double> mat1 = mat0;
+  h2o4gpu::Matrix::KmMatrix<double> mat2;
 
   mat2 = mat0;
 
@@ -44,16 +44,16 @@ TEST(KmMatrix, KmMatrixRows) {
   for (size_t i = 0; i < 12 * 16; ++i) {
     vec[i] = i;
   }
-  H2O4GPU::KMeans::KmMatrix<double> mat (vec, 12, 16);
+  h2o4gpu::Matrix::KmMatrix<double> mat (vec, 12, 16);
 
   thrust::host_vector<double> h_index (4, 1);
   h_index[0] = 0;
   h_index[1] = 2;
   h_index[2] = 9;
   h_index[3] = 1;
-  H2O4GPU::KMeans::KmMatrix<double> index (h_index, 4, 1);
+  h2o4gpu::Matrix::KmMatrix<double> index (h_index, 4, 1);
 
-  H2O4GPU::KMeans::KmMatrix<double> rows = mat.rows(index);
+  h2o4gpu::Matrix::KmMatrix<double> rows = mat.rows(index);
 
   thrust::host_vector<double> h_sol (4 * 16);
   for (size_t i = 0; i < 16; ++i) {
@@ -69,7 +69,7 @@ TEST(KmMatrix, KmMatrixRows) {
     h_sol[i] = vec[16 * 1 + (i - 48)];
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> sol (h_sol, 4, 16);
+  h2o4gpu::Matrix::KmMatrix<double> sol (h_sol, 4, 16);
 
   ASSERT_TRUE(rows == sol);
 }
@@ -77,13 +77,13 @@ TEST(KmMatrix, KmMatrixRows) {
 TEST(KmMatrix, SizeError) {
   thrust::host_vector<double> vec (12 * 16);
   ASSERT_THROW(
-      H2O4GPU::KMeans::KmMatrix<double> mat (vec, 12, 4),
+      h2o4gpu::Matrix::KmMatrix<double> mat (vec, 12, 4),
       std::runtime_error);
 }
 
 TEST(KmMatrix, KmMatrixUtils) {
   thrust::host_vector<double> vec (12 * 16);
-  H2O4GPU::KMeans::KmMatrix<double> mat (vec, 12, 16);
+  h2o4gpu::Matrix::KmMatrix<double> mat (vec, 12, 16);
 
   ASSERT_EQ(mat.rows(), 12);
   ASSERT_EQ(mat.cols(), 16);
@@ -93,9 +93,9 @@ TEST(KmMatrix, KmMatrixUtils) {
 TEST(KmMatrix, KmMatrixKparam) {
   thrust::host_vector<double> vec (12 * 16);
   thrust::fill(vec.begin(), vec.end(), 1);
-  H2O4GPU::KMeans::KmMatrix<double> mat (vec, 12, 16);
+  h2o4gpu::Matrix::KmMatrix<double> mat (vec, 12, 16);
 
-  H2O4GPU::KMeans::kParam<double> param = mat.k_param();
+  h2o4gpu::Matrix::kParam<double> param = mat.k_param();
   ASSERT_EQ(param.ptr, mat.dev_ptr());
   ASSERT_EQ(param.rows, 12);
   ASSERT_EQ(param.cols, 16);
@@ -110,11 +110,11 @@ TEST(KmMatrix, KmMatrixCycle) {
   // Tweak this one to see if memory grows, there should be a better way to
   // test memory leak.
   size_t iters = std::pow(16, 1);
-  H2O4GPU::KMeans::KmMatrix<double> mat0 (vec, rows, cols);
+  h2o4gpu::Matrix::KmMatrix<double> mat0 (vec, rows, cols);
   mat0.dev_ptr();
   for (size_t i = 0; i < iters; ++i) {
-    H2O4GPU::KMeans::KmMatrix<double> mat1 = mat0;
-    H2O4GPU::KMeans::KmMatrix<double> mat2 = mat1;
+    h2o4gpu::Matrix::KmMatrix<double> mat1 = mat0;
+    h2o4gpu::Matrix::KmMatrix<double> mat2 = mat1;
     mat0 = mat2;
   }
 }
@@ -126,16 +126,16 @@ TEST(KmMatrix, Stack) {
   for (size_t i = 0; i < rows * cols; ++i) {
     vec[i] = i;
   }
-  H2O4GPU::KMeans::KmMatrix<double> mat(vec, rows, cols);
+  h2o4gpu::Matrix::KmMatrix<double> mat(vec, rows, cols);
 
   thrust::host_vector<double> vec1 (rows * cols);
   for (size_t i = rows * cols; i < 2 * rows * cols; ++i) {
     vec1[i - rows * cols] = i;
   }
-  H2O4GPU::KMeans::KmMatrix<double> mat1(vec1, rows, cols);
+  h2o4gpu::Matrix::KmMatrix<double> mat1(vec1, rows, cols);
 
-  H2O4GPU::KMeans::KmMatrix<double> calculated =
-      H2O4GPU::KMeans::stack(mat, mat1, H2O4GPU::KMeans::KmMatrixDim::ROW);
+  h2o4gpu::Matrix::KmMatrix<double> calculated =
+      h2o4gpu::Matrix::stack(mat, mat1, h2o4gpu::Matrix::KmMatrixDim::ROW);
 
   thrust::host_vector<double> res (2 * rows * cols);
   for (size_t i = 0; i < rows * cols; ++i) {
@@ -145,7 +145,7 @@ TEST(KmMatrix, Stack) {
     res[i] = i;
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> res_mat (res, 2 * rows, cols);
+  h2o4gpu::Matrix::KmMatrix<double> res_mat (res, 2 * rows, cols);
 
   ASSERT_TRUE(calculated == res_mat);
 }
diff --git a/tests/cpp/gpu/KmMatrix/test_proxy.cu b/tests/cpp/gpu/KmMatrix/test_proxy.cu
index 732d27eef..f7574596f 100644
--- a/tests/cpp/gpu/KmMatrix/test_proxy.cu
+++ b/tests/cpp/gpu/KmMatrix/test_proxy.cu
@@ -1,6 +1,6 @@
 #include <gtest/gtest.h>
 #include <thrust/device_vector.h>
-#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp"
+#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp"
 
 // r --gtest_filter=KmMatrix.KmMatrixHostProxy
 TEST(KmMatrix, KmMatrixProxyHostEqual) {
@@ -10,9 +10,9 @@ TEST(KmMatrix, KmMatrixProxyHostEqual) {
     vec[i] = i;
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> mat (vec, rows, cols);
+  h2o4gpu::Matrix::KmMatrix<double> mat (vec, rows, cols);
 
-  H2O4GPU::KMeans::KmMatrix<double> row = mat.row(1);
+  h2o4gpu::Matrix::KmMatrix<double> row = mat.row(1);
 
   thrust::host_vector<double> res (cols);
 
@@ -20,7 +20,7 @@ TEST(KmMatrix, KmMatrixProxyHostEqual) {
     res[i] = v;
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> res_mat (res, 1, cols);
+  h2o4gpu::Matrix::KmMatrix<double> res_mat (res, 1, cols);
 
   ASSERT_TRUE(res_mat == row);
 }
@@ -34,12 +34,12 @@ TEST(KmMatrix, KmMatrixProxyDevEqual) {
     vec[i] = i;
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> mat (vec, rows, cols);
+  h2o4gpu::Matrix::KmMatrix<double> mat (vec, rows, cols);
   mat.set_name ("mat");
 
   mat.dev_ptr();
 
-  H2O4GPU::KMeans::KmMatrix<double> row = mat.row(1);
+  h2o4gpu::Matrix::KmMatrix<double> row = mat.row(1);
   row.set_name ("row");
 
   thrust::host_vector<double> res (cols);
@@ -48,7 +48,7 @@ TEST(KmMatrix, KmMatrixProxyDevEqual) {
     res[i] = v;
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> res_mat (res, 1, cols);
+  h2o4gpu::Matrix::KmMatrix<double> res_mat (res, 1, cols);
   res_mat.set_name("res");
 
   ASSERT_TRUE(res_mat == row);
diff --git a/tests/cpp/gpu/kmeans/test_kmeans_init.cu b/tests/cpp/gpu/kmeans/test_kmeans_init.cu
index d15b6d4d7..32efb79d1 100644
--- a/tests/cpp/gpu/kmeans/test_kmeans_init.cu
+++ b/tests/cpp/gpu/kmeans/test_kmeans_init.cu
@@ -5,22 +5,26 @@
 
 #include <gtest/gtest.h>
 
-#include "../../../../src/gpu/kmeans/KmMatrix/KmMatrix.hpp"
-#include "../../../../src/gpu/kmeans/KmMatrix/Generator.hpp"
-#include "../../../../src/gpu/kmeans/KmMatrix/Arith.hpp"
+#include "../../../../src/gpu/matrix/KmMatrix/KmMatrix.hpp"
+#include "../../../../src/gpu/matrix/KmMatrix/Generator.hpp"
+#include "../../../../src/gpu/matrix/KmMatrix/Arith.hpp"
 #include "../../../../src/gpu/kmeans/kmeans_init.cuh"
+#include "../../../../src/common/utils.h"
 
 #include <thrust/device_vector.h>
 #include <iostream>
 #include <memory>
 
-using namespace H2O4GPU::KMeans;
+using namespace h2o4gpu::kMeans;
+using namespace h2o4gpu::Matrix;
 
 template <typename T>
 struct GeneratorMock : GeneratorBase<T> {
  public:
   KmMatrix<T> generate() override {
-    M_ERROR("Not implemented");
+    h2o4gpu_error("Not implemented");
+    KmMatrix<T> res;
+    return res;
   }
 
   KmMatrix<T> generate(size_t _size) override {
@@ -155,7 +159,7 @@ TEST(KmeansLL, KmeansLLInit) {
     h_data[i] = i - 4;
   }
 
-  H2O4GPU::KMeans::KmMatrix<double> data (h_data, 6, 5);
+  KmMatrix<double> data (h_data, 6, 5);
 
   auto res = kmeans_ll_init(data, 2);