diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index da358e31dd..26f72df448 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -83,7 +83,7 @@ if(CUDA_FOUND)
 
   add_library(cuda INTERFACE)
   target_include_directories(cuda SYSTEM INTERFACE "${CUDA_INCLUDE_DIRS}")
-
+    
 else()
   message( FATAL_ERROR "CUDA package not found -> specify search path via CUDA_ROOT variable")
 endif()
diff --git a/deps/any b/deps/any
new file mode 160000
index 0000000000..e88b1bfc16
--- /dev/null
+++ b/deps/any
@@ -0,0 +1 @@
+Subproject commit e88b1bfc160fa9b01e6174dd29c812eeeece3be9
diff --git a/deps/googletest b/deps/googletest
new file mode 160000
index 0000000000..2fe3bd994b
--- /dev/null
+++ b/deps/googletest
@@ -0,0 +1 @@
+Subproject commit 2fe3bd994b3189899d93f1d5a881e725e046fdc2
diff --git a/deps/invoke b/deps/invoke
new file mode 160000
index 0000000000..2c1eabc2e2
--- /dev/null
+++ b/deps/invoke
@@ -0,0 +1 @@
+Subproject commit 2c1eabc2e20ab02961f95c704ff0c0818671ddd1
diff --git a/deps/optional b/deps/optional
new file mode 160000
index 0000000000..c28fcf74d2
--- /dev/null
+++ b/deps/optional
@@ -0,0 +1 @@
+Subproject commit c28fcf74d207fc667c4ed3dbae4c251ea551c8c1
diff --git a/deps/pybind11 b/deps/pybind11
new file mode 160000
index 0000000000..8de7772cc7
--- /dev/null
+++ b/deps/pybind11
@@ -0,0 +1 @@
+Subproject commit 8de7772cc72daca8e947b79b83fea46214931604
diff --git a/deps/variant b/deps/variant
new file mode 160000
index 0000000000..23cb94f027
--- /dev/null
+++ b/deps/variant
@@ -0,0 +1 @@
+Subproject commit 23cb94f027d4ef33bf48133acc2695c7e5c6f1e7
diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt
index a963c7b49b..f741a7f554 100644
--- a/lib/kernels/CMakeLists.txt
+++ b/lib/kernels/CMakeLists.txt
@@ -37,3 +37,5 @@ set_target_properties(
   PROPERTIES 
   CUDA_STANDARD 17
 )
+
+add_subdirectory(test)
\ No newline at end of file
diff --git a/lib/kernels/include/kernels/profiling.h b/lib/kernels/include/kernels/profiling.h
index 6a06a7b7a5..210e35934f 100644
--- a/lib/kernels/include/kernels/profiling.h
+++ b/lib/kernels/include/kernels/profiling.h
@@ -7,16 +7,18 @@
 
 namespace FlexFlow {
 
-struct ProfilingSettings : public use_visitable_cmp<ProfilingSettings> {
+struct ProfilingSettings {
 public:
   ProfilingSettings() = delete;
   ProfilingSettings(int warmup_iters, int measure_iters);
 
 public:
   int warmup_iters;
-  int measure_iters;
+  req<int> measure_iters;
 };
 
+FF_VISITABLE_STRUCT(ProfilingSettings, warmup_iters, measure_iters);
+
 template <typename F, typename... Ts>
 optional<float>
     profiling_wrapper(F const &f, bool enable_profiling, Ts &&...ts) {
diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc
index 44507c14c4..cf51616641 100644
--- a/lib/kernels/src/array_shape.cc
+++ b/lib/kernels/src/array_shape.cc
@@ -7,7 +7,61 @@ ArrayShape::ArrayShape(size_t *_dims, size_t num_dims)
     : dims(_dims, _dims + num_dims) {}
 
 std::size_t ArrayShape::get_volume() const {
-  return product(this->dims);
+  return this->num_elements();
+}
+
+std::size_t ArrayShape::get_dim() const {
+  return this->num_dims();
+}
+
+std::size_t ArrayShape::num_elements() const {
+  return this->product(this->dims);
+}
+
+std::size_t ArrayShape::num_dims() const {
+  return this->dims.size();
+}
+
+std::size_t ArrayShape::operator[](legion_dim_t idx) const {
+  return this->dims.at(idx);
+}
+
+std::size_t ArrayShape::at(legion_dim_t idx) const {
+  return this->dims.at(idx);
+}
+
+legion_dim_t ArrayShape::last_idx() const {
+  return legion_dim_t(this->dims.size() - 1);
+}
+
+legion_dim_t ArrayShape::neg_idx(int idx) const {
+  assert(idx < 0 && "Idx should be negative for negative indexing");
+  return legion_dim_t(this->dims.size() + idx);
+}
+
+optional<std::size_t> ArrayShape::at_maybe(std::size_t idx) const {
+  if (idx < this->dims.size()) {
+    return this->dims[legion_dim_t(idx)];
+  } else {
+    return {};
+  }
+}
+
+ArrayShape ArrayShape::reversed_dim_order() const {
+  std::vector<std::size_t> dims_reversed(this->dims.rbegin(), this->dims.rend());
+  return ArrayShape(dims_reversed);
+}
+
+ArrayShape ArrayShape::sub_shape(optional<legion_dim_t> start,
+                                 optional<legion_dim_t> end) {
+  size_t s = start.has_value() ? start.value().value() : 0;
+  size_t e = end.has_value() ? end.value().value() : this->dims.size();
+  std::vector<std::size_t> sub_dims(this->dims.begin() + s, this->dims.begin() + e);
+  return ArrayShape(sub_dims);
+}
+
+size_t get_volume(ArrayShape const &shape) {
+  return shape.get_volume();
 }
 
 } // namespace FlexFlow
diff --git a/lib/kernels/src/perf_metrics.cc b/lib/kernels/src/perf_metrics.cc
index 2036ddd35a..61163caeae 100644
--- a/lib/kernels/src/perf_metrics.cc
+++ b/lib/kernels/src/perf_metrics.cc
@@ -15,8 +15,9 @@ PerfMetrics::PerfMetrics(int _train_all,
                          double _start_time_micro,
                          double _current_time_micro)
     : train_all(_train_all), train_correct(_train_correct), cce_loss(_cce_loss),
-      mse_loss(_mse_loss), rmse_loss(_rmse_loss), mae_loss(_mae_loss),
-      start_time(_start_time_micro), current_time(_current_time_micro) {}
+      sparse_cce_loss(_sparse_cce_loss), mse_loss(_mse_loss),
+      rmse_loss(_rmse_loss), mae_loss(_mae_loss), start_time(_start_time_micro),
+      current_time(_current_time_micro) {}
 
 float get_throughput(PerfMetrics const &m) {
   return m.train_all / (m.current_time - m.start_time);
diff --git a/lib/kernels/test/CMakeLists.txt b/lib/kernels/test/CMakeLists.txt
new file mode 100644
index 0000000000..b6085b2ebd
--- /dev/null
+++ b/lib/kernels/test/CMakeLists.txt
@@ -0,0 +1,27 @@
+set(project_target kernel-test)
+project(${project_target} LANGUAGES CUDA)
+
+file(GLOB_RECURSE SRC
+     CONFIGURE_DEPENDS
+     LIST_DIRECTORIES False
+     src/*.cc)
+
+add_executable(
+  ${project_target}
+  ${SRC})
+
+target_link_libraries(
+  ${project_target}
+  kernels
+  cuda
+  rapidcheck
+  octest)
+
+set_target_properties(
+  ${project_target} 
+  PROPERTIES 
+  CUDA_STANDARD 11
+)
+
+define_ff_vars(${project_target})
+doctest_discover_tests(${project_target})
\ No newline at end of file
diff --git a/lib/kernels/test/src/doctest.h b/lib/kernels/test/src/doctest.h
new file mode 100644
index 0000000000..7cbb6c633a
--- /dev/null
+++ b/lib/kernels/test/src/doctest.h
@@ -0,0 +1 @@
+#include "doctest/doctest.h"
diff --git a/lib/kernels/test/src/main.cc b/lib/kernels/test/src/main.cc
new file mode 100644
index 0000000000..9522fa7fdb
--- /dev/null
+++ b/lib/kernels/test/src/main.cc
@@ -0,0 +1,2 @@
+#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN
+#include "doctest/doctest.h"
diff --git a/lib/kernels/test/src/test_accessor.cc b/lib/kernels/test/src/test_accessor.cc
new file mode 100644
index 0000000000..4792cc6f82
--- /dev/null
+++ b/lib/kernels/test/src/test_accessor.cc
@@ -0,0 +1,84 @@
+
+#include "doctest.h"
+#include "kernels/accessor.h"
+
+using namespace FlexFlow;
+
+TEST_CASE("Test GenericTensorAccessorW") {
+  float dataW = 3.14f;
+  GenericTensorAccessorW accessorW{
+      DataType::FLOAT, ArrayShape(std::vector<std::size_t>{}), &dataW};
+
+  // Test get method
+  CHECK(*accessorW.get<DataType::FLOAT>() == doctest::Approx(3.14f));
+
+  // Test specific type get ptr methods
+  CHECK(get_float_ptr(accessorW) != nullptr);
+  CHECK(*get_float_ptr(accessorW) == doctest::Approx(3.14f));
+
+  // Check runtime error for invalid access
+  CHECK_THROWS_WITH(accessorW.get<DataType::DOUBLE>(),
+                    "Invalid access data type (FLOAT != DOUBLE)");
+}
+
+TEST_CASE("Test GenericTensorAccessorR") {
+  float dataR = 7.89f;
+  GenericTensorAccessorR accessorR{
+      DataType::FLOAT, ArrayShape(std::vector<std::size_t>{}), &dataR};
+  // Test get method
+  CHECK(*accessorR.get<DataType::FLOAT>() == doctest::Approx(7.89f));
+
+  // Test specific type get ptr methods
+  CHECK(get_float_ptr(accessorR) != nullptr);
+  CHECK(*get_float_ptr(accessorR) == doctest::Approx(7.89f));
+
+  // Check runtime error for invalid access
+  CHECK_THROWS_WITH(accessorR.get<DataType::DOUBLE>(),
+                    "Invalid access data type (FLOAT != DOUBLE)");
+}
+
+TEST_CASE("Test get_int32_ptr for GenericTensorAccessorW") {
+  int32_t dataW = 12345;
+  GenericTensorAccessorW accessorW{
+      DataType::INT32, ArrayShape(std::vector<std::size_t>{}), &dataW};
+
+  // Test get_int32_ptr method
+  CHECK(get_int32_ptr(accessorW) != nullptr);
+  CHECK(*get_int32_ptr(accessorW) == 12345);
+}
+
+TEST_CASE("Test get_int64_ptr for GenericTensorAccessorW") {
+  int64_t dataW = 1234567890LL;
+  GenericTensorAccessorW accessorW{
+      DataType::INT64, ArrayShape(std::vector<std::size_t>{}), &dataW};
+  // Test get_int64_ptr method
+  CHECK(get_int64_ptr(accessorW) != nullptr);
+  CHECK(*get_int64_ptr(accessorW) == 1234567890LL);
+}
+
+TEST_CASE("Test get_float_ptr for GenericTensorAccessorW") {
+  float dataW = 3.14f;
+  GenericTensorAccessorW accessorW{
+      DataType::FLOAT, ArrayShape(std::vector<std::size_t>{}), &dataW};
+  // Test get_float_ptr method
+  CHECK(get_float_ptr(accessorW) != nullptr);
+  CHECK(*get_float_ptr(accessorW) == doctest::Approx(3.14f));
+}
+
+TEST_CASE("Test get_double_ptr for GenericTensorAccessorW") {
+  double dataW = 6.28;
+  GenericTensorAccessorW accessorW{
+      DataType::DOUBLE, ArrayShape(std::vector<std::size_t>{}), &dataW};
+  // Test get_double_ptr method
+  CHECK(get_double_ptr(accessorW) != nullptr);
+  CHECK(*get_double_ptr(accessorW) == doctest::Approx(6.28));
+}
+
+TEST_CASE("Test get_int32_ptr for GenericTensorAccessorR") {
+  int32_t dataR = 67890;
+  GenericTensorAccessorR accessorR{
+      DataType::INT32, ArrayShape(std::vector<std::size_t>{}), &dataR};
+  // Test get_int32_ptr method
+  CHECK(get_int32_ptr(accessorR) != nullptr);
+  CHECK(*get_int32_ptr(accessorR) == 67890);
+}
diff --git a/lib/kernels/test/src/test_array_shape.cc b/lib/kernels/test/src/test_array_shape.cc
new file mode 100644
index 0000000000..dc15e13f0d
--- /dev/null
+++ b/lib/kernels/test/src/test_array_shape.cc
@@ -0,0 +1,46 @@
+#include "doctest.h"
+#include "kernels/array_shape.h" // Assuming this is where your ArrayShape is
+#include "kernels/legion_dim.h"
+
+using namespace FlexFlow;
+
+TEST_CASE("ArrayShape Initialization and Basic Functions") {
+  std::vector<std::size_t> dims = {2, 3, 4};
+  ArrayShape shape(dims);
+  CHECK(shape.get_dim() == 3);
+  CHECK(shape.get_volume() == 24);
+  CHECK(shape.num_elements() == 24);
+  CHECK(shape.num_dims() == 3);
+  CHECK(shape[legion_dim_t(1)] == 3);
+  CHECK(shape.at(legion_dim_t(2)) == 4);
+  CHECK(shape.at(ff_dim_t(2)) == 4);
+}
+
+TEST_CASE("Negative Indices and Optional Indexing") {
+  std::vector<std::size_t> dims = {2, 3, 4};
+  ArrayShape shape(dims);
+
+  CHECK(shape.neg_idx(-1) == legion_dim_t(2));
+  CHECK(shape.neg_idx(-3) == legion_dim_t(0));
+
+  CHECK(shape.at_maybe(0) == 2);
+  CHECK(shape.at_maybe(2) == 4);
+  CHECK(!shape.at_maybe(5).has_value());
+}
+
+TEST_CASE("Reversed Dim Order and Sub-shape") {
+  using namespace FlexFlow;
+
+  std::vector<std::size_t> dims = {2, 3, 4};
+  ArrayShape shape(dims);
+
+  ArrayShape reversed = shape.reversed_dim_order();
+  CHECK(reversed[legion_dim_t(0)] == 4);
+  CHECK(reversed[legion_dim_t(1)] == 3);
+  CHECK(reversed[legion_dim_t(2)] == 2);
+
+  ArrayShape sub = shape.sub_shape(legion_dim_t(0), legion_dim_t(2));
+  CHECK(sub.get_dim() == 2);
+  CHECK(sub[legion_dim_t(0)] == 2);
+  CHECK(sub[legion_dim_t(1)] == 3);
+}
diff --git a/lib/kernels/test/src/test_datatype_dispatch.cc b/lib/kernels/test/src/test_datatype_dispatch.cc
new file mode 100644
index 0000000000..a60617e8e7
--- /dev/null
+++ b/lib/kernels/test/src/test_datatype_dispatch.cc
@@ -0,0 +1,45 @@
+#include "doctest.h"
+#include "kernels/datatype_dispatch.h"
+
+using namespace FlexFlow;
+
+template <DataType DT>
+struct Function1 {
+  int operator()(int value) const {
+    if (DT == DataType::FLOAT) {
+      return value + 1;
+    }
+    if (DT == DataType::DOUBLE) {
+      return value + 2;
+    }
+    return 0;
+  }
+};
+
+TEST_CASE("Testing dispatch function") {
+  int value = 10;
+  int result = dispatch<Function1>(DataType::FLOAT, value);
+  CHECK(result == 11);
+}
+
+// test DataTypeDispatch1
+TEST_CASE("Testing DataTypeDispatch1") {
+  DataTypeDispatch1<Function1> dispatcher;
+  int value = 10;
+  int result = dispatcher(DataType::FLOAT, value);
+  CHECK(result == 11);
+}
+
+TEST_CASE("Testing dispatch function double") {
+  int value = 10;
+  int result = dispatch<Function1>(DataType::DOUBLE, value);
+  CHECK(result == 11);
+}
+
+
+TEST_CASE("Testing DataTypeDispatch1 Double") {
+  DataTypeDispatch1<Function1> dispatcher;
+  int value = 10;
+  int result = dispatcher(DataType::DOUBLE, value);
+  CHECK(result == 11);
+}
\ No newline at end of file
diff --git a/lib/kernels/test/src/test_legion_dim.cc b/lib/kernels/test/src/test_legion_dim.cc
new file mode 100644
index 0000000000..1b8aa6f3b6
--- /dev/null
+++ b/lib/kernels/test/src/test_legion_dim.cc
@@ -0,0 +1,55 @@
+#include "doctest.h"
+#include "kernels/legion_dim.h"
+
+using namespace FlexFlow;
+
+TEST_CASE("Testing DimOrdered") {
+  SUBCASE("constructor method") {
+    DimOrdered<legion_dim_t, int> fromInitList = {1, 2, 3};
+    CHECK(fromInitList.size() == 3);
+    std::vector<int> vec = {4, 5, 6};
+    DimOrdered<legion_dim_t, int> fromVector(vec);
+    CHECK(fromVector.size() == 3);
+  }
+
+  SUBCASE("at") {
+    DimOrdered<legion_dim_t, int> dimOrder = {1, 2, 3};
+    CHECK(dimOrder[legion_dim_t(0)] == 1);
+    CHECK(dimOrder[legion_dim_t(1)] == 2);
+    CHECK(dimOrder[legion_dim_t(2)] == 3);
+    CHECK(dimOrder[ff_dim_t(0)] == 1);
+    CHECK(dimOrder[ff_dim_t(1)] == 2);
+    CHECK(dimOrder[ff_dim_t(2)] == 3);
+  }
+
+  SUBCASE("comparsion") {
+    DimOrdered<legion_dim_t, int> order1 = {1, 2, 3};
+    DimOrdered<legion_dim_t, int> order2 = {1, 2, 4};
+    DimOrdered<legion_dim_t, int> order3 = {1, 2, 3};
+
+    CHECK(order1 != order2);
+    CHECK(order1 == order3);
+  }
+
+  SUBCASE("iterator") {
+    DimOrdered<legion_dim_t, int> dimOrder = {1, 2, 3};
+    int sum = 0;
+    for (int value : dimOrder) {
+      sum += value;
+    }
+    CHECK(sum == 6);
+  }
+}
+
+TEST_CASE("Testing LegionTensorDims") {
+
+  SUBCASE("LegionTensorDims Basic Operation") {
+    LegionTensorDims tensorDims = {100, 200};
+
+    // tensorDims[legion_dim_t(1)] = 100;
+    CHECK(tensorDims[legion_dim_t(0)] == 100);
+
+    //  tensorDims[legion_dim_t(2)] = 200;
+    CHECK(tensorDims[legion_dim_t(1)] == 200);
+  }
+}
diff --git a/lib/kernels/test/src/test_perf_metrics.cc b/lib/kernels/test/src/test_perf_metrics.cc
new file mode 100644
index 0000000000..f6f3d41c08
--- /dev/null
+++ b/lib/kernels/test/src/test_perf_metrics.cc
@@ -0,0 +1,96 @@
+#include "doctest.h"
+#include "kernels/perf_metrics.h"
+#include <random>
+
+using namespace FlexFlow;
+
+// Helper function to generate random values for PerfMetrics
+PerfMetrics randomPerfMetrics() {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  std::uniform_real_distribution<> dis(0.0, 1.0);
+
+  PerfMetrics metrics{0};
+  metrics.train_all = dis(gen);
+  metrics.train_correct = dis(gen);
+  metrics.cce_loss = dis(gen);
+  metrics.sparse_cce_loss = dis(gen);
+  metrics.mse_loss = dis(gen);
+  metrics.rmse_loss = dis(gen);
+  metrics.mae_loss = dis(gen);
+  metrics.start_time = dis(gen);
+  metrics.current_time = dis(gen);
+
+  return metrics;
+}
+
+TEST_CASE("PerfMetricsTests1 " ) {
+
+  SUBCASE("Throughput non-negative") {
+    auto m = randomPerfMetrics();
+    CHECK(get_throughput(m) >= 0);
+  }
+
+  SUBCASE("Accuracy between 0 and 1") {
+    auto m = randomPerfMetrics();
+    float accuracy = get_accuracy(m);
+    CHECK(accuracy >= 0.0f);
+    CHECK(accuracy <= 1.0f);
+  }
+
+  SUBCASE("Update maintains non-negative values") {
+    auto lhs = randomPerfMetrics();
+    auto rhs = randomPerfMetrics();
+    auto result = update(lhs, rhs);
+    CHECK(result.train_all >= 0);
+    // Add other assertions for other fields...
+  }
+}
+
+
+TEST_CASE("PerfMetrics Tests") {
+
+  SUBCASE("Constructor and basic properties") {
+    double start_time = 100.0;
+    PerfMetrics metrics(start_time);
+
+    CHECK(metrics.start_time == start_time);
+    CHECK(metrics.current_time == start_time);
+    CHECK(metrics.train_all == 0);
+  }
+
+  SUBCASE("Throughput non-negative") {
+    PerfMetrics m(10, 5, 0.5, 0.5, 0.5, 0.5, 0.5, 100.0, 200.0);
+    CHECK(get_throughput(m) >= 0);
+  }
+
+  SUBCASE("Accuracy between 0 and 1") {
+    PerfMetrics m(10, 5, 0.5, 0.5, 0.5, 0.5, 0.5, 100.0, 200.0);
+    float accuracy = get_accuracy(m);
+    CHECK(accuracy >= 0.0f);
+    CHECK(accuracy <= 1.0f);
+  }
+
+  SUBCASE("Update maintains non-negative values") {
+    PerfMetrics lhs(10, 5, 0.5, 0.5, 0.5, 0.5, 0.5, 100.0, 200.0);
+    PerfMetrics rhs(5, 3, 0.2, 0.2, 0.2, 0.2, 0.2, 200.0, 300.0);
+
+    auto result = update(lhs, rhs);
+    CHECK(result.train_all == 15);
+    CHECK(result.train_correct.value() == 8);
+  }
+
+  SUBCASE("Scale values correctly") {
+    PerfMetrics pm(10, 5, 0.5, 0.5, 0.5, 0.5, 0.5, 100.0, 200.0);
+    float scale = 2.0f;
+
+    auto result = apply_scale(pm, scale);
+    CHECK(result.cce_loss.value() == doctest::Approx(1.0f));
+    CHECK(result.sparse_cce_loss.value() == doctest::Approx(1.0f));
+    CHECK(result.mse_loss.value() == doctest::Approx(1.0f));
+    CHECK(result.rmse_loss.value() == doctest::Approx(1.0f));
+    CHECK(result.mae_loss.value() == doctest::Approx(1.0f));
+  }
+}
+