From de7fa32511e77fc34cb9a9f09df12dba826a84ab Mon Sep 17 00:00:00 2001
From: Marsella8 <45826022+Marsella8@users.noreply.github.com>
Date: Fri, 24 Jan 2025 15:55:04 -0800
Subject: [PATCH 1/5] Task Simulator (#1565)

* task_simulator_forward_pass
* task simulator fixes
* additional testing + bug fix
* task simulator fix
* refactor to task simulator
* minor fix
* task simulator refactor
* added testing
* added task graph profiler
* fixes
* minor refactoring + moving things around
* interface update
* minor fix
* merge fix
* fixes
* minor fix
* fixes
* fixes
* fmt
* fixes
* uncommented test
* fmt
* test fix
* fix

---------

Co-authored-by: Pietro Max Marsella
Co-authored-by: Colin Unger
---
 .../cost_estimator/op_cost_estimate_key.h     |  19 ++
 .../op_cost_metrics.struct.toml               |   6 +-
 .../cost_estimator/tensor_set_movement.h      |  19 ++
 .../machine_mapping/machine_mapping.h         |   4 +
 .../unstructured_device_mapping.h             |  18 ++
 .../unstructured_device_mapping.struct.toml   |  26 ++
 .../in_progress_task.struct.toml              |  26 ++
 .../in_progress_task_comparator.h             |  13 +
 .../pcg_task.variant.toml                     |  20 ++
 .../task_graph_simulator/pcg_task_graph.h     |  17 ++
 .../pcg_task_graph.struct.toml                |  34 +++
 .../simulate_task_graph_execution.h           |  17 ++
 .../task_execution_constraint.struct.toml     |  15 +
 .../task_graph_execution_state.struct.toml    |  40 +++
 .../task_graph_execution_trace.h              |  12 +
 .../task_graph_execution_trace.struct.toml    |  23 ++
 .../task_profile.struct.toml                  |  26 ++
 .../task_graph_simulator/task_simulator.h     |  18 ++
 .../src/compiler/allowed_machine_views.cc     |   4 +
 .../cost_estimator/op_cost_estimate_key.cc    |  23 ++
 .../cost_estimator/tensor_set_movement.cc     |  16 ++
 .../get_optimal_machine_mapping.cc            |   5 +-
 .../machine_mapping/machine_mapping.cc        |  13 +-
 .../machine_mapping_with_memory_result.cc     |  13 +-
 .../unstructured_device_mapping.cc            |  28 ++
 .../in_progress_task_comparator.cc            |  11 +
 .../task_graph_simulator/pcg_task_graph.cc    |  59 ++++
 .../simulate_task_graph_execution.cc          | 107 +++++++
 .../task_graph_execution_trace.cc             |  27 ++
 .../task_graph_simulator/task_simulator.cc    |  71 +++++
 .../cost_estimator_for_test.cc                |  13 +
 .../cost_estimator_for_test.h                 |  10 +-
 .../get_optimal_machine_mapping.cc            |  19 +-
 .../get_tensor_set_movement_across_split.cc   |   2 +-
 .../machine_mapping/machine_mapping.cc        |   1 -
 ...get_optimal_machine_mapping_with_memory.cc |  48 +++-
 .../machine_mapping_result_with_memory.cc     |  51 +++-
 .../simulate_task_graph_execution.cc          | 211 ++++++++++++++
 .../task_graph_simulator/task_simulator.cc    | 265 ++++++++++++++++++
 lib/pcg/include/pcg/machine_specification.h   |   1 +
 lib/pcg/include/pcg/machine_view.h            |   8 +
 lib/pcg/include/pcg/operator_task_space.h     |   5 +
 .../parallel_computation_graph.h              |  18 ++
 lib/pcg/src/pcg/machine_specification.cc      |   1 +
 lib/pcg/src/pcg/machine_view.cc               |  57 +++-
 lib/pcg/src/pcg/operator_task_space.cc        |  21 +-
 .../parallel_computation_graph.cc             |  44 +++
 .../parallel_computation_graph_edge.cc        |   1 +
 lib/pcg/test/src/pcg/machine_view.cc          |  91 ++++++
 .../parallel_computation_graph.cc             |  80 +++++-
 lib/runtime/src/parallel_compuation_graph.cc  |   7 -
 .../include/utils/archetypes/value_type.h     |  13 +
 .../include/utils/containers/lookup_in_map.h  |  27 ++
 lib/utils/include/utils/containers/minimum.h  |  21 ++
 .../utils/deduplicated_priority_queue.h       |  11 +
 .../algorithms/get_outgoing_edges.h           |  16 ++
 .../src/utils/containers/lookup_in_map.cc     |  12 +
 lib/utils/src/utils/containers/minimum.cc     |   1 +
 .../algorithms/get_outgoing_edges.cc          |  28 ++
 .../src/utils/containers/lookup_in_map.cc     |  31 ++
 .../algorithms/get_incoming_edges.cc          |  51 ++++
 .../algorithms/get_outgoing_edges.cc          |  90 ++++++
 62 files changed, 1923 insertions(+), 62 deletions(-)
 create mode 100644 lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h
 create mode 100644 lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h
 create mode 100644 lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h
 create mode 100644 lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml
 create mode 100644 lib/compiler/include/compiler/task_graph_simulator/task_simulator.h
 create mode 100644 lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc
 create mode 100644 lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc
 create mode 100644 lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc
 create mode 100644 lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc
 create mode 100644 lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc
 create mode 100644 lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
 create mode 100644 lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc
 create mode 100644 lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc
 rename lib/compiler/test/src/compiler/{machine_mapping => }/cost_estimator_for_test.cc (72%)
 rename lib/compiler/test/src/compiler/{machine_mapping => }/cost_estimator_for_test.h (77%)
 create mode 100644 lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
 create mode 100644 lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc
 delete mode 100644 lib/runtime/src/parallel_compuation_graph.cc
 create mode 100644 lib/utils/include/utils/containers/lookup_in_map.h
 create mode 100644 lib/utils/include/utils/containers/minimum.h
 create mode 100644 lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h
 create mode 100644 lib/utils/src/utils/containers/lookup_in_map.cc
 create mode 100644 lib/utils/src/utils/containers/minimum.cc
 create mode 100644 lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc
 create mode 100644 lib/utils/test/src/utils/containers/lookup_in_map.cc
 create mode 100644 lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc
 create mode 100644 lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc
diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h
new file mode 100644
index 0000000000..93a1143cde
--- /dev/null
+++ b/lib/compiler/include/compiler/cost_estimator/op_cost_estimate_key.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_OP_COST_ESTIMATE_KEY_H
+
+#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+
+namespace FlexFlow {
+
+OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer(
+    ParallelComputationGraph const &pcg,
+    parallel_layer_guid_t const &layer,
+    MachineView const &machine_view);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
index d2ff3f42e7..5e81d6c10e 100644
--- a/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
+++ b/lib/compiler/include/compiler/cost_estimator/op_cost_metrics.struct.toml
@@ -11,7 +11,11 @@ includes = [
 ]
 
 [[fields]]
-name = "runtime"
+name = "forward_runtime"
+type = "float"
+
+[[fields]]
+name = "backward_runtime"
 type = "float"
 
 [[fields]]
diff --git a/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h b/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h
new file mode 100644
index 0000000000..34188ff97c
--- /dev/null
+++ b/lib/compiler/include/compiler/cost_estimator/tensor_set_movement.h
@@ -0,0 +1,19 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TENSOR_SET_MOVEMENT_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TENSOR_SET_MOVEMENT_H
+
+#include "compiler/cost_estimator/tensor_set_movement.dtg.h"
+#include "pcg/machine_view.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
+
+namespace FlexFlow {
+
+TensorSetMovement get_tensor_set_movement_from_pcg_edge(
+    ParallelComputationGraphEdge const &edge,
+    ParallelComputationGraph const &pcg,
+    MachineView const &src_mv,
+    MachineView const &dst_mv);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
index 06cbbf942d..7375cde985 100644
--- a/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
+++ b/lib/compiler/include/compiler/machine_mapping/machine_mapping.h
@@ -2,6 +2,10 @@
 #define _FLEXFLOW_COMPILER_MACHINE_MAPPING_H
 
 #include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
 
 namespace FlexFlow {
 
diff --git a/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h
new file mode 100644
index 0000000000..0fb31210fd
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.h
@@ -0,0 +1,18 @@
+#ifndef _FLEXFLOW_COMPILER_MACHINE_MAPPING_UNSTRUCTURED_DEVICE_MAPPING_H
+#define _FLEXFLOW_COMPILER_MACHINE_MAPPING_UNSTRUCTURED_DEVICE_MAPPING_H
+
+#include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h"
+
+namespace FlexFlow {
+
+UnstructuredDeviceMapping
+    get_unstructured_device_mapping(MachineMapping const &machine_mapping,
+                                    MachineSpecification const &machine_spec,
+                                    ParallelComputationGraph const &pcg);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml
new file mode 100644
index 0000000000..ae38a37292
--- /dev/null
+++ b/lib/compiler/include/compiler/machine_mapping/unstructured_device_mapping.struct.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "UnstructuredDeviceMapping"
+features = [
+  "eq",
+  # "ord",
+  "hash",
+  # "json",
+  # "rapidcheck",
+  "fmt",
+]
+
+includes = [
+  "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h",
+  "pcg/device_id_t.dtg.h"
+]
+
+src_includes = [
+  "utils/hash/unordered_map.h",
+  "utils/fmt/unordered_map.h",
+  "utils/hash/unordered_set.h",
+  "utils/fmt/unordered_set.h"
+]
+
+[[fields]]
+name = "raw_device_map"
+type = "std::unordered_map<::FlexFlow::parallel_layer_guid_t, std::unordered_set<::FlexFlow::device_id_t>>"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml
new file mode 100644
index 0000000000..71e0e17f5e
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task.struct.toml
@@ -0,0 +1,26 @@
+namespace = "FlexFlow"
+name = "InProgressTask"
+
+features = [
+  "eq",
+  "hash",
+  "fmt",
+  "ord"
+]
+
+includes = [
+  "utils/graph/node/node.dtg.h"
+]
+
+
+[[fields]]
+name = "start_time"
+type = "float"
+
+[[fields]]
+name = "end_time"
+type = "float"
+
+[[fields]]
+name = "node"
+type = "::FlexFlow::Node"
diff --git a/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h
new file mode 100644
index 0000000000..ed509cb7be
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/in_progress_task_comparator.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H
+#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H
+
+#include "compiler/task_graph_simulator/in_progress_task.dtg.h"
+#include
+
+namespace FlexFlow {
+struct InProgressTaskComparator {
+  bool operator()(InProgressTask const &lhs, InProgressTask const &rhs) const;
+};
+} // namespace FlexFlow
+
+#endif // _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_IN_PROGRESS_TASK_COMPARATOR_H
diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml
new file mode 100644
index 0000000000..13f2f17652
--- /dev/null
+++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task.variant.toml
@@ -0,0 +1,20 @@
+namespace = "FlexFlow"
+name = "PCGTask"
+features = [
+  "eq",
+  "hash",
+  "fmt",
+]
+
+includes = [
"compiler/cost_estimator/op_cost_estimate_key.dtg.h", + "compiler/cost_estimator/tensor_set_movement.dtg.h", +] + +[[values]] +type = "::FlexFlow::OpCostEstimateKey" +key = "operator" + +[[values]] +type = "::FlexFlow::TensorSetMovement" +key = "tensor_movement" diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h new file mode 100644 index 0000000000..2c6d6514e8 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_PCG_TASK_GRAPH_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_PCG_TASK_GRAPH_H + +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "compiler/task_graph_simulator/pcg_task_graph.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" + +namespace FlexFlow { + +PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg, + MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml new file mode 100644 index 0000000000..099f44c564 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/pcg_task_graph.struct.toml @@ -0,0 +1,34 @@ +namespace = "FlexFlow" +name = "PCGTaskGraph" + +features = [ +] + +includes = [ + "utils/graph/digraph/digraph_view.h", + "utils/bidict/bidict.h", + "compiler/task_graph_simulator/pcg_task.dtg.h", + "pcg/device_id_t.dtg.h", + "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h", + "", + "" +] + +src_includes = [ + "utils/fmt/unordered_set.h", + "utils/hash/unordered_set.h", + "utils/fmt/unordered_map.h", + "utils/hash/unordered_map.h" +] + +[[fields]] +name = "graph" +type = "::FlexFlow::DiGraphView" + +[[fields]] +name = "node_to_task" +type = "::FlexFlow::bidict<::FlexFlow::Node, ::FlexFlow::PCGTask>" + +[[fields]] +name = "node_to_devices" +type = "std::unordered_map<::FlexFlow::Node, std::unordered_set<::FlexFlow::device_id_t>>" diff --git a/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h b/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h new file mode 100644 index 0000000000..424e65f9df --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/simulate_task_graph_execution.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_SIMULATE_TASK_GRAPH_EXECUTION_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_SIMULATE_TASK_GRAPH_EXECUTION_H + +#include "compiler/task_graph_simulator/task_execution_constraint.dtg.h" +#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h" +#include "utils/graph/digraph/digraph_view.h" +#include +namespace FlexFlow { + +TaskGraphExecutionTrace simulate_task_graph_execution( + DiGraphView const &task_graph, + std::function cost_function, + TaskExecutionConstraint const &constraint); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml new file mode 100644 index 0000000000..004655b5ec --- /dev/null +++ 
b/lib/compiler/include/compiler/task_graph_simulator/task_execution_constraint.struct.toml @@ -0,0 +1,15 @@ +namespace = "FlexFlow" +name = "TaskExecutionConstraint" +features = [ +] + +includes = [ + "utils/graph/node/node.dtg.h", + "", + "" +] + + +[[fields]] +name = "is_satisfied" +type = "std::function const &, std::unordered_set const &)>" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml new file mode 100644 index 0000000000..b96d7264b9 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_state.struct.toml @@ -0,0 +1,40 @@ +namespace = "FlexFlow" +name = "TaskGraphExecutionState" + +features = [ +] + +includes = [ + "utils/deduplicated_priority_queue.h", + "utils/graph/node/node.dtg.h", + "compiler/task_graph_simulator/in_progress_task.dtg.h", + "compiler/task_graph_simulator/in_progress_task_comparator.h", + "", + "", + "" +] + +src_includes = [ + "utils/hash/unordered_set.h", + "utils/fmt/unordered_set.h", + "utils/hash/set.h", + "utils/fmt/set.h", + "utils/fmt/vector.h", + "utils/hash/vector.h" +] + +[[fields]] +name = "ready_tasks" +type = "std::set<::FlexFlow::Node>" + +[[fields]] +name = "in_progress_tasks" +type = "::FlexFlow::DeduplicatedPriorityQueue<::FlexFlow::InProgressTask, std::vector<::FlexFlow::InProgressTask>, ::FlexFlow::InProgressTaskComparator>" + +[[fields]] +name = "finished_tasks" +type = "std::unordered_set<::FlexFlow::Node>" + +[[fields]] +name = "current_time" +type = "float" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h new file mode 100644 index 0000000000..0ad5b4824b --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H + +#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h" + +namespace FlexFlow { + +float get_total_execution_time(TaskGraphExecutionTrace const &trace); + +} // namespace FlexFlow + +#endif // _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_TASK_GRAPH_SIMULATOR_TASK_GRAPH_EXECUTION_TRACE_H diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml new file mode 100644 index 0000000000..3003e5a157 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_graph_execution_trace.struct.toml @@ -0,0 +1,23 @@ +namespace = "FlexFlow" +name = "TaskGraphExecutionTrace" + +features = [ + "hash", + "fmt", + "eq" +] + +includes = [ + "compiler/task_graph_simulator/task_profile.dtg.h", + "" +] + +src_includes = [ + "utils/fmt/unordered_set.h", + "utils/hash/unordered_set.h" +] + + +[[fields]] +name = "task_profiles" +type = "std::unordered_set<::FlexFlow::TaskProfile>" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml b/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml new file mode 100644 index 0000000000..1a47acfa0e --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_profile.struct.toml @@ -0,0 +1,26 @@ +namespace = "FlexFlow" +name = 
"TaskProfile" + +features = [ + "eq", + "hash", + "fmt", + "ord" +] + +includes = [ + "utils/graph/node/node.dtg.h" +] + + +[[fields]] +name = "node" +type = "::FlexFlow::Node" + +[[fields]] +name = "start_time" +type = "float" + +[[fields]] +name = "end_time" +type = "float" diff --git a/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h new file mode 100644 index 0000000000..b35733e419 --- /dev/null +++ b/lib/compiler/include/compiler/task_graph_simulator/task_simulator.h @@ -0,0 +1,18 @@ +#ifndef _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H +#define _FLEXFLOW_LIB_COMPILER_INCLUDE_COMPILER_COST_ESTIMATOR_TASK_SIMULATOR_H + +#include "compiler/cost_estimator/cost_estimator.h" +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" + +namespace FlexFlow { +float task_simulator_estimate_forward_pass_time( + ParallelComputationGraph const &pcg, + CostEstimator const &estimator, + MachineMapping const &machine_mapping, + MachineSpecification const &machine_spec); + +} // namespace FlexFlow + +#endif diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index 1c226f79b0..db7477b460 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -24,6 +24,10 @@ namespace FlexFlow { bool is_valid_machine_view(MachineView const &mv, OperatorTaskSpace const &task, MachineSpecification const &ms) { + if (num_dims(mv) != num_dims(task)) { + return false; + } + std::optional maximum_device_coord = get_machine_space_coordinate( task, mv, get_task_space_maximum_coordinate(task), ms); diff --git a/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc new file mode 100644 index 0000000000..ef5775851f --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/op_cost_estimate_key.cc @@ -0,0 +1,23 @@ +#include "compiler/cost_estimator/op_cost_estimate_key.h" +#include "compiler/cost_estimator/op_cost_estimate_key.dtg.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "pcg/device_id_t.dtg.h" +#include "pcg/machine_specification.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/machine_view.h" +#include "pcg/operator_task_space.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include + +namespace FlexFlow { + +OpCostEstimateKey get_mapped_op_cost_estimate_key_for_layer( + ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer, + MachineView const &machine_view) { + return map_unmapped_op_cost_estimate_key( + get_unmapped_op_cost_estimate_key_for_layer(pcg, layer), machine_view); +} + +} // namespace FlexFlow diff --git a/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc b/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc new file mode 100644 index 0000000000..8f2ab84b84 --- /dev/null +++ b/lib/compiler/src/compiler/cost_estimator/tensor_set_movement.cc @@ -0,0 +1,16 @@ +#include "compiler/cost_estimator/tensor_set_movement.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +namespace FlexFlow { + +TensorSetMovement get_tensor_set_movement_from_pcg_edge( + 
+    ParallelComputationGraphEdge const &edge,
+    ParallelComputationGraph const &pcg,
+    MachineView const &src_mv,
+    MachineView const &dst_mv) {
+  ParallelTensorShape tensor_shape =
+      get_parallel_tensor_shape(pcg, parallel_tensor_guid_t{edge.raw_edge.src});
+  return TensorSetMovement{
+      {SingleTensorMovement{tensor_shape, {src_mv}, {dst_mv}}}};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index 5bdd8645a5..49d528e4ab 100644
--- a/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -1,4 +1,5 @@
 #include "compiler/machine_mapping/get_optimal_machine_mapping.h"
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/get_machine_resource_splits.h"
 #include "compiler/machine_mapping/machine_mapping_cache.h"
@@ -240,8 +241,8 @@ MachineMappingResult
   auto get_mapping_result = [&](MachineView const &machine_view) {
     OpCostEstimateKey mapped =
         map_unmapped_op_cost_estimate_key(leaf, machine_view);
-    float cost = context.cost_estimator.estimate_cost(mapped).runtime;
-
+    OpCostMetrics metrics = context.cost_estimator.estimate_cost(mapped);
+    float cost = metrics.forward_runtime + metrics.backward_runtime;
     return make_singleton_machine_mapping_result(cost, machine_view);
   };
 
diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
index 57e82684e9..fc3a58995c 100644
--- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,13 +1,20 @@
 #include "compiler/machine_mapping/machine_mapping.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
 #include "utils/containers/are_disjoint.h"
+#include "utils/containers/get_one_of.h"
 #include "utils/containers/keys.h"
+#include "utils/containers/map_values.h"
 #include "utils/containers/merge_maps.h"
 
 namespace FlexFlow {
 
-MachineMapping combine_disjoint_mappings(MachineMapping const &s1,
-                                         MachineMapping const &s2) {
-  return MachineMapping{merge_maps(s1.machine_views, s2.machine_views)};
+MachineMapping combine_disjoint_mappings(MachineMapping const &m1,
+                                         MachineMapping const &m2) {
+  return MachineMapping{merge_maps(m1.machine_views, m2.machine_views)};
 }
 
 bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) {
diff --git a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
index a6c2d1ed04..9b4a1fd6fe 100644
--- a/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
+++ b/lib/compiler/src/compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.cc
@@ -30,7 +30,9 @@ MachineMappingWithMemoryResult remove_non_pareto_optimal_machine_mapping_result(
     bool is_pareto_optimal = true;
     for (MachineMappingForSingleLayer const &other_mapping :
          result.machine_mappings) {
-      if (mapping.cost.runtime >= other_mapping.cost.runtime &&
+      if (mapping.cost.forward_runtime >= other_mapping.cost.forward_runtime &&
+          mapping.cost.backward_runtime >=
+              other_mapping.cost.backward_runtime &&
           mapping.cost.memory >= other_mapping.cost.memory &&
           mapping != other_mapping) {
         is_pareto_optimal = false;
@@ -54,7 +56,10 @@ MachineMappingWithMemoryResult
       [&](MachineMappingForSingleLayer const &pre_mm,
           MachineMappingForSingleLayer const &post_mm) {
         OpCostMetrics cost = OpCostMetrics{
-            pre_mm.cost.runtime + comm_cost + post_mm.cost.runtime,
+            pre_mm.cost.forward_runtime + comm_cost +
+                post_mm.cost.forward_runtime,
+            pre_mm.cost.backward_runtime + comm_cost +
+                post_mm.cost.backward_runtime,
             pre_mm.cost.memory + post_mm.cost.memory,
         };
 
@@ -93,7 +98,9 @@ MachineMappingWithMemoryResult
       [&](MachineMappingForSingleLayer const &lhs_mm,
           MachineMappingForSingleLayer const &rhs_mm) {
         OpCostMetrics cost = OpCostMetrics{
-            std::max(lhs_mm.cost.runtime, rhs_mm.cost.runtime),
+            std::max(lhs_mm.cost.forward_runtime, rhs_mm.cost.forward_runtime),
+            std::max(lhs_mm.cost.backward_runtime,
+                     rhs_mm.cost.backward_runtime), //(@wmdi) is this correct?
             std::max(lhs_mm.cost.memory, rhs_mm.cost.memory),
         };
 
diff --git a/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc b/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc
new file mode 100644
index 0000000000..63e359d9ac
--- /dev/null
+++ b/lib/compiler/src/compiler/machine_mapping/unstructured_device_mapping.cc
@@ -0,0 +1,28 @@
+
+#include "compiler/machine_mapping/unstructured_device_mapping.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h"
+#include "pcg/machine_specification.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.dtg.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "utils/containers/keys.h"
+#include "utils/containers/map_values.h"
+
+namespace FlexFlow {
+
+UnstructuredDeviceMapping
+    get_unstructured_device_mapping(MachineMapping const &machine_mapping,
+                                    MachineSpecification const &machine_spec,
+                                    ParallelComputationGraph const &pcg) {
+  std::unordered_map<parallel_layer_guid_t, std::unordered_set<device_id_t>>
+      device_mapping;
+  for (auto const &[layer, machine_view] : machine_mapping.machine_views) {
+    OperatorTaskSpace op = get_operator_task_space(pcg, layer);
+    device_mapping.insert(
+        {layer, get_device_ids(op, machine_view, machine_spec)});
+  }
+  return UnstructuredDeviceMapping{device_mapping};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc b/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc
new file mode 100644
index 0000000000..2064c56a52
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/in_progress_task_comparator.cc
@@ -0,0 +1,11 @@
+#include "compiler/task_graph_simulator/in_progress_task_comparator.h"
+#include <tuple>
+
+namespace FlexFlow {
+
+bool InProgressTaskComparator::operator()(InProgressTask const &lhs,
+                                          InProgressTask const &rhs) const {
+  return std::tie(lhs.end_time, lhs.node) > std::tie(rhs.end_time, rhs.node);
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc
new file mode 100644
index 0000000000..539c44a963
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/pcg_task_graph.cc
@@ -0,0 +1,59 @@
+#include "compiler/task_graph_simulator/pcg_task_graph.h"
+#include "compiler/cost_estimator/op_cost_estimate_key.h"
+#include "compiler/cost_estimator/tensor_set_movement.h"
+#include "compiler/machine_mapping/machine_mapping.dtg.h"
+#include "pcg/device_id_t.dtg.h"
+#include "pcg/machine_specification.dtg.h"
+#include "pcg/machine_view.dtg.h"
+#include "pcg/machine_view.h"
+#include "pcg/operator_task_space.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "utils/bidict/bidict.h"
+#include "utils/graph/instances/adjacency_digraph.h"
+#include <unordered_map>
+#include <unordered_set>
+
+namespace FlexFlow {
+
+PCGTaskGraph get_pcg_task_graph(ParallelComputationGraph const &pcg,
+                                MachineMapping const &machine_mapping,
+                                MachineSpecification const &machine_spec) {
+  DiGraph digraph = DiGraph::create<AdjacencyDiGraph>();
+  bidict<Node, PCGTask> node_to_task;
+  bidict<Node, parallel_layer_guid_t> node_to_layer;
+  std::unordered_map<Node, std::unordered_set<device_id_t>> node_to_devices;
+
+  for (parallel_layer_guid_t const &layer : get_parallel_layers(pcg)) {
+    MachineView mv = machine_mapping.machine_views.at(layer);
+    OpCostEstimateKey op_key =
+        get_mapped_op_cost_estimate_key_for_layer(pcg, layer, mv);
+    Node node = digraph.add_node();
+    node_to_task.equate(node, PCGTask{op_key});
+    node_to_layer.equate(node, layer);
+    node_to_devices[node] =
+        get_device_ids(get_operator_task_space(pcg, layer),
+                       machine_mapping.machine_views.at(layer),
+                       machine_spec);
+  }
+
+  for (ParallelComputationGraphEdge const &edge : get_edges(pcg)) {
+    MachineView src_mv = machine_mapping.machine_views.at(get_src_layer(edge));
+    MachineView dst_mv = machine_mapping.machine_views.at(get_dst_layer(edge));
+    TensorSetMovement movement =
+        get_tensor_set_movement_from_pcg_edge(edge, pcg, src_mv, dst_mv);
+    Node node = digraph.add_node();
+    node_to_task.equate(node, PCGTask{movement});
+    node_to_devices[node] = {};
+    Node src_node = node_to_layer.at_r(get_src_layer(edge));
+    Node dst_node = node_to_layer.at_r(get_dst_layer(edge));
+
+    digraph.add_edge(DirectedEdge{src_node, node});
+    digraph.add_edge(DirectedEdge{node, dst_node});
+  }
+
+  return PCGTaskGraph{digraph, node_to_task, node_to_devices};
+}
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc b/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
new file mode 100644
index 0000000000..974a70ddf5
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc
@@ -0,0 +1,107 @@
+#include "compiler/task_graph_simulator/simulate_task_graph_execution.h"
+#include "compiler/task_graph_simulator/in_progress_task.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_state.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "utils/containers/filtrans.h"
+#include "utils/containers/is_subseteq_of.h"
+#include "utils/containers/set_of.h"
+#include "utils/containers/sorted.h"
+#include "utils/exception.h"
+#include "utils/graph/digraph/algorithms.h"
+#include "utils/graph/digraph/algorithms/get_predecessors.h"
+#include "utils/graph/digraph/algorithms/get_successors.h"
+#include "utils/graph/digraph/algorithms/is_acyclic.h"
+#include "utils/graph/digraph/digraph_view.h"
+#include "utils/graph/node/algorithms.h"
+#include "utils/overload.h"
+#include <functional>
+#include <unordered_set>
+
+namespace FlexFlow {
+
+TaskGraphExecutionTrace simulate_task_graph_execution(
+    DiGraphView const &task_graph,
+    std::function<float(Node const &)> cost_function,
+    TaskExecutionConstraint const &constraint) {
+  if (!is_acyclic(task_graph)) {
+    throw mk_runtime_error(
+        "simulate_task_graph_execution cannot simulate cyclic directed graphs");
+  }
+
+  TaskGraphExecutionState execution_state =
+      TaskGraphExecutionState{/*ready_tasks=*/set_of(get_sources(task_graph)),
+                              /*in_progress_tasks=*/{},
+                              /*finished_tasks=*/{},
+                              /*current_time=*/0.0};
+
+  std::unordered_set<TaskProfile> task_profiles;
+
+  auto start_task_processing = [&](Node const &task) {
+    float cost = cost_function(task);
+    execution_state.in_progress_tasks.push(
+        InProgressTask{execution_state.current_time,
+                       execution_state.current_time + cost,
+                       task});
+    execution_state.ready_tasks.erase(task);
+  };
+
+  auto dependencies_are_satisfied = [&](Node const &task) {
+    std::unordered_set<Node> incoming_dependencies =
+        get_predecessors(task_graph, task);
+    return is_subseteq_of(incoming_dependencies,
+                          execution_state.finished_tasks);
+  };
+
+  auto finish_task_processing = [&](InProgressTask const &in_progress_task) {
+    execution_state.finished_tasks.insert(in_progress_task.node);
+    for (Node const &task :
+         get_successors(task_graph, in_progress_task.node)) {
+      if (dependencies_are_satisfied(task)) {
+        execution_state.ready_tasks.insert(task);
+      }
+    }
+    task_profiles.insert(TaskProfile{in_progress_task.node,
+                                     in_progress_task.start_time,
+                                     in_progress_task.end_time});
+    execution_state.current_time = in_progress_task.end_time;
+  };
+
+  auto is_processing_done = [&]() {
+    return execution_state.ready_tasks.empty() &&
+           execution_state.in_progress_tasks.empty();
+  };
+
+  auto get_next_task_to_finish = [&]() {
+    InProgressTask task = execution_state.in_progress_tasks.top();
+    execution_state.in_progress_tasks.pop();
+    return task;
+  };
+
+  while (!is_processing_done()) {
+    auto ready_tasks_copy = execution_state.ready_tasks;
+    for (Node const &task : ready_tasks_copy) {
+      std::unordered_set<Node> raw_in_progress_tasks = transform(
+          unordered_set_of(execution_state.in_progress_tasks.contents()),
+          [](InProgressTask const &t) { return t.node; });
+
+      if (constraint.is_satisfied(
+              task, raw_in_progress_tasks, execution_state.finished_tasks)) {
+        start_task_processing(task);
+      }
+    }
+
+    if (!execution_state.in_progress_tasks.empty()) {
+      InProgressTask next_task = get_next_task_to_finish();
+      finish_task_processing(next_task);
+    } else {
+      throw mk_runtime_error("Constraints cannot be satisfied");
+    }
+  }
+  if (execution_state.finished_tasks.size() != num_nodes(task_graph)) {
+    throw mk_runtime_error("Failed to execute all tasks in given graph");
+  }
+
+  return TaskGraphExecutionTrace{task_profiles};
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc
new file mode 100644
index 0000000000..716a7afe15
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/task_graph_execution_trace.cc
@@ -0,0 +1,27 @@
+#include "compiler/task_graph_simulator/task_graph_execution_trace.h"
+#include "utils/containers/maximum.h"
+#include "utils/containers/minimum.h"
+#include "utils/containers/transform.h"
+#include "utils/exception.h"
+#include "utils/fmt/unordered_set.h"
+
+namespace FlexFlow {
+
+float get_total_execution_time(TaskGraphExecutionTrace const &trace) {
+  if (trace.task_profiles.empty()) {
+    throw mk_runtime_error(
+        fmt::format("TaskGraphExecutionTrace {} is empty", trace));
+  }
+  float end_time =
+      maximum(transform(trace.task_profiles, [](TaskProfile const &profile) {
+        return profile.end_time;
+      }));
+  float start_time =
+      minimum(transform(trace.task_profiles, [](TaskProfile const &profile) {
+        return profile.start_time;
+      }));
+
+  return end_time - start_time;
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc
new file mode 100644
index 0000000000..ab204e7d71
--- /dev/null
+++ b/lib/compiler/src/compiler/task_graph_simulator/task_simulator.cc
@@ -0,0 +1,71 @@
+#include "compiler/task_graph_simulator/task_simulator.h"
+#include "compiler/cost_estimator/cost_estimator.h"
+#include "compiler/cost_estimator/op_cost_estimate_key.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.dtg.h"
+#include "compiler/machine_mapping/unstructured_device_mapping.h"
+#include "compiler/task_graph_simulator/pcg_task.dtg.h"
+#include "compiler/task_graph_simulator/pcg_task_graph.h"
+#include "compiler/task_graph_simulator/simulate_task_graph_execution.h"
+#include "compiler/task_graph_simulator/task_execution_constraint.dtg.h"
+#include "compiler/task_graph_simulator/task_graph_execution_trace.h"
+#include "pcg/parallel_computation_graph/parallel_computation_graph.h"
+#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h"
+#include "utils/containers/filtrans.h"
+#include "utils/containers/set_union.h"
+#include "utils/containers/transform.h"
+#include "utils/graph/digraph/digraph.h"
+#include "utils/hash/unordered_set.h"
+#include <unordered_set>
+
+namespace FlexFlow {
+
+float task_simulator_estimate_forward_pass_time(
+    ParallelComputationGraph const &pcg,
+    CostEstimator const &estimator,
+    MachineMapping const &machine_mapping,
+    MachineSpecification const &machine_spec) {
+
+  PCGTaskGraph task_graph =
+      get_pcg_task_graph(pcg, machine_mapping, machine_spec);
+
+  auto cost_function = [&](Node const &node) -> float {
+    PCGTask task = task_graph.node_to_task.at_l(node);
+    if (task.is_operator()) {
+      return estimator.estimate_cost(task.require_operator()).forward_runtime;
+    } else {
+      return estimator.estimate_cost(task.require_tensor_movement());
+    }
+  };
+
+  auto is_allowed_to_run =
+      [&](Node const &task,
+          std::unordered_set<Node> const &in_progress_tasks,
+          std::unordered_set<Node> const &finished_tasks) -> bool {
+    PCGTask current_task = task_graph.node_to_task.at_l(task);
+
+    UnstructuredDeviceMapping device_map =
+        get_unstructured_device_mapping(machine_mapping, machine_spec, pcg);
+
+    if (current_task.is_tensor_movement()) {
+      return true;
+    }
+    assert(current_task.is_operator());
+
+    auto get_devices = [&](Node const &n) {
+      return task_graph.node_to_devices.at(n);
+    };
+
+    std::unordered_set<device_id_t> devices_occupied =
+        set_union(transform(in_progress_tasks, get_devices));
+    std::unordered_set<device_id_t> required_devices = get_devices(task);
+    return intersection(devices_occupied, required_devices).empty();
+  };
+
+  TaskExecutionConstraint constraint =
+      TaskExecutionConstraint{is_allowed_to_run};
+
+  return get_total_execution_time(simulate_task_graph_execution(
+      task_graph.graph, cost_function, constraint));
+}
+
+} // namespace FlexFlow
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc b/lib/compiler/test/src/compiler/cost_estimator_for_test.cc
similarity index 72%
rename from lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
rename to lib/compiler/test/src/compiler/cost_estimator_for_test.cc
index 0431104878..48e6f5e561 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.cc
+++ b/lib/compiler/test/src/compiler/cost_estimator_for_test.cc
@@ -1,6 +1,8 @@
 #include "./cost_estimator_for_test.h"
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 
 namespace FlexFlow {
 
@@ -40,4 +42,15 @@ CostEstimator make_fake_cost_estimator(
       });
 }
 
+CostEstimator make_fake_constant_cost_estimator(float forward_op_cost,
+                                                float backward_op_cost,
+                                                float comm_cost,
+                                                nonnegative_int memory_cost) {
+  return make_fake_cost_estimator(
+      [=](OpCostEstimateKey const &op) {
+        return OpCostMetrics{forward_op_cost, backward_op_cost, memory_cost};
+      },
+      [=](TensorSetMovement const &op) { return comm_cost; });
+}
+
 } // namespace FlexFlow
diff --git a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h b/lib/compiler/test/src/compiler/cost_estimator_for_test.h
similarity index 77%
rename from lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
rename to lib/compiler/test/src/compiler/cost_estimator_for_test.h
index 16ea3a85bc..1e8ce83caf 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/cost_estimator_for_test.h
+++ b/lib/compiler/test/src/compiler/cost_estimator_for_test.h
@@ -1,5 +1,5 @@
-#ifndef _FLEXFLOW_TEST_COST_ESTIMATOR_H
-#define _FLEXFLOW_TEST_COST_ESTIMATOR_H
+#ifndef _FLEXFLOW_TEST_COST_ESTIMATOR_FOR_TEST_H
+#define _FLEXFLOW_TEST_COST_ESTIMATOR_FOR_TEST_H
 
 #include "compiler/cost_estimator/cost_estimator.h"
 #include "compiler/cost_estimator/op_cost_estimate_key.dtg.h"
@@ -7,6 +7,7 @@
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.dtg.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.dtg.h"
 #include "compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.dtg.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 
 namespace FlexFlow {
 
@@ -34,6 +35,11 @@ CostEstimator make_fake_cost_estimator(
     std::unordered_map<OpCostEstimateKey, OpCostMetrics> const &op_cost_map,
     std::unordered_map<TensorSetMovement, float> const &comm_cost_map);
 
+CostEstimator make_fake_constant_cost_estimator(float forward_op_cost,
+                                                float backward_op_cost,
+                                                float comm_cost,
+                                                nonnegative_int memory_cost);
+
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
index ac180cd079..542edd9fa9 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc
@@ -1,5 +1,5 @@
 #include "compiler/machine_mapping/get_optimal_machine_mapping.h"
-#include "./cost_estimator_for_test.h"
+#include "../cost_estimator_for_test.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/machine_mapping_cache.h"
 #include "compiler/machine_mapping/machine_mapping_constraints.h"
@@ -9,6 +9,7 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
 #include "utils/containers/get_only.h"
 #include "utils/full_binary_tree/binary_tree_path.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <doctest/doctest.h>
 
 using namespace FlexFlow;
@@ -146,13 +147,21 @@ TEST_SUITE(FF_TEST_SUITE) {
 
     auto map1 = std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
         {map_unmapped_op_cost_estimate_key(k1, mv1),
-         OpCostMetrics{/*runtime=*/1.0, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/0.5,
+                       /*backward_runtime=*/0.5,
+                       /*memory=*/nonnegative_int{0}}},
         {map_unmapped_op_cost_estimate_key(k2, mv1),
-         OpCostMetrics{/*runtime=*/2.0, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/1.0,
+                       /*backward_runtime=*/1.0,
+                       /*memory=*/nonnegative_int{0}}},
        {map_unmapped_op_cost_estimate_key(k1, mv2),
-         OpCostMetrics{/*runtime=*/1.5, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/0.75,
+                       /*backward_runtime=*/0.75,
+                       /*memory=*/nonnegative_int{0}}},
        {map_unmapped_op_cost_estimate_key(k2, mv2),
-         OpCostMetrics{/*runtime=*/2.5, /*memory=*/nonnegative_int{0}}},
+         OpCostMetrics{/*forward_runtime=*/1.25,
+                       /*backward_runtime=*/1.25,
+                       /*memory=*/nonnegative_int{0}}},
     }};
 
     CostEstimator cost_estimator = make_fake_cost_estimator(
diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
index e22f715d82..52ad82595d 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc
@@ -1,5 +1,5 @@
 #include "compiler/machine_mapping/get_tensor_set_movement_across_split.h"
-#include "./cost_estimator_for_test.h"
+#include "../cost_estimator_for_test.h"
 #include "compiler/machine_mapping/transitive_reduced_pcg.h"
 #include "pcg/machine_view.h"
 #include "pcg/parallel_computation_graph/parallel_computation_graph.h"
diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
index 221cca3ae1..304034f9be 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc
@@ -1,5 +1,4 @@
 #include "compiler/machine_mapping/machine_mapping.h"
-#include "cost_estimator_for_test.h"
 #include "doctest/doctest.h"
 #include "pcg/machine_view.h"
 
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
index 9706f1c75f..8612017705 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc
@@ -1,5 +1,5 @@
 #include "compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.h"
-#include "../cost_estimator_for_test.h"
+#include "../../cost_estimator_for_test.h"
 #include "compiler/machine_mapping/abstracted_tensor_set_movement/abstracted_tensor_set_movement.h"
 #include "compiler/machine_mapping/machine_mapping_constraints.h"
 #include "compiler/machine_mapping/machine_mapping_problem_tree/machine_mapping_problem_tree.h"
@@ -9,6 +9,7 @@
 #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h"
 #include "utils/containers/get_only.h"
 #include "utils/full_binary_tree/binary_tree_path.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <doctest/doctest.h>
 
 using namespace FlexFlow;
@@ -147,24 +148,32 @@ TEST_SUITE(FF_TEST_SUITE) {
     CostEstimator cost_estimator = make_fake_cost_estimator(
         std::unordered_map<OpCostEstimateKey, OpCostMetrics>{{
            {map_unmapped_op_cost_estimate_key(k1, mv1),
-             OpCostMetrics{1.0, nonnegative_int{2}}},
+             OpCostMetrics{/*forward_runtime=*/1.0,
+                           /*backward_runtime=*/1.0,
+                           /*memory=*/nonnegative_int{2}}},
            {map_unmapped_op_cost_estimate_key(k2, mv1),
-             OpCostMetrics{2.0, nonnegative_int{3}}},
+             OpCostMetrics{/*forward_runtime=*/2.0,
+                           /*backward_runtime=*/2.0,
+                           /*memory=*/nonnegative_int{3}}},
            {map_unmapped_op_cost_estimate_key(k1, mv2),
-             OpCostMetrics{1.5, nonnegative_int{1}}},
+             OpCostMetrics{/*forward_runtime=*/1.5,
+                           /*backward_runtime=*/1.5,
+                           /*memory=*/nonnegative_int{1}}},
            {map_unmapped_op_cost_estimate_key(k2, mv2),
-             OpCostMetrics{2.5, nonnegative_int{2}}},
+             OpCostMetrics{/*forward_runtime=*/2.5,
+                           /*backward_runtime=*/2.5,
+                           /*memory=*/nonnegative_int{2}}},
         }},
         std::unordered_map<TensorSetMovement, float>{{
-            {TensorSetMovement{{}}, 0.0},
+            {TensorSetMovement{/*movements=*/{}}, /*cost=*/0.0},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm1),
-             0.1},
+             /*cost=*/0.1},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm2),
-             0.2},
+             /*cost=*/0.2},
             {concretize_abstracted_tensor_set_movement(movement1, mm1, mm2),
-             0.3},
+             /*cost=*/0.3},
             {concretize_abstracted_tensor_set_movement(movement1, mm2, mm1),
-             0.4},
+             /*cost=*/0.4},
         }});
 
     MachineMappingContext context = MachineMappingContext{
@@ -187,13 +196,17 @@ TEST_SUITE(FF_TEST_SUITE) {
           cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           MachineMappingForSingleLayer{
-            OpCostMetrics{1.0, nonnegative_int{2}},
+            OpCostMetrics{/*forward_runtime=*/1.0,
+                          /*backward_runtime=*/1.0,
+                          /*memory=*/nonnegative_int{2}},
              ParallelLayerGuidObliviousMachineMapping{{
                  {binary_tree_root_path(), mv1},
              }},
          },
          MachineMappingForSingleLayer{
-            OpCostMetrics{1.5, nonnegative_int{1}},
+            OpCostMetrics{/*forward_runtime=*/1.5,
+                          /*backward_runtime=*/1.5,
+                          /*memory=*/nonnegative_int{1}},
              ParallelLayerGuidObliviousMachineMapping{{
                  {binary_tree_root_path(), mv2},
              }},
@@ -217,7 +230,8 @@ TEST_SUITE(FF_TEST_SUITE) {
       MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
           MachineMappingForSingleLayer{
               OpCostMetrics{
-                  /*runtime=*/1.0 + 2.0 + 0.1,
+                  /*forward_runtime=*/1.0 + 2.0 + 0.1,
+                  /*backward_runtime=*/1.0 + 2.0 + 0.1,
                   /*memory=*/nonnegative_int{2 + 3},
               },
               ParallelLayerGuidObliviousMachineMapping{{
                   {
@@ -236,7 +250,9 @@ TEST_SUITE(FF_TEST_SUITE) {
              }},
          },
          MachineMappingForSingleLayer{
-            OpCostMetrics{1.5 + 2.5 + 0.1, nonnegative_int{1 + 2}},
+            OpCostMetrics{/*forward_runtime=*/1.5 + 2.5 + 0.1,
+                          /*backward_runtime=*/1.5 + 2.5 + 0.1,
+                          /*memory=*/nonnegative_int{1 + 2}},
              ParallelLayerGuidObliviousMachineMapping{{
                  {
                      BinaryTreePath{{
@@ -270,7 +286,9 @@ TEST_SUITE(FF_TEST_SUITE) {
           cache, context, problem_tree, full_machine_spec, constraints);
       MachineMappingWithMemoryResult correct =
           MachineMappingWithMemoryResult{{MachineMappingForSingleLayer{
-              OpCostMetrics{2.5, nonnegative_int{2}},
+              OpCostMetrics{/*forward_runtime=*/2.5,
+                            /*backward_runtime=*/2.5,
+                            /*memory=*/nonnegative_int{2}},
               ParallelLayerGuidObliviousMachineMapping{{
                   {
                       BinaryTreePath{{
diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
index ecfb7cfeb3..1f3b7545a8 100644
--- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
+++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc
@@ -1,5 +1,6 @@
 #include "compiler/machine_mapping/memory_optimization/machine_mapping_with_memory_result.h"
 #include "pcg/machine_view.h"
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <doctest/doctest.h>
 
 using namespace FlexFlow;
@@ -52,15 +53,20 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics cost1 = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
+
     OpCostMetrics cost2 = OpCostMetrics{
-        /*runtime=*/4.0,
+        /*forward_runtime=*/4.0,
+        /*backward_runtime=*/4.0,
         /*memory=*/nonnegative_int{1},
     };
+
     OpCostMetrics cost3 = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{3},
     };
 
@@ -182,7 +188,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics pre_cost = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
     MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{
@@ -208,7 +215,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};
 
     OpCostMetrics post_cost = OpCostMetrics{
-        /*runtime=*/4.0,
+        /*forward_runtime=*/4.0,
+        /*backward_runtime=*/4.0,
         /*memory=*/nonnegative_int{1},
     };
 
@@ -253,8 +261,10 @@ TEST_SUITE(FF_TEST_SUITE) {
             {
                 MachineMappingForSingleLayer{
                     /*cost=*/OpCostMetrics{
-                        /*runtime=*/pre_cost.runtime + comm_cost +
-                            post_cost.runtime,
+                        /*forward_runtime=*/pre_cost.forward_runtime +
+                            comm_cost + post_cost.forward_runtime,
+                        /*backward_runtime=*/pre_cost.backward_runtime +
+                            comm_cost + post_cost.backward_runtime,
                         /*memory=*/pre_cost.memory + post_cost.memory,
                     },
                     /*machine_mapping=*/
@@ -307,8 +317,10 @@ TEST_SUITE(FF_TEST_SUITE) {
             {
                 MachineMappingForSingleLayer{
                     /*cost=*/OpCostMetrics{
-                        /*runtime=*/pre_cost.runtime + comm_cost +
-                            post_cost.runtime,
+                        /*forward_runtime=*/pre_cost.forward_runtime +
+                            comm_cost + post_cost.forward_runtime,
+                        /*backward_runtime=*/pre_cost.backward_runtime +
+                            comm_cost + post_cost.backward_runtime,
                         /*memory=*/pre_cost.memory + post_cost.memory,
                     },
                     /*machine_mapping=*/
@@ -377,7 +389,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics lhs_cost = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
     MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{
@@ -403,7 +416,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     }};
 
     OpCostMetrics rhs_cost = OpCostMetrics{
-        /*runtime=*/4.0,
+        /*forward_runtime=*/4.0,
+        /*backward_runtime=*/4.0,
         /*memory=*/nonnegative_int{1},
     };
     MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{
@@ -442,7 +456,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     MachineMappingWithMemoryResult correct = MachineMappingWithMemoryResult{{
         MachineMappingForSingleLayer{
             /*cost=*/OpCostMetrics{
-                /*runtime=*/std::max(lhs_cost.runtime, rhs_cost.runtime),
+                /*forward_runtime=*/std::max(lhs_cost.forward_runtime,
+                                             rhs_cost.forward_runtime),
+                /*backward_runtime=*/
+                std::max(lhs_cost.backward_runtime,
+                         rhs_cost.backward_runtime),
                 /*memory=*/std::max(lhs_cost.memory, rhs_cost.memory),
             },
             /*machine_mapping=*/
@@ -518,15 +536,18 @@ TEST_SUITE(FF_TEST_SUITE) {
     };
 
     OpCostMetrics cost1 = OpCostMetrics{
-        /*runtime=*/2.0,
+        /*forward_runtime=*/2.0,
+        /*backward_runtime=*/2.0,
         /*memory=*/nonnegative_int{2},
     };
     OpCostMetrics cost2 = OpCostMetrics{
-        /*runtime=*/4.0,
/*forward_runtime=*/4.0, + /*backward_runtime=*/4.0, /*memory=*/nonnegative_int{1}, };
OpCostMetrics cost3 = OpCostMetrics{ - /*runtime=*/2.0, + /*forward_runtime=*/2.0, + /*backward_runtime=*/2.0, /*memory=*/nonnegative_int{3}, };
diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc b/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc new file mode 100644 index 0000000000..e88f2b7840 --- /dev/null +++ b/lib/compiler/test/src/compiler/task_graph_simulator/simulate_task_graph_execution.cc @@ -0,0 +1,211 @@
+#include "compiler/task_graph_simulator/simulate_task_graph_execution.h" +#include "compiler/task_graph_simulator/task_graph_execution_state.dtg.h" +#include "compiler/task_graph_simulator/task_graph_execution_trace.dtg.h" +#include "utils/containers/lookup_in_map.h" +#include "utils/graph/algorithms.h" +#include "utils/graph/digraph/directed_edge.dtg.h" +#include "utils/graph/instances/adjacency_digraph.h" +#include <doctest/doctest.h> +#include
+
+namespace FlexFlow {
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("simulate_task_graph_execution") {
+    DiGraph g = DiGraph::create<AdjacencyDiGraph>();
+    SUBCASE("linear graph") {
+      std::vector<Node> n = add_nodes(g, 4);
+      add_edges(g, { DirectedEdge{n.at(0), n.at(1)}, DirectedEdge{n.at(1), n.at(2)}, DirectedEdge{n.at(2), n.at(3)}, });
+
+      auto cost_function = lookup_in_map<Node, float>( {{n.at(0), 1}, {n.at(1), 10}, {n.at(2), 100}, {n.at(3), 1000}});
+
+      auto is_allowed_to_run = [&](Node const &n, + std::unordered_set<Node> const &in_progress_tasks, + std::unordered_set<Node> const &finished_tasks) { return true; };
+
+      TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run};
+
+      TaskGraphExecutionTrace result = simulate_task_graph_execution(g, cost_function, constraint);
+      TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 1}, + TaskProfile{n.at(1), 1, 11}, + TaskProfile{n.at(2), 11, 111}, + TaskProfile{n.at(3), 111, 1111}, + }};
+      CHECK(correct == result);
+    }
+
+    SUBCASE("rhomboidal graph") {
+      std::vector<Node> n = add_nodes(g, 4);
+
+      add_edges(g, {DirectedEdge{n.at(0), n.at(1)}, + DirectedEdge{n.at(0), n.at(2)}, + DirectedEdge{n.at(1), n.at(3)}, + DirectedEdge{n.at(2), n.at(3)}});
+
+      auto cost_function = lookup_in_map<Node, float>( {{n.at(0), 10}, {n.at(1), 15}, {n.at(2), 20}, {n.at(3), 25}});
+
+      SUBCASE("no processing constraints") {
+        auto is_allowed_to_run = [&](Node const &n, + std::unordered_set<Node> const &in_progress_tasks, + std::unordered_set<Node> const &finished_tasks) { + return true; + };
+
+        TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result = simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 10, 30}, + TaskProfile{n.at(3), 30, 55}, + }};
+        CHECK(correct == result);
+      }
+
+      SUBCASE("one node at a time") {
+        auto is_allowed_to_run = [&](Node const &n, + std::unordered_set<Node> const &in_progress_tasks, + std::unordered_set<Node> const &finished_tasks) { + return in_progress_tasks.size() == 0; + };
+
+        TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result = simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 25, 45}, + TaskProfile{n.at(3), 45, 70}, + }};
+        CHECK(correct == result);
+      }
+    }
+
+    SUBCASE("diamond graph with crossing") {
+      std::vector<Node> n = add_nodes(g, 6);
+
+      add_edges(g, { + DirectedEdge{n.at(0), n.at(1)}, + DirectedEdge{n.at(0), n.at(2)}, + DirectedEdge{n.at(1), n.at(3)}, + DirectedEdge{n.at(2), n.at(3)}, + DirectedEdge{n.at(2), n.at(4)}, + DirectedEdge{n.at(3), n.at(5)}, + DirectedEdge{n.at(4), n.at(5)}, + });
+
+      auto cost_function = lookup_in_map<Node, float>({{n.at(0), 10}, + {n.at(1), 15}, + {n.at(2), 20}, + {n.at(3), 25}, + {n.at(4), 30}, + {n.at(5), 35}});
+
+      SUBCASE("no processing constraints") {
+        auto is_allowed_to_run = [&](Node const &n, + std::unordered_set<Node> const &in_progress_tasks, + std::unordered_set<Node> const &finished_tasks) { + return true; + };
+
+        TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result = simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 10, 30}, + TaskProfile{n.at(3), 30, 55}, + TaskProfile{n.at(4), 30, 60}, + TaskProfile{n.at(5), 60, 95}, + }};
+        CHECK(correct == result);
+      }
+
+      SUBCASE("one node at a time") {
+        auto is_allowed_to_run = [&](Node const &n, + std::unordered_set<Node> const &in_progress_tasks, + std::unordered_set<Node> const &finished_tasks) { + return in_progress_tasks.size() == 0; + };
+
+        TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result = simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 25}, + TaskProfile{n.at(2), 25, 45}, + TaskProfile{n.at(3), 45, 70}, + TaskProfile{n.at(4), 70, 100}, + TaskProfile{n.at(5), 100, 135}, + }};
+        CHECK(correct == result);
+      }
+    }
+
+    SUBCASE("all-to-all intermediate") {
+      std::vector<Node> n = add_nodes(g, 5);
+
+      add_edges(g, {DirectedEdge{n.at(0), n.at(1)}, + DirectedEdge{n.at(0), n.at(2)}, + DirectedEdge{n.at(0), n.at(3)}, + DirectedEdge{n.at(1), n.at(4)}, + DirectedEdge{n.at(2), n.at(4)}, + DirectedEdge{n.at(3), n.at(4)}});
+
+      auto cost_function = lookup_in_map<Node, float>({{n.at(0), 10}, + {n.at(1), 100}, + {n.at(2), 100}, + {n.at(3), 100}, + {n.at(4), 20}});
+
+      SUBCASE("at most two nodes at a time") {
+        auto is_allowed_to_run = [&](Node const &n, + std::unordered_set<Node> const &in_progress_tasks, + std::unordered_set<Node> const &finished_tasks) { + return in_progress_tasks.size() < 2; + };
+
+        TaskExecutionConstraint constraint = TaskExecutionConstraint{is_allowed_to_run};
+        TaskGraphExecutionTrace result = simulate_task_graph_execution(g, cost_function, constraint);
+        TaskGraphExecutionTrace correct = TaskGraphExecutionTrace{{ + TaskProfile{n.at(0), 0, 10}, + TaskProfile{n.at(1), 10, 110}, + TaskProfile{n.at(2), 10, 110}, + TaskProfile{n.at(3), 110, 210}, + TaskProfile{n.at(4), 210, 230}, + }};
+        CHECK(correct == result);
+      }
+    }
+  }
+}
+} // namespace FlexFlow
diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc new file mode 100644 index 0000000000..e278338440 --- /dev/null +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -0,0 +1,265 @@
+#include "compiler/task_graph_simulator/task_simulator.h" +#include "../cost_estimator_for_test.h" +#include "compiler/cost_estimator/cost_estimator.h"
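+// The cases below exercise task_simulator_estimate_forward_pass_time with the
+// fake cost estimators from ../cost_estimator_for_test.h; the op and comm
+// costs are arbitrary constants chosen so that each expected forward-pass
+// time can be checked by hand against the device mapping.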
+#include "compiler/cost_estimator/op_cost_metrics.dtg.h" +#include "compiler/machine_mapping/machine_mapping.dtg.h" +#include "compiler/machine_mapping/machine_mapping.h" +#include "compiler/machine_mapping/machine_mapping_problem_tree/unmapped_op_cost_estimate_key.h" +#include "op-attrs/ops/input_attrs.dtg.h" +#include "op-attrs/parallel_tensor_dims.dtg.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/device_id.h" +#include "pcg/device_type.dtg.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/machine_specification.h" +#include "pcg/machine_specification_dimension.dtg.h" +#include "pcg/machine_view.dtg.h" +#include "pcg/machine_view.h" +#include "pcg/machine_view_dimension.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "pcg/stride_t.dtg.h" +#include "substitutions/sub_parallel_computation_graph.dtg.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "utils/containers/get_only.h" +#include "utils/deduplicated_priority_queue.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_source_nodes.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include +#include +#include +#include + +namespace FlexFlow { + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("task_simulator_estimate_forward_pass_time") { + MachineSpecification machine_spec = + MachineSpecification{/*num_nodes=*/3, + /*num_cpus_per_node=*/3, + /*num_gpus_per_node=*/3, + /*inter_node_bandwidth=*/1.0f, + /*intra_node_bandwidth=*/1.0f}; + + SUBCASE("linear graph") { + ParallelComputationGraphBuilder b; + ParallelTensorShape input_shape = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{}, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape); + parallel_tensor_guid_t tensor1 = b.relu(tensor0); + + parallel_layer_guid_t layer0 = get_source_layer(tensor0); + parallel_layer_guid_t layer1 = get_source_layer(tensor1); + + std::vector dims = { + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + }; + ParallelComputationGraph pcg = b.pcg; + MachineView mv1 = + MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView mv2 = + MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; + + MachineMapping device_mapping = MachineMapping{{ + {layer0, mv1}, + {layer1, mv2}, + }}; + + SUBCASE("constant op, comm cost") { + CostEstimator estimator = make_fake_constant_cost_estimator( + /*forward_op_cost=*/10.0f, + /*backward_op_cost=*/10.0f, + /*comm_cost=*/1.0f, + /*memory_cost=*/nonnegative_int{0}); + + float result = task_simulator_estimate_forward_pass_time( + pcg, estimator, device_mapping, machine_spec); + + float correct = 10 + 1 + 10; + CHECK(result == correct); + } + + SUBCASE("variable op, comm cost") { + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &op) { + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/10.0f, + /*backward_runtime=*/10.0f, + /*memory=*/nonnegative_int{0}}; // layer0 + } + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/1.0f, + 
/*backward_runtime=*/1.0f, + /*memory=*/nonnegative_int{0}}; // layer1 + } + return OpCostMetrics{/*forward_runtime=*/0.0f, + /*backward_runtime=*/0.0f, + /*memory=*/nonnegative_int{0}}; + }, + [](TensorSetMovement const &comm) { return 5.0f; }); + + float result = task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, device_mapping, machine_spec); + float correct = 10 + 5 + 1; + CHECK(result == correct); + } + } + + SUBCASE("rhomboidal graph") { + ParallelComputationGraphBuilder b; + + ParallelTensorShape input_shape = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ShardParallelDim{10, 1}}, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + }; + + parallel_tensor_guid_t tensor0 = b.create_input_tensor(input_shape); + parallel_tensor_guid_t tensor1 = b.relu(tensor0); + parallel_tensor_guid_t tensor2 = b.relu(tensor0); + parallel_tensor_guid_t tensor3 = b.add(tensor1, tensor2); + + parallel_layer_guid_t layer0 = get_source_layer(tensor0); + parallel_layer_guid_t layer1 = get_source_layer(tensor1); + parallel_layer_guid_t layer2 = get_source_layer(tensor2); + parallel_layer_guid_t layer3 = get_source_layer(tensor3); + + ParallelComputationGraph pcg = b.pcg; + std::vector dims = { + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + }; + + SUBCASE("all different devices") { + MachineView mv0 = + MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView mv1 = + MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; + MachineView mv2 = + MachineView{MachineSpaceCoordinate{1, 0, DeviceType::GPU}, dims}; + MachineView mv3 = + MachineView{MachineSpaceCoordinate{1, 1, DeviceType::GPU}, dims}; + + MachineMapping device_mapping = MachineMapping{{ + {layer0, mv0}, + {layer1, mv1}, + {layer2, mv2}, + {layer3, mv3}, + }}; + SUBCASE("constant op, comm cost") { + CostEstimator estimator = make_fake_constant_cost_estimator( + /*forward_op_cost=*/10.0f, + /*backward_op_cost=*/10.0f, + /*comm_cost=*/1.0f, + /*memory_cost=*/nonnegative_int{0}); + + float result = task_simulator_estimate_forward_pass_time( + pcg, estimator, device_mapping, machine_spec); + float correct = 10 + 1 + 10 + 1 + 10; + CHECK(result == correct); + } + SUBCASE("variable op, comm cost") { + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &op) { + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/10.0f, + /*backward_runtime=*/10.0f, + /*memory=*/nonnegative_int{0}}; // layer0 + } + if (op.op_attrs.has()) { + return OpCostMetrics{ + /*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/nonnegative_int{0}}; // layers 1, 2 + } + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/2.0f, + /*backward_runtime=*/2.0f, + /*memory=*/nonnegative_int{0}}; // layer3 + } + return OpCostMetrics{/*forward_runtime=*/0.0f, + /*backward_runtime=*/0.0f, + /*memory=*/nonnegative_int{0}}; + }, + [](TensorSetMovement const &comm) { return 5.0f; }); + } + } + + SUBCASE("all the same device") { + MachineView mv = + MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineMapping device_mapping = MachineMapping{{ + {layer0, mv}, + {layer1, mv}, + {layer2, mv}, + {layer3, mv}, + }}; + SUBCASE("constant op, cost cost") { + CostEstimator cost_estimator = 
make_fake_constant_cost_estimator( + /*forward_op_cost=*/10.0f, + /*backward_op_cost=*/10.0f, + /*comm_cost=*/1.0f, + /*memory_cost=*/nonnegative_int{0}); + + float result = task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, device_mapping, machine_spec); + float correct = 10 + 10 + 10 + 10 + 1 + 1; + CHECK(result == correct); + } + SUBCASE("variable op, cost cost") { + CostEstimator cost_estimator = make_fake_cost_estimator( + [](OpCostEstimateKey const &op) { + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/10.0f, + /*backward_runtime=*/10.0f, + /*memory=*/nonnegative_int{0}}; // layer0 + } + if (op.op_attrs.has()) { + return OpCostMetrics{ + /*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/nonnegative_int{0}}; // layers 1, 2 + } + if (op.op_attrs.has()) { + return OpCostMetrics{/*forward_runtime=*/2.0f, + /*backward_runtime=*/2.0f, + /*memory=*/nonnegative_int{0}}; // layer3 + } + return OpCostMetrics{/*forward_runtime=*/0.0f, + /*backward_runtime=*/0.0f, + /*memory=*/nonnegative_int{0}}; + }, + [](TensorSetMovement const &comm) { return 5.0f; }); + float result = task_simulator_estimate_forward_pass_time( + pcg, cost_estimator, device_mapping, machine_spec); + float correct = 10 + 5 + (1 + 1) + 5 + 2; + CHECK(result == correct); + } + } + } + } +} +} // namespace FlexFlow diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 6ffa9900c2..39591e8a70 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ -20,6 +20,7 @@ bool is_valid_machine_space_coordinate(MachineSpecification const &ms, device_id_t get_device_id(MachineSpecification const &ms, MachineSpaceCoordinate const &coord); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h index 293227b7a1..f72b2359dc 100644 --- a/lib/pcg/include/pcg/machine_view.h +++ b/lib/pcg/include/pcg/machine_view.h @@ -37,6 +37,14 @@ std::unordered_set MachineView const &mv, MachineSpecification const &ms); +std::unordered_set get_device_ids(OperatorTaskSpace const &task, + MachineView const &mv, + MachineSpecification const &ms); + +MachineView make_1d_machine_view(MachineSpaceCoordinate const &start, + MachineSpecificationDimension const &dim, + stride_t stride); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index 61cab4eff1..1a19397c72 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -2,6 +2,8 @@ #define _FLEXFLOW_PCG_INCLUDE_OPERATOR_TASK_SPACE_H #include "pcg/operator_task_space.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" #include "pcg/task_space_coordinate.dtg.h" #include #include @@ -17,6 +19,9 @@ TaskSpaceCoordinate size_t num_dims(OperatorTaskSpace const &task); size_t num_tasks(OperatorTaskSpace const &task); +OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer); + } // namespace FlexFlow #endif diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h index c740e1ffd2..f7567b5025 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h +++ 
b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph.h @@ -6,6 +6,7 @@ #include "pcg/parallel_computation_graph/parallel_layer_added_result.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include <unordered_set>
namespace FlexFlow { @@ -31,6 +32,20 @@ std::unordered_set<ParallelComputationGraphEdge> parallel_layer_guid_t const &, parallel_layer_guid_t const &);
+std::unordered_set<ParallelComputationGraphEdge> + get_edges(ParallelComputationGraph const &);
+
+std::unordered_set<ParallelComputationGraphEdge> + get_outgoing_edges(ParallelComputationGraph const &, + parallel_layer_guid_t const &);
+
+std::unordered_set<ParallelComputationGraphEdge> + get_incoming_edges(ParallelComputationGraph const &, + parallel_layer_guid_t const &);
+
+std::unordered_set<parallel_layer_guid_t> + get_initial_layers(ParallelComputationGraph const &);
+
std::vector<parallel_tensor_guid_t> get_incoming_tensors(ParallelComputationGraph const &, parallel_layer_guid_t const &); @@ -45,6 +60,9 @@ std::vector<parallel_tensor_guid_t> get_incoming_weights(ParallelComputationGraph const &, parallel_layer_guid_t const &);
+parallel_layer_guid_t get_source_layer(ParallelComputationGraph const &g, + parallel_tensor_guid_t const &t);
+
ParallelLayerAttrs get_parallel_layer_attrs(ParallelComputationGraph const &, parallel_layer_guid_t const &); PCGOperatorAttrs pcg_get_op_attrs(ParallelComputationGraph const &,
diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc index ca5b8ba047..19ff50b4b7 100644 --- a/lib/pcg/src/pcg/machine_specification.cc +++ b/lib/pcg/src/pcg/machine_specification.cc @@ -1,5 +1,6 @@ #include "pcg/machine_specification.h" #include "pcg/device_id.h" +#include "utils/containers/transform.h" #include "utils/exception.h" namespace FlexFlow {
diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index 18f6cacb7e..cc42ad83b2 100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -1,14 +1,21 @@ #include "pcg/machine_view.h" +#include "pcg/machine_space_coordinate.dtg.h" +#include "pcg/machine_specification.dtg.h" #include "pcg/machine_specification.h" +#include "pcg/machine_specification_dimension.dtg.h" +#include "pcg/machine_view_dimension.dtg.h" +#include "pcg/operator_task_space.dtg.h" #include "pcg/operator_task_space.h" +#include "pcg/stride_t.dtg.h" #include "utils/containers/contains.h" #include "utils/containers/count.h" #include "utils/containers/filter.h" +#include "utils/containers/get_only.h" #include "utils/containers/scanl.h" #include "utils/containers/sum.h" #include "utils/containers/transform.h" #include "utils/containers/zip.h" - +#include "utils/exception.h"
namespace FlexFlow {
size_t num_dims(MachineView const &mv) { @@ -35,6 +42,13 @@ MachineView machine_view_from_strides_and_machine_spec_dimensions( MachineSpaceCoordinate const &start, std::vector<stride_t> const &strides, std::vector<MachineSpecificationDimension> const &dims) {
+  if (strides.size() != dims.size()) { + throw mk_runtime_error(fmt::format( + "Length of strides ({}) and dims ({}) must match when calling " + "machine_view_from_strides_and_machine_spec_dimensions", + strides.size(), + dims.size())); + }
  std::vector<MachineViewDimension> dimensions = transform(zip(strides, dims), [&](auto const &p) { return MachineViewDimension{p.first, p.second}; @@ -48,6 +62,14 @@ std::optional<MachineSpaceCoordinate> get_machine_space_coordinate( TaskSpaceCoordinate const &coord, MachineSpecification const &machine_specification) {
+  if (num_dims(machine_view) != task.degrees.size()) { + throw mk_runtime_error( + fmt::format("Dimension of machine_view ({}) must match dimension of " + "task ({}) when computing machine space coordinate", + machine_view, + task.degrees)); + }
+
  auto get_dimension_indices_for_dimension = + [&](MachineSpecificationDimension dimension) { + std::vector<MachineViewDimension> mv_dimensions =
@@ -106,10 +128,37 @@ std::unordered_set<MachineSpaceCoordinate> get_machine_space_coordinates( MachineSpecification const &machine_specification) { return transform( get_task_space_coordinates(task), [&](TaskSpaceCoordinate const &coord) { - return get_machine_space_coordinate( - task, machine_view, coord, machine_specification) - .value(); + std::optional<MachineSpaceCoordinate> maybe_coordinate = + get_machine_space_coordinate( + task, machine_view, coord, machine_specification); + if (!maybe_coordinate.has_value()) { + throw mk_runtime_error( + fmt::format("In get_machine_space_coordinates, the given " + "OperatorTaskSpace {} and MachineView {} are not " + "compatible with the given MachineSpecification {}", + task, + machine_view, + machine_specification)); + } + return maybe_coordinate.value(); }); }
+std::unordered_set<device_id_t> get_device_ids(OperatorTaskSpace const &task, + MachineView const &mv, + MachineSpecification const &ms) { + return transform(get_machine_space_coordinates(task, mv, ms), + [&](MachineSpaceCoordinate const &coord) { + return get_device_id(ms, coord); + }); +}
+
+MachineView make_1d_machine_view(MachineSpaceCoordinate const &start, + MachineSpecificationDimension const &dim, + stride_t stride) {
+
+  return machine_view_from_strides_and_machine_spec_dimensions( + start, {stride}, {dim});
+}
+
} // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 2538cb4ea0..7157b75082 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -1,12 +1,19 @@ #include "pcg/operator_task_space.h" +#include "op-attrs/parallel_tensor_shape.dtg.h" +#include "op-attrs/parallel_tensor_shape.h" +#include "pcg/operator_task_space.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" #include "utils/containers/cartesian_product.h" +#include "utils/containers/extend.h" #include "utils/containers/maximum.h" #include "utils/containers/product.h" #include "utils/containers/range.h" #include "utils/containers/transform.h" #include "utils/containers/unordered_set_of.h" +#include "utils/containers/vector_of.h" #include "utils/fmt/unordered_set.h" -
namespace FlexFlow {
std::unordered_set<TaskSpaceCoordinate> @@ -36,4 +43,16 @@ size_t num_tasks(OperatorTaskSpace const &task) { return product(task.degrees); }
+OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &layer) { + parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0); + ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor);
+
+  std::vector<int> degrees; + extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); + degrees.push_back(get_sum_degree(shape)); + degrees.push_back(get_discard_copy_degree(shape)); + return OperatorTaskSpace{degrees}; +}
+
} // namespace FlexFlow
diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index 781c44640c..4cc0500fa2 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -1,15 +1,25 @@ #include
"pcg/parallel_computation_graph/parallel_computation_graph.h" #include "op-attrs/get_incoming_tensor_roles.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph.dtg.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" +#include "pcg/parallel_computation_graph/parallel_layer_guid_t.dtg.h" #include "utils/containers/filtrans.h" #include "utils/containers/get_only.h" #include "utils/containers/transform.h" +#include "utils/containers/unordered_set_of.h" #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.h" +#include "utils/graph/dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h" +#include "utils/graph/dataflow_graph/dataflow_edge.dtg.h" +#include "utils/graph/digraph/algorithms.h" #include "utils/graph/digraph/algorithms/get_topological_ordering.h" #include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" #include "utils/graph/labelled_dataflow_graph/algorithms/find_isomorphism.h" #include "utils/graph/labelled_dataflow_graph/algorithms/rewrite_node_labels.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/node/node.dtg.h" +#include namespace FlexFlow { @@ -66,6 +76,13 @@ ParallelLayerAddedResult /*output_labels=*/{tensor_attrs}); } +std::unordered_set + get_edges(ParallelComputationGraph const &pcg) { + return transform(get_edges(pcg.raw_graph), [](DataflowEdge const &e) { + return ParallelComputationGraphEdge{e}; + }); +} + std::unordered_set get_pcg_edges_from_layer_to_layer(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &src, @@ -78,6 +95,33 @@ std::unordered_set }); } +std::unordered_set + get_outgoing_edges(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &l) { + std::unordered_set raw_edges = + get_outgoing_edges(pcg.raw_graph, l.raw_graph_node); + return transform(raw_edges, [](DataflowEdge const &e) { + return ParallelComputationGraphEdge{e}; + }); +} + +std::unordered_set + get_incoming_edges(ParallelComputationGraph const &pcg, + parallel_layer_guid_t const &l) { + std::unordered_set raw_edges = + unordered_set_of(get_incoming_edges(pcg.raw_graph, l.raw_graph_node)); + return transform(raw_edges, [](DataflowEdge const &e) { + return ParallelComputationGraphEdge{e}; + }); +} + +std::unordered_set + get_initial_layers(ParallelComputationGraph const &pcg) { + std::unordered_set raw_sources = get_sources(pcg.raw_graph); + return transform(raw_sources, + [](Node const &n) { return parallel_layer_guid_t{n}; }); +} + std::vector get_incoming_tensors(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &l) { diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc index dca8154eb4..d30739486e 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc @@ -1,4 +1,5 @@ #include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.dtg.h" namespace FlexFlow { diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc index dcf22d6c00..3e9d48fac3 100644 --- a/lib/pcg/test/src/pcg/machine_view.cc +++ b/lib/pcg/test/src/pcg/machine_view.cc @@ -1,4 +1,5 @@ #include 
"pcg/machine_view.h" +#include "pcg/gpu_id_t.dtg.h" #include "test/utils/doctest/fmt/optional.h" #include "utils/containers/transform.h" #include "utils/fmt/unordered_set.h" @@ -298,4 +299,94 @@ TEST_SUITE(FF_TEST_SUITE) { } } } + + TEST_CASE("get_device_ids") { + + SUBCASE("1D machine view") { + + // This operator has shape (3,), and thus 3 tasks. + // The (only) dimension is projected onto the INTRA (device) dimension + // with a stride of 2. The start of the projection defined by MachineView + // is at MachineSpaceCoordinate (0, 1), and the machine space has 1 node + // and 6 devices per node. + + /** + * The tasks will thus be distributed like this: + * +-------+-------+-------+-------+-------+-------+ + * | 0 | ((1)) | 2 | ((3)) | 4 | ((5)) | + * +-------+-------+-------+-------+-------+-------+ + * Where the integers are the device ids and ((x)) are the devices we + * select + */ + MachineSpecification ms = + MachineSpecification{/*num_nodes=*/1, + /*num_cpus_per_node=*/6, + /*num_gpus_per_node=*/6, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0}; + + OperatorTaskSpace task = OperatorTaskSpace{{3}}; + MachineView mv = MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, + {MachineViewDimension{stride_t{2}, + MachineSpecificationDimension::INTRA_NODE}}}; + + std::unordered_set correct = { + device_id_t{gpu_id_t{1}}, + device_id_t{gpu_id_t{3}}, + device_id_t{gpu_id_t{5}}, + }; + std::unordered_set result = get_device_ids(task, mv, ms); + CHECK(result == correct); + } + + SUBCASE("2D machine view") { + // This operator has shape (2, 2), and thus 2 * 2 = 4 tasks. + // - The first dimension is projected onto the INTER (node) dimension with + // stride 1, + // - The second dimension is projected onto the INTRA (device) dimension + // with stride 2. The start of the projection defined by MachineView is at + // MachineSpaceCoordinate (1, 2), and the machine space has 3 nodes and 5 + // devices per node. 
+
+      /**
+       * The tasks will thus be distributed like this:
+       * +-------+-------+-------+-------+-------+
+       * |   0   |   1   |   2   |   3   |   4   |
+       * +-------+-------+-------+-------+-------+
+       * |   5   |   6   | ((7)) |   8   | ((9)) |
+       * +-------+-------+-------+-------+-------+
+       * |  10   |  11   | ((12))|  13   | ((14))|
+       * +-------+-------+-------+-------+-------+
+       * Where the integers are the device ids and ((x)) are the devices we
+       * select
+       */
+
+      MachineSpecification ms = MachineSpecification{/*num_nodes=*/3, + /*num_cpus_per_node=*/5, + /*num_gpus_per_node=*/5, + /*inter_node_bandwidth=*/0, + /*intra_node_bandwidth=*/0};
+
+      OperatorTaskSpace task = OperatorTaskSpace{{2, 2}};
+      MachineView mv = MachineView{ + MachineSpaceCoordinate{ + /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, + {MachineViewDimension{stride_t{1}, + MachineSpecificationDimension::INTER_NODE}, + MachineViewDimension{stride_t{2}, + MachineSpecificationDimension::INTRA_NODE}}};
+
+      std::unordered_set<device_id_t> correct = { + device_id_t{gpu_id_t{7}}, + device_id_t{gpu_id_t{9}}, + device_id_t{gpu_id_t{12}}, + device_id_t{gpu_id_t{14}}, + };
+      std::unordered_set<device_id_t> result = get_device_ids(task, mv, ms);
+      CHECK(result == correct);
+    }
+  }
}
diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index fc07edf5b3..dd8308561f 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -36,8 +36,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t tensor3 = get_only(layer3_added.outputs);
std::vector<parallel_layer_guid_t> result = topological_ordering(pcg); - // std::vector<parallel_layer_guid_t> correct = {layer1, layer2, layer3}; - // CHECK(result == correct); + std::vector<parallel_layer_guid_t> correct = {layer1, layer2, layer3}; + CHECK(result == correct); } TEST_CASE(
@@ -105,6 +105,82 @@ TEST_SUITE(FF_TEST_SUITE) { } }
+  TEST_CASE( + "get_source_layer(ParallelComputationGraph, parallel_tensor_guid_t)") {
+    ParallelTensorShape tensor_shape = ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered<ShardParallelDim>{ + ShardParallelDim{10, 2}, + ShardParallelDim{12, 1}, + }, + ReplicaParallelDimSet{ + SumDegree{1}, + DiscardCopyDegree{1}, + }, + }, + DataType::FLOAT, + };
+
+    ParallelComputationGraph pcg = empty_parallel_computation_graph();
+
+    ParallelLayerAttrs layer_label = some<ParallelLayerAttrs>();
+    ParallelTensorAttrs tensor_label = some<ParallelTensorAttrs>();
+
+    SUBCASE("single layer") {
+      ParallelLayerAddedResult layer1_added = + add_parallel_layer(pcg, layer_label, {}, {tensor_label});
+      parallel_layer_guid_t layer1 = layer1_added.parallel_layer;
+      parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs);
+
+      parallel_layer_guid_t result = get_source_layer(pcg, tensor1);
+      parallel_layer_guid_t correct = layer1;
+      CHECK(result == correct);
+    }
+
+    SUBCASE("two connected layers") {
+      ParallelLayerAddedResult layer1_added = + add_parallel_layer(pcg, layer_label, {}, {tensor_label});
+      parallel_layer_guid_t layer1 = layer1_added.parallel_layer;
+      parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs);
+
+      ParallelLayerAddedResult layer2_added = + add_parallel_layer(pcg, layer_label, {tensor1}, {tensor_label});
+      parallel_layer_guid_t layer2 = layer2_added.parallel_layer;
+
+      parallel_layer_guid_t result = get_source_layer(pcg, tensor1);
+      parallel_layer_guid_t correct = layer1;
+      CHECK(result == correct);
+    }
+
+    SUBCASE("three layers in series") {
+      ParallelLayerAddedResult layer1_added = + add_parallel_layer(pcg,
layer_label, {}, {tensor_label});
+      parallel_layer_guid_t layer1 = layer1_added.parallel_layer;
+      parallel_tensor_guid_t tensor1 = get_only(layer1_added.outputs);
+
+      ParallelLayerAddedResult layer2_added = + add_parallel_layer(pcg, layer_label, {tensor1}, {tensor_label});
+      parallel_layer_guid_t layer2 = layer2_added.parallel_layer;
+      parallel_tensor_guid_t tensor2 = get_only(layer2_added.outputs);
+
+      ParallelLayerAddedResult layer3_added = + add_parallel_layer(pcg, layer_label, {tensor2}, {tensor_label});
+      parallel_layer_guid_t layer3 = layer3_added.parallel_layer;
+
+      SUBCASE("tensor 1") {
+        parallel_layer_guid_t result = get_source_layer(pcg, tensor1);
+        parallel_layer_guid_t correct = layer1;
+        CHECK(result == correct);
+      }
+
+      SUBCASE("tensor 2") {
+        parallel_layer_guid_t result = get_source_layer(pcg, tensor2);
+        parallel_layer_guid_t correct = layer2;
+        CHECK(result == correct);
+      }
+    }
+  }
+
  TEST_CASE( "get_incoming_weights(ParallelComputationGraph, parallel_layer_guid_t)") { ParallelTensorShape input_shape = ParallelTensorShape{
diff --git a/lib/runtime/src/parallel_compuation_graph.cc b/lib/runtime/src/parallel_compuation_graph.cc deleted file mode 100644 index ebc5ac1e8e..0000000000 --- a/lib/runtime/src/parallel_compuation_graph.cc +++ /dev/null @@ -1,7 +0,0 @@ -#include "parallel_computation_graph.h" - -namespace FlexFlow { - -ParallelTensor ParallelComputationGraph::{} - -} // namespace FlexFlow
diff --git a/lib/utils/include/utils/archetypes/value_type.h b/lib/utils/include/utils/archetypes/value_type.h index 1635747612..e45b8fda7e 100644 --- a/lib/utils/include/utils/archetypes/value_type.h +++ b/lib/utils/include/utils/archetypes/value_type.h @@ -2,7 +2,10 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_ARCHETYPES_VALUE_TYPE_H #include +#include #include +#include +#include
namespace FlexFlow { @@ -32,6 +35,16 @@ struct value_type { } };
+template <int TAG> +std::string format_as(value_type<TAG> const &) { + assert(false); +}
+
+template <int TAG> +std::ostream &operator<<(std::ostream &s, value_type<TAG> const &x) { + assert(false); +}
+
} // namespace FlexFlow
namespace std {
diff --git a/lib/utils/include/utils/containers/lookup_in_map.h b/lib/utils/include/utils/containers/lookup_in_map.h new file mode 100644 index 0000000000..946fc589db --- /dev/null +++ b/lib/utils/include/utils/containers/lookup_in_map.h @@ -0,0 +1,27 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_LOOKUP_IN_MAP_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_LOOKUP_IN_MAP_H
+
+#include "utils/containers/contains.h" +#include "utils/containers/keys.h" +#include "utils/exception.h" +#include "utils/fmt/unordered_map.h" +#include <functional> +#include <unordered_map> +#include
+
+namespace FlexFlow {
+
+template <typename K, typename V>
+std::function<V(K)> lookup_in_map(std::unordered_map<K, V> const &map) {
+  return [map](K const &key) -> V {
+    if (!contains(keys(map), key)) {
+      throw mk_runtime_error(fmt::format( + "Key {} is not present in the underlying map {}", key, map));
+    }
+    return map.at(key);
+  };
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/utils/include/utils/containers/minimum.h b/lib/utils/include/utils/containers/minimum.h new file mode 100644 index 0000000000..8bdd6ea985 --- /dev/null +++ b/lib/utils/include/utils/containers/minimum.h @@ -0,0 +1,21 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MINIMUM_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MINIMUM_H
+
+#include "utils/exception.h" +#include <algorithm>
+
+namespace FlexFlow {
+
+template <typename C>
+typename C::value_type minimum(C const &c) {
+  if (c.empty()) {
+    throw mk_runtime_error(
fmt::format("minimum expected non-empty container but received {}", c)); + } + + return *std::min_element(c.begin(), c.end()); +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/deduplicated_priority_queue.h b/lib/utils/include/utils/deduplicated_priority_queue.h index 66f6e524d4..afad3f5889 100644 --- a/lib/utils/include/utils/deduplicated_priority_queue.h +++ b/lib/utils/include/utils/deduplicated_priority_queue.h @@ -3,6 +3,7 @@ #include "utils/containers/contains.h" #include +#include #include #include @@ -38,6 +39,16 @@ class DeduplicatedPriorityQueue { impl.pop(); } + std::set contents() const { + auto temp = impl; + std::set result; + while (!temp.empty()) { + result.insert(temp.top()); + temp.pop(); + } + return result; + } + private: std::priority_queue impl; std::unordered_set hashmap; diff --git a/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h b/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h new file mode 100644 index 0000000000..a8b5efe66e --- /dev/null +++ b/lib/utils/include/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_GET_OUTGOING_EDGES_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_GET_OUTGOING_EDGES_H + +#include "utils/graph/dataflow_graph/dataflow_graph_view.h" + +namespace FlexFlow { + +std::unordered_set get_outgoing_edges(DataflowGraphView const &, + Node const &); +std::unordered_set + get_outgoing_edges(DataflowGraphView const &, + std::unordered_set const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/src/utils/containers/lookup_in_map.cc b/lib/utils/src/utils/containers/lookup_in_map.cc new file mode 100644 index 0000000000..a0d7db8e82 --- /dev/null +++ b/lib/utils/src/utils/containers/lookup_in_map.cc @@ -0,0 +1,12 @@ +#include "utils/containers/lookup_in_map.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using K = value_type<0>; +using V = value_type<1>; + +template std::function + lookup_in_map(std::unordered_map const &map); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/minimum.cc b/lib/utils/src/utils/containers/minimum.cc new file mode 100644 index 0000000000..c9bbc7706f --- /dev/null +++ b/lib/utils/src/utils/containers/minimum.cc @@ -0,0 +1 @@ +#include "utils/containers/minimum.h" diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc new file mode 100644 index 0000000000..2376e4897f --- /dev/null +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -0,0 +1,28 @@ +#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h" +#include "utils/containers/sorted_by.h" + +namespace FlexFlow { + +std::unordered_set get_outgoing_edges(DataflowGraphView const &g, + Node const &n) { + return g.query_edges(DataflowEdgeQuery{ + {n}, + query_set::matchall(), + query_set::matchall(), + query_set::matchall(), + }); +} + +std::unordered_set + get_outgoing_edges(DataflowGraphView const &g, + std::unordered_set const &ns) { + DataflowEdgeQuery query = DataflowEdgeQuery{ + query_set{ns}, + query_set::matchall(), + query_set::matchall(), + query_set::matchall(), + }; + return g.query_edges(query); +} + +} // namespace FlexFlow diff --git a/lib/utils/test/src/utils/containers/lookup_in_map.cc b/lib/utils/test/src/utils/containers/lookup_in_map.cc 
new file mode 100644 index 0000000000..9ca356ee4b --- /dev/null +++ b/lib/utils/test/src/utils/containers/lookup_in_map.cc @@ -0,0 +1,31 @@
+#include "utils/containers/lookup_in_map.h" +#include <doctest/doctest.h> +#include <string> +#include <unordered_map>
+
+using namespace FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+
+  TEST_CASE("lookup_in_map") {
+
+    std::unordered_map<std::string, int> map = {{"a", 1}, {"b", 2}};
+
+    SUBCASE("existing keys") {
+      std::function<int(std::string)> func = lookup_in_map(map);
+      CHECK(func("a") == 1);
+      CHECK(func("b") == 2);
+    }
+
+    SUBCASE("missing key") {
+      std::function<int(std::string)> func = lookup_in_map(map);
+      CHECK_THROWS(func("c"));
+    }
+
+    SUBCASE("empty map") {
+      std::unordered_map<std::string, int> map = {};
+      std::function<int(std::string)> func = lookup_in_map(map);
+      CHECK_THROWS(func("a"));
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc new file mode 100644 index 0000000000..86e4802cdb --- /dev/null +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -0,0 +1,51 @@
+#include "utils/graph/dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/containers/get_only.h" +#include "utils/graph/dataflow_graph/dataflow_graph.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("get_incoming_edges(DataflowGraphView, Node)") {
+    DataflowGraph g = DataflowGraph::create<UnorderedSetDataflowGraph>();
+
+    NodeAddedResult n1_added = g.add_node({}, 1);
+    Node n1 = n1_added.node;
+    DataflowOutput o1 = get_only(n1_added.outputs);
+
+    NodeAddedResult n2_added = g.add_node({}, 1);
+    Node n2 = n2_added.node;
+    DataflowOutput o2 = get_only(n2_added.outputs);
+
+    NodeAddedResult n3_added = g.add_node({o2}, 1);
+    Node n3 = n3_added.node;
+    DataflowOutput o3 = get_only(n3_added.outputs);
+
+    NodeAddedResult n4_added = g.add_node({o2, o3}, 1);
+    Node n4 = n4_added.node;
+    DataflowOutput o4 = get_only(n4_added.outputs);
+
+    SUBCASE("n4 - multiple incoming edges") {
+      std::vector<DataflowEdge> result = get_incoming_edges(g, n4);
+      std::vector<DataflowEdge> correct = { + DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o3, DataflowInput{n4, 1}}};
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n3 - single incoming edge") {
+      std::vector<DataflowEdge> result = get_incoming_edges(g, n3);
+      std::vector<DataflowEdge> correct = { + DataflowEdge{o2, DataflowInput{n3, 0}}, + };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n1 - no incoming edges") {
+      std::vector<DataflowEdge> result = get_incoming_edges(g, n1);
+      std::vector<DataflowEdge> correct = {};
+      CHECK(result == correct);
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc new file mode 100644 index 0000000000..be874b7e29 --- /dev/null +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -0,0 +1,90 @@
+#include "utils/graph/dataflow_graph/algorithms/get_outgoing_edges.h" +#include "utils/containers/get_only.h" +#include "utils/graph/dataflow_graph/dataflow_graph.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("get_outgoing_edges(DataflowGraphView, Node)") {
+    DataflowGraph g = DataflowGraph::create<UnorderedSetDataflowGraph>();
+
+    NodeAddedResult n1_added = g.add_node({}, 1);
+    Node n1 = n1_added.node;
+    DataflowOutput o1 = get_only(n1_added.outputs);
+
+    NodeAddedResult n2_added = g.add_node({o1}, 1);
+    Node n2 = n2_added.node;
DataflowOutput o2 = get_only(n2_added.outputs);
+
+    NodeAddedResult n3_added = g.add_node({o1}, 1);
+    Node n3 = n3_added.node;
+    DataflowOutput o3 = get_only(n3_added.outputs);
+
+    NodeAddedResult n4_added = g.add_node({o2}, 1);
+    Node n4 = n4_added.node;
+    DataflowOutput o4 = get_only(n4_added.outputs);
+
+    SUBCASE("n2 - single outgoing edge") {
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, n2);
+      std::unordered_set<DataflowEdge> correct = { + DataflowEdge{o2, DataflowInput{n4, 0}}, + };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n1 - multiple outgoing edges") {
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, n1);
+      std::unordered_set<DataflowEdge> correct = { + DataflowEdge{o1, DataflowInput{n2, 0}}, + DataflowEdge{o1, DataflowInput{n3, 0}}, + };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("n4 - no outgoing edges") {
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, n4);
+      std::unordered_set<DataflowEdge> correct = {};
+      CHECK(result == correct);
+    }
+  }
+
+  TEST_CASE("get_outgoing_edges(DataflowGraphView, std::unordered_set<Node>)") {
+    DataflowGraph g = DataflowGraph::create<UnorderedSetDataflowGraph>();
+
+    NodeAddedResult n1_added = g.add_node({}, 1);
+    Node n1 = n1_added.node;
+    DataflowOutput o1 = get_only(n1_added.outputs);
+
+    NodeAddedResult n2_added = g.add_node({o1}, 1);
+    Node n2 = n2_added.node;
+    DataflowOutput o2 = get_only(n2_added.outputs);
+
+    NodeAddedResult n3_added = g.add_node({o1}, 1);
+    Node n3 = n3_added.node;
+    DataflowOutput o3 = get_only(n3_added.outputs);
+
+    NodeAddedResult n4_added = g.add_node({o2}, 1);
+    Node n4 = n4_added.node;
+    DataflowOutput o4 = get_only(n4_added.outputs);
+
+    SUBCASE("multiple nodes - combined outgoing edges") {
+      std::unordered_set<Node> nodes = {n1, n2};
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, nodes);
+      std::unordered_set<DataflowEdge> correct = { + DataflowEdge{o1, DataflowInput{n2, 0}}, + DataflowEdge{o1, DataflowInput{n3, 0}}, + DataflowEdge{o2, DataflowInput{n4, 0}}, + };
+      CHECK(result == correct);
+    }
+
+    SUBCASE("multiple nodes - no outgoing edges") {
+      std::unordered_set<Node> nodes = {n3, n4};
+      std::unordered_set<DataflowEdge> result = get_outgoing_edges(g, nodes);
+      std::unordered_set<DataflowEdge> correct = {};
+      CHECK(result == correct);
+    }
+  }
+}
From c116872ed69ae58ac0711405d7d222fe60b80b25 Mon Sep 17 00:00:00 2001 From: Colin Unger <unger@stanford.edu> Date: Wed, 29 Jan 2025 19:39:32 -0800 Subject: [PATCH 2/5] Add AWS linux AMI to runs-on for testing (#1589) --- .github/runs-on.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-)
diff --git a/.github/runs-on.yml b/.github/runs-on.yml index 14f75549dd..b558b5131a 100644 --- a/.github/runs-on.yml +++ b/.github/runs-on.yml @@ -1,4 +1,10 @@ images: + amazon-linux-gpu-x64: + platform: "linux" + arch: "x64" + owner: "898082745236" # AWS + name: "Amazon Linux 2 AMI with NVIDIA TESLA GPU Driver*"
+
dlami-x64: platform: "linux" arch: "x64" @@ -8,4 +14,4 @@ runners: gpu-nvidia: family: ["g4dn.xlarge"] - image: dlami-x64 + image: amazon-linux-gpu-x64
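Context for how this configuration is consumed: a workflow job opts into one of
these runners through runs-on labels. A minimal sketch of a job targeting the
gpu-nvidia runner defined above, assuming the label syntax used by tests.yml
later in this series (the job name and steps are illustrative only):

    jobs:
      gpu-tests:
        runs-on:
          - runs-on
          - runner=gpu-nvidia
        steps:
          - uses: actions/checkout@v4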
+ name: "runs-on-v2.2-ubuntu22-gpu-x64-20250123194414" - dlami-x64: + runs-on-cpu-pinned: platform: "linux" arch: "x64" - owner: "898082745236" # AWS - name: "Deep Learning Base OSS Nvidia Driver GPU AMI (Ubuntu 22.04)*" - -runners: - gpu-nvidia: - family: ["g4dn.xlarge"] - image: amazon-linux-gpu-x64 + owner: "135269210855" # runs-on + name: "runs-on-v2.2-ubuntu22-full-x64-20250101080516" From 4d3294ab3f6858590d3d1f6d6d0cbaa09afc692a Mon Sep 17 00:00:00 2001 From: Colin Unger Date: Fri, 31 Jan 2025 00:20:51 -0800 Subject: [PATCH 4/5] GPU CI Fix (Pin runs-on GPU image) (#1588) * Debug * Change to base DL AMI * Print disk usage * Run nvidia-smi * Remove excess cuda installs in base ami * Re-enable freeing space in GPU CI * Try updating nix-develop version * Check what happens if you just enter the non-nixGL environment * Try switching AMIs * Try to remove the module stuff * Move to lockshaw/develop-action * Try pointing at a fixed commit * Update nix-develop action * Update nix-develop action to use BASH_FUNC filtering * Remove all the /usr/local/cuda entries * Switch back to gpu-ci env * Update the cuda arch * Try out the new runs-on gpu image * Move over to pinned runs-on image * Remove a bunch more unnecessary stuff in image to get back disk space * Try using an emphemeral store * Try mounting * Fix bug * Try sudo * Move nix into _work * Rollback all unnecessary changes * Re-enable waiting on cpu-ci --- .github/workflows/helpers/free_space_on_runner_gpu.sh | 8 -------- .github/workflows/tests.yml | 9 +++++---- 2 files changed, 5 insertions(+), 12 deletions(-) delete mode 100755 .github/workflows/helpers/free_space_on_runner_gpu.sh diff --git a/.github/workflows/helpers/free_space_on_runner_gpu.sh b/.github/workflows/helpers/free_space_on_runner_gpu.sh deleted file mode 100755 index a382ee58f6..0000000000 --- a/.github/workflows/helpers/free_space_on_runner_gpu.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -euo pipefail -set -x - -sudo rm -rf /usr/share/dotnet -sudo rm -rf /usr/local/lib/android -sudo rm -rf /opt/ghc -sudo rm -rf "/usr/local/share/boost" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 7e2dabd784..e2fc0b6df6 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -16,7 +16,7 @@ jobs: submodules: recursive - name: Free additional space on runner - run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + run: ./.github/workflows/helpers/free_space_on_runner.sh - name: Install nix uses: cachix/install-nix-action@v25 @@ -67,7 +67,7 @@ jobs: runs-on: - runs-on - family=g4dn.xlarge - - image=ubuntu22-full-x64 + - image=runs-on-gpu-pinned strategy: max-parallel: 1 @@ -79,8 +79,9 @@ jobs: with: submodules: recursive - - name: free additional space on runner - run: ./.github/workflows/helpers/free_space_on_runner_gpu.sh + - name: mount ephemeral drive to nix + run: | + sudo mkdir $HOME/_work/nix && sudo mkdir /nix && sudo mount --bind $HOME/_work/nix /nix - name: install nix uses: cachix/install-nix-action@v25 From 2b71235b66f15bbc5eaa9ad3b24fc3d470d335c3 Mon Sep 17 00:00:00 2001 From: Victor Li <32348970+victorli2002@users.noreply.github.com> Date: Sat, 1 Feb 2025 12:54:42 -0800 Subject: [PATCH 5/5] Merge substitution-builder (#1575) * Start on pcg builder * Add tests and some implementation for pcg builder * Add pcg tests, make dtgen constructors explicit to fix bug * Add remainder of PCG tests * Fix build issues in local-execution * Format * Address Reyna comments, add topological_order function for PCG * Pre 
multidigraph refactor * Removing visitable from sp code * Add open dataflow graph, start to replace pcg dataflow graph * Start refactoring substitutions * Add utility functions to support pattern matching * Pre-refactor inputs * Fix proj url * Get back to substitutions, now with unordered graph inputs * Get substitutions building * substitutions-tests now builds * Fix bug in filter, pass some initial substitution tests * Add tests for fmt::to_string, fix some substitutions bugs * Pass initial unit tests for find_pattern_matches * Start on unit tests for pcg pattern * Pass initial test for find_pattern_matches * Fix small build issue in tests * Format * Sync tests in CI with tests in proj * Fix minor build errors in kernels and local-execution * Format * Remove outdated code * More outdated code removal * More cleanup, add test for sp decomposition * Pull apart containers.h * More sp testing and fixes * Break up graph algorithms.h * Pre- full SP algo commit * Add initial implementation and tests for cbc decomposition and inverse line graph * Pass test for get_inverse_line_graph * Add new multidigraph * Fix get_inverse_line_graph to return a MultiDiGraph instead of a DiGraph * Add tests for parallel and series reduction finding * Add really rough implementation of valdez sp decomposition * Fix local-execution build * Add implementations and tests for applying series/parallel reductions * Format * Clean up sp decomposition interface and tests * Format * Add comments for top-level substitutions functions, add proj doxygen support * Start sketching out substitutions code * Fix build errors * Add ability to permute node ids * Cleanup and start to test new substitutions code * Add test case for evaluate_substitution_output * Add naive isomorphism detection code * Add graph inputs to open dataflow graph isomorphism * Add input permutation to evaluate_substitution_output * Fix permute_node_ids * Add test for permute_input_ids * Migrate over to mutable implementation of apply_substitution * Add fast isomorphism checking and an initial implementation of full substitution logic * Pass initial full substitutions test * Cleanup old isomorphism checking code * Fix post-merge bugs * Fix broken pcg builder test * Format * Reorganize code and remove some outdated code pre-code-review * Format * Restarting work on this after working on export-model-arch * Adding in some a simple function to get the currently available substritutions * nonnegative_int additions, code cleanup, etc. 
* A bunch more moving over to nonnegative_int * Even more nonnegative_int updating * Fix build * Fix failing tests * Format * Format --------- Co-authored-by: Colin Unger Co-authored-by: Victor Li --- .../src/export_model_arch.cc | 17 +- cmake/flexflow-utils.cmake | 14 +- flake.nix | 14 +- ...omputation_graph_binary_sp_decomposition.h | 5 +- .../src/compiler/allowed_machine_views.cc | 41 +- .../get_machine_resource_splits.cc | 10 +- .../machine_mapping/machine_mapping.cc | 10 +- ...el_layer_guid_oblivious_machine_mapping.cc | 4 +- ...mputation_graph_binary_sp_decomposition.cc | 2 +- .../test/src/allowed_machine_views.cc | 60 +- ...racted_tensor_set_movement_across_split.cc | 8 +- .../get_machine_resource_splits.cc | 193 ++--- .../get_optimal_machine_mapping.cc | 28 +- .../get_tensor_set_movement_across_split.cc | 32 +- .../machine_mapping/machine_mapping.cc | 24 +- .../get_machine_mapping_problem_tree.cc | 6 +- .../machine_mapping/machine_mapping_result.cc | 36 +- ...get_optimal_machine_mapping_with_memory.cc | 28 +- .../machine_mapping_result_with_memory.cc | 80 +-- ...ion_graph_series_parallel_decomposition.cc | 64 +- .../task_graph_simulator/task_simulator.cc | 88 ++- lib/compiler/test/src/graph_optimize_state.cc | 59 +- lib/kernels/include/kernels/array_shape.h | 36 +- .../include/kernels/batch_norm_kernels.h | 41 +- .../batch_norm_per_device_state.struct.toml | 68 ++ lib/kernels/include/kernels/legion_dim.h | 10 +- .../include/kernels/legion_dim_t.struct.toml | 7 +- .../include/kernels/transpose_kernels.h | 17 +- lib/kernels/src/allocation.cc | 3 +- lib/kernels/src/array_shape.cc | 53 +- lib/kernels/src/cuda/cuda_helper.cu | 8 +- .../src/cuda/ops/batch_norm_kernels.cu | 32 +- lib/kernels/src/cuda/ops/cast_kernels.cu | 4 +- lib/kernels/src/cuda/ops/combine_kernels.cu | 5 +- lib/kernels/src/cuda/ops/concat_kernels.cu | 7 +- lib/kernels/src/cuda/ops/conv_2d_kernels.cu | 16 +- .../src/cuda/ops/element_unary_kernels.cu | 8 +- lib/kernels/src/cuda/ops/flat_kernels.cu | 12 +- lib/kernels/src/cuda/ops/gather_kernels.cu | 54 +- lib/kernels/src/cuda/ops/partition_kernels.cu | 16 +- lib/kernels/src/cuda/ops/reduction_kernels.cu | 8 +- lib/kernels/src/cuda/ops/replicate_kernels.cu | 8 +- lib/kernels/src/cuda/ops/reshape_kernels.cu | 7 +- lib/kernels/src/cuda/ops/transpose_kernels.cu | 99 +-- lib/kernels/src/legion_dim.cc | 9 +- lib/kernels/test/src/test_attention_kernel.cc | 50 +- .../test/src/test_batch_matmul_kernel.cc | 30 +- .../test/src/test_batch_norm_kernel.cc | 58 +- lib/kernels/test/src/test_cast_kernel.cc | 4 +- lib/kernels/test/src/test_combine_kernel.cc | 2 +- lib/kernels/test/src/test_concat_kernel.cc | 15 +- lib/kernels/test/src/test_dropout.cc | 8 +- lib/kernels/test/src/test_flat_kernel.cc | 6 +- lib/kernels/test/src/test_gather_kernels.cc | 7 +- .../test/src/test_layer_norm_kernels.cc | 8 +- lib/kernels/test/src/test_partition_kernel.cc | 6 +- lib/kernels/test/src/test_pool_2d_kernels.cc | 52 +- lib/kernels/test/src/test_reduction_kernel.cc | 9 +- lib/kernels/test/src/test_replicate_kernel.cc | 8 +- lib/kernels/test/src/test_reshape_kernel.cc | 6 +- lib/kernels/test/src/test_reverse_kernels.cc | 31 +- lib/kernels/test/src/test_softmax_kernel.cc | 23 +- lib/kernels/test/src/test_split_kernel.cc | 16 +- lib/kernels/test/src/test_transpose_kernel.cc | 19 +- lib/kernels/test/src/test_utils.cc | 15 +- lib/kernels/test/src/test_utils.h | 16 +- ...device_specific_device_states.variant.toml | 6 +- .../local-execution/legion_tensor_shape.h | 40 -- .../per_device_op_state.variant.toml | 5 
- .../local-execution/task_id_t.enum.toml | 3 - .../src/legion_tensor_shape.cc | 15 - lib/local-execution/src/ops/attention.cc | 55 +- lib/local-execution/src/ops/batch_matmul.cc | 73 +- lib/local-execution/src/ops/batch_matmul.h | 2 +- lib/local-execution/src/ops/batch_norm.cc | 27 +- lib/local-execution/src/ops/conv_2d.cc | 26 +- lib/local-execution/src/ops/gather.cc | 9 +- lib/local-execution/src/ops/layer_norm.cc | 24 +- lib/local-execution/src/ops/linear.cc | 49 +- lib/local-execution/src/ops/pool_2d.cc | 91 ++- lib/local-execution/src/ops/reduce.cc | 9 +- lib/local-execution/src/ops/reduction.cc | 4 +- lib/local-execution/src/ops/replicate.cc | 4 +- lib/local-execution/src/ops/reverse.cc | 46 +- lib/local-execution/src/ops/softmax.cc | 20 +- lib/local-execution/src/ops/split.cc | 51 +- lib/local-execution/src/ops/topk.cc | 20 +- lib/local-execution/src/ops/transpose.cc | 55 +- lib/local-execution/src/ops/transpose.h | 3 - .../src/task_signature_impl.cc | 4 - .../test/src/test_local_slots_backing.cc | 13 +- .../test/src/test_local_task_arg_accessor.cc | 13 +- .../test/src/test_task_registry.cc | 8 +- .../models/bert/bert_config.struct.toml | 15 +- .../candle_uno/candle_uno_config.struct.toml | 9 +- .../inception_v3_config.struct.toml | 8 +- .../include/models/split_test/split_test.h | 2 +- .../transformer_config.struct.toml | 21 +- lib/models/src/models/bert/bert.cc | 41 +- .../src/models/candle_uno/candle_uno.cc | 57 +- .../src/models/inception_v3/inception_v3.cc | 675 +++++++++--------- .../src/models/split_test/split_test.cc | 16 +- .../src/models/transformer/transformer.cc | 89 +-- .../computation_graph_op_attrs.variant.toml | 2 +- lib/op-attrs/include/op-attrs/datatype.h | 3 +- .../op-attrs/dim_ordered/dim_ordered.h | 27 +- .../include/op-attrs/dim_ordered/slice.h | 4 +- lib/op-attrs/include/op-attrs/get_op_type.h | 2 +- lib/op-attrs/include/op-attrs/ops/attention.h | 36 +- .../multihead_attention_inputs.struct.toml | 12 +- .../op-attrs/ops/attention_attrs.struct.toml | 12 +- .../include/op-attrs/ops/batch_matmul.h | 2 +- .../op-attrs/ops/batch_matmul.struct.toml | 19 - .../ops/batch_matmul_attrs.struct.toml | 30 + .../op-attrs/ops/combine_attrs.struct.toml | 3 +- .../conv_2d/conv_2d_input_shape.struct.toml | 9 +- .../conv_2d_parallel_input_shape.struct.toml | 5 +- .../op-attrs/ops/conv_2d_attrs.struct.toml | 17 +- .../op-attrs/ops/embedding_attrs.struct.toml | 7 +- .../op-attrs/ops/linear_attrs.struct.toml | 3 +- lib/op-attrs/include/op-attrs/ops/pool_2d.h | 4 +- .../op-attrs/ops/pool_2d_attrs.struct.toml | 13 +- .../op-attrs/ops/reduction_attrs.struct.toml | 6 +- .../ops/repartition_attrs.struct.toml | 3 +- .../op-attrs/ops/replicate_attrs.struct.toml | 6 +- .../op-attrs/ops/split_attrs.struct.toml | 3 +- .../op-attrs/ops/topk_attrs.struct.toml | 6 +- .../parallel_tensor_dim_degrees.struct.toml | 3 +- .../include/op-attrs/parallel_tensor_dims.h | 20 +- .../include/op-attrs/parallel_tensor_shape.h | 25 +- .../discard_copy_degree.struct.toml | 6 +- .../sum_degree.struct.toml | 6 +- .../op-attrs/pcg_operator_attrs.variant.toml | 2 +- .../include/op-attrs/relative_ff_dim_t.h | 2 +- .../op-attrs/replica_parallel_dim.struct.toml | 3 +- .../op-attrs/replica_parallel_dim_set.h | 3 +- .../op-attrs/shard_parallel_dim.struct.toml | 8 +- lib/op-attrs/include/op-attrs/tensor_dims.h | 8 +- .../include/op-attrs/tensor_dims.struct.toml | 4 +- lib/op-attrs/include/op-attrs/tensor_shape.h | 10 +- lib/op-attrs/src/op-attrs/datatype.cc | 15 +- lib/op-attrs/src/op-attrs/ff_dim_t.cc | 2 +- 
lib/op-attrs/src/op-attrs/ops/attention.cc | 474 ++---------- .../attention/multihead_attention_inputs.cc | 18 +- .../multihead_attention_parallel_inputs.cc | 6 +- lib/op-attrs/src/op-attrs/ops/batch_matmul.cc | 21 +- lib/op-attrs/src/op-attrs/ops/batch_norm.cc | 27 +- lib/op-attrs/src/op-attrs/ops/concat.cc | 15 +- lib/op-attrs/src/op-attrs/ops/conv_2d.cc | 65 +- .../ops/conv_2d/conv_2d_input_shape.cc | 8 +- lib/op-attrs/src/op-attrs/ops/embedding.cc | 24 +- lib/op-attrs/src/op-attrs/ops/flat.cc | 18 +- lib/op-attrs/src/op-attrs/ops/layer_norm.cc | 6 +- lib/op-attrs/src/op-attrs/ops/linear.cc | 20 +- lib/op-attrs/src/op-attrs/ops/pool_2d.cc | 73 +- .../src/op-attrs/parallel_tensor_dims.cc | 39 +- .../src/op-attrs/parallel_tensor_shape.cc | 64 +- .../src/op-attrs/relative_ff_dim_t.cc | 4 +- .../src/op-attrs/replica_parallel_dim_set.cc | 6 +- lib/op-attrs/src/op-attrs/tensor_dims.cc | 15 +- lib/op-attrs/src/op-attrs/tensor_shape.cc | 13 +- .../test/src/op-attrs/ops/attention.cc | 178 +++-- .../test/src/op-attrs/ops/batch_matmul.cc | 144 ++-- .../test/src/op-attrs/ops/batch_norm.cc | 84 +-- lib/op-attrs/test/src/op-attrs/ops/cast.cc | 34 +- lib/op-attrs/test/src/op-attrs/ops/combine.cc | 20 +- lib/op-attrs/test/src/op-attrs/ops/concat.cc | 176 ++--- lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc | 168 ++--- lib/op-attrs/test/src/op-attrs/ops/dropout.cc | 62 +- .../test/src/op-attrs/ops/element_binary.cc | 70 +- .../test/src/op-attrs/ops/element_unary.cc | 38 +- .../test/src/op-attrs/ops/embedding.cc | 68 +- lib/op-attrs/test/src/op-attrs/ops/flat.cc | 110 +-- .../test/src/op-attrs/ops/layer_norm.cc | 93 +-- lib/op-attrs/test/src/op-attrs/ops/linear.cc | 142 ++-- lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc | 226 +++--- .../test/src/op-attrs/ops/reduction.cc | 16 +- .../test/src/op-attrs/ops/repartition.cc | 16 +- .../test/src/op-attrs/ops/replicate.cc | 17 +- lib/op-attrs/test/src/op-attrs/ops/softmax.cc | 78 +- .../test/src/op-attrs/pcg_operator_attrs.cc | 4 +- .../test/src/op-attrs/relative_ff_dim_t.cc | 10 +- lib/op-attrs/test/src/op-attrs/tensor_dims.cc | 31 +- .../include/pcg/computation_graph_builder.h | 91 +-- lib/pcg/include/pcg/cpu_id_t.struct.toml | 6 +- lib/pcg/include/pcg/device_id.h | 4 +- .../file_format/v1/graphs/v1_dataflow_graph.h | 2 +- .../v1/graphs/v1_dataflow_graph.struct.toml | 3 +- .../v1/graphs/v1_graph_edge.struct.toml | 12 +- .../v1/graphs/v1_labelled_dataflow_graph.h | 9 +- .../v1_labelled_dataflow_graph.struct.toml | 5 +- .../v1_binary_sp_decomposition.variant.toml | 3 +- .../pcg/file_format/v1/v1_computation_graph.h | 2 +- lib/pcg/include/pcg/gpu_id_t.struct.toml | 6 +- .../pcg/machine_space_coordinate.struct.toml | 5 +- lib/pcg/include/pcg/machine_specification.h | 12 +- .../pcg/machine_specification.struct.toml | 10 +- lib/pcg/include/pcg/machine_view.h | 2 +- lib/pcg/include/pcg/operator_task_space.h | 4 +- .../pcg/operator_task_space.struct.toml | 3 +- .../parallel_computation_graph_builder.h | 38 +- .../parallel_computation_graph_edge.h | 2 +- .../pcg/start_invariant_machine_view.h | 2 +- lib/pcg/include/pcg/stride_t.struct.toml | 6 +- .../pcg/task_space_coordinate.struct.toml | 3 +- lib/pcg/src/pcg/computation_graph_builder.cc | 137 ++-- lib/pcg/src/pcg/device_id.cc | 4 +- .../v1/graphs/v1_dataflow_graph.cc | 6 +- .../v1/graphs/v1_labelled_dataflow_graph.cc | 16 + .../v1/v1_binary_sp_decomposition/json.cc | 4 +- .../file_format/v1/v1_computation_graph.cc | 7 +- lib/pcg/src/pcg/machine_space_offset.cc | 6 +- lib/pcg/src/pcg/machine_specification.cc | 17 +- 
lib/pcg/src/pcg/machine_view.cc | 79 +- lib/pcg/src/pcg/operator_task_space.cc | 22 +- .../generate_weight_transform.cc | 4 +- .../parallel_computation_graph_builder.cc | 98 +-- .../parallel_computation_graph_edge.cc | 2 +- .../src/pcg/start_invariant_machine_view.cc | 7 +- lib/pcg/test/src/pcg/computation_graph.cc | 40 +- .../test/src/pcg/computation_graph_builder.cc | 18 +- .../v1/v1_binary_sp_decomposition/json.cc | 18 +- .../file_format/v1/v1_computation_graph.cc | 8 +- .../v1/v1_parallel_computation_graph.cc | 10 +- lib/pcg/test/src/pcg/machine_specification.cc | 17 +- lib/pcg/test/src/pcg/machine_view.cc | 160 ++--- lib/pcg/test/src/pcg/operator_task_space.cc | 28 +- .../parallel_computation_graph.cc | 38 +- .../parallel_computation_graph_builder.cc | 155 ++-- .../src/pcg/start_invariant_machine_view.cc | 56 +- .../apply_substitution/apply_substitution.h | 31 + .../evaluate_substitution_output.h | 6 +- .../output_expr_to_result_sub_pcg_mapping.h | 6 +- ...expr_to_result_sub_pcg_mapping.struct.toml | 0 .../perform_shape_inference.h | 4 +- .../substitutions/constraint_type.enum.toml | 3 + .../operator_pattern/get_attribute_map.h | 15 + .../operator_attribute_constraint.h | 2 + .../operator_attribute_key.enum.toml | 1 + .../operator_pattern/operator_attribute_key.h | 12 + ...operator_attribute_list_access.struct.toml | 5 +- .../operator_attribute_value.variant.toml | 14 +- .../output_graph/output_graph_expr.h | 5 + .../output_graph/output_graph_expr_value.h | 16 + .../output_graph_expr_value.variant.toml | 19 + .../output_operator_attrs_assignment.h | 3 + ...tput_operator_attrs_assignment.struct.toml | 7 +- .../include/substitutions/pcg_pattern.h | 2 + .../include/substitutions/pcg_pattern_match.h | 4 +- .../sub_parallel_computation_graph_edge.h | 2 +- .../include/substitutions/substitution.h | 25 +- .../substitutions/substitution_builder.h | 49 ++ .../tensor_attribute_list_access.struct.toml | 5 +- .../tensor_pattern/tensor_attribute_pattern.h | 3 + .../tensor_attribute_value.variant.toml | 5 +- .../substitutions/unity_substitution_set.h | 47 ++ .../unlabelled/input_pattern_edge.h | 2 +- .../unlabelled/pattern_matching.h | 10 +- .../unlabelled/pattern_node_output.h | 2 +- .../unlabelled/standard_pattern_edge.h | 4 +- .../apply_substitution/apply_substitution.cc | 165 +++++ .../evaluate_substitution_output.cc | 4 +- .../output_expr_to_result_sub_pcg_mapping.cc | 2 +- .../perform_shape_inference.cc | 2 +- .../operator_pattern/eval_list_access.cc | 21 +- .../operator_pattern/eval_list_size.cc | 5 +- .../operator_pattern/get_attribute.cc | 156 ++-- .../operator_pattern/get_attribute_map.cc | 25 + .../operator_attribute_constraint.cc | 10 + .../operator_attribute_key.cc | 68 ++ .../materialize_operator_from_attrs_map.cc | 27 +- .../output_graph/output_graph_expr.cc | 18 + .../output_graph/output_graph_expr_value.cc | 30 + .../output_operator_attrs_assignment.cc | 41 +- .../src/substitutions/pcg_pattern.cc | 18 + .../sub_parallel_computation_graph.cc | 55 +- .../sub_parallel_computation_graph_edge.cc | 2 +- .../src/substitutions/substitution.cc | 301 ++++---- .../src/substitutions/substitution_builder.cc | 162 +++++ .../tensor_pattern/eval_list_access.cc | 5 +- .../tensor_pattern/eval_list_size.cc | 5 +- .../tensor_pattern/get_attribute.cc | 10 +- .../tensor_attribute_pattern.cc | 16 + .../substitutions/unity_substitution_set.cc | 235 ++++++ .../unlabelled/input_pattern_edge.cc | 2 +- .../unlabelled/pattern_node_output.cc | 2 +- .../unlabelled/standard_pattern_edge.cc | 4 +- 
.../apply_substitution/apply_substitution.cc | 174 +++++ .../evaluate_substitution_output.cc | 63 +- .../perform_shape_inference.cc | 63 +- .../operator_pattern/get_attribute.cc | 2 +- .../test/src/substitutions/pcg_pattern.cc | 14 +- .../test/src/substitutions/substitution.cc | 345 ++++----- .../src/substitutions/substitution_builder.cc | 145 ++++ .../substitutions/unity_substitution_set.cc | 20 + .../unlabelled/find_pattern_matches.cc} | 29 +- .../unlabelled/pattern_matching.cc | 210 ++++++ .../substitutions/unlabelled/pattern_split.cc | 8 +- .../unlabelled/unlabelled_graph_pattern.cc | 4 +- .../test/src/test_substitution.cc | 148 ---- .../algorithms/bidict_from_enumerating.h | 14 +- .../utils/cli/cli_flag_key.struct.toml | 6 +- .../cli_positional_argument_key.struct.toml | 6 +- lib/utils/include/utils/containers/at_idx.h | 5 +- .../include/utils/containers/enumerate.h | 16 +- .../utils/containers/enumerate_vector.h | 11 +- lib/utils/include/utils/containers/flatmap.h | 15 +- .../get_all_permutations_with_repetition.h | 10 +- lib/utils/include/utils/containers/make.h | 13 + .../include/utils/containers/merge_maps.h | 60 +- .../utils/containers/merge_method.enum.toml | 17 + lib/utils/include/utils/containers/product.h | 2 +- lib/utils/include/utils/containers/repeat.h | 5 +- .../include/utils/containers/repeat_element.h | 22 + .../include/utils/containers/replicate.h | 15 - lib/utils/include/utils/containers/sum.h | 2 +- .../algorithms/view_as_open_dataflow_graph.h | 34 + .../dataflow_edge_query.struct.toml | 5 +- .../graph/dataflow_graph/dataflow_graph.h | 3 +- .../dataflow_graph/dataflow_input.struct.toml | 3 +- .../dataflow_output.struct.toml | 3 +- .../dataflow_output_query.struct.toml | 6 +- .../graph/dataflow_graph/i_dataflow_graph.h | 2 +- .../instances/unordered_set_dataflow_graph.h | 4 +- ...ordered_set_labelled_open_dataflow_graph.h | 7 +- .../algorithms/get_graph_data.h | 1 + .../algorithms/permute_node_ids.h | 1 + .../graph/multidigraph/algorithms/add_nodes.h | 3 +- .../algorithms/are_isomorphic.h | 13 + .../open_dataflow_graph_isomorphism.h | 21 + .../dataflow_input_edge_query.struct.toml | 3 +- .../i_open_dataflow_graph.h | 2 +- .../open_dataflow_graph/open_dataflow_edge.h | 2 +- .../open_dataflow_graph/open_dataflow_graph.h | 2 +- .../unordered_set_open_dataflow_graph.h | 2 +- lib/utils/include/utils/graph/render_dot.h | 19 + .../include/utils/nonnegative_int/ceildiv.h | 11 + .../utils/nonnegative_int/nonnegative_int.h | 28 +- .../utils/nonnegative_int/nonnegative_range.h | 14 + .../utils/nonnegative_int/num_elements.h | 17 + lib/utils/include/utils/variant.h | 1 + .../algorithms/bidict_from_enumerating.cc | 13 + lib/utils/src/utils/cli/cli_parse.cc | 6 +- lib/utils/src/utils/cli/cli_spec.cc | 15 +- lib/utils/src/utils/containers/at_idx.cc | 9 + lib/utils/src/utils/containers/enumerate.cc | 11 + .../src/utils/containers/enumerate_vector.cc | 9 + lib/utils/src/utils/containers/make.cc | 8 + lib/utils/src/utils/containers/range.cc | 1 + lib/utils/src/utils/containers/repeat.cc | 10 + .../src/utils/containers/repeat_element.cc | 10 + lib/utils/src/utils/containers/replicate.cc | 1 - .../utils/graph/dataflow_graph/algorithms.cc | 2 +- .../graph/dataflow_graph/algorithms/as_dot.cc | 41 +- .../get_dataflow_edges_from_node_to_node.cc | 4 +- .../algorithms/get_incoming_edges.cc | 8 +- .../algorithms/get_outgoing_edges.cc | 8 +- .../algorithms/get_subgraph_incoming_edges.cc | 4 +- .../algorithms/get_subgraph_outgoing_edges.cc | 4 +- .../algorithms/view_as_open_dataflow_graph.cc | 22 
+- .../algorithms/view_as_open_dataflow_graph.h | 32 - .../dataflow_graph/dataflow_edge_query.cc | 20 +- .../graph/dataflow_graph/dataflow_graph.cc | 2 +- .../dataflow_graph/dataflow_output_query.cc | 6 +- .../dataflow_graph/i_dataflow_graph_view.cc | 4 +- .../digraph/algorithms/transitive_closure.cc | 5 +- .../algorithms/transitive_reduction.cc | 4 +- .../instances/unordered_set_dataflow_graph.cc | 10 +- .../multidigraph/algorithms/add_nodes.cc | 2 +- .../algorithms/are_isomorphic.cc | 11 + .../open_dataflow_graph/algorithms/as_dot.cc | 11 +- .../algorithms/get_incoming_edges.cc | 6 +- .../algorithms/get_subgraph_incoming_edges.cc | 6 +- .../open_dataflow_graph_isomorphism.cc | 54 ++ .../dataflow_input_edge_query.cc | 10 +- .../open_dataflow_graph/open_dataflow_edge.cc | 2 +- .../open_dataflow_graph.cc | 2 +- .../unordered_set_open_dataflow_graph.cc | 2 +- lib/utils/src/utils/graph/render_dot.cc | 90 +++ .../src/utils/nonnegative_int/ceildiv.cc | 20 + .../utils/nonnegative_int/nonnegative_int.cc | 79 +- .../nonnegative_int/nonnegative_range.cc | 19 + .../src/utils/nonnegative_int/num_elements.cc | 10 + lib/utils/test/src/main.cc | 2 - .../algorithms/bidict_from_enumerating.cc | 19 +- lib/utils/test/src/utils/cli/cli_parse.cc | 34 +- lib/utils/test/src/utils/containers/at_idx.cc | 29 + .../test/src/utils/containers/enumerate.cc | 33 +- .../src/utils/containers/enumerate_vector.cc | 33 + .../test/src/utils/containers/flatmap.cc | 32 + .../get_all_permutations_with_repetition.cc | 8 +- lib/utils/test/src/utils/containers/make.cc | 15 + .../test/src/utils/containers/merge_maps.cc | 78 +- .../test/src/utils/containers/product.cc | 20 + lib/utils/test/src/utils/containers/repeat.cc | 2 +- .../{replicate.cc => repeat_element.cc} | 9 +- .../utils/graph/dataflow_graph/algorithms.cc | 14 +- .../dataflow_graphs_are_isomorphic.cc | 24 +- .../algorithms/find_isomorphism.cc | 24 +- .../get_dataflow_edges_from_node_to_node.cc | 26 +- .../algorithms/get_incoming_edges.cc | 14 +- .../algorithms/get_outgoing_edges.cc | 28 +- .../algorithms/get_subgraph_incoming_edges.cc | 14 +- .../algorithms/get_subgraph_outgoing_edges.cc | 12 +- ...sitive_reduced_boundary_nodes_for_split.cc | 8 +- ...t_transitive_reduced_edges_across_split.cc | 34 +- ...transitive_reduced_outputs_across_split.cc | 8 +- .../unordered_open_dataflow_graph.cc | 8 +- .../multidigraph/algorithms/add_edges.cc | 2 +- .../multidigraph/algorithms/add_nodes.cc | 2 +- .../multidigraph/algorithms/get_edges.cc | 2 +- .../algorithms/find_isomorphism.cc | 23 +- .../get_open_dataflow_graph_inputs.cc | 2 +- .../get_open_dataflow_value_uses.cc | 20 +- .../get_unused_open_dataflow_graph_inputs.cc | 4 +- .../open_dataflow_graphs_are_isomorphic.cc | 23 +- .../algorithms/permute_input_ids.cc | 14 +- .../algorithms/permute_node_ids.cc | 28 +- .../series_parallel/parallel_reduction.cc | 14 +- .../graph/series_parallel/series_reduction.cc | 18 +- .../test/src/utils/nonnegative_int/ceildiv.cc | 52 ++ .../utils/nonnegative_int/nonnegative_int.cc | 90 ++- .../nonnegative_int/nonnegative_range.cc | 42 ++ .../src/utils/nonnegative_int/num_elements.cc | 15 + lib/utils/test/src/utils/random_utils.cc | 6 +- 425 files changed, 7351 insertions(+), 5065 deletions(-) create mode 100644 lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml delete mode 100644 lib/local-execution/include/local-execution/legion_tensor_shape.h delete mode 100644 lib/local-execution/src/legion_tensor_shape.cc delete mode 100644 
lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml create mode 100644 lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml create mode 100644 lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.h (76%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.h (62%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.struct.toml (100%) rename lib/substitutions/include/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.h (85%) create mode 100644 lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h create mode 100644 lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h create mode 100644 lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h create mode 100644 lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml create mode 100644 lib/substitutions/include/substitutions/substitution_builder.h create mode 100644 lib/substitutions/include/substitutions/unity_substitution_set.h create mode 100644 lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.cc (96%) rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/output_expr_to_result_sub_pcg_mapping.cc (93%) rename lib/substitutions/src/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.cc (95%) create mode 100644 lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc create mode 100644 lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc create mode 100644 lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc create mode 100644 lib/substitutions/src/substitutions/substitution_builder.cc create mode 100644 lib/substitutions/src/substitutions/unity_substitution_set.cc create mode 100644 lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc rename lib/substitutions/test/src/substitutions/{substitution_internal => apply_substitution}/evaluate_substitution_output.cc (86%) rename lib/substitutions/test/src/substitutions/{substitution_internal => apply_substitution}/perform_shape_inference.cc (78%) create mode 100644 lib/substitutions/test/src/substitutions/substitution_builder.cc create mode 100644 lib/substitutions/test/src/substitutions/unity_substitution_set.cc rename lib/substitutions/test/src/{test_pattern_matches.cc => substitutions/unlabelled/find_pattern_matches.cc} (94%) create mode 100644 lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc delete mode 100644 lib/substitutions/test/src/test_substitution.cc create mode 100644 lib/utils/include/utils/containers/make.h create mode 100644 lib/utils/include/utils/containers/merge_method.enum.toml create mode 100644 lib/utils/include/utils/containers/repeat_element.h delete mode 100644 lib/utils/include/utils/containers/replicate.h create mode 100644 lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h create mode 100644 
lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h create mode 100644 lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h create mode 100644 lib/utils/include/utils/graph/render_dot.h create mode 100644 lib/utils/include/utils/nonnegative_int/ceildiv.h create mode 100644 lib/utils/include/utils/nonnegative_int/nonnegative_range.h create mode 100644 lib/utils/include/utils/nonnegative_int/num_elements.h create mode 100644 lib/utils/src/utils/containers/make.cc create mode 100644 lib/utils/src/utils/containers/repeat_element.cc delete mode 100644 lib/utils/src/utils/containers/replicate.cc delete mode 100644 lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h create mode 100644 lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc create mode 100644 lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc create mode 100644 lib/utils/src/utils/graph/render_dot.cc create mode 100644 lib/utils/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/src/utils/nonnegative_int/nonnegative_range.cc create mode 100644 lib/utils/src/utils/nonnegative_int/num_elements.cc delete mode 100644 lib/utils/test/src/main.cc create mode 100644 lib/utils/test/src/utils/containers/at_idx.cc create mode 100644 lib/utils/test/src/utils/containers/enumerate_vector.cc create mode 100644 lib/utils/test/src/utils/containers/make.cc rename lib/utils/test/src/utils/containers/{replicate.cc => repeat_element.cc} (69%) create mode 100644 lib/utils/test/src/utils/nonnegative_int/ceildiv.cc create mode 100644 lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc create mode 100644 lib/utils/test/src/utils/nonnegative_int/num_elements.cc diff --git a/bin/export-model-arch/src/export_model_arch.cc b/bin/export-model-arch/src/export_model_arch.cc index 64419acce4..a9f6c65b86 100644 --- a/bin/export-model-arch/src/export_model_arch.cc +++ b/bin/export-model-arch/src/export_model_arch.cc @@ -13,6 +13,7 @@ #include "utils/cli/cli_parse.h" #include "utils/cli/cli_parse_result.h" #include "utils/cli/cli_spec.h" +#include "utils/graph/open_dataflow_graph/algorithms/as_dot.h" #include "utils/graph/series_parallel/binary_sp_decomposition_tree/right_associative_binary_sp_tree_from_nary.h" #include "utils/graph/series_parallel/get_series_parallel_decomposition.h" @@ -21,11 +22,11 @@ using namespace ::FlexFlow; ComputationGraph get_single_operator_computation_graph() { ComputationGraphBuilder b; - size_t batch_size = 8; - size_t in_channels = 16; - size_t out_channels = 12; + nonnegative_int batch_size = 8_n; + nonnegative_int in_channels = 16_n; + nonnegative_int out_channels = 12_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, in_channels, out_channels, @@ -69,7 +70,7 @@ tl::expected } else if (model_name == "bert") { return get_bert_computation_graph(get_default_bert_config()); } else if (model_name == "split_test") { - int batch_size = 8; + nonnegative_int batch_size = 8_n; return get_split_test_computation_graph(batch_size); } else if (model_name == "single_operator") { return get_single_operator_computation_graph(); @@ -100,10 +101,10 @@ tl::expected result.value(); }); - std::pair> v1_result = - to_v1_including_node_numbering(computation_graph); + std::pair> + v1_result = to_v1_including_node_numbering(computation_graph); V1ComputationGraph v1_cg = v1_result.first; - bidict layer_numbering = 
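// ---- [editor's sketch, not part of the patch] ----
// The `8_n` / `16_n` spellings in this hunk come from a user-defined
// literal for nonnegative_int. The real definition lives in
// lib/utils/include/utils/nonnegative_int/nonnegative_int.h; the
// constructor and its checking behavior below are assumptions, not
// confirmed by this patch (only unwrap_nonnegative() is visible in
// later hunks):
//
//   #include <cassert>
//
//   class nonnegative_int {
//   public:
//     explicit nonnegative_int(int v) : value_(v) {
//       assert(v >= 0); // reject negative values at construction
//     }
//     int unwrap_nonnegative() const { return value_; }
//   private:
//     int value_;
//   };
//
//   nonnegative_int operator""_n(unsigned long long v) {
//     return nonnegative_int{static_cast<int>(v)};
//   }
//
// With this in scope, `nonnegative_int batch_size = 8_n;` reads like
// the plain-int code it replaces while making the non-negativity
// invariant explicit in the type.
// ---- [end sketch] ----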
v1_result.second; + bidict layer_numbering = v1_result.second; V1BinarySPDecomposition v1_sp_decomposition = to_v1(sp_decomposition, layer_numbering); diff --git a/cmake/flexflow-utils.cmake b/cmake/flexflow-utils.cmake index 7ba39e92c9..515a249521 100644 --- a/cmake/flexflow-utils.cmake +++ b/cmake/flexflow-utils.cmake @@ -20,6 +20,7 @@ function(define_ff_vars target) MAX_TENSOR_DIM=${FF_MAX_DIM} MAX_NUM_TASK_REGIONS=${FF_MAX_NUM_TASK_REGIONS} MAX_NUM_TASK_ARGUMENTS=${FF_MAX_NUM_TASK_ARGUMENTS} + # _FORTIFY_SOURCE=0 ) if (FF_GPU_BACKEND STREQUAL "cuda") @@ -39,7 +40,18 @@ function(ff_set_cxx_properties target) CXX_EXTENSIONS NO ) target_compile_options(${target} - PRIVATE $<$:> "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." # add C++ compile flags here + PUBLIC + $<$:> + "-ffile-prefix-map=${CMAKE_SOURCE_DIR}=." + "-fsanitize=undefined" + "-fno-sanitize-recover=all" + # add C++ compile flags here + ) + target_link_options(${target} + PUBLIC + $<$:> + "-fsanitize=undefined" + "-fno-sanitize-recover=all" ) endfunction() diff --git a/flake.nix b/flake.nix index 91651bd0c1..e4644ef727 100644 --- a/flake.nix +++ b/flake.nix @@ -38,9 +38,15 @@ }; lib = pkgs.lib; - mkShell = pkgs.mkShell.override { + mkShell = attrs: pkgs.mkShell.override { stdenv = pkgs.cudaPackages.backendStdenv; - }; + } (attrs // { + hardeningDisable = ["all"]; # disable nixpkgs default compiler arguments, otherwise ubsan doesn't catch + # signed overflows due to the signedoverflow hardening setting. + # for more details, see the following (long-running) nixpkgs github issues: + # - https://github.com/NixOS/nixpkgs/issues/18995 + # - https://github.com/NixOS/nixpkgs/issues/60919 + }); proj = proj-repo.packages.${system}.proj; in @@ -121,6 +127,8 @@ gpu-ci = mkShell { inputsFrom = [ ci ]; + hardeningDisable = [ "all" ]; + buildInputs = builtins.concatLists [ (with nixGL.packages.${system}; [ nixGLDefault @@ -135,6 +143,8 @@ "${proj-repo.packages.${system}.proj-nvim}" ]; + hardeningDisable = [ "all" ]; + buildInputs = builtins.concatLists [ (with pkgs; [ clang-tools diff --git a/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h b/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h index fdc80a1e37..8a7c467303 100644 --- a/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h +++ b/lib/compiler/include/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.h @@ -36,8 +36,9 @@ bool is_right_associative(ComputationGraphBinarySPDecomposition const &); std::unordered_multiset get_layers(ComputationGraphBinarySPDecomposition const &); -V1BinarySPDecomposition to_v1(ComputationGraphBinarySPDecomposition const &, - bidict const &layer_numbering); +V1BinarySPDecomposition + to_v1(ComputationGraphBinarySPDecomposition const &, + bidict const &layer_numbering); } // namespace FlexFlow diff --git a/lib/compiler/src/compiler/allowed_machine_views.cc b/lib/compiler/src/compiler/allowed_machine_views.cc index db7477b460..6f86d1d82a 100644 --- a/lib/compiler/src/compiler/allowed_machine_views.cc +++ b/lib/compiler/src/compiler/allowed_machine_views.cc @@ -11,12 +11,15 @@ #include "utils/containers/map_from_keys_and_values.h" #include "utils/containers/product.h" #include "utils/containers/range.h" -#include "utils/containers/replicate.h" +#include "utils/containers/repeat_element.h" #include "utils/containers/sorted.h" #include 
"utils/containers/transform.h" #include "utils/containers/unordered_multiset_of.h" #include "utils/containers/unordered_set_of.h" #include "utils/containers/zip.h" +#include "utils/nonnegative_int/ceildiv.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -47,24 +50,29 @@ static std::unordered_set OperatorTaskSpace const &task, DeviceType const &device_type) { - auto get_max_stride_upper_bound = [](std::vector const &tensor_dims, - int total_devices) -> int { - int min_num_devices_with_full_stride_volume = product(transform( - tensor_dims, [](int const &num_devices) { return num_devices - 1; })); - return std::ceil(total_devices / min_num_devices_with_full_stride_volume); + auto get_max_stride_upper_bound = + [](std::vector const &tensor_dims, + nonnegative_int total_devices) -> nonnegative_int { + nonnegative_int min_num_devices_with_full_stride_volume = + product(transform(tensor_dims, [](nonnegative_int num_devices) { + return nonnegative_int{num_devices.unwrap_nonnegative() - 1}; + })); + return ceildiv(total_devices, min_num_devices_with_full_stride_volume); }; - auto candidate_strides = [&](std::vector const &tensor_dims, - int total_devices) + auto candidate_strides = [&](std::vector const &tensor_dims, + nonnegative_int total_devices) -> std::unordered_multiset { - int max_stride_upper_bound = + nonnegative_int max_stride_upper_bound = get_max_stride_upper_bound(tensor_dims, total_devices); std::vector single_stride_range = - transform(range(1, max_stride_upper_bound + 1), - [](int stride) { return stride_t{stride}; }); + transform(nonnegative_range(1_n, max_stride_upper_bound + 1_n), + [](nonnegative_int stride) { return stride_t{stride}; }); std::unordered_multiset> raw_stride_vectors = - cartesian_product(replicate(tensor_dims.size(), single_stride_range)); + cartesian_product( + repeat_element(/*num_times=*/num_elements(tensor_dims), + /*element=*/single_stride_range)); std::unordered_multiset strides = transform(raw_stride_vectors, [](auto const &stride_vec) { return MultiDimensionalStride{stride_vec}; @@ -75,8 +83,9 @@ static std::unordered_set auto candidate_starts = [](MachineSpecification const &ms, DeviceType const &device_type) { std::unordered_set result; - for (int node_idx : range(ms.num_nodes)) { - for (int device_idx : range(get_num_devices_per_node(ms, device_type))) { + for (nonnegative_int node_idx : nonnegative_range(ms.num_nodes)) { + for (nonnegative_int device_idx : + nonnegative_range(get_num_devices_per_node(ms, device_type))) { result.insert( MachineSpaceCoordinate{node_idx, device_idx, device_type}); } @@ -91,8 +100,8 @@ static std::unordered_set return get_all_permutations_with_repetition(options, num_dims(task)); }; - std::vector tensor_dims = task.degrees; - int total_devices = get_num_devices(machine_spec, device_type); + std::vector tensor_dims = task.degrees; + nonnegative_int total_devices = get_num_devices(machine_spec, device_type); std::unordered_set machine_views; diff --git a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc index 5126d9687e..bb9d54f1e9 100644 --- a/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -11,8 +11,9 @@ std::unordered_set> for (int i = 1; i < resource.num_nodes; i *= 2) { MachineSpecification sub_resource1 = 
resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_nodes = i; - sub_resource2.num_nodes = resource.num_nodes - i; + sub_resource1.num_nodes = nonnegative_int{i}; + sub_resource2.num_nodes = + nonnegative_int{resource.num_nodes.unwrap_nonnegative() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } @@ -20,8 +21,9 @@ std::unordered_set> for (int i = 1; i < resource.num_gpus_per_node; i *= 2) { MachineSpecification sub_resource1 = resource; MachineSpecification sub_resource2 = resource; - sub_resource1.num_gpus_per_node = i; - sub_resource2.num_gpus_per_node = resource.num_gpus_per_node - i; + sub_resource1.num_gpus_per_node = nonnegative_int{i}; + sub_resource2.num_gpus_per_node = + nonnegative_int{resource.num_gpus_per_node.unwrap_nonnegative() - i}; result.insert(std::make_pair(sub_resource1, sub_resource2)); result.insert(std::make_pair(sub_resource2, sub_resource1)); } diff --git a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc index fc3a58995c..82c8274808 100644 --- a/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/machine_mapping.cc @@ -1,20 +1,14 @@ #include "compiler/machine_mapping/machine_mapping.h" -#include "pcg/machine_specification.h" -#include "pcg/machine_view.h" -#include "pcg/operator_task_space.dtg.h" -#include "pcg/operator_task_space.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph.h" #include "utils/containers/are_disjoint.h" -#include "utils/containers/get_one_of.h" #include "utils/containers/keys.h" -#include "utils/containers/map_values.h" #include "utils/containers/merge_maps.h" namespace FlexFlow { MachineMapping combine_disjoint_mappings(MachineMapping const &m1, MachineMapping const &m2) { - return MachineMapping{merge_maps(m1.machine_views, m2.machine_views)}; + return MachineMapping{ + merge_disjoint_maps(m1.machine_views, m2.machine_views)}; } bool nodes_are_disjoint(MachineMapping const &m1, MachineMapping const &m2) { diff --git a/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc b/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc index 715a4c2e3d..ed60004bf4 100644 --- a/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc +++ b/lib/compiler/src/compiler/machine_mapping/parallel_layer_guid_oblivious_machine_mapping.cc @@ -10,8 +10,8 @@ ParallelLayerGuidObliviousMachineMapping binary_combine_mappings( ParallelLayerGuidObliviousMachineMapping const &lhs, ParallelLayerGuidObliviousMachineMapping const &rhs) { return ParallelLayerGuidObliviousMachineMapping{ - merge_maps(map_keys(lhs.raw_mapping, nest_inside_left_child), - map_keys(rhs.raw_mapping, nest_inside_right_child)), + merge_disjoint_maps(map_keys(lhs.raw_mapping, nest_inside_left_child), + map_keys(rhs.raw_mapping, nest_inside_right_child)), }; } diff --git a/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc b/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc index 32fb53b58a..9886468386 100644 --- a/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc +++ b/lib/compiler/src/compiler/series_parallel/computation_graph/computation_graph_binary_sp_decomposition.cc @@ 
-164,7 +164,7 @@ std::unordered_multiset V1BinarySPDecomposition to_v1(ComputationGraphBinarySPDecomposition const &tree, - bidict const &layer_numbering) { + bidict const &layer_numbering) { return tree.visit( overload{[&](ComputationGraphBinarySeriesSplit const &series) { return V1BinarySPDecomposition{ diff --git a/lib/compiler/test/src/allowed_machine_views.cc b/lib/compiler/test/src/allowed_machine_views.cc index 936894ad2d..817cc80700 100644 --- a/lib/compiler/test/src/allowed_machine_views.cc +++ b/lib/compiler/test/src/allowed_machine_views.cc @@ -15,39 +15,39 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("1 degree of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; std::unordered_set correct = { MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}, }, MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, }, }; @@ -61,18 +61,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("2 degrees of parallelism") { MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/3, - /*num_cpus_per_node=*/3, - /*num_gpus_per_node=*/3, + /*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; - OperatorTaskSpace task = OperatorTaskSpace{{2, 3}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 3_n}}; - auto make_2d_view = [&](int start_node_idx, - int start_device_idx, - int stride1, - int stride2, + auto make_2d_view = [&](nonnegative_int start_node_idx, + nonnegative_int start_device_idx, + nonnegative_int stride1, + nonnegative_int stride2, MachineSpecificationDimension m1, MachineSpecificationDimension m2) { return MachineView{ @@ -86,13 +86,19 @@ TEST_SUITE(FF_TEST_SUITE) { auto intra = MachineSpecificationDimension::INTRA_NODE; auto inter = MachineSpecificationDimension::INTER_NODE; std::unordered_set correct = { - make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), - make_2d_view(1, 0, /*stride1=*/1, /*stride2=*/1, inter, intra), - make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, inter, intra), - - make_2d_view(0, 0, /*stride1=*/1, /*stride2=*/1, intra, inter), - make_2d_view(0, 1, /*stride1=*/1, /*stride2=*/1, intra, inter), - make_2d_view(0, 0, /*stride1=*/2, /*stride2=*/1, intra, inter), + make_2d_view( + 0_n, 0_n, 
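// ---- [editor's sketch, not part of the patch] ----
// Reading the expected set in this test: a 1-D MachineView with start
// device index s and stride k places task i (0 <= i < degree) on device
// s + i*k, so for INTRA_NODE placement it fits iff
// s + (degree - 1)*k < num_gpus_per_node. For degree 3 on a 5-GPU node
// that admits exactly the four views listed: stride 1 with s in
// {0, 1, 2}, and stride 2 with s = 0 (devices 0, 2, 4). The placement
// rule here is inferred from the test data, not stated by the patch;
// the helper name below is illustrative:
//
//   bool view_fits_on_node(int start, int stride, int degree,
//                          int devices_per_node) {
//     // the last task's device index must stay on the node
//     return start + (degree - 1) * stride < devices_per_node;
//   }
// ---- [end sketch] ----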
/*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + make_2d_view( + 1_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, inter, intra), + make_2d_view( + 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, inter, intra), + + make_2d_view( + 0_n, 0_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + make_2d_view( + 0_n, 1_n, /*stride1=*/1_n, /*stride2=*/1_n, intra, inter), + make_2d_view( + 0_n, 0_n, /*stride1=*/2_n, /*stride2=*/1_n, intra, inter), }; std::unordered_set result = diff --git a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc index 5c8ea1c0f1..b0d86124a1 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/abstracted_tensor_set_movement/get_abstracted_tensor_set_movement_across_split.cc @@ -28,12 +28,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc index 499b111f8f..5f4ba2bfdc 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_machine_resource_splits.cc @@ -8,10 +8,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_machine_resource_splits") { - auto make_machine_spec = [](int num_nodes, int num_gpus_per_node) { + auto make_machine_spec = [](nonnegative_int num_nodes, + nonnegative_int num_gpus_per_node) { return MachineSpecification{ /*num_nodes=*/num_nodes, - /*num_cpus_per_node=*/1, + /*num_cpus_per_node=*/1_n, /*num_gpus_per_node=*/num_gpus_per_node, /*inter_node_bandwidth=*/1.0, /*intra_node_bandwidth=*/1.0, @@ -19,8 +20,8 @@ TEST_SUITE(FF_TEST_SUITE) { }; SUBCASE("returns no splits if no splits are possible") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1); + MachineSpecification input = make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n); std::unordered_set> result = get_machine_resource_splits(input); @@ -32,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "returns splits in gpu and node dimensions, but not at the same time") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/2); + MachineSpecification input = make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/2_n); std::unordered_set> result = get_machine_resource_splits(input); @@ -41,16 +42,16 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set> correct = { { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + 
make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, }; @@ -60,8 +61,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in node dimension in powers of two") { SUBCASE("num_nodes is a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/8, - /*num_gpus_per_node=*/1); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/8_n, + /*num_gpus_per_node=*/1_n); std::unordered_set< std::pair> @@ -71,34 +73,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/7, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/7_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/6, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/6, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/7, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/7_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -106,8 +108,9 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_nodes is not a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/6, - /*num_gpus_per_node=*/1); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/6_n, + /*num_gpus_per_node=*/1_n); std::unordered_set< std::pair> @@ -117,28 +120,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/5, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/5_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/4, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/2, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/4_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/2_n, + /*num_gpus_per_node=*/1_n), }, { - make_machine_spec(/*num_nodes=*/5, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/5_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -148,8 +151,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns splits in gpu dimension in powers of two") { 
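// ---- [editor's sketch, not part of the patch] ----
// The split enumeration behind these expectations (from the
// get_machine_resource_splits.cc hunk earlier in this patch): for each
// dimension it walks i = 1, 2, 4, ... while i < n and emits both
// (i, n - i) and (n - i, i). Worked example for num_nodes = 6:
//
//   for (int i = 1; i < 6; i *= 2) { emit(i, 6 - i); emit(6 - i, i); }
//   // i = 1 -> (1, 5) and (5, 1)
//   // i = 2 -> (2, 4) and (4, 2)
//   // i = 4 -> (4, 2) and (2, 4)   (duplicates collapse in the set)
//
// which is exactly the four node-dimension pairs the subcase checks.
// Note the resulting splits themselves need not be powers of two; only
// the first component of each generating pair is.
// ---- [end sketch] ----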
SUBCASE("num_gpus_per_node is a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/8); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/8_n); std::unordered_set< std::pair> @@ -159,34 +163,34 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/7), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/7_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/7), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/7_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; @@ -194,8 +198,9 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_gpus_per_node is not a power of 2") { - MachineSpecification input = make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/6); + MachineSpecification input = + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/6_n); std::unordered_set< std::pair> @@ -205,28 +210,28 @@ TEST_SUITE(FF_TEST_SUITE) { std::pair> correct = { { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/5), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/5_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/4), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/2), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/4_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/2_n), }, { - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/5), - make_machine_spec(/*num_nodes=*/1, - /*num_gpus_per_node=*/1), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/5_n), + make_machine_spec(/*num_nodes=*/1_n, + /*num_gpus_per_node=*/1_n), }, }; } diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc index 542edd9fa9..c5b891781d 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc +++ 
b/lib/compiler/test/src/compiler/machine_mapping/get_optimal_machine_mapping.cc @@ -45,14 +45,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -60,31 +60,31 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec = MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -121,8 +121,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc index 52ad82595d..642fdf7ae1 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/get_tensor_set_movement_across_split.cc @@ -30,12 +30,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -66,14 +66,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView pre_mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -81,14 +81,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView pre_mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -96,14 +96,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView post_mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{3}, + stride_t{3_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -111,14 +111,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView post_mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - 
/*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc index 304034f9be..e88b714bd4 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping.cc @@ -9,14 +9,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("combine_disjoint_mappings(MachineMapping, MachineMappping)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -24,14 +24,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -55,14 +55,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("nodes_are_disjoint(MachineMapping, MachineMappping)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -70,14 +70,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc index 06ab1e5b8c..a8ec24de63 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_problem_tree/get_machine_mapping_problem_tree.cc @@ -65,11 +65,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 1}, + ShardParallelDim{10_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc index 73b921fc98..4a261bcdae 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/machine_mapping_result.cc @@ -8,14 +8,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("series_combine") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, 
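// ---- [editor's sketch, not part of the patch] ----
// combine_disjoint_mappings, exercised by this test, now delegates to
// merge_disjoint_maps per the machine_mapping.cc hunk earlier in this
// patch. A minimal sketch of the assumed semantics -- the union of two
// maps whose key sets must not overlap (the error behavior on overlap
// is an assumption; the real helper lives in
// lib/utils/include/utils/containers/merge_maps.h):
//
//   #include <cassert>
//   #include <unordered_map>
//
//   template <typename K, typename V>
//   std::unordered_map<K, V>
//       merge_disjoint_maps(std::unordered_map<K, V> const &lhs,
//                           std::unordered_map<K, V> const &rhs) {
//     std::unordered_map<K, V> result = lhs;
//     for (auto const &[k, v] : rhs) {
//       bool inserted = result.insert({k, v}).second;
//       assert(inserted && "maps passed to merge_disjoint_maps overlap");
//     }
//     return result;
//   }
// ---- [end sketch] ----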
/*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -23,14 +23,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -189,14 +189,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("parallel_combine") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -204,14 +204,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -312,14 +312,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("minimize_runtime") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -327,14 +327,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc index 8612017705..313f24c384 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/get_optimal_machine_mapping_with_memory.cc @@ -45,14 +45,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -60,31 +60,31 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView mv2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, }; MachineSpecification full_machine_spec = MachineSpecification{ - /*num_nodes=*/2, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; MachineSpecification split_machine_spec 
= MachineSpecification{ - /*num_nodes=*/1, - /*num_cpus_per_node=*/1, - /*num_gpus_per_node=*/1, + /*num_nodes=*/1_n, + /*num_cpus_per_node=*/1_n, + /*num_gpus_per_node=*/1_n, /*inter_node_bandwidth=*/1, /*intra_node_bandwidth=*/1, }; @@ -121,8 +121,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc index 1f3b7545a8..04149cae8f 100644 --- a/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc +++ b/lib/compiler/test/src/compiler/machine_mapping/memory_optimization/machine_mapping_result_with_memory.cc @@ -9,14 +9,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("remove_non_pareto_optimal_machine_mapping_result") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -24,14 +24,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -39,14 +39,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -55,19 +55,19 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics cost1 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; OpCostMetrics cost2 = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; OpCostMetrics cost3 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}, + /*memory=*/3_n, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ @@ -159,14 +159,14 @@ TEST_SUITE(FF_TEST_SUITE) { "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -174,14 +174,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -190,7 +190,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics pre_cost = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + 
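// ---- [editor's sketch, not part of the patch] ----
// The three costs in this test are chosen so that cost3 (runtime 2.0,
// memory 3) is dominated by cost1 (runtime 2.0, memory 2): no worse in
// any dimension and strictly better in one, so the Pareto filter should
// drop it while keeping the runtime/memory trade-off between cost1 and
// cost2. A sketch of the dominance test being exercised (field names
// follow OpCostMetrics from this patch; the exact comparison, and
// ordered comparison on nonnegative_int, are assumptions):
//
//   bool dominates(OpCostMetrics const &a, OpCostMetrics const &b) {
//     // a dominates b if a is <= b everywhere and < somewhere
//     bool no_worse = a.forward_runtime <= b.forward_runtime &&
//                     a.backward_runtime <= b.backward_runtime &&
//                     a.memory <= b.memory;
//     bool strictly_better = a.forward_runtime < b.forward_runtime ||
//                            a.backward_runtime < b.backward_runtime ||
//                            a.memory < b.memory;
//     return no_worse && strictly_better;
//   }
// ---- [end sketch] ----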
/*memory=*/2_n, }; MachineMappingWithMemoryResult pre = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -217,7 +217,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics post_cost = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; MachineMappingWithMemoryResult post = MachineMappingWithMemoryResult{{ @@ -360,14 +360,14 @@ TEST_SUITE(FF_TEST_SUITE) { "std::optional const&)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -375,14 +375,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -391,7 +391,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics lhs_cost = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; MachineMappingWithMemoryResult lhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -418,7 +418,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics rhs_cost = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; MachineMappingWithMemoryResult rhs = MachineMappingWithMemoryResult{{ MachineMappingForSingleLayer{ @@ -492,14 +492,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("minimize_runtime(memory)") { MachineView machine_view_0 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{1}, + stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -507,14 +507,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_1 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{2}, + stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -522,14 +522,14 @@ TEST_SUITE(FF_TEST_SUITE) { MachineView machine_view_2 = MachineView{ /*start=*/MachineSpaceCoordinate{ - /*node_idx=*/0, - /*device_idx=*/0, + /*node_idx=*/0_n, + /*device_idx=*/0_n, /*device_type=*/DeviceType::GPU, }, /*dimensions=*/ { MachineViewDimension{ - stride_t{4}, + stride_t{4_n}, MachineSpecificationDimension::INTRA_NODE, }, }, @@ -538,17 +538,17 @@ TEST_SUITE(FF_TEST_SUITE) { OpCostMetrics cost1 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{2}, + /*memory=*/2_n, }; OpCostMetrics cost2 = OpCostMetrics{ /*forward_runtime=*/4.0, /*backward_runtime=*/4.0, - /*memory=*/nonnegative_int{1}, + /*memory=*/1_n, }; OpCostMetrics cost3 = OpCostMetrics{ /*forward_runtime=*/2.0, /*backward_runtime=*/2.0, - /*memory=*/nonnegative_int{3}, + /*memory=*/3_n, }; MachineMappingForSingleLayer mm1 = MachineMappingForSingleLayer{ diff --git a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc 
b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc index 2b59669aad..d0f289043c 100644 --- a/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc +++ b/lib/compiler/test/src/compiler/series_parallel/computation_graph/get_computation_graph_series_parallel_decomposition.cc @@ -29,11 +29,12 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; b.create_input(input_shape, CreateGrad::YES, input_layer_name); return b.computation_graph; @@ -57,16 +58,17 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES, input_layer_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -119,9 +121,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -129,7 +131,7 @@ TEST_SUITE(FF_TEST_SUITE) { b.create_input(input_shape, CreateGrad::YES, input_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -138,7 +140,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*name=*/op1_name, /*projection_name=*/w1_name); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -189,9 +191,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -246,9 +248,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -277,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("real models") { SUBCASE("split_test") { ComputationGraph cg = - get_split_test_computation_graph(/*batch_size=*/8); + get_split_test_computation_graph(/*batch_size=*/8_n); std::optional sp_decomposition = get_computation_graph_series_parallel_decomposition(cg); @@ -339,14 +341,15 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraph cg = [&] { ComputationGraphBuilder b; - TensorShape input_shape = TensorShape{TensorDims{FFOrdered{ - 10, - 12, - }}, - DataType::FLOAT}; + TensorShape input_shape = + TensorShape{TensorDims{FFOrdered{ + 10_n, + 12_n, + }}, + DataType::FLOAT}; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - b.dense(input, /*outDim=*/14); + b.dense(input, /*outDim=*/14_n); return b.computation_graph; }(); @@ -356,7 +359,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("split_test") { - ComputationGraph cg = get_split_test_computation_graph(/*batch_size=*/8); + ComputationGraph cg = + get_split_test_computation_graph(/*batch_size=*/8_n); std::string result = 
render_preprocessed_computation_graph_for_sp_decomposition(cg); diff --git a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc index e278338440..d262539dc1 100644 --- a/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc +++ b/lib/compiler/test/src/compiler/task_graph_simulator/task_simulator.cc @@ -38,9 +38,9 @@ namespace FlexFlow { TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("task_simulator_estimate_forward_pass_time") { MachineSpecification machine_spec = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/3, - /*num_gpus_per_node=*/3, + MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/3_n, + /*num_gpus_per_node=*/3_n, /*inter_node_bandwidth=*/1.0f, /*intra_node_bandwidth=*/1.0f}; @@ -50,8 +50,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -63,16 +63,16 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_layer_guid_t layer1 = get_source_layer(tensor1); std::vector dims = { - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, }; ParallelComputationGraph pcg = b.pcg; MachineView mv1 = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView{MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; MachineView mv2 = - MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; + MachineView{MachineSpaceCoordinate{0_n, 1_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv1}, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); @@ -99,16 +99,16 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/1.0f, /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layer1 + /*memory=*/0_n}; // layer1 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); @@ -124,10 +124,10 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ - FFOrdered{ShardParallelDim{10, 1}}, + FFOrdered{ShardParallelDim{10_n, 1_n}}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -145,23 +145,23 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = b.pcg; std::vector dims = { - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, }; SUBCASE("all different devices") { - MachineView mv0 = - 
MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; - MachineView mv1 = - MachineView{MachineSpaceCoordinate{0, 1, DeviceType::GPU}, dims}; - MachineView mv2 = - MachineView{MachineSpaceCoordinate{1, 0, DeviceType::GPU}, dims}; - MachineView mv3 = - MachineView{MachineSpaceCoordinate{1, 1, DeviceType::GPU}, dims}; + MachineView mv0 = MachineView{ + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; + MachineView mv1 = MachineView{ + MachineSpaceCoordinate{0_n, 1_n, DeviceType::GPU}, dims}; + MachineView mv2 = MachineView{ + MachineSpaceCoordinate{1_n, 0_n, DeviceType::GPU}, dims}; + MachineView mv3 = MachineView{ + MachineSpaceCoordinate{1_n, 1_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv0}, @@ -174,7 +174,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, estimator, device_mapping, machine_spec); @@ -187,30 +187,29 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { - return OpCostMetrics{ - /*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layers 1, 2 + return OpCostMetrics{/*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/0_n}; // layers 1, 2 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/2.0f, /*backward_runtime=*/2.0f, - /*memory=*/nonnegative_int{0}}; // layer3 + /*memory=*/0_n}; // layer3 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); } } SUBCASE("all the same device") { - MachineView mv = - MachineView{MachineSpaceCoordinate{0, 0, DeviceType::GPU}, dims}; + MachineView mv = MachineView{ + MachineSpaceCoordinate{0_n, 0_n, DeviceType::GPU}, dims}; MachineMapping device_mapping = MachineMapping{{ {layer0, mv}, {layer1, mv}, @@ -222,7 +221,7 @@ TEST_SUITE(FF_TEST_SUITE) { /*forward_op_cost=*/10.0f, /*backward_op_cost=*/10.0f, /*comm_cost=*/1.0f, - /*memory_cost=*/nonnegative_int{0}); + /*memory_cost=*/0_n); float result = task_simulator_estimate_forward_pass_time( pcg, cost_estimator, device_mapping, machine_spec); @@ -235,22 +234,21 @@ TEST_SUITE(FF_TEST_SUITE) { if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/10.0f, /*backward_runtime=*/10.0f, - /*memory=*/nonnegative_int{0}}; // layer0 + /*memory=*/0_n}; // layer0 } if (op.op_attrs.has()) { - return OpCostMetrics{ - /*forward_runtime=*/1.0f, - /*backward_runtime=*/1.0f, - /*memory=*/nonnegative_int{0}}; // layers 1, 2 + return OpCostMetrics{/*forward_runtime=*/1.0f, + /*backward_runtime=*/1.0f, + /*memory=*/0_n}; // layers 1, 2 } if (op.op_attrs.has()) { return OpCostMetrics{/*forward_runtime=*/2.0f, /*backward_runtime=*/2.0f, - /*memory=*/nonnegative_int{0}}; // layer3 + /*memory=*/0_n}; // layer3 } return OpCostMetrics{/*forward_runtime=*/0.0f, /*backward_runtime=*/0.0f, - /*memory=*/nonnegative_int{0}}; + /*memory=*/0_n}; }, [](TensorSetMovement const &comm) { return 5.0f; }); float result = task_simulator_estimate_forward_pass_time( diff --git a/lib/compiler/test/src/graph_optimize_state.cc b/lib/compiler/test/src/graph_optimize_state.cc index 46177ad420..0fd9e245a6 100644 --- 
a/lib/compiler/test/src/graph_optimize_state.cc +++ b/lib/compiler/test/src/graph_optimize_state.cc @@ -11,35 +11,37 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ParallelTensorDims{ FFOrdered{ - ShardParallelDim{32, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{32_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT}; parallel_tensor_guid_t input0 = builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0 = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); + parallel_tensor_guid_t dense0 = + builder.dense(/*input=*/input0, + /*outDim=*/8_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense0"); - parallel_tensor_guid_t dense1 = builder.dense(dense0, - 4, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense1"); + parallel_tensor_guid_t dense1 = + builder.dense(/*input=*/dense0, + /*outDim=*/4_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense1"); ParallelComputationGraph pcg = builder.pcg; @@ -59,14 +61,15 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input0_ = builder.create_input_tensor(input_shape, CreateGrad::YES, "input0"); - parallel_tensor_guid_t dense0_ = builder.dense(input0, - 8, - Activation::RELU, - true, - DataType::FLOAT, - std::nullopt, - std::nullopt, - "dense0"); + parallel_tensor_guid_t dense0_ = + builder.dense(/*input=*/input0, + /*outDim=*/8_n, + /*activation=*/Activation::RELU, + /*use_bias=*/true, + /*data_type=*/DataType::FLOAT, + /*projection_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/"dense0"); ParallelComputationGraph pcg_ = builder.pcg; diff --git a/lib/kernels/include/kernels/array_shape.h b/lib/kernels/include/kernels/array_shape.h index 326c6922f9..57498ee466 100644 --- a/lib/kernels/include/kernels/array_shape.h +++ b/lib/kernels/include/kernels/array_shape.h @@ -1,8 +1,9 @@ #ifndef _FLEXFLOW_KERNELS_ARRAY_SHAPE_H #define _FLEXFLOW_KERNELS_ARRAY_SHAPE_H -#include "legion_dim.h" +#include "kernels/legion_dim.h" #include "op-attrs/tensor_shape.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include "utils/stack_vector/stack_vector.h" #include "utils/visitable.h" #include @@ -14,44 +15,49 @@ namespace FlexFlow { struct ArrayShape { public: ArrayShape() = delete; - ArrayShape(size_t *dims, size_t num_dims); + ArrayShape(nonnegative_int *dims, nonnegative_int num_dims); ArrayShape(TensorShape const &shape); - ArrayShape(std::vector const &); + ArrayShape(std::vector const &); /** * @brief Alias of ArrayShape::num_elements for compatibility with * Legion::Domain */ - std::size_t get_volume() const; + nonnegative_int get_volume() const; /** * @brief Alias of ArrayShape::num_dims for compatibility with Legion::Domain */ - std::size_t get_dim() const; + nonnegative_int get_dim() const; - std::size_t num_elements() const; - std::size_t num_dims() const; + nonnegative_int num_elements() const; + nonnegative_int num_dims() const; - std::size_t operator[](legion_dim_t) const; - std::size_t at(legion_dim_t) const; - std::size_t at(ff_dim_t) 
const; + nonnegative_int operator[](legion_dim_t) const; + nonnegative_int at(legion_dim_t) const; + nonnegative_int at(ff_dim_t) const; + + bool operator==(ArrayShape const &) const; + bool operator!=(ArrayShape const &) const; legion_dim_t last_idx() const; legion_dim_t neg_idx(int) const; - std::optional<std::size_t> at_maybe(legion_dim_t) const; - std::optional<std::size_t> at_maybe(ff_dim_t) const; + std::optional<nonnegative_int> at_maybe(legion_dim_t) const; + std::optional<nonnegative_int> at_maybe(ff_dim_t) const; ArrayShape sub_shape(std::optional<std::variant<ff_dim_t, legion_dim_t>> start, std::optional<std::variant<ff_dim_t, legion_dim_t>> end) const; public: - LegionTensorDims dims; + LegionOrdered<nonnegative_int> dims; + +private: + std::tuple<LegionOrdered<nonnegative_int> const &> tie() const; }; -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(ArrayShape, dims); -size_t get_volume(ArrayShape const &); +nonnegative_int get_volume(ArrayShape const &); TensorShape get_tensor_shape(ArrayShape const &, DataType); diff --git a/lib/kernels/include/kernels/batch_norm_kernels.h b/lib/kernels/include/kernels/batch_norm_kernels.h index 7d533d672c..f2ca17f429 100644 --- a/lib/kernels/include/kernels/batch_norm_kernels.h +++ b/lib/kernels/include/kernels/batch_norm_kernels.h @@ -3,46 +3,11 @@ #include "device.h" #include "kernels/allocation.h" +#include "kernels/batch_norm_per_device_state.dtg.h" #include "kernels/ff_handle.h" #include namespace FlexFlow { - -struct BatchNormPerDeviceState { - PerDeviceFFHandle handle; - ffTensorDescriptor_t inputTensor; - ffTensorDescriptor_t outputTensor; - ffTensorDescriptor_t biasTensor; - ffActivationDescriptor_t actiDesc; - ffBatchNormMode_t mode; - float *runningMean; - float *runningVar; - float *saveMean; - float *saveVar; - int output_n; - int output_c; - int output_h; - int output_w; - req<bool> relu; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(BatchNormPerDeviceState, - handle, - inputTensor, - outputTensor, - biasTensor, - actiDesc, - mode, - runningMean, - runningVar, - saveMean, - saveVar, - output_n, - output_c, - output_h, - output_w, - relu); - namespace Kernels { namespace BatchNorm { @@ -56,14 +21,14 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, bool relu); void forward_kernel(ffStream_t stream, - BatchNormPerDeviceState const &m, + BatchNormPerDeviceState const &per_device_state, float const *input_ptr, float *output_ptr, float const *scale_ptr, float const *bias_ptr); void backward_kernel(ffStream_t stream, - BatchNormPerDeviceState const &m, + BatchNormPerDeviceState const &per_device_state, float const *input_ptr, float *output_grad_ptr, float const *output_ptr, diff --git a/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml b/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml new file mode 100644 index 0000000000..6d2f04f60c --- /dev/null +++ b/lib/kernels/include/kernels/batch_norm_per_device_state.struct.toml @@ -0,0 +1,68 @@ +namespace = "FlexFlow" +name = "BatchNormPerDeviceState" +features = [] + +includes = [ + "kernels/device.h", + "kernels/ff_handle.h", +] + +[[fields]] +name = "handle" +type = "::FlexFlow::PerDeviceFFHandle" + +[[fields]] +name = "inputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "outputTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "biasTensor" +type = "ffTensorDescriptor_t" + +[[fields]] +name = "actiDesc" +type = "ffActivationDescriptor_t" + +[[fields]] +name = "mode" +type = "ffBatchNormMode_t" + +[[fields]] +name = "runningMean" +type = "float *" + +[[fields]] +name = "runningVar" +type = "float *" + +[[fields]] +name = "saveMean" +type = "float *" + +[[fields]] +name = "saveVar" +type = "float *" +
+[[fields]] +name = "output_n" +type = "int" + +[[fields]] +name = "output_c" +type = "int" + +[[fields]] +name = "output_h" +type = "int" + +[[fields]] +name = "output_w" +type = "int" + +[[fields]] +name = "relu" +type = "bool" diff --git a/lib/kernels/include/kernels/legion_dim.h b/lib/kernels/include/kernels/legion_dim.h index e4dd9723b8..7b9b9c455c 100644 --- a/lib/kernels/include/kernels/legion_dim.h +++ b/lib/kernels/include/kernels/legion_dim.h @@ -8,19 +8,23 @@ namespace FlexFlow { legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value); -legion_dim_t legion_dim_from_ff_dim(ff_dim_t, int num_dimensions); +legion_dim_t legion_dim_from_ff_dim(ff_dim_t, nonnegative_int num_dimensions); template using LegionOrdered = DimOrdered; -using LegionTensorDims = LegionOrdered; - template FFOrdered ff_ordered_from_legion_ordered(LegionOrdered const &legion_ordered) { return FFOrdered(legion_ordered.rbegin(), legion_ordered.rend()); } +template +LegionOrdered + legion_ordered_from_ff_ordered(FFOrdered const &ff_ordered) { + return LegionOrdered(ff_ordered.rbegin(), ff_ordered.rend()); +} + template std::string format_as(LegionOrdered const &v) { std::vector as_vec(v.cbegin(), v.cend()); diff --git a/lib/kernels/include/kernels/legion_dim_t.struct.toml b/lib/kernels/include/kernels/legion_dim_t.struct.toml index d2afb0d73f..6c047f096b 100644 --- a/lib/kernels/include/kernels/legion_dim_t.struct.toml +++ b/lib/kernels/include/kernels/legion_dim_t.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "legion_dim_t" - features = [ "eq", "ord", @@ -9,6 +8,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "value" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/kernels/include/kernels/transpose_kernels.h b/lib/kernels/include/kernels/transpose_kernels.h index 56da81ba2b..0f1cc2ae61 100644 --- a/lib/kernels/include/kernels/transpose_kernels.h +++ b/lib/kernels/include/kernels/transpose_kernels.h @@ -3,32 +3,21 @@ #include "device.h" #include "kernels/accessor.h" +#include "op-attrs/ops/transpose_attrs.dtg.h" #include namespace FlexFlow { -struct TransposePerDeviceState { - int num_dim; - req> perm; -}; - -FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(TransposePerDeviceState, - num_dim, - perm); - namespace Kernels { namespace Transpose { -TransposePerDeviceState init_kernel(int num_dim, - std::vector const &perm); - void forward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &attrs, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output); void backward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &attrs, GenericTensorAccessorW const &in_grad, GenericTensorAccessorR const &out_grad); diff --git a/lib/kernels/src/allocation.cc b/lib/kernels/src/allocation.cc index ccd88580db..d666592e77 100644 --- a/lib/kernels/src/allocation.cc +++ b/lib/kernels/src/allocation.cc @@ -13,7 +13,8 @@ void Allocator::deallocate(void *ptr) { GenericTensorAccessorW Allocator::allocate_tensor(TensorShape const &tensor_shape) { - void *ptr = this->allocate(get_size_in_bytes(tensor_shape)); + void *ptr = + this->allocate(get_size_in_bytes(tensor_shape).unwrap_nonnegative()); return {tensor_shape.data_type, tensor_shape, ptr}; } diff --git a/lib/kernels/src/array_shape.cc b/lib/kernels/src/array_shape.cc index d5e2f1167d..243185ada4 100644 --- a/lib/kernels/src/array_shape.cc +++ b/lib/kernels/src/array_shape.cc @@ -1,62 +1,71 @@ 
#include "kernels/array_shape.h" #include "utils/containers/product.h" +#include "utils/containers/reversed.h" +#include "utils/containers/vector_of.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -static LegionTensorDims - legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { - std::vector sizes(ff_ordered.size()); - std::reverse_copy(ff_ordered.begin(), ff_ordered.end(), sizes.begin()); - return LegionTensorDims(sizes.begin(), sizes.end()); +static LegionOrdered + legion_dims_from_ff_dims(FFOrdered const &ff_ordered) { + return LegionOrdered{reversed(vector_of(ff_ordered))}; } -ArrayShape::ArrayShape(size_t *_dims, size_t num_dims) - : dims(_dims, _dims + num_dims) {} +ArrayShape::ArrayShape(nonnegative_int *_dims, nonnegative_int num_dims) + : dims(_dims, _dims + num_dims.unwrap_nonnegative()) {} ArrayShape::ArrayShape(TensorShape const &shape) : dims(legion_dims_from_ff_dims(shape.dims.ff_ordered)) {} -ArrayShape::ArrayShape(std::vector const &input_dims) +ArrayShape::ArrayShape(std::vector const &input_dims) : dims(input_dims) {} -std::size_t ArrayShape::get_volume() const { +nonnegative_int ArrayShape::get_volume() const { return this->num_elements(); } -std::size_t ArrayShape::num_dims() const { - return this->dims.size(); +nonnegative_int ArrayShape::num_dims() const { + return ::FlexFlow::num_elements(this->dims); } -std::size_t ArrayShape::get_dim() const { +nonnegative_int ArrayShape::get_dim() const { return this->num_dims(); } -std::size_t ArrayShape::num_elements() const { +nonnegative_int ArrayShape::num_elements() const { if (dims.size() == 0) { - return 0; + return 0_n; } return product(this->dims); } -std::size_t ArrayShape::operator[](legion_dim_t idx) const { +nonnegative_int ArrayShape::operator[](legion_dim_t idx) const { return dims.at(idx); } -std::size_t ArrayShape::at(legion_dim_t idx) const { +nonnegative_int ArrayShape::at(legion_dim_t idx) const { return dims.at(idx); } -std::size_t ArrayShape::at(ff_dim_t idx) const { +nonnegative_int ArrayShape::at(ff_dim_t idx) const { return dims.at(legion_dim_from_ff_dim(idx, this->num_dims())); } +bool ArrayShape::operator==(ArrayShape const &other) const { + return this->tie() == other.tie(); +} + +bool ArrayShape::operator!=(ArrayShape const &other) const { + return this->tie() != other.tie(); +} + ArrayShape ArrayShape::sub_shape( std::optional> start, std::optional> end) const { NOT_IMPLEMENTED(); } -std::optional ArrayShape::at_maybe(legion_dim_t index) const { +std::optional ArrayShape::at_maybe(legion_dim_t index) const { if (index.value < dims.size()) { return dims.at(index); } else { @@ -64,11 +73,15 @@ std::optional ArrayShape::at_maybe(legion_dim_t index) const { } } -std::optional ArrayShape::at_maybe(ff_dim_t index) const { +std::optional ArrayShape::at_maybe(ff_dim_t index) const { return this->at_maybe(legion_dim_from_ff_dim(index, this->num_dims())); } -size_t get_volume(ArrayShape const &shape) { +std::tuple const &> ArrayShape::tie() const { + return std::tie(this->dims); +} + +nonnegative_int get_volume(ArrayShape const &shape) { return shape.get_volume(); } diff --git a/lib/kernels/src/cuda/cuda_helper.cu b/lib/kernels/src/cuda/cuda_helper.cu index 2ff02038f4..66388c0ec8 100644 --- a/lib/kernels/src/cuda/cuda_helper.cu +++ b/lib/kernels/src/cuda/cuda_helper.cu @@ -224,10 +224,10 @@ ffStatus_t tensor, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, - shape.at_maybe(legion_dim_t{0}).value_or(1), - shape.at_maybe(legion_dim_t{1}).value_or(1), - shape.at_maybe(legion_dim_t{2}).value_or(1), 
- shape.at_maybe(legion_dim_t{3}).value_or(1)); + shape.at_maybe(legion_dim_t{0_n}).value_or(1_n).unwrap_nonnegative(), + shape.at_maybe(legion_dim_t{1_n}).value_or(1_n).unwrap_nonnegative(), + shape.at_maybe(legion_dim_t{2_n}).value_or(1_n).unwrap_nonnegative(), + shape.at_maybe(legion_dim_t{3_n}).value_or(1_n).unwrap_nonnegative()); } cudnnDataType_t ff_to_cudnn_datatype(DataType type) { diff --git a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu index 6c6e17a181..4e153a028e 100644 --- a/lib/kernels/src/cuda/ops/batch_norm_kernels.cu +++ b/lib/kernels/src/cuda/ops/batch_norm_kernels.cu @@ -145,21 +145,23 @@ BatchNormPerDeviceState init_kernel(PerDeviceFFHandle handle, actiDesc, CUDNN_ACTIVATION_RELU, CUDNN_PROPAGATE_NAN, 0.0)); } - BatchNormPerDeviceState per_device_state = {handle, - inputTensor, - outputTensor, - biasTensor, - actiDesc, - mode, - runningMean, - runningVar, - saveMean, - saveVar, - output_n, - output_c, - output_h, - output_w, - relu}; + BatchNormPerDeviceState per_device_state = BatchNormPerDeviceState{ + handle, + inputTensor, + outputTensor, + biasTensor, + actiDesc, + mode, + runningMean, + runningVar, + saveMean, + saveVar, + output_n, + output_c, + output_h, + output_w, + relu, + }; checkCUDA(cudaStreamDestroy(stream)); return per_device_state; diff --git a/lib/kernels/src/cuda/ops/cast_kernels.cu b/lib/kernels/src/cuda/ops/cast_kernels.cu index b895ffb68f..fe7aec68b9 100644 --- a/lib/kernels/src/cuda/ops/cast_kernels.cu +++ b/lib/kernels/src/cuda/ops/cast_kernels.cu @@ -41,7 +41,7 @@ struct ForwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + size_t volume = input.shape.get_volume().unwrap_nonnegative(); cast_forward<<>>( input.get(), output.get(), volume); } @@ -52,7 +52,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { - size_t volume = input.shape.get_volume(); + size_t volume = input.shape.get_volume().unwrap_nonnegative(); cast_backward<<>>( input.get(), output.get(), volume, cast_to(1.0f)); } diff --git a/lib/kernels/src/cuda/ops/combine_kernels.cu b/lib/kernels/src/cuda/ops/combine_kernels.cu index 98c01d1f7c..7cc67ceed8 100644 --- a/lib/kernels/src/cuda/ops/combine_kernels.cu +++ b/lib/kernels/src/cuda/ops/combine_kernels.cu @@ -29,7 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get
(), input.get
(), - input.shape.get_volume() * size_of_datatype(DT), + input.shape.get_volume().unwrap_nonnegative() * + size_of_datatype(DT).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -40,7 +41,7 @@ struct BackwardKernel { void operator()(ffStream_t stream, GenericTensorAccessorR const &output_grad, GenericTensorAccessorW const &input_grad) { - size_t num_elements = output_grad.shape.get_volume(); + size_t num_elements = output_grad.shape.get_volume().unwrap_nonnegative(); add_kernel> <<>>( input_grad.get
(), output_grad.get
(), num_elements); diff --git a/lib/kernels/src/cuda/ops/concat_kernels.cu b/lib/kernels/src/cuda/ops/concat_kernels.cu index 68004738d2..2715ff16e9 100644 --- a/lib/kernels/src/cuda/ops/concat_kernels.cu +++ b/lib/kernels/src/cuda/ops/concat_kernels.cu @@ -25,8 +25,11 @@ void calc_blk_size(size_t &num_blocks, size_t &blk_size, ArrayShape const &shape, ff_dim_t axis) { - blk_size = shape.sub_shape(legion_dim_t{0}, axis).num_elements(); - num_blocks = shape.sub_shape(axis, std::nullopt).num_elements(); + blk_size = shape.sub_shape(legion_dim_t{0_n}, axis) + .num_elements() + .unwrap_nonnegative(); + num_blocks = + shape.sub_shape(axis, std::nullopt).num_elements().unwrap_nonnegative(); } void forward_kernel(cudaStream_t stream, diff --git a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu index e3a4c97a31..dac55539d2 100644 --- a/lib/kernels/src/cuda/ops/conv_2d_kernels.cu +++ b/lib/kernels/src/cuda/ops/conv_2d_kernels.cu @@ -137,15 +137,15 @@ Conv2DPerDeviceState init_kernel(PerDeviceFFHandle handle, ffConvolutionBwdFilterAlgo_t bwdFilterAlgo; ffConvolutionBwdDataAlgo_t bwdDataAlgo; - int input_w = input.shape[legion_dim_t(0)]; - int input_h = input.shape[legion_dim_t(1)]; - int input_c = input.shape[legion_dim_t(2)]; - int input_n = input.shape[legion_dim_t(3)]; + int input_w = input.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int input_h = input.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int input_c = input.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); + int input_n = input.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); - int output_w = output.shape[legion_dim_t(0)]; - int output_h = output.shape[legion_dim_t(1)]; - int output_c = output.shape[legion_dim_t(2)]; - int output_n = output.shape[legion_dim_t(3)]; + int output_w = output.shape.at(legion_dim_t(0_n)).unwrap_nonnegative(); + int output_h = output.shape.at(legion_dim_t(1_n)).unwrap_nonnegative(); + int output_c = output.shape.at(legion_dim_t(2_n)).unwrap_nonnegative(); + int output_n = output.shape.at(legion_dim_t(3_n)).unwrap_nonnegative(); checkCUDNN(cudnnCreateTensorDescriptor(&inputTensor)); checkCUDNN(cudnnCreateTensorDescriptor(&biasTensor)); diff --git a/lib/kernels/src/cuda/ops/element_unary_kernels.cu b/lib/kernels/src/cuda/ops/element_unary_kernels.cu index a35d28fa8c..056c80ecf6 100644 --- a/lib/kernels/src/cuda/ops/element_unary_kernels.cu +++ b/lib/kernels/src/cuda/ops/element_unary_kernels.cu @@ -266,7 +266,7 @@ struct ForwardKernel { output.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_scalar_unary_forward_kernel> <<>>( num_elements, @@ -275,7 +275,7 @@ struct ForwardKernel { input.get(), output.get()); } else { - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_unary_forward_kernel> <<>>( num_elements, op_type, input.get(), output.get()); @@ -312,7 +312,7 @@ struct BackwardKernel { input_grad.get())); } else if (use_scalar(op_type)) { assert(scalar.has_value()); - size_t num_elements = input.shape.num_elements(); + size_t num_elements = input.shape.num_elements().unwrap_nonnegative(); elewise_scalar_unary_backward_kernel> <<>>( num_elements, @@ -323,7 +323,7 @@ struct BackwardKernel { input.get(), input_grad.get()); } else { - size_t num_elements = input.shape.num_elements(); + size_t num_elements = 
input.shape.num_elements().unwrap_nonnegative(); elewise_unary_backward_kernel> <<>>( num_elements, diff --git a/lib/kernels/src/cuda/ops/flat_kernels.cu b/lib/kernels/src/cuda/ops/flat_kernels.cu index 941db108a0..973d05f596 100644 --- a/lib/kernels/src/cuda/ops/flat_kernels.cu +++ b/lib/kernels/src/cuda/ops/flat_kernels.cu @@ -27,7 +27,8 @@ void forward_kernel(cudaStream_t stream, checkCUDA(cudaMemcpyAsync(output_ptr, input.get_float_ptr(), - (input.shape.num_elements()) * sizeof(float), + input.shape.num_elements().unwrap_nonnegative() * + sizeof(float), cudaMemcpyDeviceToDevice, stream)); } @@ -39,8 +40,13 @@ void backward_kernel(cudaStream_t stream, float alpha = 1.0f; apply_add_with_scale - <<>>( - input_grad_ptr, output_grad_ptr, input.shape.num_elements(), alpha); + <<>>(input_grad_ptr, + output_grad_ptr, + input.shape.num_elements().unwrap_nonnegative(), + alpha); } } // namespace Flat diff --git a/lib/kernels/src/cuda/ops/gather_kernels.cu b/lib/kernels/src/cuda/ops/gather_kernels.cu index 11c0a1a5e7..31c1bac217 100644 --- a/lib/kernels/src/cuda/ops/gather_kernels.cu +++ b/lib/kernels/src/cuda/ops/gather_kernels.cu @@ -128,22 +128,24 @@ void forward_kernel(ffStream_t stream, coord_t stride = output.shape.sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .num_elements(); - coord_t output_dim_size = output.shape[m.legion_dim]; - coord_t input_dim_size = input.shape[m.legion_dim]; + .num_elements() + .unwrap_nonnegative(); + coord_t output_dim_size = output.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t input_dim_size = input.shape.at(m.legion_dim).unwrap_nonnegative(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); - DataTypeDispatch1{}(index.data_type, - stream, - input, - index, - output, - output.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + DataTypeDispatch1{}( + index.data_type, + stream, + input, + index, + output, + output.shape.get_volume().unwrap_nonnegative(), + stride, + input_dim_size, + output_dim_size); } void backward_kernel(ffStream_t stream, @@ -156,22 +158,26 @@ void backward_kernel(ffStream_t stream, coord_t stride = output_grad.shape .sub_shape(std::nullopt, add_to_legion_dim(m.legion_dim, 1)) - .get_volume(); - coord_t output_dim_size = output_grad.shape[m.legion_dim]; - coord_t input_dim_size = input_grad.shape[m.legion_dim]; + .get_volume() + .unwrap_nonnegative(); + coord_t output_dim_size = + output_grad.shape.at(m.legion_dim).unwrap_nonnegative(); + coord_t input_dim_size = + input_grad.shape.at(m.legion_dim).unwrap_nonnegative(); assert(index.data_type == DataType::INT32 || index.data_type == DataType::INT64); - DataTypeDispatch1{}(index.data_type, - stream, - output_grad, - index, - input_grad, - output_grad.shape.get_volume(), - stride, - input_dim_size, - output_dim_size); + DataTypeDispatch1{}( + index.data_type, + stream, + output_grad, + index, + input_grad, + output_grad.shape.get_volume().unwrap_nonnegative(), + stride, + input_dim_size, + output_dim_size); } } // namespace Gather diff --git a/lib/kernels/src/cuda/ops/partition_kernels.cu b/lib/kernels/src/cuda/ops/partition_kernels.cu index 1d07efb5fa..2831562f58 100644 --- a/lib/kernels/src/cuda/ops/partition_kernels.cu +++ b/lib/kernels/src/cuda/ops/partition_kernels.cu @@ -29,7 +29,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + 
size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -41,12 +42,13 @@ struct BackwardKernel { RepartitionPerDeviceState const &m, GenericTensorAccessorW const &input_grad, GenericTensorAccessorR const &output_grad) { - add_kernel><<>>(input_grad.get(), - output_grad.get(), - input_grad.shape.num_elements()); + add_kernel> + <<>>(input_grad.get(), + output_grad.get(), + input_grad.shape.num_elements().unwrap_nonnegative()); } }; diff --git a/lib/kernels/src/cuda/ops/reduction_kernels.cu b/lib/kernels/src/cuda/ops/reduction_kernels.cu index 0c6ba7d8e3..5d95a3766a 100644 --- a/lib/kernels/src/cuda/ops/reduction_kernels.cu +++ b/lib/kernels/src/cuda/ops/reduction_kernels.cu @@ -41,12 +41,13 @@ struct ForwardKernel { GenericTensorAccessorW const &output, size_t num_replicas) { - size_t total_elements = input.shape.num_elements() * num_replicas; + size_t total_elements = + input.shape.num_elements().unwrap_nonnegative() * num_replicas; reduction_forward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), num_replicas); } }; @@ -58,7 +59,8 @@ struct BackwardKernel { GenericTensorAccessorR const &output) { checkCUDA(cudaMemcpyAsync(input.get(), output.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } diff --git a/lib/kernels/src/cuda/ops/replicate_kernels.cu b/lib/kernels/src/cuda/ops/replicate_kernels.cu index 76bfbe2658..4706f38fd4 100644 --- a/lib/kernels/src/cuda/ops/replicate_kernels.cu +++ b/lib/kernels/src/cuda/ops/replicate_kernels.cu @@ -41,7 +41,8 @@ struct ForwardKernel { checkCUDA(cudaMemcpyAsync((void *)output.get(), (void *)input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -53,12 +54,13 @@ struct BackwardKernel { GenericTensorAccessorW const &input, GenericTensorAccessorR const &output, size_t num_replicas) { - size_t total_elements = input.shape.num_elements() * num_replicas; + size_t total_elements = + input.shape.num_elements().unwrap_nonnegative() * num_replicas; replicate_backward_kernel> <<>>( input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), num_replicas); } }; diff --git a/lib/kernels/src/cuda/ops/reshape_kernels.cu b/lib/kernels/src/cuda/ops/reshape_kernels.cu index 5b7843a3a5..c5a289ce6b 100644 --- a/lib/kernels/src/cuda/ops/reshape_kernels.cu +++ b/lib/kernels/src/cuda/ops/reshape_kernels.cu @@ -33,7 +33,8 @@ struct ForwardKernel { GenericTensorAccessorW const &output) { checkCUDA(cudaMemcpyAsync(output.get(), input.get(), - input.shape.num_elements() * size_of_datatype(T), + input.shape.num_elements().unwrap_nonnegative() * + size_of_datatype(T).unwrap_nonnegative(), cudaMemcpyDeviceToDevice, stream)); } @@ -46,12 +47,12 @@ struct BackwardKernel { GenericTensorAccessorR const &output) { float alpha = 1.0f; apply_add_with_scale> - <<>>(input.get(), output.get(), - input.shape.num_elements(), + input.shape.num_elements().unwrap_nonnegative(), static_cast>(alpha)); } }; diff --git a/lib/kernels/src/cuda/ops/transpose_kernels.cu b/lib/kernels/src/cuda/ops/transpose_kernels.cu index 3b3f80944d..60d2f7f342 100644 --- a/lib/kernels/src/cuda/ops/transpose_kernels.cu +++ b/lib/kernels/src/cuda/ops/transpose_kernels.cu @@ -16,7 +16,9 @@ 
#include "device.h" #include "kernels/accessor.h" #include "kernels/transpose_kernels.h" +#include "op-attrs/dim_ordered/transform.h" #include "utils/exception.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -29,19 +31,6 @@ struct TransposeStrides { namespace Kernels { namespace Transpose { -TransposePerDeviceState init_kernel(int num_dim, - std::vector const &perm) { - int const length = perm.size(); - - std::vector perm_vector; - assert(length <= MAX_TENSOR_DIM); - for (int i = 0; i < length; ++i) { - perm_vector.push_back(legion_dim_from_ff_dim(perm[i], num_dim)); - } - - return {num_dim, perm_vector}; -} - __global__ void transpose_simple_kernel(std::size_t volume, float const *in_ptr, float *out_ptr, @@ -59,64 +48,92 @@ __global__ void transpose_simple_kernel(std::size_t volume, } } +static LegionOrdered + legion_ordered_perm_from_ff_ordered(FFOrdered const &perm) { + nonnegative_int perm_size = num_elements(perm); + LegionOrdered legion_ordered_perm = + transform(legion_ordered_from_ff_ordered(perm), [&](ff_dim_t d) { + return legion_dim_from_ff_dim(d, perm_size); + }); + + return legion_ordered_perm; +} + void forward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &m, GenericTensorAccessorR const &input, GenericTensorAccessorW const &output) { TransposeStrides info; - info.num_dim = input.shape.num_dims(); - assert(info.num_dim == m.num_dim); + info.num_dim = input.shape.num_dims().unwrap_nonnegative(); + assert(info.num_dim == m.perm.size()); + + LegionOrdered legion_ordered_perm = + legion_ordered_perm_from_ff_ordered(m.perm); + for (int i = 0; i < info.num_dim; i++) { if (i == 0) { info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = input.shape[legion_dim_t(i)] + 1; - int out_dim_size = output.shape[legion_dim_t(i)] + 1; + int in_dim_size = + input.shape.at(legion_dim_t{nonnegative_int{i}}).unwrap_nonnegative(); + int out_dim_size = output.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } - info.perm[i] = m.perm[i].value; + + info.perm[i] = legion_ordered_perm.at(legion_dim_t{nonnegative_int{i}}) + .value.unwrap_nonnegative(); } - transpose_simple_kernel<<>>(output.shape.get_volume(), - input.get_float_ptr(), - output.get_float_ptr(), - info, - 0.0f /*beta*/); + transpose_simple_kernel<<< + GET_BLOCKS(output.shape.get_volume().unwrap_nonnegative()), + CUDA_NUM_THREADS, + 0, + stream>>>(output.shape.get_volume().unwrap_nonnegative(), + input.get_float_ptr(), + output.get_float_ptr(), + info, + 0.0f /*beta*/); } void backward_kernel(cudaStream_t stream, - TransposePerDeviceState const &m, + TransposeAttrs const &m, GenericTensorAccessorW const &in_grad, GenericTensorAccessorR const &out_grad) { TransposeStrides info; - info.num_dim = in_grad.shape.num_dims(); - assert(info.num_dim == m.num_dim); + info.num_dim = in_grad.shape.num_dims().unwrap_nonnegative(); + assert(info.num_dim == m.perm.size()); + + LegionOrdered legion_ordered_perm = + legion_ordered_perm_from_ff_ordered(m.perm); + for (int i = 0; i < info.num_dim; i++) { if (i == 0) { info.in_strides[i] = 1; info.out_strides[i] = 1; } else { - int in_dim_size = out_grad.shape[legion_dim_t(i)] + 1; - int out_dim_size = in_grad.shape[legion_dim_t(i)] + 1; + int in_dim_size = out_grad.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); + int out_dim_size = 
in_grad.shape.at(legion_dim_t{nonnegative_int{i}}) + .unwrap_nonnegative(); info.in_strides[i] = info.in_strides[i - 1] * in_dim_size; info.out_strides[i] = info.out_strides[i - 1] * out_dim_size; } - info.perm[m.perm[i].value] = i; + info.perm[legion_ordered_perm.at(legion_dim_t{nonnegative_int{i}}) + .value.unwrap_nonnegative()] = i; } - transpose_simple_kernel<<>>(in_grad.shape.get_volume(), - out_grad.get_float_ptr(), - in_grad.get_float_ptr(), - info, - 1.0f /*beta*/); + transpose_simple_kernel<<< + GET_BLOCKS(in_grad.shape.get_volume().unwrap_nonnegative()), + CUDA_NUM_THREADS, + 0, + stream>>>(in_grad.shape.get_volume().unwrap_nonnegative(), + out_grad.get_float_ptr(), + in_grad.get_float_ptr(), + info, + 1.0f /*beta*/); } } // namespace Transpose diff --git a/lib/kernels/src/legion_dim.cc b/lib/kernels/src/legion_dim.cc index 142dcbcb2c..bbb15c5636 100644 --- a/lib/kernels/src/legion_dim.cc +++ b/lib/kernels/src/legion_dim.cc @@ -3,11 +3,14 @@ namespace FlexFlow { legion_dim_t add_to_legion_dim(legion_dim_t legion_dim, int value) { - return legion_dim_t(legion_dim.value + value); + return legion_dim_t{ + nonnegative_int{legion_dim.value.unwrap_nonnegative() + value}}; } -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, int num_dimensions) { - return legion_dim_t(num_dimensions - ff_dim.value.get_value() - 1); +legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, + nonnegative_int num_dimensions) { + return legion_dim_t{nonnegative_int{num_dimensions.unwrap_nonnegative() - + ff_dim.value.unwrap_nonnegative() - 1}}; } } // namespace FlexFlow diff --git a/lib/kernels/test/src/test_attention_kernel.cc b/lib/kernels/test/src/test_attention_kernel.cc index d44129ece1..64264f6c39 100644 --- a/lib/kernels/test/src/test_attention_kernel.cc +++ b/lib/kernels/test/src/test_attention_kernel.cc @@ -6,32 +6,38 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test multi-head attention kernel") { - size_t num_samples = 10; - size_t num_heads = 4; - size_t qSize = 64, kSize = 64, vSize = 64; - size_t qProjSize = 64, kProjSize = 64, vProjSize = 64, oProjSize = 64; - size_t qoSeqLength = 20, kvSeqLength = 20; + nonnegative_int num_samples = 10_n; + nonnegative_int num_heads = 4_n; + nonnegative_int qSize = 64_n; + nonnegative_int kSize = 64_n; + nonnegative_int vSize = 64_n; + nonnegative_int qProjSize = 64_n; + nonnegative_int kProjSize = 64_n; + nonnegative_int vProjSize = 64_n; + nonnegative_int oProjSize = 64_n; + nonnegative_int qoSeqLength = 20_n; + nonnegative_int kvSeqLength = 20_n; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; Allocator allocator = create_local_cuda_memory_allocator(); - MHAPerDeviceState state = - Kernels::MultiHeadAttention::init_kernel(managed_handle.raw_handle(), - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - false); + MHAPerDeviceState state = Kernels::MultiHeadAttention::init_kernel( + managed_handle.raw_handle(), + allocator, + /*num_samples=*/num_samples.unwrap_nonnegative(), + /*num_heads=*/num_heads.unwrap_nonnegative(), + /*qSize=*/qSize.unwrap_nonnegative(), + /*kSize=*/kSize.unwrap_nonnegative(), + /*vSize=*/vSize.unwrap_nonnegative(), + /*qProjSize=*/qProjSize.unwrap_nonnegative(), + /*kProjSize=*/kProjSize.unwrap_nonnegative(), + /*vProjSize=*/vProjSize.unwrap_nonnegative(), + /*oProjSize=*/oProjSize.unwrap_nonnegative(), + /*qoSeqLength=*/qoSeqLength.unwrap_nonnegative(), + 
/*kvSeqLength=*/kvSeqLength.unwrap_nonnegative(), + /*add_bias_kv=*/false); TensorShape query_shape = make_float_tensor_shape_from_legion_dims( {qoSeqLength, num_samples, qSize}); @@ -41,8 +47,8 @@ TEST_SUITE(FF_TEST_SUITE) { {kvSeqLength, num_samples, vSize}); TensorShape output_shape = make_float_tensor_shape_from_legion_dims( {qoSeqLength, num_samples, oProjSize}); - TensorShape weight_shape = - make_float_tensor_shape_from_legion_dims({state.weightSize}); + TensorShape weight_shape = make_float_tensor_shape_from_legion_dims( + {nonnegative_int{state.weightSize}}); GenericTensorAccessorW query_accessor = create_random_filled_accessor_w(query_shape, allocator); diff --git a/lib/kernels/test/src/test_batch_matmul_kernel.cc b/lib/kernels/test/src/test_batch_matmul_kernel.cc index 18e6977148..cacd5b60fb 100644 --- a/lib/kernels/test/src/test_batch_matmul_kernel.cc +++ b/lib/kernels/test/src/test_batch_matmul_kernel.cc @@ -6,13 +6,13 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test BatchMatmul Kernel") { - size_t m = 10; - size_t n = 10; - size_t k = 10; - size_t batch = 5; - size_t a_seq_length_dim = -1; - size_t b_seq_length_dim = -1; - size_t seq_length = -1; + nonnegative_int m = 10_n; + nonnegative_int n = 10_n; + nonnegative_int k = 10_n; + nonnegative_int batch = 5_n; + int a_seq_length_dim = -1; + int b_seq_length_dim = -1; + int seq_length = -1; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; @@ -39,10 +39,10 @@ TEST_SUITE(FF_TEST_SUITE) { output_accessor.get_float_ptr(), a_accessor.get_float_ptr(), b_accessor.get_float_ptr(), - m, - n, - k, - batch, + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative(), a_seq_length_dim, b_seq_length_dim, seq_length); @@ -64,10 +64,10 @@ TEST_SUITE(FF_TEST_SUITE) { a_grad_accessor.get_float_ptr(), b_accessor.get_float_ptr(), b_grad_accessor.get_float_ptr(), - m, - n, - k, - batch); + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_batch_norm_kernel.cc b/lib/kernels/test/src/test_batch_norm_kernel.cc index 8487bbda6a..b4c43cf1d8 100644 --- a/lib/kernels/test/src/test_batch_norm_kernel.cc +++ b/lib/kernels/test/src/test_batch_norm_kernel.cc @@ -6,22 +6,25 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test BatchNorm Kernel") { - size_t output_n = 1, output_c = 10, output_h = 10, output_w = 10; + nonnegative_int output_n = 1_n; + nonnegative_int output_c = 10_n; + nonnegative_int output_h = 10_n; + nonnegative_int output_w = 10_n; ManagedFFStream managed_stream{}; ManagedPerDeviceFFHandle managed_handle{}; Allocator allocator = create_local_cuda_memory_allocator(); - BatchNormPerDeviceState state = - Kernels::BatchNorm::init_kernel(managed_handle.raw_handle(), - allocator, - nullptr, - output_n, - output_c, - output_h, - output_w, - true); + BatchNormPerDeviceState state = Kernels::BatchNorm::init_kernel( + /*handle=*/managed_handle.raw_handle(), + /*allocator=*/allocator, + /*runningMean=*/nullptr, + /*output_n=*/output_n.unwrap_nonnegative(), + /*output_c=*/output_c.unwrap_nonnegative(), + /*output_h=*/output_h.unwrap_nonnegative(), + /*output_w=*/output_w.unwrap_nonnegative(), + /*relu=*/true); TensorShape input_shape = make_float_tensor_shape_from_legion_dims( {output_n, output_c, output_h, output_w}); @@ -43,12 +46,13 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW bias_accessor = 
create_filled_accessor_w(bias_shape, allocator, 0.0f); - Kernels::BatchNorm::forward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - scale_accessor.get_float_ptr(), - bias_accessor.get_float_ptr()); + Kernels::BatchNorm::forward_kernel( + /*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input_ptr=*/input_accessor.get_float_ptr(), + /*output_ptr=*/output_accessor.get_float_ptr(), + /*scale_ptr=*/scale_accessor.get_float_ptr(), + /*bias_ptr=*/bias_accessor.get_float_ptr()); std::vector host_output_data = load_data_to_host_from_device( @@ -66,16 +70,18 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW bias_grad_accessor = create_random_filled_accessor_w(bias_shape, allocator); - Kernels::BatchNorm::backward_kernel(managed_stream.raw_stream(), - state, - input_accessor.get_float_ptr(), - output_grad_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - input_grad_accessor.get_float_ptr(), - scale_accessor.get_float_ptr(), - scale_grad_accessor.get_float_ptr(), - bias_grad_accessor.get_float_ptr(), - input_accessor.shape.num_elements()); + Kernels::BatchNorm::backward_kernel( + /*stream=*/managed_stream.raw_stream(), + /*per_device_state=*/state, + /*input_ptr=*/input_accessor.get_float_ptr(), + /*output_grad_ptr=*/output_grad_accessor.get_float_ptr(), + /*output_ptr=*/output_accessor.get_float_ptr(), + /*input_grad_ptr=*/input_grad_accessor.get_float_ptr(), + /*scale_ptr=*/scale_accessor.get_float_ptr(), + /*scale_grad_ptr=*/scale_grad_accessor.get_float_ptr(), + /*bias_grad_ptr=*/bias_grad_accessor.get_float_ptr(), + /*numElements=*/ + input_accessor.shape.num_elements().unwrap_nonnegative()); std::vector host_input_grad_data = load_data_to_host_from_device( diff --git a/lib/kernels/test/src/test_cast_kernel.cc b/lib/kernels/test/src/test_cast_kernel.cc index b110208bce..0e0769014d 100644 --- a/lib/kernels/test/src/test_cast_kernel.cc +++ b/lib/kernels/test/src/test_cast_kernel.cc @@ -11,9 +11,9 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_float_tensor_shape_from_legion_dims({100_n, 100_n}); TensorShape output_shape = - make_double_tensor_shape_from_legion_dims({100, 100}); + make_double_tensor_shape_from_legion_dims({100_n, 100_n}); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); diff --git a/lib/kernels/test/src/test_combine_kernel.cc b/lib/kernels/test/src/test_combine_kernel.cc index 2e1000cb95..2b6b9bf589 100644 --- a/lib/kernels/test/src/test_combine_kernel.cc +++ b/lib/kernels/test/src/test_combine_kernel.cc @@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({100, 100}); + make_float_tensor_shape_from_legion_dims({100_n, 100_n}); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_concat_kernel.cc b/lib/kernels/test/src/test_concat_kernel.cc index 2212e384fa..215e599716 100644 --- a/lib/kernels/test/src/test_concat_kernel.cc +++ b/lib/kernels/test/src/test_concat_kernel.cc @@ -1,13 +1,14 @@ #include "doctest/doctest.h" #include "kernels/concat_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test concat kernel forward and backward") { 
- size_t num_inputs = 3; - size_t size_per_input = 100; - ff_dim_t concat_axis = ff_dim_t{nonnegative_int{0}}; + nonnegative_int num_inputs = 3_n; + nonnegative_int size_per_input = 100_n; + ff_dim_t concat_axis = ff_dim_t{0_n}; ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -21,7 +22,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("forward_kernel") { std::vector input_accessors = - repeat(num_inputs, [&]() { + repeat(num_inputs, [&]() { return read_only_accessor_from_write_accessor( create_random_filled_accessor_w(input_shape, allocator)); }); @@ -44,10 +45,8 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorR output_grad_accessor = read_only_accessor_from_write_accessor( create_random_filled_accessor_w(output_shape, allocator)); - std::vector input_grad_accessors = - repeat(num_inputs, [&]() { - return allocator.allocate_tensor(input_shape); - }); + std::vector input_grad_accessors = repeat( + num_inputs, [&]() { return allocator.allocate_tensor(input_shape); }); Kernels::Concat::backward_kernel(managed_stream.raw_stream(), output_grad_accessor, input_grad_accessors, diff --git a/lib/kernels/test/src/test_dropout.cc b/lib/kernels/test/src/test_dropout.cc index e29143e251..86f8f2102b 100644 --- a/lib/kernels/test/src/test_dropout.cc +++ b/lib/kernels/test/src/test_dropout.cc @@ -1,6 +1,7 @@ #include "doctest/doctest.h" #include "kernels/dropout_kernels.h" #include "test_utils.h" +#include "utils/containers/count.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { @@ -9,11 +10,11 @@ TEST_SUITE(FF_TEST_SUITE) { float dropout_rate = 0.1; ArrayShape shape = ArrayShape{ - std::vector{10, 10}, + std::vector{10_n, 10_n}, }; TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_float_tensor_shape_from_legion_dims({10_n, 10_n}); TensorShape output_shape = input_shape; ManagedFFStream managed_stream{}; @@ -25,8 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), dropout_rate, seed, shape, allocator); auto get_zero_count = [](std::vector const &data) { - return std::count_if( - data.begin(), data.end(), [](float x) { return x == 0.0f; }); + return count(data, [](float x) { return x == 0.0f; }); }; SUBCASE("forward_kernel") { diff --git a/lib/kernels/test/src/test_flat_kernel.cc b/lib/kernels/test/src/test_flat_kernel.cc index 70894858e3..83f7f0445e 100644 --- a/lib/kernels/test/src/test_flat_kernel.cc +++ b/lib/kernels/test/src/test_flat_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; GenericTensorAccessorR input_accessor = @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( - input_accessor.shape.num_elements(), 2.0f); + input_accessor.shape.num_elements().unwrap_nonnegative(), 2.0f); CHECK(check_output_data == expected_output_data); } @@ -50,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor(input_grad_accessor)); std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); + input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); CHECK(backward_output_data == expected_output_data); } } diff --git a/lib/kernels/test/src/test_gather_kernels.cc b/lib/kernels/test/src/test_gather_kernels.cc index 
88ac2f6889..1a8cf5f82a 100644 --- a/lib/kernels/test/src/test_gather_kernels.cc +++ b/lib/kernels/test/src/test_gather_kernels.cc @@ -10,10 +10,11 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - GatherPerDeviceState state = {managed_handle.raw_handle(), legion_dim_t(2)}; + GatherPerDeviceState state = {managed_handle.raw_handle(), + legion_dim_t{2_n}}; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); GenericTensorAccessorR index_accessor = read_only_accessor_from_write_accessor( diff --git a/lib/kernels/test/src/test_layer_norm_kernels.cc b/lib/kernels/test/src/test_layer_norm_kernels.cc index 03b2f56bb9..5386c1d943 100644 --- a/lib/kernels/test/src/test_layer_norm_kernels.cc +++ b/lib/kernels/test/src/test_layer_norm_kernels.cc @@ -6,8 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test LayerNorm Forward and Backward Kernel") { - size_t batch_size = 10; - size_t feature_size = 10; + nonnegative_int batch_size = 10_n; + nonnegative_int feature_size = 10_n; float epsilon = 1e-5f; bool elementwise_affine = true; @@ -26,8 +26,8 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::LayerNorm::init_kernel(managed_handle.raw_handle(), allocator, elementwise_affine, - batch_size, - feature_size, + batch_size.unwrap_nonnegative(), + feature_size.unwrap_nonnegative(), epsilon); GenericTensorAccessorR input_accessor = diff --git a/lib/kernels/test/src/test_partition_kernel.cc b/lib/kernels/test/src/test_partition_kernel.cc index 437b37e954..4fd1b53210 100644 --- a/lib/kernels/test/src/test_partition_kernel.cc +++ b/lib/kernels/test/src/test_partition_kernel.cc @@ -15,7 +15,7 @@ TEST_SUITE(FF_TEST_SUITE) { managed_handle.raw_handle(), DataType::FLOAT); TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_float_tensor_shape_from_legion_dims({10_n, 10_n}); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -33,7 +33,7 @@ TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); + input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); CHECK(check_output_data == expected_output_data); } @@ -54,7 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor(input_grad_accessor)); std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); + input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); CHECK(host_grad_input_data == expected_grad_input_data); } } diff --git a/lib/kernels/test/src/test_pool_2d_kernels.cc b/lib/kernels/test/src/test_pool_2d_kernels.cc index ebb92d39db..62b61707c6 100644 --- a/lib/kernels/test/src/test_pool_2d_kernels.cc +++ b/lib/kernels/test/src/test_pool_2d_kernels.cc @@ -5,10 +5,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Pool2D Forward and Backward Kernel") { - size_t input_w = 10, input_h = 10, input_c = 3, input_n = 1; - size_t output_w = 5, output_h = 5, output_c = 3, output_n = 1; - size_t pad_h = 0, pad_w = 0, kernel_h = 2, kernel_w = 2, stride_h = 2, - stride_w = 2; + nonnegative_int input_w = 10_n; + nonnegative_int input_h = 10_n; + nonnegative_int input_c = 3_n; + nonnegative_int input_n = 1_n; 
+ nonnegative_int output_w = 5_n; + nonnegative_int output_h = 5_n; + nonnegative_int output_c = 3_n; + nonnegative_int output_n = 1_n; + nonnegative_int pad_h = 0_n; + nonnegative_int pad_w = 0_n; + nonnegative_int kernel_h = 2_n; + nonnegative_int kernel_w = 2_n; + nonnegative_int stride_h = 2_n; + nonnegative_int stride_w = 2_n; PoolOp pool_type = PoolOp::MAX; @@ -18,23 +28,23 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); Pool2DPerDeviceState state = - Kernels::Pool2D::init_kernel(managed_handle.raw_handle(), - std::nullopt, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - kernel_h, - kernel_w, - stride_h, - stride_w, - pool_type); + Kernels::Pool2D::init_kernel(/*handle=*/managed_handle.raw_handle(), + /*activation=*/std::nullopt, + /*input_w=*/input_w.unwrap_nonnegative(), + /*input_h=*/input_h.unwrap_nonnegative(), + /*input_c=*/input_c.unwrap_nonnegative(), + /*input_n=*/input_n.unwrap_nonnegative(), + /*output_w=*/output_w.unwrap_nonnegative(), + /*output_h=*/output_h.unwrap_nonnegative(), + /*output_c=*/output_c.unwrap_nonnegative(), + /*output_n=*/output_n.unwrap_nonnegative(), + /*pad_h=*/pad_h.unwrap_nonnegative(), + /*pad_w=*/pad_w.unwrap_nonnegative(), + /*kernel_h=*/kernel_h.unwrap_nonnegative(), + /*kernel_w=*/kernel_w.unwrap_nonnegative(), + /*stride_h=*/stride_h.unwrap_nonnegative(), + /*stride_w=*/stride_w.unwrap_nonnegative(), + /*pool_type=*/pool_type); TensorShape input_shape = make_float_tensor_shape_from_legion_dims( {input_w, input_h, input_c, input_n}); diff --git a/lib/kernels/test/src/test_reduction_kernel.cc b/lib/kernels/test/src/test_reduction_kernel.cc index 1ea740f336..04a3817b84 100644 --- a/lib/kernels/test/src/test_reduction_kernel.cc +++ b/lib/kernels/test/src/test_reduction_kernel.cc @@ -7,8 +7,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Reduction Forward and Backward Kernel") { std::size_t num_replicas = 5; - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10, 10, 10, 10}); + TensorShape input_shape = make_float_tensor_shape_from_legion_dims( + {10_n, 10_n, 10_n, 10_n, 10_n}); ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; @@ -16,7 +16,8 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); SUBCASE("forward_kernel") { - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({10}); + TensorShape output_shape = + make_float_tensor_shape_from_legion_dims({10_n}); GenericTensorAccessorR input_accessor = read_only_accessor_from_write_accessor( @@ -49,7 +50,7 @@ TEST_SUITE(FF_TEST_SUITE) { output_grad_accessor); std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 1.0f); + input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); std::vector host_grad_data = load_data_to_host_from_device( read_only_accessor_from_write_accessor(input_grad_accessor)); CHECK(host_grad_data == expected_grad_input_data); diff --git a/lib/kernels/test/src/test_replicate_kernel.cc b/lib/kernels/test/src/test_replicate_kernel.cc index 86d790f03c..fa726898f2 100644 --- a/lib/kernels/test/src/test_replicate_kernel.cc +++ b/lib/kernels/test/src/test_replicate_kernel.cc @@ -5,9 +5,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Replicate Kernel") { - std::size_t num_replicas = 10; + nonnegative_int num_replicas = 10_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + 
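Stepping back to the Pool2D test above: its constants are mutually consistent under the usual pooling arithmetic, and the inverse relation is what the new calculate_padding helper in lib/local-execution/src/ops/pool_2d.cc (further down in this patch) computes. A standalone check, with plain ints standing in for nonnegative_int:

    #include <cassert>

    // Standard pooling relation: output = (input + 2*pad - kernel) / stride + 1.
    int pooled_extent(int input, int pad, int kernel, int stride) {
      return (input + 2 * pad - kernel) / stride + 1;
    }

    // Inverse, as in calculate_padding:
    // pad = ((output - 1) * stride + kernel - input + 1) / 2.
    int same_padding(int output, int stride, int kernel, int input) {
      return ((output - 1) * stride + kernel - input + 1) / 2;
    }

    int main() {
      // The test's values: input 10, kernel 2, stride 2, padding 0 -> output 5.
      assert(pooled_extent(10, 0, 2, 2) == 5);
      // ((5-1)*2 + 2 - 10 + 1) / 2 = 1/2 = 0, so no padding is needed.
      assert(same_padding(5, 2, 2, 10) == 0);
    }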
TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -30,7 +30,7 @@ TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); + input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); CHECK(check_output_data == expected_output_data); } @@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) { Kernels::Replicate::backward_kernel(managed_stream.raw_stream(), input_grad_accessor, output_grad_accessor, - num_replicas); + num_replicas.unwrap_nonnegative()); std::vector check_aggregated_data = load_data_to_host_from_device( diff --git a/lib/kernels/test/src/test_reshape_kernel.cc b/lib/kernels/test/src/test_reshape_kernel.cc index f56bfacc2b..d329a347b3 100644 --- a/lib/kernels/test/src/test_reshape_kernel.cc +++ b/lib/kernels/test/src/test_reshape_kernel.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; ReshapePerDeviceState state = @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor(output_accessor)); std::vector expected_output_data( - input_accessor.shape.num_elements(), 1.0f); + input_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); CHECK(check_output_data == expected_output_data); } @@ -52,7 +52,7 @@ TEST_SUITE(FF_TEST_SUITE) { read_only_accessor_from_write_accessor(input_grad_accessor)); std::vector expected_grad_input_data( - input_grad_accessor.shape.num_elements(), 3.0f); + input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 3.0f); CHECK(host_grad_input_data == expected_grad_input_data); } } diff --git a/lib/kernels/test/src/test_reverse_kernels.cc b/lib/kernels/test/src/test_reverse_kernels.cc index cdaf65a305..9c8475f6d6 100644 --- a/lib/kernels/test/src/test_reverse_kernels.cc +++ b/lib/kernels/test/src/test_reverse_kernels.cc @@ -5,11 +5,11 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Call Reverse Forward and Backward Kernels") { - std::size_t reverse_dim_size = 10; - std::size_t in_blk_size = 10; - std::size_t num_out_blks = 1; + nonnegative_int reverse_dim_size = 10_n; + nonnegative_int in_blk_size = 10_n; + nonnegative_int num_out_blks = 1_n; - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; ManagedPerDeviceFFHandle managed_handle{}; @@ -24,13 +24,14 @@ TEST_SUITE(FF_TEST_SUITE) { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); - Kernels::Reverse::forward_kernel(managed_stream.raw_stream(), - input_accessor.get_float_ptr(), - output_accessor.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_accessor.shape.num_elements()); + Kernels::Reverse::forward_kernel( + managed_stream.raw_stream(), + input_accessor.get_float_ptr(), + output_accessor.get_float_ptr(), + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_accessor.shape.num_elements().unwrap_nonnegative()); std::vector check_output_data = load_data_to_host_from_device( @@ -48,10 +49,10 @@ TEST_SUITE(FF_TEST_SUITE) { 
managed_stream.raw_stream(), output_grad_accessor.get_float_ptr(), input_grad_accessor.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - input_grad_accessor.shape.num_elements()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_grad_accessor.shape.num_elements().unwrap_nonnegative()); std::vector host_grad_input_data = load_data_to_host_from_device( diff --git a/lib/kernels/test/src/test_softmax_kernel.cc b/lib/kernels/test/src/test_softmax_kernel.cc index f49c1ebbcc..c9eaa76b86 100644 --- a/lib/kernels/test/src/test_softmax_kernel.cc +++ b/lib/kernels/test/src/test_softmax_kernel.cc @@ -6,18 +6,27 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Softmax Kernel Operations") { - int input_n = 1, input_c = 1, input_h = 1, input_w = 100, channels = 100; + nonnegative_int input_n = 1_n; + nonnegative_int input_c = 1_n; + nonnegative_int input_h = 1_n; + nonnegative_int input_w = 100_n; + nonnegative_int channels = 100_n; ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); + TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); TensorShape output_shape = input_shape; - SoftmaxPerDeviceState state = Kernels::Softmax::init_kernel( - managed_handle.raw_handle(), 0, input_n, channels, input_h, input_w); + SoftmaxPerDeviceState state = + Kernels::Softmax::init_kernel(managed_handle.raw_handle(), + 0, + input_n.unwrap_nonnegative(), + channels.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_w.unwrap_nonnegative()); GenericTensorAccessorW output_accessor = create_random_filled_accessor_w(output_shape, allocator); @@ -47,10 +56,10 @@ TEST_SUITE(FF_TEST_SUITE) { managed_stream.raw_stream(), input_grad_accessor.get_float_ptr(), output_grad_accessor.get_float_ptr(), - output_grad_accessor.shape.num_elements()); + output_grad_accessor.shape.num_elements().unwrap_nonnegative()); - std::vector expected_input_grad_data = - std::vector(input_grad_accessor.shape.num_elements(), 1.0f); + std::vector expected_input_grad_data = std::vector( + input_grad_accessor.shape.num_elements().unwrap_nonnegative(), 1.0f); std::vector host_input_grad_data = load_data_to_host_from_device( read_only_accessor_from_write_accessor(input_grad_accessor)); diff --git a/lib/kernels/test/src/test_split_kernel.cc b/lib/kernels/test/src/test_split_kernel.cc index f2346c9244..ea0d280f68 100644 --- a/lib/kernels/test/src/test_split_kernel.cc +++ b/lib/kernels/test/src/test_split_kernel.cc @@ -1,12 +1,13 @@ #include "doctest/doctest.h" #include "kernels/split_kernels.h" #include "test_utils.h" +#include "utils/containers/repeat.h" using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Split Forward and Backward Kernel") { - size_t num_outputs = 2; + nonnegative_int num_outputs = 2_n; coord_t out_blk_sizes[] = {50, 50}; coord_t in_blk_size = 100; coord_t num_blks = 1; @@ -16,15 +17,14 @@ TEST_SUITE(FF_TEST_SUITE) { Allocator allocator = create_local_cuda_memory_allocator(); - TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100}); - TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50}); + TensorShape input_shape = make_float_tensor_shape_from_legion_dims({100_n}); + TensorShape output_shape = make_float_tensor_shape_from_legion_dims({50_n}); SUBCASE("forward_kernel") { 
GenericTensorAccessorW input_accessor = create_random_filled_accessor_w(input_shape, allocator); - std::vector output_ptrs(num_outputs); - generate_n(output_ptrs.begin(), num_outputs, [&]() { + std::vector output_ptrs = repeat(num_outputs, [&]() { GenericTensorAccessorW output_accessor = allocator.allocate_tensor(output_shape); return output_accessor.get_float_ptr(); @@ -36,11 +36,11 @@ TEST_SUITE(FF_TEST_SUITE) { out_blk_sizes, in_blk_size, num_blks, - num_outputs); + num_outputs.unwrap_nonnegative()); } SUBCASE("backward_kernel") { - std::vector output_grad_ptrs(num_outputs); + std::vector output_grad_ptrs(num_outputs.unwrap_nonnegative()); for (int i = 0; i < num_outputs; i++) { GenericTensorAccessorW output_grad_accessor = create_random_filled_accessor_w(output_shape, allocator); @@ -56,7 +56,7 @@ TEST_SUITE(FF_TEST_SUITE) { out_blk_sizes, in_blk_size, num_blks, - num_outputs); + num_outputs.unwrap_nonnegative()); } } } diff --git a/lib/kernels/test/src/test_transpose_kernel.cc b/lib/kernels/test/src/test_transpose_kernel.cc index 2904fa01ae..02d99c86a1 100644 --- a/lib/kernels/test/src/test_transpose_kernel.cc +++ b/lib/kernels/test/src/test_transpose_kernel.cc @@ -5,21 +5,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Test Transpose Kernel Operations") { - std::size_t num_dims = 2; - - std::vector perm = {ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}}; + TransposeAttrs attrs = TransposeAttrs{ + FFOrdered{ + ff_dim_t{0_n}, + ff_dim_t{1_n}, + }, + }; ManagedPerDeviceFFHandle managed_handle{}; ManagedFFStream managed_stream{}; Allocator allocator = create_local_cuda_memory_allocator(); - TransposePerDeviceState state = - Kernels::Transpose::init_kernel(num_dims, perm); - TensorShape input_shape = - make_float_tensor_shape_from_legion_dims({10, 10}); + make_float_tensor_shape_from_legion_dims({10_n, 10_n}); TensorShape output_shape = input_shape; SUBCASE("forward_kernel") { @@ -30,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { allocator.allocate_tensor(output_shape); Kernels::Transpose::forward_kernel( - managed_stream.raw_stream(), state, input_accessor, output_accessor); + managed_stream.raw_stream(), attrs, input_accessor, output_accessor); std::vector host_output_data = load_data_to_host_from_device( @@ -46,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { create_random_filled_accessor_w(input_shape, allocator); Kernels::Transpose::backward_kernel(managed_stream.raw_stream(), - state, + attrs, input_grad_accessor, output_grad_accessor); diff --git a/lib/kernels/test/src/test_utils.cc b/lib/kernels/test/src/test_utils.cc index b591642570..903b666fa9 100644 --- a/lib/kernels/test/src/test_utils.cc +++ b/lib/kernels/test/src/test_utils.cc @@ -4,7 +4,7 @@ GenericTensorAccessorW create_random_filled_accessor_w(TensorShape const &shape, Allocator &allocator, bool cpu_fill) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); + size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); std::vector host_data(volume); std::random_device rd; std::mt19937 gen(rd()); @@ -31,7 +31,7 @@ GenericTensorAccessorW create_filled_accessor_w(TensorShape const &shape, float val, bool cpu_fill) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); + size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); std::vector host_data(volume, val); if (cpu_fill) { @@ -50,7 +50,7 @@ GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape 
const &shape, Allocator &allocator, bool cpu_fill) { GenericTensorAccessorW accessor = allocator.allocate_tensor(shape); - size_t volume = accessor.shape.num_elements(); + size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); std::vector host_data(volume); for (size_t i = 0; i < volume; i++) { @@ -72,8 +72,7 @@ GenericTensorAccessorW create_iota_filled_accessor_w(TensorShape const &shape, void fill_tensor_accessor_w(GenericTensorAccessorW accessor, float val, bool cpu_fill) { - LegionTensorDims dims = accessor.shape.dims; - size_t volume = accessor.shape.num_elements(); + size_t volume = accessor.shape.num_elements().unwrap_nonnegative(); std::vector host_data(volume, val); if (cpu_fill) { @@ -86,7 +85,8 @@ void fill_tensor_accessor_w(GenericTensorAccessorW accessor, } } -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims) { +TensorShape + make_float_tensor_shape_from_legion_dims(FFOrdered dims) { return TensorShape{ TensorDims{ dims, @@ -95,7 +95,8 @@ TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims) { }; } -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered dims) { +TensorShape + make_double_tensor_shape_from_legion_dims(FFOrdered dims) { return TensorShape{ TensorDims{ dims, diff --git a/lib/kernels/test/src/test_utils.h b/lib/kernels/test/src/test_utils.h index 21d4923881..08f0f382fb 100644 --- a/lib/kernels/test/src/test_utils.h +++ b/lib/kernels/test/src/test_utils.h @@ -30,9 +30,11 @@ void fill_tensor_accessor_w(GenericTensorAccessorW accessor, float val, bool cpu_fill = false); -TensorShape make_float_tensor_shape_from_legion_dims(FFOrdered dims); +TensorShape + make_float_tensor_shape_from_legion_dims(FFOrdered dims); -TensorShape make_double_tensor_shape_from_legion_dims(FFOrdered dims); +TensorShape + make_double_tensor_shape_from_legion_dims(FFOrdered dims); template std::vector load_data_to_host_from_device(GenericTensorAccessorR accessor) { @@ -52,16 +54,6 @@ bool contains_non_zero(std::vector &data) { data.begin(), data.end(), [](T const &val) { return val == 0; }); } -template -std::vector repeat(std::size_t n, Func &&func) { - std::vector result; - // result.reserve(n); // Sometimes we don't have default constructor for T - for (std::size_t i = 0; i < n; ++i) { - result.push_back(func()); - } - return result; -} - // Specialize doctest's StringMaker for std::vector template <> struct doctest::StringMaker> { diff --git a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml b/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml index 5f73bbbb8e..db476e771d 100644 --- a/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml +++ b/lib/local-execution/include/local-execution/device_specific_device_states.variant.toml @@ -6,7 +6,7 @@ features = [ includes = [ "kernels/attention_kernels.h", - "kernels/batch_norm_kernels.h", + "kernels/batch_norm_per_device_state.dtg.h", "kernels/conv_2d_kernels.h", "kernels/dropout_kernels.h", "kernels/element_binary_kernels.h", @@ -84,7 +84,3 @@ key = "device_specific_softmax_per_device_state" [[values]] type = "::FlexFlow::DeviceSpecific<::FlexFlow::TopKPerDeviceState>" key = "device_specific_topk_per_device_state" - -[[values]] -type = "::FlexFlow::DeviceSpecific<::FlexFlow::TransposePerDeviceState>" -key = "device_specific_transpose_per_device_state" diff --git a/lib/local-execution/include/local-execution/legion_tensor_shape.h 
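The repeat template deleted from test_utils.h above is the helper the kernel tests now pull in from utils/containers/repeat.h. A sketch of what the relocated version presumably looks like, based on the deleted body and the new call sites (which pass a nonnegative_int count; plain int is used here to keep the sketch self-contained):

    #include <cassert>
    #include <vector>

    // Build a vector by invoking func n times. The element type is deduced
    // from the callable, so it never needs to be default-constructible.
    template <typename F>
    auto repeat(int n, F &&func) -> std::vector<decltype(func())> {
      std::vector<decltype(func())> result;
      for (int i = 0; i < n; ++i) {
        result.push_back(func());
      }
      return result;
    }

    int main() {
      // Usage mirroring the concat/split tests: build n values from a lambda.
      std::vector<int> xs = repeat(3, [] { return 42; });
      assert(xs.size() == 3u && xs.front() == 42);
    }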
b/lib/local-execution/include/local-execution/legion_tensor_shape.h deleted file mode 100644 index 3786383865..0000000000 --- a/lib/local-execution/include/local-execution/legion_tensor_shape.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef _FLEXFLOW_RUNTIME_SRC_TENSOR_SHAPE_H -#define _FLEXFLOW_RUNTIME_SRC_TENSOR_SHAPE_H - -#include "kernels/legion_dim.h" -#include "op-attrs/datatype.h" -#include "op-attrs/ff_dim_t.h" -#include "op-attrs/tensor_shape.dtg.h" -#include "utils/stack_vector/stack_vector.h" -#include "utils/visitable.h" -#include - -namespace FlexFlow { - -// TODO FIXME @lockshaw remove inheritance from legion tensor dims -struct LegionTensorShape : public use_visitable_cmp, - public LegionTensorDims { - LegionTensorShape() = delete; - LegionTensorShape(std::vector const &dims, DataType data_type); - LegionTensorShape(TensorShape const &); - - template - LegionTensorShape(stack_vector const &dims, - DataType data_type) - : LegionTensorDims(dims.start(), dims.end()), data_type(data_type) {} - - operator TensorShape() const; - -public: - DataType data_type; -}; - -ff_dim_t to_ff(legion_dim_t, size_t num_dims); -legion_dim_t legion_dim_from_ff_dim(ff_dim_t, size_t num_dims); - -ff_dim_t to_ff(legion_dim_t, TensorShape const &); -legion_dim_t legion_dim_from_ff_dim(ff_dim_t, TensorShape const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml b/lib/local-execution/include/local-execution/per_device_op_state.variant.toml index f99ff10bb9..0171e3e497 100644 --- a/lib/local-execution/include/local-execution/per_device_op_state.variant.toml +++ b/lib/local-execution/include/local-execution/per_device_op_state.variant.toml @@ -19,7 +19,6 @@ includes = [ "kernels/reshape_kernels.h", "kernels/softmax_kernels.h", "kernels/topk_kernels.h", - "kernels/transpose_kernels.h", ] [[values]] @@ -81,7 +80,3 @@ key = "softmax_per_device_state" [[values]] type = "::FlexFlow::TopKPerDeviceState" key = "topk_per_device_state" - -[[values]] -type = "::FlexFlow::TransposePerDeviceState" -key = "transpose_per_device_state" diff --git a/lib/local-execution/include/local-execution/task_id_t.enum.toml b/lib/local-execution/include/local-execution/task_id_t.enum.toml index 9cbe64c268..b0c82b5d26 100644 --- a/lib/local-execution/include/local-execution/task_id_t.enum.toml +++ b/lib/local-execution/include/local-execution/task_id_t.enum.toml @@ -205,9 +205,6 @@ name = "TOPK_FWD_TASK_ID" [[values]] name = "TOPK_BWD_TASK_ID" -[[values]] -name = "TRANSPOSE_INIT_TASK_ID" - [[values]] name = "TRANSPOSE_FWD_TASK_ID" diff --git a/lib/local-execution/src/legion_tensor_shape.cc b/lib/local-execution/src/legion_tensor_shape.cc deleted file mode 100644 index b227accc2e..0000000000 --- a/lib/local-execution/src/legion_tensor_shape.cc +++ /dev/null @@ -1,15 +0,0 @@ -#include "local-execution/legion_tensor_shape.h" -#include "kernels/legion_dim.h" -#include "op-attrs/tensor_shape.h" - -namespace FlexFlow { - -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, size_t num_dims) { - return legion_dim_t(num_dims - ff_dim.value.get_value() - 1); -} - -legion_dim_t legion_dim_from_ff_dim(ff_dim_t ff_dim, TensorShape const &shape) { - return legion_dim_from_ff_dim(ff_dim, num_dims(shape)); -} - -} // namespace FlexFlow diff --git a/lib/local-execution/src/ops/attention.cc b/lib/local-execution/src/ops/attention.cc index eebef9039d..e652b666a8 100644 --- a/lib/local-execution/src/ops/attention.cc +++ b/lib/local-execution/src/ops/attention.cc @@ -85,10 
+85,10 @@ static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); Allocator allocator = acc.get_allocator(); - size_t qProjSize = acc.get_argument(QPROJSIZE); - size_t kProjSize = acc.get_argument(KPROJSIZE); - size_t vProjSize = acc.get_argument(VPROJSIZE); - size_t oProjSize = acc.get_argument(OPROJSIZE); + nonnegative_int qProjSize = acc.get_argument(QPROJSIZE); + nonnegative_int kProjSize = acc.get_argument(KPROJSIZE); + nonnegative_int vProjSize = acc.get_argument(VPROJSIZE); + nonnegative_int oProjSize = acc.get_argument(OPROJSIZE); PerDeviceFFHandle handle = acc.get_argument(HANDLE); ParallelTensorShape query_parallel_tensor_shape = @@ -108,29 +108,30 @@ static DeviceSpecificDeviceStates key_parallel_tensor_shape, value_parallel_tensor_shape)); - int kvSeqLength = get_kvSeqLength(parsed); - int qSize = get_qSize(parsed); - int kSize = get_kSize(parsed); - int vSize = get_vSize(parsed); - - int qoSeqLength = get_qoSeqLength(parsed); - int num_samples = get_num_samples(parsed); - int num_heads = attrs.num_heads; - - MHAPerDeviceState per_device_state = init_kernel(handle, - allocator, - num_samples, - num_heads, - qSize, - kSize, - vSize, - qProjSize, - kProjSize, - vProjSize, - oProjSize, - qoSeqLength, - kvSeqLength, - attrs.add_bias_kv); + nonnegative_int kvSeqLength = get_kvSeqLength(parsed); + nonnegative_int qSize = get_qSize(parsed); + nonnegative_int kSize = get_kSize(parsed); + nonnegative_int vSize = get_vSize(parsed); + + nonnegative_int qoSeqLength = get_qoSeqLength(parsed); + nonnegative_int num_samples = get_num_samples(parsed); + nonnegative_int num_heads = attrs.num_heads; + + MHAPerDeviceState per_device_state = + init_kernel(handle, + allocator, + num_samples.unwrap_nonnegative(), + num_heads.unwrap_nonnegative(), + qSize.unwrap_nonnegative(), + kSize.unwrap_nonnegative(), + vSize.unwrap_nonnegative(), + qProjSize.unwrap_nonnegative(), + kProjSize.unwrap_nonnegative(), + vProjSize.unwrap_nonnegative(), + oProjSize.unwrap_nonnegative(), + qoSeqLength.unwrap_nonnegative(), + kvSeqLength.unwrap_nonnegative(), + attrs.add_bias_kv); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/batch_matmul.cc b/lib/local-execution/src/ops/batch_matmul.cc index 1eae409ae2..ad331156b5 100644 --- a/lib/local-execution/src/ops/batch_matmul.cc +++ b/lib/local-execution/src/ops/batch_matmul.cc @@ -18,6 +18,8 @@ #include "local-execution/op_task_signature.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/batch_matmul.h" +#include "utils/containers/transform.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -65,24 +67,30 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { FFIterationConfig iter_config = acc.get_argument(ITERATION_CONFIG); - int m = b_input.shape[legion_dim_t(0)]; - assert(m == output.shape[legion_dim_t(0)]); - int n = a_input.shape[legion_dim_t(1)]; - assert(n == output.shape[legion_dim_t(1)]); - int k = a_input.shape[legion_dim_t(0)]; - assert(k == b_input.shape[legion_dim_t(1)]); + nonnegative_int m = b_input.shape.at(legion_dim_t{0_n}); + assert(m == output.shape.at(legion_dim_t{0_n})); + nonnegative_int n = a_input.shape.at(legion_dim_t{1_n}); + assert(n == output.shape.at(legion_dim_t{1_n})); + nonnegative_int k = a_input.shape.at(legion_dim_t{0_n}); + assert(k == b_input.shape.at(legion_dim_t{1_n})); assert(a_input.shape.get_volume() == 
b_input.shape.get_volume()); assert(a_input.shape.get_volume() == output.shape.get_volume()); - int batch = 1; - for (int i = 2; i < a_input.shape.get_dim(); i++) { - int dim_size = a_input.shape[legion_dim_t(i)]; - assert(dim_size == b_input.shape[legion_dim_t(i)]); - assert(dim_size == output.shape[legion_dim_t(i)]); + nonnegative_int batch = 1_n; + for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) { + nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i}); + assert(dim_size == b_input.shape.at(legion_dim_t{i})); + assert(dim_size == output.shape.at(legion_dim_t{i})); batch *= dim_size; } + auto get_raw_seq_len = [](std::optional seq_len) -> int { + return transform(seq_len, + [](nonnegative_int x) { return x.unwrap_nonnegative(); }) + .value_or(-1); + }; + return profile(forward_kernel, profiling, "[BatchMatmul] forward_time = {:.2lf}ms\n", @@ -90,12 +98,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr(), a_input.get_float_ptr(), b_input.get_float_ptr(), - m, - n, - k, - batch, - attrs.a_seq_length_dim, - attrs.b_seq_length_dim, + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative(), + get_raw_seq_len(attrs.a_seq_length_dim), + get_raw_seq_len(attrs.b_seq_length_dim), iter_config.seq_length); } @@ -120,19 +128,20 @@ static std::optional assert(b_input.shape == b_input_grad.shape); // check dins - int m = b_input.shape[legion_dim_t(0)]; - assert(m == output.shape[legion_dim_t(0)]); - int n = a_input.shape[legion_dim_t(1)]; - assert(n == output.shape[legion_dim_t(1)]); - int k = a_input.shape[legion_dim_t(0)]; - assert(k == b_input.shape[legion_dim_t(1)]); + nonnegative_int m = b_input.shape.at(legion_dim_t{0_n}); + assert(m == output.shape.at(legion_dim_t{0_n})); + nonnegative_int n = a_input.shape.at(legion_dim_t{1_n}); + assert(n == output.shape.at(legion_dim_t{1_n})); + nonnegative_int k = a_input.shape.at(legion_dim_t{0_n}); + assert(k == b_input.shape.at(legion_dim_t{1_n})); assert(a_input.shape.get_volume() == b_input.shape.get_volume()); assert(a_input.shape.get_volume() == output.shape.get_volume()); - int batch = 1; - for (int i = 2; i < a_input.shape.dims.num_dims(); i++) { - int dim_size = a_input.shape[legion_dim_t(i)]; - assert(dim_size == b_input.shape[legion_dim_t(i)]); - assert(dim_size == output.shape[legion_dim_t(i)]); + + nonnegative_int batch = 1_n; + for (nonnegative_int i : nonnegative_range(2_n, a_input.shape.get_dim())) { + nonnegative_int dim_size = a_input.shape.at(legion_dim_t{i}); + assert(dim_size == b_input.shape.at(legion_dim_t{i})); + assert(dim_size == output.shape.at(legion_dim_t{i})); batch *= dim_size; } @@ -146,10 +155,10 @@ static std::optional a_input_grad.get_float_ptr(), b_input.get_float_ptr(), b_input_grad.get_float_ptr(), - m, - n, - k, - batch); + m.unwrap_nonnegative(), + n.unwrap_nonnegative(), + k.unwrap_nonnegative(), + batch.unwrap_nonnegative()); } TaskImplFunction get_batch_matmul_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/batch_matmul.h b/lib/local-execution/src/ops/batch_matmul.h index a7e29b1931..23389d5083 100644 --- a/lib/local-execution/src/ops/batch_matmul.h +++ b/lib/local-execution/src/ops/batch_matmul.h @@ -4,7 +4,7 @@ #include "local-execution/op_task_invocation.h" #include "local-execution/op_task_signature.h" #include "local-execution/sim_environment.h" -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" namespace FlexFlow { diff --git 
a/lib/local-execution/src/ops/batch_norm.cc b/lib/local-execution/src/ops/batch_norm.cc index 851566fc02..1df6da8d8e 100644 --- a/lib/local-execution/src/ops/batch_norm.cc +++ b/lib/local-execution/src/ops/batch_norm.cc @@ -75,21 +75,22 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - int output_w = output.shape[legion_dim_t(0)]; - int output_h = output.shape[legion_dim_t(1)]; - int output_c = output.shape[legion_dim_t(2)]; - int output_n = output.shape[legion_dim_t(3)]; + nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); + nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); + nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); + nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); float *runningMean; - BatchNormPerDeviceState per_device_state = init_kernel(handle, - allocator, - runningMean, - output_n, - output_c, - output_h, - output_w, - attrs.relu); + BatchNormPerDeviceState per_device_state = + init_kernel(handle, + allocator, + runningMean, + output_n.unwrap_nonnegative(), + output_c.unwrap_nonnegative(), + output_h.unwrap_nonnegative(), + output_w.unwrap_nonnegative(), + attrs.relu); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -140,7 +141,7 @@ static std::optional scale.get_float_ptr(), scale_grad.get_float_ptr(), bias_grad.get_float_ptr(), - output.shape.get_volume()); + output.shape.get_volume().unwrap_nonnegative()); } TaskImplFunction get_batch_norm_init_task_impl() { diff --git a/lib/local-execution/src/ops/conv_2d.cc b/lib/local-execution/src/ops/conv_2d.cc index d5c6e7f851..ac59143f00 100644 --- a/lib/local-execution/src/ops/conv_2d.cc +++ b/lib/local-execution/src/ops/conv_2d.cc @@ -62,19 +62,19 @@ static DeviceSpecificDeviceStates auto filter_grad = acc.get_tensor_grad(FILTER); Conv2DPerDeviceState per_device_state = - init_kernel(handle, - attrs.activation, - attrs.kernel_h, - attrs.kernel_w, - attrs.groups, - attrs.padding_h, - attrs.padding_w, - attrs.stride_h, - attrs.stride_w, - input, - output, - filter.get_float_ptr(), - filter_grad.get_float_ptr()); + init_kernel(/*handle=*/handle, + /*activation=*/attrs.activation, + /*kernel_h=*/attrs.kernel_h.unwrap_nonnegative(), + /*kernel_w=*/attrs.kernel_w.unwrap_nonnegative(), + /*groups=*/attrs.groups.unwrap_nonnegative(), + /*padding_h=*/attrs.padding_h.unwrap_nonnegative(), + /*padding_w=*/attrs.padding_w.unwrap_nonnegative(), + /*stride_h=*/attrs.stride_h.unwrap_nonnegative(), + /*stride_w=*/attrs.stride_w.unwrap_nonnegative(), + /*input=*/input, + /*output=*/output, + /*filter_ptr=*/filter.get_float_ptr(), + /*filter_grad_ptr=*/filter_grad.get_float_ptr()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/gather.cc b/lib/local-execution/src/ops/gather.cc index a015c64f4d..a43c0f757f 100644 --- a/lib/local-execution/src/ops/gather.cc +++ b/lib/local-execution/src/ops/gather.cc @@ -15,8 +15,8 @@ #include "gather.h" #include "kernels/gather_kernels.h" -#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include namespace FlexFlow { @@ -72,10 +72,11 @@ static DeviceSpecificDeviceStates assert(input.shape.get_dim() == index.shape.get_dim()); assert(output.shape.get_dim() == index.shape.get_dim()); - for (int i = 0; i < input.shape.get_dim(); i++) { - assert(index.shape[legion_dim_t(i)] == 
output.shape[legion_dim_t(i)]); + for (nonnegative_int i : nonnegative_range(input.shape.get_dim())) { + assert(index.shape.at(legion_dim_t{i}) == output.shape.at(legion_dim_t{i})); if (i != legion_dim.value) { - assert(input.shape[legion_dim_t(i)] == index.shape[legion_dim_t(i)]); + assert(input.shape.at(legion_dim_t{i}) == + index.shape.at(legion_dim_t{i})); } } diff --git a/lib/local-execution/src/ops/layer_norm.cc b/lib/local-execution/src/ops/layer_norm.cc index e99d27319c..c01475d4a4 100644 --- a/lib/local-execution/src/ops/layer_norm.cc +++ b/lib/local-execution/src/ops/layer_norm.cc @@ -15,12 +15,12 @@ #include "layer_norm.h" #include "kernels/layer_norm_kernels.h" -#include "local-execution/legion_tensor_shape.h" #include "op-attrs/get_output_shapes.h" #include "op-attrs/ops/layer_norm.h" #include "op-attrs/parallel_tensor_shape.h" #include "utils/exception.h" #include "utils/hash-utils.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include namespace FlexFlow { @@ -119,27 +119,25 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto handle = acc.get_argument(HANDLE); - // question: how to get batch_size and effective_num_elements - int64_t effective_batch_size, effective_num_elements; - int M = 1; + nonnegative_int M = 1_n; for (int i = 0; i < attrs.axes.size(); i++) { - legion_dim_t legion_dim = legion_dim_from_ff_dim( - attrs.axes[i], get_tensor_shape(input.shape, input.data_type)); + legion_dim_t legion_dim = + legion_dim_from_ff_dim(attrs.axes[i], input.shape.num_dims()); M *= input.shape.at(legion_dim); } - int num_replicas = 1; - for (int i = 0; i < input.shape.num_dims(); i++) { - num_replicas *= input.shape.at(legion_dim_t(i)); - effective_num_elements = M; - effective_batch_size = input.shape.get_volume() / M; + nonnegative_int num_replicas = 1_n; + for (nonnegative_int i : nonnegative_range(input.shape.num_dims())) { + num_replicas *= input.shape.at(legion_dim_t{i}); } + nonnegative_int effective_num_elements = M; + nonnegative_int effective_batch_size = input.shape.get_volume() / M; LayerNormPerDeviceState per_device_state = init_kernel(handle, allocator, attrs.elementwise_affine, - effective_batch_size, - effective_num_elements, + effective_batch_size.unwrap_nonnegative(), + effective_num_elements.unwrap_nonnegative(), attrs.eps); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/ops/linear.cc b/lib/local-execution/src/ops/linear.cc index 3e0b4672ab..e10f1a8e9c 100644 --- a/lib/local-execution/src/ops/linear.cc +++ b/lib/local-execution/src/ops/linear.cc @@ -66,21 +66,22 @@ static DeviceSpecificDeviceStates auto input = acc.get_tensor(INPUT); auto weight = acc.get_tensor(WEIGHT); auto output = acc.get_tensor(OUTPUT); - int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}); - int batch_size = output.shape.at(ff_dim_t{nonnegative_int{1}}); + nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); + nonnegative_int batch_size = output.shape.at(ff_dim_t{1_n}); float *one_ptr; - LinearPerDeviceState per_device_state = init_kernel(handle, - one_ptr, - attrs.activation, - attrs.regularizer, - attrs.use_bias, - input.data_type, - weight.data_type, - output.data_type, - batch_size, - attrs.out_channels); + LinearPerDeviceState per_device_state = + init_kernel(handle, + one_ptr, + attrs.activation, + attrs.regularizer, + attrs.use_bias, + input.data_type, + weight.data_type, + output.data_type, + batch_size.unwrap_nonnegative(), + attrs.out_channels.unwrap_nonnegative()); 
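The layer_norm change above now calls legion_dim_from_ff_dim(axis, num_dims) directly; the lib/local-execution copy of that helper is deleted earlier in this patch, so an overload evidently survives in a lower-level header. Per the deleted definition, the mapping simply mirrors an index across the dimension list:

    #include <cassert>

    // From the deleted legion_tensor_shape.cc:
    // legion_dim = num_dims - ff_dim - 1, i.e. FF order and Legion order
    // count dimensions from opposite ends of the shape.
    int legion_dim_from_ff_dim(int ff_dim, int num_dims) {
      return num_dims - ff_dim - 1;
    }

    int main() {
      // In a 4-d shape, ff dim 0 and legion dim 3 name the same axis.
      assert(legion_dim_from_ff_dim(0, 4) == 3);
      assert(legion_dim_from_ff_dim(3, 4) == 0);
      // The mapping is its own inverse: applying it twice returns the index.
      assert(legion_dim_from_ff_dim(legion_dim_from_ff_dim(1, 4), 4) == 1);
    }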
return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } @@ -96,9 +97,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); auto attrs = acc.get_argument(ATTRS); - int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; - int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; - int batch_size = output.shape.get_volume() / out_dim; + nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); + nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); + nonnegative_int batch_size = output.shape.get_volume() / out_dim; float const *bias_ptr = NULL; if (attrs.use_bias) { @@ -113,9 +114,9 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { output.get_float_ptr(), weight.get_float_ptr(), bias_ptr, - in_dim, - out_dim, - batch_size); + in_dim.unwrap_nonnegative(), + out_dim.unwrap_nonnegative(), + batch_size.unwrap_nonnegative()); } ; @@ -140,9 +141,9 @@ static std::optional bias_ptr = bias.get_float_ptr(); } - int in_dim = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; - int out_dim = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; - int batch_size = output.shape.get_volume() / out_dim; + nonnegative_int in_dim = input.shape.at(ff_dim_t{0_n}); + nonnegative_int out_dim = output.shape.at(ff_dim_t{0_n}); + nonnegative_int batch_size = output.shape.get_volume() / out_dim; return profile(backward_kernel, profiling, @@ -155,9 +156,9 @@ (void *)weight.get_float_ptr(), (void *)weight_grad.get_float_ptr(), (void *)bias_ptr, - in_dim, - out_dim, - batch_size); + in_dim.unwrap_nonnegative(), + out_dim.unwrap_nonnegative(), + batch_size.unwrap_nonnegative()); } TaskImplFunction get_linear_init_task_impl() { diff --git a/lib/local-execution/src/ops/pool_2d.cc b/lib/local-execution/src/ops/pool_2d.cc index 3ab33a2ad6..897b545569 100644 --- a/lib/local-execution/src/ops/pool_2d.cc +++ b/lib/local-execution/src/ops/pool_2d.cc @@ -22,6 +22,20 @@ OpTaskInvocation init(Pool2DAttrs const &attrs) { return {task_id_t::POOL2D_INIT_TASK_ID, binding}; } +static nonnegative_int calculate_padding(nonnegative_int output_size, + nonnegative_int stride, + nonnegative_int kernel_size, + nonnegative_int input_size) { + int o = output_size.unwrap_nonnegative(); + int s = stride.unwrap_nonnegative(); + int k = kernel_size.unwrap_nonnegative(); + int i = input_size.unwrap_nonnegative(); + + return nonnegative_int{ + ((o - 1) * s + k - i + 1) / 2, + }; +} + static DeviceSpecificDeviceStates init_task_impl(TaskArgumentAccessor const &acc) { auto const &attrs = acc.get_argument(ATTRS); @@ -30,56 +44,33 @@ auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - int input_w = input.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; - int input_h = input.shape.at(ff_dim_t{nonnegative_int{1}}) + 1; - int input_c = input.shape.at(ff_dim_t{nonnegative_int{2}}) + 1; - int input_n = input.shape.at(ff_dim_t{nonnegative_int{3}}) + 1; - int output_w = output.shape.at(ff_dim_t{nonnegative_int{0}}) + 1; - int output_h = output.shape.at(ff_dim_t{nonnegative_int{1}}) + 1; - int output_c = output.shape.at(ff_dim_t{nonnegative_int{2}}) + 1; - int output_n = output.shape.at(ff_dim_t{nonnegative_int{3}}) + 1; - - printf("init pool (input): n(%d) c(%d) h(%d) " "w(%d)\n", - input_n, - input_c, - input_h, - input_w); - printf("init pool (output): n(%d) c(%d) h(%d) w(%d)\n", - output_n, - output_c, - output_h, - output_w); - - int
pad_h = - ((output_h - 1) * attrs.stride_h + attrs.kernel_h - input_h + 1) / 2; - int pad_w = - ((output_w - 1) * attrs.stride_w + attrs.kernel_w - input_w + 1) / 2; - if (pad_h != attrs.padding_h) { - printf("Warning: changing pool_padding_h to satisfy output_h size\n"); - } - - if (pad_w != attrs.padding_w) { - printf("Warning: changing pool_padding_w to satisfy output_w size\n"); - } - - Pool2DPerDeviceState per_device_state = init_kernel(handle, - attrs.activation, - input_w, - input_h, - input_c, - input_n, - output_w, - output_h, - output_c, - output_n, - pad_h, - pad_w, - attrs.kernel_h, - attrs.kernel_w, - attrs.stride_h, - attrs.stride_w, - attrs.pool_type); + nonnegative_int input_w = input.shape.at(ff_dim_t{0_n}); + nonnegative_int input_h = input.shape.at(ff_dim_t{1_n}); + nonnegative_int input_c = input.shape.at(ff_dim_t{2_n}); + nonnegative_int input_n = input.shape.at(ff_dim_t{3_n}); + nonnegative_int output_w = output.shape.at(ff_dim_t{0_n}); + nonnegative_int output_h = output.shape.at(ff_dim_t{1_n}); + nonnegative_int output_c = output.shape.at(ff_dim_t{2_n}); + nonnegative_int output_n = output.shape.at(ff_dim_t{3_n}); + + Pool2DPerDeviceState per_device_state = + init_kernel(handle, + attrs.activation, + input_w.unwrap_nonnegative(), + input_h.unwrap_nonnegative(), + input_c.unwrap_nonnegative(), + input_n.unwrap_nonnegative(), + output_w.unwrap_nonnegative(), + output_h.unwrap_nonnegative(), + output_c.unwrap_nonnegative(), + output_n.unwrap_nonnegative(), + attrs.padding_h.unwrap_nonnegative(), + attrs.padding_w.unwrap_nonnegative(), + attrs.kernel_h.unwrap_nonnegative(), + attrs.kernel_w.unwrap_nonnegative(), + attrs.stride_h.unwrap_nonnegative(), + attrs.stride_w.unwrap_nonnegative(), + attrs.pool_type); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; diff --git a/lib/local-execution/src/ops/reduce.cc b/lib/local-execution/src/ops/reduce.cc index a043d9f847..3f92d7fd77 100644 --- a/lib/local-execution/src/ops/reduce.cc +++ b/lib/local-execution/src/ops/reduce.cc @@ -41,9 +41,14 @@ static DeviceSpecificDeviceStates OperatorType op_type = attrs.op_type; - size_t reduction_size = input.shape.get_volume() / output.shape.get_volume(); + nonnegative_int reduction_size = + input.shape.get_volume() / output.shape.get_volume(); ReducePerDeviceState per_device_state = - init_kernel(handle, op_type, reduction_size, input.shape, output.shape); + init_kernel(handle, + op_type, + reduction_size.unwrap_nonnegative(), + input.shape, + output.shape); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; } diff --git a/lib/local-execution/src/ops/reduction.cc b/lib/local-execution/src/ops/reduction.cc index a58d79a4f8..0892bcde82 100644 --- a/lib/local-execution/src/ops/reduction.cc +++ b/lib/local-execution/src/ops/reduction.cc @@ -50,14 +50,14 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - size_t num_replicas = attrs.reduction_degree; + nonnegative_int num_replicas = attrs.reduction_degree; return profile(forward_kernel, profiling_settings, "[Reduction] forward_time = {:.2lf}ms\n", input, output, - num_replicas); + num_replicas.unwrap_nonnegative()); } static std::optional diff --git a/lib/local-execution/src/ops/replicate.cc b/lib/local-execution/src/ops/replicate.cc index 135475a711..d7b06d6bfe 100644 --- a/lib/local-execution/src/ops/replicate.cc +++ b/lib/local-execution/src/ops/replicate.cc @@ -62,14 +62,14 @@ static 
std::optional auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); - auto const &attrs = acc.get_argument(ATTRS); + auto attrs = acc.get_argument(ATTRS); return profile(backward_kernel, profiling, "[replicate] backward_time = {:.2lf}ms\n", input_grad, output_grad, - attrs.replicate_degree); + attrs.replicate_degree.unwrap_nonnegative()); } TaskImplFunction get_replicate_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/reverse.cc b/lib/local-execution/src/ops/reverse.cc index 8ac4c045c7..94dfc90f7a 100644 --- a/lib/local-execution/src/ops/reverse.cc +++ b/lib/local-execution/src/ops/reverse.cc @@ -17,6 +17,7 @@ #include "kernels/accessor.h" #include "kernels/reverse_kernels.h" #include "op-attrs/get_output_shapes.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -48,16 +49,18 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int output_size = output.shape.get_volume(); + nonnegative_int output_size = output.shape.get_volume(); auto axis = attrs.axis; - coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < output.shape.get_dim(); i++) { + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(output.shape.get_dim())) { if (i < axis.value) { - in_blk_size *= output.shape.at(ff_dim_t{nonnegative_int{i}}); + in_blk_size *= output.shape.at(ff_dim_t{i}); } else if (i == axis.value) { - reverse_dim_size = output.shape.at(ff_dim_t{nonnegative_int{i}}); + reverse_dim_size = output.shape.at(ff_dim_t{i}); } else { - num_out_blks *= output.shape.at(ff_dim_t{nonnegative_int{i}}); + num_out_blks *= output.shape.at(ff_dim_t{i}); } } @@ -66,10 +69,10 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { "[reverse] forward_time = {:.2lf}ms\n", input.get_float_ptr(), output.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - output_size); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + output_size.unwrap_nonnegative()); } static std::optional @@ -79,15 +82,18 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - int axis = input_grad.shape.get_dim() - attrs.axis.value.get_value() - 1; - coord_t in_blk_size = 1, reverse_dim_size = 1, num_out_blks = 1; - for (int i = 0; i < input_grad.shape.get_dim(); i++) { + int axis = input_grad.shape.num_dims().unwrap_nonnegative() - + attrs.axis.value.unwrap_nonnegative() - 1; + nonnegative_int in_blk_size = 1_n; + nonnegative_int reverse_dim_size = 1_n; + nonnegative_int num_out_blks = 1_n; + for (nonnegative_int i : nonnegative_range(input_grad.shape.get_dim())) { if (i < axis) { - in_blk_size *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); + in_blk_size *= input_grad.shape.at(ff_dim_t{i}); } else if (i == axis) { - reverse_dim_size = input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); + reverse_dim_size = input_grad.shape.at(ff_dim_t{i}); } else { - num_out_blks *= input_grad.shape.at(ff_dim_t{nonnegative_int{i}}); + num_out_blks *= input_grad.shape.at(ff_dim_t{i}); } } @@ -96,10 +102,10 @@ static std::optional "[reverse] backward_time = {:.2lf}ms\n", output_grad.get_float_ptr(), input_grad.get_float_ptr(), - num_out_blks, - reverse_dim_size, - in_blk_size, - 
input_grad.shape.get_volume()); + num_out_blks.unwrap_nonnegative(), + reverse_dim_size.unwrap_nonnegative(), + in_blk_size.unwrap_nonnegative(), + input_grad.shape.get_volume().unwrap_nonnegative()); } TaskImplFunction get_reverse_fwd_task_impl() { diff --git a/lib/local-execution/src/ops/softmax.cc b/lib/local-execution/src/ops/softmax.cc index 8d412c739b..ca5450f4f0 100644 --- a/lib/local-execution/src/ops/softmax.cc +++ b/lib/local-execution/src/ops/softmax.cc @@ -59,18 +59,18 @@ static DeviceSpecificDeviceStates auto output = acc.get_tensor(OUTPUT); auto const &attrs = acc.get_argument(ATTRS); - int output_w = output.shape.at(legion_dim_t(0)); - int output_h = output.shape.at(legion_dim_t(1)); - int output_c = output.shape.at(legion_dim_t(2)); - int output_n = output.shape.at(legion_dim_t(3)); + nonnegative_int output_w = output.shape.at(legion_dim_t{0_n}); + nonnegative_int output_h = output.shape.at(legion_dim_t{1_n}); + nonnegative_int output_c = output.shape.at(legion_dim_t{2_n}); + nonnegative_int output_n = output.shape.at(legion_dim_t{3_n}); SoftmaxPerDeviceState per_device_state = init_kernel(handle, - attrs.dim.value.get_value(), - output_n, - output_c, - output_h, - output_w); + attrs.dim.value.unwrap_nonnegative(), + output_n.unwrap_nonnegative(), + output_c.unwrap_nonnegative(), + output_h.unwrap_nonnegative(), + output_w.unwrap_nonnegative()); return DeviceSpecificDeviceStates{ DeviceSpecific::create(per_device_state)}; @@ -109,7 +109,7 @@ static std::optional "[SoftMax] backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), output_grad.get_float_ptr(), - output_grad.shape.get_volume()); + output_grad.shape.get_volume().unwrap_nonnegative()); } TaskImplFunction get_softmax_init_task_impl() { diff --git a/lib/local-execution/src/ops/split.cc b/lib/local-execution/src/ops/split.cc index c289bca205..f119ae235b 100644 --- a/lib/local-execution/src/ops/split.cc +++ b/lib/local-execution/src/ops/split.cc @@ -19,6 +19,7 @@ #include "op-attrs/get_output_shapes.h" #include "utils/exception.h" #include "utils/hash-utils.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -44,19 +45,18 @@ OpTaskInvocation backward(SplitAttrs const &attrs) { return {task_id_t::SPLIT_BWD_TASK_ID, binding}; } -void calc_block_size(coord_t &num_blocks, - coord_t &block_size, - ArrayShape const &array_shape, - ff_dim_t axis) { - num_blocks = 1; - block_size = 1; - for (int d = 0; d < array_shape.num_elements(); d++) { - if (d <= axis.value.get_value()) { - block_size *= array_shape.at(legion_dim_t(d)); +static std::pair + calc_block_size(ArrayShape const &array_shape, ff_dim_t axis) { + nonnegative_int num_blocks = 1_n; + nonnegative_int block_size = 1_n; + for (nonnegative_int d : nonnegative_range(array_shape.num_elements())) { + if (d <= axis.value) { + block_size *= array_shape.at(legion_dim_t{d}); } else { - num_blocks *= array_shape.at(legion_dim_t(d)); + num_blocks *= array_shape.at(legion_dim_t{d}); } } + return {num_blocks, block_size}; } static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { @@ -65,13 +65,12 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto output = acc.get_tensor(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blocks, in_block_size, input.shape, attrs.axis); + coord_t out_block_sizes[MAX_NUM_OUTPUTS]; + auto [num_blocks, in_block_size] = calc_block_size(input.shape, attrs.axis); for (int i = 0; i < 
attrs.splits.size(); i++) { - coord_t out_num_blocks; - calc_block_size( - out_num_blocks, out_block_size[i], output.shape, attrs.axis); + auto [_, out_block_size] = calc_block_size(output.shape, attrs.axis); + out_block_sizes[i] = out_block_size.unwrap_nonnegative(); } float *output_float_ptr = output.get_float_ptr(); return profile(forward_kernel, @@ -79,9 +78,9 @@ "Split forward_time = {:.2lf}ms\n", &output_float_ptr, input.get_float_ptr(), - out_block_size, - in_block_size, - num_blocks, + out_block_sizes, + in_block_size.unwrap_nonnegative(), + num_blocks.unwrap_nonnegative(), attrs.splits.size()); } @@ -93,12 +92,14 @@ static std::optional auto output_grad = acc.get_tensor_grad(OUTPUT); auto attrs = acc.get_argument(ATTRS); - coord_t num_blocks, in_block_size, out_block_size[MAX_NUM_OUTPUTS]; - calc_block_size(num_blocks, in_block_size, input_grad.shape, attrs.axis); + coord_t out_block_sizes[MAX_NUM_OUTPUTS]; + auto [num_blocks, in_block_size] = + calc_block_size(input_grad.shape, attrs.axis); + for (int i = 0; i < attrs.splits.size(); i++) { - coord_t out_num_blocks; - calc_block_size( - out_num_blocks, out_block_size[i], output_grad.shape, attrs.axis); + auto [_, out_block_size] = calc_block_size(output_grad.shape, attrs.axis); + out_block_sizes[i] = out_block_size.unwrap_nonnegative(); } float const *output_grad_ptr = output_grad.get_float_ptr(); return profile(backward_kernel, @@ -106,9 +107,9 @@ "Split backward_time = {:.2lf}ms\n", input_grad.get_float_ptr(), &output_grad_ptr, - out_block_size, - in_block_size, - num_blocks, + out_block_sizes, + in_block_size.unwrap_nonnegative(), + num_blocks.unwrap_nonnegative(), attrs.splits.size()); } diff --git a/lib/local-execution/src/ops/topk.cc b/lib/local-execution/src/ops/topk.cc index 7f3519529a..e9d202a38f 100644 --- a/lib/local-execution/src/ops/topk.cc +++ b/lib/local-execution/src/ops/topk.cc @@ -75,8 +75,8 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); - int length = input.shape.at(legion_dim_t(0)) + 1; - size_t batch_size = input.shape.get_volume() / length; + nonnegative_int length = input.shape.at(legion_dim_t{0_n}); + nonnegative_int batch_size = input.shape.get_volume() / length; auto indices = acc.get_tensor(INDICES); return profile(forward_kernel, @@ -86,9 +86,9 @@ forward_task_impl(TaskArgumentAccessor const &acc) { input.get_float_ptr(), output.get_float_ptr(), indices.get_int32_ptr(), - batch_size, - length, - attrs.k, + batch_size.unwrap_nonnegative(), + length.unwrap_nonnegative(), + attrs.k.unwrap_nonnegative(), attrs.sorted); } @@ -104,8 +104,8 @@ static std::optional auto indices = acc.get_tensor(INDICES); - int length = input_grad.shape.at(legion_dim_t(0)) + 1; - size_t batch_size = input_grad.shape.get_volume() / length; + nonnegative_int length = input_grad.shape.at(legion_dim_t{0_n}); + nonnegative_int batch_size = input_grad.shape.get_volume() / length; return profile(backward_kernel, profiling, @@ -114,9 +114,9 @@ output_grad.get_float_ptr(), indices.get_int32_ptr(), input_grad.get_float_ptr(), - batch_size, - length, - attrs.k); + batch_size.unwrap_nonnegative(), + length.unwrap_nonnegative(), + attrs.k.unwrap_nonnegative()); } TaskImplFunction get_topk_init_task_impl() { diff --git a/lib/local-execution/src/ops/transpose.cc b/lib/local-execution/src/ops/transpose.cc
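One pattern worth spelling out from the split.cc hunk above: calc_block_size used to write its two results through reference out-parameters; it now returns them as a pair consumed with structured bindings. A self-contained sketch of the refactored shape (plain ints for nonnegative_int, and a plain vector for ArrayShape):

    #include <cassert>
    #include <utility>
    #include <vector>

    // Returns {num_blocks, block_size}: dims up to and including axis fold
    // into block_size, the rest into num_blocks (mirroring split.cc's helper).
    std::pair<int, int> calc_block_size(std::vector<int> const &dims, int axis) {
      int num_blocks = 1;
      int block_size = 1;
      for (int d = 0; d < static_cast<int>(dims.size()); d++) {
        if (d <= axis) {
          block_size *= dims[d];
        } else {
          num_blocks *= dims[d];
        }
      }
      return {num_blocks, block_size};
    }

    int main() {
      // Callers destructure the result instead of pre-declaring out-params:
      auto [num_blocks, block_size] = calc_block_size({100}, /*axis=*/0);
      assert(num_blocks == 1 && block_size == 100);
    }

Returning the pair also removes the uninitialized out-parameter declarations the old signature forced on every caller.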
index 53cf1f20ed..5f183305ab 100644 --- a/lib/local-execution/src/ops/transpose.cc +++ b/lib/local-execution/src/ops/transpose.cc @@ -28,39 +28,11 @@ enum Slots { OUTPUT, // tensor ATTRS, PROFILING, - PER_DEVICE_STATE, }; -OpTaskInvocation init(TransposeAttrs const &attrs) { - OpTaskBinding binding; - binding.bind_arg(ATTRS, attrs); - return {task_id_t::TRANSPOSE_INIT_TASK_ID, binding}; -} - -static DeviceSpecificDeviceStates - init_task_impl(TaskArgumentAccessor const &acc) { - auto const &attrs = acc.get_argument(ATTRS); - int size = int_from_size_t(attrs.perm.size()); - - std::vector perm = [&] { - std::vector result; - for (int i : range(size)) { - result.push_back(ff_dim_t{nonnegative_int{size - i - 1}}); - } - return result; - }(); - - TransposePerDeviceState per_device_state = init_kernel(size, perm); - - return DeviceSpecificDeviceStates{ - DeviceSpecific::create(per_device_state)}; -} - OpTaskInvocation forward(TransposeAttrs const &attrs) { OpTaskBinding binding; - binding.bind_arg(PER_DEVICE_STATE, - per_device_op_state()); binding.bind_arg(PROFILING, profiling_settings()); binding.bind(INPUT, input_tensor(0)); @@ -71,8 +43,7 @@ OpTaskInvocation forward(TransposeAttrs const &attrs) { static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input = acc.get_tensor(INPUT); auto output = acc.get_tensor(OUTPUT); @@ -80,7 +51,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { return profile(forward_kernel, profiling, "[Transpose] Forward_time = {:.2lf} [ms]", - per_device_state, + attrs, input, output); } @@ -88,8 +59,7 @@ static std::optional forward_task_impl(TaskArgumentAccessor const &acc) { static std::optional backward_task_impl(TaskArgumentAccessor const &acc) { ProfilingSettings profiling = acc.get_argument(PROFILING); - auto per_device_state = - acc.get_argument(PER_DEVICE_STATE); + auto attrs = acc.get_argument(ATTRS); auto input_grad = acc.get_tensor_grad(INPUT); auto output_grad = acc.get_tensor_grad(OUTPUT); @@ -97,7 +67,7 @@ static std::optional return profile(backward_kernel, profiling, "[Transpose] Backward_time = {:.2lf} [ms]", - per_device_state, + attrs, input_grad, output_grad); } @@ -108,42 +78,31 @@ OpTaskInvocation backward(TransposeAttrs const &attrs) { return {task_id_t::TRANSPOSE_BWD_TASK_ID, binding}; } -TaskImplFunction get_transpose_init_task_impl() { - return TaskImplFunction{InitTaskImplFunction{init_task_impl}}; -} TaskImplFunction get_transpose_fwd_task_impl() { return TaskImplFunction{FwdBwdTaskImplFunction{forward_task_impl}}; } + TaskImplFunction get_transpose_bwd_task_impl() { return TaskImplFunction{FwdBwdTaskImplFunction{backward_task_impl}}; } -OpTaskSignature get_transpose_init_signature() { - OpTaskSignature init(OpTaskType::INIT); - - init.add_arg_slot(ATTRS); - init.add_return_value(); - return init; -} OpTaskSignature get_transpose_fwd_signature() { OpTaskSignature fwd(OpTaskType::FWD); fwd.add_arg_slot(PROFILING); - fwd.add_unchecked_arg_slot(PER_DEVICE_STATE); fwd.add_input_slot(INPUT); fwd.add_output_slot(OUTPUT); return fwd; } + OpTaskSignature get_transpose_bwd_signature() { OpTaskSignature bwd = infer_bwd_signature(get_transpose_fwd_signature()); return bwd; } std::vector get_task_ids(TransposeAttrs const &) { - return {task_id_t::TRANSPOSE_INIT_TASK_ID, - task_id_t::TRANSPOSE_FWD_TASK_ID, - task_id_t::TRANSPOSE_BWD_TASK_ID}; 
+ return {task_id_t::TRANSPOSE_FWD_TASK_ID, task_id_t::TRANSPOSE_BWD_TASK_ID}; } } // namespace FlexFlow diff --git a/lib/local-execution/src/ops/transpose.h b/lib/local-execution/src/ops/transpose.h index 0f3a2e80a0..f2ce014aa7 100644 --- a/lib/local-execution/src/ops/transpose.h +++ b/lib/local-execution/src/ops/transpose.h @@ -9,15 +9,12 @@ namespace FlexFlow { std::vector<task_id_t> get_task_ids(TransposeAttrs const &); -TaskImplFunction get_transpose_init_task_impl(); TaskImplFunction get_transpose_fwd_task_impl(); TaskImplFunction get_transpose_bwd_task_impl(); -OpTaskSignature get_transpose_init_signature(); OpTaskSignature get_transpose_fwd_signature(); OpTaskSignature get_transpose_bwd_signature(); -OpTaskInvocation init(TransposeAttrs const &); OpTaskInvocation forward(TransposeAttrs const &); OpTaskInvocation backward(TransposeAttrs const &); diff --git a/lib/local-execution/src/task_signature_impl.cc b/lib/local-execution/src/task_signature_impl.cc index ca428aad25..60928d42d7 100644 --- a/lib/local-execution/src/task_signature_impl.cc +++ b/lib/local-execution/src/task_signature_impl.cc @@ -193,9 +193,6 @@ TaskSignatureAndImpl get_task_sig_impl(task_id_t const &task_id) { case task_id_t::TOPK_BWD_TASK_ID: return TaskSignatureAndImpl{get_topk_bwd_task_impl(), get_topk_bwd_signature()}; - case task_id_t::TRANSPOSE_INIT_TASK_ID: - return TaskSignatureAndImpl{get_transpose_init_task_impl(), - get_transpose_init_signature()}; case task_id_t::TRANSPOSE_FWD_TASK_ID: return TaskSignatureAndImpl{get_transpose_fwd_task_impl(), get_transpose_fwd_signature()}; @@ -296,7 +293,6 @@ OpTaskInvocation init(ComputationGraphOpAttrs const &op) { [](ReshapeAttrs const &attrs) { return init(attrs); }, [](SoftmaxAttrs const &attrs) { return init(attrs); }, [](TopKAttrs const &attrs) { return init(attrs); }, - [](TransposeAttrs const &attrs) { return init(attrs); }, [](auto const &attrs) -> OpTaskInvocation { throw mk_runtime_error(fmt::format("Unhandled attr type {}", attrs)); }, diff --git a/lib/local-execution/test/src/test_local_slots_backing.cc b/lib/local-execution/test/src/test_local_slots_backing.cc index 1ec441fbca..46827e3981 100644 --- a/lib/local-execution/test/src/test_local_slots_backing.cc +++ b/lib/local-execution/test/src/test_local_slots_backing.cc @@ -19,16 +19,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalSlotsBacking -- Attention Op") { // allocate input memory Allocator allocator = create_local_cpu_memory_allocator(); - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered<size_t>{batch_size, seq_len, feature_size}}, + TensorDims{ + FFOrdered<nonnegative_int>{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; TensorShape query_shape = input_tensor_shape; diff --git a/lib/local-execution/test/src/test_local_task_arg_accessor.cc b/lib/local-execution/test/src/test_local_task_arg_accessor.cc index f52fccb1ed..0fab0f6a60 100644 --- a/lib/local-execution/test/src/test_local_task_arg_accessor.cc +++ b/lib/local-execution/test/src/test_local_task_arg_accessor.cc @@ -9,16 +9,17 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("LocalTaskArgumentAccessor") { Allocator allocator = create_local_cpu_memory_allocator(); - int embed_dim = 32; - int
num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; DataType dtype = DataType::FLOAT; TensorShape input_tensor_shape = TensorShape{ - TensorDims{FFOrdered<size_t>{batch_size, seq_len, feature_size}}, + TensorDims{ + FFOrdered<nonnegative_int>{batch_size, seq_len, feature_size}}, DataType::FLOAT, }; diff --git a/lib/local-execution/test/src/test_task_registry.cc b/lib/local-execution/test/src/test_task_registry.cc index e18b7ea2de..58d6d9be6c 100644 --- a/lib/local-execution/test/src/test_task_registry.cc +++ b/lib/local-execution/test/src/test_task_registry.cc @@ -14,8 +14,8 @@ TEST_SUITE(FF_TEST_SUITE) { TaskRegistry task_registry = empty_task_registry(); layer_guid_t layer_guid = layer_guid_t{Node{0}}; - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; ComputationGraphOpAttrs attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -76,7 +76,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(correct_task_mapping == task_registry.task_mapping); } SUBCASE("different attrs, still same task fn mapping") { - int embed_dim = 100; + nonnegative_int embed_dim = 100_n; layer_guid_t layer_3 = layer_guid_t{Node{3}}; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -98,7 +98,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("equality") { TaskRegistry other_task_registry = empty_task_registry(); SUBCASE("different attrs is still equal") { - int embed_dim = 100; + nonnegative_int embed_dim = 100_n; ComputationGraphOpAttrs other_attrs = ComputationGraphOpAttrs{MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, diff --git a/lib/models/include/models/bert/bert_config.struct.toml b/lib/models/include/models/bert/bert_config.struct.toml index 398210cf48..cc2a8eb0a7 100644 --- a/lib/models/include/models/bert/bert_config.struct.toml +++ b/lib/models/include/models/bert/bert_config.struct.toml @@ -12,27 +12,28 @@ features = [ includes = [ "op-attrs/activation.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "vocab_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "hidden_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_encoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dim_feedforward" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "hidden_act" @@ -64,8 +65,8 @@ type = "float" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml index 667a6531c3..e7d83efd07 100644 --- a/lib/models/include/models/candle_uno/candle_uno_config.struct.toml +++ b/lib/models/include/models/candle_uno/candle_uno_config.struct.toml @@ -14,6 +14,7 @@ includes = [ "", "", "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -25,19 +26,19 @@ src_includes = [ [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dense_layers" -type = "std::vector<int>" +type = 
"std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "dense_feature_layers" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "feature_shapes" -type = "std::map" +type = "std::map" [[fields]] name = "input_features" diff --git a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml index a2a75c83bb..1290420e16 100644 --- a/lib/models/include/models/inception_v3/inception_v3_config.struct.toml +++ b/lib/models/include/models/inception_v3/inception_v3_config.struct.toml @@ -10,13 +10,17 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_classes" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "aux_logits" diff --git a/lib/models/include/models/split_test/split_test.h b/lib/models/include/models/split_test/split_test.h index b03e45b2d2..dd7089c4f6 100644 --- a/lib/models/include/models/split_test/split_test.h +++ b/lib/models/include/models/split_test/split_test.h @@ -12,7 +12,7 @@ namespace FlexFlow { * @note This is a tiny model developed for testing the original Unity * implementation. It is not a "real" model and has never been trained. */ -ComputationGraph get_split_test_computation_graph(int batch_size); +ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size); } // namespace FlexFlow diff --git a/lib/models/include/models/transformer/transformer_config.struct.toml b/lib/models/include/models/transformer/transformer_config.struct.toml index 23b0478dde..2a0b39feb9 100644 --- a/lib/models/include/models/transformer/transformer_config.struct.toml +++ b/lib/models/include/models/transformer/transformer_config.struct.toml @@ -1,6 +1,5 @@ namespace = "FlexFlow" name = "TransformerConfig" - features = [ "eq", "ord", @@ -10,33 +9,37 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_features" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dim_feedforward" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_encoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_decoder_layers" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dropout" @@ -48,4 +51,4 @@ type = "float" [[fields]] name = "vocab_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/models/src/models/bert/bert.cc b/lib/models/src/models/bert/bert.cc index cf48f2399b..a5d63e8fdc 100644 --- a/lib/models/src/models/bert/bert.cc +++ b/lib/models/src/models/bert/bert.cc @@ -6,20 +6,22 @@ namespace FlexFlow { BertConfig get_default_bert_config() { - return BertConfig{/*vocab_size=*/30522, - /*hidden_size=*/768, - /*num_encoder_layers=*/12, - /*num_heads=*/12, - /*dim_feedforward=*/3072, - /*hidden_act=*/Activation::GELU, - /*hidden_dropout_prob=*/0.1, - /*attention_probs_dropout_prob=*/0.1, - /*initializer_range=*/0.02, - /*layer_norm_eps=*/1e-12, - /*position_embedding_type=*/"absolute", - /*classifier_dropout=*/0.1, - /*sequence_length=*/512, - 
/*batch_size=*/64}; + return BertConfig{ + /*vocab_size=*/30522_n, + /*hidden_size=*/768_n, + /*num_encoder_layers=*/12_n, + /*num_heads=*/12_n, + /*dim_feedforward=*/3072_n, + /*hidden_act=*/Activation::GELU, + /*hidden_dropout_prob=*/0.1, + /*attention_probs_dropout_prob=*/0.1, + /*initializer_range=*/0.02, + /*layer_norm_eps=*/1e-12, + /*position_embedding_type=*/"absolute", + /*classifier_dropout=*/0.1, + /*sequence_length=*/512_n, + /*batch_size=*/64_n, + }; } tensor_guid_t @@ -56,9 +58,10 @@ tensor_guid_t InitializerAttrs const &bias_initializer, InitializerAttrs const &projection_initializer) { assert(num_dims(cgb.get_shape(input)) == 3); - std::vector<int> layer_norm_axis = {2}; // Apply layernorm across the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; + std::vector<relative_ff_dim_t> layer_norm_axis = { + relative_ff_dim_t{-1}}; // Apply layernorm across the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; tensor_guid_t self_attention = cgb.multihead_attention(input, input, @@ -127,7 +130,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { InitializerAttrs bias_initializer = InitializerAttrs{ZeroInitializerAttrs{}}; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered<size_t>{ + TensorDims{FFOrdered<nonnegative_int>{ config.batch_size, config.sequence_length, config.hidden_size}}, DataType::FLOAT, }; @@ -149,7 +152,7 @@ ComputationGraph get_bert_computation_graph(BertConfig const &config) { assert( (cgb.get_shape(out_prob) == TensorShape{ - TensorDims{FFOrdered<size_t>{ + TensorDims{FFOrdered<nonnegative_int>{ config.batch_size, config.sequence_length, config.vocab_size}}, DataType::FLOAT, })); diff --git a/lib/models/src/models/candle_uno/candle_uno.cc b/lib/models/src/models/candle_uno/candle_uno.cc index 4d52d515fb..60422359a5 100644 --- a/lib/models/src/models/candle_uno/candle_uno.cc +++ b/lib/models/src/models/candle_uno/candle_uno.cc @@ -1,32 +1,34 @@ #include "models/candle_uno/candle_uno.h" #include "pcg/initializers/glorot_normal_attrs.dtg.h" +#include "utils/containers/repeat_element.h" namespace FlexFlow { CandleUnoConfig get_default_candle_uno_config() { - CandleUnoConfig config{ - /*batch_size=*/64, - /*dense_layers=*/std::vector<int>(4, 4192), - /*dense_feature_layers=*/std::vector<int>(8, 4192), - /*feature_shapes=*/std::map<std::string, size_t>{}, - /*input_features=*/std::map<std::string, std::string>{}, + return CandleUnoConfig{ + /*batch_size=*/64_n, + /*dense_layers=*/repeat_element(/*num_times=*/4_n, /*element=*/4192_n), + /*dense_feature_layers=*/ + repeat_element(/*num_times=*/8_n, /*element=*/4192_n), + /*feature_shapes=*/ + { + {"dose", 1_n}, + {"cell.rnaseq", 942_n}, + {"drug.descriptors", 5270_n}, + {"drug.fingerprints", 2048_n}, + }, + /*input_features=*/ + { + {"dose1", "dose"}, + {"dose2", "dose"}, + {"cell.rnaseq", "cell.rnaseq"}, + {"drug1.descriptors", "drug.descriptors"}, + {"drug1.fingerprints", "drug.fingerprints"}, + {"drug2.descriptors", "drug.descriptors"}, + {"drug2.fingerprints", "drug.fingerprints"}, + }, /*dropout=*/0.1, /*residual=*/false}; - - config.feature_shapes["dose"] = 1; - config.feature_shapes["cell.rnaseq"] = 942; - config.feature_shapes["drug.descriptors"] = 5270; - config.feature_shapes["drug.fingerprints"] = 2048; - - config.input_features["dose1"] = "dose"; - config.input_features["dose2"] = "dose"; - config.input_features["cell.rnaseq"] = "cell.rnaseq"; - config.input_features["drug1.descriptors"] = "drug.descriptors"; - 
config.input_features["drug1.fingerprints"] = "drug.fingerprints"; - config.input_features["drug2.descriptors"] = "drug.descriptors"; - config.input_features["drug2.fingerprints"] = "drug.fingerprints"; - - return config; } tensor_guid_t create_candle_uno_feature_model( @@ -35,7 +37,7 @@ tensor_guid_t create_candle_uno_feature_model( tensor_guid_t const &input, InitializerAttrs const &kernel_initializer) { tensor_guid_t t = input; - for (int const dense_dim : config.dense_feature_layers) { + for (nonnegative_int dense_dim : config.dense_feature_layers) { t = cgb.dense(t, dense_dim, Activation::RELU, @@ -56,7 +58,7 @@ ComputationGraph InitializerAttrs{GlorotNormalAttrs{/*seed=*/0}}; auto create_input_tensor = - [&](FFOrdered const &dims) -> tensor_guid_t { + [&](FFOrdered const &dims) -> tensor_guid_t { TensorShape input_shape = TensorShape{ TensorDims{dims}, DataType::FLOAT, @@ -82,7 +84,7 @@ ComputationGraph for (auto const &input_feature : config.input_features) { std::string const &feature_name = input_feature.second; - size_t shape = config.feature_shapes.at(feature_name); + nonnegative_int shape = config.feature_shapes.at(feature_name); tensor_guid_t input = create_input_tensor({config.batch_size, shape}); all_inputs.push_back(input); @@ -94,8 +96,9 @@ ComputationGraph } } - tensor_guid_t output = cgb.concat(encoded_inputs, /*axis=*/1); - for (int const &dense_layer_dim : config.dense_layers) { + tensor_guid_t output = + cgb.concat(encoded_inputs, /*axis=*/relative_ff_dim_t{1}); + for (nonnegative_int dense_layer_dim : config.dense_layers) { tensor_guid_t residual_input = output; output = cgb.dense(output, dense_layer_dim, @@ -111,7 +114,7 @@ ComputationGraph } } output = cgb.dense(output, - /*outDim=*/1, + /*outDim=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, diff --git a/lib/models/src/models/inception_v3/inception_v3.cc b/lib/models/src/models/inception_v3/inception_v3.cc index f540eae629..3a829f3754 100644 --- a/lib/models/src/models/inception_v3/inception_v3.cc +++ b/lib/models/src/models/inception_v3/inception_v3.cc @@ -15,14 +15,17 @@ struct CheckShape { ComputationGraphBuilder const &cgb; InceptionV3Config const &config; - void operator()(tensor_guid_t t, int c, int h, int w) const { + void operator()(tensor_guid_t t, + nonnegative_int c, + nonnegative_int h, + nonnegative_int w) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - size_t_from_int(c), - size_t_from_int(h), - size_t_from_int(w), + TensorDims{FFOrdered{ + config.batch_size, + c, + h, + w, }}, DataType::FLOAT, }; @@ -35,12 +38,12 @@ struct CheckShape { } } - void operator()(tensor_guid_t t, int c) const { + void operator()(tensor_guid_t t, nonnegative_int c) const { TensorShape current_shape = cgb.get_shape(t); TensorShape expected_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - size_t_from_int(c), + TensorDims{FFOrdered{ + config.batch_size, + c, }}, DataType::FLOAT, }; @@ -56,11 +59,11 @@ struct CheckShape { InceptionV3Config get_default_inception_v3_training_config() { return InceptionV3Config{ - /*num_classes=*/1000, + /*num_classes=*/1000_n, // see section 8 of https://arxiv.org/abs/1512.00567 for the source of the // batch size - /*batch_size=*/32, + /*batch_size=*/32_n, // see section 4 of https://arxiv.org/abs/1512.00567 for a discussion of // auxiliary logits. 
they are used by default in training @@ -70,13 +73,13 @@ InceptionV3Config get_default_inception_v3_training_config() { static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - int filters, - int kernel_size_h, - int kernel_size_w, - int stride_h = 1, - int stride_w = 1, - int padding_h = 0, - int padding_w = 0, + nonnegative_int filters, + nonnegative_int kernel_size_h, + nonnegative_int kernel_size_w, + nonnegative_int stride_h = 1_n, + nonnegative_int stride_w = 1_n, + nonnegative_int padding_h = 0_n, + nonnegative_int padding_w = 0_n, bool use_bias = false) { tensor_guid_t conv = cgb.conv2d(input, /*outChannels=*/filters, @@ -87,7 +90,7 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, /*paddingH=*/padding_h, /*paddingW=*/padding_w, /*activation=*/std::nullopt, - /*groups=*/1, + /*groups=*/1_n, /*use_bias=*/use_bias); return cgb.batch_norm(conv, /*affine=*/true, @@ -98,29 +101,29 @@ static tensor_guid_t create_conv_block(ComputationGraphBuilder &cgb, static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t const &input, - int pool_features) { + nonnegative_int pool_features) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t branch5x5 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/48, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/48_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/5, - /*kernel_size_w=*/5, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/2, - /*padding_w=*/2); + /*filters=*/64_n, + /*kernel_size_h=*/5_n, + /*kernel_size_w=*/5_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/2_n, + /*padding_w=*/2_n); return t; }(); @@ -128,208 +131,209 @@ static tensor_guid_t create_inception_module_a(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); return t; }(); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, /*filters=*/pool_features, - /*kernel_stride_h=*/1, - /*kernel_stride_w=*/1); + /*kernel_stride_h=*/1_n, + /*kernel_stride_w=*/1_n); return t; }(); return cgb.concat({branch1x1, branch5x5, branch3x3dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t 
create_inception_module_b(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch3x3 = create_conv_block(cgb, input, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); tensor_guid_t branch3x3dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/64_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/96_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/96, - /*kernel_stride_h=*/3, - /*kernel_stride_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/96_n, + /*kernel_stride_h=*/3_n, + /*kernel_stride_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - return cgb.concat({branch3x3, branch3x3dbl, branch_pool}, /*axis=*/1); + return cgb.concat({branch3x3, branch3x3dbl, branch_pool}, + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_c(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - int channels_7x7) { + nonnegative_int channels_7x7) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(branch1x1, 192, 17, 17); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(branch1x1, 192_n, 17_n, 17_n); tensor_guid_t branch7x7 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*filters=*/192_n, + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); return t; }(); - check_shape(branch7x7, 192, 17, 17); + check_shape(branch7x7, 192_n, 17_n, 17_n); tensor_guid_t branch7x7dbl = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + 
/*padding_w=*/0_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, /*filters=*/channels_7x7, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); return t; }(); - check_shape(branch7x7dbl, 192, 17, 17); + check_shape(branch7x7dbl, 192_n, 17_n, 17_n); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); - check_shape(branch_pool, 192, 17, 17); + check_shape(branch_pool, 192_n, 17_n, 17_n); return cgb.concat({branch1x1, branch7x7, branch7x7dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, @@ -338,10 +342,10 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - t = create_conv_block(cgb, t, 320, 3, 3, 2, 2); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + t = create_conv_block(cgb, t, 320_n, 3_n, 3_n, 2_n, 2_n); return t; }(); @@ -349,83 +353,84 @@ static tensor_guid_t create_inception_module_d(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/7, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/3); + /*filters=*/192_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/7_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/3_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/7, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/3, - /*padding_w=*/0); + /*filters=*/192_n, + /*kernel_size_h=*/7_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/3_n, + /*padding_w=*/0_n); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); + /*filters=*/192_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); return t; }(); tensor_guid_t branch_pool = cgb.pool2d(input, - /*kernelH=*/3, - /*kernelW=*/3, - 
/*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - return cgb.concat({branch3x3, branch7x7x3, branch_pool}, /*axis=*/1); + return cgb.concat({branch3x3, branch7x7x3, branch_pool}, + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t branch1x1 = create_conv_block(cgb, input, - /*filters=*/320, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/320_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t branch3x3 = [&] { tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/0); - t = cgb.concat({t_1, t_2}, /*axis=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/0_n); + t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); return t; }(); @@ -433,60 +438,60 @@ static tensor_guid_t create_inception_module_e(ComputationGraphBuilder &cgb, tensor_guid_t t = input; t = create_conv_block(cgb, t, - /*filters=*/448, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/448_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); t = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); tensor_guid_t t_1 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/1, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/1_n); tensor_guid_t t_2 = create_conv_block(cgb, t, - /*filters=*/384, - /*kernel_size_h=*/3, - /*kernel_size_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/0); - t = cgb.concat({t_1, t_2}, /*axis=*/1); + /*filters=*/384_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/0_n); + t = cgb.concat({t_1, t_2}, /*axis=*/relative_ff_dim_t{1}); return t; }(); tensor_guid_t branch_pool = [&] { tensor_guid_t t = input; t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/1, - /*paddingW=*/1, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/1_n, + /*paddingW=*/1_n, /*type=*/PoolOp::AVG); t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); + /*filters=*/192_n, + 
/*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); return t; }(); return cgb.concat({branch1x1, branch3x3, branch3x3dbl, branch_pool}, - /*axis=*/1); + /*axis=*/relative_ff_dim_t{1}); } static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, @@ -494,75 +499,75 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, tensor_guid_t const &input) { tensor_guid_t t = input; - check_shape(t, 3, 299, 299); + check_shape(t, 3_n, 299_n, 299_n); // Conv2d_1a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/2, - /*stride_w=*/2); - check_shape(t, 32, 149, 149); + /*filters=*/32_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n); + check_shape(t, 32_n, 149_n, 149_n); // Conv2d_2a_3x3 t = create_conv_block(cgb, t, - /*filters=*/32, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3); - check_shape(t, 32, 147, 147); + /*filters=*/32_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n); + check_shape(t, 32_n, 147_n, 147_n); // Conv2d_2b_3x3 t = create_conv_block(cgb, t, - /*filters=*/64, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/1, - /*padding_w=*/1); - check_shape(t, 64, 147, 147); + /*filters=*/64_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n); + check_shape(t, 64_n, 147_n, 147_n); // maxpool1 t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 64, 73, 73); + check_shape(t, 64_n, 73_n, 73_n); // Conv2d_3b_1x1 t = create_conv_block(cgb, t, - /*filters=*/80, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(t, 80, 73, 73); + /*filters=*/80_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(t, 80_n, 73_n, 73_n); // Conv2d_4a_3x3 t = create_conv_block(cgb, t, - /*filters=*/192, - /*kernel_size_h=*/3, - /*kernel_size_w=*/3); - check_shape(t, 192, 71, 71); + /*filters=*/192_n, + /*kernel_size_h=*/3_n, + /*kernel_size_w=*/3_n); + check_shape(t, 192_n, 71_n, 71_n); // maxpool2 t = cgb.pool2d(t, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/2, - /*strideW=*/2, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/2_n, + /*strideW=*/2_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::MAX); - check_shape(t, 192, 35, 35); + check_shape(t, 192_n, 35_n, 35_n); return t; } @@ -570,26 +575,26 @@ static tensor_guid_t create_initial_layers(ComputationGraphBuilder &cgb, static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - size_t num_classes) { + nonnegative_int num_classes) { // avgpool tensor_guid_t x = cgb.pool2d(input, - /*kernelH=*/8, - /*kernelW=*/8, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/8_n, + /*kernelW=*/8_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 2048, 1, 1); + check_shape(x, 2048_n, 1_n, 1_n); // dropout x = cgb.dropout(x, /*rate=*/0.5); - check_shape(x, 2048, 1, 1); + check_shape(x, 2048_n, 1_n, 1_n); x = cgb.flat(x, - /*start_dim=*/1); - check_shape(x, 2048); + /*start_dim=*/relative_ff_dim_t{1}); + check_shape(x, 2048_n); // fc x = 
cgb.dense(x, @@ -597,7 +602,7 @@ static tensor_guid_t create_final_layers(ComputationGraphBuilder &cgb, check_shape(x, num_classes); // softmax (not in pytorch model, but shown in Table 1 on p6 of - // https://arxiv.org/abs/1512.00567) + // https://arxiv.org/abs/1512.00567) x = cgb.softmax(x); check_shape(x, num_classes); @@ -607,44 +612,44 @@ static tensor_guid_t create_inception_aux(ComputationGraphBuilder &cgb, CheckShape const &check_shape, tensor_guid_t const &input, - size_t num_classes) { + nonnegative_int num_classes) { tensor_guid_t x = input; - check_shape(x, 768, 17, 17); + check_shape(x, 768_n, 17_n, 17_n); x = cgb.pool2d(x, - /*kernelH=*/5, - /*kernelW=*/5, - /*strideH=*/3, - /*strideW=*/3, - /*paddingH=*/0, - /*paddingW=*/0, + /*kernelH=*/5_n, + /*kernelW=*/5_n, + /*strideH=*/3_n, + /*strideW=*/3_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n, /*type=*/PoolOp::AVG); - check_shape(x, 768, 5, 5); + check_shape(x, 768_n, 5_n, 5_n); // conv0 x = create_conv_block(cgb, x, - /*filters=*/128, - /*kernel_size_h=*/1, - /*kernel_size_w=*/1); - check_shape(x, 128, 5, 5); + /*filters=*/128_n, + /*kernel_size_h=*/1_n, + /*kernel_size_w=*/1_n); + check_shape(x, 128_n, 5_n, 5_n); // conv1 x = create_conv_block(cgb, x, - /*filters=*/768, - /*kernel_size_h=*/5, - /*kernel_size_w=*/5); - check_shape(x, 768, 1, 1); + /*filters=*/768_n, + /*kernel_size_h=*/5_n, + /*kernel_size_w=*/5_n); + check_shape(x, 768_n, 1_n, 1_n); x = cgb.adaptive_pool2d(x, - /*output_h=*/1, - /*output_w=*/1); - check_shape(x, 768, 1, 1); + /*output_h=*/1_n, + /*output_w=*/1_n); + check_shape(x, 768_n, 1_n, 1_n); x = cgb.flat(x, - /*start_dim=*/1); - check_shape(x, 768); + /*start_dim=*/relative_ff_dim_t{1}); + check_shape(x, 768_n); // fc x = cgb.dense(x, @@ -666,39 +671,39 @@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, }; tensor_guid_t x = create_initial_layers(cgb, check_shape, input); - check_shape(x, 192, 35, 35); + check_shape(x, 192_n, 35_n, 35_n); // Mixed_5b - x = create_inception_module_a(cgb, x, 32); - check_shape(x, 256, 35, 35); + x = create_inception_module_a(cgb, x, 32_n); + check_shape(x, 256_n, 35_n, 35_n); // Mixed_5c - x = create_inception_module_a(cgb, x, 64); - check_shape(x, 288, 35, 35); + x = create_inception_module_a(cgb, x, 64_n); + check_shape(x, 288_n, 35_n, 35_n); // Mixed_5d - x = create_inception_module_a(cgb, x, 64); - check_shape(x, 288, 35, 35); + x = create_inception_module_a(cgb, x, 64_n); + check_shape(x, 288_n, 35_n, 35_n); // Mixed_6a x = create_inception_module_b(cgb, x); - check_shape(x, 768, 17, 17); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6b - x = create_inception_module_c(cgb, check_shape, x, 128); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 128_n); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6c - x = create_inception_module_c(cgb, check_shape, x, 160); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 160_n); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6d - x = create_inception_module_c(cgb, check_shape, x, 160); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 160_n); + check_shape(x, 768_n, 17_n, 17_n); // Mixed_6e - x = create_inception_module_c(cgb, check_shape, x, 192); - check_shape(x, 768, 17, 17); + x = create_inception_module_c(cgb, check_shape, x, 192_n); + check_shape(x, 768_n, 17_n, 17_n); std::optional<tensor_guid_t> aux; if (config.aux_logits) { @@ -708,15 +713,15 
@@ static InceptionV3Output create_inception_v3(ComputationGraphBuilder &cgb, // Mixed_7a x = create_inception_module_d(cgb, x); - check_shape(x, 1280, 8, 8); + check_shape(x, 1280_n, 8_n, 8_n); // Mixed_7b x = create_inception_module_e(cgb, x); - check_shape(x, 2048, 8, 8); + check_shape(x, 2048_n, 8_n, 8_n); // Mixed_7c x = create_inception_module_e(cgb, x); - check_shape(x, 2048, 8, 8); + check_shape(x, 2048_n, 8_n, 8_n); x = create_final_layers(cgb, check_shape, x, config.num_classes); check_shape(x, config.num_classes); @@ -732,11 +737,11 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(config.batch_size), - 3, - 299, - 299, + TensorDims{FFOrdered{ + config.batch_size, + 3_n, + 299_n, + 299_n, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/split_test/split_test.cc b/lib/models/src/models/split_test/split_test.cc index 118f94ec06..d3876d8bfc 100644 --- a/lib/models/src/models/split_test/split_test.cc +++ b/lib/models/src/models/split_test/split_test.cc @@ -4,18 +4,18 @@ namespace FlexFlow { -ComputationGraph get_split_test_computation_graph(int batch_size) { +ComputationGraph get_split_test_computation_graph(nonnegative_int batch_size) { ComputationGraphBuilder cgb; - int layer_dim1 = 256; - int layer_dim2 = 128; - int layer_dim3 = 64; - int layer_dim4 = 32; + nonnegative_int layer_dim1 = 256_n; + nonnegative_int layer_dim2 = 128_n; + nonnegative_int layer_dim3 = 64_n; + nonnegative_int layer_dim4 = 32_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(batch_size), - size_t_from_int(layer_dim1), + TensorDims{FFOrdered{ + batch_size, + layer_dim1, }}, DataType::FLOAT, }; diff --git a/lib/models/src/models/transformer/transformer.cc b/lib/models/src/models/transformer/transformer.cc index 173a1b291c..f71763313a 100644 --- a/lib/models/src/models/transformer/transformer.cc +++ b/lib/models/src/models/transformer/transformer.cc @@ -4,16 +4,16 @@ namespace FlexFlow { TransformerConfig get_default_transformer_config() { - return TransformerConfig{/*num_features=*/512, - /*sequence_length=*/512, - /*batch_size=*/64, - /*dim_feedforward=*/2048, - /*num_heads=*/8, - /*num_encoder_layers=*/6, - /*num_decoder_layers=*/6, + return TransformerConfig{/*num_features=*/512_n, + /*sequence_length=*/512_n, + /*batch_size=*/64_n, + /*dim_feedforward=*/2048_n, + /*num_heads=*/8_n, + /*num_encoder_layers=*/6_n, + /*num_decoder_layers=*/6_n, /*dropout=*/0.1, /*layer_norm_eps=*/1e-05, - /*vocab_size=*/64}; + /*vocab_size=*/64_n}; } tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, @@ -32,18 +32,20 @@ tensor_guid_t create_feedforward_network(ComputationGraphBuilder &cgb, tensor_guid_t create_transformer_encoder_layer(ComputationGraphBuilder &cgb, TransformerConfig const &config, tensor_guid_t const &input) { - std::vector layer_norm_axis{2}; // Normalize the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; - tensor_guid_t self_attention = cgb.multihead_attention(input, - input, - input, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + std::vector layer_norm_axis = { + relative_ff_dim_t{-1}}; // Normalize the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; + tensor_guid_t self_attention = + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + 
/*value=*/input, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); @@ -79,18 +81,20 @@ tensor_guid_t TransformerConfig const &config, tensor_guid_t const &input, tensor_guid_t const &encoder_output) { - std::vector layer_norm_axis{2}; // Normalize the last dim - int kdim = config.dim_feedforward / config.num_heads; - int vdim = config.dim_feedforward / config.num_heads; - tensor_guid_t self_attention = cgb.multihead_attention(input, - input, - input, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + std::vector layer_norm_axis = { + relative_ff_dim_t{-1}}; // Normalize the last dim + nonnegative_int kdim = config.dim_feedforward / config.num_heads; + nonnegative_int vdim = config.dim_feedforward / config.num_heads; + tensor_guid_t self_attention = + cgb.multihead_attention(/*query=*/input, + /*key=*/input, + /*value=*/input, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention)); @@ -102,15 +106,16 @@ tensor_guid_t assert(are_tensor_guid_shapes_equivalent( cgb.computation_graph, input, self_attention_normalized)); - tensor_guid_t mha = cgb.multihead_attention(self_attention_normalized, - encoder_output, - encoder_output, - config.num_features, - config.num_heads, - kdim, - vdim, - config.dropout, - /*bias=*/false); + tensor_guid_t mha = + cgb.multihead_attention(/*query=*/self_attention_normalized, + /*key=*/encoder_output, + /*value=*/encoder_output, + /*embed_dim=*/config.num_features, + /*num_heads=*/config.num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/config.dropout, + /*bias=*/false); assert(are_tensor_guid_shapes_equivalent(cgb.computation_graph, input, mha)); tensor_guid_t mha_normalized = @@ -148,7 +153,7 @@ ComputationGraph ComputationGraphBuilder cgb; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ config.batch_size, config.sequence_length, config.num_features}}, DataType::FLOAT, }; diff --git a/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml b/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml index 014526a601..f1c5fe6b23 100644 --- a/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml +++ b/lib/op-attrs/include/op-attrs/computation_graph_op_attrs.variant.toml @@ -11,7 +11,7 @@ features = [ includes = [ "op-attrs/ops/attention_attrs.dtg.h", - "op-attrs/ops/batch_matmul.dtg.h", + "op-attrs/ops/batch_matmul_attrs.dtg.h", "op-attrs/ops/batch_norm_attrs.dtg.h", "op-attrs/ops/broadcast_attrs.dtg.h", "op-attrs/ops/cast_attrs.dtg.h", diff --git a/lib/op-attrs/include/op-attrs/datatype.h b/lib/op-attrs/include/op-attrs/datatype.h index 5af00fb510..3a817af38c 100644 --- a/lib/op-attrs/include/op-attrs/datatype.h +++ b/lib/op-attrs/include/op-attrs/datatype.h @@ -4,6 +4,7 @@ #include "op-attrs/datatype.dtg.h" #include "utils/fmt.h" #include "utils/fp16.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include namespace FlexFlow { @@ -49,7 +50,7 @@ typename data_type_enum_to_class
<DT>::type cast_to(T t) { template <DataType DT> using real_type_t = typename data_type_enum_to_class
<DT>::type; -size_t size_of_datatype(DataType); +nonnegative_int size_of_datatype(DataType); bool can_strictly_promote_datatype_from_to(DataType, DataType); diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h index 3977f4e0fd..f2355289dc 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/dim_ordered.h @@ -32,19 +32,13 @@ struct DimOrdered { : contents(contents.begin(), contents.end()) {} T const &at(Idx idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); + nonnegative_int raw = idx.value; + return this->contents.at(raw.unwrap_nonnegative()); } T &at(Idx idx) { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return this->contents.at(raw); + nonnegative_int raw = idx.value; + return this->contents.at(raw.unwrap_nonnegative()); } T const &operator[](Idx idx) const { @@ -56,11 +50,8 @@ struct DimOrdered { } bool idx_is_valid(Idx const &idx) const { - int raw = idx.value; - if (raw < 0) { - raw = this->contents.size() + raw; - } - return (raw >= 0 && raw < this->contents.size()); + nonnegative_int raw = idx.value; + return (raw < this->contents.size()); } bool operator==(DimOrdered const &other) const { @@ -172,7 +163,7 @@ struct DimOrdered { : contents(contents.begin(), contents.end()) {} T const &at(ff_dim_t idx) const { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return this->contents.at(raw); } @@ -185,7 +176,7 @@ struct DimOrdered { } T &at(ff_dim_t idx) { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return this->contents.at(raw); } @@ -214,7 +205,7 @@ struct DimOrdered { } bool idx_is_valid(ff_dim_t const &idx) const { - int raw = idx.value.get_value(); + int raw = idx.value.unwrap_nonnegative(); return raw < this->contents.size(); } diff --git a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h index c9e6db4d17..166916dd44 100644 --- a/lib/op-attrs/include/op-attrs/dim_ordered/slice.h +++ b/lib/op-attrs/include/op-attrs/dim_ordered/slice.h @@ -27,8 +27,8 @@ FFOrdered<T> ff_dim_t_nonoverloaded_slice(FFOrdered<T> const &d, std::optional<ff_dim_t> const &end) { auto to_raw_idx = [](std::optional<ff_dim_t> const &idx) -> std::optional<int> { - return transform(idx, - [](ff_dim_t const &i) { return i.value.get_value(); }); + return transform( + idx, [](ff_dim_t const &i) { return i.value.unwrap_nonnegative(); }); }; return FFOrdered<T>{subvec(vector_of(d), to_raw_idx(start), to_raw_idx(end))}; diff --git a/lib/op-attrs/include/op-attrs/get_op_type.h b/lib/op-attrs/include/op-attrs/get_op_type.h index b60880a98b..7799900709 100644 --- a/lib/op-attrs/include/op-attrs/get_op_type.h +++ b/lib/op-attrs/include/op-attrs/get_op_type.h @@ -2,7 +2,7 @@ #define _FLEXFLOW_OP_ATTRS_GET_OP_TYPE_H #include "op-attrs/ops/attention_attrs.dtg.h" -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "op-attrs/ops/batch_norm_attrs.dtg.h" #include "op-attrs/ops/broadcast_attrs.dtg.h" #include "op-attrs/ops/cast_attrs.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/ops/attention.h b/lib/op-attrs/include/op-attrs/ops/attention.h index e06d795c04..5f1b11c1bb 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention.h +++ b/lib/op-attrs/include/op-attrs/ops/attention.h @@ -12,31 +12,31 @@ namespace FlexFlow { -int 
get_qProjSize(MultiHeadAttentionAttrs const &); -int get_vProjSize(MultiHeadAttentionAttrs const &); -int get_kProjSize(MultiHeadAttentionAttrs const &); -int get_oProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_vProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &); +nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &); -int get_qSize(MultiHeadAttentionParallelInputs const &); -int get_qSize(MultiHeadAttentionInputs const &); +nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_qSize(MultiHeadAttentionInputs const &); -int get_kSize(MultiHeadAttentionParallelInputs const &); -int get_kSize(MultiHeadAttentionInputs const &); +nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_kSize(MultiHeadAttentionInputs const &); -int get_vSize(MultiHeadAttentionParallelInputs const &); -int get_vSize(MultiHeadAttentionInputs const &); +nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_vSize(MultiHeadAttentionInputs const &); -int get_oSize(ParallelTensorShape const &); -int get_oSize(TensorShape const &); +nonnegative_int get_oSize(ParallelTensorShape const &); +nonnegative_int get_oSize(TensorShape const &); -int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); -int get_qoSeqLength(MultiHeadAttentionInputs const &); +nonnegative_int get_qoSeqLength(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &); -int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); -int get_kvSeqLength(MultiHeadAttentionInputs const &); +nonnegative_int get_kvSeqLength(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &); -int get_num_samples(MultiHeadAttentionParallelInputs const &); -int get_num_samples(MultiHeadAttentionInputs const &); +nonnegative_int get_num_samples(MultiHeadAttentionParallelInputs const &); +nonnegative_int get_num_samples(MultiHeadAttentionInputs const &); std::vector get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs const &); diff --git a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml index b82b285451..f85b7268af 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention/multihead_attention_inputs.struct.toml @@ -10,29 +10,29 @@ features = [ ] includes = [ - "", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "batch_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "sequence_length" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "query_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "key_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "value_size" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml index d96d8af69c..019131b07c 100644 --- a/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/attention_attrs.struct.toml @@ -10,21 
+10,25 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "embed_dim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_heads" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "kdim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "vdim" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dropout" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h index 574b4ef579..333da4fa29 100644 --- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.h +++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul.h @@ -1,7 +1,7 @@ #ifndef _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_BATCH_MATMUL_H #define _FLEXFLOW_LIB_OP_ATTRS_INCLUDE_OP_ATTRS_OPS_BATCH_MATMUL_H -#include "op-attrs/ops/batch_matmul.dtg.h" +#include "op-attrs/ops/batch_matmul_attrs.dtg.h" #include "op-attrs/ops/core.h" #include "op-attrs/parallel_tensor_shape.dtg.h" #include "op-attrs/tensor_shape.dtg.h" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml b/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml deleted file mode 100644 index 3b1dd3f687..0000000000 --- a/lib/op-attrs/include/op-attrs/ops/batch_matmul.struct.toml +++ /dev/null @@ -1,19 +0,0 @@ -namespace = "FlexFlow" -name = "BatchMatmulAttrs" - -features = [ - "eq", - "ord", - "hash", - "json", - "rapidcheck", - "fmt", -] - -[[fields]] -name = "a_seq_length_dim" -type = "int" - -[[fields]] -name = "b_seq_length_dim" -type = "int" diff --git a/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml new file mode 100644 index 0000000000..394dfb5fcc --- /dev/null +++ b/lib/op-attrs/include/op-attrs/ops/batch_matmul_attrs.struct.toml @@ -0,0 +1,30 @@ +namespace = "FlexFlow" +name = "BatchMatmulAttrs" + +features = [ + "eq", + "ord", + "hash", + "json", + "rapidcheck", + "fmt", +] + +includes = [ + "utils/nonnegative_int/nonnegative_int.h", + "", +] + +src_includes = [ + "utils/fmt/optional.h", + "utils/json/optional.h", + "utils/rapidcheck/optional.h", +] + +[[fields]] +name = "a_seq_length_dim" +type = "std::optional<::FlexFlow::nonnegative_int>" + +[[fields]] +name = "b_seq_length_dim" +type = "std::optional<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml index e7eeedec06..b3c574264c 100644 --- a/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/combine_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/ff_dim_t.h", "op-attrs/ff_dim_t.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -20,4 +21,4 @@ type = "::FlexFlow::ff_dim_t" [[fields]] name = "combine_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml index 77e8c51244..c4fb74ebd8 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_input_shape.struct.toml @@ -12,23 +12,24 @@ features = [ includes = [ "", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "num_samples" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] 
name = "num_channels" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "height" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "width" -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml index 68cbd878d1..fdf0eaca78 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d/conv_2d_parallel_input_shape.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "op-attrs/shard_parallel_dim.dtg.h", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -32,11 +33,11 @@ type = "::FlexFlow::ShardParallelDim" [[fields]] name = "sum_reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "discard_copy_reduction_degree" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "datatype" diff --git a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml index 5bef144cd9..8b86d42e04 100644 --- a/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/conv_2d_attrs.struct.toml @@ -12,6 +12,7 @@ features = [ includes = [ "", "op-attrs/activation.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -21,14 +22,14 @@ src_includes = [ ] fields = [ - { name = "out_channels", type = "int" }, - { name = "kernel_h", type = "int" }, - { name = "kernel_w", type = "int" }, - { name = "stride_h", type = "int" }, - { name = "stride_w", type = "int" }, - { name = "padding_h", type = "int" }, - { name = "padding_w", type = "int" }, - { name = "groups", type = "int" }, + { name = "out_channels", type = "::FlexFlow::nonnegative_int" }, + { name = "kernel_h", type = "::FlexFlow::nonnegative_int" }, + { name = "kernel_w", type = "::FlexFlow::nonnegative_int" }, + { name = "stride_h", type = "::FlexFlow::nonnegative_int" }, + { name = "stride_w", type = "::FlexFlow::nonnegative_int" }, + { name = "padding_h", type = "::FlexFlow::nonnegative_int" }, + { name = "padding_w", type = "::FlexFlow::nonnegative_int" }, + { name = "groups", type = "::FlexFlow::nonnegative_int" }, { name = "activation", type = "std::optional<::FlexFlow::Activation>" }, { name = "use_bias", type = "bool" }, ] diff --git a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml index b8d15284e9..5a857efb3e 100644 --- a/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml +++ b/lib/op-attrs/include/op-attrs/ops/embedding_attrs.struct.toml @@ -10,9 +10,10 @@ features = [ ] includes = [ - "utils/stack_vector/stack_vector.h", "op-attrs/aggregate_op.dtg.h", "op-attrs/datatype.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", + "", ] src_includes = [ @@ -23,11 +24,11 @@ src_includes = [ [[fields]] name = "num_entries" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "out_channels" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "aggr" diff --git a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml index 0a35a6c5ec..ffbe93c975 100644 --- a/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml +++ 
b/lib/op-attrs/include/op-attrs/ops/linear_attrs.struct.toml
@@ -14,6 +14,7 @@ includes = [
   "op-attrs/activation.dtg.h",
   "op-attrs/regularizer_attrs.dtg.h",
   "<optional>",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 src_includes = [
@@ -24,7 +25,7 @@ src_includes = [
 
 [[fields]]
 name = "out_channels"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "use_bias"
diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d.h b/lib/op-attrs/include/op-attrs/ops/pool_2d.h
index 1af22ad022..af11d61f07 100644
--- a/lib/op-attrs/include/op-attrs/ops/pool_2d.h
+++ b/lib/op-attrs/include/op-attrs/ops/pool_2d.h
@@ -13,8 +13,8 @@ CHECK_VALID_OP_ATTR(Pool2DAttrs);
 
 tl::expected<Pool2DAttrs, std::string>
     make_adaptive_pool2d_attrs(TensorDims const &input_dims,
-                               int output_h,
-                               int output_w,
+                               nonnegative_int output_h,
+                               nonnegative_int output_w,
                                PoolOp pool_type,
                                std::optional<Activation> const &activation);
 
diff --git a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml
index 20ca7deabc..fea318d46d 100644
--- a/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/pool_2d_attrs.struct.toml
@@ -13,6 +13,7 @@ includes = [
   "op-attrs/pool_op.dtg.h",
   "op-attrs/activation.dtg.h",
   "<optional>",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 src_includes = [
@@ -23,27 +24,27 @@ src_includes = [
 
 [[fields]]
 name = "kernel_h"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "kernel_w"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "stride_h"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "stride_w"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "padding_h"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "padding_w"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "pool_type"
diff --git a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml
index ee0ae54132..2798a85caf 100644
--- a/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/reduction_attrs.struct.toml
@@ -9,6 +9,10 @@ features = [
   "fmt",
 ]
 
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
+
 [[fields]]
 name = "reduction_degree"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
diff --git a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml
index 69c4b7580f..965c40c05a 100644
--- a/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/repartition_attrs.struct.toml
@@ -12,6 +12,7 @@ features = [
 
 includes = [
   "op-attrs/ff_dim_t.h",
   "op-attrs/ff_dim_t.dtg.h",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 [[fields]]
@@ -20,4 +21,4 @@ type = "::FlexFlow::ff_dim_t"
 
 [[fields]]
 name = "repartition_degree"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
diff --git a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml
index 4e43ea747a..58e365c0f2 100644
--- a/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/replicate_attrs.struct.toml
@@ -9,8 +9,10 @@ features = [
   "fmt",
 ]
 
-includes = [ ]
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
 
 [[fields]]
 name = "replicate_degree"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
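The recurring conversion above replaces plain `int` attribute fields with `::FlexFlow::nonnegative_int`. As a rough sketch of the invariant that type provides (illustration only, not the FlexFlow implementation, which lives in utils/nonnegative_int/nonnegative_int.h and also supplies hashing, formatting, arithmetic, and serialization), a checked non-negative integer validates once at construction:

    #include <stdexcept>

    // Hypothetical, simplified sketch of a checked non-negative integer
    // wrapper; the real type has a much richer interface.
    class nonnegative_int {
    public:
      explicit nonnegative_int(int value) : value_(value) {
        if (value < 0) {
          throw std::invalid_argument("nonnegative_int requires value >= 0");
        }
      }

      // Mirrors the unwrap_nonnegative() accessor used elsewhere in this patch.
      int unwrap_nonnegative() const {
        return value_;
      }

    private:
      int value_;
    };

    // Literal suffix mirroring the 0_n / 1_n spellings used in the diff.
    nonnegative_int operator""_n(unsigned long long value) {
      return nonnegative_int{static_cast<int>(value)};
    }

    int main() {
      nonnegative_int degree = 4_n;
      // nonnegative_int bad{-1};  // would throw at construction
      return degree.unwrap_nonnegative() == 4 ? 0 : 1;
    }

Centralizing the check at construction is what lets call sites drop ad-hoc guards such as the assert(attrs.out_channels > 0) removed from conv_2d.cc later in this patch.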
diff --git a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml
index fce827f5c2..7ce1ad7e34 100644
--- a/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/split_attrs.struct.toml
@@ -13,11 +13,12 @@ includes = [
   "utils/stack_vector/stack_vector.h",
   "op-attrs/ff_dim_t.h",
   "op-attrs/ff_dim_t.dtg.h",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 [[fields]]
 name = "splits"
-type = "::FlexFlow::stack_vector<int, MAX_NUM_OUTPUTS>"
+type = "::FlexFlow::stack_vector<::FlexFlow::nonnegative_int, MAX_NUM_OUTPUTS>"
 
 [[fields]]
 name = "axis"
diff --git a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml
index 9ecbf1d725..1c5bfc8e10 100644
--- a/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml
+++ b/lib/op-attrs/include/op-attrs/ops/topk_attrs.struct.toml
@@ -9,9 +9,13 @@ features = [
   "fmt",
 ]
 
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
+
 [[fields]]
 name = "k"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "sorted"
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
index 974b27d2a7..be3a95eec8 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dim_degrees.struct.toml
@@ -13,6 +13,7 @@ includes = [
   "op-attrs/parallel_tensor_shape/sum_degree.dtg.h",
   "op-attrs/parallel_tensor_shape/discard_copy_degree.dtg.h",
   "op-attrs/dim_ordered/dim_ordered.h",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 [[fields]]
@@ -25,4 +26,4 @@ type = "::FlexFlow::DiscardCopyDegree"
 
 [[fields]]
 name = "shard_degrees"
-type = "::FlexFlow::FFOrdered<int>"
+type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>"
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
index 6b88a7bda1..67864e637b 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_dims.h
@@ -9,27 +9,27 @@
 
 namespace FlexFlow {
 
 FFOrdered<ShardParallelDim> ff_ordered_shard_dims(ParallelTensorDims const &);
 
-FFOrdered<int> ff_ordered_shard_degrees(ParallelTensorDims const &);
+FFOrdered<nonnegative_int> ff_ordered_shard_degrees(ParallelTensorDims const &);
 
 std::unordered_set<ReplicaParallelDim> replica_dims(ParallelTensorDims const &);
 
 /* size_t get_volume(ParallelTensorDims const &); */
-size_t num_shard_dims(ParallelTensorDims const &);
+nonnegative_int num_shard_dims(ParallelTensorDims const &);
 
 ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &);
 
 ParallelTensorDims lift_to_parallel(TensorDims const &);
-ParallelTensorDims
-    lift_to_parallel_with_degrees(TensorDims const &,
-                                  SumDegree const &,
-                                  DiscardCopyDegree const &,
-                                  FFOrdered<int> const &shard_degrees);
+ParallelTensorDims lift_to_parallel_with_degrees(
+    TensorDims const &,
+    SumDegree const &,
+    DiscardCopyDegree const &,
+    FFOrdered<nonnegative_int> const &shard_degrees);
 ParallelTensorDims
     lift_to_parallel_with_degrees(TensorDims const &,
                                   ParallelTensorDimDegrees const &);
 
-int total_replica_degree(ParallelTensorDims const &);
-int total_shard_degree(ParallelTensorDims const &);
-int total_parallel_degree(ParallelTensorDims const &);
+nonnegative_int total_replica_degree(ParallelTensorDims const &);
+nonnegative_int total_shard_degree(ParallelTensorDims const &);
+nonnegative_int total_parallel_degree(ParallelTensorDims const &);
 
 ShardParallelDim
     shard_dim_at_idx(ParallelTensorDims const &, relative_ff_dim_t);
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
index 0339b9b8a6..d461ffc9e4 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape.h
@@ -12,12 +12,13 @@
 
 namespace FlexFlow {
 
-int num_shard_dims(ParallelTensorShape const &);
+nonnegative_int num_shard_dims(ParallelTensorShape const &);
 
 ShardParallelDim shard_dim_at_idx(ParallelTensorShape const &,
                                   relative_ff_dim_t);
 ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &, relative_ff_dim_t);
 
-FFOrdered<int> ff_ordered_shard_degrees(ParallelTensorShape const &);
+FFOrdered<nonnegative_int>
+    ff_ordered_shard_degrees(ParallelTensorShape const &);
 
 std::optional<ShardParallelDim>
     try_get_shard_dim_at_idx(ParallelTensorShape const &, relative_ff_dim_t);
@@ -25,11 +26,11 @@ std::optional<ShardParallelDim>
 ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorShape const &);
 
 ParallelTensorShape lift_to_parallel(TensorShape const &);
-ParallelTensorShape
-    lift_to_parallel_with_degrees(TensorShape const &,
-                                  SumDegree const &,
-                                  DiscardCopyDegree const &,
-                                  FFOrdered<int> const &shard_degrees);
+ParallelTensorShape lift_to_parallel_with_degrees(
+    TensorShape const &,
+    SumDegree const &,
+    DiscardCopyDegree const &,
+    FFOrdered<nonnegative_int> const &shard_degrees);
 ParallelTensorShape
     lift_to_parallel_with_degrees(TensorShape const &,
                                   ParallelTensorDimDegrees const &);
@@ -37,13 +38,13 @@ ParallelTensorShape
 std::unordered_set<ReplicaParallelDim>
     replica_dims(ParallelTensorShape const &);
 TensorShape get_piece_shape(ParallelTensorShape const &);
-int get_num_replica_dims(ParallelTensorShape const &);
-int get_num_replicas(ParallelTensorShape const &);
+nonnegative_int get_num_replica_dims(ParallelTensorShape const &);
+nonnegative_int get_num_replicas(ParallelTensorShape const &);
 
-int get_sum_degree(ParallelTensorShape const &);
-int get_discard_copy_degree(ParallelTensorShape const &);
+nonnegative_int get_sum_degree(ParallelTensorShape const &);
+nonnegative_int get_discard_copy_degree(ParallelTensorShape const &);
 
-int get_total_parallel_degree(ParallelTensorShape const &);
+nonnegative_int get_total_parallel_degree(ParallelTensorShape const &);
 
 bool is_valid(ParallelTensorShape const &);
 
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml
index b4905fb0ce..76b52bcdef 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/discard_copy_degree.struct.toml
@@ -9,6 +9,10 @@ features = [
   "fmt",
 ]
 
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
+
 [[fields]]
 name = "value"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
diff --git a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml
index d86917211e..550a384ba9 100644
--- a/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml
+++ b/lib/op-attrs/include/op-attrs/parallel_tensor_shape/sum_degree.struct.toml
@@ -9,6 +9,10 @@ features = [
   "fmt",
 ]
 
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
+
 [[fields]]
 name = "value"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
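The SumDegree and DiscardCopyDegree wrappers above hold the two replica degrees of a parallel tensor; together with the per-dimension shard degrees they determine how many device slots the tensor occupies. A minimal sketch of that relationship, with plain ints standing in for nonnegative_int (the real accessors are the total_*_degree functions declared in parallel_tensor_dims.h):

    #include <cassert>
    #include <functional>
    #include <numeric>
    #include <vector>

    // Illustration only: the total parallel degree is the product of the
    // sum degree, the discard-copy degree, and every shard degree, matching
    // total_parallel_degree = total_replica_degree * total_shard_degree
    // in parallel_tensor_dims.cc.
    int total_parallel_degree(int sum_degree,
                              int discard_copy_degree,
                              std::vector<int> const &shard_degrees) {
      int total_shard_degree = std::accumulate(
          shard_degrees.begin(), shard_degrees.end(), 1, std::multiplies<int>());
      return sum_degree * discard_copy_degree * total_shard_degree;
    }

    int main() {
      // A tensor with sum degree 2, discard-copy degree 2, and one dimension
      // sharded 4 ways occupies 2 * 2 * 4 = 16 device slots.
      assert(total_parallel_degree(2, 2, {4, 1}) == 16);
    }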
diff --git a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml
index a44d712dbf..fdd11ac11f 100644
--- a/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml
+++ b/lib/op-attrs/include/op-attrs/pcg_operator_attrs.variant.toml
@@ -11,7 +11,7 @@ features = [
 
 includes = [
   "op-attrs/ops/attention_attrs.dtg.h",
-  "op-attrs/ops/batch_matmul.dtg.h",
+  "op-attrs/ops/batch_matmul_attrs.dtg.h",
   "op-attrs/ops/batch_norm_attrs.dtg.h",
   "op-attrs/ops/broadcast_attrs.dtg.h",
   "op-attrs/ops/cast_attrs.dtg.h",
diff --git a/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h
index af51cc69be..5205b1ead8 100644
--- a/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h
+++ b/lib/op-attrs/include/op-attrs/relative_ff_dim_t.h
@@ -7,7 +7,7 @@ namespace FlexFlow {
 
 ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim,
-                                         int input_dim);
+                                         nonnegative_int input_dim);
 
 } // namespace FlexFlow
 
 namespace rc {
diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml
index 2ad442aa22..5ca486181e 100644
--- a/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml
+++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim.struct.toml
@@ -11,11 +11,12 @@ features = [
 
 includes = [
   "op-attrs/replica_type.dtg.h",
+  "utils/nonnegative_int/nonnegative_int.h",
 ]
 
 [[fields]]
 name = "degree"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "replica_type"
diff --git a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h
index 74a8df339b..92d2b0abb2 100644
--- a/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h
+++ b/lib/op-attrs/include/op-attrs/replica_parallel_dim_set.h
@@ -8,7 +8,8 @@ namespace FlexFlow {
 
 ReplicaParallelDimSet empty_replica_parallel_dim_set();
 
-int get_degree_of_replica_type(ReplicaParallelDimSet const &, ReplicaType);
+nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &,
+                                           ReplicaType);
 
 std::unordered_set<ReplicaParallelDim>
     get_replica_dims(ReplicaParallelDimSet const &);
 
 bool is_valid(ReplicaParallelDimSet const &);
diff --git a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml
index 21c81396d1..5c5d2dc5b2 100644
--- a/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml
+++ b/lib/op-attrs/include/op-attrs/shard_parallel_dim.struct.toml
@@ -9,10 +9,14 @@ features = [
   "fmt",
 ]
 
+includes = [
+  "utils/nonnegative_int/nonnegative_int.h",
+]
+
 [[fields]]
 name = "size"
-type = "size_t"
+type = "::FlexFlow::nonnegative_int"
 
 [[fields]]
 name = "degree"
-type = "int"
+type = "::FlexFlow::nonnegative_int"
diff --git a/lib/op-attrs/include/op-attrs/tensor_dims.h b/lib/op-attrs/include/op-attrs/tensor_dims.h
index 5e1503360b..bf11f36e51 100644
--- a/lib/op-attrs/include/op-attrs/tensor_dims.h
+++ b/lib/op-attrs/include/op-attrs/tensor_dims.h
@@ -6,11 +6,11 @@
 
 namespace FlexFlow {
 
-FFOrdered<size_t> const &ff_ordered(TensorDims const &);
+FFOrdered<nonnegative_int> const &ff_ordered(TensorDims const &);
 
-size_t num_dims(TensorDims const &);
-size_t dim_at_idx(TensorDims const &, relative_ff_dim_t);
-size_t &dim_at_idx(TensorDims &, relative_ff_dim_t);
+nonnegative_int num_dims(TensorDims const &);
+nonnegative_int dim_at_idx(TensorDims const &, relative_ff_dim_t);
+nonnegative_int &dim_at_idx(TensorDims &, relative_ff_dim_t);
 
 bool tensor_dims_is_broadcastable_to(TensorDims const &curr,
                                      TensorDims const &goal);
diff --git
a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml index b262dd32b6..e86b866fd6 100644 --- a/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml +++ b/lib/op-attrs/include/op-attrs/tensor_dims.struct.toml @@ -8,10 +8,12 @@ features = [ "rapidcheck", "fmt", ] + includes = [ "op-attrs/dim_ordered/dim_ordered.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "ff_ordered" -type = "::FlexFlow::FFOrdered" +type = "::FlexFlow::FFOrdered<::FlexFlow::nonnegative_int>" diff --git a/lib/op-attrs/include/op-attrs/tensor_shape.h b/lib/op-attrs/include/op-attrs/tensor_shape.h index b8733cddbe..15958a1daf 100644 --- a/lib/op-attrs/include/op-attrs/tensor_shape.h +++ b/lib/op-attrs/include/op-attrs/tensor_shape.h @@ -5,11 +5,11 @@ namespace FlexFlow { -size_t num_dims(TensorShape const &); -size_t dim_at_idx(TensorShape const &, relative_ff_dim_t); -size_t &dim_at_idx(TensorShape &, relative_ff_dim_t); -size_t get_num_elements(TensorShape const &); -size_t get_size_in_bytes(TensorShape const &); +nonnegative_int num_dims(TensorShape const &); +nonnegative_int dim_at_idx(TensorShape const &, relative_ff_dim_t); +nonnegative_int &dim_at_idx(TensorShape &, relative_ff_dim_t); +nonnegative_int get_num_elements(TensorShape const &); +nonnegative_int get_size_in_bytes(TensorShape const &); } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/datatype.cc b/lib/op-attrs/src/op-attrs/datatype.cc index 3bee05c253..9bb3b34390 100644 --- a/lib/op-attrs/src/op-attrs/datatype.cc +++ b/lib/op-attrs/src/op-attrs/datatype.cc @@ -1,23 +1,24 @@ #include "op-attrs/datatype.h" #include "utils/containers/contains.h" #include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -size_t size_of_datatype(DataType data_type) { +nonnegative_int size_of_datatype(DataType data_type) { switch (data_type) { case DataType::BOOL: - return sizeof(bool); + return nonnegative_int{sizeof(bool)}; case DataType::INT32: - return sizeof(int32_t); + return nonnegative_int{sizeof(int32_t)}; case DataType::INT64: - return sizeof(int64_t); + return nonnegative_int{sizeof(int64_t)}; case DataType::HALF: - return sizeof(float) / 2; + return nonnegative_int{sizeof(float)} / 2_n; case DataType::FLOAT: - return sizeof(float); + return nonnegative_int{sizeof(float)}; case DataType::DOUBLE: - return sizeof(double); + return nonnegative_int{sizeof(double)}; default: throw mk_runtime_error(fmt::format("Unknown DataType {}", data_type)); } diff --git a/lib/op-attrs/src/op-attrs/ff_dim_t.cc b/lib/op-attrs/src/op-attrs/ff_dim_t.cc index 0a99e39a91..44672fc391 100644 --- a/lib/op-attrs/src/op-attrs/ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/ff_dim_t.cc @@ -2,7 +2,7 @@ namespace FlexFlow { relative_ff_dim_t relative_ff_dim_t_from_ff_dim_t(ff_dim_t ff_dim) { - return relative_ff_dim_t{ff_dim.value.get_value()}; + return relative_ff_dim_t{ff_dim.value.unwrap_nonnegative()}; } } // namespace FlexFlow diff --git a/lib/op-attrs/src/op-attrs/ops/attention.cc b/lib/op-attrs/src/op-attrs/ops/attention.cc index 57c7105534..10fbf412f7 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention.cc @@ -16,79 +16,82 @@ namespace FlexFlow { /* return is_valid; */ /* } */ -int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_qProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -int get_vProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int 
get_vProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.vdim; } -int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_kProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.kdim; } -int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { +nonnegative_int get_oProjSize(MultiHeadAttentionAttrs const &attrs) { return attrs.embed_dim; } -int get_qSize(TensorShape const &query_shape) { +nonnegative_int get_qSize(TensorShape const &query_shape) { return dim_at_idx(query_shape, relative_ff_dim_t{0}); } -int get_kSize(TensorShape const &key_shape) { +nonnegative_int get_kSize(TensorShape const &key_shape) { return dim_at_idx(key_shape, relative_ff_dim_t{0}); } -int get_vSize(TensorShape const &value_shape) { +nonnegative_int get_vSize(TensorShape const &value_shape) { return dim_at_idx(value_shape, relative_ff_dim_t{0}); } -int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_qSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.query_dim.size; } -int get_qSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_qSize(MultiHeadAttentionInputs const &inputs) { return inputs.query_size; } -int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_kSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.key_dim.size; } -int get_kSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_kSize(MultiHeadAttentionInputs const &inputs) { return inputs.key_size; } -int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int get_vSize(MultiHeadAttentionParallelInputs const &inputs) { return inputs.value_dim.size; } -int get_vSize(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_vSize(MultiHeadAttentionInputs const &inputs) { return inputs.value_size; } -int get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_kvSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; } -int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_kvSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; } -int get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_qoSeqLength(MultiHeadAttentionParallelInputs const &inputs) { return inputs.sequence_dim.size; // FIXME -- assumes only prefill } -int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_qoSeqLength(MultiHeadAttentionInputs const &inputs) { return inputs.sequence_length; // FIXME -- assumes only prefil } -int get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { +nonnegative_int + get_num_samples(MultiHeadAttentionParallelInputs const &inputs) { return inputs.batch_dim.size; } -int get_num_samples(MultiHeadAttentionInputs const &inputs) { +nonnegative_int get_num_samples(MultiHeadAttentionInputs const &inputs) { return inputs.batch_size; } @@ -124,10 +127,10 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ parsed.batch_size, parsed.sequence_length, - size_t_from_int(attrs.embed_dim), + attrs.embed_dim, }}, parsed.datatype, }; @@ -147,23 +150,23 @@ tl::expected MultiHeadAttentionInputs parsed = parse_result.value(); // W^Q_i in "Attention Is All You Need" top of page 5 - size_t qProjectWeightSize = parsed.query_size * attrs.kdim; + nonnegative_int qProjectWeightSize = parsed.query_size * 
attrs.kdim; // W^K_i in "Attention Is All You Need" top of page 5 (all i's put together) - size_t kProjectWeightSize = parsed.key_size * attrs.kdim; + nonnegative_int kProjectWeightSize = parsed.key_size * attrs.kdim; // W^V_i in "Attention Is All You Need" top of page 5 (all i's put together) - size_t vProjectWeightSize = parsed.value_size * attrs.vdim; + nonnegative_int vProjectWeightSize = parsed.value_size * attrs.vdim; // W^O in "Attention Is All You Need" top of page 5, with num_heads factored // out - size_t outWeightSize = attrs.vdim * attrs.embed_dim; + nonnegative_int outWeightSize = attrs.vdim * attrs.embed_dim; return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ (qProjectWeightSize + kProjectWeightSize + vProjectWeightSize + outWeightSize), - size_t_from_int(attrs.num_heads), + attrs.num_heads, }}, parsed.datatype, }; @@ -184,8 +187,8 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.kdim + attrs.kdim + attrs.vdim), + TensorDims{FFOrdered{ + attrs.kdim + attrs.kdim + attrs.vdim, }}, parsed.datatype, }; @@ -206,8 +209,8 @@ tl::expected }); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.embed_dim), + TensorDims{FFOrdered{ + attrs.embed_dim, }}, parsed.datatype, }; @@ -235,14 +238,14 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - int joined_dim_degree = 1; - int head_dim_degree = parsed.discard_copy_degree.value; + nonnegative_int joined_dim_degree = 1_n; + nonnegative_int head_dim_degree = parsed.discard_copy_degree.value; return lift_to_parallel_with_degrees( unpar_shape, - SumDegree{1}, + SumDegree{1_n}, DiscardCopyDegree{parsed.batch_dim.degree}, - FFOrdered{joined_dim_degree, head_dim_degree}); + FFOrdered{joined_dim_degree, head_dim_degree}); } tl::expected @@ -273,10 +276,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1}; + FFOrdered shard_degrees = FFOrdered{1_n}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -309,10 +312,10 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ parsed.batch_dim.degree * parsed.discard_copy_degree.value}; - FFOrdered shard_degrees = FFOrdered{1}; + FFOrdered shard_degrees = FFOrdered{1_n}; return lift_to_parallel_with_degrees( unpar_shape, sum_degree, discard_copy_degree, shard_degrees); } @@ -339,402 +342,25 @@ tl::expected } TensorShape unpar_shape = result_unpar_get_shape.value(); - int sum_degree = parsed.discard_copy_degree.value; - int discard_copy_degree = 1; - int batch_degree = parsed.batch_dim.degree; - int seq_len_degree = 1; - int out_dim_degree = 1; + nonnegative_int sum_degree = parsed.discard_copy_degree.value; + nonnegative_int discard_copy_degree = 1_n; + nonnegative_int batch_degree = parsed.batch_dim.degree; + nonnegative_int seq_len_degree = 1_n; + nonnegative_int out_dim_degree = 1_n; return lift_to_parallel_with_degrees( unpar_shape, SumDegree{sum_degree}, DiscardCopyDegree{discard_copy_degree}, - FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); + FFOrdered{batch_degree, seq_len_degree, out_dim_degree}); } -int get_oSize(ParallelTensorShape const &) { +nonnegative_int 
get_oSize(ParallelTensorShape const &) { NOT_IMPLEMENTED(); } -int get_oSize(TensorShape const &) { +nonnegative_int get_oSize(TensorShape const &) { NOT_IMPLEMENTED(); } } // namespace FlexFlow - -// Tensor FFModel::multihead_attention(const Tensor query, -// const Tensor key, -// const Tensor value, -// int embed_dim, -// int num_heads, -// int kdim, -// int vdim, -// float dropout, -// bool bias, -// bool add_bias_kv, -// bool add_zero_attn, -// Initializer *kernel_initializer, -// char const *name) { -// Layer *li = new Layer(this, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// query, -// key, -// value); -// { -// int numdims = query->num_dims; -// int dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdims; i++) { -// dims[i] = query->dims[i]; -// } -// dims[0] = embed_dim; -// li->outputs[0] = create_tensor_legion_ordering( -// numdims, dims, DT_FLOAT, li, 0, true /*create_grad*/); -// } -// { -// // Compute weight size -// int qProjSize = kdim, kProjSize = kdim, vProjSize = kdim, -// oProjSize = embed_dim; -// int qSize = query->dims[0], kSize = key->dims[0], vSize = value->dims[0]; -// int qParas = qProjSize * qSize; -// int kParas = kProjSize * kSize; -// int vParas = vProjSize * vSize; -// int oParas = oProjSize * (vProjSize > 0 ? vProjSize : vSize); -// int dims[2] = {qParas + kParas + vParas + oParas, num_heads}; -// li->weights[0] = create_weight_legion_ordering(2, -// dims, -// DT_FLOAT, -// li, -// true /*create_grad*/, -// kernel_initializer, -// CHOSEN_SYNC_TYPE); -// } -// li->data_type = DT_FLOAT; -// li->add_int_property("embed_dim", embed_dim); -// li->add_int_property("num_heads", num_heads); -// li->add_int_property("kdim", kdim); -// li->add_int_property("vdim", vdim); -// li->add_int_property("bias", bias); -// li->add_int_property("add_bias_kv", add_bias_kv); -// li->add_int_property("add_zero_attn", add_zero_attn); -// li->add_float_property("dropout", dropout); -// layers.push_back(li); -// return li->outputs[0]; -// } - -// MultiHeadAttention::MultiHeadAttention(FFModel &model, -// LayerID const &_layer_guid, -// const ParallelTensor _query, -// const ParallelTensor _key, -// const ParallelTensor _value, -// int _embed_dim, -// int _num_heads, -// int _kdim, -// int _vdim, -// float _dropout, -// bool _bias, -// bool _add_bias_kv, -// bool _add_zero_attn, -// bool allocate_weights, -// char const *name) -// // Initializer* _bias_initializer) -// : Op(model, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// _query, -// _key, -// _value), -// attrs(_embed_dim, -// _num_heads, -// _kdim, -// _vdim, -// _dropout, -// _bias, -// _add_bias_kv, -// _add_zero_attn), -// qSize(_query->dims[0].size), kSize(_key->dims[0].size), -// vSize(_value->dims[0].size), qProjSize(_kdim), -// qoSeqLength(_query->dims[1].size), kvSeqLength(_key->dims[1].size) { -// // overwrite layer_guid -// layer_guid = _layer_guid; - -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// numOutputs = 1; -// int numdim = _query->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i] = _query->dims[i]; -// } -// dims[0].size = _embed_dim; -// // Currently require no parallelism along this dim -// assert(dims[0].degree == 1); -// if (allocate_weights) { -// // Create weight tensor -// int num_dims = inputs[0]->num_dims; -// // Compute weight size -// int qParas = this->qProjSize * 
this->qSize; -// int kParas = kProjSize(attrs) * this->kSize; -// int vParas = vProjSize(attrs) * this->vSize; -// int oParas = oProjSize(attrs) * -// (vProjSize(attrs) > 0 ? vProjSize(attrs) : this->vSize); -// ParallelDim dims[3]; -// dims[0] = inputs[0]->dims[num_dims - 2]; -// dims[0].size = dims[0].degree; -// dims[1] = inputs[0]->dims[num_dims - 1]; -// dims[1].size = this->attrs.num_heads; -// dims[2].size = qParas + kParas + vParas + oParas; -// dims[2].degree = 1; -// dims[2].parallel_idx = -1; -// int seed = std::rand(); -// Initializer *initializer = new GlorotUniform(seed); -// #ifdef USE_NCCL -// ParameterSyncType comm_type = ParameterSyncType::NCCL; -// #else -// ParameterSyncType comm_type = ParameterSyncType::PS; -// #endif -// weights[0] = model.create_parallel_weight<3>(dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// initializer, -// comm_type); -// } - -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// _query->num_dims, dims, DT_FLOAT, this); -// /* for (int i = 0; i < numdim; i++) { */ -// /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ -// /* } */ -// /* // Check correctness */ -// /* assert(check_output_input_weight_parallel_dims()); */ -// } - -// MultiHeadAttention::MultiHeadAttention(FFModel &model, -// const ParallelTensor _query, -// const ParallelTensor _key, -// const ParallelTensor _value, -// const ParallelTensor _weight, -// int _embed_dim, -// int _num_heads, -// int _kdim, -// int _vdim, -// float _dropout, -// bool _bias, -// bool _add_bias_kv, -// bool _add_zero_attn, -// bool allocate_weights, -// char const *name) -// // Initializer* _bias_initializer) -// : Op(model, -// OP_MULTIHEAD_ATTENTION, -// DT_FLOAT, -// name, -// 3 /*inputs*/, -// 1 /*weights*/, -// 1 /*outputs*/, -// _query, -// _key, -// _value, -// _weight), -// attrs(_embed_dim, -// _num_heads, -// _kdim, -// _vdim, -// _dropout, -// _bias, -// _add_bias_kv, -// _add_zero_attn), -// qSize(_query->dims[0].size), kSize(_key->dims[0].size), -// vSize(_value->dims[0].size), qProjSize(_kdim), -// qoSeqLength(_query->dims[1].size), kvSeqLength(_key->dims[1].size) -// // bias_initializer(_bias_initializer) -// { -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// numOutputs = 1; -// int numdim = _query->num_dims; -// ParallelDim dims[MAX_TENSOR_DIM]; -// for (int i = 0; i < numdim; i++) { -// dims[i] = _query->dims[i]; -// } -// // assert key and value have the same sequence length -// assert(_key->dims[1] == _value->dims[1]); -// dims[0].size = _embed_dim; -// // Currently require no parallelism along this dim -// assert(dims[0].degree == 1); -// if (allocate_weights) { -// // Create weight tensor -// int num_dims = inputs[0]->num_dims; -// // Compute weight size -// int qParas = this->qProjSize * this->qSize; -// int kParas = kProjSize(attrs) * this->kSize; -// int vParas = vProjSize(attrs) * this->vSize; -// int oParas = oProjSize(attrs) * -// (vProjSize(attrs) > 0 ? 
vProjSize(attrs) : this->vSize); -// ParallelDim dims[3]; -// dims[0] = inputs[0]->dims[num_dims - 2]; -// dims[0].size = dims[0].degree; -// dims[1] = inputs[0]->dims[num_dims - 1]; -// dims[1].size = this->attrs.num_heads; -// dims[2].size = qParas + kParas + vParas + oParas; -// int seed = std::rand(); -// Initializer *initializer = new GlorotUniform(seed); -// #ifdef USE_NCCL -// ParameterSyncType comm_type = ParameterSyncType::NCCL; -// #else -// ParameterSyncType comm_type = ParameterSyncType::PS; -// #endif -// weights[0] = model.create_parallel_weight<3>(dims, -// DT_FLOAT, -// NULL /*owner_op*/, -// true /*create_grad*/, -// initializer, -// comm_type); -// } -// outputs[0] = model.create_parallel_tensor_legion_ordering( -// _query->num_dims, dims, DT_FLOAT, this); - -// /* for (int i = 0; i < numdim; i++) { */ -// /* register_output_input_parallel_dims(outputs[0], i, inputs[0], i); */ -// /* } */ -// /* register_output_weight_parallel_dims(outputs[0], numdim-1, _weight, 1); -// */ -// /* register_output_weight_parallel_dims(outputs[0], numdim-2, _weight, 2); -// */ -// // Check correctness -// /* assert(check_output_input_weight_parallel_dims()); */ -// } - -// void MultiHeadAttention::forward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_forward(ff, argmap); -// int idx = 0; -// IndexLauncher launcher(ATTENTION_FWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[1]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[2]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// weights[0]->region)); -// launcher.add_field(idx++, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part, -// 0 /*projection id*/, -// WRITE_ONLY, -// EXCLUSIVE, -// outputs[0]->region)); -// launcher.add_field(4, FID_DATA); -// runtime->execute_index_space(ctx, launcher); -// } - -// void MultiHeadAttention::backward(FFModel const &ff) { -// ArgumentMap argmap; -// Context ctx = ff.config.lg_ctx; -// Runtime *runtime = ff.config.lg_hlr; -// set_argumentmap_for_backward(ff, argmap); -// IndexLauncher launcher(ATTENTION_BWD_TASK_ID, -// parallel_is, -// TaskArgument(NULL, 0), -// argmap, -// Predicate::TRUE_PRED, -// false /*must*/, -// 0 /*mapper_id*/, -// outputs[0]->machine_view.hash()); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[0]->region)); -// launcher.add_field(0, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// inputs[1]->region)); -// launcher.add_field(1, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, 
-// inputs[2]->region)); -// launcher.add_field(2, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// weights[0]->region)); -// launcher.add_field(3, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(outputs[0]->part_grad, -// 0 /*projection id*/, -// READ_ONLY, -// EXCLUSIVE, -// outputs[0]->region_grad)); -// launcher.add_field(4, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(weights[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// weights[0]->region_grad)); -// launcher.add_field(5, FID_DATA); -// launcher.add_region_requirement(RegionRequirement(inputs[0]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[0]->region_grad)); -// launcher.add_field(6, FID_DATA); -// int num_regions = 7; -// if (inputs[1]->region != inputs[0]->region) { -// // when key != query -// launcher.add_region_requirement(RegionRequirement(inputs[1]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[1]->region_grad)); -// launcher.add_field(num_regions++, FID_DATA); -// } -// if ((inputs[2]->region != inputs[0]->region) && -// (inputs[2]->region != inputs[1]->region)) { -// // when value != key and value != query -// launcher.add_region_requirement(RegionRequirement(inputs[2]->part_grad, -// 0 /*projection id*/, -// READ_WRITE, -// EXCLUSIVE, -// inputs[2]->region_grad)); -// launcher.add_field(num_regions++, FID_DATA); -// } -// runtime->execute_index_space(ctx, launcher); -// } diff --git a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc index 97544d1750..b9049bf461 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_inputs.cc @@ -31,9 +31,9 @@ tl::expected 3)); } - size_t seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); - size_t seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); - size_t seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); + nonnegative_int seq_len_q = dim_at_idx(input_q, relative_ff_dim_t{-2}); + nonnegative_int seq_len_k = dim_at_idx(input_k, relative_ff_dim_t{-2}); + nonnegative_int seq_len_v = dim_at_idx(input_v, relative_ff_dim_t{-2}); if (!all_same(seq_len_q, seq_len_k, seq_len_v)) { return tl::unexpected(fmt::format( @@ -43,9 +43,9 @@ tl::expected seq_len_v)); } - size_t batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); - size_t batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); - size_t batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); + nonnegative_int batch_size_q = dim_at_idx(input_q, relative_ff_dim_t{-3}); + nonnegative_int batch_size_k = dim_at_idx(input_k, relative_ff_dim_t{-3}); + nonnegative_int batch_size_v = dim_at_idx(input_v, relative_ff_dim_t{-3}); if (!all_same(batch_size_q, batch_size_k, batch_size_v)) { return tl::unexpected(fmt::format( @@ -63,9 +63,9 @@ tl::expected input_v.data_type)); } - size_t q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); - size_t k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); - size_t v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); + nonnegative_int q_size = dim_at_idx(input_q, relative_ff_dim_t{-1}); + nonnegative_int k_size = dim_at_idx(input_k, relative_ff_dim_t{-1}); + nonnegative_int v_size = dim_at_idx(input_v, relative_ff_dim_t{-1}); return MultiHeadAttentionInputs{ batch_size_q, diff --git 
a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc index 3bd0825555..d69b62b759 100644 --- a/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc +++ b/lib/op-attrs/src/op-attrs/ops/attention/multihead_attention_parallel_inputs.cc @@ -107,9 +107,9 @@ tl::expected value_dim.degree)); } - int discard_copy_q = get_discard_copy_degree(input_q); - int discard_copy_k = get_discard_copy_degree(input_k); - int discard_copy_v = get_discard_copy_degree(input_v); + nonnegative_int discard_copy_q = get_discard_copy_degree(input_q); + nonnegative_int discard_copy_k = get_discard_copy_degree(input_k); + nonnegative_int discard_copy_v = get_discard_copy_degree(input_v); if (!all_same(discard_copy_q, discard_copy_k, discard_copy_v)) { return tl::unexpected(fmt::format("Q, K, V disagree on the discard-copy " diff --git a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc index 71118db7a6..d32ae33d14 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_matmul.cc @@ -57,13 +57,13 @@ tl::expected input_rhs.data_type)); } - size_t lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); - size_t n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); - size_t lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); + nonnegative_int lhs_b = dim_at_idx(input_lhs, relative_ff_dim_t{0}); + nonnegative_int n = dim_at_idx(input_lhs, relative_ff_dim_t{1}); + nonnegative_int lhs_m = dim_at_idx(input_lhs, relative_ff_dim_t{2}); - size_t rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); - size_t rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); - size_t p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); + nonnegative_int rhs_b = dim_at_idx(input_rhs, relative_ff_dim_t{0}); + nonnegative_int rhs_m = dim_at_idx(input_rhs, relative_ff_dim_t{1}); + nonnegative_int p = dim_at_idx(input_rhs, relative_ff_dim_t{2}); if (lhs_b != rhs_b) { return tl::unexpected( @@ -76,7 +76,7 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ lhs_b, n, p, @@ -151,9 +151,10 @@ tl::expected ShardParallelDim output_n = n; ShardParallelDim output_p = p; - int output_discard_copy_degree = 1; - int output_sum_degree = get_total_parallel_degree(input_lhs) / - (output_b.degree * output_n.degree * output_p.degree); + nonnegative_int output_discard_copy_degree = 1_n; + nonnegative_int output_sum_degree = + get_total_parallel_degree(input_lhs) / + (output_b.degree * output_n.degree * output_p.degree); ParallelTensorShape result = ParallelTensorShape{ ParallelTensorDims{ diff --git a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc index 472e5f1a25..ed58fe5189 100644 --- a/lib/op-attrs/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/batch_norm.cc @@ -67,10 +67,10 @@ tl::expected return tl::unexpected("No gamma weights exist for attrs.affine = false"); } - size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); return TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_channels, }}, DataType::FLOAT, @@ -97,26 +97,23 @@ static std::optional input_degrees); } - if (input_degrees.sum_degree != SumDegree{1}) { + if (input_degrees.sum_degree != SumDegree{1_n}) { return fmt::format("Expected sum degree 1, but receieved sum degree {}", input_degrees.sum_degree); } - 
if (input_degrees.discard_copy_degree != DiscardCopyDegree{1}) { + if (input_degrees.discard_copy_degree != DiscardCopyDegree{1_n}) { return fmt::format( "Expected discard copy degree 1, but receieved discard copy degree {}", input_degrees.discard_copy_degree); } - FFOrdered non_channel_degrees = - concat(slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{0}}, - ff_dim_t{nonnegative_int{1}}), - slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{2}}, - std::nullopt)); + FFOrdered non_channel_degrees = + concat(slice(input_degrees.shard_degrees, ff_dim_t{0_n}, ff_dim_t{1_n}), + slice(input_degrees.shard_degrees, ff_dim_t{2_n}, std::nullopt)); - if (any_of(non_channel_degrees, [](int degree) { return degree != 1; })) { + if (any_of(non_channel_degrees, + [](nonnegative_int degree) { return degree != 1_n; })) { return fmt::format("Expected parallel degree of all non-channel dimensions " "to be 1, but received input with degrees {}", input_degrees); @@ -159,9 +156,9 @@ tl::expected relative_ff_dim_t channel_dim = relative_ff_dim_t{1}; return ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{input_degrees.shard_degrees.at(channel_dim)}, }; } diff --git a/lib/op-attrs/src/op-attrs/ops/concat.cc b/lib/op-attrs/src/op-attrs/ops/concat.cc index 3019151236..fc42241ef2 100644 --- a/lib/op-attrs/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/src/op-attrs/ops/concat.cc @@ -17,7 +17,8 @@ tl::expected get_output_shape(ConcatAttrs const &attrs, std::vector const &inputs) { auto get_non_axis_dims = [&](TensorShape const &s) { - std::map dim_sizes = enumerate(ff_ordered(s.dims)); + std::map dim_sizes = + enumerate(ff_ordered(s.dims)); dim_sizes.erase(attrs.axis); return dim_sizes; }; @@ -40,8 +41,8 @@ tl::expected inputs)); } - std::map non_axis_dims = ({ - tl::expected, std::string> returned = + std::map non_axis_dims = ({ + tl::expected, std::string> returned = require_all_same1(transform(inputs, get_non_axis_dims)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -49,12 +50,12 @@ tl::expected returned.value(); }); - std::vector axis_dim_sizes = + std::vector axis_dim_sizes = transform(inputs, [&](TensorShape const &s) { return dim_at_idx(s, relative_ff_dim_t_from_ff_dim_t(attrs.axis)); }); - size_t output_axis_dim_size = sum(axis_dim_sizes); + nonnegative_int output_axis_dim_size = sum(axis_dim_sizes); non_axis_dims.insert({attrs.axis, output_axis_dim_size}); @@ -88,7 +89,7 @@ tl::expected }); SumDegree sum_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_sum_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); @@ -97,7 +98,7 @@ tl::expected }); DiscardCopyDegree discard_copy_degree = ({ - tl::expected returned = + tl::expected returned = require_all_same1(transform(inputs, get_discard_copy_degree)); if (!returned.has_value()) { return tl::unexpected(returned.error()); diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc index eac756cc15..d1ba536d24 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/conv_2d.cc @@ -25,11 +25,11 @@ TensorShape get_kernel_shape(Conv2DAttrs const &attrs, Conv2DInputShape input = parse_input_shape(raw_input_shape); return TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(attrs.out_channels), + TensorDims{FFOrdered{ + attrs.out_channels, 
input.num_channels, - size_t_from_int(attrs.kernel_h), - size_t_from_int(attrs.kernel_w), + attrs.kernel_h, + attrs.kernel_w, }}, input.datatype, }; @@ -42,29 +42,44 @@ TensorShape get_bias_shape(Conv2DAttrs const &attrs, return TensorShape{ TensorDims{ - FFOrdered{size_t_from_int(attrs.out_channels)}, + FFOrdered{attrs.out_channels}, }, input.datatype, }; } +static nonnegative_int calculate_output_size(nonnegative_int input_size, + nonnegative_int padding_size, + nonnegative_int kernel_size, + nonnegative_int stride) { + int input_size_raw = input_size.unwrap_nonnegative(); + int padding_raw = padding_size.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.unwrap_nonnegative(); + int stride_raw = stride.unwrap_nonnegative(); + + return nonnegative_int{ + (input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; +} + TensorShape get_output_shape(Conv2DAttrs const &attrs, TensorShape const &raw_input_shape) { assert(attrs.groups == 1); // TODO(@lockshaw): currently not supported Conv2DInputShape input = parse_input_shape(raw_input_shape); - size_t out_height = - (input.height + (2 * attrs.padding_h) - attrs.kernel_h) / attrs.stride_h + - 1; - size_t out_width = - (input.width + (2 * attrs.padding_w) - attrs.kernel_w) / attrs.stride_w + - 1; - - assert(attrs.out_channels > 0); - - return TensorShape{TensorDims{FFOrdered{ + nonnegative_int out_height = + calculate_output_size(/*input_size=*/input.height, + /*padding_size=*/attrs.padding_h, + /*kernel_size=*/attrs.kernel_h, + /*stride_size=*/attrs.stride_h); + nonnegative_int out_width = + calculate_output_size(/*input_size=*/input.width, + /*padding_size=*/attrs.padding_w, + /*kernel_size=*/attrs.kernel_w, + /*stride_size=*/attrs.stride_w); + + return TensorShape{TensorDims{FFOrdered{ input.num_samples, - size_t_from_int(attrs.out_channels), + attrs.out_channels, out_height, out_width, }}, @@ -82,14 +97,14 @@ ParallelTensorShape get_kernel_shape(Conv2DAttrs const &attrs, assert(parsed.height_dim.degree == 1); assert(parsed.width_dim.degree == 1); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.sample_dim.degree * parsed.sum_reduction_degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, parsed.channel_dim.degree, - 1, - 1, + 1_n, + 1_n, }; return lift_to_parallel_with_degrees( @@ -109,7 +124,7 @@ ParallelTensorShape get_bias_shape(Conv2DAttrs const &attrs, DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{parsed.height_dim.degree * parsed.width_dim.degree * parsed.sample_dim.degree}; - FFOrdered shard_degrees = { + FFOrdered shard_degrees = { parsed.discard_copy_reduction_degree, }; @@ -130,12 +145,12 @@ ParallelTensorShape get_output_shape(Conv2DAttrs const &attrs, SumDegree sum_degree = SumDegree{parsed.sum_reduction_degree * parsed.channel_dim.degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = { + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = { parsed.sample_dim.degree, parsed.discard_copy_reduction_degree, - 1, - 1, + 1_n, + 1_n, }; return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc index aad067feb2..1491410491 100644 --- a/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc +++ 
b/lib/op-attrs/src/op-attrs/ops/conv_2d/conv_2d_input_shape.cc @@ -6,10 +6,10 @@ namespace FlexFlow { Conv2DInputShape parse_input_shape(TensorShape const &input) { assert(num_dims(input) == 4); - size_t num_samples = dim_at_idx(input, relative_ff_dim_t{0}); - size_t in_channels = dim_at_idx(input, relative_ff_dim_t{1}); - size_t in_height = dim_at_idx(input, relative_ff_dim_t{2}); - size_t in_width = dim_at_idx(input, relative_ff_dim_t{3}); + nonnegative_int num_samples = dim_at_idx(input, relative_ff_dim_t{0}); + nonnegative_int in_channels = dim_at_idx(input, relative_ff_dim_t{1}); + nonnegative_int in_height = dim_at_idx(input, relative_ff_dim_t{2}); + nonnegative_int in_width = dim_at_idx(input, relative_ff_dim_t{3}); return Conv2DInputShape{ num_samples, diff --git a/lib/op-attrs/src/op-attrs/ops/embedding.cc b/lib/op-attrs/src/op-attrs/ops/embedding.cc index fe557695da..29bd70be2f 100644 --- a/lib/op-attrs/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/src/op-attrs/ops/embedding.cc @@ -50,9 +50,9 @@ tl::expected return TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(attrs.num_entries), - size_t_from_int(attrs.out_channels), + FFOrdered{ + attrs.num_entries, + attrs.out_channels, }, }, attrs.data_type, @@ -74,8 +74,8 @@ tl::expected SumDegree sum_degree = SumDegree{shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = transform(input.dims.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); @@ -96,13 +96,13 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product( - transform(ff_ordered_shard_dims(input.dims), - [](ShardParallelDim const &d) -> int { return d.degree; }))}; - int entry_dim_degree = 1; - int out_channel_degree = get_discard_copy_degree(input); - FFOrdered shard_degrees = { + SumDegree sum_degree = SumDegree{1_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(transform( + ff_ordered_shard_dims(input.dims), + [](ShardParallelDim const &d) -> nonnegative_int { return d.degree; }))}; + nonnegative_int entry_dim_degree = 1_n; + nonnegative_int out_channel_degree = get_discard_copy_degree(input); + FFOrdered shard_degrees = { entry_dim_degree, out_channel_degree, }; diff --git a/lib/op-attrs/src/op-attrs/ops/flat.cc b/lib/op-attrs/src/op-attrs/ops/flat.cc index bc86102566..8ed12167b3 100644 --- a/lib/op-attrs/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/src/op-attrs/ops/flat.cc @@ -11,12 +11,11 @@ namespace FlexFlow { TensorShape get_output_shape(FlatAttrs const &attrs, TensorShape const &input_shape) { - FFOrdered leading_dims = slice(ff_ordered(input_shape.dims), - ff_dim_t{nonnegative_int{0}}, - attrs.start_dim); - FFOrdered flattened_dims = + FFOrdered leading_dims = + slice(ff_ordered(input_shape.dims), ff_dim_t{0_n}, attrs.start_dim); + FFOrdered flattened_dims = slice(ff_ordered(input_shape.dims), attrs.start_dim, attrs.end_dim); - FFOrdered trailing_dims = + FFOrdered trailing_dims = slice(ff_ordered(input_shape.dims), attrs.end_dim, std::nullopt); if (flattened_dims.empty()) { @@ -38,14 +37,15 @@ TensorShape get_output_shape(FlatAttrs const &attrs, tl::expected get_output_parallel_dim_degrees( FlatAttrs const &attrs, ParallelTensorDimDegrees const &input_degrees) { - FFOrdered 
flattened_dim_degrees = + FFOrdered flattened_dim_degrees = slice(input_degrees.shard_degrees, attrs.start_dim, attrs.end_dim); if (flattened_dim_degrees.empty()) { return input_degrees; } - if (any_of(flattened_dim_degrees, [](int degree) { return degree != 1; })) { + if (any_of(flattened_dim_degrees, + [](nonnegative_int degree) { return degree != 1; })) { return tl::unexpected( fmt::format("get_output_parallel_dim_degrees for {} expected all shard " "degrees of flattened dimensions to be 1, but received {}", @@ -58,9 +58,7 @@ tl::expected /*discard_copy_degree=*/input_degrees.discard_copy_degree, /*shard_degrees=*/ concat(std::vector{ - slice(input_degrees.shard_degrees, - ff_dim_t{nonnegative_int{0}}, - attrs.start_dim), + slice(input_degrees.shard_degrees, ff_dim_t{0_n}, attrs.start_dim), {product(flattened_dim_degrees)}, slice(input_degrees.shard_degrees, attrs.end_dim, std::nullopt), }), diff --git a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc index 86426dd18f..2394579e53 100644 --- a/lib/op-attrs/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/src/op-attrs/ops/layer_norm.cc @@ -71,7 +71,7 @@ tl::expected std::vector non_layer_norm_dim_idxs = filter( get_idxs(input_shape.dims.ff_ordered), [&](ff_dim_t const &dim_idx) { return !contains(attrs.axes, dim_idx); }); - std::vector raw_weight_dims = + std::vector raw_weight_dims = transform(non_layer_norm_dim_idxs, [&](ff_dim_t const &dim_idx) { return dim_at_idx(input_shape, relative_ff_dim_t_from_ff_dim_t(dim_idx)); @@ -174,8 +174,8 @@ tl::expected ParallelTensorDims{ ff_ordered_of(raw_weight_shard_dims), ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/src/op-attrs/ops/linear.cc b/lib/op-attrs/src/op-attrs/ops/linear.cc index e00a47d490..0387c143d7 100644 --- a/lib/op-attrs/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/src/op-attrs/ops/linear.cc @@ -41,11 +41,11 @@ RecordFormatter as_dot(LinearAttrs const &attrs) { tl::expected get_projection_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { - size_t in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); + nonnegative_int in_channels = dim_at_idx(input_shape, relative_ff_dim_t{-1}); return TensorShape{ TensorDims{ - FFOrdered{in_channels, size_t_from_int(attrs.out_channels)}, + FFOrdered{in_channels, attrs.out_channels}, }, input_shape.data_type, }; @@ -55,7 +55,7 @@ tl::expected get_bias_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { return TensorShape{ TensorDims{ - FFOrdered{size_t_from_int(attrs.out_channels)}, + FFOrdered{attrs.out_channels}, }, input_shape.data_type, }; @@ -64,8 +64,7 @@ tl::expected tl::expected get_output_shape(LinearAttrs const &attrs, TensorShape const &input_shape) { TensorShape output_shape = input_shape; - output_shape.dims.ff_ordered.at(relative_ff_dim_t{-1}) = - size_t_from_int(attrs.out_channels); + output_shape.dims.ff_ordered.at(relative_ff_dim_t{-1}) = attrs.out_channels; return output_shape; } @@ -82,12 +81,12 @@ tl::expected result_unpar.value(); }); - SumDegree sum_degree = SumDegree{1}; + SumDegree sum_degree = SumDegree{1_n}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{ get_sum_degree(input) * product(slice(ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{ + FFOrdered shard_degrees = FFOrdered{ shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree, 
get_discard_copy_degree(input), }; @@ -112,7 +111,8 @@ tl::expected shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{product(slice( ff_ordered_shard_degrees(input), std::nullopt, relative_ff_dim_t{-1}))}; - FFOrdered shard_degrees = FFOrdered{get_discard_copy_degree(input)}; + FFOrdered shard_degrees = + FFOrdered{get_discard_copy_degree(input)}; return lift_to_parallel_with_degrees( unpar, sum_degree, discard_copy_degree, shard_degrees); @@ -133,8 +133,8 @@ tl::expected SumDegree sum_degree = SumDegree{get_sum_degree(input) * shard_dim_at_idx(input, relative_ff_dim_t{-1}).degree}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1}; - FFOrdered shard_degrees = ff_ordered_shard_degrees(input); + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{1_n}; + FFOrdered shard_degrees = ff_ordered_shard_degrees(input); shard_degrees.at(relative_ff_dim_t{-1}) = get_discard_copy_degree(input); return lift_to_parallel_with_degrees( diff --git a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc index 86d287ebc8..f9630e16b1 100644 --- a/lib/op-attrs/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/src/op-attrs/ops/pool_2d.cc @@ -8,8 +8,8 @@ namespace FlexFlow { tl::expected make_adaptive_pool2d_attrs(TensorDims const &input_dims, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp pool_type, std::optional const &activation) { // AdaptivePool2D semantics pulled from @@ -22,10 +22,10 @@ tl::expected input_dims)); } - size_t num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); - size_t input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); - size_t input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); + nonnegative_int num_samples = dim_at_idx(input_dims, relative_ff_dim_t{0}); + nonnegative_int num_channels = dim_at_idx(input_dims, relative_ff_dim_t{1}); + nonnegative_int input_h = dim_at_idx(input_dims, relative_ff_dim_t{2}); + nonnegative_int input_w = dim_at_idx(input_dims, relative_ff_dim_t{3}); if (input_h % output_h != 0) { return tl::unexpected(fmt::format( @@ -55,29 +55,29 @@ tl::expected // = `ind / outd` // = `stride` - int kernel_h = input_h / output_h; - int kernel_w = input_w / output_w; + nonnegative_int kernel_h = input_h / output_h; + nonnegative_int kernel_w = input_w / output_w; - int stride_h = kernel_h; - int stride_w = kernel_w; + nonnegative_int stride_h = kernel_h; + nonnegative_int stride_w = kernel_w; Pool2DAttrs attrs = Pool2DAttrs{ /*kernel_h=*/kernel_h, /*kernel_w=*/kernel_w, /*stride_h=*/stride_h, /*stride_w=*/stride_w, - /*padding_h=*/0, - /*padding_w=*/0, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/pool_type, /*activation=*/activation, }; TensorShape expected_ouput_shape = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, num_channels, - size_t_from_int(output_h), - size_t_from_int(output_w), + output_h, + output_w, }}, DataType::FLOAT, }; @@ -104,6 +104,19 @@ tl::expected return attrs; } +static nonnegative_int calculate_output_size(nonnegative_int input_size, + nonnegative_int padding_size, + nonnegative_int kernel_size, + nonnegative_int stride) { + int input_size_raw = input_size.unwrap_nonnegative(); + int padding_raw = padding_size.unwrap_nonnegative(); + int kernel_size_raw = kernel_size.unwrap_nonnegative(); + int stride_raw = stride.unwrap_nonnegative(); + + return nonnegative_int{ + 
(input_size_raw + (2 * padding_raw) - kernel_size_raw) / stride_raw + 1}; +} + tl::expected get_output_shape(Pool2DAttrs const &attrs, TensorShape const &input_shape) { if (num_dims(input_shape) != 4) { @@ -113,19 +126,23 @@ tl::expected input_shape)); } - size_t num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); - size_t num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); - size_t input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); - size_t input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); - - size_t output_height = - (input_height + 2 * attrs.padding_h - attrs.kernel_h) / attrs.stride_h + - 1; - - size_t output_width = - (input_width + 2 * attrs.padding_w - attrs.kernel_w) / attrs.stride_w + 1; - - return TensorShape{TensorDims{FFOrdered{ + nonnegative_int num_samples = dim_at_idx(input_shape, relative_ff_dim_t{0}); + nonnegative_int num_channels = dim_at_idx(input_shape, relative_ff_dim_t{1}); + nonnegative_int input_height = dim_at_idx(input_shape, relative_ff_dim_t{2}); + nonnegative_int input_width = dim_at_idx(input_shape, relative_ff_dim_t{3}); + + nonnegative_int output_height = + calculate_output_size(/*input_size=*/input_height, + /*padding_size=*/attrs.padding_h, + /*kernel_size=*/attrs.kernel_h, + /*stride_size=*/attrs.stride_h); + nonnegative_int output_width = + calculate_output_size(/*input_size=*/input_width, + /*padding_size=*/attrs.padding_w, + /*kernel_size=*/attrs.kernel_w, + /*stride_size=*/attrs.stride_w); + + return TensorShape{TensorDims{FFOrdered{ num_samples, num_channels, output_height, diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc index 0bb940924a..7a8f91e498 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_dims.cc @@ -7,9 +7,11 @@ #include "op-attrs/tensor_dims.h" #include "utils/containers/all_of.h" #include "utils/containers/product.h" +#include "utils/containers/repeat_element.h" #include "utils/containers/transform.h" #include "utils/containers/vector_of.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -17,7 +19,8 @@ FFOrdered ff_ordered_shard_dims(ParallelTensorDims const &d) { return d.shard_dims; } -FFOrdered ff_ordered_shard_degrees(ParallelTensorDims const &d) { +FFOrdered + ff_ordered_shard_degrees(ParallelTensorDims const &d) { return transform(d.shard_dims, [](ShardParallelDim const &d) { return d.degree; }); } @@ -27,8 +30,8 @@ std::unordered_set return get_replica_dims(d.replica_dims); } -size_t num_shard_dims(ParallelTensorDims const &dims) { - return dims.shard_dims.size(); +nonnegative_int num_shard_dims(ParallelTensorDims const &dims) { + return num_elements(dims.shard_dims); } ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { @@ -40,22 +43,22 @@ ParallelTensorDimDegrees get_parallel_degrees(ParallelTensorDims const &d) { } ParallelTensorDims lift_to_parallel(TensorDims const &dims) { - std::vector shard_degrees(num_dims(dims), - 1); // 1 repeated num_dims(dims) times + std::vector shard_degrees = + repeat_element(/*num_times=*/num_dims(dims), /*element=*/1_n); return lift_to_parallel_with_degrees( - dims, SumDegree{1}, DiscardCopyDegree{1}, shard_degrees); + dims, SumDegree{1_n}, DiscardCopyDegree{1_n}, shard_degrees); } -ParallelTensorDims - lift_to_parallel_with_degrees(TensorDims const &unpar, - SumDegree const &sum_degree, - DiscardCopyDegree const &discard_copy_degree, - 
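// ---------------------------------------------------------------------------
// [editor's aside -- illustration only, not part of the patch]
// Sanity check of the adaptive-pool derivation above: with zero padding and
// kernel == stride == input/output, the generic output-size formula (the same
// one calculate_output_size implements) recovers the requested output size
// whenever output divides input. Plain ints stand in for nonnegative_int.
#include <cassert>
static int pool_output_size(int in, int pad, int kernel, int stride) {
  return (in + 2 * pad - kernel) / stride + 1; // floor division, as above
}
int main() {
  int in = 12, out = 4;
  int kernel = in / out; // kernel == stride == 3
  assert(pool_output_size(in, /*pad=*/0, kernel, /*stride=*/kernel) == out);
}
// ---------------------------------------------------------------------------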
FFOrdered const &shard_degrees) { +ParallelTensorDims lift_to_parallel_with_degrees( + TensorDims const &unpar, + SumDegree const &sum_degree, + DiscardCopyDegree const &discard_copy_degree, + FFOrdered const &shard_degrees) { std::vector lifted = transform(zip(vector_of(unpar.ff_ordered), vector_of(shard_degrees)), - [](std::pair const &p) { - size_t size = p.first; - int degree = p.second; + [](std::pair const &p) { + nonnegative_int size = p.first; + nonnegative_int degree = p.second; return ShardParallelDim{size, degree}; }); @@ -75,17 +78,17 @@ ParallelTensorDims degrees.shard_degrees); } -int total_replica_degree(ParallelTensorDims const &dims) { +nonnegative_int total_replica_degree(ParallelTensorDims const &dims) { return dims.replica_dims.discard_copy_degree.value * dims.replica_dims.sum_degree.value; } -int total_shard_degree(ParallelTensorDims const &dims) { +nonnegative_int total_shard_degree(ParallelTensorDims const &dims) { return product(transform(vector_of(dims.shard_dims), [](ShardParallelDim const &d) { return d.degree; })); } -int total_parallel_degree(ParallelTensorDims const &dims) { +nonnegative_int total_parallel_degree(ParallelTensorDims const &dims) { return total_replica_degree(dims) * total_shard_degree(dims); } @@ -115,7 +118,7 @@ TensorDims get_tensor_dims_unsafe(ParallelTensorDims const &) { } TensorDims get_reduced_dims(ParallelTensorDims const &dims) { - FFOrdered dim_sizes = transform( + FFOrdered dim_sizes = transform( dims.shard_dims, [](ShardParallelDim const &d) { return d.size; }); return TensorDims{dim_sizes}; } diff --git a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc index bbad13b46b..260ec7c3cd 100644 --- a/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/parallel_tensor_shape.cc @@ -6,11 +6,12 @@ #include "utils/containers/range.h" #include "utils/containers/transform.h" #include "utils/hash-utils.h" +#include "utils/nonnegative_int/nonnegative_range.h" #include "utils/overload.h" namespace FlexFlow { -int num_shard_dims(ParallelTensorShape const &s) { +nonnegative_int num_shard_dims(ParallelTensorShape const &s) { return num_shard_dims(s.dims); } @@ -19,21 +20,21 @@ std::unordered_set return replica_dims(s.dims); } -int get_num_replicas(ParallelTensorShape const &shape) { - return product( - transform(replica_dims(shape), - [](ReplicaParallelDim const &d) -> int { return d.degree; })); +nonnegative_int get_num_replicas(ParallelTensorShape const &shape) { + return product(transform( + replica_dims(shape), + [](ReplicaParallelDim const &d) -> nonnegative_int { return d.degree; })); } -int get_sum_degree(ParallelTensorShape const &shape) { +nonnegative_int get_sum_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.sum_degree.value; } -int get_discard_copy_degree(ParallelTensorShape const &shape) { +nonnegative_int get_discard_copy_degree(ParallelTensorShape const &shape) { return shape.dims.replica_dims.discard_copy_degree.value; } -int get_total_parallel_degree(ParallelTensorShape const &s) { +nonnegative_int get_total_parallel_degree(ParallelTensorShape const &s) { return total_parallel_degree(s.dims); } @@ -51,7 +52,8 @@ ShardParallelDim &shard_dim_at_idx(ParallelTensorShape &s, return shard_dim_at_idx(s.dims, d); } -FFOrdered ff_ordered_shard_degrees(ParallelTensorShape const &s) { +FFOrdered + ff_ordered_shard_degrees(ParallelTensorShape const &s) { return ff_ordered_shard_degrees(s.dims); } @@ -73,11 +75,11 @@ ParallelTensorShape 
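// ---------------------------------------------------------------------------
// [editor's aside -- illustration only, not part of the patch]
// The total_* helpers above compose as: total = sum_degree *
// discard_copy_degree * product(shard_degrees). A self-contained check of
// that bookkeeping, with plain ints standing in for nonnegative_int:
#include <cassert>
#include <vector>
static int total_parallel_degree_of(int sum_degree,
                                    int discard_copy_degree,
                                    std::vector<int> const &shard_degrees) {
  int total = sum_degree * discard_copy_degree;
  for (int d : shard_degrees) {
    total *= d;
  }
  return total;
}
int main() {
  assert(total_parallel_degree_of(1, 2, {4, 1, 3}) == 24);
}
// ---------------------------------------------------------------------------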
lift_to_parallel(TensorShape const &s) { return ParallelTensorShape{lift_to_parallel(s.dims), s.data_type}; } -ParallelTensorShape - lift_to_parallel_with_degrees(TensorShape const &unpar, - SumDegree const &sum_degree, - DiscardCopyDegree const &discard_copy_degree, - FFOrdered const &shard_degrees) { +ParallelTensorShape lift_to_parallel_with_degrees( + TensorShape const &unpar, + SumDegree const &sum_degree, + DiscardCopyDegree const &discard_copy_degree, + FFOrdered const &shard_degrees) { return ParallelTensorShape{ lift_to_parallel_with_degrees( unpar.dims, sum_degree, discard_copy_degree, shard_degrees), @@ -95,8 +97,8 @@ ParallelTensorShape } TensorShape require_not_parallel(ParallelTensorShape const &s) { - int total_degree = get_total_parallel_degree(s); - if (total_degree != 1) { + nonnegative_int total_degree = get_total_parallel_degree(s); + if (total_degree != 1_n) { throw mk_runtime_error( fmt::format("Error: require_not_parallel received a parallel tensor " "shape with parallel degree {}: {}", @@ -124,25 +126,27 @@ TensorShape get_reduced_shape(ParallelTensorShape const &s) { ParallelDim get_parallel_dim_at_idx(ParallelTensorShape const &shape, parallel_tensor_dim_idx_t idx) { - return idx.visit( - overload{[&](ff_dim_t shard_dim) { - return ParallelDim{shape.dims.shard_dims.at(shard_dim)}; - }, - [&](ReplicaType replica_type) { - ReplicaParallelDimSet replicas = shape.dims.replica_dims; - int degree = (ReplicaType::SUM == replica_type - ? replicas.sum_degree.value - : replicas.discard_copy_degree.value); - return ParallelDim{ReplicaParallelDim{degree, replica_type}}; - }}); + return idx.visit(overload{ + [&](ff_dim_t shard_dim) { + return ParallelDim{shape.dims.shard_dims.at(shard_dim)}; + }, + [&](ReplicaType replica_type) { + ReplicaParallelDimSet replicas = shape.dims.replica_dims; + nonnegative_int degree = (ReplicaType::SUM == replica_type + ? 
replicas.sum_degree.value + : replicas.discard_copy_degree.value); + return ParallelDim{ReplicaParallelDim{degree, replica_type}}; + }}); } std::unordered_set get_parallel_tensor_dim_indices(ParallelTensorShape const &shape) { std::unordered_set indices; - extend(indices, transform(range(num_shard_dims(shape.dims)), [](int idx) { - return parallel_tensor_dim_idx_t{ff_dim_t{nonnegative_int{idx}}}; - })); + extend(indices, + transform(nonnegative_range(num_shard_dims(shape.dims)), + [](nonnegative_int idx) { + return parallel_tensor_dim_idx_t{ff_dim_t{idx}}; + })); indices.insert(parallel_tensor_dim_idx_t{ReplicaType::SUM}); indices.insert(parallel_tensor_dim_idx_t{ReplicaType::DISCARD_COPY}); return indices; diff --git a/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc index 0671bb05f2..a987841b18 100644 --- a/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc +++ b/lib/op-attrs/src/op-attrs/relative_ff_dim_t.cc @@ -3,10 +3,10 @@ namespace FlexFlow { ff_dim_t ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t ff_dim, - int input_dim) { + nonnegative_int input_dim) { int raw = ff_dim.value; if (raw < 0) { - raw = input_dim + raw; + raw = input_dim.unwrap_nonnegative() + raw; } return ff_dim_t{nonnegative_int{raw}}; } diff --git a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc index 20c88c77dc..fc712be10b 100644 --- a/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc +++ b/lib/op-attrs/src/op-attrs/replica_parallel_dim_set.cc @@ -4,11 +4,11 @@ namespace FlexFlow { ReplicaParallelDimSet empty_replica_parallel_dim_set() { - return ReplicaParallelDimSet{SumDegree{1}, DiscardCopyDegree{1}}; + return ReplicaParallelDimSet{SumDegree{1_n}, DiscardCopyDegree{1_n}}; } -int get_order_of_replica_type(ReplicaParallelDimSet const &s, - ReplicaType replica_type) { +nonnegative_int get_degree_of_replica_type(ReplicaParallelDimSet const &s, + ReplicaType replica_type) { switch (replica_type) { case ReplicaType::SUM: return s.sum_degree.value; diff --git a/lib/op-attrs/src/op-attrs/tensor_dims.cc b/lib/op-attrs/src/op-attrs/tensor_dims.cc index f0ac88d8e4..f9198bbe28 100644 --- a/lib/op-attrs/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/src/op-attrs/tensor_dims.cc @@ -8,22 +8,23 @@ #include "utils/containers/vector_of.h" #include "utils/containers/zip.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -FFOrdered const &ff_ordered(TensorDims const &dims) { +FFOrdered const &ff_ordered(TensorDims const &dims) { return dims.ff_ordered; } -size_t num_dims(TensorDims const &dims) { - return dims.ff_ordered.size(); +nonnegative_int num_dims(TensorDims const &dims) { + return num_elements(dims.ff_ordered); } -size_t dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { +nonnegative_int dim_at_idx(TensorDims const &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } -size_t &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { +nonnegative_int &dim_at_idx(TensorDims &dims, relative_ff_dim_t idx) { return dims.ff_ordered.at(idx); } @@ -33,8 +34,8 @@ bool tensor_dims_is_broadcastable_to(TensorDims const &curr, return false; } - std::vector curr_dims = vector_of(curr.ff_ordered); - std::vector goal_dims = vector_of(goal.ff_ordered); + std::vector curr_dims = vector_of(curr.ff_ordered); + std::vector goal_dims = vector_of(goal.ff_ordered); for (auto const &[curr_dim, goal_dim] : zip(reversed(curr_dims), reversed(goal_dims))) { diff 
--git a/lib/op-attrs/src/op-attrs/tensor_shape.cc b/lib/op-attrs/src/op-attrs/tensor_shape.cc index 70ed58aac6..690a07d26a 100644 --- a/lib/op-attrs/src/op-attrs/tensor_shape.cc +++ b/lib/op-attrs/src/op-attrs/tensor_shape.cc @@ -4,26 +4,27 @@ #include "utils/containers/get_only.h" #include "utils/containers/product.h" #include "utils/containers/transform.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { -size_t num_dims(TensorShape const &s) { - return s.dims.ff_ordered.size(); +nonnegative_int num_dims(TensorShape const &s) { + return num_elements(s.dims.ff_ordered); } -size_t dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { +nonnegative_int dim_at_idx(TensorShape const &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -size_t &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { +nonnegative_int &dim_at_idx(TensorShape &s, relative_ff_dim_t idx) { return dim_at_idx(s.dims, idx); } -size_t get_num_elements(TensorShape const &s) { +nonnegative_int get_num_elements(TensorShape const &s) { return product(s.dims.ff_ordered); } -size_t get_size_in_bytes(TensorShape const &s) { +nonnegative_int get_size_in_bytes(TensorShape const &s) { return get_num_elements(s) * size_of_datatype(s.data_type); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/attention.cc b/lib/op-attrs/test/src/op-attrs/ops/attention.cc index eca8559b21..b317c5c69c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/attention.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/attention.cc @@ -10,10 +10,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attention_incoming_tensor_roles(MultiHeadAttentionAttrs)") { auto make_attrs = [](bool bias) { return MultiHeadAttentionAttrs{ - /*embed_dim=*/32, - /*num_heads=*/10, - /*kdim=*/32, - /*vdim=*/32, + /*embed_dim=*/32_n, + /*num_heads=*/10_n, + /*kdim=*/32_n, + /*vdim=*/32_n, /*dropout=*/0.0, /*bias=*/bias, /*add_bias_kv=*/false, @@ -58,8 +58,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(MultiHeadAttentionAttrs, TensorShape, " "TensorShape, TensorShape)") { - int embed_dim = 32; - int num_heads = 10; + nonnegative_int embed_dim = 32_n; + nonnegative_int num_heads = 10_n; /* Parameter meanings match those at * https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html @@ -75,13 +75,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*add_zero_attn=*/false, }; - size_t batch_size = 40; - size_t seq_len = 48; - size_t feature_size = 36; + nonnegative_int batch_size = 40_n; + nonnegative_int seq_len = 48_n; + nonnegative_int feature_size = 36_n; TensorShape input_q = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -92,7 +92,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_k = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -103,7 +103,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_v = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, feature_size, @@ -114,10 +114,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, seq_len, - size_t_from_int(attrs.embed_dim), + attrs.embed_dim, }, }, DataType::FLOAT, @@ -125,9 +125,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - (feature_size * embed_dim) * 3 + (embed_dim * embed_dim), - size_t_from_int(num_heads), + FFOrdered{ + (feature_size * embed_dim) * 3_n + (embed_dim * embed_dim), + num_heads, }, }, DataType::FLOAT, @@ -135,8 +135,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape 
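// ---------------------------------------------------------------------------
// [editor's aside -- illustration only, not part of the patch]
// Stand-in for get_num_elements/get_size_in_bytes above: the element count is
// the product of the dims, and the byte size scales by the datatype width
// (4 bytes for FLOAT is an assumption of this sketch).
#include <cassert>
#include <cstddef>
#include <vector>
static std::size_t size_in_bytes(std::vector<std::size_t> const &dims,
                                 std::size_t dtype_bytes) {
  std::size_t n = 1;
  for (std::size_t d : dims) {
    n *= d;
  }
  return n * dtype_bytes;
}
int main() {
  assert(size_in_bytes({12, 16}, /*FLOAT=*/4) == 768);
}
// ---------------------------------------------------------------------------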
input_bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(embed_dim * 3), + FFOrdered{ + embed_dim * 3_n, }, }, DataType::FLOAT, @@ -144,8 +144,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output_bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(embed_dim), + FFOrdered{ + embed_dim, }, }, DataType::FLOAT, @@ -184,72 +184,94 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallel shape inference") { auto make_q = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_q) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_q) { return lift_to_parallel_with_degrees( - input_q, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_q}); + input_q, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_q}); }; auto make_k = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_k) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_k) { return lift_to_parallel_with_degrees( - input_k, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_k}); + input_k, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_k}); }; auto make_v = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_v) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_v) { return lift_to_parallel_with_degrees( - input_v, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_v}); + input_v, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_v}); }; auto make_o = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_seq_len, - int o_o) { + nonnegative_int o_batch, + nonnegative_int o_seq_len, + nonnegative_int o_o) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_seq_len, o_o}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_seq_len, o_o}); }; - auto make_w = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_e, int o_h) { - return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_e, o_h}); - }; + auto make_w = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_e, + nonnegative_int o_h) { + return lift_to_parallel_with_degrees( + weights, o_sum, o_eq, FFOrdered{o_e, o_h}); + }; - auto make_input_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_in_proj_channel) { - return lift_to_parallel_with_degrees( - input_bias, o_sum, o_eq, FFOrdered{o_in_proj_channel}); - }; + auto make_input_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_in_proj_channel) { + return lift_to_parallel_with_degrees( + input_bias, + o_sum, + o_eq, + FFOrdered{o_in_proj_channel}); + }; - auto make_output_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_out_proj_channel) { - return lift_to_parallel_with_degrees( - output_bias, o_sum, o_eq, FFOrdered{o_out_proj_channel}); - }; + auto make_output_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_out_proj_channel) { + return lift_to_parallel_with_degrees( + output_bias, + o_sum, + o_eq, + FFOrdered{o_out_proj_channel}); + }; SUBCASE("data parallelism") { - int o_b = 4; + nonnegative_int o_b = 4_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{1_n}, 
o_b, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_o(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); CHECK(result == correct); } @@ -257,7 +279,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{o_b}, 1, 1); + make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, 1_n); CHECK(result == correct); } @@ -265,7 +287,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_b}, 1); + make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); CHECK(result == correct); } @@ -273,25 +295,25 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_b}, 1); + make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n); CHECK(result == correct); } } SUBCASE("attention head parallelism") { - int o_h = 2; + nonnegative_int o_h = 2_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{o_h}, 1, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1}, 1, 1, 1); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -299,7 +321,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{1}, 1, o_h); + make_w(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_h); CHECK(result == correct); } @@ -307,7 +329,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_h}, 1); + make_input_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); CHECK(result == correct); } @@ -315,26 +337,26 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_h}, 1); + make_output_bias(SumDegree{1_n}, DiscardCopyDegree{o_h}, 1_n); CHECK(result == correct); } } SUBCASE("combined data & attention head parallelism") { - int o_b = 4; - int o_h = 2; + nonnegative_int o_b = 4_n; + nonnegative_int o_h = 2_n; ParallelTensorShape q = - make_q(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_q(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); ParallelTensorShape k = - make_k(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_k(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); ParallelTensorShape v = - make_v(SumDegree{1}, DiscardCopyDegree{o_h}, o_b, 1, 1); + make_v(SumDegree{1_n}, DiscardCopyDegree{o_h}, o_b, 1_n, 1_n); SUBCASE("get_output_shape") { tl::expected result = get_output_shape(attrs, q, k, v); tl::expected correct = - make_o(SumDegree{o_h}, DiscardCopyDegree{1}, o_b, 1, 1); + make_o(SumDegree{o_h}, DiscardCopyDegree{1_n}, o_b, 
1_n, 1_n); CHECK(result == correct); } @@ -342,7 +364,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, q, k, v); tl::expected correct = - make_w(SumDegree{1}, DiscardCopyDegree{o_b}, 1, o_h); + make_w(SumDegree{1_n}, DiscardCopyDegree{o_b}, 1_n, o_h); CHECK(result == correct); } @@ -350,7 +372,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_input_bias_shape(attrs, q, k, v); tl::expected correct = - make_input_bias(SumDegree{1}, DiscardCopyDegree{o_b * o_h}, 1); + make_input_bias( + SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); CHECK(result == correct); } @@ -358,7 +381,8 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_bias_shape(attrs, q, k, v); tl::expected correct = - make_output_bias(SumDegree{1}, DiscardCopyDegree{o_b * o_h}, 1); + make_output_bias( + SumDegree{1_n}, DiscardCopyDegree{o_b * o_h}, 1_n); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc index 56a2e3fa52..27c59ee497 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_matmul.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(BatchMatmulAttrs, TensorShape)") { - size_t b = 4; - size_t m = 6; - size_t n = 8; - size_t p = 10; + nonnegative_int b = 4_n; + nonnegative_int m = 6_n; + nonnegative_int n = 8_n; + nonnegative_int p = 10_n; BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/0, // TODO figure out if these arguments are still - // relevant - /*b_seq_length_dim=*/0, + /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are + // still relevant + /*b_seq_length_dim=*/0_n, }; TensorShape input_lhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, m, @@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("valid") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, m, p, @@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct_output_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, n, p, @@ -60,8 +60,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched b") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ - b + 1, + FFOrdered{ + b + 1_n, m, p, }, @@ -78,9 +78,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched m") { TensorShape input_rhs_shape = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ b, - m + 1, + m + 1_n, p, }, }, @@ -95,27 +95,27 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_output_shape(BatchMatmulAttrs, ParallelTensorShape)") { - size_t b = 2 * 2; - int o_b = 2; - size_t m = 3 * 3; - int o_m = 3; - size_t n = 5 * 5; - int o_n = 5; - size_t p = 7 * 7; - int o_p = 7; - int o_sum = 11; + nonnegative_int b = 2_n * 2_n; + nonnegative_int o_b = 2_n; + nonnegative_int m = 3_n * 3_n; + nonnegative_int o_m = 3_n; + nonnegative_int n = 5_n * 5_n; + nonnegative_int o_n = 5_n; + nonnegative_int p = 7_n * 7_n; + nonnegative_int o_p = 7_n; + nonnegative_int o_sum = 11_n; BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/0, // TODO figure out if these arguments are still - // relevant - /*b_seq_length_dim=*/0, + /*a_seq_length_dim=*/0_n, // TODO figure out if these arguments are + // still relevant + /*b_seq_length_dim=*/0_n, }; auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_n, - int o_m) { + nonnegative_int o_b, + nonnegative_int o_n, + nonnegative_int o_m) { return ParallelTensorShape{ 
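// ---------------------------------------------------------------------------
// [editor's aside -- illustration only, not part of the patch]
// Paraphrase of the degree propagation these attention subcases assert:
// replicating q/k/v across o_h heads (discard-copy) surfaces as an o_h-way
// sum in the output, while the batch degree o_b stays a shard degree.
#include <cassert>
struct ReplicaDegrees {
  int sum;
  int discard_copy;
};
static ReplicaDegrees attention_output_replica_degrees(int o_h) {
  return ReplicaDegrees{/*sum=*/o_h, /*discard_copy=*/1};
}
int main() {
  ReplicaDegrees out = attention_output_replica_degrees(/*o_h=*/2);
  assert(out.sum == 2 && out.discard_copy == 1);
}
// ---------------------------------------------------------------------------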
ParallelTensorDims{ FFOrdered{ @@ -134,9 +134,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_m, - int o_p) { + nonnegative_int o_b, + nonnegative_int o_m, + nonnegative_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -155,9 +155,9 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_b, - int o_n, - int o_p) { + nonnegative_int o_b, + nonnegative_int o_n, + nonnegative_int o_p) { return ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -177,10 +177,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("data parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, o_b, 1, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, o_b, 1_n, 1_n); CHECK(result == correct); } @@ -188,10 +188,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, o_n, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_n}, 1, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, o_n, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -199,10 +199,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("p parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{o_p}, 1, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_p)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_p}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p)); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_p); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_p); CHECK(result == correct); } @@ -210,10 +210,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction parallel") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, o_m), - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, o_m, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, o_m), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, o_m, 1_n)); tl::expected correct = - make_output(SumDegree{o_m}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_m}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -221,10 +221,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction lhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_sum}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -232,10 +232,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("propagate reduction rhs") { tl::expected result = get_output_shape( attrs, - 
make_lhs(SumDegree{1}, DiscardCopyDegree{o_sum}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1)); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -243,10 +243,10 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, 1, 1)); - tl::expected correct = - make_output(SumDegree{o_sum * o_sum}, DiscardCopyDegree{1}, 1, 1, 1); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, 1_n, 1_n)); + tl::expected correct = make_output( + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } @@ -254,8 +254,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & rhs (invalid)") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + make_rhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); CHECK_MESSAGE( !result.has_value(), "Unexpected successful value: ", result); @@ -264,10 +264,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, o_n, 1), - make_rhs(SumDegree{1}, DiscardCopyDegree{o_sum * o_n}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n), + make_rhs( + SumDegree{1_n}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); tl::expected correct = - make_output(SumDegree{o_sum}, DiscardCopyDegree{1}, 1, o_n, 1); + make_output(SumDegree{o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -275,10 +276,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, o_n, 1), - make_rhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1, 1, 1)); + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, 1_n), + make_rhs( + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, 1_n, 1_n)); tl::expected correct = make_output( - SumDegree{o_sum * o_sum}, DiscardCopyDegree{1}, 1, o_n, 1); + SumDegree{o_sum * o_sum}, DiscardCopyDegree{1_n}, 1_n, o_n, 1_n); CHECK(result == correct); } @@ -286,11 +288,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("reduction lhs & reduction rhs & n & m") { tl::expected result = get_output_shape( attrs, - make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1, o_n, o_m), + make_lhs(SumDegree{o_sum}, DiscardCopyDegree{o_sum}, 1_n, o_n, o_m), make_rhs( - SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1, o_m, 1)); - tl::expected correct = make_output( - SumDegree{o_sum * o_sum * o_m}, DiscardCopyDegree{1}, 1, o_n, 1); + SumDegree{o_sum}, DiscardCopyDegree{o_sum * o_n}, 1_n, o_m, 1_n)); + tl::expected correct = + make_output(SumDegree{o_sum * o_sum * o_m}, + DiscardCopyDegree{1_n}, + 1_n, + o_n, + 1_n); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc 
b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc index 4196394d00..cd9796945c 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/batch_norm.cc @@ -60,11 +60,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, - 18, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, + 18_n, }}, DataType::FLOAT, }; @@ -72,8 +72,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 14, + TensorDims{FFOrdered{ + 14_n, }}, DataType::FLOAT, }; @@ -140,16 +140,16 @@ TEST_SUITE(FF_TEST_SUITE) { }(); SUBCASE("partition parallelism (in channel dim)") { - int degree = 2; + nonnegative_int degree = 2_n; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, degree, - 1, - 1, + 1_n, + 1_n, }, }; @@ -169,9 +169,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{degree}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -194,9 +194,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_parallel_dim_degrees(attrs_affine_true, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{degree}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{degree}, }; CHECK(result == correct); @@ -214,12 +214,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (not in channel dim)") { - int degree = 2; + nonnegative_int degree = 2_n; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, degree, 1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, degree, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -251,12 +251,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ sum_degree, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1, 1}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -288,12 +288,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, + SumDegree{1_n}, discard_copy_degree, - FFOrdered{1, 1, 1, 1}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; SUBCASE("get_output_parallel_dim_degrees(BatchNormAttrs, " @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 1}, - ShardParallelDim{14, 2}, - ShardParallelDim{16, 1}, - ShardParallelDim{18, 1}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{14_n, 2_n}, + ShardParallelDim{16_n, 1_n}, + ShardParallelDim{18_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -368,11 +368,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 2}, + ShardParallelDim{14_n, 2_n}, 
}, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -388,11 +388,11 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 2}, + ShardParallelDim{14_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/cast.cc b/lib/op-attrs/test/src/op-attrs/ops/cast.cc index c7395316ad..e9ec890b4b 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/cast.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/cast.cc @@ -12,15 +12,15 @@ TEST_SUITE(FF_TEST_SUITE) { CastAttrs attrs = CastAttrs{output_datatype}; - size_t d1 = 12; - size_t d2 = 16; + nonnegative_int d1 = 12_n; + nonnegative_int d2 = 16_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, input_datatype, }; TensorShape output = TensorShape{ - TensorDims{FFOrdered{d1, d2}}, + TensorDims{FFOrdered{d1, d2}}, output_datatype, }; @@ -34,24 +34,30 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_output_shape(CastAttrs, ParallelTensorShape)") { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_features) { + nonnegative_int o_batch, + nonnegative_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, + o_sum, + o_eq, + FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_outchannels) { + nonnegative_int o_batch, + nonnegative_int o_outchannels) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_outchannels}); }; - SumDegree sum_degree = SumDegree{2}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3}; - int batch_degree = 4; - int feature_degree = 8; + SumDegree sum_degree = SumDegree{2_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{3_n}; + nonnegative_int batch_degree = 4_n; + nonnegative_int feature_degree = 8_n; ParallelTensorShape par_input = make_input( sum_degree, discard_copy_degree, batch_degree, feature_degree); diff --git a/lib/op-attrs/test/src/op-attrs/ops/combine.cc b/lib/op-attrs/test/src/op-attrs/ops/combine.cc index 577961b7b1..14fbca5b3a 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/combine.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/combine.cc @@ -10,22 +10,22 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 3; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 3_n; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -44,8 +44,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 4; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 4_n; CombineAttrs attrs = CombineAttrs{ /*repartition_dim=*/dim, 
/*repartition_degree=*/degree, diff --git a/lib/op-attrs/test/src/op-attrs/ops/concat.cc b/lib/op-attrs/test/src/op-attrs/ops/concat.cc index 2d9842b1dd..b84cf38753 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/concat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/concat.cc @@ -23,12 +23,12 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); } - size_t dim0_size = 12; - size_t dim2_size = 20; + nonnegative_int dim0_size = 12_n; + nonnegative_int dim2_size = 20_n; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14, + 14_n, dim2_size, }}, DataType::FLOAT, @@ -45,26 +45,26 @@ TEST_SUITE(FF_TEST_SUITE) { } TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16, + 16_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, DataType::FLOAT, }; SUBCASE("input shapes do not shared the same num_dims") { TensorShape mismatched_num_dims = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 20, + 20_n, dim2_size, - 1, + 1_n, }}, DataType::FLOAT, }; @@ -101,9 +101,9 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(attrs, input_shapes); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14 + 16 + 18, + 14_n + 16_n + 18_n, dim2_size, }}, DataType::FLOAT, @@ -118,84 +118,97 @@ TEST_SUITE(FF_TEST_SUITE) { ff_dim_t{nonnegative_int{1}}, }; - size_t dim0_size = 12; - size_t dim2_size = 20; + nonnegative_int dim0_size = 12_n; + nonnegative_int dim2_size = 20_n; TensorShape input_shape1 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 14, + 14_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape2 = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ dim0_size, - 16, + 16_n, dim2_size, }}, DataType::FLOAT, }; TensorShape input_shape3 = TensorShape{ - TensorDims{FFOrdered{dim0_size, 18, dim2_size}}, + TensorDims{FFOrdered{dim0_size, 18_n, dim2_size}}, DataType::FLOAT, }; TensorShape output_shape = TensorShape{ - TensorDims{FFOrdered{dim0_size, 14 + 16 + 18, dim2_size}}, + TensorDims{FFOrdered{ + dim0_size, 14_n + 16_n + 18_n, dim2_size}}, DataType::FLOAT, }; - auto lift_input1 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input1 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input_shape1, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_input2 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input2 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input_shape2, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_input3 = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_input3 = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( 
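// ---------------------------------------------------------------------------
// [editor's aside -- illustration only, not part of the patch]
// The concat rule these tests exercise: sizes along the concat axis add up,
// and every other dim must match. Checking the 14 + 16 + 18 case above:
#include <cassert>
#include <vector>
static int concat_axis_size(std::vector<int> const &axis_sizes) {
  int total = 0;
  for (int s : axis_sizes) {
    total += s;
  }
  return total;
}
int main() {
  assert(concat_axis_size({14, 16, 18}) == 48); // matches 14_n + 16_n + 18_n
}
// ---------------------------------------------------------------------------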
+ input_shape3, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto lift_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto lift_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output_shape, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("sum reduction parallelism") { SUBCASE("matching") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; std::vector inputs = { - lift_input1(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(sum_degree, DiscardCopyDegree{1}, 1, 1, 1), + lift_input1(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + lift_output(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{2}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(SumDegree{4}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(SumDegree{4}, DiscardCopyDegree{1}, 1, 1, 1), + lift_input1(SumDegree{2_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{4_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), }; std::optional result = @@ -208,27 +221,27 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("discard copy reduction parallelism") { SUBCASE("matching") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; std::vector inputs = { - lift_input1(SumDegree{1}, discard_copy_degree, 1, 1, 1), - lift_input2(SumDegree{1}, discard_copy_degree, 1, 1, 1), - lift_input3(SumDegree{1}, discard_copy_degree, 1, 1, 1), + lift_input1(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(SumDegree{1}, discard_copy_degree, 1, 1, 1); + lift_output(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{2}, 1, 1, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{2}, 1, 1, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{4}, 1, 1, 1), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{2_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{4_n}, 1_n, 1_n, 1_n), }; std::optional result = @@ -241,12 +254,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in axis dim") { SUBCASE("matching") { - int degree = 2; + nonnegative_int degree = 2_n; std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1), + lift_input1( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + lift_input2( + SumDegree{1_n}, 
DiscardCopyDegree{1_n}, 1_n, degree, 1_n), + lift_input3( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n), }; std::optional result = @@ -258,9 +274,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, 1), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 1, 2, 1), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 2_n, 1_n), }; std::optional result = @@ -273,31 +289,31 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("parallelism in non-axis shard dims") { SUBCASE("matching") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; std::vector inputs = { lift_input1( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), lift_input2( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), lift_input3( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2), + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = lift_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("not matching") { std::vector inputs = { - lift_input1(SumDegree{1}, DiscardCopyDegree{1}, 2, 1, 4), - lift_input2(SumDegree{1}, DiscardCopyDegree{1}, 4, 1, 2), - lift_input3(SumDegree{1}, DiscardCopyDegree{1}, 4, 1, 2), + lift_input1(SumDegree{1_n}, DiscardCopyDegree{1_n}, 2_n, 1_n, 4_n), + lift_input2(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), + lift_input3(SumDegree{1_n}, DiscardCopyDegree{1_n}, 4_n, 1_n, 2_n), }; std::optional result = @@ -309,21 +325,21 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("parallelism degrees are not mutually exclusive") { - SumDegree sum_degree = SumDegree{3}; - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5}; - int degree0 = 2; - int degree2 = 4; + SumDegree sum_degree = SumDegree{3_n}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{5_n}; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; std::vector inputs = { - lift_input1(sum_degree, discard_copy_degree, degree0, 1, degree2), - lift_input2(sum_degree, discard_copy_degree, degree0, 1, degree2), - lift_input3(sum_degree, discard_copy_degree, degree0, 1, degree2), + lift_input1(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input2(sum_degree, discard_copy_degree, degree0, 1_n, degree2), + lift_input3(sum_degree, discard_copy_degree, degree0, 1_n, degree2), }; tl::expected result = get_output_shape(attrs, inputs); tl::expected correct = - lift_output(sum_degree, discard_copy_degree, degree0, 1, degree2); + lift_output(sum_degree, discard_copy_degree, degree0, 1_n, degree2); CHECK(result == correct); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc index 7abb98f3e3..f5006d4352 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/conv_2d.cc @@ -7,14 +7,14 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_conv2d_incoming_tensor_roles(Conv2DAttrs") { auto make_attrs = 
[](bool use_bias) { - return Conv2DAttrs{/*out_channels=*/4, - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, - /*groups=*/1, + return Conv2DAttrs{/*out_channels=*/4_n, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, + /*groups=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/use_bias}; }; @@ -48,14 +48,14 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Conv2D shape inference") { - int out_channels = 4; - int kernel_h = 3; - int kernel_w = 2; - int stride_h = 2; - int stride_w = 2; - int padding_h = 1; - int padding_w = 1; - int groups = 1; + nonnegative_int out_channels = 4_n; + nonnegative_int kernel_h = 3_n; + nonnegative_int kernel_w = 2_n; + nonnegative_int stride_h = 2_n; + nonnegative_int stride_w = 2_n; + nonnegative_int padding_h = 1_n; + nonnegative_int padding_w = 1_n; + nonnegative_int groups = 1_n; std::optional activation = std::nullopt; bool use_bias = true; @@ -72,13 +72,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*use_bias=*/true, }; - size_t num_samples = 7; - size_t input_channels = 4; - size_t input_height = 11; - size_t input_width = 15; + nonnegative_int num_samples = 7_n; + nonnegative_int input_channels = 4_n; + nonnegative_int input_height = 11_n; + nonnegative_int input_width = 15_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, input_channels, input_height, @@ -87,13 +87,13 @@ TEST_SUITE(FF_TEST_SUITE) { DataType::FLOAT, }; - size_t output_height = 6; - size_t output_width = 8; + nonnegative_int output_height = 6_n; + nonnegative_int output_width = 8_n; TensorShape output = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ num_samples, - size_t_from_int(out_channels), + out_channels, output_height, output_width, }}, @@ -101,18 +101,18 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape kernel = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(out_channels), + TensorDims{FFOrdered{ + out_channels, input_channels, - size_t_from_int(kernel_h), - size_t_from_int(kernel_w), + kernel_h, + kernel_w, }}, DataType::FLOAT, }; TensorShape bias = TensorShape{ - TensorDims{FFOrdered{ - size_t_from_int(out_channels), + TensorDims{FFOrdered{ + out_channels, }}, DataType::FLOAT, }; @@ -137,147 +137,149 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_n, - int o_c, - int o_h, - int o_w) { + nonnegative_int o_n, + nonnegative_int o_c, + nonnegative_int o_h, + nonnegative_int o_w) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + input, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_n, - int o_c, - int o_h, - int o_w) { + nonnegative_int o_n, + nonnegative_int o_c, + nonnegative_int o_h, + nonnegative_int o_w) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); + output, o_sum, o_eq, FFOrdered{o_n, o_c, o_h, o_w}); }; auto make_kernel = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_outchannels, - int o_inchannels, - int o_kernel_h, - int o_kernel_w) { + nonnegative_int o_outchannels, + nonnegative_int o_inchannels, + nonnegative_int o_kernel_h, + nonnegative_int o_kernel_w) { return lift_to_parallel_with_degrees( kernel, o_sum, o_eq, - FFOrdered{o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); + FFOrdered{ + o_outchannels, o_inchannels, o_kernel_h, o_kernel_w}); }; - auto 
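// ---------------------------------------------------------------------------
// [editor's aside -- illustration only, not part of the patch]
// Worked check of the conv2d numbers in this test, using the usual
// output-size rule (the same formula as calculate_output_size in pool_2d.cc):
#include <cassert>
static int conv_output_size(int in, int pad, int kernel, int stride) {
  return (in + 2 * pad - kernel) / stride + 1;
}
int main() {
  assert(conv_output_size(11, /*pad=*/1, /*kernel=*/3, /*stride=*/2) == 6); // output_height
  assert(conv_output_size(15, /*pad=*/1, /*kernel=*/2, /*stride=*/2) == 8); // output_width
}
// ---------------------------------------------------------------------------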
make_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_outchannels) { - return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannels}); - }; + auto make_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_outchannels) { + return lift_to_parallel_with_degrees( + bias, o_sum, o_eq, FFOrdered{o_outchannels}); + }; SUBCASE("data parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1}, DiscardCopyDegree{degree}, 1); + make_bias(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n); CHECK(result == correct); } } SUBCASE("input channel parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1}, 1); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = 
get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{1}, DiscardCopyDegree{1}, degree); + make_bias(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree); CHECK(result == correct); } } SUBCASE("propagating sum degree") { - int degree = 2; - ParallelTensorShape par_input = - make_input(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + nonnegative_int degree = 2_n; + ParallelTensorShape par_input = make_input( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape") { ParallelTensorShape result = get_output_shape(attrs, par_input); - ParallelTensorShape correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1, 1); + ParallelTensorShape correct = make_output( + SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_kernel_shape") { ParallelTensorShape result = get_kernel_shape(attrs, par_input); - ParallelTensorShape correct = - make_kernel(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1, 1); + ParallelTensorShape correct = make_kernel( + SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("get_bias_shape") { ParallelTensorShape result = get_bias_shape(attrs, par_input); ParallelTensorShape correct = - make_bias(SumDegree{degree}, DiscardCopyDegree{1}, 1); + make_bias(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc index 7580de24e5..e1a03a7613 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/dropout.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/dropout.cc @@ -15,10 +15,10 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; @@ -36,48 +36,54 @@ TEST_SUITE(FF_TEST_SUITE) { }; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; TensorShape output = input; - auto make_input = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto make_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("partition parallelism (allowed)") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + ParallelTensorShape 
par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); tl::expected result = get_output_shape(attrs, par_input); - tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + tl::expected correct = make_output( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("sum parallelism (not allowed)") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -87,10 +93,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (not allowed)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorShape par_input = - make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc index d5aab55cb2..d6a92036f0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_binary.cc @@ -7,9 +7,9 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("EWAdd shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -20,7 +20,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape input_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -41,7 +41,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("mismatched dim size") { TensorShape incorrect_rhs = input_lhs; - dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1; + dim_at_idx(incorrect_rhs, relative_ff_dim_t{0}) += 1_n; tl::expected result = get_output_shape(attrs, input_lhs, incorrect_rhs); @@ -53,9 +53,9 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("EWAdd parallel shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementBinaryAttrs attrs = ElementBinaryAttrs{ OperatorType::EW_ADD, @@ -66,7 +66,7 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape unpar_lhs = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -83,68 +83,68 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_lhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return lift_to_parallel_with_degrees( - unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_lhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_rhs = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return lift_to_parallel_with_degrees( - unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_rhs, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return 
lift_to_parallel_with_degrees( - unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + unpar_output, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("data parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n, 1_n); CHECK(result == correct); } SUBCASE("reduction parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_lhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_rhs(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1); + make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); CHECK(result == correct); } SUBCASE("invalid discard copy parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); @@ -154,12 +154,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid mismatched parallelism degrees") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input_lhs = - make_lhs(SumDegree{1}, DiscardCopyDegree{1}, 1, degree, 1); + make_lhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree, 1_n); ParallelTensorShape input_rhs = - make_rhs(SumDegree{1}, DiscardCopyDegree{1}, 1, 1, degree); + make_rhs(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, 1_n, degree); tl::expected result = get_output_shape(attrs, input_lhs, input_rhs); diff --git a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc index 94c382356e..bac6efba3f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/element_unary.cc @@ -7,16 +7,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ReLU shape inference") { - size_t d1 = 16; - size_t d2 = 32; - size_t d3 = 24; + nonnegative_int d1 = 16_n; + nonnegative_int d2 = 32_n; + nonnegative_int d3 = 24_n; ElementUnaryAttrs attrs = ElementUnaryAttrs{OperatorType::RELU, std::nullopt}; TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ d1, d2, d3, @@ -31,20 +31,20 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(result == correct); - auto make_i = [&](SumDegree o_sum, - DiscardCopyDegree o_eq, - int o_1, - int o_2, - int o_3) { + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_1, + nonnegative_int o_2, + nonnegative_int o_3) { return 
lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); + input, o_sum, o_eq, FFOrdered{o_1, o_2, o_3}); }; SUBCASE("partition i.e., sharding parallelism") { - int degree1 = 4; - int degree2 = 8; - ParallelTensorShape par_input = - make_i(SumDegree{1}, DiscardCopyDegree{1}, degree1, 1, degree2); + nonnegative_int degree1 = 4_n; + nonnegative_int degree2 = 8_n; + ParallelTensorShape par_input = make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree1, 1_n, degree2); tl::expected result = get_output_shape(attrs, par_input); @@ -54,10 +54,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum degree > 1") { - int degree = 2; + nonnegative_int degree = 2_n; tl::expected result = get_output_shape( - attrs, make_i(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1, 1)); + attrs, + make_input(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", @@ -65,10 +66,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy degree > 1") { - int degree = 2; + nonnegative_int degree = 2_n; tl::expected result = get_output_shape( - attrs, make_i(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1, 1)); + attrs, + make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n, 1_n)); CHECK_MESSAGE(!result.has_value(), "Unexpected successful result: ", diff --git a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc index 134737f6c0..8fe50a4217 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/embedding.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/embedding.cc @@ -8,8 +8,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Sum embedding shape inference") { - int out_channels = 128; - int num_entries = 1024; + nonnegative_int out_channels = 128_n; + nonnegative_int num_entries = 1024_n; EmbeddingAttrs attrs = EmbeddingAttrs{ /*num_entries=*/num_entries, /*out_channels=*/out_channels, @@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) { /*data_type=*/DataType::FLOAT, }; - size_t batch_size = 48; - size_t features_dim = 56; + nonnegative_int batch_size = 48_n; + nonnegative_int features_dim = 56_n; TensorShape input = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ batch_size, features_dim, }}, @@ -30,9 +30,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape weights = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(num_entries), - size_t_from_int(out_channels), + FFOrdered{ + num_entries, + out_channels, }, }, DataType::FLOAT, @@ -66,38 +66,44 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_features) { + nonnegative_int o_batch, + nonnegative_int o_features) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_features}); + input, o_sum, o_eq, FFOrdered{o_batch, o_features}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_outchannels) { + nonnegative_int o_batch, + nonnegative_int o_outchannels) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_outchannels}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_outchannels}); }; auto make_weights = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_entries, - int o_outchannels) { + nonnegative_int o_entries, + nonnegative_int 
o_outchannels) { return lift_to_parallel_with_degrees( - weights, o_sum, o_eq, FFOrdered{o_entries, o_outchannels}); + weights, + o_sum, + o_eq, + FFOrdered{o_entries, o_outchannels}); }; SUBCASE("data parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, degree, 1); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, degree, 1_n); CHECK(result == correct); } @@ -105,21 +111,21 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, par_input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); CHECK(result == correct); } } SUBCASE("input features parallelism") { - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{degree}, DiscardCopyDegree{1}, 1, 1); + make_output(SumDegree{degree}, DiscardCopyDegree{1_n}, 1_n, 1_n); CHECK(result == correct); } @@ -127,7 +133,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_weights(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); CHECK(result == correct); } } @@ -139,15 +145,15 @@ TEST_SUITE(FF_TEST_SUITE) { // dimension. 
For now we choose to represent parallelism in the channel // dimension, but partitioning in the entry dimension is also potentially // useful as it produces sum parallelism in the output - int degree = 4; + nonnegative_int degree = 4_n; ParallelTensorShape input = - make_input(SumDegree{1}, DiscardCopyDegree{degree}, 1, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{degree}, 1_n, 1_n); { tl::expected result = get_output_shape(attrs, input); tl::expected correct = - make_output(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_output(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); CHECK(result == correct); } @@ -155,7 +161,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_weights_shape(attrs, input); tl::expected correct = - make_weights(SumDegree{1}, DiscardCopyDegree{1}, 1, degree); + make_weights(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/flat.cc b/lib/op-attrs/test/src/op-attrs/ops/flat.cc index 8998dfaffd..ebd869b3e5 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/flat.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/flat.cc @@ -9,25 +9,25 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(FlatAttrs, TensorShape)") { TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4, - 2, - 3, + TensorDims{FFOrdered{ + 2_n, + 4_n, + 2_n, + 3_n, }}, DataType::FLOAT, }; SUBCASE("flatten all dims") { FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{0}}, - /*end_dim=*/ff_dim_t{nonnegative_int{4}}, + /*start_dim=*/ff_dim_t{0_n}, + /*end_dim=*/ff_dim_t{4_n}, }; TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2 * 4 * 2 * 3, + TensorDims{FFOrdered{ + 2_n * 4_n * 2_n * 3_n, }}, DataType::FLOAT, }; @@ -43,10 +43,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4, - 2 * 3, + TensorDims{FFOrdered{ + 2_n, + 4_n, + 2_n * 3_n, }}, DataType::FLOAT, }; @@ -62,10 +62,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2 * 4, - 2, - 3, + TensorDims{FFOrdered{ + 2_n * 4_n, + 2_n, + 3_n, }}, DataType::FLOAT, }; @@ -81,10 +81,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape result = get_output_shape(attrs, input_shape); TensorShape correct = TensorShape{ - TensorDims{FFOrdered{ - 2, - 4 * 2, - 3, + TensorDims{FFOrdered{ + 2_n, + 4_n * 2_n, + 3_n, }}, DataType::FLOAT, }; @@ -124,18 +124,18 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("allows shard parallelism in non-flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 1, 3}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 1_n, 3_n}, }; tl::expected result = get_output_parallel_dim_degrees(attrs, input); tl::expected correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 3}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 3_n}, }; CHECK(result == correct); @@ -143,9 +143,9 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("does not allow shard parallelism in flattened dims") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 2, 1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 2_n, 1_n}, }; 
std::optional result = @@ -157,18 +157,18 @@ SUBCASE("allows sum parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1, 1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{1, 1, 1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{1_n, 1_n, 1_n}, }; CHECK(result == correct); @@ -176,18 +176,18 @@ SUBCASE("allows discard copy parallelism") { ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{2}, - FFOrdered{1, 1, 1, 1}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, + FFOrdered{1_n, 1_n, 1_n, 1_n}, }; std::optional result = optional_from_expected(get_output_parallel_dim_degrees(attrs, input)); std::optional correct = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{2}, - FFOrdered{1, 1, 1}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, + FFOrdered{1_n, 1_n, 1_n}, }; CHECK(result == correct); @@ -203,22 +203,22 @@ ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4, 2}, - ShardParallelDim{8, 1}, - ShardParallelDim{6, 1}, - ShardParallelDim{9, 3}, + ShardParallelDim{4_n, 2_n}, + ShardParallelDim{8_n, 1_n}, + ShardParallelDim{6_n, 1_n}, + ShardParallelDim{9_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{7}, - DiscardCopyDegree{5}, + SumDegree{7_n}, + DiscardCopyDegree{5_n}, }, }, DataType::FLOAT, }; FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t{nonnegative_int{1}}, - /*end_dim=*/ff_dim_t{nonnegative_int{3}}, + /*start_dim=*/ff_dim_t{1_n}, + /*end_dim=*/ff_dim_t{3_n}, }; tl::expected result = @@ -227,13 +227,13 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{4, 2}, - ShardParallelDim{8 * 6, 1}, - ShardParallelDim{9, 3}, + ShardParallelDim{4_n, 2_n}, + ShardParallelDim{8_n * 6_n, 1_n}, + ShardParallelDim{9_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{7}, - DiscardCopyDegree{5}, + SumDegree{7_n}, + DiscardCopyDegree{5_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc index b9426a89a2..b9aa3c0677 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/layer_norm.cc @@ -58,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) { }(); TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, - 18, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, + 18_n, }}, DataType::FLOAT, }; @@ -70,9 +70,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = input; TensorShape gamma = TensorShape{ - TensorDims{FFOrdered{ - 12, - 16, + TensorDims{FFOrdered{ + 12_n, + 16_n, }}, DataType::FLOAT, }; @@ -125,49 +125,58 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o0, - int o1, - int o2, - int o3) { + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2, + nonnegative_int o3) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + input, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o0, - int o1, - int o2, - int o3) { +
nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2, + nonnegative_int o3) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); + output, o_sum, o_eq, FFOrdered{o0, o1, o2, o3}); }; - auto make_gamma_weights = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o2) { - return lift_to_parallel_with_degrees( - gamma, o_sum, o_eq, FFOrdered{o0, o2}); - }; + auto make_gamma_weights = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + gamma, o_sum, o_eq, FFOrdered{o0, o2}); + }; - auto make_beta_weights = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o2) { - return lift_to_parallel_with_degrees( - beta, o_sum, o_eq, FFOrdered{o0, o2}); - }; + auto make_beta_weights = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + beta, o_sum, o_eq, FFOrdered{o0, o2}); + }; SUBCASE("parallel shape inference (LayerNorm)") { SUBCASE("partition parallelism (not in axes)") { - int degree0 = 2; - int degree2 = 3; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 3_n; ParallelTensorShape par_input = make_input( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2, 1); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { tl::expected result = get_output_shape(attrs_affine_true, par_input); - tl::expected correct = make_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2, 1); + tl::expected correct = + make_output(SumDegree{1_n}, + DiscardCopyDegree{1_n}, + degree0, + 1_n, + degree2, + 1_n); CHECK(result == correct); } @@ -179,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_gamma_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_gamma_weights( - SumDegree{1}, DiscardCopyDegree{1}, degree0, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); CHECK(result == correct); } @@ -199,7 +208,7 @@ TEST_SUITE(FF_TEST_SUITE) { get_beta_weights_shape(attrs_affine_true, par_input); tl::expected correct = make_beta_weights( - SumDegree{1}, DiscardCopyDegree{1}, degree0, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, degree2); CHECK(result == correct); } @@ -215,11 +224,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallelism (in axes)") { - int degree1 = 2; - int degree2 = 4; + nonnegative_int degree1 = 2_n; + nonnegative_int degree2 = 4_n; ParallelTensorShape par_input = make_input( - SumDegree{1}, DiscardCopyDegree{1}, 1, degree1, degree2, 1); + SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, degree2, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -248,10 +257,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( @@ -280,10 +289,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; ParallelTensorShape par_input = - 
make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n, 1_n); SUBCASE("get_output_shape(LayerNormAttrs, ParallelTensorShape)") { std::optional result = optional_from_expected( diff --git a/lib/op-attrs/test/src/op-attrs/ops/linear.cc b/lib/op-attrs/test/src/op-attrs/ops/linear.cc index 191515b062..eaa99ef099 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/linear.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/linear.cc @@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_linear_incoming_tensor_roles(LinearAttrs)") { auto make_attrs = [](bool use_bias) { return LinearAttrs{ - /*out_channels=*/16, + /*out_channels=*/16_n, /*use_bias=*/use_bias, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -47,7 +47,7 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("Linear shape inference") { - int out_channels = 16; + nonnegative_int out_channels = 16_n; LinearAttrs attrs = LinearAttrs{ /*out_channels=*/out_channels, /*use_bias=*/true, @@ -56,13 +56,13 @@ TEST_SUITE(FF_TEST_SUITE) { /*regularizer=*/std::nullopt, }; - size_t batch_size = 12; - size_t extra_dim = 16; - size_t in_channels = 8; + nonnegative_int batch_size = 12_n; + nonnegative_int extra_dim = 16_n; + nonnegative_int in_channels = 8_n; TensorShape input = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, in_channels, @@ -73,10 +73,10 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape output = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ batch_size, extra_dim, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -84,9 +84,9 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape projection = TensorShape{ TensorDims{ - FFOrdered{ + FFOrdered{ in_channels, - size_t_from_int(out_channels), + out_channels, }, }, DataType::FLOAT, @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { TensorShape bias = TensorShape{ TensorDims{ - FFOrdered{ - size_t_from_int(out_channels), + FFOrdered{ + out_channels, }, }, DataType::FLOAT, @@ -127,56 +127,66 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_input = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_extra_dim, - int o_channel) { + nonnegative_int o_batch, + nonnegative_int o_extra_dim, + nonnegative_int o_channel) { return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); + input, + o_sum, + o_eq, + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_output = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_batch, - int o_extra_dim, - int o_channel) { + nonnegative_int o_batch, + nonnegative_int o_extra_dim, + nonnegative_int o_channel) { return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o_batch, o_extra_dim, o_channel}); + output, + o_sum, + o_eq, + FFOrdered{o_batch, o_extra_dim, o_channel}); }; auto make_projection = [&](SumDegree o_sum, DiscardCopyDegree o_eq, - int o_inchannel, - int o_outchannel) { + nonnegative_int o_inchannel, + nonnegative_int o_outchannel) { return lift_to_parallel_with_degrees( - projection, o_sum, o_eq, FFOrdered{o_inchannel, o_outchannel}); + projection, + o_sum, + o_eq, + FFOrdered{o_inchannel, o_outchannel}); }; - auto make_bias = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o_outchannel) { - return lift_to_parallel_with_degrees( - bias, o_sum, o_eq, FFOrdered{o_outchannel}); - }; + auto make_bias = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o_outchannel) { + return lift_to_parallel_with_degrees( + bias, o_sum, o_eq, FFOrdered{o_outchannel}); + }; 
SUBCASE("data parallelism") { - int input_sum_degree = 2; - int extra_dim_degree = 8; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int extra_dim_degree = 8_n; + nonnegative_int degree = 4_n; ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, - DiscardCopyDegree{1}, + DiscardCopyDegree{1_n}, degree, extra_dim_degree, - 1); + 1_n); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree}, - DiscardCopyDegree{1}, + DiscardCopyDegree{1_n}, degree, extra_dim_degree, - 1); + 1_n); CHECK(result == correct); } @@ -185,10 +195,10 @@ TEST_SUITE(FF_TEST_SUITE) { get_projection_shape(attrs, par_input); tl::expected correct = make_projection( - SumDegree{1}, + SumDegree{1_n}, DiscardCopyDegree{input_sum_degree * degree * extra_dim_degree}, - 1, - 1); + 1_n, + 1_n); CHECK(result == correct); } @@ -198,27 +208,30 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected correct = make_bias(SumDegree{input_sum_degree}, DiscardCopyDegree{degree * extra_dim_degree}, - 1); + 1_n); CHECK(result == correct); } } SUBCASE("reduction parallelism") { - int input_sum_degree = 2; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int degree = 4_n; - ParallelTensorShape par_input = make_input( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, 1, 1, degree); + ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + degree); { tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output(SumDegree{input_sum_degree * degree}, - DiscardCopyDegree{1}, - 1, - 1, - 1); + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + 1_n); CHECK(result == correct); } @@ -226,8 +239,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection( - SumDegree{1}, DiscardCopyDegree{input_sum_degree}, degree, 1); + make_projection(SumDegree{1_n}, + DiscardCopyDegree{input_sum_degree}, + degree, + 1_n); CHECK(result == correct); } @@ -235,23 +250,30 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); tl::expected correct = make_bias( - SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1}, 1); + SumDegree{input_sum_degree * degree}, DiscardCopyDegree{1_n}, 1_n); CHECK(result == correct); } } SUBCASE("output channel parallelism") { - int input_sum_degree = 2; - int degree = 4; + nonnegative_int input_sum_degree = 2_n; + nonnegative_int degree = 4_n; - ParallelTensorShape par_input = make_input( - SumDegree{input_sum_degree}, DiscardCopyDegree{degree}, 1, 1, 1); + ParallelTensorShape par_input = make_input(SumDegree{input_sum_degree}, + DiscardCopyDegree{degree}, + 1_n, + 1_n, + 1_n); { tl::expected result = get_output_shape(attrs, par_input); - tl::expected correct = make_output( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, 1, 1, degree); + tl::expected correct = + make_output(SumDegree{input_sum_degree}, + DiscardCopyDegree{1_n}, + 1_n, + 1_n, + degree); CHECK(result == correct); } @@ -259,8 +281,10 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_projection_shape(attrs, par_input); tl::expected correct = - make_projection( - SumDegree{1}, DiscardCopyDegree{input_sum_degree}, 1, degree); + make_projection(SumDegree{1_n}, + DiscardCopyDegree{input_sum_degree}, + 1_n, + degree); CHECK(result == correct); } @@ -268,7 +292,7 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_bias_shape(attrs, par_input); 
tl::expected correct = make_bias( - SumDegree{input_sum_degree}, DiscardCopyDegree{1}, degree); + SumDegree{input_sum_degree}, DiscardCopyDegree{1_n}, degree); CHECK(result == correct); } } diff --git a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc index 0c14c0fc2a..6c14a226a2 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/pool_2d.cc @@ -9,27 +9,27 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("make_adaptive_pool2d") { - size_t input_n = 10; - size_t input_c = 11; - size_t input_h = 15; - size_t input_w = 20; + nonnegative_int input_n = 10_n; + nonnegative_int input_c = 11_n; + nonnegative_int input_h = 15_n; + nonnegative_int input_w = 20_n; Activation activation = Activation::RELU; PoolOp op = PoolOp::AVG; - TensorDims input_dims = - TensorDims{FFOrdered{input_n, input_c, input_h, input_w}}; + TensorDims input_dims = TensorDims{ + FFOrdered{input_n, input_c, input_h, input_w}}; SUBCASE("input_h divisible by output_h && input_w divisible by output_w") { - int output_h = 5; - int output_w = 2; + nonnegative_int output_h = 5_n; + nonnegative_int output_w = 2_n; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/10, - /*stride_h=*/3, - /*stride_w=*/10, - /*padding_h=*/0, - /*padding_w=*/0, + /*kernel_h=*/3_n, + /*kernel_w=*/10_n, + /*stride_h=*/3_n, + /*stride_w=*/10_n, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/op, /*activation=*/activation, }; @@ -50,11 +50,11 @@ TEST_SUITE(FF_TEST_SUITE) { tl::expected result = get_output_shape(correct_attrs, input_shape); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{ + TensorDims{FFOrdered{ input_n, input_c, - size_t_from_int(output_h), - size_t_from_int(output_w), + output_h, + output_w, }}, DataType::FLOAT, }; @@ -64,8 +64,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h not divisible by output_h") { - int output_h = 6; - int output_w = 2; + nonnegative_int output_h = 6_n; + nonnegative_int output_w = 2_n; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -76,8 +76,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_w not divisible by output_w") { - int output_h = 5; - int output_w = 3; + nonnegative_int output_h = 5_n; + nonnegative_int output_w = 3_n; std::optional result = optional_from_expected(make_adaptive_pool2d_attrs( @@ -88,16 +88,16 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input_h == output_h and input_w == output_w") { - int output_h = input_h; - int output_w = input_w; + nonnegative_int output_h = input_h; + nonnegative_int output_w = input_w; Pool2DAttrs correct_attrs = Pool2DAttrs{ - /*kernel_h=*/1, - /*kernel_w=*/1, - /*stride_h=*/1, - /*stride_w=*/1, - /*padding_h=*/0, - /*padding_w=*/0, + /*kernel_h=*/1_n, + /*kernel_w=*/1_n, + /*stride_h=*/1_n, + /*stride_w=*/1_n, + /*padding_h=*/0_n, + /*padding_w=*/0_n, /*pool_type=*/op, /*activation=*/activation, }; @@ -126,22 +126,22 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(Pool2DAttrs, TensorShape)") { Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, /*activation=*/std::nullopt, }; SUBCASE("fails on non-4d inputs") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, - 14, + TensorDims{FFOrdered{ + 10_n, + 12_n, + 14_n, }}, 
DataType::FLOAT, }; @@ -155,14 +155,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("4d input") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{11, 13, 12, 6}}, + TensorDims{FFOrdered{11_n, 13_n, 12_n, 6_n}}, DataType::FLOAT, }; tl::expected result = get_output_shape(attrs, input); tl::expected correct = TensorShape{ - TensorDims{FFOrdered{11, 13, 6, 4}}, + TensorDims{FFOrdered{11_n, 13_n, 6_n, 4_n}}, DataType::FLOAT, }; @@ -175,12 +175,12 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_attrs = [](PoolOp pool_type, std::optional const &activation) { return Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/pool_type, /*activation=*/activation, }; @@ -190,13 +190,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 4, - 1, - 1, - 1, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 4_n, + 1_n, + 1_n, + 1_n, }, }; @@ -211,13 +211,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{ - 4, - 2, - 5, - 6, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 4_n, + 2_n, + 5_n, + 6_n, }, }; @@ -232,13 +232,13 @@ TEST_SUITE(FF_TEST_SUITE) { Pool2DAttrs attrs = make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{1}, - DiscardCopyDegree{3}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{1_n}, + DiscardCopyDegree{3_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -256,13 +256,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::MAX, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -279,13 +279,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/std::nullopt); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -302,13 +302,13 @@ TEST_SUITE(FF_TEST_SUITE) { make_attrs(PoolOp::AVG, /*activation=*/Activation::RELU); ParallelTensorDimDegrees input = ParallelTensorDimDegrees{ - SumDegree{2}, - DiscardCopyDegree{1}, - FFOrdered{ - 1, - 1, - 1, - 1, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, + FFOrdered{ + 1_n, + 1_n, + 1_n, + 1_n, }, }; @@ -326,12 +326,12 @@ TEST_SUITE(FF_TEST_SUITE) { // just do a single test to make sure it works/exists Pool2DAttrs attrs = Pool2DAttrs{ - /*kernel_h=*/3, - /*kernel_w=*/2, - /*stride_h=*/2, - /*stride_w=*/2, - /*padding_h=*/1, - /*padding_w=*/1, + /*kernel_h=*/3_n, + /*kernel_w=*/2_n, + /*stride_h=*/2_n, + /*stride_w=*/2_n, + /*padding_h=*/1_n, + /*padding_w=*/1_n, /*pool_type=*/PoolOp::MAX, /*activation=*/std::nullopt, }; @@ -340,14 +340,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 7}, - ShardParallelDim{16, 8}, - ShardParallelDim{12, 3}, - ShardParallelDim{6, 2}, + 
ShardParallelDim{14_n, 7_n}, + ShardParallelDim{16_n, 8_n}, + ShardParallelDim{12_n, 3_n}, + ShardParallelDim{6_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{2}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -359,14 +359,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 7}, - ShardParallelDim{16, 8}, - ShardParallelDim{6, 3}, - ShardParallelDim{4, 2}, + ShardParallelDim{14_n, 7_n}, + ShardParallelDim{16_n, 8_n}, + ShardParallelDim{6_n, 3_n}, + ShardParallelDim{4_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{2}, + SumDegree{1_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -377,14 +377,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{14, 1}, - ShardParallelDim{16, 1}, - ShardParallelDim{12, 1}, - ShardParallelDim{6, 1}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 1_n}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{6_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{2}, - DiscardCopyDegree{1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc index 0d1c8bdf98..dc12eb12a8 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/reduction.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/reduction.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, }; SUBCASE("valid") { - int degree = 3; + nonnegative_int degree = 3_n; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; @@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("invalid") { - int degree = 4; + nonnegative_int degree = 4_n; ReductionAttrs attrs = ReductionAttrs{ /*repartition_degree=*/degree, }; diff --git a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc index ba213f54f4..36a265ce9f 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/repartition.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/repartition.cc @@ -6,8 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Repartition shape inference") { - ff_dim_t dim = ff_dim_t{nonnegative_int{2}}; - int degree = 4; + ff_dim_t dim = ff_dim_t{2_n}; + nonnegative_int degree = 4_n; RepartitionAttrs attrs = RepartitionAttrs{ /*repartition_dim=*/dim, /*repartition_degree=*/degree, @@ -16,14 +16,14 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{14, 1}, - ShardParallelDim{16, 3}, - ShardParallelDim{18, 2}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{14_n, 1_n}, + ShardParallelDim{16_n, 3_n}, + ShardParallelDim{18_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, diff --git a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc index 60a1018479..770ae20d38 100644 --- 
a/lib/op-attrs/test/src/op-attrs/ops/replicate.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/replicate.cc @@ -6,20 +6,20 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("Replicate shape inference") { ReplicateAttrs attrs = ReplicateAttrs{ - /*replicate_degree=*/4, + /*replicate_degree=*/4_n, }; ParallelTensorShape input = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, - ShardParallelDim{14, 2}, - ShardParallelDim{16, 2}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, + ShardParallelDim{14_n, 2_n}, + ShardParallelDim{16_n, 2_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{2}, + SumDegree{3_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, @@ -28,7 +28,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape result = get_output_shape(attrs, input); ParallelTensorShape correct_output = input; - correct_output.dims.replica_dims.discard_copy_degree = DiscardCopyDegree{8}; + correct_output.dims.replica_dims.discard_copy_degree = + DiscardCopyDegree{8_n}; CHECK(result == correct_output); } diff --git a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc index 5808e5ef42..8c80e348c0 100644 --- a/lib/op-attrs/test/src/op-attrs/ops/softmax.cc +++ b/lib/op-attrs/test/src/op-attrs/ops/softmax.cc @@ -10,16 +10,16 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, TensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; tl::expected result = get_output_shape(attrs, input); @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4_n}}; std::optional result = optional_from_expected(get_output_shape(attrs, input)); @@ -41,47 +41,53 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_output_shape(SoftmaxAttrs, ParallelTensorShape)") { TensorShape input = TensorShape{ - TensorDims{FFOrdered{ - 12, - 14, - 16, + TensorDims{FFOrdered{ + 12_n, + 14_n, + 16_n, }}, DataType::FLOAT, }; TensorShape output = input; - auto make_input = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - input, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_input = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + input, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; - auto make_output = - [&](SumDegree o_sum, DiscardCopyDegree o_eq, int o0, int o1, int o2) { - return lift_to_parallel_with_degrees( - output, o_sum, o_eq, FFOrdered{o0, o1, o2}); - }; + auto make_output = [&](SumDegree o_sum, + DiscardCopyDegree o_eq, + nonnegative_int o0, + nonnegative_int o1, + nonnegative_int o2) { + return lift_to_parallel_with_degrees( + output, o_sum, o_eq, FFOrdered{o0, o1, o2}); + }; SUBCASE("partition parallelism in non-softmax-dim (valid)") { - int degree0 = 2; - int degree2 = 4; + nonnegative_int degree0 = 2_n; + nonnegative_int degree2 = 4_n; - ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + ParallelTensorShape par_input = 
make_input( + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); SUBCASE("attrs.dim in bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; tl::expected result = get_output_shape(attrs, par_input); tl::expected correct = make_output( - SumDegree{1}, DiscardCopyDegree{1}, degree0, 1, degree2); + SumDegree{1_n}, DiscardCopyDegree{1_n}, degree0, 1_n, degree2); CHECK(result == correct); } SUBCASE("attrs.dims out of bounds") { - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{4}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{4_n}}; std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -92,12 +98,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("partition parallism in softmax dim (invalid)") { - int degree1 = 2; + nonnegative_int degree1 = 2_n; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1}, DiscardCopyDegree{1}, 1, degree1, 1); + make_input(SumDegree{1_n}, DiscardCopyDegree{1_n}, 1_n, degree1, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -107,12 +113,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("sum parallelism (invalid)") { - SumDegree sum_degree = SumDegree{2}; + SumDegree sum_degree = SumDegree{2_n}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(sum_degree, DiscardCopyDegree{1}, 1, 1, 1); + make_input(sum_degree, DiscardCopyDegree{1_n}, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); @@ -122,12 +128,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("discard copy parallelism (invalid)") { - DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2}; + DiscardCopyDegree discard_copy_degree = DiscardCopyDegree{2_n}; - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{nonnegative_int{1}}}; + SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t{1_n}}; ParallelTensorShape par_input = - make_input(SumDegree{1}, discard_copy_degree, 1, 1, 1); + make_input(SumDegree{1_n}, discard_copy_degree, 1_n, 1_n, 1_n); std::optional result = optional_from_expected(get_output_shape(attrs, par_input)); diff --git a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc index 73f5f0674d..1187bfcfbf 100644 --- a/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc +++ b/lib/op-attrs/test/src/op-attrs/pcg_operator_attrs.cc @@ -6,8 +6,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("PCGOperatorAttrs to/from json") { PCGOperatorAttrs correct = PCGOperatorAttrs{RepartitionAttrs{ - /*repartition_dim=*/ff_dim_t{nonnegative_int{1}}, - /*repartition_degree=*/4, + /*repartition_dim=*/ff_dim_t{1_n}, + /*repartition_degree=*/4_n, }}; nlohmann::json j = correct; auto result = j.get(); diff --git a/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc index c09c1ec3df..e3f3f4534e 100644 --- a/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc +++ b/lib/op-attrs/test/src/op-attrs/relative_ff_dim_t.cc @@ -5,13 +5,13 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ff_dim_t_from_relative_ff_dim_t") { - int input_dim = 5; + nonnegative_int input_dim = 5_n; SUBCASE("relative index is zero") { relative_ff_dim_t relative_ff_dim = 
relative_ff_dim_t{0}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{0}}); + CHECK(ff_dim == ff_dim_t{0_n}); } SUBCASE("relative index is positive") { @@ -20,14 +20,14 @@ TEST_SUITE(FF_TEST_SUITE) { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{1}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{1}}); + CHECK(ff_dim == ff_dim_t{1_n}); } SUBCASE("relative index is out of range") { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{10}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{10}}); + CHECK(ff_dim == ff_dim_t{10_n}); } } @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { relative_ff_dim_t relative_ff_dim = relative_ff_dim_t{-1}; ff_dim_t ff_dim = ff_dim_t_from_relative_ff_dim_t(relative_ff_dim, input_dim); - CHECK(ff_dim == ff_dim_t{nonnegative_int{4}}); + CHECK(ff_dim == ff_dim_t{4_n}); } SUBCASE("relative index is out of range") { diff --git a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc index 60d87300c1..7e072d82d9 100644 --- a/lib/op-attrs/test/src/op-attrs/tensor_dims.cc +++ b/lib/op-attrs/test/src/op-attrs/tensor_dims.cc @@ -7,7 +7,8 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("tensor_dims_is_broadcastable_to(TensorDims, TensorDims)") { - TensorDims goal = TensorDims{FFOrdered{1, 1, 4, 3}}; + TensorDims goal = + TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; SUBCASE("dims match") { bool result = tensor_dims_is_broadcastable_to(goal, goal); @@ -17,7 +18,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs num_dims promotion") { - TensorDims curr = TensorDims{FFOrdered{4, 3}}; + TensorDims curr = TensorDims{FFOrdered{4_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -26,7 +27,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr only needs dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 1, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -35,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs both num_dims promotion and dim expansion") { - TensorDims curr = TensorDims{FFOrdered{1, 3}}; + TensorDims curr = TensorDims{FFOrdered{1_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = true; @@ -44,7 +46,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("curr needs invalid dim promotion") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 2, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 2_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -53,7 +56,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("num_dims(goal) < num_dims(curr)") { - TensorDims curr = TensorDims{FFOrdered{1, 1, 10, 4, 3}}; + TensorDims curr = + TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; bool result = tensor_dims_is_broadcastable_to(curr, goal); bool correct = false; @@ -63,12 +67,13 @@ TEST_SUITE(FF_TEST_SUITE) { } TEST_CASE("get_broadcast_target_dims(std::unordered_set)") { - TensorDims d1 = TensorDims{FFOrdered{1, 10, 4, 3}}; + TensorDims d1 = TensorDims{FFOrdered{1_n, 10_n, 4_n, 3_n}}; - TensorDims d2 = TensorDims{FFOrdered{10, 4, 1}}; + TensorDims d2 = TensorDims{FFOrdered{10_n, 4_n, 1_n}}; SUBCASE("has target in inputs") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 4, 
3}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 4_n, 3_n}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -78,7 +83,8 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has no possible target") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 1, 4}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n}}; std::optional result = get_broadcast_target_dims({d1, d2, d3}); @@ -88,10 +94,11 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("has possible target, but not in inputs") { - TensorDims d3 = TensorDims{FFOrdered{1, 1, 1, 4, 3}}; + TensorDims d3 = + TensorDims{FFOrdered{1_n, 1_n, 1_n, 4_n, 3_n}}; TensorDims possible_target = - TensorDims{FFOrdered{1, 1, 10, 4, 3}}; + TensorDims{FFOrdered{1_n, 1_n, 10_n, 4_n, 3_n}}; REQUIRE(tensor_dims_is_broadcastable_to(d1, possible_target)); REQUIRE(tensor_dims_is_broadcastable_to(d2, possible_target)); diff --git a/lib/pcg/include/pcg/computation_graph_builder.h b/lib/pcg/include/pcg/computation_graph_builder.h index df93f69f2e..290df8574e 100644 --- a/lib/pcg/include/pcg/computation_graph_builder.h +++ b/lib/pcg/include/pcg/computation_graph_builder.h @@ -85,15 +85,15 @@ struct ComputationGraphBuilder { // Add a 2D convolutional layer tensor_guid_t conv2d( tensor_guid_t const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation = std::nullopt, - int groups = 1, + nonnegative_int groups = 1_n, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -107,8 +107,8 @@ struct ComputationGraphBuilder { // Add an embedding layer tensor_guid_t embedding( tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -121,32 +121,32 @@ struct ComputationGraphBuilder { // Add a cache layer tensor_guid_t cache(tensor_guid_t const &input, - int num_batches, + nonnegative_int num_batches, std::function score_f = {}, std::optional const &name = std::nullopt); // Add a 2D pooling layer tensor_guid_t pool2d(tensor_guid_t const &input, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); tensor_guid_t adaptive_pool2d( tensor_guid_t const &input, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp type = PoolOp::MAX, std::optional const &activation = std::nullopt, std::optional const &name = std::nullopt); tensor_guid_t layer_norm(tensor_guid_t const &input, - std::vector const &axes, + std::vector const &axes, bool elementwise_affine, float eps, std::optional const &name = std::nullopt); @@ -157,15 +157,15 @@ struct ComputationGraphBuilder { float eps, std::optional const &momentum, std::optional const &name = std::nullopt); - tensor_guid_t - batch_matmul(tensor_guid_t const &A, - tensor_guid_t const &B, - int a_seq_length_dim = -1, - int b_seq_length_dim = -1, - 
std::optional const &name = std::nullopt); + tensor_guid_t batch_matmul( + tensor_guid_t const &A, + tensor_guid_t const &B, + std::optional const &a_seq_length_dim = std::nullopt, + std::optional const &b_seq_length_dim = std::nullopt, + std::optional const &name = std::nullopt); tensor_guid_t dense( tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -181,7 +181,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); // Add a concat layer tensor_guid_t concat(std::vector const &tensors, - int axis, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); // Add a mean layer tensor_guid_t mean(tensor_guid_t const &input, @@ -191,47 +191,48 @@ struct ComputationGraphBuilder { // Add a split layer std::vector split(tensor_guid_t const &input, - std::vector const &split, - int axis, + std::vector const &split, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); // Add a flat layer - tensor_guid_t flat(tensor_guid_t const &input, - int start_dim = 0, - std::optional const &end_dim = std::nullopt, - std::optional const &name = std::nullopt); + tensor_guid_t + flat(tensor_guid_t const &input, + relative_ff_dim_t start_dim = relative_ff_dim_t{0}, + std::optional const &end_dim = std::nullopt, + std::optional const &name = std::nullopt); // Add a softmax layer tensor_guid_t softmax(tensor_guid_t const &input, - std::optional dim = std::nullopt, + std::optional dim = std::nullopt, std::optional const &name = std::nullopt); // Create input tensors and constants tensor_guid_t transpose(tensor_guid_t const &input, - std::vector const &perm, + std::vector const &perm, std::optional const &name = std::nullopt); tensor_guid_t reduce_sum(tensor_guid_t const &input, - std::vector const &axes, + std::vector const &axes, bool keepdims = false, std::optional const &name = std::nullopt); tensor_guid_t reshape(tensor_guid_t const &input, - std::vector const &shape, + std::vector const &shape, std::optional const &name = std::nullopt); tensor_guid_t reverse(tensor_guid_t const &input, - int axis, + relative_ff_dim_t axis, std::optional const &name = std::nullopt); std::vector top_k(tensor_guid_t const &input, - int k, + nonnegative_int k, bool sorted, std::optional const &name = std::nullopt); tensor_guid_t multihead_attention( tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - int embed_dim, - int num_heads, - int kdim = 0, - int vdim = 0, + nonnegative_int embed_dim, + nonnegative_int num_heads, + nonnegative_int kdim = 0_n, + nonnegative_int vdim = 0_n, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -254,7 +255,7 @@ struct ComputationGraphBuilder { std::optional const &name = std::nullopt); std::vector get_outputs(LayerAttrs const &) const; - tensor_guid_t get_output(LayerAttrs const &, int idx) const; + tensor_guid_t get_output(LayerAttrs const &, nonnegative_int idx) const; std::vector add_layer(LayerAttrs const &layer, diff --git a/lib/pcg/include/pcg/cpu_id_t.struct.toml b/lib/pcg/include/pcg/cpu_id_t.struct.toml index 0492a937be..152debbded 100644 --- a/lib/pcg/include/pcg/cpu_id_t.struct.toml +++ b/lib/pcg/include/pcg/cpu_id_t.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "cpu_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/device_id.h 
b/lib/pcg/include/pcg/device_id.h index 28cf30eaba..36ea9de6b3 100644 --- a/lib/pcg/include/pcg/device_id.h +++ b/lib/pcg/include/pcg/device_id.h @@ -13,9 +13,9 @@ device_id_t operator+(device_id_t, size_t); DeviceType get_device_type(device_id_t const &device_id); gpu_id_t unwrap_gpu(device_id_t); cpu_id_t unwrap_cpu(device_id_t); -int get_raw_id(device_id_t); +nonnegative_int get_raw_id(device_id_t); -device_id_t device_id_from_index(int, DeviceType); +device_id_t device_id_from_index(nonnegative_int, DeviceType); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h index 05c486f0f7..9554995fa0 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.h @@ -8,7 +8,7 @@ namespace FlexFlow { V1DataflowGraph to_v1(DataflowGraphView const &); V1DataflowGraph to_v1(DataflowGraphView const &, - std::unordered_map<Node, int> const &); + std::unordered_map<Node, nonnegative_int> const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml index c332b6b41d..57b559a18e 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_dataflow_graph.struct.toml @@ -13,6 +13,7 @@ includes = [ "", "", "pcg/file_format/v1/graphs/v1_graph_edge.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -24,7 +25,7 @@ src_includes = [ [[fields]] name = "nodes" -type = "std::vector<int>" +type = "std::vector<::FlexFlow::nonnegative_int>" [[fields]] name = "edges" diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml index 752706fe1d..9150c20056 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_graph_edge.struct.toml @@ -9,18 +9,22 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "srcNode" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "srcIdx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dstNode" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "dstIdx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h index fc9dfcef9a..426bad5a82 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h @@ -13,18 +13,19 @@ namespace FlexFlow { template <typename NodeLabel, typename OutputLabel> -std::pair<V1LabelledDataflowGraph<NodeLabel, OutputLabel>, bidict<int, Node>> +std::pair<V1LabelledDataflowGraph<NodeLabel, OutputLabel>, + bidict<nonnegative_int, Node>> to_v1_including_node_numbering( LabelledDataflowGraphView<NodeLabel, OutputLabel> const &g) { - bidict<int, Node> nodes = bidict_from_enumerating(get_nodes(g)); + bidict<nonnegative_int, Node> nodes = bidict_from_enumerating(get_nodes(g)); V1DataflowGraph unlabelled = to_v1(g, nodes.reversed()); - std::unordered_map<int, NodeLabel> node_labels = map_values( + std::unordered_map<nonnegative_int, NodeLabel> node_labels = map_values( nodes.as_unordered_map(), [&](Node const &n) { return g.at(n); }); - std::unordered_map<int, std::vector<OutputLabel>> output_labels = + std::unordered_map<nonnegative_int, std::vector<OutputLabel>> output_labels = map_values(nodes.as_unordered_map(), [&](Node const &n) { return transform(get_outputs(g, n), [&](DataflowOutput const &o) { return g.at(o);
}); diff --git a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml index b440d0f03d..1f69f5cd93 100644 --- a/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml +++ b/lib/pcg/include/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.struct.toml @@ -18,6 +18,7 @@ includes = [ "", "pcg/file_format/v1/graphs/v1_dataflow_graph.dtg.h", "pcg/file_format/v1/graphs/v1_graph_output.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -29,11 +30,11 @@ src_includes = [ [[fields]] name = "node_labels" -type = "std::unordered_map" +type = "std::unordered_map<::FlexFlow::nonnegative_int, NodeLabel>" [[fields]] name = "output_labels" -type = "std::unordered_map>" +type = "std::unordered_map<::FlexFlow::nonnegative_int, std::vector>" [[fields]] name = "graph" diff --git a/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml b/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml index 0fe0b1761f..bd60564465 100644 --- a/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml +++ b/lib/pcg/include/pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_sp_decomposition.variant.toml @@ -9,6 +9,7 @@ features = [ includes = [ "pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_series_split.dtg.h", "pcg/file_format/v1/v1_binary_sp_decomposition/v1_binary_parallel_split.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[values]] @@ -20,5 +21,5 @@ type = "::FlexFlow::V1BinaryParallelSplit" key = "parallel" [[values]] -type = "int" +type = "::FlexFlow::nonnegative_int" key = "leaf" diff --git a/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h b/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h index 5590d6999b..c0e9966425 100644 --- a/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h +++ b/lib/pcg/include/pcg/file_format/v1/v1_computation_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { V1ComputationGraph to_v1(ComputationGraph const &); -std::pair> +std::pair> to_v1_including_node_numbering(ComputationGraph const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/gpu_id_t.struct.toml b/lib/pcg/include/pcg/gpu_id_t.struct.toml index 170dbb96fa..7a85b4c0a7 100644 --- a/lib/pcg/include/pcg/gpu_id_t.struct.toml +++ b/lib/pcg/include/pcg/gpu_id_t.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "gpu_index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/machine_space_coordinate.struct.toml b/lib/pcg/include/pcg/machine_space_coordinate.struct.toml index 9b197a74c9..2528eab849 100644 --- a/lib/pcg/include/pcg/machine_space_coordinate.struct.toml +++ b/lib/pcg/include/pcg/machine_space_coordinate.struct.toml @@ -11,15 +11,16 @@ features = [ includes = [ "pcg/device_type.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "node_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "device_type" diff --git a/lib/pcg/include/pcg/machine_specification.h b/lib/pcg/include/pcg/machine_specification.h index 39591e8a70..11c5a81bba 100644 --- a/lib/pcg/include/pcg/machine_specification.h +++ b/lib/pcg/include/pcg/machine_specification.h @@ 
-8,12 +8,12 @@ namespace FlexFlow { -int get_num_gpus(MachineSpecification const &ms); -int get_num_cpus(MachineSpecification const &ms); -int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type); -int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type); +nonnegative_int get_num_gpus(MachineSpecification const &ms); +nonnegative_int get_num_cpus(MachineSpecification const &ms); +nonnegative_int get_num_devices(MachineSpecification const &ms, + DeviceType const &device_type); +nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, + DeviceType const &device_type); bool is_valid_machine_space_coordinate(MachineSpecification const &ms, MachineSpaceCoordinate const &coord); diff --git a/lib/pcg/include/pcg/machine_specification.struct.toml b/lib/pcg/include/pcg/machine_specification.struct.toml index e75b5018cb..7c624c7240 100644 --- a/lib/pcg/include/pcg/machine_specification.struct.toml +++ b/lib/pcg/include/pcg/machine_specification.struct.toml @@ -9,17 +9,21 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "num_nodes" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_cpus_per_node" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "num_gpus_per_node" -type = "int" +type = "::FlexFlow::nonnegative_int" [[fields]] name = "inter_node_bandwidth" diff --git a/lib/pcg/include/pcg/machine_view.h b/lib/pcg/include/pcg/machine_view.h index f72b2359dc..6ed9e7dd9c 100644 --- a/lib/pcg/include/pcg/machine_view.h +++ b/lib/pcg/include/pcg/machine_view.h @@ -5,7 +5,7 @@ #include "machine_view.dtg.h" #include "pcg/device_id_t.dtg.h" #include "pcg/operator_task_space.dtg.h" -#include "task_space_coordinate.dtg.h" +#include "pcg/task_space_coordinate.dtg.h" #include #include #include diff --git a/lib/pcg/include/pcg/operator_task_space.h b/lib/pcg/include/pcg/operator_task_space.h index 1a19397c72..b095fad088 100644 --- a/lib/pcg/include/pcg/operator_task_space.h +++ b/lib/pcg/include/pcg/operator_task_space.h @@ -16,8 +16,8 @@ std::unordered_set TaskSpaceCoordinate get_task_space_maximum_coordinate(OperatorTaskSpace const &task); -size_t num_dims(OperatorTaskSpace const &task); -size_t num_tasks(OperatorTaskSpace const &task); +nonnegative_int num_dims(OperatorTaskSpace const &task); +nonnegative_int num_tasks(OperatorTaskSpace const &task); OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_layer_guid_t const &layer); diff --git a/lib/pcg/include/pcg/operator_task_space.struct.toml b/lib/pcg/include/pcg/operator_task_space.struct.toml index 3ab8b83173..9cc4f6b93a 100644 --- a/lib/pcg/include/pcg/operator_task_space.struct.toml +++ b/lib/pcg/include/pcg/operator_task_space.struct.toml @@ -11,6 +11,7 @@ features = [ includes = [ "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -20,4 +21,4 @@ src_includes = [ [[fields]] name = "degrees" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h index 019b120936..faa9b73d95 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_builder.h @@ -33,15 +33,15 @@ struct ParallelComputationGraphBuilder { 
parallel_tensor_guid_t conv2d( parallel_tensor_guid_t const &input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation = std::nullopt, - int groups = 1, + nonnegative_int groups = 1_n, bool use_bias = true, std::optional const &kernel_initializer = std::nullopt, std::optional const &bias_initializer = std::nullopt, @@ -50,7 +50,7 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t dense( parallel_tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation = std::nullopt, bool use_bias = true, DataType data_type = DataType::FLOAT, @@ -61,8 +61,8 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t embedding( parallel_tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype = DataType::FLOAT, std::optional const &kernel_initializer = std::nullopt, @@ -72,10 +72,10 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - int embed_dim, - int num_heads, - std::optional kdim = std::nullopt, - std::optional vdim = std::nullopt, + nonnegative_int embed_dim, + nonnegative_int num_heads, + std::optional kdim = std::nullopt, + std::optional vdim = std::nullopt, float dropout = 0.0f, bool bias = true, bool add_bias_kv = false, @@ -120,20 +120,20 @@ struct ParallelComputationGraphBuilder { parallel_tensor_guid_t parallel_partition(parallel_tensor_guid_t const &x, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_combine(parallel_tensor_guid_t const &x, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_replicate(parallel_tensor_guid_t const &x, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); parallel_tensor_guid_t parallel_reduce(parallel_tensor_guid_t const &x, - int degree, + nonnegative_int degree, std::optional const &name = std::nullopt); private: diff --git a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h index 7aac8558e4..5bce560020 100644 --- a/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h +++ b/lib/pcg/include/pcg/parallel_computation_graph/parallel_computation_graph_edge.h @@ -11,7 +11,7 @@ parallel_tensor_guid_t get_parallel_tensor(ParallelComputationGraphEdge const &); parallel_layer_guid_t get_src_layer(ParallelComputationGraphEdge const &); parallel_layer_guid_t get_dst_layer(ParallelComputationGraphEdge const &); -int get_dst_layer_input_idx(ParallelComputationGraphEdge const &); +nonnegative_int get_dst_layer_input_idx(ParallelComputationGraphEdge const &); } // namespace FlexFlow diff --git a/lib/pcg/include/pcg/start_invariant_machine_view.h b/lib/pcg/include/pcg/start_invariant_machine_view.h index f5091c69d1..cdf17213f9 100644 --- a/lib/pcg/include/pcg/start_invariant_machine_view.h +++ b/lib/pcg/include/pcg/start_invariant_machine_view.h @@ -17,7 +17,7 @@ MachineView StartInvariantMachineView 
start_invariant_from_machine_view(MachineView const &mv); -size_t num_dims(StartInvariantMachineView const &mv); +nonnegative_int num_dims(StartInvariantMachineView const &mv); DeviceType get_device_type(StartInvariantMachineView const &mv); diff --git a/lib/pcg/include/pcg/stride_t.struct.toml b/lib/pcg/include/pcg/stride_t.struct.toml index a764497b8b..8d950c5f39 100644 --- a/lib/pcg/include/pcg/stride_t.struct.toml +++ b/lib/pcg/include/pcg/stride_t.struct.toml @@ -9,6 +9,10 @@ features = [ "fmt", ] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] + [[fields]] name = "unwrapped" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/pcg/include/pcg/task_space_coordinate.struct.toml b/lib/pcg/include/pcg/task_space_coordinate.struct.toml index 65aea167cb..1057676b8e 100644 --- a/lib/pcg/include/pcg/task_space_coordinate.struct.toml +++ b/lib/pcg/include/pcg/task_space_coordinate.struct.toml @@ -11,6 +11,7 @@ features = [ includes = [ "", + "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ @@ -20,4 +21,4 @@ src_includes = [ [[fields]] name = "raw_coord" -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" diff --git a/lib/pcg/src/pcg/computation_graph_builder.cc b/lib/pcg/src/pcg/computation_graph_builder.cc index 2d523c78ac..d7e6ea3291 100644 --- a/lib/pcg/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/src/pcg/computation_graph_builder.cc @@ -375,30 +375,32 @@ tensor_guid_t tensor_guid_t ComputationGraphBuilder::conv2d( tensor_guid_t const &x, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation, - int groups, + nonnegative_int groups, bool use_bias, std::optional const &kernel_initializer, std::optional const &bias_initializer, std::optional const &kernel_regularizer, std::optional const &maybe_name) { - Conv2DAttrs attrs = Conv2DAttrs{outChannels, - kernelH, - kernelW, - strideH, - strideW, - paddingH, - paddingW, - groups, - activation, - use_bias}; + Conv2DAttrs attrs = Conv2DAttrs{ + /*out_channels=*/outChannels, + /*kernel_h=*/kernelH, + /*kernel_w=*/kernelW, + /*stride_h=*/strideH, + /*stride_w=*/strideW, + /*padding_h=*/paddingH, + /*padding_w=*/paddingW, + /*groups=*/groups, + /*activation=*/activation, + /*use_bias=*/use_bias, + }; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -450,13 +452,18 @@ tensor_guid_t ComputationGraphBuilder::dropout( tensor_guid_t ComputationGraphBuilder::embedding( tensor_guid_t const &x, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype, std::optional const &kernel_initializer, std::optional const &maybe_name) { - EmbeddingAttrs attrs = EmbeddingAttrs{num_entries, outDim, aggr, dtype}; + EmbeddingAttrs attrs = EmbeddingAttrs{ + /*num_entries=*/num_entries, + /*out_channels=*/outDim, + /*aggr=*/aggr, + /*data_type=*/dtype, + }; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -508,12 +515,12 @@ tensor_guid_t ComputationGraphBuilder::gather( } tensor_guid_t ComputationGraphBuilder::pool2d( tensor_guid_t const &x, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int kernelH, + nonnegative_int 
kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, PoolOp type, std::optional const &activation, std::optional const &maybe_name) { @@ -546,8 +553,8 @@ tensor_guid_t ComputationGraphBuilder::pool2d( tensor_guid_t ComputationGraphBuilder::adaptive_pool2d( tensor_guid_t const &uncasted_input, - int output_h, - int output_w, + nonnegative_int output_h, + nonnegative_int output_w, PoolOp type, std::optional const &activation, std::optional const &maybe_name) { @@ -636,10 +643,10 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( tensor_guid_t const &query, tensor_guid_t const &key, tensor_guid_t const &value, - int embed_dim, - int num_heads, - int kdim, - int vdim, + nonnegative_int embed_dim, + nonnegative_int num_heads, + nonnegative_int kdim, + nonnegative_int vdim, float dropout, bool bias, bool add_bias_kv, @@ -661,14 +668,16 @@ tensor_guid_t ComputationGraphBuilder::multihead_attention( "If you need this functionality, please create an issue."); } - MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{embed_dim, - num_heads, - kdim, - vdim, - dropout, - bias, - add_bias_kv, - add_zero_attn}; + MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ + /*embed_dim=*/embed_dim, + /*num_heads=*/num_heads, + /*kdim=*/kdim, + /*vdim=*/vdim, + /*dropout=*/dropout, + /*bias=*/bias, + /*add_bias_kv=*/add_bias_kv, + /*add_zero_attn=*/add_zero_attn, + }; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -742,7 +751,7 @@ TensorDims ComputationGraphBuilder::get_broadcast_target_dims( tensor_guid_t ComputationGraphBuilder::dense( tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation, bool use_bias, DataType data_type, @@ -751,8 +760,13 @@ tensor_guid_t ComputationGraphBuilder::dense( std::optional const &maybe_name, std::optional const &projection_name, std::optional const &bias_name) { - LinearAttrs attrs = - LinearAttrs{outDim, use_bias, data_type, activation, std::nullopt}; + LinearAttrs attrs = LinearAttrs{ + /*out_channels=*/outDim, + /*use_bias=*/use_bias, + /*data_type=*/data_type, + /*activation=*/activation, + /*regularizer=*/std::nullopt, + }; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -793,12 +807,11 @@ tensor_guid_t ComputationGraphBuilder::dense( tensor_guid_t ComputationGraphBuilder::concat( std::vector const &inputs, - int axis, + relative_ff_dim_t axis, std::optional const &maybe_name) { - relative_ff_dim_t wrapped_axis = relative_ff_dim_t{axis}; ConcatAttrs attrs = ConcatAttrs{ff_dim_t_from_relative_ff_dim_t( - wrapped_axis, num_dims(this->get_shape(inputs[0])))}; + axis, num_dims(this->get_shape(inputs[0])))}; std::string name = maybe_name.value_or(get_default_name(ComputationGraphOpAttrs{attrs})); @@ -816,17 +829,17 @@ tensor_guid_t ComputationGraphBuilder::concat( tensor_guid_t ComputationGraphBuilder::flat( tensor_guid_t const &input, - int start_dim, - std::optional const &end_dim, + relative_ff_dim_t start_dim, + std::optional const &end_dim, std::optional const &maybe_name) { - int input_num_dims = num_dims(this->get_shape(input)); + nonnegative_int input_num_dims = num_dims(this->get_shape(input)); FlatAttrs attrs = FlatAttrs{ - /*start_dim=*/ff_dim_t_from_relative_ff_dim_t( - relative_ff_dim_t{start_dim}, input_num_dims), + /*start_dim=*/ff_dim_t_from_relative_ff_dim_t(start_dim, input_num_dims), /*end_dim=*/ - ff_dim_t_from_relative_ff_dim_t( - 
relative_ff_dim_t{end_dim.value_or(input_num_dims)}, input_num_dims), + ff_dim_t_from_relative_ff_dim_t(end_dim.value_or(relative_ff_dim_t{ + input_num_dims.unwrap_nonnegative()}), + input_num_dims), }; std::string name = @@ -842,16 +855,15 @@ tensor_guid_t ComputationGraphBuilder::flat( tensor_guid_t ComputationGraphBuilder::layer_norm( tensor_guid_t const &input, - std::vector const &relative_axes, + std::vector const &relative_axes, bool elementwise_affine, float eps, std::optional const &maybe_name) { TensorShape input_shape = this->get_shape(input); - auto resolve_dim_idx = [&](int dim_idx) { - return ff_dim_t_from_relative_ff_dim_t(relative_ff_dim_t{dim_idx}, - num_dims(input_shape)); + auto resolve_dim_idx = [&](relative_ff_dim_t dim_idx) { + return ff_dim_t_from_relative_ff_dim_t(dim_idx, num_dims(input_shape)); }; stack_vector axes = stack_vector_of( @@ -909,15 +921,16 @@ tensor_guid_t ComputationGraphBuilder::layer_norm( tensor_guid_t ComputationGraphBuilder::softmax( tensor_guid_t const &input, - std::optional maybe_dim, + std::optional maybe_dim, std::optional const &maybe_name) { TensorShape input_shape = this->get_shape(input); - int dim = maybe_dim.value_or(num_dims(input_shape) - 1); + relative_ff_dim_t dim = maybe_dim.value_or( + relative_ff_dim_t{num_dims(input_shape).unwrap_nonnegative() - 1}); - SoftmaxAttrs attrs = SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t( - relative_ff_dim_t{dim}, num_dims(input_shape))}; + SoftmaxAttrs attrs = + SoftmaxAttrs{ff_dim_t_from_relative_ff_dim_t(dim, num_dims(input_shape))}; if (attrs.dim.value >= num_dims(input_shape)) { throw mk_runtime_error( diff --git a/lib/pcg/src/pcg/device_id.cc b/lib/pcg/src/pcg/device_id.cc index a8cfe1f82f..1a4f7b7d22 100644 --- a/lib/pcg/src/pcg/device_id.cc +++ b/lib/pcg/src/pcg/device_id.cc @@ -25,7 +25,7 @@ cpu_id_t unwrap_cpu(device_id_t device_id) { return device_id.get(); } -int get_raw_id(device_id_t device_id) { +nonnegative_int get_raw_id(device_id_t device_id) { switch (get_device_type(device_id)) { case DeviceType::GPU: return unwrap_gpu(device_id).gpu_index; @@ -36,7 +36,7 @@ int get_raw_id(device_id_t device_id) { } } -device_id_t device_id_from_index(int idx, DeviceType device_type) { +device_id_t device_id_from_index(nonnegative_int idx, DeviceType device_type) { switch (device_type) { case DeviceType::GPU: return device_id_t{gpu_id_t{idx}}; diff --git a/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc b/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc index cf150a339f..064e2d81d3 100644 --- a/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc +++ b/lib/pcg/src/pcg/file_format/v1/graphs/v1_dataflow_graph.cc @@ -10,15 +10,15 @@ namespace FlexFlow { V1DataflowGraph to_v1(DataflowGraphView const &g) { - bidict node_enumeration_bidict = + bidict node_enumeration_bidict = bidict_from_enumerating(get_nodes(g)); - std::unordered_map node_enumeration = + std::unordered_map node_enumeration = node_enumeration_bidict.reversed().as_unordered_map(); return to_v1(g, node_enumeration); } V1DataflowGraph to_v1(DataflowGraphView const &g, - std::unordered_map const &nodes) { + std::unordered_map const &nodes) { std::unordered_set edges; for (DataflowEdge const &e : get_edges(g)) { edges.insert(V1GraphEdge{ diff --git a/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc b/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc index d353ccdda3..ac819db342 100644 --- a/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc +++ 
b/lib/pcg/src/pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.cc @@ -1 +1,17 @@ #include "pcg/file_format/v1/graphs/v1_labelled_dataflow_graph.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using NodeLabel = value_type<0>; +using OutputLabel = value_type<1>; + +template std::pair, + bidict> + to_v1_including_node_numbering( + LabelledDataflowGraphView const &); + +template V1LabelledDataflowGraph + to_v1(LabelledDataflowGraphView const &); + +} // namespace FlexFlow diff --git a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc index 5341e03c0a..d39652a7e2 100644 --- a/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc +++ b/lib/pcg/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc @@ -21,7 +21,7 @@ V1BinarySPDecomposition }; } else if (type == "leaf") { return V1BinarySPDecomposition{ - j.at("value").get(), + j.at("value").get(), }; } else { throw mk_runtime_error(fmt::format( @@ -45,7 +45,7 @@ void adl_serializer::to_json( j["type"] = "parallel"; return std::monostate{}; }, - [&](int leaf) { + [&](nonnegative_int leaf) { j["value"] = leaf; j["type"] = "leaf"; return std::monostate{}; diff --git a/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc index 975e92dfb7..3511ccc269 100644 --- a/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc +++ b/lib/pcg/src/pcg/file_format/v1/v1_computation_graph.cc @@ -9,13 +9,14 @@ V1ComputationGraph to_v1(ComputationGraph const &g) { }; } -std::pair> +std::pair> to_v1_including_node_numbering(ComputationGraph const &cg) { - std::pair, bidict> + std::pair, + bidict> raw = to_v1_including_node_numbering(cg.raw_graph); V1ComputationGraph v1_cg = V1ComputationGraph{raw.first}; - bidict v1_node_ids = + bidict v1_node_ids = map_values(raw.second, [](Node const &n) { return layer_guid_t{n}; }); return {v1_cg, v1_node_ids}; diff --git a/lib/pcg/src/pcg/machine_space_offset.cc b/lib/pcg/src/pcg/machine_space_offset.cc index 9990023f8c..4aa79b3d1b 100644 --- a/lib/pcg/src/pcg/machine_space_offset.cc +++ b/lib/pcg/src/pcg/machine_space_offset.cc @@ -17,8 +17,10 @@ MachineSpaceOffset get_machine_space_offset_from_coordinate( fmt::format("{} has different DeviceType from {}", start, coord)); } - return MachineSpaceOffset{coord.node_idx - start.node_idx, - coord.device_idx - start.device_idx, + return MachineSpaceOffset{coord.node_idx.unwrap_nonnegative() - + start.node_idx.unwrap_nonnegative(), + coord.device_idx.unwrap_nonnegative() - + start.device_idx.unwrap_nonnegative(), coord.device_type}; } diff --git a/lib/pcg/src/pcg/machine_specification.cc b/lib/pcg/src/pcg/machine_specification.cc index 19ff50b4b7..0fefeddd27 100644 --- a/lib/pcg/src/pcg/machine_specification.cc +++ b/lib/pcg/src/pcg/machine_specification.cc @@ -4,14 +4,16 @@ #include "utils/exception.h" namespace FlexFlow { -int get_num_gpus(MachineSpecification const &ms) { +nonnegative_int get_num_gpus(MachineSpecification const &ms) { return ms.num_nodes * ms.num_gpus_per_node; } -int get_num_cpus(MachineSpecification const &ms) { + +nonnegative_int get_num_cpus(MachineSpecification const &ms) { return ms.num_nodes * ms.num_cpus_per_node; } -int get_num_devices(MachineSpecification const &ms, - DeviceType const &device_type) { + +nonnegative_int get_num_devices(MachineSpecification const &ms, + DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: return get_num_gpus(ms); @@ 
-22,8 +24,8 @@ int get_num_devices(MachineSpecification const &ms, } } -int get_num_devices_per_node(MachineSpecification const &ms, - DeviceType const &device_type) { +nonnegative_int get_num_devices_per_node(MachineSpecification const &ms, + DeviceType const &device_type) { switch (device_type) { case DeviceType::GPU: return ms.num_gpus_per_node; @@ -33,6 +35,7 @@ int get_num_devices_per_node(MachineSpecification const &ms, throw mk_runtime_error(fmt::format("Unknown DeviceType {}", device_type)); } } + bool is_valid_machine_space_coordinate(MachineSpecification const &ms, MachineSpaceCoordinate const &coord) { return (coord.node_idx < ms.num_nodes) && @@ -45,7 +48,7 @@ device_id_t get_device_id(MachineSpecification const &ms, throw mk_runtime_error(fmt::format( "Invalid coordinate {} for machine specification {}", ms, coord)); } - int raw_idx = + nonnegative_int raw_idx = coord.node_idx * get_num_devices_per_node(ms, coord.device_type) + coord.device_idx; return device_id_from_index(raw_idx, coord.device_type); diff --git a/lib/pcg/src/pcg/machine_view.cc b/lib/pcg/src/pcg/machine_view.cc index cc42ad83b2..fe319dc63c 100644 --- a/lib/pcg/src/pcg/machine_view.cc +++ b/lib/pcg/src/pcg/machine_view.cc @@ -16,6 +16,9 @@ #include "utils/containers/transform.h" #include "utils/containers/zip.h" #include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" + namespace FlexFlow { size_t num_dims(MachineView const &mv) { @@ -71,47 +74,57 @@ std::optional get_machine_space_coordinate( } auto get_dimension_indices_for_dimension = - [&](MachineSpecificationDimension dimension) { - std::vector mv_dimensions = - get_dimensions(machine_view); - return filter(count(mv_dimensions.size()), [&](size_t idx) { - return mv_dimensions.at(idx) == dimension; - }); - }; - - auto compute_index = [&](int start_idx, - std::vector const &dimension_indices) { - std::vector mv_strides = get_strides(machine_view); - - std::vector sizes = transform(dimension_indices, [&](size_t i) { - return task.degrees.at(i) * mv_strides.at(i).unwrapped; - }); - std::vector coord_points = transform( - dimension_indices, [&](size_t i) { return coord.raw_coord.at(i); }); - std::vector strides = transform(dimension_indices, [&](size_t i) { - return mv_strides.at(i).unwrapped; - }); - - std::vector coeffs = scanl(sizes, 1, std::multiplies()); - - int index = start_idx; - for (auto [coeff, coord_point, stride] : - zip(coeffs, coord_points, strides)) { - index += coeff * coord_point * stride; - } - return index; + [&](MachineSpecificationDimension dimension) + -> std::vector { + std::vector mv_dimensions = + get_dimensions(machine_view); + return filter(nonnegative_range(num_elements(mv_dimensions)), + [&](nonnegative_int idx) { + return mv_dimensions.at(idx.unwrap_nonnegative()) == + dimension; + }); }; - std::vector inter_dimension_indices = + auto compute_index = + [&](nonnegative_int start_idx, + std::vector const &dimension_indices) { + std::vector mv_strides = get_strides(machine_view); + + std::vector sizes = + transform(dimension_indices, [&](nonnegative_int i) { + return task.degrees.at(i.unwrap_nonnegative()) * + mv_strides.at(i.unwrap_nonnegative()).unwrapped; + }); + std::vector coord_points = + transform(dimension_indices, [&](nonnegative_int i) { + return coord.raw_coord.at(i.unwrap_nonnegative()); + }); + std::vector strides = + transform(dimension_indices, [&](nonnegative_int i) { + return mv_strides.at(i.unwrap_nonnegative()).unwrapped; + }); + + 
std::vector<nonnegative_int> coeffs = scanl( + sizes, nonnegative_int{1}, std::multiplies<nonnegative_int>()); + + nonnegative_int index = start_idx; + for (auto [coeff, coord_point, stride] : + zip(coeffs, coord_points, strides)) { + index += coeff * coord_point * stride; + } + return index; + }; + + std::vector<nonnegative_int> inter_dimension_indices = get_dimension_indices_for_dimension( MachineSpecificationDimension::INTER_NODE); - std::vector<size_t> intra_dimension_indices = + std::vector<nonnegative_int> intra_dimension_indices = get_dimension_indices_for_dimension( MachineSpecificationDimension::INTRA_NODE); - int node_idx = + nonnegative_int node_idx = compute_index(machine_view.start.node_idx, inter_dimension_indices); - int device_idx = + nonnegative_int device_idx = compute_index(machine_view.start.device_idx, intra_dimension_indices); MachineSpaceCoordinate ms_coord = MachineSpaceCoordinate{ node_idx, device_idx, get_device_type(machine_view)}; diff --git a/lib/pcg/src/pcg/operator_task_space.cc b/lib/pcg/src/pcg/operator_task_space.cc index 7157b75082..57af6eedc7 100644 --- a/lib/pcg/src/pcg/operator_task_space.cc +++ b/lib/pcg/src/pcg/operator_task_space.cc @@ -14,18 +14,23 @@ #include "utils/containers/unordered_set_of.h" #include "utils/containers/vector_of.h" #include "utils/fmt/unordered_set.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" + namespace FlexFlow { std::unordered_set<TaskSpaceCoordinate> get_task_space_coordinates(OperatorTaskSpace const &task) { - std::vector<std::vector<int>> coordinate_ranges = transform( - task.degrees, [&](int const &num_points) { return range(num_points); }); + std::vector<std::vector<nonnegative_int>> coordinate_ranges = + transform(task.degrees, [&](nonnegative_int num_points) { + return nonnegative_range(num_points); + }); - std::unordered_set<std::vector<int>> raw_coordinates = + std::unordered_set<std::vector<nonnegative_int>> raw_coordinates = unordered_set_of(cartesian_product(coordinate_ranges)); std::unordered_set<TaskSpaceCoordinate> task_space_coordinates = - transform(raw_coordinates, [](std::vector<int> const &point) { + transform(raw_coordinates, [](std::vector<nonnegative_int> const &point) { return TaskSpaceCoordinate{point}; }); return task_space_coordinates; @@ -36,10 +41,11 @@ TaskSpaceCoordinate return maximum(get_task_space_coordinates(task)); } -size_t num_dims(OperatorTaskSpace const &task) { - return task.degrees.size(); +nonnegative_int num_dims(OperatorTaskSpace const &task) { + return num_elements(task.degrees); } -size_t num_tasks(OperatorTaskSpace const &task) { + +nonnegative_int num_tasks(OperatorTaskSpace const &task) { return product(task.degrees); } @@ -48,7 +54,7 @@ OperatorTaskSpace get_operator_task_space(ParallelComputationGraph const &pcg, parallel_tensor_guid_t out_tensor = get_layer_outputs(pcg, layer).at(0); ParallelTensorShape shape = get_parallel_tensor_shape(pcg, out_tensor); - std::vector<int> degrees; + std::vector<nonnegative_int> degrees; extend(degrees, vector_of(ff_ordered_shard_degrees(shape))); degrees.push_back(get_sum_degree(shape)); degrees.push_back(get_discard_copy_degree(shape)); diff --git a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc index dadad6277f..2cf149f78a 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/generate_weight_transform.cc @@ -9,7 +9,7 @@ std::unordered_set<ParallelOpAttrs> ParallelTensorShape const &goal) { std::unordered_set<ParallelOpAttrs> result; - int sum_degree = get_sum_degree(goal); + nonnegative_int sum_degree = get_sum_degree(goal); if (sum_degree != 1) { throw mk_runtime_error(
fmt::format("generate_weight_transform currently only supports " @@ -17,7 +17,7 @@ std::unordered_set sum_degree)); } - int discard_copy_degree = get_discard_copy_degree(goal); + nonnegative_int discard_copy_degree = get_discard_copy_degree(goal); if (discard_copy_degree != 1) { result.insert(ParallelOpAttrs{ReplicateAttrs{discard_copy_degree}}); } diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index f33b4dcd17..25f8dc0c5f 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -107,8 +107,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::batch_matmul( std::optional const &maybe_name) { BatchMatmulAttrs attrs = BatchMatmulAttrs{ - /*a_seq_length_dim=*/-1, - /*b_seq_length_dim=*/-1, + /*a_seq_length_dim=*/std::nullopt, + /*b_seq_length_dim=*/std::nullopt, }; std::string name = @@ -140,30 +140,32 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::cast( parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t const &raw_input, - int outChannels, - int kernelH, - int kernelW, - int strideH, - int strideW, - int paddingH, - int paddingW, + nonnegative_int outChannels, + nonnegative_int kernelH, + nonnegative_int kernelW, + nonnegative_int strideH, + nonnegative_int strideW, + nonnegative_int paddingH, + nonnegative_int paddingW, std::optional const &activation, - int groups, + nonnegative_int groups, bool use_bias, std::optional const &kernel_initializer, std::optional const &bias_initializer, std::optional const &kernel_regularizer, std::optional const &maybe_name) { - Conv2DAttrs attrs = Conv2DAttrs{outChannels, - kernelH, - kernelW, - strideH, - strideW, - paddingH, - paddingW, - groups, - activation, - use_bias}; + Conv2DAttrs attrs = Conv2DAttrs{ + /*out_channels=*/outChannels, + /*kernel_h=*/kernelH, + /*kernel_w=*/kernelW, + /*stride_h=*/strideH, + /*stride_w=*/strideW, + /*padding_h=*/paddingH, + /*padding_w=*/paddingW, + /*groups=*/groups, + /*activation=*/activation, + /*use_bias=*/use_bias, + }; std::string name = maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs})); @@ -191,7 +193,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::conv2d( parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t const &input, - int outDim, + nonnegative_int outDim, std::optional activation, bool use_bias, DataType data_type, @@ -199,11 +201,11 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( std::optional const &bias_initializer, std::optional const &maybe_name) { LinearAttrs attrs = LinearAttrs{ - outDim, - use_bias, - data_type, - activation, - std::nullopt, + /*out_channels=*/outDim, + /*use_bias=*/use_bias, + /*data_type=*/data_type, + /*activation=*/activation, + /*regularizer=*/std::nullopt, }; std::string name = @@ -238,18 +240,18 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::dense( parallel_tensor_guid_t ParallelComputationGraphBuilder::embedding( parallel_tensor_guid_t const &input, - int num_entries, - int outDim, + nonnegative_int num_entries, + nonnegative_int outDim, AggregateOp aggr, DataType dtype, std::optional const &kernel_initializer, std::optional const &maybe_name) { EmbeddingAttrs attrs = EmbeddingAttrs{ - num_entries, - outDim, - aggr, - dtype, + /*num_entries=*/num_entries, + 
/*out_channels=*/outDim, + /*aggr=*/aggr, + /*data_type=*/dtype, }; std::string name = @@ -273,10 +275,10 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention( parallel_tensor_guid_t const &query, parallel_tensor_guid_t const &key, parallel_tensor_guid_t const &value, - int embed_dim, - int num_heads, - std::optional maybe_kdim, - std::optional maybe_vdim, + nonnegative_int embed_dim, + nonnegative_int num_heads, + std::optional maybe_kdim, + std::optional maybe_vdim, float dropout, bool bias, bool add_bias_kv, @@ -286,8 +288,8 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::multihead_attention( std::optional output_bias_initializer, std::optional const &maybe_name) { - int kdim = maybe_kdim.value_or(embed_dim); - int vdim = maybe_vdim.value_or(embed_dim); + nonnegative_int kdim = maybe_kdim.value_or(embed_dim); + nonnegative_int vdim = maybe_vdim.value_or(embed_dim); MultiHeadAttentionAttrs attrs = MultiHeadAttentionAttrs{ /*embed_dim=*/embed_dim, @@ -490,10 +492,13 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::elu( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition( parallel_tensor_guid_t const &input, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &maybe_name) { - RepartitionAttrs attrs = RepartitionAttrs{dim, degree}; + RepartitionAttrs attrs = RepartitionAttrs{ + /*repartition_dim=*/dim, + /*repartition_degree=*/degree, + }; std::string name = maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs})); @@ -509,10 +514,13 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_partition( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine( parallel_tensor_guid_t const &input, ff_dim_t dim, - int degree, + nonnegative_int degree, std::optional const &maybe_name) { - CombineAttrs attrs = CombineAttrs{dim, degree}; + CombineAttrs attrs = CombineAttrs{ + /*combine_dim=*/dim, + /*combine_degree=*/degree, + }; std::string name = maybe_name.value_or(get_default_name(PCGOperatorAttrs{attrs})); @@ -527,7 +535,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_combine( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate( parallel_tensor_guid_t const &input, - int degree, + nonnegative_int degree, std::optional const &maybe_name) { ReplicateAttrs attrs = ReplicateAttrs{degree}; @@ -545,7 +553,7 @@ parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_replicate( parallel_tensor_guid_t ParallelComputationGraphBuilder::parallel_reduce( parallel_tensor_guid_t const &input, - int degree, + nonnegative_int degree, std::optional const &maybe_name) { ReductionAttrs attrs = ReductionAttrs{degree}; @@ -661,7 +669,7 @@ std::vector ParallelComputationGraphBuilder::add_layer( std::vector raw_weight_tensors; for (auto const &kv : enumerate_vector(weights)) { - int weight_idx = kv.first; + nonnegative_int weight_idx = kv.first; ParallelTensorAttrs weight_tensor_attrs = kv.second; std::optional weight_name = diff --git a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc index d30739486e..f37d08dc8a 100644 --- a/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc +++ b/lib/pcg/src/pcg/parallel_computation_graph/parallel_computation_graph_edge.cc @@ -16,7 +16,7 @@ parallel_layer_guid_t get_dst_layer(ParallelComputationGraphEdge const &e) { return parallel_layer_guid_t{e.raw_edge.dst.node}; } 
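For reference, a minimal sketch of the nonnegative_int idiom that the hunks above and below apply throughout. It assumes only the operations visible in this patch (the _n literal, wrapped arithmetic, unwrap_nonnegative(), and nonnegative_range/num_elements), and that nonnegative_range returns an iterable of nonnegative_int, as its use with filter elsewhere in the patch suggests:

#include "utils/nonnegative_int/nonnegative_int.h"
#include "utils/nonnegative_int/nonnegative_range.h"
#include "utils/nonnegative_int/num_elements.h"
#include <vector>

using namespace ::FlexFlow;

void nonnegative_int_idiom_sketch() {
  // Construction: the user-defined literal and the explicit wrapper
  // constructor are interchangeable.
  nonnegative_int degree = 2_n;
  nonnegative_int raw_idx = nonnegative_int{2 * 16 + 12};

  // Arithmetic stays within the wrapped type, so negative intermediate
  // values are ruled out by construction.
  nonnegative_int flattened = degree * raw_idx + 1_n;

  // unwrap_nonnegative() recovers the underlying int at boundaries that
  // still traffic in plain integers (e.g. std::vector::at below).
  std::vector<nonnegative_int> degrees = {2_n, 3_n};
  for (nonnegative_int i : nonnegative_range(num_elements(degrees))) {
    flattened += degrees.at(i.unwrap_nonnegative());
  }
  (void)flattened;
}

The payoff of the type is that indices, degrees, and device counts can no longer silently go negative; conversions back to int are confined to explicit unwrap_nonnegative() call sites.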
-int get_dst_layer_input_idx(ParallelComputationGraphEdge const &e) { +nonnegative_int get_dst_layer_input_idx(ParallelComputationGraphEdge const &e) { return e.raw_edge.dst.idx; } diff --git a/lib/pcg/src/pcg/start_invariant_machine_view.cc b/lib/pcg/src/pcg/start_invariant_machine_view.cc index 1fcc3ea12f..e9f864d416 100644 --- a/lib/pcg/src/pcg/start_invariant_machine_view.cc +++ b/lib/pcg/src/pcg/start_invariant_machine_view.cc @@ -7,6 +7,7 @@ #include "utils/containers/scanl.h" #include "utils/containers/transform.h" #include "utils/containers/zip.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { MachineView machine_view_from_start_invariant( @@ -20,8 +21,8 @@ StartInvariantMachineView return StartInvariantMachineView{mv.dimensions, get_device_type(mv)}; } -size_t num_dims(StartInvariantMachineView const &start_inv_mv) { - return start_inv_mv.dimensions.size(); +nonnegative_int num_dims(StartInvariantMachineView const &start_inv_mv) { + return num_elements(start_inv_mv.dimensions); } DeviceType get_device_type(StartInvariantMachineView const &start_inv_mv) { @@ -59,7 +60,7 @@ std::optional get_machine_space_offset( TaskSpaceCoordinate const &coord, MachineSpecification const &machine_specification) { MachineSpaceCoordinate dummy_start = - MachineSpaceCoordinate{0, 0, get_device_type(start_inv_machine_view)}; + MachineSpaceCoordinate{0_n, 0_n, get_device_type(start_inv_machine_view)}; MachineView mv = machine_view_from_start_invariant(start_inv_machine_view, dummy_start); std::optional ms_coord = diff --git a/lib/pcg/test/src/pcg/computation_graph.cc b/lib/pcg/test/src/pcg/computation_graph.cc index e2ed51b2f1..d92d65ad7b 100644 --- a/lib/pcg/test/src/pcg/computation_graph.cc +++ b/lib/pcg/test/src/pcg/computation_graph.cc @@ -13,9 +13,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -40,9 +40,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -66,16 +66,16 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -103,9 +103,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -131,9 +131,9 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; @@ -161,16 +161,16 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 10, - 12, + TensorDims{FFOrdered{ + 10_n, + 12_n, }}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, diff --git a/lib/pcg/test/src/pcg/computation_graph_builder.cc 
b/lib/pcg/test/src/pcg/computation_graph_builder.cc index e7fa853be9..98a4e2a241 100644 --- a/lib/pcg/test/src/pcg/computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/computation_graph_builder.cc @@ -8,22 +8,22 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ComputationGraphBuilder") { ComputationGraphBuilder b; - size_t batch_size = 2; + nonnegative_int batch_size = 2_n; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3, 10, 10}}, + TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); tensor_guid_t output = b.conv2d(input, - /*outChannels=*/5, - /*kernelH=*/3, - /*kernelW=*/3, - /*strideH=*/1, - /*strideW=*/1, - /*paddingH=*/0, - /*paddingW=*/0); + /*outChannels=*/5_n, + /*kernelH=*/3_n, + /*kernelW=*/3_n, + /*strideH=*/1_n, + /*strideW=*/1_n, + /*paddingH=*/0_n, + /*paddingW=*/0_n); // ComputationGraph cg = b.computation_graph; // CHECK(get_layers(cg).size() == 1); } diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc index 9068e14517..4102efd48e 100644 --- a/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_binary_sp_decomposition/json.cc @@ -9,11 +9,11 @@ TEST_SUITE(FF_TEST_SUITE) { V1BinarySeriesSplit{ V1BinarySPDecomposition{ V1BinaryParallelSplit{ - V1BinarySPDecomposition{2}, - V1BinarySPDecomposition{2}, + V1BinarySPDecomposition{2_n}, + V1BinarySPDecomposition{2_n}, }, }, - V1BinarySPDecomposition{3}, + V1BinarySPDecomposition{3_n}, }, }; @@ -68,11 +68,11 @@ TEST_SUITE(FF_TEST_SUITE) { V1BinarySeriesSplit example_split = V1BinarySeriesSplit{ V1BinarySPDecomposition{ V1BinaryParallelSplit{ - V1BinarySPDecomposition{2}, - V1BinarySPDecomposition{2}, + V1BinarySPDecomposition{2_n}, + V1BinarySPDecomposition{2_n}, }, }, - V1BinarySPDecomposition{3}, + V1BinarySPDecomposition{3_n}, }; nlohmann::json example_json = { @@ -124,11 +124,11 @@ TEST_SUITE(FF_TEST_SUITE) { V1BinaryParallelSplit example_split = V1BinaryParallelSplit{ V1BinarySPDecomposition{ V1BinaryParallelSplit{ - V1BinarySPDecomposition{2}, - V1BinarySPDecomposition{2}, + V1BinarySPDecomposition{2_n}, + V1BinarySPDecomposition{2_n}, }, }, - V1BinarySPDecomposition{3}, + V1BinarySPDecomposition{3_n}, }; nlohmann::json example_json = { diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc index 8336d81bb4..59c606adb1 100644 --- a/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_computation_graph.cc @@ -10,15 +10,15 @@ TEST_SUITE(FF_TEST_SUITE) { ComputationGraphBuilder b; TensorShape input_shape = TensorShape{ - TensorDims{FFOrdered{ - 12, - 16, + TensorDims{FFOrdered{ + 12_n, + 16_n, }}, DataType::FLOAT, }; tensor_guid_t input = b.create_input(input_shape, CreateGrad::YES); - tensor_guid_t mm_output = b.dense(input, 8); + tensor_guid_t mm_output = b.dense(input, 8_n); tensor_guid_t relu_output = b.relu(mm_output); return b.computation_graph; diff --git a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc index 8ce25c4bc5..682cf2d798 100644 --- a/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/file_format/v1/v1_parallel_computation_graph.cc @@ -12,19 +12,19 @@ 
TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t mm_output = b.dense(input, 8); + parallel_tensor_guid_t mm_output = b.dense(input, 8_n); parallel_tensor_guid_t relu_output = b.relu(mm_output); return b.pcg; diff --git a/lib/pcg/test/src/pcg/machine_specification.cc b/lib/pcg/test/src/pcg/machine_specification.cc index c183ae0d31..6d339350a0 100644 --- a/lib/pcg/test/src/pcg/machine_specification.cc +++ b/lib/pcg/test/src/pcg/machine_specification.cc @@ -7,11 +7,10 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineSpecification") { - MachineSpecification ms = MachineSpecification{ - /*num_nodes=*/4, - /*num_cpus_per_node=*/16, - /*num_gpus_per_node=*/8, + /*num_nodes=*/4_n, + /*num_cpus_per_node=*/16_n, + /*num_gpus_per_node=*/8_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0, }; @@ -32,19 +31,19 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_device_id") { SUBCASE("valid MachineSpaceCoordinate") { MachineSpaceCoordinate coord = MachineSpaceCoordinate{ - /*node_idx=*/2, - /*device_idx=*/12, + /*node_idx=*/2_n, + /*device_idx=*/12_n, DeviceType::CPU, }; device_id_t correct = - device_id_from_index(2 * 16 + 12, DeviceType::CPU); + device_id_from_index(nonnegative_int{2 * 16 + 12}, DeviceType::CPU); device_id_t result = get_device_id(ms, coord); CHECK(correct == result); } SUBCASE("MachineSpaceCoordinate out of bounds for given machine spec") { MachineSpaceCoordinate coord = MachineSpaceCoordinate{ - /*node_idx=*/2, - /*device_idx=*/18, + /*node_idx=*/2_n, + /*device_idx=*/18_n, DeviceType::CPU, }; CHECK_THROWS(get_device_id(ms, coord)); diff --git a/lib/pcg/test/src/pcg/machine_view.cc b/lib/pcg/test/src/pcg/machine_view.cc index 3e9d48fac3..e286f08bf2 100644 --- a/lib/pcg/test/src/pcg/machine_view.cc +++ b/lib/pcg/test/src/pcg/machine_view.cc @@ -12,10 +12,10 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("MachineView - utility functions") { MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}}}; SUBCASE("num_dims") { @@ -43,48 +43,48 @@ TEST_SUITE(FF_TEST_SUITE) { * Where the (x,) are the `TaskSpaceCoordinate`s, and the underlying grid * is the machine space. 
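* (Worked arithmetic for this case: with start device 1 and intra-node
* stride 2, task coordinate (k,) lands on device 1 + 2*k, i.e. devices
* 1, 3, and 5 for k = 0, 1, 2, matching the SUBCASEs below.)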
*/ - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/3, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/3_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (2,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("TaskSpaceCoordinate is out of bounds") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{4}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{4_n}}; std::optional result = get_machine_space_coordinate(task, mv, coord, ms); std::optional correct = std::nullopt; @@ -112,52 +112,52 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
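* (Worked arithmetic for this case: the inter-node dimension advances the
* node index in stride-1 steps from start node 1, and the intra-node
* dimension advances the device index in stride-2 steps from start
* device 2, so (i,j) maps to node 1 + i, device 2 + 2*j.)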
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, + MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/2, /*device_idx=*/2, DeviceType::GPU}; + /*node_idx=*/2_n, /*device_idx=*/2_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/2, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/2_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -179,52 +179,52 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/0, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/0, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/0_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/4, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/4_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/1, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/1_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -253,45 +253,45 @@ TEST_SUITE(FF_TEST_SUITE) { * grid is the machine space. 
*/ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}, - MachineViewDimension{stride_t{1}, + MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTRA_NODE}}}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/8, - /*num_gpus_per_node=*/8, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/8_n, + /*num_gpus_per_node=*/8_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("Task with TaskSpaceCoordinate = (0,1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n, 0_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/3, DeviceType::GPU}; + /*node_idx=*/0_n, /*device_idx=*/3_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/5, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/5_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); } SUBCASE("Task with TaskSpaceCoordinate = (1,1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n, 1_n}}; MachineSpaceCoordinate correct = MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/7, DeviceType::GPU}; + /*node_idx=*/1_n, /*device_idx=*/7_n, DeviceType::GPU}; MachineSpaceCoordinate result = get_machine_space_coordinate(task, mv, coord, ms).value(); CHECK(correct == result); @@ -319,23 +319,23 @@ TEST_SUITE(FF_TEST_SUITE) { * select */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/0, /*device_idx=*/1, DeviceType::GPU}, - {MachineViewDimension{stride_t{2}, + /*node_idx=*/0_n, /*device_idx=*/1_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set<device_id_t> correct = { - device_id_t{gpu_id_t{1}}, - device_id_t{gpu_id_t{3}}, - device_id_t{gpu_id_t{5}}, + device_id_t{gpu_id_t{1_n}}, + device_id_t{gpu_id_t{3_n}}, + device_id_t{gpu_id_t{5_n}}, }; std::unordered_set<device_id_t> result = get_device_ids(task, mv, ms); CHECK(result == correct); @@ -364,26 +364,26 @@ TEST_SUITE(FF_TEST_SUITE) { */ MachineSpecification ms = - MachineSpecification{/*num_nodes=*/3, - /*num_cpus_per_node=*/5, - /*num_gpus_per_node=*/5, +
MachineSpecification{/*num_nodes=*/3_n, + /*num_cpus_per_node=*/5_n, + /*num_gpus_per_node=*/5_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; MachineView mv = MachineView{ MachineSpaceCoordinate{ - /*node_idx=*/1, /*device_idx=*/2, DeviceType::GPU}, - {MachineViewDimension{stride_t{1}, + /*node_idx=*/1_n, /*device_idx=*/2_n, DeviceType::GPU}, + {MachineViewDimension{stride_t{1_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}}; std::unordered_set<device_id_t> correct = { - device_id_t{gpu_id_t{7}}, - device_id_t{gpu_id_t{9}}, - device_id_t{gpu_id_t{12}}, - device_id_t{gpu_id_t{14}}, + device_id_t{gpu_id_t{7_n}}, + device_id_t{gpu_id_t{9_n}}, + device_id_t{gpu_id_t{12_n}}, + device_id_t{gpu_id_t{14_n}}, }; std::unordered_set<device_id_t> result = get_device_ids(task, mv, ms); CHECK(result == correct); diff --git a/lib/pcg/test/src/pcg/operator_task_space.cc b/lib/pcg/test/src/pcg/operator_task_space.cc index 13198d9456..fa06af3635 100644 --- a/lib/pcg/test/src/pcg/operator_task_space.cc +++ b/lib/pcg/test/src/pcg/operator_task_space.cc @@ -18,13 +18,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; std::unordered_set<TaskSpaceCoordinate> correct = {{ - TaskSpaceCoordinate{{0, 0}}, - TaskSpaceCoordinate{{0, 1}}, - TaskSpaceCoordinate{{1, 0}}, - TaskSpaceCoordinate{{1, 1}}, + TaskSpaceCoordinate{{0_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 1_n}}, + TaskSpaceCoordinate{{1_n, 0_n}}, + TaskSpaceCoordinate{{1_n, 1_n}}, }}; std::unordered_set<TaskSpaceCoordinate> result = get_task_space_coordinates(task); @@ -32,13 +32,13 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{1, 2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{1_n, 2_n, 2_n}}; std::unordered_set<TaskSpaceCoordinate> correct = {{ - TaskSpaceCoordinate{{0, 0, 0}}, - TaskSpaceCoordinate{{0, 0, 1}}, - TaskSpaceCoordinate{{0, 1, 0}}, - TaskSpaceCoordinate{{0, 1, 1}}, + TaskSpaceCoordinate{{0_n, 0_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 0_n, 1_n}}, + TaskSpaceCoordinate{{0_n, 1_n, 0_n}}, + TaskSpaceCoordinate{{0_n, 1_n, 1_n}}, }}; std::unordered_set<TaskSpaceCoordinate> result = get_task_space_coordinates(task); @@ -48,17 +48,17 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_task_space_maximum_coordinate") { SUBCASE("OperatorTaskSpace has 2 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n}}; - TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2, 1}}; + TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); CHECK(correct == result); } SUBCASE("OperatorTaskSpace has 3 dimensions") { - OperatorTaskSpace task = OperatorTaskSpace{{3, 2, 4}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n, 2_n, 4_n}}; - TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2, 1, 3}}; + TaskSpaceCoordinate correct = TaskSpaceCoordinate{{2_n, 1_n, 3_n}}; TaskSpaceCoordinate result = get_task_space_maximum_coordinate(task); CHECK(correct == result); } diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc index dd8308561f..979a96d204 100644 ---
a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph.cc @@ -45,12 +45,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -84,7 +84,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape, CreateGrad::YES); b.dense(input, - /*outDim=*/14, + /*outDim=*/14_n, /*activation=*/Activation::RELU, /*use_bias=*/true, /*data_type=*/DataType::FLOAT, @@ -110,12 +110,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape tensor_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -186,12 +186,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -246,7 +246,7 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelComputationGraph pcg = empty_parallel_computation_graph(); LinearAttrs op_attrs = LinearAttrs{ - /*out_channels=*/14, + /*out_channels=*/14_n, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, /*activation=*/Activation::RELU, @@ -293,7 +293,7 @@ TEST_SUITE(FF_TEST_SUITE) { {}, {raw_projection_tensor_attrs}); - ReplicateAttrs replicate_attrs = ReplicateAttrs{/*degree=*/2}; + ReplicateAttrs replicate_attrs = ReplicateAttrs{/*degree=*/2_n}; ParallelLayerAttrs replicate_layer_attrs = ParallelLayerAttrs{ PCGOperatorAttrs{replicate_attrs}, std::nullopt, @@ -346,12 +346,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape tensor_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{12, 2}, - ShardParallelDim{10, 1}, + ShardParallelDim{12_n, 2_n}, + ShardParallelDim{10_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{2}, - DiscardCopyDegree{2}, + SumDegree{2_n}, + DiscardCopyDegree{2_n}, }, }, DataType::FLOAT, diff --git a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc index 3f66b33b6e..ef3173d744 100644 --- a/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc +++ b/lib/pcg/test/src/pcg/parallel_computation_graph/parallel_computation_graph_builder.cc @@ -26,18 +26,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::add") { ParallelComputationGraphBuilder b; - ShardParallelDim d1 = ShardParallelDim{10, 2}; - ShardParallelDim d2 = ShardParallelDim{15, 3}; + ShardParallelDim d1 = ShardParallelDim{10_n, 2_n}; + ShardParallelDim d2 = ShardParallelDim{15_n, 3_n}; ParallelTensorShape lhs_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{15, 3}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{15_n, 3_n}, }, ReplicaParallelDimSet{ - 
SumDegree{2}, - DiscardCopyDegree{1}, + SumDegree{2_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -76,18 +76,18 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::batch_matmul") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{4, 2}; + ShardParallelDim batch_dim = ShardParallelDim{4_n, 2_n}; ParallelTensorShape a_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ batch_dim, - ShardParallelDim{10, 1}, - ShardParallelDim{15, 3}, + ShardParallelDim{10_n, 1_n}, + ShardParallelDim{15_n, 3_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -97,12 +97,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorDims{ FFOrdered{ batch_dim, - ShardParallelDim{15, 3}, - ShardParallelDim{12, 1}, + ShardParallelDim{15_n, 3_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -130,7 +130,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("op attrs") { PCGOperatorAttrs result = get_parallel_layer_attrs(b.pcg, layer).op_attrs; - PCGOperatorAttrs correct = PCGOperatorAttrs{BatchMatmulAttrs{-1, -1}}; + PCGOperatorAttrs correct = + PCGOperatorAttrs{BatchMatmulAttrs{std::nullopt, std::nullopt}}; CHECK(result == correct); } } @@ -141,12 +142,12 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{12, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{12_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{3}, - DiscardCopyDegree{1}, + SumDegree{3_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -179,28 +180,28 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::conv2d") { ParallelComputationGraphBuilder b; - size_t batch_size = 2; + nonnegative_int batch_size = 2_n; TensorShape unpar_input_shape = TensorShape{ - TensorDims{FFOrdered{batch_size, 3, 10, 10}}, + TensorDims{FFOrdered{batch_size, 3_n, 10_n, 10_n}}, DataType::FLOAT, }; - ParallelTensorShape input_shape = - lift_to_parallel_with_degrees(unpar_input_shape, - SumDegree{1}, - DiscardCopyDegree{1}, - FFOrdered{2, 1, 1, 1}); + ParallelTensorShape input_shape = lift_to_parallel_with_degrees( + unpar_input_shape, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + FFOrdered{2_n, 1_n, 1_n, 1_n}); parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - int outChannels = 6; - int kernelH = 5; - int kernelW = 4; - int strideH = 3; - int strideW = 2; - int paddingH = 1; - int paddingW = 0; + nonnegative_int outChannels = 6_n; + nonnegative_int kernelH = 5_n; + nonnegative_int kernelW = 4_n; + nonnegative_int strideH = 3_n; + nonnegative_int strideW = 2_n; + nonnegative_int paddingH = 1_n; + nonnegative_int paddingW = 0_n; parallel_tensor_guid_t output = b.conv2d(input, /*outChannels=*/outChannels, /*kernelH=*/kernelH, @@ -254,7 +255,7 @@ TEST_SUITE(FF_TEST_SUITE) { strideW, paddingH, paddingW, - /*groups=*/1, + /*groups=*/1_n, /*activation=*/std::nullopt, /*use_bias=*/true, }; @@ -301,18 +302,18 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{10, 2}, - ShardParallelDim{16, 1}, + ShardParallelDim{10_n, 2_n}, + ShardParallelDim{16_n, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; - 
int outDim = 14; + nonnegative_int outDim = 14_n; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.dense(input, @@ -341,8 +342,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::embedding") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{12, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{12_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -350,8 +351,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::INT32, @@ -359,8 +360,8 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = b.embedding(input, - /*num_entries=*/32, - /*outDim=*/8, + /*num_entries=*/32_n, + /*outDim=*/8_n, AggregateOp::SUM, DataType::FLOAT); parallel_layer_guid_t layer = get_source_layer(output); @@ -384,9 +385,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::multihead_attention") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{12, 2}; - ShardParallelDim sequence_dim = ShardParallelDim{16, 1}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{12_n, 2_n}; + ShardParallelDim sequence_dim = ShardParallelDim{16_n, 1_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape query_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ @@ -395,8 +396,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -405,8 +406,8 @@ TEST_SUITE(FF_TEST_SUITE) { ParallelTensorShape key_shape = query_shape; ParallelTensorShape value_shape = query_shape; - int embed_dim = 8; - int num_heads = 6; + nonnegative_int embed_dim = 8_n; + nonnegative_int num_heads = 6_n; parallel_tensor_guid_t query = b.create_input_tensor(query_shape); parallel_tensor_guid_t key = b.create_input_tensor(key_shape); @@ -435,8 +436,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::relu") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 3}; - ShardParallelDim feature_dim = ShardParallelDim{32, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 3_n}; + ShardParallelDim feature_dim = ShardParallelDim{32_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -445,8 +446,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -474,8 +475,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_partition") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -484,8 +485,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - 
DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -493,7 +494,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2); + b.parallel_partition(input, ff_dim_t{nonnegative_int{0}}, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -514,8 +515,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_combine") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -524,8 +525,8 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -533,7 +534,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t input = b.create_input_tensor(input_shape); parallel_tensor_guid_t output = - b.parallel_combine(input, ff_dim_t{nonnegative_int{0}}, 2); + b.parallel_combine(input, ff_dim_t{nonnegative_int{0}}, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -554,8 +555,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_replicate") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -564,15 +565,15 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_replicate(input, 2); + parallel_tensor_guid_t output = b.parallel_replicate(input, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { @@ -593,8 +594,8 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("ParallelComputationGraphBuilder::parallel_reduce") { ParallelComputationGraphBuilder b; - ShardParallelDim batch_dim = ShardParallelDim{18, 2}; - ShardParallelDim feature_dim = ShardParallelDim{10, 1}; + ShardParallelDim batch_dim = ShardParallelDim{18_n, 2_n}; + ShardParallelDim feature_dim = ShardParallelDim{10_n, 1_n}; ParallelTensorShape input_shape = ParallelTensorShape{ ParallelTensorDims{ @@ -603,15 +604,15 @@ TEST_SUITE(FF_TEST_SUITE) { feature_dim, }, ReplicaParallelDimSet{ - SumDegree{4}, - DiscardCopyDegree{1}, + SumDegree{4_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }; parallel_tensor_guid_t input = b.create_input_tensor(input_shape); - parallel_tensor_guid_t output = b.parallel_reduce(input, 2); + parallel_tensor_guid_t output = b.parallel_reduce(input, 2_n); parallel_layer_guid_t layer = get_source_layer(output); SUBCASE("incoming") { diff --git a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc index 8383754aa2..71c4d1b1d0 100644 --- a/lib/pcg/test/src/pcg/start_invariant_machine_view.cc +++ 
b/lib/pcg/test/src/pcg/start_invariant_machine_view.cc @@ -8,15 +8,15 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - utility functions") { StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}}, DeviceType::GPU}; SUBCASE("num_dims") { - int result = num_dims(simv); - int correct = 2; + nonnegative_int result = num_dims(simv); + nonnegative_int correct = 2_n; CHECK(result == correct); } @@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("get_strides") { std::vector<stride_t> result = get_strides(simv); - std::vector<stride_t> correct = {stride_t{2}, stride_t{2}}; + std::vector<stride_t> correct = {stride_t{2_n}, stride_t{2_n}}; CHECK(result == correct); } @@ -43,11 +43,11 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("StartInvariantMachineView - conversions") { MachineSpaceCoordinate start = - MachineSpaceCoordinate{1, 2, DeviceType::GPU}; + MachineSpaceCoordinate{1_n, 2_n, DeviceType::GPU}; std::vector<MachineViewDimension> dimensions = { - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{3}, + MachineViewDimension{stride_t{3_n}, MachineSpecificationDimension::INTRA_NODE}}; MachineView mv = MachineView{start, dimensions}; @@ -94,21 +94,21 @@ TEST_SUITE(FF_TEST_SUITE) { * | (0,) | | (1,) | | (2,) | | * +-------+-------+-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{3}}; + OperatorTaskSpace task = OperatorTaskSpace{{3_n}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{2}, + {MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/1, - /*num_cpus_per_node=*/6, - /*num_gpus_per_node=*/6, - /*inter_node_bandwidth=*/0, - /*intra_node_bandwidth=*/0}; + MachineSpecification{/*num_nodes=*/1_n, + /*num_cpus_per_node=*/6_n, + /*num_gpus_per_node=*/6_n, + /*inter_node_bandwidth=*/0.0, + /*intra_node_bandwidth=*/0.0}; SUBCASE("get_machine_space_offset") { SUBCASE("Task with TaskSpaceCoordinate = (0,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -117,7 +117,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 2, DeviceType::GPU}; MachineSpaceOffset result = @@ -126,7 +126,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (2,)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{2_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 4, DeviceType::GPU}; MachineSpaceOffset result = @@ -162,23 +162,23 @@ TEST_SUITE(FF_TEST_SUITE) { * +-------+-------+-------+-------+ */ - OperatorTaskSpace task = OperatorTaskSpace{{2, 2}}; + OperatorTaskSpace task = OperatorTaskSpace{{2_n, 2_n}}; StartInvariantMachineView simv = StartInvariantMachineView{ - {MachineViewDimension{stride_t{1},
MachineSpecificationDimension::INTER_NODE}, - MachineViewDimension{stride_t{2}, + MachineViewDimension{stride_t{2_n}, MachineSpecificationDimension::INTRA_NODE}}, DeviceType::GPU}; MachineSpecification ms = - MachineSpecification{/*num_nodes=*/2, - /*num_cpus_per_node=*/4, - /*num_gpus_per_node=*/4, + MachineSpecification{/*num_nodes=*/2_n, + /*num_cpus_per_node=*/4_n, + /*num_gpus_per_node=*/4_n, /*inter_node_bandwidth=*/0, /*intra_node_bandwidth=*/0}; SUBCASE("get_machine_space_offset") { SUBCASE("Task with TaskSpaceCoordinate = (0,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -187,7 +187,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (0,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{0_n, 1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{0, 2, DeviceType::GPU}; MachineSpaceOffset result = @@ -196,7 +196,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,0)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 0}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 0_n}}; MachineSpaceOffset correct = MachineSpaceOffset{1, 0, DeviceType::GPU}; MachineSpaceOffset result = @@ -205,7 +205,7 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("Task with TaskSpaceCoordinate = (1,1)") { - TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1, 1}}; + TaskSpaceCoordinate coord = TaskSpaceCoordinate{{1_n, 1_n}}; MachineSpaceOffset correct = MachineSpaceOffset{1, 2, DeviceType::GPU}; MachineSpaceOffset result = diff --git a/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h new file mode 100644 index 0000000000..92f7bb1c03 --- /dev/null +++ b/lib/substitutions/include/substitutions/apply_substitution/apply_substitution.h @@ -0,0 +1,31 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_APPLY_SUBSTITUTION_H + +#include "substitutions/pcg_pattern_match.dtg.h" +#include "substitutions/sub_parallel_computation_graph.dtg.h" +#include "substitutions/substitution.dtg.h" + +namespace FlexFlow { + +/** + * @brief Applies \p substitution to \p sub_pcg at the location specified by \p + * match, returning the resulting SubParallelComputationGraph + * + * @param sub_pcg + * @param substitution + * @param match The location at which to apply substitution. This location in + * sub_pcg should match substitution's PCGPattern. Likely created by running + * FlexFlow::find_pattern_matches(PCGPattern const &, + * SubParallelComputationGraph const &). 
+ * @return SubParallelComputationGraph A sub-PCG similar to sub_pcg, but with + * the subgraph specified by match replaced with the result of the output + * expression of substitution + */ +SubParallelComputationGraph + apply_substitution(SubParallelComputationGraph const &sub_pcg, + Substitution const &substitution, + PCGPatternMatch const &match); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h b/lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h similarity index 76% rename from lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h rename to lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h index a0461b075b..74089c5aab 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/evaluate_substitution_output.h +++ b/lib/substitutions/include/substitutions/apply_substitution/evaluate_substitution_output.h @@ -1,10 +1,10 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_EVALUATE_SUBSTITUTION_OUTPUT_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_EVALUATE_SUBSTITUTION_OUTPUT_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_EVALUATE_SUBSTITUTION_OUTPUT_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_EVALUATE_SUBSTITUTION_OUTPUT_H +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.dtg.h" #include "substitutions/pcg_pattern_match.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/substitution.dtg.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.dtg.h" #include namespace FlexFlow { diff --git a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h similarity index 62% rename from lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h rename to lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h index 603cb670bf..cd7e782909 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h +++ b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h @@ -1,11 +1,11 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_OUTPUT_EXPR_TO_RESULT_SUB_PCG_MAPPING_H #include "pcg/parallel_computation_graph/parallel_tensor_guid_t.dtg.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.dtg.h" #include "substitutions/output_graph/output_graph_expr.dtg.h" #include "substitutions/output_graph/output_graph_expr_node_output.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.dtg.h" namespace FlexFlow { diff 
--git a/lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.struct.toml b/lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.struct.toml similarity index 100% rename from lib/substitutions/include/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.struct.toml rename to lib/substitutions/include/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.struct.toml diff --git a/lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h b/lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h similarity index 85% rename from lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h rename to lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h index b7ce13db0e..c3f9eff349 100644 --- a/lib/substitutions/include/substitutions/substitution_internal/perform_shape_inference.h +++ b/lib/substitutions/include/substitutions/apply_substitution/perform_shape_inference.h @@ -1,5 +1,5 @@ -#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_PERFORM_SHAPE_INFERENCE_H -#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_INTERNAL_PERFORM_SHAPE_INFERENCE_H +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_PERFORM_SHAPE_INFERENCE_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_APPLY_SUBSTITUTION_PERFORM_SHAPE_INFERENCE_H #include "op-attrs/parallel_tensor_shape.dtg.h" #include "pcg/parallel_computation_graph/parallel_layer_attrs.dtg.h" diff --git a/lib/substitutions/include/substitutions/constraint_type.enum.toml b/lib/substitutions/include/substitutions/constraint_type.enum.toml index 8646ba1c83..f366a17725 100644 --- a/lib/substitutions/include/substitutions/constraint_type.enum.toml +++ b/lib/substitutions/include/substitutions/constraint_type.enum.toml @@ -9,3 +9,6 @@ features = [ [[values]] name = "EQUAL" + +[[values]] +name = "DIVISIBLE_BY" diff --git a/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h b/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h new file mode 100644 index 0000000000..2b31dada04 --- /dev/null +++ b/lib/substitutions/include/substitutions/operator_pattern/get_attribute_map.h @@ -0,0 +1,15 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_GET_ATTRIBUTE_MAP_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_GET_ATTRIBUTE_MAP_H + +#include "op-attrs/pcg_operator_attrs.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_value.dtg.h" + +namespace FlexFlow { + +std::unordered_map<OperatorAttributeKey, OperatorAttributeValue> + get_attribute_map(PCGOperatorAttrs const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h index 4affdd697f..c2c11fac51 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_constraint.h @@ -9,6 +9,8 @@ OperatorAttributeConstraint op_type_equals_constraint(OperatorType); OperatorAttributeConstraint op_attr_key_equals(OperatorAttributeKey, OperatorAttributeValue const &);
+OperatorAttributeConstraint + op_attr_key_divisible_by(OperatorAttributeKey, nonnegative_int denominator); OperatorAttributeConstraint make_equals_constraint(OperatorAttributeExpr const &, OperatorAttributeValue const &); diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml index eb758ea4fc..af3666d46f 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.enum.toml @@ -56,6 +56,7 @@ values = [ { name = "SHOULD_BROADCAST_RHS" }, { name = "DIM" }, { name = "AFFINE" }, + { name = "ELEMENTWISE_AFFINE" }, { name = "MOMENTUM" }, { name = "REGULARIZER" }, { name = "SHAPE" }, diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h new file mode 100644 index 0000000000..d46403a847 --- /dev/null +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_key.h @@ -0,0 +1,12 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_OPERATOR_ATTRIBUTE_KEY_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OPERATOR_PATTERN_OPERATOR_ATTRIBUTE_KEY_H + +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" + +namespace FlexFlow { + +std::vector<OperatorAttributeKey> all_operator_attribute_keys(); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml index bceff393d2..4ed226907e 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_list_access.struct.toml @@ -10,7 +10,8 @@ features = [ ] includes = [ - "substitutions/operator_pattern/operator_attribute_key.dtg.h" + "substitutions/operator_pattern/operator_attribute_key.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "attribute_key" type = "::FlexFlow::OperatorAttributeKey" [[fields]] name = "index" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml index 8fe4a9494d..3312b292a0 100644 --- a/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/operator_pattern/operator_attribute_value.variant.toml @@ -7,7 +7,6 @@ features = [ "fmt", "json", ] -explicit_constructors = false includes = [ "", "op-attrs/activation.dtg.h", "op-attrs/aggregate_op.dtg.h", "op-attrs/ff_dim_t.dtg.h", "op-attrs/ff_dim_t.h", "op-attrs/regularizer_attrs.dtg.h", "op-attrs/tensor_shape.dtg.h", "op-attrs/datatype.dtg.h", "", "utils/nonnegative_int/nonnegative_int.h", ] src_includes = [ "utils/fmt/optional.h", "utils/fmt/vector.h", "utils/hash/vector.h", "utils/hash/tuple.h", ] [[values]] -type = "int" +type = "::FlexFlow::nonnegative_int" [[values]] type = "bool" [[values]] type = "float" [[values]] -type = "std::vector" +type = "std::optional" + +[[values]] +type = "std::vector<::FlexFlow::nonnegative_int>" [[values]] type = "std::vector<::FlexFlow::ff_dim_t>" [[values]] type = "std::optional<::FlexFlow::Activation>" [[values]] type =
"::FlexFlow::ff_dim_t" [[values]] -type = "size_t" - -[[values]] -type = "::FlexFlow::AggregateOp" +type = "std::optional<::FlexFlow::AggregateOp>" [[values]] type = "std::optional<::FlexFlow::RegularizerAttrs>" diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h b/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h index e550767292..8c047fc44d 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr.h @@ -2,14 +2,19 @@ #define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_H #include "substitutions/output_graph/output_graph_expr.dtg.h" +#include "substitutions/output_graph/output_graph_expr_input.dtg.h" #include "substitutions/output_graph/output_graph_expr_node.dtg.h" #include "substitutions/output_graph/output_graph_expr_node_output.dtg.h" namespace FlexFlow { +std::unordered_set get_nodes(OutputGraphExpr const &); + std::vector get_node_outputs(OutputGraphExpr const &, OutputGraphExprNode const &); +std::unordered_set get_inputs(OutputGraphExpr const &); + } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h new file mode 100644 index 0000000000..e172edb025 --- /dev/null +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.h @@ -0,0 +1,16 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_VALUE_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_OUTPUT_GRAPH_OUTPUT_GRAPH_EXPR_VALUE_H + +#include "substitutions/output_graph/output_graph_expr_value.dtg.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" + +namespace FlexFlow { + +OpenDataflowValue raw_open_dataflow_value_from_output_graph_expr_value( + OutputGraphExprValue const &); +OutputGraphExprValue output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml new file mode 100644 index 0000000000..641250e1f0 --- /dev/null +++ b/lib/substitutions/include/substitutions/output_graph/output_graph_expr_value.variant.toml @@ -0,0 +1,19 @@ +namespace = "FlexFlow" +name = "OutputGraphExprValue" +features = [ + "eq", + "ord", + "hash", + "fmt", +] + +includes = [ + "substitutions/output_graph/output_graph_expr_input.dtg.h", + "substitutions/output_graph/output_graph_expr_node_output.dtg.h", +] + +[[values]] +type = "::FlexFlow::OutputGraphExprNodeOutput" + +[[values]] +type = "::FlexFlow::OutputGraphExprInput" diff --git a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h index 60540c0711..0921569d62 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h +++ b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.h @@ -20,6 +20,9 @@ std::pair set_attr_to_constant(OperatorAttributeKey key, OperatorAttributeValue const &value); +std::pair + set_op_type_attr(OperatorType); + } // namespace FlexFlow #endif diff --git 
a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml index d712ea96f7..483f27791a 100644 --- a/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml +++ b/lib/substitutions/include/substitutions/output_graph/output_operator_attrs_assignment.struct.toml @@ -18,11 +18,12 @@ includes = [ src_includes = [ "utils/hash/unordered_map.h", "utils/fmt/unordered_map.h", + "utils/fmt/optional.h", ] -# [[fields]] -# name = "clone_operator" -# type = "std::optional" +[[fields]] +name = "template_operator" +type = "std::optional<::FlexFlow::PatternNode>" # NOTE(@wmdi): Not sure if it aligns with other design. Or alternatively we can # define the assignment for each operator type. diff --git a/lib/substitutions/include/substitutions/pcg_pattern.h b/lib/substitutions/include/substitutions/pcg_pattern.h index 7342e8169f..f0962b15c2 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern.h +++ b/lib/substitutions/include/substitutions/pcg_pattern.h @@ -10,6 +10,8 @@ namespace FlexFlow { +std::unordered_set<PatternNode> get_nodes(PCGPattern const &); + /** * @brief Find all locations in \p pcg that match \p pattern */ diff --git a/lib/substitutions/include/substitutions/pcg_pattern_match.h b/lib/substitutions/include/substitutions/pcg_pattern_match.h index 388377d70c..b946173422 100644 --- a/lib/substitutions/include/substitutions/pcg_pattern_match.h +++ b/lib/substitutions/include/substitutions/pcg_pattern_match.h @@ -6,7 +6,7 @@ #include "substitutions/pcg_pattern_match.dtg.h" #include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/unlabelled/pattern_node_output.dtg.h" -#include "substitutions/unlabelled/unlabelled_dataflow_graph_pattern_match.dtg.h" +#include "substitutions/unlabelled/unlabelled_dataflow_graph_pattern_match.h" namespace FlexFlow { @@ -17,7 +17,7 @@ bidict SubParallelComputationGraph const &spcg); UnlabelledDataflowGraphPatternMatch - get_unlabelled_pattern_match(PCGPatternMatch const &); + get_unlabelled_pattern_match(PCGPatternMatch const &match); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h b/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h index 15cbb6127c..c0544abe1b 100644 --- a/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h +++ b/lib/substitutions/include/substitutions/sub_parallel_computation_graph_edge.h @@ -12,7 +12,7 @@ namespace FlexFlow { SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_dst(parallel_tensor_guid_t const &tensor, parallel_layer_guid_t const &layer, - int input_idx); + nonnegative_int input_idx); SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_use(open_parallel_tensor_guid_t const &tensor, parallel_tensor_use_t const &use); diff --git a/lib/substitutions/include/substitutions/substitution.h b/lib/substitutions/include/substitutions/substitution.h index 7b4e5e6912..7dc4e714ab 100644 --- a/lib/substitutions/include/substitutions/substitution.h +++ b/lib/substitutions/include/substitutions/substitution.h @@ -1,12 +1,14 @@ #ifndef _FLEXFLOW_SUBSTITUTIONS_SUBSTITUTION_H #define _FLEXFLOW_SUBSTITUTIONS_SUBSTITUTION_H -#include "substitutions/pcg_pattern_match.dtg.h" -#include "substitutions/sub_parallel_computation_graph.dtg.h" #include "substitutions/substitution.dtg.h" namespace FlexFlow { +bool
is_isomorphic_to(Substitution const &, Substitution const &); + +std::string as_dot(Substitution const &); + /** * @brief Checks that all internal invariants of the given substitution hold * @@ -22,25 +24,6 @@ namespace FlexFlow { */ bool is_valid_substitution(Substitution const &); -/** - * @brief Applies \p substitution to \p sub_pcg at the location specified by \p - * match, returning the resulting SubParallelComputationGraph - * - * @param sub_pcg - * @param substitution - * @param match The location at which to apply substitution. This location in - * sub_pcg should match substitution's PCGPattern. Likely created by running - * FlexFlow::find_pattern_matches(PCGPattern const &, - * SubParallelComputationGraph const &). - * @return SubParallelComputationGraph A sub-PCG similar to sub_pcg, but with - * the subgraph specified by match replaced with the result of the output - * expression of substitution - */ -SubParallelComputationGraph - apply_substitution(SubParallelComputationGraph const &sub_pcg, - Substitution const &substitution, - PCGPatternMatch const &match); - } // namespace FlexFlow #endif diff --git a/lib/substitutions/include/substitutions/substitution_builder.h b/lib/substitutions/include/substitutions/substitution_builder.h new file mode 100644 index 0000000000..1548b2269b --- /dev/null +++ b/lib/substitutions/include/substitutions/substitution_builder.h @@ -0,0 +1,49 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_BUILDER_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_SUBSTITUTION_BUILDER_H + +#include "substitutions/output_graph/output_graph_expr_value.dtg.h" +#include "substitutions/substitution.dtg.h" +#include "substitutions/unlabelled/pattern_value.dtg.h" +#include + +namespace FlexFlow { + +struct SubstitutionBuilder { +public: + SubstitutionBuilder(); + + std::pair + add_input(TensorAttributePattern const &, + std::optional<std::string> const &name = std::nullopt); + void equate_outputs(PatternValue const &, OutputGraphExprValue const &); + + std::vector<PatternValue> add_pattern_node( + OperatorAttributePattern const &node_pattern, + std::vector<PatternValue> const &inputs, + std::vector<TensorAttributePattern> const &output_patterns, + std::optional<std::string> const &name = std::nullopt); + + std::vector<OutputGraphExprValue> + add_output_graph_node(OutputOperatorAttrsAssignment const &node_expr, + std::vector<OutputGraphExprValue> const &inputs, + nonnegative_int num_outputs); + + PatternNode pattern_node_named(std::string const &) const; + PatternInput pattern_input_named(std::string const &) const; + + Substitution get_substitution() const; + +private: + LabelledOpenDataflowGraph + pattern_g; + LabelledOpenDataflowGraph + output_g; + bidict input_mapping; + bidict pattern_node_names; + bidict pattern_input_names; + bidict output_mapping; +}; + +} // namespace FlexFlow + +#endif diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml index a57dd25845..71e11a12d5 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_list_access.struct.toml @@ -10,7 +10,8 @@ features = [ ] includes = [ - "substitutions/tensor_pattern/tensor_attribute_key.dtg.h" + "substitutions/tensor_pattern/tensor_attribute_key.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] name = "attribute_key" type = "::FlexFlow::TensorAttributeKey" [[fields]] name = "index" -type = "int"
+type = "::FlexFlow::nonnegative_int" diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h index 5b7ebf4ef8..c1e28f8d8f 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_pattern.h @@ -2,10 +2,13 @@ #define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_TENSOR_PATTERN_TENSOR_ATTRIBUTE_PATTERN_H #include "substitutions/tensor_pattern/tensor_attribute_pattern.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { TensorAttributePattern tensor_attribute_pattern_match_all(); +TensorAttributePattern + tensor_attr_pattern_require_num_dims(nonnegative_int num_dims); } // namespace FlexFlow diff --git a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml index 46b703a7fc..d2b931fb2d 100644 --- a/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml +++ b/lib/substitutions/include/substitutions/tensor_pattern/tensor_attribute_value.variant.toml @@ -12,10 +12,11 @@ includes = [ "", "utils/hash/vector.h", "utils/fmt/vector.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[values]] -type = "size_t" +type = "::FlexFlow::nonnegative_int" [[values]] -type = "std::vector" +type = "std::vector<::FlexFlow::nonnegative_int>" diff --git a/lib/substitutions/include/substitutions/unity_substitution_set.h b/lib/substitutions/include/substitutions/unity_substitution_set.h new file mode 100644 index 0000000000..183f76ac8a --- /dev/null +++ b/lib/substitutions/include/substitutions/unity_substitution_set.h @@ -0,0 +1,47 @@ +#ifndef _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_UNITY_SUBSTITUTION_SET_H +#define _FLEXFLOW_LIB_SUBSTITUTIONS_INCLUDE_SUBSTITUTIONS_UNITY_SUBSTITUTION_SET_H + +#include "pcg/machine_specification.dtg.h" +#include "substitutions/substitution.dtg.h" +#include "utils/fmt/vector.h" + +namespace FlexFlow { + +std::vector + get_substitution_set(MachineSpecification const &resources); + +Substitution create_combine_inception(nonnegative_int num_convs, + nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_combine_concat(nonnegative_int num_inputs, + nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_replicate_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + bool use_bias); +Substitution create_partition_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + Activation activation, + bool use_bias); +Substitution create_partition_conv2d_combine(nonnegative_int num_dims, + nonnegative_int degree); +Substitution create_partition_attention_combine(nonnegative_int num_heads, + nonnegative_int degree); +Substitution create_replicate_attention_reduce(nonnegative_int num_heads, + nonnegative_int degree); +Substitution create_partition_add_combine(ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_relu_combine(ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_concat_combine(nonnegative_int num_inputs, + ff_dim_t concat_dim, + ff_dim_t parallel_dim, + nonnegative_int degree); +Substitution create_partition_softmax_combine(ff_dim_t softmax_dim, + ff_dim_t partition_dim, + nonnegative_int degree); +Substitution 
+    create_fuse_linear_activation(Activation activation);
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h b/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h
index 7a7c9c3c28..8c58cb991c 100644
--- a/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h
+++ b/lib/substitutions/include/substitutions/unlabelled/input_pattern_edge.h
@@ -9,7 +9,7 @@ namespace FlexFlow {
 
 PatternInput get_src_input(InputPatternEdge const &);
 PatternNode get_dst_node(InputPatternEdge const &);
-int get_dst_idx(InputPatternEdge const &);
+nonnegative_int get_dst_idx(InputPatternEdge const &);
 
 } // namespace FlexFlow
diff --git a/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h b/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h
index 14c0b9ddcc..ce30b18f55 100644
--- a/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h
+++ b/lib/substitutions/include/substitutions/unlabelled/pattern_matching.h
@@ -9,13 +9,10 @@ namespace FlexFlow {
 
-// OpenDataflowGraphView apply_match(UnlabelledGraphPattern const &pattern,
-//                                   UnlabelledDataflowGraphPatternMatch const
-//                                   &match);
-
 OpenDataflowSubgraphResult
     subgraph_matched(OpenDataflowGraphView const &graph,
                      UnlabelledDataflowGraphPatternMatch const &match);
+
 bool pattern_matches_subgraph_under(
     UnlabelledGraphPattern const &pattern,
     OpenDataflowGraphView const &subgraph,
@@ -30,11 +27,6 @@ bool unlabelled_pattern_does_match(
     UnlabelledDataflowGraphPatternMatch const &match,
     MatchAdditionalCriterion const &additional_criterion);
 
-std::vector<UnlabelledDataflowGraphPatternMatch>
-    find_pattern_matches(UnlabelledGraphPattern const &pattern,
-                         OpenDataflowGraphView const &graph,
-                         MatchAdditionalCriterion const &additional_criterion);
-
 } // namespace FlexFlow
 
 #endif
diff --git a/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h b/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h
index 3dd5b262c9..67f513b8b1 100644
--- a/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h
+++ b/lib/substitutions/include/substitutions/unlabelled/pattern_node_output.h
@@ -6,7 +6,7 @@ namespace FlexFlow {
 
 PatternNode get_src_node(PatternNodeOutput const &);
-int get_idx(PatternNodeOutput const &);
+nonnegative_int get_idx(PatternNodeOutput const &);
 
 } // namespace FlexFlow
diff --git a/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h b/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h
index 7316098fb5..817e829709 100644
--- a/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h
+++ b/lib/substitutions/include/substitutions/unlabelled/standard_pattern_edge.h
@@ -8,8 +8,8 @@ namespace FlexFlow {
 
 PatternNode get_src_node(StandardPatternEdge const &);
 PatternNode get_dst_node(StandardPatternEdge const &);
-int get_src_idx(StandardPatternEdge const &);
-int get_dst_idx(StandardPatternEdge const &);
+nonnegative_int get_src_idx(StandardPatternEdge const &);
+nonnegative_int get_dst_idx(StandardPatternEdge const &);
 
 } // namespace FlexFlow
diff --git a/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc
new file mode 100644
index 0000000000..61bfe15d7b
--- /dev/null
+++ b/lib/substitutions/src/substitutions/apply_substitution/apply_substitution.cc
@@ -0,0 +1,165 @@
+#include "substitutions/apply_substitution/apply_substitution.h"
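+// Usage sketch (editorial illustration, not part of the original patch;
+// variable names are hypothetical): given a SubParallelComputationGraph
+// `spcg`, a Substitution `sub`, and a PCGPatternMatch `match` locating
+// `sub`'s pattern inside `spcg`, the rewritten graph is obtained as:
+//
+//   SubParallelComputationGraph rewritten =
+//       apply_substitution(spcg, sub, match);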
+#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" +#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h" +#include "substitutions/open_parallel_tensor_guid_t.h" +#include "substitutions/pcg_pattern_match.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/sub_parallel_computation_graph_data.dtg.h" +#include "substitutions/sub_parallel_computation_graph_edge.h" +#include "utils/containers/keys.h" +#include "utils/containers/merge_maps.h" +#include "utils/containers/restrict_keys.h" +#include "utils/containers/set_minus.h" +#include "utils/containers/values.h" + +namespace FlexFlow { + +SubParallelComputationGraph + apply_substitution(SubParallelComputationGraph const &spcg, + Substitution const &sub, + PCGPatternMatch const &match) { + auto substitution_output_result = + evaluate_substitution_output(spcg, sub, match); + SubParallelComputationGraph substitution_output_graph = + substitution_output_result.first; + OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = + substitution_output_result.second; + + SubParallelComputationGraphData output_graph_data = + get_sub_pcg_data(substitution_output_graph); + SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); + + std::unordered_set pre_nodes = + keys(pre_data.node_data); + std::unordered_set matched_nodes = + unordered_set_of(values(match.node_assignment)); + std::unordered_set post_nodes_from_original_graph = + set_minus(pre_nodes, matched_nodes); + + std::unordered_map post_node_data = + [&] { + std::unordered_map + post_node_data_from_orig = restrict_keys( + pre_data.node_data, post_nodes_from_original_graph); + std::unordered_map + post_node_data_from_sub = output_graph_data.node_data; + + return merge_disjoint_maps(post_node_data_from_orig, + post_node_data_from_sub); + }(); + + std::unordered_set post_edges = [&] { + std::unordered_set post_edges_from_orig = + filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { + if (e.raw_edge.has()) { + return true; + } else { + DataflowEdge dfe = e.raw_edge.get(); + parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; + parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; + return !(contains(matched_nodes, src) || + contains(matched_nodes, dst)); + } + }); + + std::unordered_set post_edges_from_sub = + filter(output_graph_data.edges, + [&](SubParallelComputationGraphEdge const &e) { + return !e.raw_edge.has(); + }); + + bidict + output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( + match, sub.pcg_pattern, spcg); + bidict + output_post_outexpr_mapping = get_output_graph_expr_output_mapping( + output_expr_to_result_sub_pcg_mapping, + sub.output_graph_expr, + substitution_output_graph); + + std::unordered_set incoming_to_sub_edges; + for (auto const &[pattern_input, base_graph_tensor] : + match.input_assignment) { + OutputGraphExprInput output_expr_input = + sub.inputs_mapping.at_l(pattern_input); + input_parallel_tensor_guid_t output_graph_input = + output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( + output_expr_input); + std::unordered_set uses = get_parallel_tensor_uses( + substitution_output_graph, + open_parallel_tensor_guid_from_input(output_graph_input)); + for (parallel_tensor_use_t const &use : uses) { + SubParallelComputationGraphEdge new_edge = + 
subpcg_edge_from_tensor_and_use(base_graph_tensor, use); + incoming_to_sub_edges.insert(new_edge); + } + } + + std::unordered_set outgoing_from_sub_edges; + for (ParallelComputationGraphEdge const &outgoing_edge : + get_subgraph_outgoing_edges(spcg, matched_nodes)) { + parallel_tensor_guid_t original_tensor = + get_parallel_tensor(outgoing_edge); + PatternNodeOutput pattern_tensor = + output_orig_pattern_mapping.at_r(original_tensor); + OutputGraphExprNodeOutput output_graph_tensor = + sub.outputs_mapping.at_l(pattern_tensor); + parallel_tensor_guid_t new_tensor = + output_post_outexpr_mapping.at_r(output_graph_tensor); + + SubParallelComputationGraphEdge new_edge = + subpcg_edge_from_tensor_and_dst( + new_tensor, + get_dst_layer(outgoing_edge), + get_dst_layer_input_idx(outgoing_edge)); + outgoing_from_sub_edges.insert(new_edge); + } + + return set_union(std::vector{ + post_edges_from_orig, + post_edges_from_sub, + incoming_to_sub_edges, + outgoing_from_sub_edges, + }); + }(); + + std::unordered_set post_inputs = + pre_data.inputs; + + std::unordered_map + post_value_data = [&] { + std::unordered_map + post_value_data_from_orig = filter_keys( + pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { + return visit_open_parallel_tensor_guid( + t, + overload{ + [&](parallel_tensor_guid_t const &t) { + return contains(post_nodes_from_original_graph, + get_source_layer(t)); + }, + [](input_parallel_tensor_guid_t const &) { + return true; + }, + }); + }); + + std::unordered_map + post_value_data_from_sub = output_graph_data.value_data; + return merge_disjoint_maps(post_value_data_from_orig, + post_value_data_from_sub); + }(); + + SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ + post_node_data, + post_edges, + post_inputs, + post_value_data, + }; + + return sub_pcg_from_graph_data(post_data); +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/substitution_internal/evaluate_substitution_output.cc b/lib/substitutions/src/substitutions/apply_substitution/evaluate_substitution_output.cc similarity index 96% rename from lib/substitutions/src/substitutions/substitution_internal/evaluate_substitution_output.cc rename to lib/substitutions/src/substitutions/apply_substitution/evaluate_substitution_output.cc index 186e2fc03a..a921201c3a 100644 --- a/lib/substitutions/src/substitutions/substitution_internal/evaluate_substitution_output.cc +++ b/lib/substitutions/src/substitutions/apply_substitution/evaluate_substitution_output.cc @@ -1,7 +1,7 @@ -#include "substitutions/substitution_internal/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/perform_shape_inference.h" #include "substitutions/output_graph/output_operator_attrs_assignment.h" #include "substitutions/sub_parallel_computation_graph.h" -#include "substitutions/substitution_internal/perform_shape_inference.h" #include "utils/containers/map_keys.h" #include "utils/containers/map_values.h" #include "utils/graph/labelled_open_dataflow_graph/algorithms/permute_input_ids.h" diff --git a/lib/substitutions/src/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.cc b/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc similarity index 93% rename from lib/substitutions/src/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.cc rename to 
index 22e6a9f333..a5fc9a2e06 100644
--- a/lib/substitutions/src/substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.cc
+++ b/lib/substitutions/src/substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.cc
@@ -1,4 +1,4 @@
-#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h"
+#include "substitutions/apply_substitution/output_expr_to_result_sub_pcg_mapping.h"
 #include "substitutions/output_graph/output_graph_expr.h"
 #include "substitutions/sub_parallel_computation_graph.h"
 #include "utils/bidict/algorithms/bidict_from_keys_and_values.h"
diff --git a/lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc
similarity index 95%
rename from lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc
rename to lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc
index 9fa91d75b7..f49c7e0a3e 100644
--- a/lib/substitutions/src/substitutions/substitution_internal/perform_shape_inference.cc
+++ b/lib/substitutions/src/substitutions/apply_substitution/perform_shape_inference.cc
@@ -1,4 +1,4 @@
-#include "substitutions/substitution_internal/perform_shape_inference.h"
+#include "substitutions/apply_substitution/perform_shape_inference.h"
 #include "op-attrs/get_output_shapes.h"
 #include "utils/containers/map_keys.h"
 #include "utils/containers/transform.h"
diff --git a/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc b/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc
index 53973dc1cb..6f41772a9e 100644
--- a/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc
+++ b/lib/substitutions/src/substitutions/operator_pattern/eval_list_access.cc
@@ -1,5 +1,8 @@
 #include "substitutions/operator_pattern/eval_list_access.h"
 #include "substitutions/operator_pattern/get_attribute.h"
+#include "utils/containers/at_idx.h"
+#include "utils/containers/make.h"
+#include "utils/containers/transform.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
@@ -18,20 +21,12 @@ std::optional<OperatorAttributeValue>
       [&](auto const &v) -> std::optional<OperatorAttributeValue> {
         using T = std::decay_t<decltype(v)>;
-        if constexpr (std::is_same_v<T, std::vector<int>>) {
-          if (acc.index >= v.size()) {
-            return std::nullopt;
-          } else {
-            int value = v.at(acc.index);
-            return OperatorAttributeValue{value};
-          }
+        if constexpr (std::is_same_v<T, std::vector<nonnegative_int>>) {
+          return transform(at_idx(v, acc.index),
+                           make<OperatorAttributeValue>());
         } else if constexpr (std::is_same_v<T, std::vector<ff_dim_t>>) {
-          if (acc.index >= v.size()) {
-            return std::nullopt;
-          } else {
-            ff_dim_t value = v.at(acc.index);
-            return OperatorAttributeValue{value};
-          }
+          return transform(at_idx(v, acc.index),
+                           make<OperatorAttributeValue>());
         } else {
           throw mk_runtime_error("Invalid operand");
         }
diff --git a/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc b/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc
index a3ae9c84d1..fb0fd7f47b 100644
--- a/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc
+++ b/lib/substitutions/src/substitutions/operator_pattern/eval_list_size.cc
@@ -1,5 +1,6 @@
 #include "substitutions/operator_pattern/eval_list_size.h"
 #include "substitutions/operator_pattern/get_attribute.h"
+#include "utils/nonnegative_int/num_elements.h"
 #include "utils/overload.h"
 
 namespace FlexFlow {
@@ -18,9 +19,9 @@ std::optional<OperatorAttributeValue>
       [&](auto const &v) -> std::optional<OperatorAttributeValue> {
        using T = std::decay_t<decltype(v)>;
-        if constexpr (std::is_same_v<T, std::vector<int>> ||
+        if constexpr (std::is_same_v<T, std::vector<nonnegative_int>> ||
                       std::is_same_v<T, std::vector<ff_dim_t>>) {
-          size_t size = v.size();
+          nonnegative_int size = num_elements(v);
           return OperatorAttributeValue{size};
         } else {
           throw mk_runtime_error("Invalid operand");
diff --git a/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc
index 442d3345a1..cb733e16ff 100644
--- a/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc
+++ b/lib/substitutions/src/substitutions/operator_pattern/get_attribute.cc
@@ -8,7 +8,7 @@ std::optional<OperatorAttributeValue> get_attribute(BatchMatmulAttrs const &p,
                                                     OperatorAttributeKey key) {
   switch (key) {
     case OperatorAttributeKey::OP_TYPE:
-      return get_op_type(p);
+      return OperatorAttributeValue{get_op_type(p)};
     default:
       return std::nullopt;
   }
@@ -18,13 +18,13 @@ std::optional<OperatorAttributeValue> get_attribute(BatchNormAttrs const &p,
                                                     OperatorAttributeKey key) {
   switch (key) {
     case OperatorAttributeKey::OP_TYPE:
-      return get_op_type(p);
+      return OperatorAttributeValue{get_op_type(p)};
     case OperatorAttributeKey::EPSILON:
-      return p.eps;
+      return OperatorAttributeValue{p.eps};
     case OperatorAttributeKey::AFFINE:
-      return p.affine;
+      return OperatorAttributeValue{p.affine};
     case OperatorAttributeKey::MOMENTUM:
-      return p.momentum;
+      return OperatorAttributeValue{p.momentum};
     default:
       return std::nullopt;
   }
@@ -34,9 +34,9 @@ std::optional<OperatorAttributeValue> get_attribute(BroadcastAttrs const &p,
                                                     OperatorAttributeKey key) {
   switch (key) {
     case OperatorAttributeKey::OP_TYPE:
-      return get_op_type(p);
+      return OperatorAttributeValue{get_op_type(p)};
     case OperatorAttributeKey::TARGET_DIMS:
-      return p.target_dims;
+      return OperatorAttributeValue{p.target_dims};
     default:
       return std::nullopt;
   }
@@ -46,9 +46,9 @@ std::optional<OperatorAttributeValue> get_attribute(CastAttrs const &p,
                                                     OperatorAttributeKey key) {
   switch (key) {
     case OperatorAttributeKey::OP_TYPE:
-      return get_op_type(p);
+      return OperatorAttributeValue{get_op_type(p)};
     case OperatorAttributeKey::DATA_TYPE:
-      return p.dtype;
+      return OperatorAttributeValue{p.dtype};
     default:
       return std::nullopt;
   }
@@ -58,11 +58,11 @@ std::optional<OperatorAttributeValue> get_attribute(CombineAttrs const &p,
                                                     OperatorAttributeKey key) {
   switch (key) {
     case OperatorAttributeKey::OP_TYPE:
-      return get_op_type(p);
+      return OperatorAttributeValue{get_op_type(p)};
    case OperatorAttributeKey::PARALLEL_OP_DIM:
-      return p.combine_dim;
+      return OperatorAttributeValue{p.combine_dim};
     case OperatorAttributeKey::PARALLEL_DIM:
-      return p.combine_degree;
+      return OperatorAttributeValue{p.combine_degree};
     default:
       return std::nullopt;
   }
@@ -72,9 +72,9 @@ std::optional<OperatorAttributeValue> get_attribute(ConcatAttrs const &p,
                                                     OperatorAttributeKey key) {
   switch (key) {
     case OperatorAttributeKey::OP_TYPE:
-      return get_op_type(p);
+      return OperatorAttributeValue{get_op_type(p)};
     case OperatorAttributeKey::AXIS:
-      return p.axis;
+      return OperatorAttributeValue{p.axis};
     default:
       return std::nullopt;
   }
@@ -84,25 +84,25 @@ std::optional<OperatorAttributeValue> get_attribute(Conv2DAttrs const &p,
                                                     OperatorAttributeKey key) {
   switch (key) {
     case OperatorAttributeKey::OP_TYPE:
-      return get_op_type(p);
+      return OperatorAttributeValue{get_op_type(p)};
     case OperatorAttributeKey::KERNEL_H:
-      return p.kernel_h;
+      return OperatorAttributeValue{p.kernel_h};
     case OperatorAttributeKey::KERNEL_W:
-      return p.kernel_w;
+      return OperatorAttributeValue{p.kernel_w};
     case OperatorAttributeKey::STRIDE_H:
-      return p.stride_h;
+      return OperatorAttributeValue{p.stride_h};
     case OperatorAttributeKey::STRIDE_W:
-      return p.stride_w;
+
return OperatorAttributeValue{p.stride_w}; case OperatorAttributeKey::PADDING_H: - return p.padding_h; + return OperatorAttributeValue{p.padding_h}; case OperatorAttributeKey::PADDING_W: - return p.padding_w; + return OperatorAttributeValue{p.padding_w}; case OperatorAttributeKey::GROUPS: - return p.groups; + return OperatorAttributeValue{p.groups}; case OperatorAttributeKey::ACTIVATION: - return p.activation; + return OperatorAttributeValue{p.activation}; case OperatorAttributeKey::USE_BIAS: - return p.use_bias; + return OperatorAttributeValue{p.use_bias}; default: return std::nullopt; } @@ -112,7 +112,7 @@ std::optional get_attribute(ElementBinaryAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -122,7 +122,7 @@ std::optional get_attribute(ElementUnaryAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -132,7 +132,7 @@ std::optional get_attribute(DropoutAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -142,15 +142,15 @@ std::optional get_attribute(EmbeddingAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::DATA_TYPE: - return p.data_type; + return OperatorAttributeValue{p.data_type}; case OperatorAttributeKey::AGGR: - return p.aggr; + return OperatorAttributeValue{p.aggr}; case OperatorAttributeKey::NUM_ENTRIES: - return p.num_entries; + return OperatorAttributeValue{p.num_entries}; case OperatorAttributeKey::OUT_CHANNELS: - return p.out_channels; + return OperatorAttributeValue{p.out_channels}; default: return std::nullopt; } @@ -160,7 +160,7 @@ std::optional get_attribute(FlatAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -170,9 +170,9 @@ std::optional get_attribute(GatherAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.dim; + return OperatorAttributeValue{p.dim}; default: return std::nullopt; } @@ -182,7 +182,7 @@ std::optional get_attribute(InputAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -192,11 +192,11 @@ std::optional get_attribute(LayerNormAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AFFINE: - return p.elementwise_affine; + return OperatorAttributeValue{p.elementwise_affine}; case OperatorAttributeKey::AXES: - return vector_of(p.axes); + return OperatorAttributeValue{vector_of(p.axes)}; default: return std::nullopt; } @@ -206,17 +206,17 @@ std::optional get_attribute(LinearAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: 
- return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::OUT_CHANNELS: - return p.out_channels; + return OperatorAttributeValue{p.out_channels}; case OperatorAttributeKey::USE_BIAS: - return p.use_bias; + return OperatorAttributeValue{p.use_bias}; case OperatorAttributeKey::DATA_TYPE: - return p.data_type; + return OperatorAttributeValue{p.data_type}; case OperatorAttributeKey::ACTIVATION: - return p.activation; + return OperatorAttributeValue{p.activation}; case OperatorAttributeKey::REGULARIZER: - return p.regularizer; + return OperatorAttributeValue{p.regularizer}; default: return std::nullopt; } @@ -226,13 +226,13 @@ std::optional get_attribute(MultiHeadAttentionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::NUM_HEADS: - return p.num_heads; + return OperatorAttributeValue{p.num_heads}; case OperatorAttributeKey::USE_BIAS: - return p.bias; + return OperatorAttributeValue{p.bias}; case OperatorAttributeKey::DROPOUT: - return p.dropout; + return OperatorAttributeValue{p.dropout}; default: return std::nullopt; } @@ -242,7 +242,7 @@ std::optional get_attribute(NoopAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -252,23 +252,23 @@ std::optional get_attribute(Pool2DAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::KERNEL_H: - return p.kernel_h; + return OperatorAttributeValue{p.kernel_h}; case OperatorAttributeKey::KERNEL_W: - return p.kernel_w; + return OperatorAttributeValue{p.kernel_w}; case OperatorAttributeKey::STRIDE_H: - return p.stride_h; + return OperatorAttributeValue{p.stride_h}; case OperatorAttributeKey::STRIDE_W: - return p.stride_w; + return OperatorAttributeValue{p.stride_w}; case OperatorAttributeKey::PADDING_H: - return p.padding_h; + return OperatorAttributeValue{p.padding_h}; case OperatorAttributeKey::PADDING_W: - return p.padding_w; + return OperatorAttributeValue{p.padding_w}; case OperatorAttributeKey::POOL_TYPE: - return p.pool_type; + return OperatorAttributeValue{p.pool_type}; case OperatorAttributeKey::ACTIVATION: - return std::optional{p.activation}; + return OperatorAttributeValue{p.activation}; default: return std::nullopt; } @@ -278,7 +278,7 @@ std::optional get_attribute(ReduceAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -288,9 +288,9 @@ std::optional get_attribute(ReductionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.reduction_degree; + return OperatorAttributeValue{p.reduction_degree}; default: return std::nullopt; } @@ -300,11 +300,11 @@ std::optional get_attribute(RepartitionAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DIM: - return p.repartition_dim; + return 
OperatorAttributeValue{p.repartition_dim}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.repartition_degree; + return OperatorAttributeValue{p.repartition_degree}; default: return std::nullopt; } @@ -314,9 +314,9 @@ std::optional get_attribute(ReplicateAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PARALLEL_OP_DEGREE: - return p.replicate_degree; + return OperatorAttributeValue{p.replicate_degree}; default: return std::nullopt; } @@ -326,7 +326,7 @@ std::optional get_attribute(ReshapeAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -336,9 +336,9 @@ std::optional get_attribute(ReverseAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -348,9 +348,9 @@ std::optional get_attribute(SplitAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.axis; + return OperatorAttributeValue{p.axis}; default: return std::nullopt; } @@ -360,9 +360,9 @@ std::optional get_attribute(SoftmaxAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::AXIS: - return p.dim; + return OperatorAttributeValue{p.dim}; default: return std::nullopt; } @@ -372,7 +372,7 @@ std::optional get_attribute(TopKAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } @@ -382,9 +382,9 @@ std::optional get_attribute(TransposeAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; case OperatorAttributeKey::PERMUTATION: - return vector_of(p.perm); + return OperatorAttributeValue{vector_of(p.perm)}; default: return std::nullopt; } @@ -394,7 +394,7 @@ std::optional get_attribute(WeightAttrs const &p, OperatorAttributeKey key) { switch (key) { case OperatorAttributeKey::OP_TYPE: - return get_op_type(p); + return OperatorAttributeValue{get_op_type(p)}; default: return std::nullopt; } diff --git a/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc b/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc new file mode 100644 index 0000000000..f1b7440aed --- /dev/null +++ b/lib/substitutions/src/substitutions/operator_pattern/get_attribute_map.cc @@ -0,0 +1,25 @@ +#include "substitutions/operator_pattern/get_attribute_map.h" +#include "substitutions/operator_pattern/get_attribute.h" +#include "substitutions/operator_pattern/operator_attribute_key.dtg.h" +#include "substitutions/operator_pattern/operator_attribute_key.h" +#include "substitutions/operator_pattern/operator_attribute_value.dtg.h" + +namespace FlexFlow { + +std::unordered_map + get_attribute_map(PCGOperatorAttrs const &op_attrs) { + 
std::unordered_map result; + + for (OperatorAttributeKey const &attr_key : all_operator_attribute_keys()) { + std::optional attr_value = + get_attribute(op_attrs, attr_key); + + if (attr_value.has_value()) { + result.insert({attr_key, attr_value.value()}); + } + } + + return result; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc index 5ab528ed3d..29aef07e3a 100644 --- a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc +++ b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_constraint.cc @@ -20,6 +20,16 @@ OperatorAttributeConstraint }; } +OperatorAttributeConstraint + op_attr_key_divisible_by(OperatorAttributeKey key, + nonnegative_int denominator) { + return OperatorAttributeConstraint{ + ConstraintType::DIVISIBLE_BY, + OperatorAttributeExpr{key}, + OperatorAttributeValue{denominator}, + }; +} + OperatorAttributeConstraint make_equals_constraint(OperatorAttributeExpr const &expr, OperatorAttributeValue const &val) { diff --git a/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc new file mode 100644 index 0000000000..232d2c2f12 --- /dev/null +++ b/lib/substitutions/src/substitutions/operator_pattern/operator_attribute_key.cc @@ -0,0 +1,68 @@ +#include "substitutions/operator_pattern/operator_attribute_key.h" + +namespace FlexFlow { + +// This should probably be integrated into proj, +// tracked in https://github.com/flexflow/FlexFlow/issues/1478 +std::vector all_operator_attribute_keys() { + return { + OperatorAttributeKey::OP_TYPE, + OperatorAttributeKey::USE_BIAS, + OperatorAttributeKey::GROUPS, + OperatorAttributeKey::POOL_TYPE, + OperatorAttributeKey::KERNEL_H, + OperatorAttributeKey::KERNEL_W, + OperatorAttributeKey::DATA_TYPE, + OperatorAttributeKey::SCALAR, + OperatorAttributeKey::STRIDE_H, + OperatorAttributeKey::STRIDE_W, + OperatorAttributeKey::PADDING_H, + OperatorAttributeKey::PADDING_W, + OperatorAttributeKey::AGGR, + OperatorAttributeKey::NUM_ENTRIES, + OperatorAttributeKey::OUT_CHANNELS, + OperatorAttributeKey::ACTIVATION, + OperatorAttributeKey::NUMDIM, + OperatorAttributeKey::AXIS, + OperatorAttributeKey::PERMUTATION, + OperatorAttributeKey::OUTSHUFFLE, + OperatorAttributeKey::MERGE_GCONV_COUNT, + OperatorAttributeKey::AXES, + OperatorAttributeKey::KEEP_DIMS, + OperatorAttributeKey::EPSILON, + OperatorAttributeKey::PARALLEL_OP_DIM, + OperatorAttributeKey::PARALLEL_OP_DEGREE, + OperatorAttributeKey::SOFTMAX_DIM, + OperatorAttributeKey::NUM_HEADS, + OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeKey::PAD, + OperatorAttributeKey::EMBED_DIM, + OperatorAttributeKey::KDIM, + OperatorAttributeKey::VDIM, + OperatorAttributeKey::DROPOUT, + OperatorAttributeKey::BIAS, + OperatorAttributeKey::ADD_BIAS_KV, + OperatorAttributeKey::ADD_ZERO_ATTN, + OperatorAttributeKey::A_SEQ_LENGTH_DIM, + OperatorAttributeKey::B_SEQ_LENGTH_DIM, + OperatorAttributeKey::RELU, + OperatorAttributeKey::TARGET_DIMS, + OperatorAttributeKey::RATE, + OperatorAttributeKey::SEED, + OperatorAttributeKey::SHOULD_BROADCAST_LHS, + OperatorAttributeKey::SHOULD_BROADCAST_RHS, + OperatorAttributeKey::DIM, + OperatorAttributeKey::ELEMENTWISE_AFFINE, + OperatorAttributeKey::REGULARIZER, + OperatorAttributeKey::SHAPE, + 
OperatorAttributeKey::SPLITS, + OperatorAttributeKey::K, + OperatorAttributeKey::SORTED, + OperatorAttributeKey::COMBINE_DIM, + OperatorAttributeKey::COMBINE_DEGREE, + OperatorAttributeKey::NUM_INPUTS, + }; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc index 7d65f687c8..4f11b343f8 100644 --- a/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc +++ b/lib/substitutions/src/substitutions/output_graph/materialize_operator_from_attrs_map.cc @@ -33,10 +33,12 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( switch (op_type) { case OperatorType::MULTIHEAD_ATTENTION: return PCGOperatorAttrs{MultiHeadAttentionAttrs{ - /*embed_dim=*/acc.get(OperatorAttributeKey::EMBED_DIM), - /*num_heads=*/acc.get(OperatorAttributeKey::NUM_HEADS), - /*kdim=*/acc.get(OperatorAttributeKey::KDIM), - /*vdim=*/acc.get(OperatorAttributeKey::VDIM), + /*embed_dim=*/acc.get( + OperatorAttributeKey::EMBED_DIM), + /*num_heads=*/ + acc.get(OperatorAttributeKey::NUM_HEADS), + /*kdim=*/acc.get(OperatorAttributeKey::KDIM), + /*vdim=*/acc.get(OperatorAttributeKey::VDIM), /*dropout=*/acc.get(OperatorAttributeKey::DROPOUT), /*bias=*/acc.get(OperatorAttributeKey::BIAS), /*add_bias_kv=*/acc.get(OperatorAttributeKey::ADD_BIAS_KV), @@ -44,12 +46,14 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( }}; case OperatorType::POOL2D: return PCGOperatorAttrs{Pool2DAttrs{ - /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), - /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), - /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), - /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), - /*padding_h=*/acc.get(OperatorAttributeKey::PADDING_H), - /*padding_w=*/acc.get(OperatorAttributeKey::PADDING_W), + /*kernel_h=*/acc.get(OperatorAttributeKey::KERNEL_H), + /*kernel_w=*/acc.get(OperatorAttributeKey::KERNEL_W), + /*stride_h=*/acc.get(OperatorAttributeKey::STRIDE_H), + /*stride_w=*/acc.get(OperatorAttributeKey::STRIDE_W), + /*padding_h=*/ + acc.get(OperatorAttributeKey::PADDING_H), + /*padding_w=*/ + acc.get(OperatorAttributeKey::PADDING_W), /*pool_type=*/acc.get(OperatorAttributeKey::POOL_TYPE), /*activation=*/ acc.get>(OperatorAttributeKey::ACTIVATION) @@ -62,7 +66,8 @@ PCGOperatorAttrs materialize_operator_from_attrs_map( case OperatorType::DROPOUT: case OperatorType::LINEAR: return PCGOperatorAttrs{LinearAttrs{ - /*out_channels=*/acc.get(OperatorAttributeKey::OUT_CHANNELS), + /*out_channels=*/acc.get( + OperatorAttributeKey::OUT_CHANNELS), /*use_bias=*/acc.get(OperatorAttributeKey::USE_BIAS), /*data_type=*/acc.get(OperatorAttributeKey::DATA_TYPE), /*activation=*/ diff --git a/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc b/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc index 3d6aadc795..f6d1410a07 100644 --- a/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc +++ b/lib/substitutions/src/substitutions/output_graph/output_graph_expr.cc @@ -1,9 +1,18 @@ #include "substitutions/output_graph/output_graph_expr.h" #include "utils/containers/transform.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { +std::unordered_set get_nodes(OutputGraphExpr const &g) { + std::unordered_set raw_nodes = 
get_nodes(g.raw_graph); + + return transform(raw_nodes, + [](Node const &n) { return OutputGraphExprNode{n}; }); +} + std::vector get_node_outputs(OutputGraphExpr const &g, OutputGraphExprNode const &n) { std::vector raw_outputs = @@ -14,4 +23,13 @@ std::vector }); } +std::unordered_set get_inputs(OutputGraphExpr const &g) { + std::unordered_set raw_inputs = + get_open_dataflow_graph_inputs(g.raw_graph); + + return transform(raw_inputs, [](DataflowGraphInput const &i) { + return OutputGraphExprInput{i}; + }); +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc b/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc new file mode 100644 index 0000000000..b35f3bbeae --- /dev/null +++ b/lib/substitutions/src/substitutions/output_graph/output_graph_expr_value.cc @@ -0,0 +1,30 @@ +#include "substitutions/output_graph/output_graph_expr_value.h" +#include "utils/overload.h" + +namespace FlexFlow { + +OpenDataflowValue raw_open_dataflow_value_from_output_graph_expr_value( + OutputGraphExprValue const &v) { + return v.visit(overload{ + [](OutputGraphExprNodeOutput const &o) { + return OpenDataflowValue{o.raw_dataflow_output}; + }, + [](OutputGraphExprInput const &i) { + return OpenDataflowValue{i.raw_dataflow_graph_input}; + }, + }); +} + +OutputGraphExprValue output_graph_expr_value_from_raw_open_dataflow_value( + OpenDataflowValue const &v) { + return v.visit(overload{ + [](DataflowOutput const &o) { + return OutputGraphExprValue{OutputGraphExprNodeOutput{o}}; + }, + [](DataflowGraphInput const &i) { + return OutputGraphExprValue{OutputGraphExprInput{i}}; + }, + }); +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc b/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc index fa247cd151..f6b90ef054 100644 --- a/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc +++ b/lib/substitutions/src/substitutions/output_graph/output_operator_attrs_assignment.cc @@ -1,7 +1,9 @@ #include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/operator_pattern/get_attribute_map.h" #include "substitutions/output_graph/materialize_operator_from_attrs_map.h" #include "substitutions/output_graph/output_operator_attribute_expr.h" #include "utils/containers/map_values.h" +#include "utils/containers/merge_maps.h" namespace FlexFlow { @@ -12,14 +14,31 @@ OutputOperatorAttrsAssignment output_operator_clone_node(PatternNode const &) { PCGOperatorAttrs materialize_output_operator_from_attrs_assignment( OutputOperatorAttrsAssignment const &attrs_assignment, std::unordered_map const &node_match) { - std::unordered_map attr_map = - map_values(attrs_assignment.assignments, - [&](OutputOperatorAttributeExpr const &expr) { - return evaluate_output_operator_attribute_expr(expr, - node_match); - }); - - return materialize_operator_from_attrs_map(attr_map); + + std::unordered_map + template_attrs_map = [&]() + -> std::unordered_map { + if (attrs_assignment.template_operator.has_value()) { + PatternNode template_node = attrs_assignment.template_operator.value(); + PCGOperatorAttrs template_op_attrs = node_match.at(template_node); + return get_attribute_map(template_op_attrs); + } else { + return {}; + } + }(); + + std::unordered_map + assignments_attrs_map = map_values( + attrs_assignment.assignments, + [&](OutputOperatorAttributeExpr const &expr) { + return 
evaluate_output_operator_attribute_expr(expr, node_match); + }); + + std::unordered_map + joined_attrs_map = + merge_map_right_dominates(template_attrs_map, assignments_attrs_map); + + return materialize_operator_from_attrs_map(joined_attrs_map); } std::pair @@ -39,4 +58,10 @@ std::pair }; } +std::pair + set_op_type_attr(OperatorType op_type) { + return set_attr_to_constant(OperatorAttributeKey::OP_TYPE, + OperatorAttributeValue{op_type}); +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/pcg_pattern.cc b/lib/substitutions/src/substitutions/pcg_pattern.cc index e53877006d..a0af875848 100644 --- a/lib/substitutions/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/src/substitutions/pcg_pattern.cc @@ -3,13 +3,23 @@ #include "substitutions/pcg_pattern_match.h" #include "substitutions/sub_parallel_computation_graph.h" #include "substitutions/tensor_pattern/satisfies_pattern.h" +#include "substitutions/unlabelled/find_pattern_matches.h" #include "substitutions/unlabelled/pattern_value.h" #include "utils/containers/map_values.h" #include "utils/containers/transform.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { +std::unordered_set get_nodes(PCGPattern const &p) { + std::unordered_set raw_nodes = get_nodes(p.raw_graph); + + return transform(raw_nodes, [](Node const &n) { return PatternNode{n}; }); +} + static MatchAdditionalCriterion pcg_pattern_criteria(PCGPattern const &pattern, SubParallelComputationGraph const &pcg) { @@ -63,6 +73,14 @@ OperatorAttributePattern get_operator_pattern(PCGPattern const &p, return p.raw_graph.at(n.raw_node); } +std::unordered_set get_inputs(PCGPattern const &p) { + std::unordered_set raw_inputs = + get_open_dataflow_graph_inputs(p.raw_graph); + + return transform(raw_inputs, + [](DataflowGraphInput const &i) { return PatternInput{i}; }); +} + std::vector get_pattern_node_outputs(PCGPattern const &pattern, PatternNode const &node) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc index 0c673f0a8a..83df74f21b 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph.cc @@ -188,33 +188,34 @@ bool sub_pcgs_are_isomorphic(SubParallelComputationGraph const &lhs, } std::string as_dot(SubParallelComputationGraph const &spcg) { - std::function get_node_label = - [](ParallelLayerAttrs const &a) -> std::string { - RecordFormatter r = as_dot(a.op_attrs); - - if (a.name.has_value()) { - RecordFormatter rr; - rr << "Name" << a.name.value(); - r << rr; - } - - std::ostringstream oss; - oss << r; - return oss.str(); - }; - - std::function get_input_label = - [](ParallelTensorAttrs const &a) -> std::string { - RecordFormatter r; - - r << fmt::to_string(a.shape); - - std::ostringstream oss; - oss << r; - return oss.str(); - }; - - return as_dot(spcg.raw_graph, get_node_label, get_input_label); + NOT_IMPLEMENTED(); + // std::function get_node_label = + // [](ParallelLayerAttrs const &a) -> std::string { + // RecordFormatter r = as_dot(a.op_attrs); + // + // if (a.name.has_value()) { + // RecordFormatter rr; + // rr << "Name" << a.name.value(); + // r << rr; + // } + // + // std::ostringstream oss; + // oss << r; + // return 
oss.str(); + // }; + // + // std::function get_input_label = + // [](ParallelTensorAttrs const &a) -> std::string { + // RecordFormatter r; + // + // r << fmt::to_string(a.shape); + // + // std::ostringstream oss; + // oss << r; + // return oss.str(); + // }; + // + // return as_dot(spcg.raw_graph, get_node_label, get_input_label); } void debug_print_dot(SubParallelComputationGraph const &spcg) { diff --git a/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc b/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc index bb8cb449bc..0d2b912049 100644 --- a/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc +++ b/lib/substitutions/src/substitutions/sub_parallel_computation_graph_edge.cc @@ -6,7 +6,7 @@ namespace FlexFlow { SubParallelComputationGraphEdge subpcg_edge_from_tensor_and_dst(parallel_tensor_guid_t const &tensor, parallel_layer_guid_t const &layer, - int input_idx) { + nonnegative_int input_idx) { return SubParallelComputationGraphEdge{ OpenDataflowEdge{ DataflowEdge{ diff --git a/lib/substitutions/src/substitutions/substitution.cc b/lib/substitutions/src/substitutions/substitution.cc index 22e15cb01a..874700d303 100644 --- a/lib/substitutions/src/substitutions/substitution.cc +++ b/lib/substitutions/src/substitutions/substitution.cc @@ -1,169 +1,164 @@ #include "substitutions/substitution.h" -#include "pcg/parallel_computation_graph/parallel_computation_graph_edge.h" -#include "pcg/parallel_computation_graph/parallel_tensor_guid_t.h" -#include "substitutions/open_parallel_tensor_guid_t.h" -#include "substitutions/output_graph/output_operator_attrs_assignment.h" -#include "substitutions/pcg_pattern_match.h" -#include "substitutions/sub_parallel_computation_graph.h" -#include "substitutions/sub_parallel_computation_graph_edge.h" -#include "substitutions/substitution_internal/evaluate_substitution_output.h" -#include "substitutions/substitution_internal/output_expr_to_result_sub_pcg_mapping.h" -#include "utils/containers/merge_maps.h" -#include "utils/containers/restrict_keys.h" -#include "utils/containers/set_minus.h" -#include "utils/containers/values.h" -#include "utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.h" -#include "utils/graph/node/algorithms.h" -#include "utils/overload.h" +#include "substitutions/output_graph/output_graph_expr.h" +#include "substitutions/pcg_pattern.h" +#include "utils/bidict/algorithms/left_entries.h" +#include "utils/bidict/algorithms/right_entries.h" +#include "utils/containers/map_values.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/find_isomorphism.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/rewrite_node_labels.h" +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.dtg.h" +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h" namespace FlexFlow { -bool is_valid_substitution(Substitution const &) { - NOT_IMPLEMENTED(); -} +bool is_isomorphic_to(Substitution const &l, Substitution const &r) { + OpenDataflowGraphIsomorphism pcg_pattern_isomorphism = ({ + std::optional maybe_isomorphism = + find_isomorphism(l.pcg_pattern.raw_graph, r.pcg_pattern.raw_graph); -SubParallelComputationGraph - apply_substitution(SubParallelComputationGraph const &spcg, - Substitution const &sub, - PCGPatternMatch const &match) { - auto substitution_output_result = - evaluate_substitution_output(spcg, sub, match); - SubParallelComputationGraph substitution_output_graph = - 
substitution_output_result.first; - OutputExprToResultSubPCGMapping output_expr_to_result_sub_pcg_mapping = - substitution_output_result.second; - - SubParallelComputationGraphData output_graph_data = - get_sub_pcg_data(substitution_output_graph); - SubParallelComputationGraphData pre_data = get_sub_pcg_data(spcg); - - std::unordered_set pre_nodes = - keys(pre_data.node_data); - std::unordered_set matched_nodes = - unordered_set_of(values(match.node_assignment)); - std::unordered_set post_nodes_from_original_graph = - set_minus(pre_nodes, matched_nodes); - - std::unordered_map post_node_data = - [&] { - std::unordered_map - post_node_data_from_orig = restrict_keys( - pre_data.node_data, post_nodes_from_original_graph); - std::unordered_map - post_node_data_from_sub = output_graph_data.node_data; - - return merge_maps(post_node_data_from_orig, post_node_data_from_sub); - }(); - - std::unordered_set post_edges = [&] { - std::unordered_set post_edges_from_orig = - filter(pre_data.edges, [&](SubParallelComputationGraphEdge const &e) { - if (e.raw_edge.has()) { - return true; - } else { - DataflowEdge dfe = e.raw_edge.get(); - parallel_layer_guid_t src = parallel_layer_guid_t{dfe.src.node}; - parallel_layer_guid_t dst = parallel_layer_guid_t{dfe.dst.node}; - return !(contains(matched_nodes, src) || - contains(matched_nodes, dst)); - } - }); - - std::unordered_set post_edges_from_sub = - filter(output_graph_data.edges, - [&](SubParallelComputationGraphEdge const &e) { - return !e.raw_edge.has(); - }); - - bidict - output_orig_pattern_mapping = get_output_mapping_for_pcg_pattern_match( - match, sub.pcg_pattern, spcg); - bidict - output_post_outexpr_mapping = get_output_graph_expr_output_mapping( - output_expr_to_result_sub_pcg_mapping, - sub.output_graph_expr, - substitution_output_graph); - - std::unordered_set incoming_to_sub_edges; - for (auto const &[pattern_input, base_graph_tensor] : - match.input_assignment) { - OutputGraphExprInput output_expr_input = - sub.inputs_mapping.at_l(pattern_input); - input_parallel_tensor_guid_t output_graph_input = - output_expr_to_result_sub_pcg_mapping.input_mapping.at_r( - output_expr_input); - std::unordered_set uses = get_parallel_tensor_uses( - substitution_output_graph, - open_parallel_tensor_guid_from_input(output_graph_input)); - for (parallel_tensor_use_t const &use : uses) { - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_use(base_graph_tensor, use); - incoming_to_sub_edges.insert(new_edge); - } + if (!maybe_isomorphism.has_value()) { + return false; } - std::unordered_set outgoing_from_sub_edges; - for (ParallelComputationGraphEdge const &outgoing_edge : - get_subgraph_outgoing_edges(spcg, matched_nodes)) { - parallel_tensor_guid_t original_tensor = - get_parallel_tensor(outgoing_edge); - PatternNodeOutput pattern_tensor = - output_orig_pattern_mapping.at_r(original_tensor); - OutputGraphExprNodeOutput output_graph_tensor = - sub.outputs_mapping.at_l(pattern_tensor); - parallel_tensor_guid_t new_tensor = - output_post_outexpr_mapping.at_r(output_graph_tensor); - - SubParallelComputationGraphEdge new_edge = - subpcg_edge_from_tensor_and_dst( - new_tensor, - get_dst_layer(outgoing_edge), - get_dst_layer_input_idx(outgoing_edge)); - outgoing_from_sub_edges.insert(new_edge); - } + maybe_isomorphism.value(); + }); + + auto l_from_r_pattern_node = [&](PatternNode const &r_node) { + return PatternNode{ + pcg_pattern_isomorphism.node_mapping.at_r(r_node.raw_node), + }; + }; - return set_union(std::vector{ - post_edges_from_orig, 
- post_edges_from_sub, - incoming_to_sub_edges, - outgoing_from_sub_edges, - }); - }(); - - std::unordered_set post_inputs = - pre_data.inputs; - - std::unordered_map - post_value_data = [&] { - std::unordered_map - post_value_data_from_orig = filter_keys( - pre_data.value_data, [&](open_parallel_tensor_guid_t const &t) { - return visit_open_parallel_tensor_guid( - t, - overload{ - [&](parallel_tensor_guid_t const &t) { - return contains(post_nodes_from_original_graph, - get_source_layer(t)); - }, - [](input_parallel_tensor_guid_t const &) { - return true; - }, - }); + auto l_from_r_output_attrs_assignment = + [&](OutputOperatorAttrsAssignment const &r_attrs) { + std::optional l_template_operator = + transform(r_attrs.template_operator, l_from_r_pattern_node); + std::unordered_map + l_assignments = map_values( + r_attrs.assignments, + [&](OutputOperatorAttributeExpr const &r_expr) { + return r_expr.visit( + overload{[&](AttrConstant const &) { return r_expr; }, + [&](OutputOperatorAttrAccess const &r_acc) { + return OutputOperatorAttributeExpr{ + OutputOperatorAttrAccess{ + l_from_r_pattern_node(r_acc.node), + r_acc.attr_expr, + }, + }; + }}); }); + return OutputOperatorAttrsAssignment{ + l_template_operator, + l_assignments, + }; + }; + + OpenDataflowGraphIsomorphism output_graph_expr_isomorphism = ({ + std::optional maybe_isomorphism = + find_isomorphism( + l.output_graph_expr.raw_graph, + rewrite_node_labels( + r.output_graph_expr.raw_graph, + [&](Node const &, OutputOperatorAttrsAssignment const &a) { + return l_from_r_output_attrs_assignment(a); + })); + if (!maybe_isomorphism.has_value()) { + return false; + } - std::unordered_map - post_value_data_from_sub = output_graph_data.value_data; - return merge_maps(post_value_data_from_orig, post_value_data_from_sub); - }(); + maybe_isomorphism.value(); + }); - SubParallelComputationGraphData post_data = SubParallelComputationGraphData{ - post_node_data, - post_edges, - post_inputs, - post_value_data, + auto l_from_r_pattern_input = [&](PatternInput const &r_input) { + return PatternInput{ + pcg_pattern_isomorphism.input_mapping.at_r( + r_input.raw_dataflow_graph_input), + }; }; - return sub_pcg_from_graph_data(post_data); + auto l_from_r_output_graph_input = [&](OutputGraphExprInput const &r_input) { + return OutputGraphExprInput{ + output_graph_expr_isomorphism.input_mapping.at_r( + r_input.raw_dataflow_graph_input), + }; + }; + + auto l_from_r_pattern_output = [&](PatternNodeOutput const &r_output) { + return PatternNodeOutput{ + isomorphism_map_l_dataflow_output_from_r(pcg_pattern_isomorphism, + r_output.raw_dataflow_output), + }; + }; + + auto l_from_r_output_graph_output = + [&](OutputGraphExprNodeOutput const &r_output) { + return OutputGraphExprNodeOutput{ + isomorphism_map_l_dataflow_output_from_r( + output_graph_expr_isomorphism, r_output.raw_dataflow_output), + }; + }; + + bidict l_input_mapping_from_r = + transform(r.inputs_mapping, + [&](PatternInput const &r_p, OutputGraphExprInput const &r_o) { + return std::pair{ + l_from_r_pattern_input(r_p), + l_from_r_output_graph_input(r_o), + }; + }); + if (l_input_mapping_from_r != l.inputs_mapping) { + return false; + } + + bidict l_output_mapping_from_r = + transform(r.outputs_mapping, + [&](PatternNodeOutput const &r_p, + OutputGraphExprNodeOutput const &r_o) { + return std::pair{ + l_from_r_pattern_output(r_p), + l_from_r_output_graph_output(r_o), + }; + }); + if (l_output_mapping_from_r != l.outputs_mapping) { + return false; + } + + return true; +} + +bool 
is_valid_substitution(Substitution const &sub) {
+  {
+    std::unordered_set<PatternInput> pattern_inputs =
+        get_inputs(sub.pcg_pattern);
+    std::unordered_set<PatternInput> mapped_inputs =
+        left_entries(sub.inputs_mapping);
+
+    if (pattern_inputs != mapped_inputs) {
+      return false;
+    }
+  }
+
+  {
+    std::unordered_set<OutputGraphExprInput> output_graph_inputs =
+        get_inputs(sub.output_graph_expr);
+    std::unordered_set<OutputGraphExprInput> mapped_inputs =
+        right_entries(sub.inputs_mapping);
+
+    if (output_graph_inputs != mapped_inputs) {
+      return false;
+    }
+  }
+
+  if (get_nodes(sub.pcg_pattern).empty()) {
+    return false;
+  }
+
+  if (get_nodes(sub.output_graph_expr).empty()) {
+    return false;
+  }
+
+  return true;
 }
 
 } // namespace FlexFlow
diff --git a/lib/substitutions/src/substitutions/substitution_builder.cc b/lib/substitutions/src/substitutions/substitution_builder.cc
new file mode 100644
index 0000000000..a267b8113f
--- /dev/null
+++ b/lib/substitutions/src/substitutions/substitution_builder.cc
@@ -0,0 +1,162 @@
+#include "substitutions/substitution_builder.h"
+#include "substitutions/output_graph/output_graph_expr_value.h"
+#include "substitutions/substitution.h"
+#include "substitutions/unlabelled/pattern_value.h"
+#include "utils/containers/repeat_element.h"
+#include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h"
+#include "utils/overload.h"
+
+namespace FlexFlow {
+
+SubstitutionBuilder::SubstitutionBuilder()
+    : pattern_g(LabelledOpenDataflowGraph<OperatorAttributePattern,
+                                          TensorAttributePattern>::
+                    create<UnorderedSetLabelledOpenDataflowGraph<
+                        OperatorAttributePattern, TensorAttributePattern>>()),
+      output_g(LabelledOpenDataflowGraph<OutputOperatorAttrsAssignment,
+                                         std::monostate>::
+                   create<UnorderedSetLabelledOpenDataflowGraph<
+                       OutputOperatorAttrsAssignment, std::monostate>>()) {}
+
+std::pair<PatternValue, OutputGraphExprValue> SubstitutionBuilder::add_input(
+    TensorAttributePattern const &input_tensor_pattern,
+    std::optional<std::string> const &name) {
+  PatternInput pattern_input = PatternInput{
+      this->pattern_g.add_input(input_tensor_pattern),
+  };
+
+  OutputGraphExprInput output_graph_expr_input = OutputGraphExprInput{
+      this->output_g.add_input(std::monostate{}),
+  };
+
+  this->input_mapping.equate(pattern_input, output_graph_expr_input);
+
+  if (name.has_value()) {
+    this->pattern_input_names.equate(pattern_input, name.value());
+  }
+
+  return {
+      PatternValue{pattern_input},
+      OutputGraphExprValue{output_graph_expr_input},
+  };
+}
+
+std::vector<PatternValue> SubstitutionBuilder::add_pattern_node(
+    OperatorAttributePattern const &node_pattern,
+    std::vector<PatternValue> const &inputs,
+    std::vector<TensorAttributePattern> const &output_patterns,
+    std::optional<std::string> const &maybe_name) {
+  NodeAddedResult node_added = this->pattern_g.add_node(
+      node_pattern,
+      transform(inputs, raw_open_dataflow_value_from_pattern_value),
+      output_patterns);
+
+  if (maybe_name.has_value()) {
+    std::string name = maybe_name.value();
+
+    if (this->pattern_node_names.contains_r(name)) {
+      throw mk_runtime_error(fmt::format("Attempted to name node {}, but a "
+                                         "node with that name already exists!",
+                                         name));
+    }
+
+    this->pattern_node_names.equate(PatternNode{node_added.node}, name);
+  }
+
+  return transform(node_added.outputs, [](DataflowOutput const &o) {
+    return pattern_value_from_raw_open_dataflow_value(OpenDataflowValue{o});
+  });
+}
+
+std::vector<OutputGraphExprValue> SubstitutionBuilder::add_output_graph_node(
+    OutputOperatorAttrsAssignment const &node_expr,
+    std::vector<OutputGraphExprValue> const &inputs,
+    nonnegative_int num_outputs) {
+  NodeAddedResult node_added = this->output_g.add_node(
+      node_expr,
+      transform(inputs, raw_open_dataflow_value_from_output_graph_expr_value),
+      repeat_element(/*num_times=*/num_outputs, /*element=*/std::monostate{}));
+
+  return transform(node_added.outputs, [](DataflowOutput const &o) {
+    return output_graph_expr_value_from_raw_open_dataflow_value(
+        OpenDataflowValue{o});
+  });
+}
+
+void
SubstitutionBuilder::equate_outputs(
+    PatternValue const &maybe_pattern_output,
+    OutputGraphExprValue const &maybe_output_graph_expr_output) {
+  PatternNodeOutput pattern_output =
+      maybe_pattern_output.visit(overload{
+          [](PatternNodeOutput const &o) { return o; },
+          [&](PatternInput const &) -> PatternNodeOutput {
+            throw mk_runtime_error(fmt::format(
+                "SubstitutionBuilder::equate_outputs expected a PatternValue "
+                "holding a PatternNodeOutput, but received {}",
+                maybe_pattern_output));
+          },
+      });
+
+  OutputGraphExprNodeOutput output_graph_expr_output =
+      maybe_output_graph_expr_output.visit(overload{
+          [](OutputGraphExprNodeOutput const &o) { return o; },
+          [&](OutputGraphExprInput const &) -> OutputGraphExprNodeOutput {
+            throw mk_runtime_error(
+                fmt::format("SubstitutionBuilder::equate_outputs expected an "
+                            "OutputGraphExprValue holding a "
+                            "OutputGraphExprNodeOutput, but received {}",
+                            maybe_output_graph_expr_output));
+          },
+      });
+
+  if (this->output_mapping.contains_l(pattern_output)) {
+    throw mk_runtime_error(
+        fmt::format("SubstitutionBuilder::equate_outputs expected a "
+                    "PatternValue holding a PatternNodeOutput "
+                    "that is not already mapped in output_mapping, "
+                    "but received {}",
+                    pattern_output));
+  }
+  if (this->output_mapping.contains_r(output_graph_expr_output)) {
+    throw mk_runtime_error(fmt::format(
+        "SubstitutionBuilder::equate_outputs expected an "
+        "OutputGraphExprValue holding an OutputGraphExprNodeOutput "
+        "that is not already mapped in output_mapping, "
+        "but received {}",
+        output_graph_expr_output));
+  }
+
+  this->output_mapping.equate(pattern_output, output_graph_expr_output);
+}
+
+PatternNode
+    SubstitutionBuilder::pattern_node_named(std::string const &name) const {
+  return this->pattern_node_names.at_r(name);
+}
+
+PatternInput
+    SubstitutionBuilder::pattern_input_named(std::string const &name) const {
+  return this->pattern_input_names.at_r(name);
+}
+
+Substitution SubstitutionBuilder::get_substitution() const {
+  Substitution result = Substitution{
+      PCGPattern{this->pattern_g},
+      OutputGraphExpr{this->output_g},
+      this->input_mapping,
+      this->output_mapping,
+  };
+
+  if (!is_valid_substitution(result)) {
+    throw mk_runtime_error(
+        "get_substitution cannot return a Substitution, as the Substitution is "
+        "currently invalid. Ensure you have finished constructing the "
Ensure you have finished constructing the " + "Substitution and have mapped all of the outputs."); + } + + return result; +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc index efbcf4a6f1..7bfb1f5e9e 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_access.cc @@ -11,9 +11,8 @@ TensorAttributeValue TensorAttributeValue from_attr = get_attribute(attrs, acc.attribute_key); return from_attr.visit(overload{ - [&](std::vector const &v) -> TensorAttributeValue { - return TensorAttributeValue{ - static_cast(at_idx(v, acc.index).value())}; + [&](std::vector const &v) -> TensorAttributeValue { + return TensorAttributeValue{at_idx(v, acc.index).value()}; }, [](auto &&) -> TensorAttributeValue { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc index d1e97adc37..5acfdf406a 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/eval_list_size.cc @@ -1,5 +1,6 @@ #include "substitutions/tensor_pattern/eval_list_size.h" #include "substitutions/tensor_pattern/get_attribute.h" +#include "utils/nonnegative_int/num_elements.h" #include "utils/overload.h" namespace FlexFlow { @@ -9,8 +10,8 @@ TensorAttributeValue eval_list_size(ParallelTensorAttrs const &attrs, TensorAttributeValue from_attr = get_attribute(attrs, acc.attribute_key); return from_attr.visit(overload{ - [](std::vector const &v) -> TensorAttributeValue { - return TensorAttributeValue{v.size()}; + [](std::vector const &v) -> TensorAttributeValue { + return TensorAttributeValue{num_elements(v)}; }, [](auto &&) -> TensorAttributeValue { throw mk_runtime_error("Invalid operand"); diff --git a/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc b/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc index 286bc69b84..3539b06832 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc +++ b/lib/substitutions/src/substitutions/tensor_pattern/get_attribute.cc @@ -10,15 +10,15 @@ TensorAttributeValue get_attribute(ParallelTensorAttrs const &attrs, TensorAttributeKey key) { switch (key) { case TensorAttributeKey::DIM_SIZES: { - std::vector sizes = - transform(vector_of(ff_ordered_shard_dims(attrs.shape.dims)), - [](ShardParallelDim const &d) { return d.size; }); + std::vector sizes = transform( + vector_of(ff_ordered_shard_dims(attrs.shape.dims)), + [](ShardParallelDim const &d) { return nonnegative_int{d.size}; }); return TensorAttributeValue{sizes}; } case TensorAttributeKey::DIM_DEGREES: { - std::vector degrees = transform( + std::vector degrees = transform( vector_of(ff_ordered_shard_dims(attrs.shape.dims)), - [](ShardParallelDim const &d) { return size_t_from_int(d.degree); }); + [](ShardParallelDim const &d) { return nonnegative_int{d.degree}; }); return TensorAttributeValue{degrees}; } default: diff --git a/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc b/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc index 794ab5abda..e1c1fe7cf6 100644 --- a/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc +++ 
b/lib/substitutions/src/substitutions/tensor_pattern/tensor_attribute_pattern.cc @@ -1,4 +1,5 @@ #include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/integer_conversions.h" namespace FlexFlow { @@ -6,4 +7,19 @@ TensorAttributePattern tensor_attribute_pattern_match_all() { return TensorAttributePattern{{}}; } +TensorAttributePattern + tensor_attr_pattern_require_num_dims(nonnegative_int num_dims) { + return TensorAttributePattern{{ + TensorAttributeConstraint{ + ConstraintType::EQUAL, + TensorAttributeExpr{ + TensorAttributeListSize{ + TensorAttributeKey::DIM_SIZES, + }, + }, + TensorAttributeValue{num_dims}, + }, + }}; +} + } // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/unity_substitution_set.cc b/lib/substitutions/src/substitutions/unity_substitution_set.cc new file mode 100644 index 0000000000..4b00cdd95f --- /dev/null +++ b/lib/substitutions/src/substitutions/unity_substitution_set.cc @@ -0,0 +1,235 @@ +#include "substitutions/unity_substitution_set.h" +#include "pcg/machine_specification.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/substitution_builder.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/nonnegative_int/nonnegative_range.h" + +namespace FlexFlow { + +std::vector + get_substitution_set(MachineSpecification const &resources) { + std::vector substitutions; + for (nonnegative_int num_dims : + nonnegative_range(1_n, nonnegative_int{MAX_TENSOR_DIM})) { + for (nonnegative_int degree = 1_n; degree <= get_num_gpus(resources); + degree *= 2_n) { + substitutions.push_back( + create_replicate_linear_combine(num_dims, degree, true)); + substitutions.push_back( + create_replicate_linear_combine(num_dims, degree, false)); + } + } + substitutions.push_back(create_fuse_linear_activation(Activation::RELU)); + substitutions.push_back(create_fuse_linear_activation(Activation::SIGMOID)); + substitutions.push_back(create_fuse_linear_activation(Activation::TANH)); + substitutions.push_back(create_fuse_linear_activation(Activation::GELU)); + return substitutions; +} + +Substitution create_combine_inception(nonnegative_int num_convs, + nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_combine_concat(nonnegative_int num_inputs, + nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_replicate_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + bool use_bias) { + SubstitutionBuilder b; + + auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = b.add_input(tensor_attribute_pattern_match_all()); + std::vector p_inputs = {p_input, p_weight}; + + std::optional o_bias = std::nullopt; + if (use_bias) { + std::pair bias = + b.add_input(tensor_attribute_pattern_match_all()); + p_inputs.push_back(bias.first); + o_bias = bias.second; + } + + OperatorAttributePattern linear_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals(OperatorAttributeKey::BIAS, + OperatorAttributeValue{use_bias}), + op_attr_key_divisible_by(OperatorAttributeKey::OUT_CHANNELS, + nonnegative_int{degree}), + }}; + + PatternValue p_linear_output = get_only(b.add_pattern_node( + linear_pattern, + p_inputs, 
+ {tensor_attr_pattern_require_num_dims(nonnegative_int{num_dims})}, + "linear")); + + OutputOperatorAttrsAssignment replicate_input_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPLICATE), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + }}; + OutputGraphExprValue o_replicate_input_output = + get_only(b.add_output_graph_node(replicate_input_expr, {o_input}, 1_n)); + + OutputOperatorAttrsAssignment partition_weights_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPARTITION), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{1_n}}), + }}; + OutputGraphExprValue o_partition_weights_output = get_only( + b.add_output_graph_node(partition_weights_expr, {o_weight}, 1_n)); + + std::vector o_linear_inputs = { + o_replicate_input_output, o_partition_weights_output}; + + if (use_bias) { + OutputOperatorAttrsAssignment partition_bias_expr = + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::REPARTITION), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{1_n}}), + }}; + OutputGraphExprValue o_partition_bias_output = get_only( + b.add_output_graph_node(partition_bias_expr, {o_bias.value()}, 1_n)); + o_linear_inputs.push_back(o_partition_bias_output); + } + + OutputOperatorAttrsAssignment linear_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("linear"), + {}, + }; + OutputGraphExprValue o_linear_output = + get_only(b.add_output_graph_node(linear_expr, o_linear_inputs, 1_n)); + + OutputOperatorAttrsAssignment combine_expr = OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_op_type_attr(OperatorType::COMBINE), + set_attr_to_constant(OperatorAttributeKey::PARALLEL_DEGREE, + OperatorAttributeValue{degree}), + set_attr_to_constant( + OperatorAttributeKey::PARALLEL_DIM, + OperatorAttributeValue{ff_dim_t{ + nonnegative_int{num_dims.unwrap_nonnegative() - 1}, + }}), + }, + }; + OutputGraphExprValue o_combine_output = + get_only(b.add_output_graph_node(combine_expr, {o_linear_output}, 1_n)); + + b.equate_outputs(p_linear_output, o_combine_output); + + return b.get_substitution(); +} + +Substitution create_partition_linear_combine(nonnegative_int num_dims, + nonnegative_int degree, + Activation activation, + bool use_bias) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_conv2d_combine(nonnegative_int num_dims, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_attention_combine(nonnegative_int num_heads, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_replicate_attention_reduce(nonnegative_int num_heads, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_add_combine(ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_relu_combine(ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_concat_combine(nonnegative_int num_inputs, + ff_dim_t concat_dim, + ff_dim_t parallel_dim, + nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_partition_softmax_combine(ff_dim_t softmax_dim, + ff_dim_t partition_dim, + 
nonnegative_int degree) { + NOT_IMPLEMENTED(); +} + +Substitution create_fuse_linear_activation(Activation activation) { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all(), "input"); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all(), "weight"); + + OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + PatternValue p_mm_output = + get_only(b.add_pattern_node(mm_pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + + OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + PatternValue p_relu_output = + get_only(b.add_pattern_node(relu_pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + + OutputOperatorAttrsAssignment fused_node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{activation}), + }}; + OutputGraphExprValue o_fused_node_output = get_only( + b.add_output_graph_node(fused_node_expr, {o_input, o_weight}, 1_n)); + + b.equate_outputs(p_relu_output, o_fused_node_output); + + return b.get_substitution(); +} + +} // namespace FlexFlow diff --git a/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc b/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc index e8deacebec..dff600ecf0 100644 --- a/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc +++ b/lib/substitutions/src/substitutions/unlabelled/input_pattern_edge.cc @@ -11,7 +11,7 @@ PatternNode get_dst_node(InputPatternEdge const &e) { return PatternNode{e.raw_edge.dst.node}; } -int get_dst_idx(InputPatternEdge const &e) { +nonnegative_int get_dst_idx(InputPatternEdge const &e) { return e.raw_edge.dst.idx; } diff --git a/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc b/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc index 9abdc4e83c..24bbb6f4d1 100644 --- a/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc +++ b/lib/substitutions/src/substitutions/unlabelled/pattern_node_output.cc @@ -6,7 +6,7 @@ PatternNode get_src_node(PatternNodeOutput const &o) { return PatternNode{o.raw_dataflow_output.node}; } -int get_idx(PatternNodeOutput const &o) { +nonnegative_int get_idx(PatternNodeOutput const &o) { return o.raw_dataflow_output.idx; } diff --git a/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc b/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc index dea3e5f500..17d05f1122 100644 --- a/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc +++ b/lib/substitutions/src/substitutions/unlabelled/standard_pattern_edge.cc @@ -10,11 +10,11 @@ PatternNode get_dst_node(StandardPatternEdge const &e) { return PatternNode{e.raw_edge.dst.node}; } -int get_src_idx(StandardPatternEdge const &e) { +nonnegative_int get_src_idx(StandardPatternEdge const &e) { return e.raw_edge.src.idx; } -int get_dst_idx(StandardPatternEdge const &e) { +nonnegative_int get_dst_idx(StandardPatternEdge const &e) { return e.raw_edge.dst.idx; } diff --git a/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc 
b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc new file mode 100644 index 0000000000..5fd923f71f --- /dev/null +++ b/lib/substitutions/test/src/substitutions/apply_substitution/apply_substitution.cc @@ -0,0 +1,174 @@ +#include "substitutions/apply_substitution/apply_substitution.h" +#include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution_builder.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include "utils/integer_conversions.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("apply_substitution") { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all(), "input"); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all(), "weight"); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + }(); + + PatternValue p_relu_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + }(); + + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only( + b.add_output_graph_node(node_expr, {o_input, o_weight}, 1_n)); + }(); + + b.equate_outputs(p_relu_output, o_fused_output); + + Substitution sub = b.get_substitution(); + + nonnegative_int in_channels = 24_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; + std::string mm_match = "mm_match"; + std::string relu_match = "relu_match"; + + SubParallelComputationGraph pcg = [&] { + ParallelComputationGraphBuilder b; + parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, + }, + ReplicaParallelDimSet{ + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + }, + }, + DataType::FLOAT, + }); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/std::nullopt, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/mm_match); + t = b.relu(t, + /*name=*/relu_match); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + + return sub_pcg_from_full_pcg(b.pcg); + }(); + + PCGPatternMatch match = [&] { + parallel_layer_guid_t mm_match_layer = + get_parallel_layer_by_name(pcg, mm_match); + parallel_layer_guid_t relu_match_layer = + get_parallel_layer_by_name(pcg, relu_match); + open_parallel_tensor_guid_t mm_match_layer_input_activations = + get_layer_inputs(pcg, 
mm_match_layer).at(0); + open_parallel_tensor_guid_t mm_match_layer_input_weights = + get_layer_inputs(pcg, mm_match_layer).at(1); + + return PCGPatternMatch{ + bidict{ + {b.pattern_node_named("mm"), mm_match_layer}, + {b.pattern_node_named("relu"), relu_match_layer}, + }, + std::unordered_map{ + { + b.pattern_input_named("input"), + mm_match_layer_input_activations, + }, + { + b.pattern_input_named("weight"), + mm_match_layer_input_weights, + }}, + }; + }(); + + SubParallelComputationGraph result = apply_substitution(pcg, sub, match); + + SubParallelComputationGraph correct = [&] { + ParallelComputationGraphBuilder b; + parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ + ParallelTensorDims{ + FFOrdered{ + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, + }, + ReplicaParallelDimSet{ + SumDegree{1_n}, + DiscardCopyDegree{1_n}, + }, + }, + DataType::FLOAT, + }); + t = b.dense(t, + /*outDim=*/16_n, + /*activation=*/std::nullopt); + t = b.gelu(t); + t = b.dense(t, + /*outDim=*/12_n, + /*activation=*/Activation::RELU, + /*use_bias=*/false, + /*data_type=*/DataType::FLOAT, + /*kernel_initializer=*/std::nullopt, + /*bias_initializer=*/std::nullopt, + /*name=*/std::nullopt); + t = b.dense(t, + /*outDim=*/8_n, + /*activation=*/Activation::RELU); + + return sub_pcg_from_full_pcg(b.pcg); + }(); + + // since the new nodes produced by the substitution have new ids, it's + // easier/more correct to check that the graphs are isomorphic rather than + // checking their exact graph data + CHECK(sub_pcgs_are_isomorphic(result, correct)); + } +} diff --git a/lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc similarity index 86% rename from lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc rename to lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc index 52b54b32fb..7bdcc5a3bd 100644 --- a/lib/substitutions/test/src/substitutions/substitution_internal/evaluate_substitution_output.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/evaluate_substitution_output.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/evaluate_substitution_output.h" +#include "substitutions/apply_substitution/evaluate_substitution_output.h" #include "pcg/parallel_computation_graph/parallel_computation_graph_builder.h" #include "substitutions/open_parallel_tensor_guid_t.h" #include "substitutions/operator_pattern/operator_attribute_constraint.h" @@ -64,20 +64,23 @@ TEST_SUITE(FF_TEST_SUITE) { OutputGraphExprInput{output_g.add_input({})}; OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = - OutputOperatorAttrsAssignment{{ - set_attr_to_constant(OperatorAttributeKey::OP_TYPE, - OperatorAttributeValue{OperatorType::LINEAR}), - copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, - pattern_mm_node), - set_attr_to_constant(OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{Activation::RELU}), - copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, - pattern_mm_node), - }}; + OutputOperatorAttrsAssignment{ + std::nullopt, + { + set_attr_to_constant( + OperatorAttributeKey::OP_TYPE, + OperatorAttributeValue{OperatorType::LINEAR}), + 
copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, + pattern_mm_node), + copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, + pattern_mm_node), + copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, + pattern_mm_node), + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, + pattern_mm_node), + }}; NodeAddedResult fused_mm_relu_added = output_g.add_node( fused_mm_relu_attrs_assignment, {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, @@ -108,9 +111,9 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - int in_channels = 24; - int batch_size = 4; - int batch_degree = 2; + nonnegative_int in_channels = 24_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; std::string mm_match = "mm_match"; std::string relu_match = "relu_match"; @@ -119,22 +122,22 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, }); t = b.dense(t, - /*outDim=*/16, + /*outDim=*/16_n, /*activation=*/std::nullopt); t = b.gelu(t); t = b.dense(t, - /*outDim=*/12, + /*outDim=*/12_n, /*activation=*/std::nullopt, /*use_bias=*/false, /*data_type=*/DataType::FLOAT, @@ -144,7 +147,7 @@ TEST_SUITE(FF_TEST_SUITE) { t = b.relu(t, /*name=*/relu_match); t = b.dense(t, - /*outDim=*/8, + /*outDim=*/8_n, /*activation=*/Activation::RELU); return sub_pcg_from_full_pcg(b.pcg); @@ -186,10 +189,10 @@ TEST_SUITE(FF_TEST_SUITE) { result_input_map = result.second.input_mapping; LinearAttrs correct_result_fused_mm_relu_attrs = LinearAttrs{ - 12, + /*out_channels=*/12_n, /*use_bias=*/false, - DataType::FLOAT, - Activation::RELU, + /*data_type=*/DataType::FLOAT, + /*activation=*/Activation::RELU, /*regularizer=*/std::nullopt, }; @@ -228,7 +231,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_i_activation.raw_dataflow_graph_input, DataflowInput{ result_fused_mm_relu_node.raw_graph_node, - 0, + 0_n, }, }, }, @@ -239,7 +242,7 @@ TEST_SUITE(FF_TEST_SUITE) { result_i_weights.raw_dataflow_graph_input, DataflowInput{ result_fused_mm_relu_node.raw_graph_node, - 1, + 1_n, }, }, }, diff --git a/lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc similarity index 78% rename from lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc rename to lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc index 4d4e557fb8..950e833771 100644 --- a/lib/substitutions/test/src/substitutions/substitution_internal/perform_shape_inference.cc +++ b/lib/substitutions/test/src/substitutions/apply_substitution/perform_shape_inference.cc @@ -1,4 +1,4 @@ -#include "substitutions/substitution_internal/perform_shape_inference.h" +#include "substitutions/apply_substitution/perform_shape_inference.h" #include "op-attrs/ops/element_unary.h" #include "op-attrs/ops/linear.h" #include "op-attrs/parallel_tensor_shape.h" @@ -18,21 +18,21 @@ TEST_SUITE(FF_TEST_SUITE) { UnorderedSetLabelledOpenDataflowGraph>(); - int in_channels = 24; - int 
out_channels = 16; - int batch_size = 4; - int batch_degree = 2; + nonnegative_int in_channels = 24_n; + nonnegative_int out_channels = 16_n; + nonnegative_int batch_size = 4_n; + nonnegative_int batch_degree = 2_n; DataflowGraphInput i0 = g.add_input({}); ParallelTensorShape i0_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, + ShardParallelDim{batch_size, batch_degree}, + ShardParallelDim{in_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -40,28 +40,28 @@ TEST_SUITE(FF_TEST_SUITE) { bool use_bias = false; LinearAttrs n1_op_attrs = LinearAttrs{ - out_channels, - use_bias, - DataType::FLOAT, - std::nullopt, - std::nullopt, + /*out_channels=*/out_channels, + /*use_bias=*/use_bias, + /*data_type=*/DataType::FLOAT, + /*activation=*/std::nullopt, + /*regularizer=*/std::nullopt, }; ParallelLayerAttrs n1_attrs = ParallelLayerAttrs{ - PCGOperatorAttrs{ + /*op_attrs=*/PCGOperatorAttrs{ n1_op_attrs, }, - std::nullopt, + /*name=*/std::nullopt, }; ElementUnaryAttrs n2_op_attrs = ElementUnaryAttrs{ - OperatorType::RELU, - std::nullopt, + /*op_type=*/OperatorType::RELU, + /*scalar=*/std::nullopt, }; ParallelLayerAttrs n2_attrs = ParallelLayerAttrs{ - PCGOperatorAttrs{ + /*op_attrs=*/PCGOperatorAttrs{ n2_op_attrs, }, - std::nullopt, + /*name=*/std::nullopt, }; ParallelTensorShape n1_output_shape = @@ -131,22 +131,22 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowEdge{ DataflowInputEdge{ i0, - DataflowInput{n1, 0}, + DataflowInput{n1, 0_n}, }, }, OpenDataflowEdge{DataflowEdge{ - DataflowOutput{n1_weight_node, 0}, - DataflowInput{n1_weight_replicate_node, 0}, + DataflowOutput{n1_weight_node, 0_n}, + DataflowInput{n1_weight_replicate_node, 0_n}, }}, OpenDataflowEdge{ DataflowEdge{ - DataflowOutput{n1_weight_replicate_node, 0}, - DataflowInput{n1, 1}, + DataflowOutput{n1_weight_replicate_node, 0_n}, + DataflowInput{n1, 1_n}, }, }, OpenDataflowEdge{DataflowEdge{ - DataflowOutput{n1, 0}, - DataflowInput{n2, 0}, + DataflowOutput{n1, 0_n}, + DataflowInput{n2, 0_n}, }}, }, {i0}, @@ -155,19 +155,20 @@ TEST_SUITE(FF_TEST_SUITE) { i0_shape, }, { - OpenDataflowValue{DataflowOutput{n1_weight_node, 0}}, + OpenDataflowValue{DataflowOutput{n1_weight_node, 0_n}}, lift_to_parallel(get_reduced_shape(n1_weight_shape)), }, { - OpenDataflowValue{DataflowOutput{n1_weight_replicate_node, 0}}, + OpenDataflowValue{ + DataflowOutput{n1_weight_replicate_node, 0_n}}, n1_weight_shape, }, { - OpenDataflowValue{DataflowOutput{n1, 0}}, + OpenDataflowValue{DataflowOutput{n1, 0_n}}, n1_output_shape, }, { - OpenDataflowValue{DataflowOutput{n2, 0}}, + OpenDataflowValue{DataflowOutput{n2, 0_n}}, n2_output_shape, }}}; diff --git a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc index 95b61e0ef4..24f9e9bd56 100644 --- a/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc +++ b/lib/substitutions/test/src/substitutions/operator_pattern/get_attribute.cc @@ -6,7 +6,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_attribute(LinearAttrs, OperatorAttributeKey)") { - int out_channels = 16; + nonnegative_int out_channels = 16_n; bool use_bias = true; std::optional activation = Activation::GELU; std::optional regularizer = RegularizerAttrs{ diff --git 
a/lib/substitutions/test/src/substitutions/pcg_pattern.cc b/lib/substitutions/test/src/substitutions/pcg_pattern.cc index d9273b4bcf..9ff368a8eb 100644 --- a/lib/substitutions/test/src/substitutions/pcg_pattern.cc +++ b/lib/substitutions/test/src/substitutions/pcg_pattern.cc @@ -15,19 +15,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_pattern_matches(PCGPattern, SubParallelComputationGraph)") { ParallelComputationGraphBuilder builder; - size_t batch_size = 16; - int batch_degree = 2; - size_t num_channels = 24; + nonnegative_int batch_size = 16_n; + nonnegative_int batch_degree = 2_n; + nonnegative_int num_channels = 24_n; ParallelTensorShape a_shape = ParallelTensorShape{ ParallelTensorDims{ FFOrdered{ ShardParallelDim{batch_size, batch_degree}, - ShardParallelDim{num_channels, 1}, + ShardParallelDim{num_channels, 1_n}, }, ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, + SumDegree{1_n}, + DiscardCopyDegree{1_n}, }, }, DataType::FLOAT, @@ -37,7 +37,7 @@ TEST_SUITE(FF_TEST_SUITE) { parallel_tensor_guid_t a_tensor = builder.create_input_tensor(a_shape, CreateGrad::YES, a_name); - int outDim = 16; + nonnegative_int outDim = 16_n; std::string x_matmul_name = "x_matmul"; std::string y_matmul_name = "y_matmul"; parallel_tensor_guid_t t0 = diff --git a/lib/substitutions/test/src/substitutions/substitution.cc b/lib/substitutions/test/src/substitutions/substitution.cc index 1718b03b5c..ef27cb7606 100644 --- a/lib/substitutions/test/src/substitutions/substitution.cc +++ b/lib/substitutions/test/src/substitutions/substitution.cc @@ -4,226 +4,173 @@ #include "substitutions/operator_pattern/operator_attribute_constraint.h" #include "substitutions/output_graph/output_graph_expr_node.dtg.h" #include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/pcg_pattern.h" #include "substitutions/pcg_pattern_builder.h" #include "substitutions/sub_parallel_computation_graph.h" +#include "substitutions/substitution_builder.h" #include "substitutions/tensor_pattern/tensor_attribute_pattern.h" #include "utils/containers/get_only.h" #include "utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" #include "utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h" +#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h" #include "utils/integer_conversions.h" #include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - // TEST_CASE("is_valid_substitution") { - // FAIL("TODO"); - // } - - TEST_CASE("evaluate_substitution_output(SubParallelComputationGraph, " - "Substitution, PCGPatternMatch)") { - // Currently Substitution creation is very verbose. - // This is being addressed in - // https://github.com/flexflow/FlexFlow/issues/1473. 
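// (Aside: the verbosity noted in the comment above is what this patch's new
// SubstitutionBuilder addresses. A minimal sketch of the builder-based flow,
// using only the API introduced in substitution_builder.cc; `some_op_pattern`
// and `some_attrs_assignment` are placeholder values for illustration, not
// names from the patch:
//
//   SubstitutionBuilder b;
//
//   auto [p_in, o_in] = b.add_input(tensor_attribute_pattern_match_all());
//
//   PatternValue p_out = get_only(b.add_pattern_node(
//       /*node_pattern=*/some_op_pattern,
//       /*inputs=*/{p_in},
//       /*output_patterns=*/{tensor_attribute_pattern_match_all()},
//       /*maybe_name=*/"n"));
//
//   OutputGraphExprValue o_out = get_only(b.add_output_graph_node(
//       /*node_expr=*/some_attrs_assignment,
//       /*inputs=*/{o_in},
//       /*num_outputs=*/1_n));
//
//   b.equate_outputs(p_out, o_out);
//   Substitution sub = b.get_substitution();
// )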
- auto pattern_g = LabelledOpenDataflowGraph:: - create>(); - - PatternInput pattern_i_activation = - PatternInput{pattern_g.add_input(tensor_attribute_pattern_match_all())}; - PatternInput pattern_i_weights = - PatternInput{pattern_g.add_input(tensor_attribute_pattern_match_all())}; - - OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::LINEAR), - op_attr_key_equals( - OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{std::optional{std::nullopt}}), - }}; - NodeAddedResult mm_added = pattern_g.add_node( - mm_pattern, - {OpenDataflowValue{pattern_i_activation.raw_dataflow_graph_input}, - OpenDataflowValue{pattern_i_weights.raw_dataflow_graph_input}}, - {tensor_attribute_pattern_match_all()}); - PatternNode pattern_mm_node = PatternNode{mm_added.node}; - DataflowOutput mm_output = get_only(mm_added.outputs); - - OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ - op_type_equals_constraint(OperatorType::RELU), - }}; - NodeAddedResult relu_added = - pattern_g.add_node(relu_pattern, - {OpenDataflowValue{mm_output}}, - {tensor_attribute_pattern_match_all()}); - PatternNode pattern_relu_node = PatternNode{relu_added.node}; - DataflowOutput relu_output = get_only(relu_added.outputs); - - LabelledOpenDataflowGraph - output_g = LabelledOpenDataflowGraph:: - create>(); - - OutputGraphExprInput output_i_activation = - OutputGraphExprInput{output_g.add_input({})}; - OutputGraphExprInput output_i_weights = - OutputGraphExprInput{output_g.add_input({})}; - - OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = - OutputOperatorAttrsAssignment{{ - set_attr_to_constant(OperatorAttributeKey::OP_TYPE, - OperatorAttributeValue{OperatorType::LINEAR}), - copy_attr_from_pattern_node(OperatorAttributeKey::OUT_CHANNELS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::USE_BIAS, - pattern_mm_node), - copy_attr_from_pattern_node(OperatorAttributeKey::DATA_TYPE, - pattern_mm_node), - set_attr_to_constant(OperatorAttributeKey::ACTIVATION, - OperatorAttributeValue{Activation::RELU}), - copy_attr_from_pattern_node(OperatorAttributeKey::REGULARIZER, - pattern_mm_node), + TEST_CASE("is_isomorphic_to(Substitution, Substitution)") { + auto make_substitution = [] { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{ + std::optional{std::nullopt}}), }}; - NodeAddedResult fused_mm_relu_added = output_g.add_node( - fused_mm_relu_attrs_assignment, - {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, - OpenDataflowValue{output_i_weights.raw_dataflow_graph_input}}, - {{}}); - OutputGraphExprNode fused_mm_relu_node = - OutputGraphExprNode{fused_mm_relu_added.node}; - DataflowOutput fused_mm_relu_output = get_only(fused_mm_relu_added.outputs); - - Substitution sub = Substitution{ - PCGPattern{pattern_g}, - OutputGraphExpr{output_g}, - bidict{ - { - pattern_i_activation, - output_i_activation, - }, - { - pattern_i_weights, - output_i_weights, - }, - }, - bidict{ + + return get_only( + b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + }(); + + PatternValue p_relu_output = [&] { + auto pattern = 
OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only( + b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + }(); + + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), { - PatternNodeOutput{relu_output}, - OutputGraphExprNodeOutput{fused_mm_relu_output}, - }, - }, + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only(b.add_output_graph_node( + node_expr, {o_input, o_weight}, nonnegative_int{1})); + }(); + + b.equate_outputs(p_relu_output, o_fused_output); + + return b.get_substitution(); }; - int in_channels = 24; - int batch_size = 4; - int batch_degree = 2; - std::string mm_match = "mm_match"; - std::string relu_match = "relu_match"; - - SubParallelComputationGraph pcg = [&] { - ParallelComputationGraphBuilder b; - parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ - ParallelTensorDims{ - FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, - }, - ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, - }, - }, - DataType::FLOAT, - }); - t = b.dense(t, - /*outDim=*/16, - /*activation=*/std::nullopt); - t = b.gelu(t); - t = b.dense(t, - /*outDim=*/12, - /*activation=*/std::nullopt, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - /*name=*/mm_match); - t = b.relu(t, - /*name=*/relu_match); - t = b.dense(t, - /*outDim=*/8, - /*activation=*/Activation::RELU); - - return sub_pcg_from_full_pcg(b.pcg); + Substitution sub1 = make_substitution(); + Substitution sub2 = make_substitution(); + + CHECK(is_isomorphic_to(sub1, sub1)); + CHECK(is_isomorphic_to(sub1, sub2)); + } + + TEST_CASE("is_valid_substitution") { + SubstitutionBuilder b; + + auto [p_input, o_input] = b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); }(); - PCGPatternMatch match = [&] { - parallel_layer_guid_t mm_match_layer = - get_parallel_layer_by_name(pcg, mm_match); - parallel_layer_guid_t relu_match_layer = - get_parallel_layer_by_name(pcg, relu_match); - open_parallel_tensor_guid_t mm_match_layer_input_activations = - get_layer_inputs(pcg, mm_match_layer).at(0); - open_parallel_tensor_guid_t mm_match_layer_input_weights = - get_layer_inputs(pcg, mm_match_layer).at(1); - - return PCGPatternMatch{ - bidict{ - {pattern_mm_node, mm_match_layer}, - {pattern_relu_node, relu_match_layer}, - }, - std::unordered_map{ - { - PatternInput{pattern_i_activation}, - mm_match_layer_input_activations, - }, - { - PatternInput{pattern_i_weights}, - mm_match_layer_input_weights, - }}, - }; + PatternValue p_relu_output = [&] { + auto pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + return get_only(b.add_pattern_node(pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); }(); - SubParallelComputationGraph 
result = apply_substitution(pcg, sub, match); - - SubParallelComputationGraph correct = [&] { - ParallelComputationGraphBuilder b; - parallel_tensor_guid_t t = b.create_input_tensor(ParallelTensorShape{ - ParallelTensorDims{ - FFOrdered{ - ShardParallelDim{size_t_from_int(batch_size), batch_degree}, - ShardParallelDim{size_t_from_int(in_channels), 1}, - }, - ReplicaParallelDimSet{ - SumDegree{1}, - DiscardCopyDegree{1}, - }, - }, - DataType::FLOAT, - }); - t = b.dense(t, - /*outDim=*/16, - /*activation=*/std::nullopt); - t = b.gelu(t); - t = b.dense(t, - /*outDim=*/12, - /*activation=*/Activation::RELU, - /*use_bias=*/false, - /*data_type=*/DataType::FLOAT, - /*kernel_initializer=*/std::nullopt, - /*bias_initializer=*/std::nullopt, - /*name=*/std::nullopt); - t = b.dense(t, - /*outDim=*/8, - /*activation=*/Activation::RELU); - - return sub_pcg_from_full_pcg(b.pcg); + OutputGraphExprValue o_fused_output = [&] { + auto node_expr = OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }}; + + return get_only(b.add_output_graph_node( + node_expr, {o_input, o_weight}, nonnegative_int{1})); }(); - // since the new nodes produced by the substitution have new ids, it's - // easier/more correct to check that the graphs are isomorphic rather than - // checking their exact graph data - CHECK(sub_pcgs_are_isomorphic(result, correct)); + b.equate_outputs(p_relu_output, o_fused_output); + + SUBCASE("pattern inputs != mapped inputs") { + Substitution sub = b.get_substitution(); + sub.pcg_pattern.raw_graph.add_input(tensor_attribute_pattern_match_all()); + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("output graph inputs != mapped inputs") { + Substitution sub = b.get_substitution(); + sub.output_graph_expr.raw_graph.add_input(std::monostate{}); + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("pattern has no nodes") { + // Could revamp this test to only trigger the + // get_nodes(sub.pcg_pattern).empty() case + Substitution sub = b.get_substitution(); + LabelledOpenDataflowGraph + zero_node_pattern = + LabelledOpenDataflowGraph:: + create>(); + sub.pcg_pattern = PCGPattern{zero_node_pattern}; + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("output graph has no nodes") { + // Could revamp this test to only trigger the + // get_nodes(sub.output_graph_expr).empty() case + Substitution sub = b.get_substitution(); + LabelledOpenDataflowGraph + zero_node_pattern = + LabelledOpenDataflowGraph:: + create>(); + sub.output_graph_expr = OutputGraphExpr{zero_node_pattern}; + CHECK_FALSE(is_valid_substitution(sub)); + } + + SUBCASE("valid substitution") { + Substitution sub = b.get_substitution(); + CHECK(is_valid_substitution(sub)); + } } } diff --git a/lib/substitutions/test/src/substitutions/substitution_builder.cc b/lib/substitutions/test/src/substitutions/substitution_builder.cc new file mode 100644 index 0000000000..028a4e59c9 --- /dev/null +++ b/lib/substitutions/test/src/substitutions/substitution_builder.cc @@ -0,0 +1,145 @@ +#include "substitutions/substitution_builder.h" +#include "substitutions/operator_pattern/operator_attribute_constraint.h" +#include "substitutions/output_graph/output_graph_expr_node.dtg.h" +#include "substitutions/output_graph/output_operator_attrs_assignment.h" +#include "substitutions/substitution.h" +#include "substitutions/tensor_pattern/tensor_attribute_pattern.h" +#include "utils/containers/get_only.h" +#include 
"utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("SubstitutionBuilder") { + OperatorAttributePattern relu_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::RELU), + }}; + + OperatorAttributePattern mm_pattern = OperatorAttributePattern{{ + op_type_equals_constraint(OperatorType::LINEAR), + op_attr_key_equals( + OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{std::optional{std::nullopt}}), + }}; + + std::unordered_map + fused_mm_relu_attr_assignments = { + set_attr_to_constant(OperatorAttributeKey::ACTIVATION, + OperatorAttributeValue{Activation::RELU}), + }; + + Substitution correct = [&] { + auto pattern_g = LabelledOpenDataflowGraph:: + create< + UnorderedSetLabelledOpenDataflowGraph>(); + + PatternInput pattern_i_activation = PatternInput{ + pattern_g.add_input(tensor_attribute_pattern_match_all())}; + PatternInput pattern_i_weights = PatternInput{ + pattern_g.add_input(tensor_attribute_pattern_match_all())}; + + NodeAddedResult mm_added = pattern_g.add_node( + mm_pattern, + {OpenDataflowValue{pattern_i_activation.raw_dataflow_graph_input}, + OpenDataflowValue{pattern_i_weights.raw_dataflow_graph_input}}, + {tensor_attribute_pattern_match_all()}); + PatternNode pattern_mm_node = PatternNode{mm_added.node}; + DataflowOutput mm_output = get_only(mm_added.outputs); + + NodeAddedResult relu_added = + pattern_g.add_node(relu_pattern, + {OpenDataflowValue{mm_output}}, + {tensor_attribute_pattern_match_all()}); + PatternNode pattern_relu_node = PatternNode{relu_added.node}; + DataflowOutput relu_output = get_only(relu_added.outputs); + + LabelledOpenDataflowGraph + output_g = LabelledOpenDataflowGraph:: + create>(); + + OutputGraphExprInput output_i_activation = + OutputGraphExprInput{output_g.add_input({})}; + OutputGraphExprInput output_i_weights = + OutputGraphExprInput{output_g.add_input({})}; + + OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = + OutputOperatorAttrsAssignment{ + pattern_mm_node, + fused_mm_relu_attr_assignments, + }; + NodeAddedResult fused_mm_relu_added = output_g.add_node( + fused_mm_relu_attrs_assignment, + {OpenDataflowValue{output_i_activation.raw_dataflow_graph_input}, + OpenDataflowValue{output_i_weights.raw_dataflow_graph_input}}, + {{}}); + OutputGraphExprNode fused_mm_relu_node = + OutputGraphExprNode{fused_mm_relu_added.node}; + DataflowOutput fused_mm_relu_output = + get_only(fused_mm_relu_added.outputs); + + return Substitution{ + PCGPattern{pattern_g}, + OutputGraphExpr{output_g}, + bidict{ + { + pattern_i_activation, + output_i_activation, + }, + { + pattern_i_weights, + output_i_weights, + }, + }, + bidict{ + { + PatternNodeOutput{relu_output}, + OutputGraphExprNodeOutput{fused_mm_relu_output}, + }, + }, + }; + }(); + + Substitution result = [&] { + SubstitutionBuilder b; + + auto [p_input, o_input] = + b.add_input(tensor_attribute_pattern_match_all()); + auto [p_weight, o_weight] = + b.add_input(tensor_attribute_pattern_match_all()); + + PatternValue p_mm_output = + get_only(b.add_pattern_node(mm_pattern, + {p_input, p_weight}, + {tensor_attribute_pattern_match_all()}, + "mm")); + + PatternValue p_relu_output = + get_only(b.add_pattern_node(relu_pattern, + {p_mm_output}, + {tensor_attribute_pattern_match_all()}, + "relu")); + + OutputOperatorAttrsAssignment fused_mm_relu_attrs_assignment = + OutputOperatorAttrsAssignment{ + b.pattern_node_named("mm"), + fused_mm_relu_attr_assignments, + }; + 
OutputGraphExprValue o_fused_output = + get_only(b.add_output_graph_node(fused_mm_relu_attrs_assignment, + {o_input, o_weight}, + nonnegative_int{1})); + + b.equate_outputs(p_relu_output, o_fused_output); + + return b.get_substitution(); + }(); + + CHECK(is_isomorphic_to(result, correct)); + } +} diff --git a/lib/substitutions/test/src/substitutions/unity_substitution_set.cc b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc new file mode 100644 index 0000000000..804fa99bef --- /dev/null +++ b/lib/substitutions/test/src/substitutions/unity_substitution_set.cc @@ -0,0 +1,20 @@ +#include "substitutions/unity_substitution_set.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("get_substitution_set") { + MachineSpecification machine_spec = MachineSpecification{ + /*num_nodes=*/2_n, + /*num_cpus_per_node=*/8_n, + /*num_gpus_per_node=*/4_n, + /*inter_node_bandwidth=*/0.0, + /*intra_node_bandwidth=*/0.0, + }; + + std::vector result = get_substitution_set(machine_spec); + + CHECK(result.size() == 36); + } +} diff --git a/lib/substitutions/test/src/test_pattern_matches.cc b/lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc similarity index 94% rename from lib/substitutions/test/src/test_pattern_matches.cc rename to lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc index aeedd65f82..ab79ad6ff6 100644 --- a/lib/substitutions/test/src/test_pattern_matches.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/find_pattern_matches.cc @@ -9,7 +9,6 @@ #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" #include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" #include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" -#include "utils/overload.h" #include using namespace FlexFlow; @@ -59,30 +58,30 @@ namespace rc { // OpenMultiDiGraphView subgraph = // get_subgraph(as_openmultidigraph(g), // subgraph_nodes); - +// // std::vector matches = // find_pattern_matches(subgraph, as_openmultidigraph(g), AlwaysTrue{}); - +// // RC_ASSERT(!matches.empty()); - +// // for (MultiDiGraphPatternMatch const &match : matches) { // RC_ASSERT(pattern_matches(subgraph, as_openmultidigraph(g), match, // AlwaysTrue{})); // } // }); -// } TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("find_pattern_matches_small") { + TEST_CASE("find_pattern_matches") { OpenDataflowGraph pattern_graph = OpenDataflowGraph::create(); - NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1); + NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1_n); Node pattern_n0 = pattern_n0_added.node; OpenDataflowValue pattern_v0 = OpenDataflowValue{get_only(pattern_n0_added.outputs)}; - NodeAddedResult pattern_n1_added = pattern_graph.add_node({pattern_v0}, 1); + NodeAddedResult pattern_n1_added = + pattern_graph.add_node({pattern_v0}, 1_n); Node pattern_n1 = pattern_n1_added.node; OpenDataflowValue pattern_v1 = OpenDataflowValue{get_only(pattern_n1_added.outputs)}; @@ -94,19 +93,19 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph graph = OpenDataflowGraph::create(); - NodeAddedResult n0_added = graph.add_node({}, 1); + NodeAddedResult n0_added = graph.add_node({}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = graph.add_node({v0}, 1); + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; - NodeAddedResult n2_added = 
graph.add_node({v1}, 1); + NodeAddedResult n2_added = graph.add_node({v1}, 1_n); Node n2 = n2_added.node; OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; - NodeAddedResult n3_added = graph.add_node({v2}, 1); + NodeAddedResult n3_added = graph.add_node({v2}, 1_n); Node n3 = n3_added.node; OpenDataflowValue v3 = OpenDataflowValue{get_only(n3_added.outputs)}; @@ -128,8 +127,8 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector n1_incoming = {OpenDataflowEdge{ DataflowEdge{ - DataflowOutput{n0, 0}, - DataflowInput{n1, 0}, + DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}, }, }}; @@ -201,7 +200,7 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph::create(); DataflowGraphInput i0 = g.add_input(); - NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node g_n0 = g_n0_added.node; OpenDataflowValue g_v0 = OpenDataflowValue{get_only(g_n0_added.outputs)}; PatternNode g_p0 = PatternNode{g_n0}; diff --git a/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc b/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc new file mode 100644 index 0000000000..8fd468d186 --- /dev/null +++ b/lib/substitutions/test/src/substitutions/unlabelled/pattern_matching.cc @@ -0,0 +1,210 @@ +#include "substitutions/unlabelled/pattern_matching.h" +#include "substitutions/unlabelled/find_pattern_matches.h" +#include "substitutions/unlabelled/match_additional_criterion.h" +#include "utils/containers/get_only.h" +#include "utils/graph/instances/unordered_set_dataflow_graph.h" +#include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_subgraph_inputs.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph.h" +#include "utils/overload.h" +#include + +using namespace FlexFlow; + +namespace rc { + +// template <> +// struct Arbitrary { +// static int const MAX_GRAPH_SIZE = 200; +// static int const MAX_EDGE_SIZE = 1000; +// +// static Gen arbitrary() { +// return gen::exec([&] { +// int num_nodes = *gen::inRange(1, MAX_GRAPH_SIZE + 1); +// MultiDiGraph g = MultiDiGraph::template +// create(); +// +// std::vector nodes; +// for (int i = 0; i < num_nodes; ++i) { +// nodes.push_back(g.add_node()); +// } +// +// int num_edges = *gen::inRange(1, MAX_GRAPH_SIZE + 1); +// for (int i = 0; i < num_edges; ++i) { +// int src_id = *gen::inRange(0, num_nodes); +// int dst_id = *gen::inRange(0, num_nodes); +// if (src_id > dst_id) { +// std::swap(src_id, dst_id); +// } +// +// g.add_edge(MultiDiEdge{nodes[dst_id], +// g.add_node_port(), +// nodes[src_id], +// g.add_node_port()}); +// } +// +// return g; +// }); +// } +// }; + +} // namespace rc + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("find_pattern_matches") { + OpenDataflowGraph pattern_graph = + OpenDataflowGraph::create(); + + NodeAddedResult pattern_n0_added = pattern_graph.add_node({}, 1_n); + Node pattern_n0 = pattern_n0_added.node; + OpenDataflowValue pattern_v0 = + OpenDataflowValue{get_only(pattern_n0_added.outputs)}; + + NodeAddedResult pattern_n1_added = + pattern_graph.add_node({pattern_v0}, 1_n); + Node pattern_n1 = pattern_n1_added.node; + OpenDataflowValue pattern_v1 = + OpenDataflowValue{get_only(pattern_n1_added.outputs)}; + + UnlabelledGraphPattern pattern = 
UnlabelledGraphPattern{pattern_graph}; + PatternNode p0 = PatternNode{pattern_n0}; + PatternNode p1 = PatternNode{pattern_n1}; + + OpenDataflowGraph graph = + OpenDataflowGraph::create(); + + NodeAddedResult n0_added = graph.add_node({}, 1_n); + Node n0 = n0_added.node; + OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; + + NodeAddedResult n1_added = graph.add_node({v0}, 1_n); + Node n1 = n1_added.node; + OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; + + NodeAddedResult n2_added = graph.add_node({v1}, 1_n); + Node n2 = n2_added.node; + OpenDataflowValue v2 = OpenDataflowValue{get_only(n2_added.outputs)}; + + NodeAddedResult n3_added = graph.add_node({v2}, 1_n); + Node n3 = n3_added.node; + OpenDataflowValue v3 = OpenDataflowValue{get_only(n3_added.outputs)}; + + UnlabelledDataflowGraphPatternMatch match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {p0, n0}, + {p1, n1}, + }, + bidict{}}; + + UnlabelledDataflowGraphPatternMatch invalid_match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {p0, n1}, + {p1, n2}, + }, + bidict{}}; + + std::vector n1_incoming = {OpenDataflowEdge{ + DataflowEdge{ + DataflowOutput{n0, 0_n}, + DataflowInput{n1, 0_n}, + }, + }}; + + SUBCASE("get_incoming_edges") { + SUBCASE("n0") { + std::vector result = get_incoming_edges(graph, n0); + std::vector correct = {}; + CHECK(result == correct); + } + SUBCASE("n1") { + std::vector result = get_incoming_edges(graph, n1); + std::vector correct = n1_incoming; + CHECK(result == correct); + } + SUBCASE("both") { + std::unordered_map> result = + get_incoming_edges(graph, {n0, n1}); + std::unordered_map> correct = { + {n0, {}}, {n1, n1_incoming}}; + CHECK(result == correct); + } + } + + SUBCASE("get_subgraph_inputs") { + std::unordered_set result = + get_subgraph_inputs(graph, {n0, n1}); + std::unordered_set correct = {}; + CHECK(result == correct); + } + + SUBCASE("get_subgraph") { + OpenDataflowGraphView g = get_subgraph(graph, {n0, n1}).graph; + SUBCASE("nodes") { + std::unordered_set result = get_nodes(g); + std::unordered_set correct = {n0, n1}; + CHECK(result == correct); + } + SUBCASE("inputs") { + std::unordered_set result = g.get_inputs(); + std::unordered_set correct = {}; + CHECK(result == correct); + } + SUBCASE("get_open_dataflow_values") { + std::unordered_set values = + get_open_dataflow_values(g); + CHECK(values.size() == 2); + } + } + + SUBCASE("subgraph_matched") { + OpenDataflowGraphView result = subgraph_matched(graph, match).graph; + std::unordered_set result_nodes = get_nodes(result); + std::unordered_set correct_nodes = {n0, n1}; + CHECK(result_nodes == correct_nodes); + } + + SUBCASE("unlabelled_pattern_does_match") { + CHECK(unlabelled_pattern_does_match( + pattern, graph, match, match_additional_crition_always_true())); + CHECK_FALSE(unlabelled_pattern_does_match( + pattern, + graph, + invalid_match, + match_additional_crition_always_true())); + } + + SUBCASE("unlabelled_pattern_does_match") { + OpenDataflowGraph g = + OpenDataflowGraph::create(); + DataflowGraphInput i0 = g.add_input(); + + NodeAddedResult g_n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); + Node g_n0 = g_n0_added.node; + OpenDataflowValue g_v0 = OpenDataflowValue{get_only(g_n0_added.outputs)}; + PatternNode g_p0 = PatternNode{g_n0}; + PatternInput g_pi0 = PatternInput{i0}; + + UnlabelledGraphPattern open_pattern = UnlabelledGraphPattern{g}; + + UnlabelledDataflowGraphPatternMatch open_match = + UnlabelledDataflowGraphPatternMatch{ + bidict{ + {g_p0, n1}, + }, + bidict{ + 
{g_pi0, v0}, + }}; + + CHECK(unlabelled_pattern_does_match( + open_pattern, + graph, + open_match, + match_additional_crition_always_true())); + } + } +} diff --git a/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc b/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc index e4d763d9c3..1bddb9f680 100644 --- a/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/pattern_split.cc @@ -13,11 +13,11 @@ TEST_SUITE(FF_TEST_SUITE) { OpenDataflowGraph g = OpenDataflowGraph::create(); - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = g.add_node({v0}, 1); + NodeAddedResult n1_added = g.add_node({v0}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; @@ -77,11 +77,11 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraphInput i0 = g.add_input(); DataflowGraphInput i1 = g.add_input(); - NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1); + NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n); Node n0 = n0_added.node; OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; - NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i1}}, 1); + NodeAddedResult n1_added = g.add_node({OpenDataflowValue{i1}}, 1_n); Node n1 = n1_added.node; OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; diff --git a/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc b/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc index e0805dbfd4..22d1b8a2a5 100644 --- a/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc +++ b/lib/substitutions/test/src/substitutions/unlabelled/unlabelled_graph_pattern.cc @@ -17,7 +17,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK_FALSE(is_singleton_pattern(pattern)); } - NodeAddedResult n0_added = g.add_node({}, 1); + NodeAddedResult n0_added = g.add_node({}, 1_n); OpenDataflowValue v0 = OpenDataflowValue{get_only(n0_added.outputs)}; SUBCASE("1 node") { @@ -26,7 +26,7 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(is_singleton_pattern(pattern)); } - NodeAddedResult n1_added = g.add_node({v0}, 1); + NodeAddedResult n1_added = g.add_node({v0}, 1_n); OpenDataflowValue v1 = OpenDataflowValue{get_only(n1_added.outputs)}; SUBCASE("more than 1 node") { diff --git a/lib/substitutions/test/src/test_substitution.cc b/lib/substitutions/test/src/test_substitution.cc deleted file mode 100644 index dcb06a78fa..0000000000 --- a/lib/substitutions/test/src/test_substitution.cc +++ /dev/null @@ -1,148 +0,0 @@ -#include "doctest/doctest.h" -#include "op-attrs/get_op_type.h" -#include "rapidcheck.h" -#include "substitutions/substitution.h" - -using namespace FlexFlow; - -// TEST_SUITE(FF_TEST_SUITE) { -// TEST_CASE("substitution") { -// PCGPattern pattern; -// OutputGraphExpr output_expr; -// bidict{ -// OperatorAttributeConstraint{ConstraintType::EQUAL, -// OperatorAttributeKey::OP_TYPE, -// OperatorType::LINEAR}}}; -// -// ParallelTensorPattern tensor_pattern_e0{ -// std::vector{ -// TensorAttributeConstraint{ConstraintType::EQUAL, -// ListIndexAccess{ -// TensorAttributeKey::DIM_SIZES, 0}, -// 2}}}; -// -// ParallelTensorPattern tensor_pattern_empty{ -// std::vector{}}; -// -// auto ig = -// OutputLabelledOpenMultiDiGraph:: -// create>(); -// Node n0 = 
ig.add_node(operator_pattern_n0); -// NodePort p0 = ig.add_node_port(); -// InputMultiDiEdge e0{n0, p0, std::make_pair(p0.value(), p0.value())}; -// ig.add_edge(e0); -// ig.add_label(e0, tensor_pattern_e0); -// -// RC_ASSERT(get_nodes(ig).size() == 1); -// RC_ASSERT(get_edges(ig).size() == 1); -// -// GraphPattern input_graph{ig}; -// -// OperatorAttrAssignment op_ass_n1{ -// {{OperatorAttributeKey::OP_TYPE, -// AttrConstant{OperatorType::REPARTITION}}, -// {OperatorAttributeKey::PARALLEL_DIM, -// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, -// {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; -// -// OperatorAttrAssignment op_ass_n2{ -// {{OperatorAttributeKey::OP_TYPE, AttrConstant{OperatorType::LINEAR}}, -// {OperatorAttributeKey::OUT_CHANNELS, -// OperatorAttrAccess{n0, OperatorAttributeKey::OUT_CHANNELS}}, -// {OperatorAttributeKey::USE_BIAS, -// OperatorAttrAccess{n0, OperatorAttributeKey::USE_BIAS}}, -// {OperatorAttributeKey::DATA_TYPE, -// OperatorAttrAccess{n0, OperatorAttributeKey::DATA_TYPE}}, -// {OperatorAttributeKey::ACTIVATION, -// OperatorAttrAccess{n0, OperatorAttributeKey::ACTIVATION}}, -// {OperatorAttributeKey::REGULARIZER, -// OperatorAttrAccess{n0, OperatorAttributeKey::REGULARIZER}}}}; -// -// OperatorAttrAssignment op_ass_n3{ -// {{OperatorAttributeKey::OP_TYPE, -// AttrConstant{OperatorType::REDUCTION}}, -// {OperatorAttributeKey::PARALLEL_DIM, -// AttrConstant{ff_dim_t{nonnegative_int{0}}}}, -// {OperatorAttributeKey::PARALLEL_DEGREE, AttrConstant{2}}}}; -// -// auto og = NodeLabelledOpenMultiDiGraph::create< -// UnorderedNodeLabelledOpenMultiDiGraph>(); -// Node n1 = og.add_node(op_ass_n1); -// Node n2 = og.add_node(op_ass_n2); -// Node n3 = og.add_node(op_ass_n3); -// NodePort p1 = og.add_node_port(); -// NodePort p2 = og.add_node_port(); -// NodePort p3 = og.add_node_port(); -// InputMultiDiEdge e1{n1, p1, {p1.value(), p1.value()}}; -// MultiDiEdge e2{n2, p2, n1, p1}; -// MultiDiEdge e3{n3, p3, n2, p2}; -// og.add_edge(e1); -// og.add_edge(e2); -// og.add_edge(e3); -// OutputGraphExpr output_graph_expr{og}; -// -// RC_ASSERT(get_nodes(og).size() == 3); -// RC_ASSERT(get_edges(og).size() == 3); -// -// bidict input_mapping; -// input_mapping.equate(e0, e1); -// bidict output_mapping; -// -// Substitution substitution{ -// input_graph, output_graph_expr, input_mapping, output_mapping}; -// -// SubParallelComputationGraph pcg = -// OutputLabelledOpenMultiDiGraph::create< -// UnorderedOutputLabelledOpenMultiDiGraph>(); -// -// Node n4 = pcg.add_node(Operator{InputAttrs{}, "input"}); -// Node n5 = pcg.add_node(Operator{ -// LinearAttrs{1, false, DataType::FLOAT, Activation::RELU, -// std::nullopt}, "linear"}); -// NodePort p4 = pcg.add_node_port(); -// NodePort p5 = pcg.add_node_port(); -// -// MultiDiEdge e4{n5, p5, n4, p4}; -// pcg.add_edge(e4); -// ParallelDim dim = {2, 1, false}; -// ParallelTensorDims dims = {FFOrdered{dim}}; -// pcg.add_label(e4, ParallelTensor(dims, DataType::FLOAT, -// CreateGrad::YES)); -// -// MatchAdditionalCriterion criterion{ -// [&](Node const &pattern_node, Node const &graph_node) { -// return operator_satisfies(pcg.at(graph_node), -// input_graph.value().at(pattern_node)); -// }, -// [&](OpenMultiDiEdge const &pattern_edge, -// OpenMultiDiEdge const &graph_edge) { -// return parallel_tensor_satisfies( -// pcg.at(graph_edge), input_graph.value().at(pattern_edge)); -// }}; -// -// RC_ASSERT(criterion.node_criterion(n0, n5)); -// -// std::vector matches = -// find_pattern_matches(input_graph, pcg, criterion); -// -// 
RC_ASSERT(matches.size() == 1); -// -// SubParallelComputationGraph new_pcg = -// apply_substitution(pcg, substitution, matches[0]); -// -// RC_ASSERT(get_nodes(new_pcg).size() == 4); -// RC_ASSERT(get_edges(new_pcg).size() == 3); -// } -// } diff --git a/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h b/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h index 86ef6c4b4d..83afc32e0c 100644 --- a/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h +++ b/lib/utils/include/utils/bidict/algorithms/bidict_from_enumerating.h @@ -2,14 +2,16 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_BIDICT_ALGORITHMS_BIDICT_FROM_ENUMERATING_H #include "utils/bidict/bidict.h" +#include "utils/nonnegative_int/nonnegative_int.h" #include namespace FlexFlow { template -bidict bidict_from_enumerating(std::unordered_set const &s) { - bidict result; - int idx = 0; +bidict + bidict_from_enumerating(std::unordered_set const &s) { + bidict result; + nonnegative_int idx = 0_n; for (T const &t : s) { result.equate(idx, t); idx++; @@ -19,9 +21,9 @@ bidict bidict_from_enumerating(std::unordered_set const &s) { } template -bidict bidict_from_enumerating(std::set const &s) { - bidict result; - int idx = 0; +bidict bidict_from_enumerating(std::set const &s) { + bidict result; + nonnegative_int idx = 0_n; for (T const &t : s) { result.equate(idx, t); idx++; diff --git a/lib/utils/include/utils/cli/cli_flag_key.struct.toml b/lib/utils/include/utils/cli/cli_flag_key.struct.toml index 790a752911..9c02fddc3e 100644 --- a/lib/utils/include/utils/cli/cli_flag_key.struct.toml +++ b/lib/utils/include/utils/cli/cli_flag_key.struct.toml @@ -6,8 +6,10 @@ features = [ "fmt", ] -includes = [] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "raw_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml b/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml index d571d0deb3..4c50c277c0 100644 --- a/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml +++ b/lib/utils/include/utils/cli/cli_positional_argument_key.struct.toml @@ -6,8 +6,10 @@ features = [ "fmt", ] -includes = [] +includes = [ + "utils/nonnegative_int/nonnegative_int.h", +] [[fields]] name = "raw_idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/containers/at_idx.h b/lib/utils/include/utils/containers/at_idx.h index 757da5c548..fdc13a0231 100644 --- a/lib/utils/include/utils/containers/at_idx.h +++ b/lib/utils/include/utils/containers/at_idx.h @@ -1,17 +1,18 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_AT_IDX_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_AT_IDX_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include namespace FlexFlow { template -std::optional at_idx(std::vector const &v, size_t idx) { +std::optional at_idx(std::vector const &v, nonnegative_int idx) { if (idx >= v.size()) { return std::nullopt; } else { - return v.at(idx); + return v.at(idx.unwrap_nonnegative()); } } diff --git a/lib/utils/include/utils/containers/enumerate.h b/lib/utils/include/utils/containers/enumerate.h index e3722e52c6..1e8bc1f3dc 100644 --- a/lib/utils/include/utils/containers/enumerate.h +++ b/lib/utils/include/utils/containers/enumerate.h @@ -11,14 +11,14 @@ namespace FlexFlow { /** * @brief Generate a map from indices to elements of \p c. 
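+ * For example (a sketch of the behavior implemented below):
+ *   enumerate(std::vector<char>{'a', 'b', 'c'}) yields
+ *   {{0_n, 'a'}, {1_n, 'b'}, {2_n, 'c'}}.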
 *
- * @note We return a std::map<int, T> rather than a
- * std::vector<std::pair<int, T>> for consistency
+ * @note We return a std::map<nonnegative_int, T> rather than a
+ * std::vector<std::pair<nonnegative_int, T>> for consistency
  * with enumerate(FFOrdered<T> const &). Note that std::map
  * provides ordered iteration in increasing order, so iterating through
  * the result of this function should still function as expected.
  */
 template <typename T>
-std::map<int, T> enumerate(std::vector<T> const &c) {
+std::map<nonnegative_int, T> enumerate(std::vector<T> const &c) {
   return enumerate_vector(c);
 }

@@ -27,16 +27,16 @@ std::map<int, T> enumerate(std::vector<T> const &c) {
  * return a map from indices of this ordering to elements of \p c.
  *
- * @note We return a std::map<int, T> rather than a
- * std::vector<std::pair<int, T>> for consistency
+ * @note We return a std::map<nonnegative_int, T> rather than a
+ * std::vector<std::pair<nonnegative_int, T>> for consistency
  * with enumerate(FFOrdered<T> const &). Note that std::map
  * provides ordered iteration in increasing order, so iterating through
  * the result of this function should still function as expected.
  */
 template <typename T>
-std::map<int, T> enumerate(std::unordered_set<T> const &c) {
-  std::map<int, T> result;
-  int idx = 0;
+std::map<nonnegative_int, T> enumerate(std::unordered_set<T> const &c) {
+  std::map<nonnegative_int, T> result;
+  nonnegative_int idx = 0_n;
   for (auto const &v : c) {
     result.insert({idx++, v});
   }
diff --git a/lib/utils/include/utils/containers/enumerate_vector.h b/lib/utils/include/utils/containers/enumerate_vector.h
index 700106ea3f..1e66279306 100644
--- a/lib/utils/include/utils/containers/enumerate_vector.h
+++ b/lib/utils/include/utils/containers/enumerate_vector.h
@@ -1,16 +1,19 @@
 #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ENUMERATE_VECTOR_H
 #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_ENUMERATE_VECTOR_H

+#include "utils/nonnegative_int/nonnegative_int.h"
+#include "utils/nonnegative_int/nonnegative_range.h"
+#include "utils/nonnegative_int/num_elements.h"
 #include <map>
 #include <vector>

 namespace FlexFlow {

 template <typename T>
-std::map<int, T> enumerate_vector(std::vector<T> const &v) {
-  std::map<int, T> result;
-  for (int i = 0; i < v.size(); i++) {
-    result.insert({i, v.at(i)});
+std::map<nonnegative_int, T> enumerate_vector(std::vector<T> const &v) {
+  std::map<nonnegative_int, T> result;
+  for (nonnegative_int i : nonnegative_range(num_elements(v))) {
+    result.insert({i, v.at(i.unwrap_nonnegative())});
   }
   return result;
 }
diff --git a/lib/utils/include/utils/containers/flatmap.h b/lib/utils/include/utils/containers/flatmap.h
index b016a1e03d..a7848b88aa 100644
--- a/lib/utils/include/utils/containers/flatmap.h
+++ b/lib/utils/include/utils/containers/flatmap.h
@@ -4,6 +4,7 @@
 #include "utils/containers/extend.h"
 #include "utils/containers/get_element_type.h"
 #include "utils/containers/merge_maps.h"
+#include <string>
 #include <unordered_map>
 #include <vector>

@@ -52,7 +53,19 @@ std::unordered_map<K2, V2> flatmap(std::unordered_map<K, V> const &m,
   std::unordered_map<K2, V2> result;

   for (auto const &[k, v] : m) {
-    result = merge_maps(result, f(k, v));
+    result = merge_disjoint_maps(result, f(k, v));
+  }
+
+  return result;
+}
+
+template <typename F>
+std::string flatmap(std::string const &input, F const &f) {
+  std::string result = "";
+
+  for (char c : input) {
+    std::string for_c = f(c);
+    result += for_c;
   }

   return result;
diff --git a/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h b/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h
index ccdde0131a..0a7e9d16c2 100644
--- a/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h
+++ b/lib/utils/include/utils/containers/get_all_permutations_with_repetition.h
@@ -1,6 +1,7 @@
 #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_GET_ALL_PERMUTATIONS_WITH_REPETITION_H
 #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_GET_ALL_PERMUTATIONS_WITH_REPETITION_H
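+// Generates every n-length sequence over the container's elements by treating
+// the index vector as an odometer: bump the last digit and carry leftwards
+// when a digit wraps (a brief sketch of the loop below).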
+#include "utils/nonnegative_int/nonnegative_int.h"
 #include <unordered_set>
 #include <vector>

@@ -14,7 +15,8 @@ namespace FlexFlow {
  **/
 template <typename C, typename V = typename C::value_type>
 std::unordered_multiset<std::vector<V>>
-    get_all_permutations_with_repetition(C const &container, int n) {
+    get_all_permutations_with_repetition(C const &container,
+                                         nonnegative_int n) {
   std::unordered_multiset<std::vector<V>> result;

   if (container.empty() || n == 0) {
@@ -22,16 +24,16 @@ std::unordered_multiset<std::vector<V>>
   }

   std::vector<V> elements(std::begin(container), std::end(container));
-  std::vector<int> indices(n, 0);
+  std::vector<int> indices(n.unwrap_nonnegative(), 0);

   while (true) {
-    std::vector<V> perm(n);
+    std::vector<V> perm(n.unwrap_nonnegative());
     for (int i = 0; i < n; ++i) {
       perm[i] = elements[indices[i]];
     }
     result.insert(perm);

-    int i = n - 1;
+    int i = n.unwrap_nonnegative() - 1;
     while (i != -1 && ++indices[i] == elements.size()) {
       indices[i] = 0;
       --i;
diff --git a/lib/utils/include/utils/containers/make.h b/lib/utils/include/utils/containers/make.h
new file mode 100644
index 0000000000..f7b15dfa02
--- /dev/null
+++ b/lib/utils/include/utils/containers/make.h
@@ -0,0 +1,13 @@
+#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAKE_H
+#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_MAKE_H
+
+namespace FlexFlow {
+
+template <typename T>
+decltype(auto) make() {
+  return [](auto const &x) { return T{x}; };
+}
+
+} // namespace FlexFlow
+
+#endif
diff --git a/lib/utils/include/utils/containers/merge_maps.h b/lib/utils/include/utils/containers/merge_maps.h
index dd886ab8aa..bfc2446d99 100644
--- a/lib/utils/include/utils/containers/merge_maps.h
+++ b/lib/utils/include/utils/containers/merge_maps.h
@@ -3,30 +3,64 @@

 #include "utils/containers/are_disjoint.h"
+#include "utils/containers/intersection.h"
 #include "utils/containers/keys.h"
+#include "utils/containers/merge_method.dtg.h"
 #include "utils/exception.h"
 #include "utils/fmt/unordered_map.h"
+#include "utils/fmt/unordered_set.h"
 #include <unordered_map>

 namespace FlexFlow {

 template <typename K, typename V>
-std::unordered_map<K, V> merge_maps(std::unordered_map<K, V> const &lhs,
-                                    std::unordered_map<K, V> const &rhs) {
-  if (!are_disjoint(keys(lhs), keys(rhs))) {
-    throw mk_runtime_error(fmt::format("Key sets of merge_maps parameters are "
-                                       "non-disjoint: lhs = {}, rhs = {}",
-                                       lhs,
-                                       rhs));
+void merge_in_map(std::unordered_map<K, V> const &m,
+                  std::unordered_map<K, V> &result) {
+  for (auto const &[k, v] : m) {
+    auto it = result.find(k);
+    if (it != result.end()) {
+      it->second = v;
+    } else {
+      result.insert({k, v});
+    }
   }
+}

-  std::unordered_map<K, V> result;
-  for (auto const &kv : lhs) {
-    result.insert(kv);
-  }
-  for (auto const &kv : rhs) {
-    result.insert(kv);
+template <typename K, typename V>
+std::unordered_map<K, V>
+    merge_disjoint_maps(std::unordered_map<K, V> const &lhs,
+                        std::unordered_map<K, V> const &rhs) {
+
+  std::unordered_set<K> lhs_keys = keys(lhs);
+  std::unordered_set<K> rhs_keys = keys(rhs);
+  std::unordered_set<K> shared_keys = intersection(lhs_keys, rhs_keys);
+  if (!shared_keys.empty()) {
+    throw mk_runtime_error(
+        fmt::format("merge_maps expected disjoint maps, but maps share keys {}",
+                    shared_keys));
   }

+  std::unordered_map<K, V> result;
+  merge_in_map(lhs, result);
+  merge_in_map(rhs, result);
+  return result;
+}
+
+template <typename K, typename V>
+std::unordered_map<K, V>
+    merge_map_left_dominates(std::unordered_map<K, V> const &lhs,
+                             std::unordered_map<K, V> const &rhs) {
+  std::unordered_map<K, V> result;
+  merge_in_map(rhs, result);
+  merge_in_map(lhs, result);
+  return result;
+}
+
+template <typename K, typename V>
+std::unordered_map<K, V>
+    merge_map_right_dominates(std::unordered_map<K, V> const &lhs,
+                              std::unordered_map<K, V> const &rhs) {
+  std::unordered_map<K, V> result;
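+  // merge_in_map overwrites keys already present in result, so merging lhs
+  // first and rhs second lets rhs win on shared keys (right dominates);
+  // merge_map_left_dominates above merges in the opposite order.
+  // e.g. merge_map_right_dominates({{1, 'a'}}, {{1, 'b'}}) == {{1, 'b'}}.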
merge_in_map(lhs, result); + merge_in_map(rhs, result); return result; } diff --git a/lib/utils/include/utils/containers/merge_method.enum.toml b/lib/utils/include/utils/containers/merge_method.enum.toml new file mode 100644 index 0000000000..ec0ed067dd --- /dev/null +++ b/lib/utils/include/utils/containers/merge_method.enum.toml @@ -0,0 +1,17 @@ +namespace = "FlexFlow" +name = "MergeMethod" +features = [ + "json", + "hash", + "fmt", + "rapidcheck", +] + +[[values]] +name = "REQUIRE_DISJOINT" + +[[values]] +name = "LEFT_DOMINATES" + +[[values]] +name = "RIGHT_DOMINATES" diff --git a/lib/utils/include/utils/containers/product.h b/lib/utils/include/utils/containers/product.h index af04edcb81..30aac2681a 100644 --- a/lib/utils/include/utils/containers/product.h +++ b/lib/utils/include/utils/containers/product.h @@ -10,7 +10,7 @@ namespace FlexFlow { **/ template Element product(Container const &container) { - Element result = 1; + Element result = Element{1}; for (Element const &element : container) { result *= element; } diff --git a/lib/utils/include/utils/containers/repeat.h b/lib/utils/include/utils/containers/repeat.h index 18de92cf4a..9782d6265a 100644 --- a/lib/utils/include/utils/containers/repeat.h +++ b/lib/utils/include/utils/containers/repeat.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPEAT_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPEAT_H +#include "utils/nonnegative_int/nonnegative_int.h" #include #include #include @@ -8,9 +9,7 @@ namespace FlexFlow { template > -std::vector repeat(int n, F const &f) { - assert(n >= 0); - +std::vector repeat(nonnegative_int n, F const &f) { std::vector result; for (int i = 0; i < n; i++) { result.push_back(f()); diff --git a/lib/utils/include/utils/containers/repeat_element.h b/lib/utils/include/utils/containers/repeat_element.h new file mode 100644 index 0000000000..e1ac508116 --- /dev/null +++ b/lib/utils/include/utils/containers/repeat_element.h @@ -0,0 +1,22 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H + +#include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include +#include + +namespace FlexFlow { + +template +std::vector repeat_element(nonnegative_int num_times, T const &element) { + std::vector result; + for (int i = 0; i < num_times; ++i) { + result.push_back(element); + } + return result; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/containers/replicate.h b/lib/utils/include/utils/containers/replicate.h deleted file mode 100644 index aa3d0a7e35..0000000000 --- a/lib/utils/include/utils/containers/replicate.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H -#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_CONTAINERS_REPLICATE_H - -#include - -namespace FlexFlow { - -template -std::vector replicate(int n, T const &element) { - return std::vector(n, element); -} - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/include/utils/containers/sum.h b/lib/utils/include/utils/containers/sum.h index 135e704045..d6061e396e 100644 --- a/lib/utils/include/utils/containers/sum.h +++ b/lib/utils/include/utils/containers/sum.h @@ -8,7 +8,7 @@ namespace FlexFlow { **/ template Element sum(Container const &container) { - Element result = 0; + Element result = Element{0}; for (Element const &element : container) { result += element; } diff --git 
a/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h new file mode 100644 index 0000000000..b12e20124f --- /dev/null +++ b/lib/utils/include/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h @@ -0,0 +1,34 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H + +#include "utils/graph/dataflow_graph/dataflow_graph_view.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" + +namespace FlexFlow { + +struct ViewDataflowGraphAsOpenDataflowGraph final + : public IOpenDataflowGraphView { + + ViewDataflowGraphAsOpenDataflowGraph() = delete; + ViewDataflowGraphAsOpenDataflowGraph(DataflowGraphView const &); + + std::unordered_set query_nodes(NodeQuery const &) const override; + std::unordered_set + query_outputs(DataflowOutputQuery const &) const override; + std::unordered_set get_inputs() const override; + std::unordered_set + query_edges(OpenDataflowEdgeQuery const &) const override; + + ViewDataflowGraphAsOpenDataflowGraph *clone() const override; + + virtual ~ViewDataflowGraphAsOpenDataflowGraph() = default; + +private: + DataflowGraphView g; +}; + +OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml index 0b0c5a41d8..aed0c28aeb 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_edge_query.struct.toml @@ -10,6 +10,7 @@ features = [ includes = [ "utils/graph/query_set.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -18,7 +19,7 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "src_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" [[fields]] name = "dst_nodes" @@ -26,4 +27,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "dst_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h index 6a1898dd13..58c28aaff6 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_graph.h @@ -4,13 +4,14 @@ #include "utils/graph/dataflow_graph/dataflow_graph_view.h" #include "utils/graph/dataflow_graph/i_dataflow_graph.h" #include "utils/graph/dataflow_graph/node_added_result.dtg.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { struct DataflowGraph : virtual public DataflowGraphView { public: NodeAddedResult add_node(std::vector const &inputs, - int num_outputs); + nonnegative_int num_outputs); void add_node_unsafe(Node const &node, std::vector const &inputs, diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml index f322fa63fe..eb9c30d558 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml +++ 
b/lib/utils/include/utils/graph/dataflow_graph/dataflow_input.struct.toml @@ -9,6 +9,7 @@ features = [ includes = [ "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -17,4 +18,4 @@ type = "::FlexFlow::Node" [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml index f3ccebe046..19d92a3d4c 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output.struct.toml @@ -9,6 +9,7 @@ features = [ includes = [ "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -17,4 +18,4 @@ type = "::FlexFlow::Node" [[fields]] name = "idx" -type = "int" +type = "::FlexFlow::nonnegative_int" diff --git a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml index 0701855ba6..d1af6d5c0d 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml +++ b/lib/utils/include/utils/graph/dataflow_graph/dataflow_output_query.struct.toml @@ -10,6 +10,10 @@ features = [ includes = [ "utils/graph/query_set.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", +] + +src_includes = [ "utils/fmt/unordered_set.h", ] @@ -19,4 +23,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "output_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h b/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h index 87882a6242..2572fe5c68 100644 --- a/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h +++ b/lib/utils/include/utils/graph/dataflow_graph/i_dataflow_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { struct IDataflowGraph : virtual public IDataflowGraphView { virtual NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) = 0; + nonnegative_int num_outputs) = 0; virtual void add_node_unsafe(Node const &node, std::vector const &inputs, diff --git a/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h b/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h index 4ed83834a2..ecba7921af 100644 --- a/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h +++ b/lib/utils/include/utils/graph/instances/unordered_set_dataflow_graph.h @@ -14,9 +14,9 @@ struct UnorderedSetDataflowGraph final : virtual public IDataflowGraph, UnorderedSetDataflowGraph(); NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; DataflowGraphInput add_input() override; std::unordered_set query_nodes(NodeQuery const &) const override; diff --git a/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h b/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h index f1063c1f21..159778bb6d 100644 --- a/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/instances/unordered_set_labelled_open_dataflow_graph.h @@ -57,9 +57,10 @@ 
struct UnorderedSetLabelledOpenDataflowGraph final } std::vector new_outputs = - transform(count(output_labels.size()), [&](int output_idx) { - return DataflowOutput{new_node, output_idx}; - }); + transform(nonnegative_range(num_elements(output_labels)), + [&](nonnegative_int output_idx) { + return DataflowOutput{new_node, output_idx}; + }); for (auto const &[output, output_label] : zip(new_outputs, output_labels)) { this->values.insert({OpenDataflowValue{output}, output_label}); diff --git a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h index ec8f025ac3..2115a03cda 100644 --- a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h +++ b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/get_graph_data.h @@ -4,6 +4,7 @@ #include "utils/graph/labelled_open_dataflow_graph/algorithms/labelled_open_dataflow_graph_data.dtg.h" #include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_edges.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_values.h" namespace FlexFlow { diff --git a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h index 2d1dd03755..88950635d2 100644 --- a/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h +++ b/lib/utils/include/utils/graph/labelled_open_dataflow_graph/algorithms/permute_node_ids.h @@ -1,6 +1,7 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_LABELLED_OPEN_DATAFLOW_GRAPH_ALGORITHMS_PERMUTE_NODE_IDS_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_LABELLED_OPEN_DATAFLOW_GRAPH_ALGORITHMS_PERMUTE_NODE_IDS_H +#include "utils/containers/generate_map.h" #include "utils/graph/labelled_open_dataflow_graph/algorithms/with_labelling.h" #include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" #include "utils/graph/node/algorithms.h" diff --git a/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h b/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h index 737f2d0d23..80d0ca3eaf 100644 --- a/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h +++ b/lib/utils/include/utils/graph/multidigraph/algorithms/add_nodes.h @@ -2,10 +2,11 @@ #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_MULTIDIGRAPH_ALGORITHMS_ADD_NODES_H #include "utils/graph/multidigraph/multidigraph.h" +#include "utils/nonnegative_int/nonnegative_int.h" namespace FlexFlow { -std::vector add_nodes(MultiDiGraph &, int num_nodes); +std::vector add_nodes(MultiDiGraph &, nonnegative_int num_nodes); } // namespace FlexFlow diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h new file mode 100644 index 0000000000..ae99e2850f --- /dev/null +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h @@ -0,0 +1,13 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_ARE_ISOMORPHIC_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_ARE_ISOMORPHIC_H + +#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" + +namespace FlexFlow { + +bool 
are_isomorphic(OpenDataflowGraphView const &, + OpenDataflowGraphView const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h new file mode 100644 index 0000000000..fe282a8c2e --- /dev/null +++ b/lib/utils/include/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h @@ -0,0 +1,21 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_OPEN_DATAFLOW_GRAPH_ISOMORPHISM_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_GRAPH_OPEN_DATAFLOW_GRAPH_ALGORITHMS_OPEN_DATAFLOW_GRAPH_ISOMORPHISM_H + +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.dtg.h" +#include "utils/graph/open_dataflow_graph/open_dataflow_value.dtg.h" + +namespace FlexFlow { + +OpenDataflowValue isomorphism_map_r_open_dataflow_value_from_l( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &l_value); +OpenDataflowValue isomorphism_map_l_open_dataflow_value_from_r( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &r_value); + +DataflowOutput isomorphism_map_r_dataflow_output_from_l( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &l_output); +DataflowOutput isomorphism_map_l_dataflow_output_from_r( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &r_output); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml b/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml index 544a05af85..f67e8b88e0 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml +++ b/lib/utils/include/utils/graph/open_dataflow_graph/dataflow_input_edge_query.struct.toml @@ -11,6 +11,7 @@ includes = [ "utils/graph/query_set.h", "utils/graph/open_dataflow_graph/dataflow_graph_input.dtg.h", "utils/graph/node/node.dtg.h", + "utils/nonnegative_int/nonnegative_int.h", ] [[fields]] @@ -23,4 +24,4 @@ type = "::FlexFlow::query_set<::FlexFlow::Node>" [[fields]] name = "dst_idxs" -type = "::FlexFlow::query_set" +type = "::FlexFlow::query_set<::FlexFlow::nonnegative_int>" diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h index 6edfa408d4..9b71b06e62 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/i_open_dataflow_graph.h @@ -9,7 +9,7 @@ namespace FlexFlow { struct IOpenDataflowGraph : virtual public IOpenDataflowGraphView { virtual NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) = 0; + nonnegative_int num_outputs) = 0; virtual DataflowGraphInput add_input() = 0; virtual IOpenDataflowGraph *clone() const = 0; diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h index 09499f8e5f..1102bf0586 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_edge.h @@ -7,7 +7,7 @@ namespace FlexFlow { Node get_open_dataflow_edge_dst_node(OpenDataflowEdge const &); -int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &); +nonnegative_int 
get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &); DataflowInput get_open_dataflow_edge_dst(OpenDataflowEdge const &); OpenDataflowValue get_open_dataflow_edge_src(OpenDataflowEdge const &); OpenDataflowEdge diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h index e8ecce76e8..9d48020d5f 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/open_dataflow_graph.h @@ -11,7 +11,7 @@ namespace FlexFlow { struct OpenDataflowGraph : virtual public OpenDataflowGraphView { public: NodeAddedResult add_node(std::vector const &inputs, - int num_outputs); + nonnegative_int num_outputs); DataflowGraphInput add_input(); template diff --git a/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h b/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h index 7b921772d6..f3d54e4329 100644 --- a/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h +++ b/lib/utils/include/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.h @@ -12,7 +12,7 @@ struct UnorderedSetOpenDataflowGraph : public IOpenDataflowGraph { UnorderedSetOpenDataflowGraph(); NodeAddedResult add_node(std::vector const &inputs, - int num_outputs) override; + nonnegative_int num_outputs) override; std::unordered_set query_nodes(NodeQuery const &) const override; std::unordered_set diff --git a/lib/utils/include/utils/graph/render_dot.h b/lib/utils/include/utils/graph/render_dot.h new file mode 100644 index 0000000000..632ba736ea --- /dev/null +++ b/lib/utils/include/utils/graph/render_dot.h @@ -0,0 +1,19 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RENDER_DOT_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_RENDER_DOT_H + +#include "utils/graph/labelled_open_dataflow_graph/labelled_open_dataflow_graph_view.h" +#include +#include + +namespace FlexFlow { + +std::string escape_dot_string(std::string const &); +std::string render_dot_node_attrs( + std::unordered_map const &attrs); +std::string render_dot( + LabelledDataflowGraphView, + std::string> const &); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/ceildiv.h b/lib/utils/include/utils/nonnegative_int/ceildiv.h new file mode 100644 index 0000000000..939ea3de51 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/ceildiv.h @@ -0,0 +1,11 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_CEILDIV_H + +#include "utils/nonnegative_int/nonnegative_int.h" +namespace FlexFlow { + +nonnegative_int ceildiv(nonnegative_int numerator, nonnegative_int denominator); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h index 0749497c56..0bcc8cfd6f 100644 --- a/lib/utils/include/utils/nonnegative_int/nonnegative_int.h +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_int.h @@ -1,12 +1,11 @@ #ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H #define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_INT_H -#include "rapidcheck.h" - #include #include #include #include +#include #include namespace FlexFlow { @@ -14,6 +13,7 @@ class nonnegative_int { public: nonnegative_int() = delete; explicit nonnegative_int(int value); + 
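+  // Both converting constructors are explicit and are assumed to validate
+  // their argument (rejecting negative ints and size_t values that do not
+  // fit in int); the 0_n literal declared below is the checked shorthand
+  // used throughout this patch.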
explicit nonnegative_int(size_t value); explicit operator int() const noexcept; @@ -39,16 +39,31 @@ class nonnegative_int { friend bool operator>=(int const &lhs, nonnegative_int const &rhs); nonnegative_int operator+(nonnegative_int const &other) const; + nonnegative_int &operator++(); + nonnegative_int operator++(int); + nonnegative_int &operator+=(nonnegative_int const &other); + + nonnegative_int operator*(nonnegative_int const &other) const; + nonnegative_int &operator*=(nonnegative_int const &other); + + nonnegative_int operator/(nonnegative_int const &other) const; + nonnegative_int &operator/=(nonnegative_int const &other); + + nonnegative_int operator%(nonnegative_int const &other) const; + nonnegative_int &operator%=(nonnegative_int const &other); friend std::ostream &operator<<(std::ostream &os, nonnegative_int const &n); friend int format_as(nonnegative_int const &); - int get_value() const; + int unwrap_nonnegative() const; private: int value_; }; + +nonnegative_int operator""_n(unsigned long long int); + } // namespace FlexFlow namespace nlohmann { @@ -59,6 +74,13 @@ struct adl_serializer<::FlexFlow::nonnegative_int> { }; } // namespace nlohmann +namespace rc { +template <> +struct Arbitrary<::FlexFlow::nonnegative_int> { + static Gen<::FlexFlow::nonnegative_int> arbitrary(); +}; +} // namespace rc + namespace std { template <> struct hash<::FlexFlow::nonnegative_int> { diff --git a/lib/utils/include/utils/nonnegative_int/nonnegative_range.h b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h new file mode 100644 index 0000000000..af323aef42 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/nonnegative_range.h @@ -0,0 +1,14 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NONNEGATIVE_RANGE_H + +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +std::vector nonnegative_range(nonnegative_int end); +std::vector + nonnegative_range(nonnegative_int start, nonnegative_int end, int step = 1); + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/nonnegative_int/num_elements.h b/lib/utils/include/utils/nonnegative_int/num_elements.h new file mode 100644 index 0000000000..57bc98ee50 --- /dev/null +++ b/lib/utils/include/utils/nonnegative_int/num_elements.h @@ -0,0 +1,17 @@ +#ifndef _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NUM_ELEMENTS_H +#define _FLEXFLOW_LIB_UTILS_INCLUDE_UTILS_NONNEGATIVE_INT_NUM_ELEMENTS_H + +#include "utils/exception.h" +#include "utils/nonnegative_int/nonnegative_int.h" + +namespace FlexFlow { + +template +nonnegative_int num_elements(T const &t) { + size_t t_size = t.size(); + return nonnegative_int{t_size}; +} + +} // namespace FlexFlow + +#endif diff --git a/lib/utils/include/utils/variant.h b/lib/utils/include/utils/variant.h index 241d631200..75a8851362 100644 --- a/lib/utils/include/utils/variant.h +++ b/lib/utils/include/utils/variant.h @@ -4,6 +4,7 @@ #include "rapidcheck.h" #include "utils/type_traits.h" #include +#include #include namespace FlexFlow { diff --git a/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc b/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc index 350f08600c..67e0b32d6e 100644 --- a/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc +++ b/lib/utils/src/utils/bidict/algorithms/bidict_from_enumerating.cc @@ -1 +1,14 @@ #include "utils/bidict/algorithms/bidict_from_enumerating.h" +#include "utils/archetypes/value_type.h" + 
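+// value_type<0> is an archetype: a minimal placeholder type that forces the
+// templates below to be instantiated in this translation unit, so errors in
+// the header-only definitions surface at build time even without a concrete
+// caller.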
+namespace FlexFlow { + +using T = value_type<0>; + +template bidict + bidict_from_enumerating(std::unordered_set const &); + +template bidict + bidict_from_enumerating(std::set const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/cli/cli_parse.cc b/lib/utils/src/utils/cli/cli_parse.cc index 07982c0c2d..36d5837f9c 100644 --- a/lib/utils/src/utils/cli/cli_parse.cc +++ b/lib/utils/src/utils/cli/cli_parse.cc @@ -32,7 +32,7 @@ tl::expected {}, }; - int consumed_positional_args = 0; + nonnegative_int consumed_positional_args = 0_n; auto parse_positional_arg = [&](std::string const &arg) -> std::optional { if (consumed_positional_args >= cli.positional_arguments.size()) { @@ -40,8 +40,8 @@ tl::expected cli.positional_arguments.size()); } - CLIPositionalArgumentSpec arg_spec = - cli.positional_arguments.at(consumed_positional_args); + CLIPositionalArgumentSpec arg_spec = cli.positional_arguments.at( + consumed_positional_args.unwrap_nonnegative()); if (arg_spec.choices.has_value() && !contains(arg_spec.choices.value(), arg)) { diff --git a/lib/utils/src/utils/cli/cli_spec.cc b/lib/utils/src/utils/cli/cli_spec.cc index ca51cfe57f..e314f6fd55 100644 --- a/lib/utils/src/utils/cli/cli_spec.cc +++ b/lib/utils/src/utils/cli/cli_spec.cc @@ -2,6 +2,8 @@ #include "utils/containers/count.h" #include "utils/containers/transform.h" #include "utils/integer_conversions.h" +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -10,8 +12,8 @@ CLISpec empty_cli_spec() { } std::vector cli_get_flag_keys(CLISpec const &cli) { - return transform(count(cli.flags.size()), - [](int idx) { return CLIFlagKey{idx}; }); + return transform(nonnegative_range(num_elements(cli.flags)), + [](nonnegative_int idx) { return CLIFlagKey{idx}; }); } CLIArgumentKey cli_add_help_flag(CLISpec &cli) { @@ -21,17 +23,18 @@ CLIArgumentKey cli_add_help_flag(CLISpec &cli) { } CLIArgumentKey cli_add_flag(CLISpec &cli, CLIFlagSpec const &flag_spec) { + CLIArgumentKey key = CLIArgumentKey{CLIFlagKey{num_elements(cli.flags)}}; cli.flags.push_back(flag_spec); - - return CLIArgumentKey{CLIFlagKey{int_from_size_t(cli.flags.size()) - 1}}; + return key; } CLIArgumentKey cli_add_positional_argument(CLISpec &cli, CLIPositionalArgumentSpec const &arg) { + CLIArgumentKey key = CLIArgumentKey{ + CLIPositionalArgumentKey{num_elements(cli.positional_arguments)}}; cli.positional_arguments.push_back(arg); - return CLIArgumentKey{CLIPositionalArgumentKey{ - int_from_size_t(cli.positional_arguments.size()) - 1}}; + return key; } } // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/at_idx.cc b/lib/utils/src/utils/containers/at_idx.cc index 45b1a31fce..14a0695c6d 100644 --- a/lib/utils/src/utils/containers/at_idx.cc +++ b/lib/utils/src/utils/containers/at_idx.cc @@ -1 +1,10 @@ #include "utils/containers/at_idx.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using E = value_type<0>; + +template std::optional at_idx(std::vector const &, nonnegative_int); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/enumerate.cc b/lib/utils/src/utils/containers/enumerate.cc index 0984b6dc63..ca5ad6ddc1 100644 --- a/lib/utils/src/utils/containers/enumerate.cc +++ b/lib/utils/src/utils/containers/enumerate.cc @@ -1 +1,12 @@ #include "utils/containers/enumerate.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate(std::vector const &); + +template std::map 
enumerate(std::unordered_set const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/enumerate_vector.cc b/lib/utils/src/utils/containers/enumerate_vector.cc index d4fd131af2..0d0bd1c277 100644 --- a/lib/utils/src/utils/containers/enumerate_vector.cc +++ b/lib/utils/src/utils/containers/enumerate_vector.cc @@ -1 +1,10 @@ #include "utils/containers/enumerate_vector.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::map enumerate_vector(std::vector const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/make.cc b/lib/utils/src/utils/containers/make.cc new file mode 100644 index 0000000000..29b5bc5184 --- /dev/null +++ b/lib/utils/src/utils/containers/make.cc @@ -0,0 +1,8 @@ +#include "utils/containers/make.h" +#include + +namespace FlexFlow { + +template decltype(auto) make>(); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/range.cc b/lib/utils/src/utils/containers/range.cc index d3ebd1063b..f3baab3db1 100644 --- a/lib/utils/src/utils/containers/range.cc +++ b/lib/utils/src/utils/containers/range.cc @@ -1,5 +1,6 @@ #include "utils/containers/range.h" #include +#include namespace FlexFlow { diff --git a/lib/utils/src/utils/containers/repeat.cc b/lib/utils/src/utils/containers/repeat.cc index 76e46f0fdc..777996d995 100644 --- a/lib/utils/src/utils/containers/repeat.cc +++ b/lib/utils/src/utils/containers/repeat.cc @@ -1 +1,11 @@ #include "utils/containers/repeat.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using Out = value_type<0>; +using F = std::function; + +template std::vector repeat(nonnegative_int, F const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/repeat_element.cc b/lib/utils/src/utils/containers/repeat_element.cc new file mode 100644 index 0000000000..70889eb971 --- /dev/null +++ b/lib/utils/src/utils/containers/repeat_element.cc @@ -0,0 +1,10 @@ +#include "utils/containers/repeat_element.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using T = value_type<0>; + +template std::vector repeat_element(nonnegative_int, T const &); + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/containers/replicate.cc b/lib/utils/src/utils/containers/replicate.cc deleted file mode 100644 index 2fb2f079f6..0000000000 --- a/lib/utils/src/utils/containers/replicate.cc +++ /dev/null @@ -1 +0,0 @@ -#include "utils/containers/replicate.h" diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc index f0e52d6fc2..7069146057 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms.cc @@ -27,7 +27,7 @@ std::vector get_outputs(DataflowGraphView const &g, Node const &n) { return sorted_by(g.query_outputs(DataflowOutputQuery{ query_set{n}, - query_set::matchall(), + query_set::matchall(), }), [](DataflowOutput const &l, DataflowOutput const &r) { return l.idx < r.idx; diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc index 47c30ce998..2ae903fa0b 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/as_dot.cc @@ -1,27 +1,36 @@ #include "utils/graph/dataflow_graph/algorithms/as_dot.h" +#include "utils/containers/generate_map.h" +#include "utils/containers/map_keys.h" #include 
"utils/dot_file.h" #include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h" +#include "utils/graph/labelled_open_dataflow_graph/algorithms/with_labelling.h" #include "utils/graph/node/algorithms.h" +#include "utils/graph/render_dot.h" #include "utils/record_formatter.h" namespace FlexFlow { -// WARN(@lockshaw): doing this all with string ids is ugly and error prone, -// as it requires duplicating the stringification logic across functions. -// -// Fixing this is tracked in issue std::string as_dot(DataflowGraphView const &g) { - std::ostringstream oss; - DotFile dot = DotFile{oss}; + auto get_node_attrs = [](Node const &) { + return std::unordered_map{}; + }; + + std::unordered_map> + node_labels = generate_map(get_nodes(g), get_node_attrs); - std::function get_node_label = - [](Node const &n) -> std::string { - return fmt::format("n{}", n.raw_uid); + auto get_output_label = [](DataflowOutput const &o) { + return fmt::to_string(o.idx); }; - as_dot(dot, g, get_node_label); - dot.close(); - return oss.str(); + std::unordered_map output_labels = + generate_map(get_all_dataflow_outputs(g), get_output_label); + std::unordered_map value_labels = + map_keys(output_labels, + [](DataflowOutput const &o) { return OpenDataflowValue{o}; }); + + return render_dot(with_labelling( + view_as_open_dataflow_graph(g), node_labels, value_labels)); } void as_dot(DotFile &dot, @@ -29,9 +38,13 @@ void as_dot(DotFile &dot, std::function const &get_node_label) { auto get_node_name = [](Node n) { return fmt::format("n{}", n.raw_uid); }; - auto get_input_field = [](int idx) { return fmt::format("i{}", idx); }; + auto get_input_field = [](nonnegative_int idx) { + return fmt::format("i{}", idx); + }; - auto get_output_field = [](int idx) { return fmt::format("o{}", idx); }; + auto get_output_field = [](nonnegative_int idx) { + return fmt::format("o{}", idx); + }; for (Node const &n : get_nodes(g)) { std::vector n_inputs = get_dataflow_inputs(g, n); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc index c07d344d05..73afc11acc 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc @@ -6,9 +6,9 @@ std::unordered_set get_dataflow_edges_from_node_to_node( DataflowGraphView const &g, Node const &src, Node const &dst) { return g.query_edges(DataflowEdgeQuery{ /*src_nodes=*/query_set{src}, - /*src_idxs=*/query_set::matchall(), + /*src_idxs=*/query_set::matchall(), /*dst_nodes=*/query_set{dst}, - /*dst_idxs=*/query_set::matchall(), + /*dst_idxs=*/query_set::matchall(), }); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc index 9500836db1..c4947f967a 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -7,9 +7,9 @@ std::vector get_incoming_edges(DataflowGraphView const &g, Node const &n) { return sorted_by(g.query_edges(DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), {n}, - query_set::matchall(), + query_set::matchall(), }), [](DataflowEdge const &l, DataflowEdge const &r) { return l.dst.idx 
< r.dst.idx; @@ -21,9 +21,9 @@ std::unordered_set std::unordered_set const &ns) { DataflowEdgeQuery query = DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc index 2376e4897f..16b2b82b2d 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -7,9 +7,9 @@ std::unordered_set get_outgoing_edges(DataflowGraphView const &g, Node const &n) { return g.query_edges(DataflowEdgeQuery{ {n}, - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }); } @@ -18,9 +18,9 @@ std::unordered_set std::unordered_set const &ns) { DataflowEdgeQuery query = DataflowEdgeQuery{ query_set{ns}, - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index d17a84dd12..a06ec1ab31 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -13,9 +13,9 @@ std::unordered_set DataflowEdgeQuery query = DataflowEdgeQuery{ src_query, - query_set::matchall(), + query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc index c442a26dab..f94dd94e11 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc @@ -13,9 +13,9 @@ std::unordered_set DataflowEdgeQuery query = DataflowEdgeQuery{ query_set{ns}, - query_set::matchall(), + query_set::matchall(), dst_query, - query_set::matchall(), + query_set::matchall(), }; return g.query_edges(query); diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc index 0fd0b85b71..703db4bf91 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.cc @@ -3,16 +3,18 @@ namespace FlexFlow { -ViewDataflowGraphAsOpen::ViewDataflowGraphAsOpen(DataflowGraphView const &g) +ViewDataflowGraphAsOpenDataflowGraph::ViewDataflowGraphAsOpenDataflowGraph( + DataflowGraphView const &g) : g(g) {} -std::unordered_set - ViewDataflowGraphAsOpen::query_nodes(NodeQuery const &q) const { +std::unordered_set ViewDataflowGraphAsOpenDataflowGraph::query_nodes( + NodeQuery const &q) const { return this->g.query_nodes(q); } std::unordered_set - ViewDataflowGraphAsOpen::query_edges(OpenDataflowEdgeQuery const &q) const { + ViewDataflowGraphAsOpenDataflowGraph::query_edges( + OpenDataflowEdgeQuery const &q) const { std::unordered_set 
closed_edges = this->g.query_edges(q.standard_edge_query); @@ -21,21 +23,23 @@ std::unordered_set } std::unordered_set - ViewDataflowGraphAsOpen::query_outputs(DataflowOutputQuery const &q) const { + ViewDataflowGraphAsOpenDataflowGraph::query_outputs( + DataflowOutputQuery const &q) const { return this->g.query_outputs(q); } std::unordered_set - ViewDataflowGraphAsOpen::get_inputs() const { + ViewDataflowGraphAsOpenDataflowGraph::get_inputs() const { return {}; } -ViewDataflowGraphAsOpen *ViewDataflowGraphAsOpen::clone() const { - return new ViewDataflowGraphAsOpen{this->g}; +ViewDataflowGraphAsOpenDataflowGraph * + ViewDataflowGraphAsOpenDataflowGraph::clone() const { + return new ViewDataflowGraphAsOpenDataflowGraph{this->g}; } OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &g) { - return OpenDataflowGraphView::create(g); + return OpenDataflowGraphView::create(g); } } // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h b/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h deleted file mode 100644 index bec9d0e019..0000000000 --- a/lib/utils/src/utils/graph/dataflow_graph/algorithms/view_as_open_dataflow_graph.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _FLEXFLOW_LIB_UTILS_SRC_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H -#define _FLEXFLOW_LIB_UTILS_SRC_UTILS_GRAPH_DATAFLOW_GRAPH_ALGORITHMS_VIEW_AS_OPEN_DATAFLOW_GRAPH_H - -#include "utils/graph/open_dataflow_graph/open_dataflow_graph_view.h" - -namespace FlexFlow { - -struct ViewDataflowGraphAsOpen final : public IOpenDataflowGraphView { -public: - ViewDataflowGraphAsOpen() = delete; - ViewDataflowGraphAsOpen(DataflowGraphView const &); - - std::unordered_set query_nodes(NodeQuery const &) const override; - std::unordered_set - query_edges(OpenDataflowEdgeQuery const &) const override; - std::unordered_set - query_outputs(DataflowOutputQuery const &) const override; - std::unordered_set get_inputs() const override; - - ViewDataflowGraphAsOpen *clone() const override; - - ~ViewDataflowGraphAsOpen() = default; - -private: - DataflowGraphView g; -}; - -OpenDataflowGraphView view_as_open_dataflow_graph(DataflowGraphView const &); - -} // namespace FlexFlow - -#endif diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc index 2196f7a028..982969f3a5 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_edge_query.cc @@ -5,18 +5,18 @@ namespace FlexFlow { DataflowEdgeQuery dataflow_edge_query_all() { return DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowEdgeQuery dataflow_edge_query_none() { return DataflowEdgeQuery{ query_set::match_none(), - query_set::match_none(), + query_set::match_none(), query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -30,9 +30,9 @@ bool dataflow_edge_query_includes_dataflow_edge(DataflowEdgeQuery const &q, DataflowEdgeQuery dataflow_edge_query_for_edge(DataflowEdge const &e) { return DataflowEdgeQuery{ query_set{e.src.node}, - query_set{e.src.idx}, + query_set{e.src.idx}, query_set{e.dst.node}, - query_set{e.dst.idx}, + query_set{e.dst.idx}, }; } @@ -40,9 +40,9 @@ DataflowEdgeQuery dataflow_edge_query_all_outgoing_from(DataflowOutput const &src) { 
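+  // Pin src to exactly this output's node and idx, and leave both dst fields
+  // as matchall, so the query selects every edge leaving src.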
return DataflowEdgeQuery{ query_set{src.node}, - query_set{src.idx}, + query_set{src.idx}, query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } @@ -50,9 +50,9 @@ DataflowEdgeQuery dataflow_edge_query_all_incoming_to(DataflowInput const &dst) { return DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), query_set{dst.node}, - query_set{dst.idx}, + query_set{dst.idx}, }; } diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc index 868dd61c6d..8ed36135e1 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_graph.cc @@ -4,7 +4,7 @@ namespace FlexFlow { NodeAddedResult DataflowGraph::add_node(std::vector const &inputs, - int num_outputs) { + nonnegative_int num_outputs) { return this->get_interface().add_node(inputs, num_outputs); } diff --git a/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc b/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc index 64df4c77f2..ceaad2bfdf 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/dataflow_output_query.cc @@ -5,14 +5,14 @@ namespace FlexFlow { DataflowOutputQuery dataflow_output_query_all() { return DataflowOutputQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowOutputQuery dataflow_output_query_none() { return DataflowOutputQuery{ query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -24,7 +24,7 @@ bool dataflow_output_query_includes_dataflow_output( DataflowOutputQuery dataflow_output_query_for_output(DataflowOutput const &o) { return DataflowOutputQuery{ query_set{o.node}, - query_set{o.idx}, + query_set{o.idx}, }; } diff --git a/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc b/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc index 300b5de546..ef9412b939 100644 --- a/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc +++ b/lib/utils/src/utils/graph/dataflow_graph/i_dataflow_graph_view.cc @@ -7,9 +7,9 @@ std::unordered_set IDataflowGraphView::query_edges(DirectedEdgeQuery const &q) const { DataflowEdgeQuery dataflow_query = DataflowEdgeQuery{ q.srcs, - matchall(), + matchall(), q.dsts, - matchall(), + matchall(), }; std::unordered_set dataflow_edges = this->query_edges(dataflow_query); diff --git a/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc b/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc index 3efea1c138..2de3056068 100644 --- a/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc +++ b/lib/utils/src/utils/graph/digraph/algorithms/transitive_closure.cc @@ -6,6 +6,7 @@ #include "utils/graph/digraph/algorithms/materialize_digraph_view.h" #include "utils/graph/instances/adjacency_digraph.h" #include "utils/graph/node/algorithms.h" +#include "utils/nonnegative_int/num_elements.h" namespace FlexFlow { @@ -15,7 +16,9 @@ DiGraphView transitive_closure(DiGraphView const &g) { // incredibly slow (> minutes) for even moderately sized graphs // (i.e., 200 nodes) without optimization enabled. 
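+  // Nodes are first enumerated into dense indices [0, num_nodes) so that
+  // reachability can be stored and updated in a flat num_nodes * num_nodes
+  // boolean matrix.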
- bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = + map_keys(bidict_from_enumerating(get_nodes(g)), + [](nonnegative_int x) { return x.unwrap_nonnegative(); }); std::unordered_set edges = get_edges(g); int num_nodes = nodes.size(); diff --git a/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc b/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc index 97a2439263..69b24b716c 100644 --- a/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc +++ b/lib/utils/src/utils/graph/digraph/algorithms/transitive_reduction.cc @@ -37,7 +37,9 @@ DiGraphView transitive_reduction(DiGraphView const &g) { // transitive_closure inlined to avoid any drifts in node numbering // between transitive_closure and transitive_reduction - bidict nodes = bidict_from_enumerating(get_nodes(g)); + bidict nodes = + map_keys(bidict_from_enumerating(get_nodes(g)), + [](nonnegative_int x) { return x.unwrap_nonnegative(); }); int num_nodes = nodes.size(); std::vector edge_matrix(num_nodes * num_nodes, false); diff --git a/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc b/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc index 1ffc5f423f..a5a1fb82bf 100644 --- a/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/instances/unordered_set_dataflow_graph.cc @@ -1,6 +1,5 @@ #include "utils/graph/instances/unordered_set_dataflow_graph.h" #include "utils/containers/are_disjoint.h" -#include "utils/containers/count.h" #include "utils/containers/enumerate_vector.h" #include "utils/containers/extend.h" #include "utils/containers/transform.h" @@ -9,6 +8,7 @@ #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge.h" #include "utils/graph/open_dataflow_graph/open_dataflow_edge_query.h" +#include "utils/nonnegative_int/nonnegative_range.h" namespace FlexFlow { @@ -25,18 +25,18 @@ UnorderedSetDataflowGraph::UnorderedSetDataflowGraph( } NodeAddedResult UnorderedSetDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { std::vector open_inputs = transform( inputs, [](DataflowOutput const &o) { return OpenDataflowValue{o}; }); return this->add_node(open_inputs, num_outputs); } NodeAddedResult UnorderedSetDataflowGraph::add_node( - std::vector const &inputs, int num_outputs) { + std::vector const &inputs, nonnegative_int num_outputs) { Node new_node = this->node_source.new_node(); - std::vector new_outputs = - transform(count(num_outputs), [&](int output_idx) { + std::vector new_outputs = transform( + nonnegative_range(num_outputs), [&](nonnegative_int output_idx) { return DataflowOutput{new_node, output_idx}; }); diff --git a/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc b/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc index a404423284..fd4a8782a4 100644 --- a/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc +++ b/lib/utils/src/utils/graph/multidigraph/algorithms/add_nodes.cc @@ -3,7 +3,7 @@ namespace FlexFlow { -std::vector add_nodes(MultiDiGraph &g, int num_nodes) { +std::vector add_nodes(MultiDiGraph &g, nonnegative_int num_nodes) { return repeat(num_nodes, [&]() { return g.add_node(); }); } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc new file mode 100644 index 0000000000..f7f8a9fd34 --- /dev/null 
+++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/are_isomorphic.cc @@ -0,0 +1,11 @@ +#include "utils/graph/open_dataflow_graph/algorithms/are_isomorphic.h" +#include "utils/graph/open_dataflow_graph/algorithms/find_isomorphism.h" + +namespace FlexFlow { + +bool are_isomorphic(OpenDataflowGraphView const &src, + OpenDataflowGraphView const &dst) { + return find_isomorphism(src, dst).has_value(); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc index 9077ea5f9a..261de287a9 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/as_dot.cc @@ -2,13 +2,16 @@ #include "utils/dot_file.h" #include "utils/graph/dataflow_graph/algorithms.h" #include "utils/graph/dataflow_graph/algorithms/as_dot.h" +#include "utils/graph/labelled_dataflow_graph/labelled_dataflow_graph.h" #include "utils/graph/node/algorithms.h" #include "utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.h" +#include "utils/graph/open_dataflow_graph/algorithms/get_inputs.h" #include "utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.h" namespace FlexFlow { std::string as_dot(OpenDataflowGraphView const &g) { + std::function get_node_label = [](Node const &n) { return fmt::format("n{}", n.raw_uid); }; @@ -36,9 +39,13 @@ std::string auto get_node_name = [](Node n) { return fmt::format("n{}", n.raw_uid); }; - auto get_input_field = [](int idx) { return fmt::format("i{}", idx); }; + auto get_input_field = [](nonnegative_int idx) { + return fmt::format("i{}", idx); + }; - auto get_output_field = [](int idx) { return fmt::format("o{}", idx); }; + auto get_output_field = [](nonnegative_int idx) { + return fmt::format("o{}", idx); + }; auto get_graph_input_name = [](DataflowGraphInput i) { return fmt::format("gi{}", i.idx); diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc index cad00c71e1..728dc75678 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_incoming_edges.cc @@ -27,13 +27,13 @@ std::vector get_incoming_edges(OpenDataflowGraphView const &g, DataflowInputEdgeQuery{ query_set::matchall(), {n}, - query_set::matchall(), + query_set::matchall(), }, DataflowEdgeQuery{ query_set::matchall(), - query_set::matchall(), + query_set::matchall(), {n}, - query_set::matchall(), + query_set::matchall(), }, }), [](OpenDataflowEdge const &l, OpenDataflowEdge const &r) { diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index 95a8e095fc..6448da9c73 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -13,13 +13,13 @@ std::unordered_set DataflowInputEdgeQuery{ query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }, DataflowEdgeQuery{ query_set{nodes_not_in_ns}, - query_set::matchall(), + query_set::matchall(), query_set{ns}, - query_set::matchall(), + query_set::matchall(), }, }; diff --git 
a/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc new file mode 100644 index 0000000000..c55c4fe360 --- /dev/null +++ b/lib/utils/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.cc @@ -0,0 +1,54 @@ +#include "utils/graph/open_dataflow_graph/algorithms/open_dataflow_graph_isomorphism.h" +#include "utils/overload.h" + +namespace FlexFlow { + +OpenDataflowValue isomorphism_map_r_open_dataflow_value_from_l( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &l_value) { + return l_value.visit(overload{ + [&](DataflowGraphInput const &l_input) { + return OpenDataflowValue{ + iso.input_mapping.at_l(l_input), + }; + }, + [&](DataflowOutput const &l_output) { + return OpenDataflowValue{ + isomorphism_map_r_dataflow_output_from_l(iso, l_output), + }; + }, + }); +} + +OpenDataflowValue isomorphism_map_l_open_dataflow_value_from_r( + OpenDataflowGraphIsomorphism const &iso, OpenDataflowValue const &r_value) { + return r_value.visit(overload{ + [&](DataflowGraphInput const &r_input) { + return OpenDataflowValue{ + iso.input_mapping.at_r(r_input), + }; + }, + [&](DataflowOutput const &r_output) { + return OpenDataflowValue{ + isomorphism_map_l_dataflow_output_from_r(iso, r_output), + }; + }, + }); +} + +DataflowOutput isomorphism_map_r_dataflow_output_from_l( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &l_output) { + return DataflowOutput{ + iso.node_mapping.at_l(l_output.node), + l_output.idx, + }; +} + +DataflowOutput isomorphism_map_l_dataflow_output_from_r( + OpenDataflowGraphIsomorphism const &iso, DataflowOutput const &r_output) { + return DataflowOutput{ + iso.node_mapping.at_r(r_output.node), + r_output.idx, + }; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc b/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc index 8736f2d157..34adea6b09 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/dataflow_input_edge_query.cc @@ -6,14 +6,14 @@ DataflowInputEdgeQuery dataflow_input_edge_query_all() { return DataflowInputEdgeQuery{ query_set::matchall(), query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } DataflowInputEdgeQuery dataflow_input_edge_query_none() { return DataflowInputEdgeQuery{ query_set::match_none(), query_set::match_none(), - query_set::match_none(), + query_set::match_none(), }; } @@ -28,7 +28,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set{e.src}, query_set{e.dst.node}, - query_set{e.dst.idx}, + query_set{e.dst.idx}, }; } @@ -37,7 +37,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set{src}, query_set::matchall(), - query_set::matchall(), + query_set::matchall(), }; } @@ -46,7 +46,7 @@ DataflowInputEdgeQuery return DataflowInputEdgeQuery{ query_set::matchall(), query_set{dst.node}, - query_set{dst.idx}, + query_set{dst.idx}, }; } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc index d5e5b614af..d51562a6c6 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_edge.cc @@ -7,7 +7,7 @@ Node get_open_dataflow_edge_dst_node(OpenDataflowEdge const &e) { 
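// The two accessors in this hunk delegate to get_open_dataflow_edge_dst,
// which the patch does not show. A sketch of its plausible shape, assuming
// (as the isomorphism mapping above suggests) that OpenDataflowEdge is a
// two-alternative variant dispatched with FlexFlow's overload helper; the
// alternative type names here are assumptions, not confirmed by the patch.
DataflowInput get_open_dataflow_edge_dst(OpenDataflowEdge const &e) {
  return e.visit(overload{
      // edge entering from a graph input
      [](DataflowInputEdge const &input_edge) { return input_edge.dst; },
      // edge between two internal nodes
      [](DataflowEdge const &internal_edge) { return internal_edge.dst; },
  });
}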
return get_open_dataflow_edge_dst(e).node; } -int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &e) { +nonnegative_int get_open_dataflow_edge_dst_idx(OpenDataflowEdge const &e) { return get_open_dataflow_edge_dst(e).idx; } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc index 63222dd360..949f837665 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/open_dataflow_graph.cc @@ -4,7 +4,7 @@ namespace FlexFlow { NodeAddedResult OpenDataflowGraph::add_node(std::vector<OpenDataflowValue> const &inputs, - int num_outputs) { + nonnegative_int num_outputs) { return this->get_interface().add_node(inputs, num_outputs); } diff --git a/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc b/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc index 0fdb2f408b..171b321c66 100644 --- a/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc +++ b/lib/utils/src/utils/graph/open_dataflow_graph/unordered_set_open_dataflow_graph.cc @@ -18,7 +18,7 @@ UnorderedSetOpenDataflowGraph::UnorderedSetOpenDataflowGraph( outputs(outputs), graph_inputs(graph_inputs) {} NodeAddedResult UnorderedSetOpenDataflowGraph::add_node( - std::vector<DataflowOutput> const &inputs, int num_outputs) { + std::vector<DataflowOutput> const &inputs, nonnegative_int num_outputs) { NOT_IMPLEMENTED(); } diff --git a/lib/utils/src/utils/graph/render_dot.cc b/lib/utils/src/utils/graph/render_dot.cc new file mode 100644 index 0000000000..8bdc001c80 --- /dev/null +++ b/lib/utils/src/utils/graph/render_dot.cc @@ -0,0 +1,90 @@ +#include "utils/graph/render_dot.h" +#include "utils/containers/flatmap.h" +#include "utils/containers/try_at.h" +#include "utils/graph/dataflow_graph/algorithms.h" +#include "utils/graph/node/algorithms.h" +#include "utils/record_formatter.h" + +namespace FlexFlow { + +std::string escape_dot_string(std::string const &s) { + auto escape_dot_char = [](char c) -> std::string { + switch (c) { + case '\\': + case '"': + return std::string{'\\'} + c; + default: + return std::string{c}; + } + }; + + return flatmap(s, escape_dot_char); +} + +std::string render_dot_node_attrs( + std::unordered_map<std::string, std::string> const &node_attrs) { + std::ostringstream oss; + for (auto const &[k, v] : node_attrs) { + oss << fmt::format( + "\"{}\"=\"{}\",", escape_dot_string(k), escape_dot_string(v)); + } + return oss.str(); +} + +std::string render_node_label( + LabelledDataflowGraphView<std::unordered_map<std::string, std::string>, + std::string> const &g, + Node const &n) { + std::vector<DataflowInput> n_inputs = get_dataflow_inputs(g, n); + std::vector<DataflowOutput> n_outputs = get_outputs(g, n); + + RecordFormatter inputs_record; + for (DataflowInput const &i : n_inputs) { + inputs_record << fmt::format("<i{}> {}", i.idx, i.idx); + } + + RecordFormatter outputs_record; + for (DataflowOutput const &o : n_outputs) { + outputs_record << fmt::format("<o{}> {}", o.idx, g.at(o)); + } + + RecordFormatter rec; + rec << inputs_record + << try_at(g.at(n), std::string{"label"}) + .value_or(fmt::to_string(n.raw_uid)) + << outputs_record; + + std::ostringstream oss; + oss << rec; + return oss.str(); +} + +std::string render_dot( + LabelledDataflowGraphView<std::unordered_map<std::string, std::string>, + std::string> const &g) { + std::vector<std::string> lines; + lines.push_back("digraph {"); + + for (Node const &n : get_nodes(g)) { + std::unordered_map<std::string, std::string> node_attrs = g.at(n); + node_attrs.at("label") = render_node_label(g, n); + node_attrs["shape"] = "record"; + + lines.push_back(fmt::format( + " n{} [{}];", n.raw_uid, render_dot_node_attrs(node_attrs))); + } + + for (DataflowEdge const &e : get_edges(g)) { + lines.push_back(fmt::format(" n{}:o{} -> n{}:i{};", + e.src.node.raw_uid, + e.src.idx, + e.dst.node.raw_uid, + e.dst.idx)); + } + + lines.push_back("}"); + + return join_strings(lines, "\n"); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/src/utils/nonnegative_int/ceildiv.cc new file mode 100644 index 0000000000..f1115b25b5 --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/ceildiv.cc @@ -0,0 +1,20 @@ +#include "utils/nonnegative_int/ceildiv.h" +#include "utils/exception.h" + +namespace FlexFlow { + +nonnegative_int ceildiv(nonnegative_int numerator, + nonnegative_int denominator) { + if (denominator == 0) { + throw mk_runtime_error(fmt::format( + "ceildiv expected denominator != 0, but received {}", denominator)); + } + + int n = numerator.unwrap_nonnegative(); + int d = denominator.unwrap_nonnegative(); + + int result = (n + d - 1) / d; + return nonnegative_int{result}; +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc index 9088cc4bf9..e86c242250 100644 --- a/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_int.cc @@ -1,4 +1,5 @@ #include "utils/nonnegative_int/nonnegative_int.h" +#include "utils/exception.h" namespace FlexFlow { @@ -10,6 +11,15 @@ nonnegative_int::nonnegative_int(int value) { this->value_ = value; } +nonnegative_int::nonnegative_int(size_t value) { + if (value > std::numeric_limits<int>::max()) { + throw std::invalid_argument(fmt::format( + "Input {} to nonnegative_int(size_t) is out-of-bounds for int", value)); + } + this->value_ = static_cast<int>(value); + assert(this->value_ >= 0); +} + nonnegative_int::operator int() const noexcept { return this->value_; } @@ -75,18 +85,72 @@ nonnegative_int nonnegative_int::operator+(nonnegative_int const &other) const { return nonnegative_int{this->value_ + other.value_}; } +nonnegative_int &nonnegative_int::operator++() { + this->value_++; + return *this; +} + +nonnegative_int nonnegative_int::operator++(int) { + nonnegative_int result = *this; + this->value_++; + return result; +} + +nonnegative_int &nonnegative_int::operator+=(nonnegative_int const &other) { + this->value_ += other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator*(nonnegative_int const &other) const { + return nonnegative_int{this->value_ * other.value_}; +} + +nonnegative_int &nonnegative_int::operator*=(nonnegative_int const &other) { + this->value_ *= other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator/(nonnegative_int const &other) const { + return nonnegative_int{this->value_ / other.value_}; +} + +nonnegative_int &nonnegative_int::operator/=(nonnegative_int const &other) { + this->value_ /= other.value_; + return *this; +} + +nonnegative_int nonnegative_int::operator%(nonnegative_int const &other) const { + return nonnegative_int{this->value_ % other.value_}; +} + +nonnegative_int &nonnegative_int::operator%=(nonnegative_int const &other) { + this->value_ %= other.value_; + return *this; +} + std::ostream &operator<<(std::ostream &os, nonnegative_int const &n) { os << n.value_; return os; } -int nonnegative_int::get_value() const { +int nonnegative_int::unwrap_nonnegative() const { return this->value_; } int format_as(nonnegative_int const &x) { - return x.get_value(); + return x.unwrap_nonnegative(); } + +nonnegative_int operator""_n(unsigned long long int x) { + if (x > + static_cast<unsigned long long int>(std::numeric_limits<int>::max())) { + throw mk_runtime_error( + fmt::format("Value too large to wrap as nonnegative_int: {}", x)); + } + + return nonnegative_int{static_cast<int>(x)}; +} + } // namespace FlexFlow namespace nlohmann { @@ -97,13 +161,20 @@ ::FlexFlow::nonnegative_int void adl_serializer<::FlexFlow::nonnegative_int>::to_json( json &j, ::FlexFlow::nonnegative_int t) { - j = t.get_value(); + j = t.unwrap_nonnegative(); } } // namespace nlohmann +namespace rc { +Gen<::FlexFlow::nonnegative_int> + Arbitrary<::FlexFlow::nonnegative_int>::arbitrary() { + return gen::construct<::FlexFlow::nonnegative_int>(gen::nonNegative<int>()); +} +} // namespace rc + namespace std { std::size_t hash<::FlexFlow::nonnegative_int>::operator()( FlexFlow::nonnegative_int const &n) const noexcept { - return std::hash<int>{}(n.get_value()); + return std::hash<int>{}(n.unwrap_nonnegative()); } } // namespace std diff --git a/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc new file mode 100644 index 0000000000..f31db6d589 --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/nonnegative_range.cc @@ -0,0 +1,19 @@ +#include "utils/nonnegative_int/nonnegative_range.h" +#include "utils/containers/range.h" +#include "utils/containers/transform.h" + +namespace FlexFlow { + +std::vector<nonnegative_int> nonnegative_range(nonnegative_int end) { + return transform(range(end.unwrap_nonnegative()), + [](int x) { return nonnegative_int{x}; }); +} + +std::vector<nonnegative_int> + nonnegative_range(nonnegative_int start, nonnegative_int end, int step) { + return transform( + range(start.unwrap_nonnegative(), end.unwrap_nonnegative(), step), + [](int x) { return nonnegative_int{x}; }); +} + +} // namespace FlexFlow diff --git a/lib/utils/src/utils/nonnegative_int/num_elements.cc b/lib/utils/src/utils/nonnegative_int/num_elements.cc new file mode 100644 index 0000000000..21292bf2ab --- /dev/null +++ b/lib/utils/src/utils/nonnegative_int/num_elements.cc @@ -0,0 +1,10 @@ +#include "utils/nonnegative_int/num_elements.h" +#include "utils/archetypes/value_type.h" + +namespace FlexFlow { + +using E = value_type<0>; + +template nonnegative_int num_elements(std::vector<E> const &); + +} // namespace FlexFlow diff --git a/lib/utils/test/src/main.cc b/lib/utils/test/src/main.cc deleted file mode 100644 index 9522fa7fdb..0000000000 --- a/lib/utils/test/src/main.cc +++ /dev/null @@ -1,2 +0,0 @@ -#define DOCTEST_CONFIG_IMPLEMENT_WITH_MAIN -#include "doctest/doctest.h" diff --git a/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc b/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc index b5a373e5c9..a669869fb8 100644 --- a/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc +++ b/lib/utils/test/src/utils/bidict/algorithms/bidict_from_enumerating.cc @@ -10,10 +10,12 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("bidict_from_enumerating(std::unordered_set)") { std::unordered_set<std::string> input = {"zero", "one", "two"}; - bidict<int, std::string> result = bidict_from_enumerating(input); + bidict<nonnegative_int, std::string> result = + bidict_from_enumerating(input); - std::unordered_set<int> result_left_entries = left_entries(result); - std::unordered_set<int> correct_left_entries = {0, 1, 2}; + std::unordered_set<nonnegative_int> result_left_entries = + left_entries(result); + std::unordered_set<nonnegative_int> correct_left_entries = {0_n, 1_n, 2_n}; CHECK(result_left_entries == correct_left_entries); std::unordered_set<std::string>
result_right_entries = @@ -25,13 +27,14 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("bidict_from_enumerating(std::set)") { std::set input = {"a", "c", "b"}; - bidict correct = { - {0, "a"}, - {1, "b"}, - {2, "c"}, + bidict correct = { + {0_n, "a"}, + {1_n, "b"}, + {2_n, "c"}, }; - bidict result = bidict_from_enumerating(input); + bidict result = + bidict_from_enumerating(input); CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/cli/cli_parse.cc b/lib/utils/test/src/utils/cli/cli_parse.cc index 40dea86ae0..72a09efbde 100644 --- a/lib/utils/test/src/utils/cli/cli_parse.cc +++ b/lib/utils/test/src/utils/cli/cli_parse.cc @@ -24,8 +24,8 @@ TEST_SUITE(FF_TEST_SUITE) { {}, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; SUBCASE("correctly parses short flag") { std::string input = "-2"; @@ -94,8 +94,8 @@ TEST_SUITE(FF_TEST_SUITE) { }, {}, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; SUBCASE("parses flags in any order") { std::vector inputs = {"prog_name", "-2", "--flag1"}; @@ -180,8 +180,8 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; SUBCASE("can parse multiple positional arguments") { std::vector inputs = {"prog_name", "hello", "world"}; @@ -266,7 +266,7 @@ TEST_SUITE(FF_TEST_SUITE) { }, }; - CLIPositionalArgumentKey key_posarg = CLIPositionalArgumentKey{0}; + CLIPositionalArgumentKey key_posarg = CLIPositionalArgumentKey{0_n}; SUBCASE( "succeeds if a positional argument is set to a valid choice") { @@ -351,11 +351,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; - CLIFlagKey key_flag3 = CLIFlagKey{2}; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; + CLIFlagKey key_flag3 = CLIFlagKey{2_n}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; SUBCASE("works if flags are before positional arguments") { std::vector inputs = { @@ -449,11 +449,11 @@ TEST_SUITE(FF_TEST_SUITE) { }, }, }; - CLIFlagKey key_flag1 = CLIFlagKey{0}; - CLIFlagKey key_flag2 = CLIFlagKey{1}; - CLIFlagKey key_flag3 = CLIFlagKey{2}; - CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0}; - CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1}; + CLIFlagKey key_flag1 = CLIFlagKey{0_n}; + CLIFlagKey key_flag2 = CLIFlagKey{1_n}; + CLIFlagKey key_flag3 = CLIFlagKey{2_n}; + CLIPositionalArgumentKey key_posarg1 = CLIPositionalArgumentKey{0_n}; + CLIPositionalArgumentKey key_posarg2 = CLIPositionalArgumentKey{1_n}; int argc = 5; char const *argv[] = {"prog_name", "red", "-f", "world", "--flag3"}; diff --git a/lib/utils/test/src/utils/containers/at_idx.cc b/lib/utils/test/src/utils/containers/at_idx.cc new file mode 100644 index 0000000000..b2a6286b62 --- /dev/null +++ b/lib/utils/test/src/utils/containers/at_idx.cc @@ -0,0 +1,29 @@ +#include 
"utils/containers/at_idx.h" +#include "test/utils/doctest/fmt/optional.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("at_idx(std::vector, nonnegative_int)") { + std::vector vec = {1, 3, 2, 3}; + + SUBCASE("idx is in bounds") { + nonnegative_int idx = 1_n; + + std::optional result = at_idx(vec, idx); + std::optional correct = 3; + + CHECK(result == correct); + } + + SUBCASE("idx is out of bounds") { + nonnegative_int idx = 4_n; + + std::optional result = at_idx(vec, idx); + std::optional correct = std::nullopt; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/containers/enumerate.cc b/lib/utils/test/src/utils/containers/enumerate.cc index 2f9a5b3c02..2fdb2e481e 100644 --- a/lib/utils/test/src/utils/containers/enumerate.cc +++ b/lib/utils/test/src/utils/containers/enumerate.cc @@ -17,26 +17,27 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(std::vector)") { std::vector input = {"zero", "one", "two", "three"}; - std::map correct = { - {0, "zero"}, - {1, "one"}, - {2, "two"}, - {3, "three"}, + std::map correct = { + {0_n, "zero"}, + {1_n, "one"}, + {2_n, "two"}, + {3_n, "three"}, }; - std::map result = enumerate(input); + std::map result = enumerate(input); CHECK(result == correct); SUBCASE("check iteration order") { - std::vector> iterated_result = - vector_of(result); - std::vector> correct_iteration_order = { - {0, "zero"}, - {1, "one"}, - {2, "two"}, - {3, "three"}, - }; + std::vector> + iterated_result = vector_of(result); + std::vector> + correct_iteration_order = { + {0_n, "zero"}, + {1_n, "one"}, + {2_n, "two"}, + {3_n, "three"}, + }; CHECK(iterated_result == correct_iteration_order); } @@ -45,9 +46,9 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("enumerate(std::unordered_set)") { std::unordered_set input = {"A", "B", "C", "D"}; - std::unordered_set correct_keys = {0, 1, 2, 3}; + std::unordered_set correct_keys = {0_n, 1_n, 2_n, 3_n}; std::unordered_multiset correct_values = {"A", "B", "C", "D"}; - std::map result = enumerate(input); + std::map result = enumerate(input); CHECK(keys(result) == correct_keys); CHECK(unordered_multiset_of(values(result)) == correct_values); diff --git a/lib/utils/test/src/utils/containers/enumerate_vector.cc b/lib/utils/test/src/utils/containers/enumerate_vector.cc new file mode 100644 index 0000000000..fa5c5cf6fb --- /dev/null +++ b/lib/utils/test/src/utils/containers/enumerate_vector.cc @@ -0,0 +1,33 @@ +#include "utils/containers/enumerate_vector.h" +#include "test/utils/doctest/fmt/map.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("enumerate_vector(std::vector)") { + SUBCASE("input vector is empty") { + std::vector input = {}; + + std::map result = enumerate_vector(input); + std::map correct = {}; + + CHECK(result == correct); + } + + SUBCASE("input vector is not empty") { + std::vector input = {2, 3, 1, 3, 3}; + + std::map result = enumerate_vector(input); + std::map correct = { + {0_n, 2}, + {1_n, 3}, + {2_n, 1}, + {3_n, 3}, + {4_n, 3}, + }; + + CHECK(result == correct); + } + } +} diff --git a/lib/utils/test/src/utils/containers/flatmap.cc b/lib/utils/test/src/utils/containers/flatmap.cc index bd6d3ae5be..6a6d3c86a8 100644 --- a/lib/utils/test/src/utils/containers/flatmap.cc +++ b/lib/utils/test/src/utils/containers/flatmap.cc @@ -73,6 +73,38 @@ TEST_SUITE(FF_TEST_SUITE) { } } + TEST_CASE("flatmap(std::string, F)") { + std::string input = "aBabcBc"; + + SUBCASE("replacement length > 1") { + std::string result = flatmap(input, [](char c) -> 
std::string { + if (c == 'B') { + return ".."; + } else { + return std::string{c}; + } + }); + + std::string correct = "a..abc..c"; + + CHECK(result == correct); + } + + SUBCASE("replacement length == 0") { + std::string result = flatmap(input, [](char c) -> std::string { + if (c == 'B') { + return ""; + } else { + return std::string{c}; + } + }); + + std::string correct = "aabcc"; + + CHECK(result == correct); + } + } + TEST_CASE("flatmap(std::unordered_map, F)") { auto de_nest_keys = [](int k1, std::unordered_map const &v) { diff --git a/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc b/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc index f25bcf65b1..9fb4048691 100644 --- a/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc +++ b/lib/utils/test/src/utils/containers/get_all_permutations_with_repetition.cc @@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector input = {1, 2, 3}; std::unordered_multiset> result = - get_all_permutations_with_repetition(input, 1); + get_all_permutations_with_repetition(input, 1_n); std::unordered_multiset> correct = { {1}, {2}, @@ -27,7 +27,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector input = {1}; std::unordered_multiset> result = - get_all_permutations_with_repetition(input, 2); + get_all_permutations_with_repetition(input, 2_n); std::unordered_multiset> correct = { {1, 1}, }; @@ -39,7 +39,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector input = {1, 2}; std::unordered_multiset> result = - get_all_permutations_with_repetition(input, 3); + get_all_permutations_with_repetition(input, 3_n); std::unordered_multiset> correct = { {1, 1, 1}, {1, 1, 2}, @@ -58,7 +58,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::vector input = {1, 2, 2}; std::unordered_multiset> result = - get_all_permutations_with_repetition(input, 2); + get_all_permutations_with_repetition(input, 2_n); std::unordered_multiset> correct = {{1, 1}, {1, 2}, {1, 2}, diff --git a/lib/utils/test/src/utils/containers/make.cc b/lib/utils/test/src/utils/containers/make.cc new file mode 100644 index 0000000000..4070f5b35a --- /dev/null +++ b/lib/utils/test/src/utils/containers/make.cc @@ -0,0 +1,15 @@ +#include "utils/containers/make.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("make") { + auto f = make(); + + int result = f(true); + int correct = 1; + + CHECK(result == correct); + } +} diff --git a/lib/utils/test/src/utils/containers/merge_maps.cc b/lib/utils/test/src/utils/containers/merge_maps.cc index a083e94de3..4ec8054892 100644 --- a/lib/utils/test/src/utils/containers/merge_maps.cc +++ b/lib/utils/test/src/utils/containers/merge_maps.cc @@ -1,30 +1,80 @@ #include "utils/containers/merge_maps.h" #include "test/utils/doctest/fmt/unordered_map.h" #include -#include using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("merge_disjoint_maps") { + std::unordered_map l_map = { + {1, "one"}, + {2, "two"}, + }; - TEST_CASE("merge_maps") { + std::unordered_map r_map = { + {3, "three"}, + }; - SUBCASE("disjoint keys") { - std::unordered_map lhs = {{1, "one"}, {2, "two"}}; - std::unordered_map rhs = {{3, "three"}, {4, "four"}}; - - std::unordered_map result = merge_maps(lhs, rhs); - std::unordered_map correct = { - {1, "one"}, {2, "two"}, {3, "three"}, {4, "four"}}; + std::unordered_map correct = { + {1, "one"}, + {2, "two"}, + {3, "three"}, + }; + SUBCASE("maps are disjoint") { + std::unordered_map result = + merge_disjoint_maps(l_map, r_map); CHECK(result == correct); } - 
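// The three merge flavors exercised below differ only in overlap policy:
// merge_disjoint_maps rejects a shared key, merge_map_left_dominates keeps
// the left value, and merge_map_right_dominates keeps the right one. A
// compact sketch of the disjoint case under that reading; the name and
// error type are illustrative, and the patch's real implementation in
// utils/containers/merge_maps.h may differ.
#include <stdexcept>
#include <unordered_map>

template <typename K, typename V>
std::unordered_map<K, V>
    merge_disjoint_maps_sketch(std::unordered_map<K, V> const &lhs,
                               std::unordered_map<K, V> const &rhs) {
  std::unordered_map<K, V> result = lhs;
  for (auto const &[k, v] : rhs) {
    // emplace returns {iterator, false} when the key is already present
    if (!result.emplace(k, v).second) {
      throw std::runtime_error("merge_disjoint_maps: maps share a key");
    }
  }
  return result;
}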
SUBCASE("overlapping keys") { - std::unordered_map<int, std::string> lhs = {{1, "one"}, {2, "two"}}; - std::unordered_map<int, std::string> rhs = {{2, "three"}, {3, "four"}}; - - CHECK_THROWS(merge_maps(lhs, rhs)); + SUBCASE("maps are not disjoint") { + CHECK_THROWS(merge_disjoint_maps(l_map, l_map)); } } + + TEST_CASE("merge_map_left_dominates") { + std::unordered_map<int, std::string> l_map = { + {1, "one"}, + {2, "left_two"}, + }; + + std::unordered_map<int, std::string> r_map = { + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map<int, std::string> correct = { + {1, "one"}, + {2, "left_two"}, + {3, "three"}, + }; + + std::unordered_map<int, std::string> result = + merge_map_left_dominates(l_map, r_map); + + CHECK(result == correct); + } + + TEST_CASE("merge_map_right_dominates") { + std::unordered_map<int, std::string> l_map = { + {1, "one"}, + {2, "left_two"}, + }; + + std::unordered_map<int, std::string> r_map = { + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map<int, std::string> correct = { + {1, "one"}, + {2, "right_two"}, + {3, "three"}, + }; + + std::unordered_map<int, std::string> result = + merge_map_right_dominates(l_map, r_map); + + CHECK(result == correct); + } } diff --git a/lib/utils/test/src/utils/containers/product.cc b/lib/utils/test/src/utils/containers/product.cc index 3fa94c8e9e..2278bfba17 100644 --- a/lib/utils/test/src/utils/containers/product.cc +++ b/lib/utils/test/src/utils/containers/product.cc @@ -1,4 +1,6 @@ #include "utils/containers/product.h" +#include "utils/nonnegative_int/nonnegative_int.h" +#include #include #include #include @@ -29,4 +31,20 @@ TEST_SUITE(FF_TEST_SUITE) { CHECK(correct == result); } } + + TEST_CASE("product(std::vector)") { + SUBCASE("non-empty container") { + std::vector<nonnegative_int> input = {1_n, 2_n, 3_n, 5_n}; + nonnegative_int correct = 30_n; + auto result = product(input); + CHECK(correct == result); + } + + SUBCASE("single-element container") { + std::vector<nonnegative_int> input = {5_n}; + nonnegative_int correct = 5_n; + nonnegative_int result = product(input); + CHECK(result == correct); + } + } } diff --git a/lib/utils/test/src/utils/containers/repeat.cc b/lib/utils/test/src/utils/containers/repeat.cc index d8ffe76a64..d2fc595f49 100644 --- a/lib/utils/test/src/utils/containers/repeat.cc +++ b/lib/utils/test/src/utils/containers/repeat.cc @@ -7,7 +7,7 @@ using namespace ::FlexFlow; TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("repeat") { int x = 0; - std::vector<int> result = repeat(3, [&]() { + std::vector<int> result = repeat(3_n, [&]() { int result = x; x += 2; return result; diff --git a/lib/utils/test/src/utils/containers/replicate.cc b/lib/utils/test/src/utils/containers/repeat_element.cc similarity index 69% rename from lib/utils/test/src/utils/containers/replicate.cc rename to lib/utils/test/src/utils/containers/repeat_element.cc index 1c7845642e..08bee8bec8 100644 --- a/lib/utils/test/src/utils/containers/replicate.cc +++ b/lib/utils/test/src/utils/containers/repeat_element.cc @@ -1,4 +1,4 @@ -#include "utils/containers/replicate.h" +#include "utils/containers/repeat_element.h" #include "test/utils/doctest/fmt/unordered_set.h" #include "test/utils/doctest/fmt/vector.h" #include @@ -7,16 +7,17 @@ using namespace FlexFlow; TEST_SUITE(FF_TEST_SUITE) { - TEST_CASE("replicate") { + TEST_CASE("repeat_element") { SUBCASE("ints") { int x = 42; - std::vector<int> result = replicate(5, x); + std::vector<int> result = repeat_element(nonnegative_int{5}, x); std::vector<int> correct = {42, 42, 42, 42, 42}; CHECK(result == correct); } SUBCASE("unordered_set") { std::unordered_set<double> x = {1.0, 1.5}; - std::vector<std::unordered_set<double>> result = replicate(3, x); + std::vector<std::unordered_set<double>> result = +
repeat_element(nonnegative_int{3}, x); std::vector> correct = { {1.0, 1.5}, {1.0, 1.5}, {1.0, 1.5}}; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc index 25f990f80e..ff491f6b85 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms.cc @@ -12,19 +12,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_inputs/get_outputs") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({}, 1); + NodeAddedResult n3_added = g.add_node({}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -44,15 +44,15 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("topological_ordering") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc index f991b4a65e..0f812f2dec 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/dataflow_graphs_are_isomorphic.cc @@ -11,21 +11,21 @@ TEST_SUITE(FF_TEST_SUITE) { "dataflow_graphs_are_isomorphic(DataflowGraphView, DataflowGraphView)") { auto g1 = DataflowGraph::create(); - NodeAddedResult g1_n1_added = g1.add_node({}, 1); + NodeAddedResult g1_n1_added = g1.add_node({}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); - NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1); + NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1_n); Node g1_n2_node = g1_n2_added.node; auto g2 = DataflowGraph::create(); SUBCASE("input graphs are isomorphic") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = true; @@ -36,12 +36,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input graphs are not isomorphic (different connectivity)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 
1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = - g2.add_node({g2_n1_output, g2_n1_output}, 1); + g2.add_node({g2_n1_output, g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; bool correct = false; @@ -53,14 +53,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of src and sink " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 1); + NodeAddedResult g2_n3_added = g2.add_node({}, 1_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; @@ -72,15 +72,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of internal " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; DataflowOutput g2_n2_output = get_only(g2_n2_added.outputs); - NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1); + NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1_n); Node g2_n3_node = g2_n3_added.node; bool correct = false; diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc index 160e4c4f73..8974d09832 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/find_isomorphism.cc @@ -10,21 +10,21 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("find_isomorphism(DataflowGraphView, DataflowGraphView)") { auto g1 = DataflowGraph::create(); - NodeAddedResult g1_n1_added = g1.add_node({}, 1); + NodeAddedResult g1_n1_added = g1.add_node({}, 1_n); Node g1_n1_node = g1_n1_added.node; DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs); - NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1); + NodeAddedResult g1_n2_added = g1.add_node({g1_n1_output}, 1_n); Node g1_n2_node = g1_n2_added.node; auto g2 = DataflowGraph::create(); SUBCASE("input graphs are isomorphic") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; std::optional correct_isomorphism = @@ -41,12 +41,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("input graphs are not isomorphic (different connectivity)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); NodeAddedResult g2_n2_added = - g2.add_node({g2_n1_output, g2_n1_output}, 1); + g2.add_node({g2_n1_output, g2_n1_output}, 1_n); Node g2_n2_node = 
g2_n2_added.node; std::optional correct_isomorphism = @@ -59,14 +59,14 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of src and sink " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; - NodeAddedResult g2_n3_added = g2.add_node({}, 0); + NodeAddedResult g2_n3_added = g2.add_node({}, 0_n); Node g2_n3_node = g2_n3_added.node; std::optional correct_isomorphism = @@ -79,15 +79,15 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("input graphs are not isomorphic (different number of internal " "nodes)") { - NodeAddedResult g2_n1_added = g2.add_node({}, 1); + NodeAddedResult g2_n1_added = g2.add_node({}, 1_n); Node g2_n1_node = g2_n1_added.node; DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs); - NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1); + NodeAddedResult g2_n2_added = g2.add_node({g2_n1_output}, 1_n); Node g2_n2_node = g2_n2_added.node; DataflowOutput g2_n2_output = get_only(g2_n2_added.outputs); - NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1); + NodeAddedResult g2_n3_added = g2.add_node({g2_n2_output}, 1_n); Node g2_n3_node = g2_n3_added.node; std::optional correct_isomorphism = diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc index fec5d3401e..e619cc3b1c 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_dataflow_edges_from_node_to_node.cc @@ -11,12 +11,12 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); SUBCASE("gets edges if there are multiple") { - NodeAddedResult n1_added = g.add_node({}, 2); + NodeAddedResult n1_added = g.add_node({}, 2_n); Node n1 = n1_added.node; DataflowOutput n1_o0 = n1_added.outputs.at(0); DataflowOutput n1_o1 = n1_added.outputs.at(1); - NodeAddedResult n2_added = g.add_node({n1_o0, n1_o0, n1_o1}, 0); + NodeAddedResult n2_added = g.add_node({n1_o0, n1_o0, n1_o1}, 0_n); Node n2 = n2_added.node; std::unordered_set result = @@ -24,15 +24,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ n1_o0, - DataflowInput{n2, 0}, + DataflowInput{n2, 0_n}, }, DataflowEdge{ n1_o0, - DataflowInput{n2, 1}, + DataflowInput{n2, 1_n}, }, DataflowEdge{ n1_o1, - DataflowInput{n2, 2}, + DataflowInput{n2, 2_n}, }, }; @@ -40,15 +40,15 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not get edges to/from other nodes") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); @@ -61,11 +61,11 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE( "does not get flipped edges (i.e., respects from vs to direction)") { - 
NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 0); + NodeAddedResult n2_added = g.add_node({o1}, 0_n); Node n2 = n2_added.node; std::unordered_set result = @@ -76,10 +76,10 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("returns empty set if no edges exist between the given nodes") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; std::unordered_set result = @@ -91,7 +91,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("returns empty set if src node == dst node (as cycles cannot exist " "in DataflowGraph") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; std::unordered_set result = diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc index 86e4802cdb..f55afbacc1 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_incoming_edges.cc @@ -10,34 +10,34 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_incoming_edges(DataflowGraphView, Node)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); SUBCASE("n4 - multiple incoming edges") { std::vector result = get_incoming_edges(g, n4); std::vector correct = { - DataflowEdge{o2, DataflowInput{n4, 0}}, - DataflowEdge{o3, DataflowInput{n4, 1}}}; + DataflowEdge{o2, DataflowInput{n4, 0_n}}, + DataflowEdge{o3, DataflowInput{n4, 1_n}}}; CHECK(result == correct); } SUBCASE("n3- single incoming edge") { std::vector result = get_incoming_edges(g, n3); std::vector correct = { - DataflowEdge{o2, DataflowInput{n3, 0}}, + DataflowEdge{o2, DataflowInput{n3, 0_n}}, }; CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc index be874b7e29..c37dcf5be7 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_outgoing_edges.cc @@ -10,26 +10,26 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_outgoing_edges(DataflowGraphView, Node)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + 
NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1}, 1); + NodeAddedResult n3_added = g.add_node({o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2}, 1); + NodeAddedResult n4_added = g.add_node({o2}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); SUBCASE("n2 - single outgoing edge") { std::unordered_set result = get_outgoing_edges(g, n2); std::unordered_set correct = { - DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o2, DataflowInput{n4, 0_n}}, }; CHECK(result == correct); } @@ -37,8 +37,8 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("n1 - multiple outgoing edges") { std::unordered_set result = get_outgoing_edges(g, n1); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, }; CHECK(result == correct); } @@ -53,19 +53,19 @@ TEST_SUITE(FF_TEST_SUITE) { TEST_CASE("get_outgoing_edges(DataflowGraphView, std::unordered_set)") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1}, 1); + NodeAddedResult n3_added = g.add_node({o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2}, 1); + NodeAddedResult n4_added = g.add_node({o2}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -73,9 +73,9 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set nodes = {n1, n2}; std::unordered_set result = get_outgoing_edges(g, nodes); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, - DataflowEdge{o2, DataflowInput{n4, 0}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, + DataflowEdge{o2, DataflowInput{n4, 0_n}}, }; CHECK(result == correct); } diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc index 330628adfd..6c770a9d29 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_incoming_edges.cc @@ -11,19 +11,19 @@ TEST_SUITE(FF_TEST_SUITE) { "std::unordered_set") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2, o1}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2, o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult 
n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -33,9 +33,9 @@ TEST_SUITE(FF_TEST_SUITE) { get_subgraph_incoming_edges(g, input_node_set); std::unordered_set correct = { - DataflowEdge{o1, DataflowInput{n2, 0}}, - DataflowEdge{o1, DataflowInput{n3, 0}}, - DataflowEdge{o1, DataflowInput{n3, 2}}, + DataflowEdge{o1, DataflowInput{n2, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 0_n}}, + DataflowEdge{o1, DataflowInput{n3, 2_n}}, }; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc index 779d0a9560..bb7f3c4c30 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/get_subgraph_outgoing_edges.cc @@ -11,19 +11,19 @@ TEST_SUITE(FF_TEST_SUITE) { "std::unordered_set") { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2}, 1); + NodeAddedResult n3_added = g.add_node({o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o1, o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -33,8 +33,8 @@ TEST_SUITE(FF_TEST_SUITE) { get_subgraph_outgoing_edges(g, input_node_set); std::unordered_set correct = { - DataflowEdge{o2, DataflowInput{n4, 1}}, - DataflowEdge{o3, DataflowInput{n4, 2}}, + DataflowEdge{o2, DataflowInput{n4, 1_n}}, + DataflowEdge{o3, DataflowInput{n4, 2_n}}, }; CHECK(result == correct); diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc index c35789044d..4e26812315 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_boundary_nodes_for_split.cc @@ -19,19 +19,19 @@ TEST_SUITE(FF_TEST_SUITE) { DataflowGraph g = DataflowGraph::create(); - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); diff --git 
a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc index 1f8f66b932..38b722ec70 100644 --- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc +++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_edges_across_split.cc @@ -25,19 +25,19 @@ TEST_SUITE(FF_TEST_SUITE) { auto make_leaf = [](Node const &n) { return BinarySPDecompositionTree{n}; }; SUBCASE("multiple nodes with edges across") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({}, 1); + NodeAddedResult n2_added = g.add_node({}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o2, o1}, 1); + NodeAddedResult n3_added = g.add_node({o2, o1}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o1}, 1); + NodeAddedResult n4_added = g.add_node({o1}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -54,15 +54,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ o1, - DataflowInput{n3, 1}, + DataflowInput{n3, 1_n}, }, DataflowEdge{ o2, - DataflowInput{n3, 0}, + DataflowInput{n3, 0_n}, }, DataflowEdge{ o1, - DataflowInput{n4, 0}, + DataflowInput{n4, 0_n}, }, }; @@ -70,12 +70,12 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("nodes each have multiple edges across") { - NodeAddedResult n1_added = g.add_node({}, 2); + NodeAddedResult n1_added = g.add_node({}, 2_n); Node n1 = n1_added.node; DataflowOutput n1_o1 = n1_added.outputs.at(0); DataflowOutput n1_o2 = n1_added.outputs.at(1); - NodeAddedResult n2_added = g.add_node({n1_o1, n1_o2, n1_o1}, 1); + NodeAddedResult n2_added = g.add_node({n1_o1, n1_o2, n1_o1}, 1_n); Node n2 = n2_added.node; TransitiveReducedDataflowGraphView tr_g = @@ -91,15 +91,15 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ n1_o1, - DataflowInput{n2, 0}, + DataflowInput{n2, 0_n}, }, DataflowEdge{ n1_o2, - DataflowInput{n2, 1}, + DataflowInput{n2, 1_n}, }, DataflowEdge{ n1_o1, - DataflowInput{n2, 2}, + DataflowInput{n2, 2_n}, }, }; @@ -107,19 +107,19 @@ TEST_SUITE(FF_TEST_SUITE) { } SUBCASE("does not return edges eliminated by transitive reduction") { - NodeAddedResult n1_added = g.add_node({}, 1); + NodeAddedResult n1_added = g.add_node({}, 1_n); Node n1 = n1_added.node; DataflowOutput o1 = get_only(n1_added.outputs); - NodeAddedResult n2_added = g.add_node({o1}, 1); + NodeAddedResult n2_added = g.add_node({o1}, 1_n); Node n2 = n2_added.node; DataflowOutput o2 = get_only(n2_added.outputs); - NodeAddedResult n3_added = g.add_node({o1, o2}, 1); + NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n); Node n3 = n3_added.node; DataflowOutput o3 = get_only(n3_added.outputs); - NodeAddedResult n4_added = g.add_node({o2, o3}, 1); + NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n); Node n4 = n4_added.node; DataflowOutput o4 = get_only(n4_added.outputs); @@ -136,7 +136,7 @@ TEST_SUITE(FF_TEST_SUITE) { std::unordered_set correct = { DataflowEdge{ o2, - DataflowInput{n3, 1}, + 
+              DataflowInput{n3, 1_n},
           },
       };
diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc
index 0e77739434..f922721fde 100644
--- a/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc
+++ b/lib/utils/test/src/utils/graph/dataflow_graph/algorithms/transitive_reduced_dataflow_graph/get_transitive_reduced_outputs_across_split.cc
@@ -19,19 +19,19 @@ TEST_SUITE(FF_TEST_SUITE) {

     DataflowGraph g = DataflowGraph::create();

-    NodeAddedResult n1_added = g.add_node({}, 1);
+    NodeAddedResult n1_added = g.add_node({}, 1_n);
     Node n1 = n1_added.node;
     DataflowOutput o1 = get_only(n1_added.outputs);

-    NodeAddedResult n2_added = g.add_node({o1}, 1);
+    NodeAddedResult n2_added = g.add_node({o1}, 1_n);
     Node n2 = n2_added.node;
     DataflowOutput o2 = get_only(n2_added.outputs);

-    NodeAddedResult n3_added = g.add_node({o1, o2}, 1);
+    NodeAddedResult n3_added = g.add_node({o1, o2}, 1_n);
     Node n3 = n3_added.node;
     DataflowOutput o3 = get_only(n3_added.outputs);

-    NodeAddedResult n4_added = g.add_node({o2, o3}, 1);
+    NodeAddedResult n4_added = g.add_node({o2, o3}, 1_n);
     Node n4 = n4_added.node;
     DataflowOutput o4 = get_only(n4_added.outputs);
diff --git a/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc b/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc
index 7a3237d432..ec3ad86fe6 100644
--- a/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc
+++ b/lib/utils/test/src/utils/graph/dataflow_graph/unordered_open_dataflow_graph.cc
@@ -31,7 +31,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       REQUIRE(result == correct);
     }

-    NodeAddedResult added = g.add_node({}, 2);
+    NodeAddedResult added = g.add_node({}, 2_n);

     {
       std::unordered_set<Node> result = g.query_nodes(node_query_all());
@@ -54,7 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       REQUIRE(result == correct);
     }

-    NodeAddedResult added2 = g.add_node(added.outputs, 3);
+    NodeAddedResult added2 = g.add_node(added.outputs, 3_n);

     {
       std::unordered_set<Node> result = g.query_nodes(node_query_all());
@@ -66,8 +66,8 @@ TEST_SUITE(FF_TEST_SUITE) {
     {
       std::unordered_set<DataflowEdge> result =
           g.query_edges(dataflow_edge_query_all());
       std::unordered_set<DataflowEdge> correct = {
-          DataflowEdge{added.outputs.at(0), DataflowInput{added2.node, 0}},
-          DataflowEdge{added.outputs.at(1), DataflowInput{added2.node, 1}},
+          DataflowEdge{added.outputs.at(0), DataflowInput{added2.node, 0_n}},
+          DataflowEdge{added.outputs.at(1), DataflowInput{added2.node, 1_n}},
       };
       REQUIRE(result == correct);
     }
diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc
index 93d3d9605b..d9d91a03e9 100644
--- a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc
+++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_edges.cc
@@ -10,7 +10,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("add_edges(MultiDiGraph &, std::vector<std::pair<Node, Node>>)") {
     MultiDiGraph g = MultiDiGraph::create();

-    std::vector<Node> n = add_nodes(g, 3);
+    std::vector<Node> n = add_nodes(g, 3_n);

     std::vector<std::pair<Node, Node>> input = {
         {n.at(0), n.at(1)},
diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc
index e41bf33d6c..e3d9ee6a29 100644
--- a/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc
+++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/add_nodes.cc
@@ -9,7 +9,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("add_nodes(MultiDiGraph &, int)") {
     MultiDiGraph g = MultiDiGraph::create();

-    std::unordered_set<Node> result = unordered_set_of(add_nodes(g, 3));
+    std::unordered_set<Node> result = unordered_set_of(add_nodes(g, 3_n));
     std::unordered_set<Node> correct = g.query_nodes(node_query_all());

     CHECK(result == correct);
diff --git a/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc b/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc
index aef6d9baff..0dfcc8a851 100644
--- a/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc
+++ b/lib/utils/test/src/utils/graph/multidigraph/algorithms/get_edges.cc
@@ -11,7 +11,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("get_edges(MultiDiGraphView)") {
     MultiDiGraph g = MultiDiGraph::create();

-    std::vector<Node> n = add_nodes(g, 3);
+    std::vector<Node> n = add_nodes(g, 3_n);
     std::vector<MultiDiEdge> e = add_edges(g,
                                            {
                                                {n.at(0), n.at(1)},
diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc
index 78aaa8d9fc..55b7b34e52 100644
--- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc
+++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/find_isomorphism.cc
@@ -26,12 +26,13 @@ TEST_SUITE(FF_TEST_SUITE) {
     SUBCASE("input graphs are not empty") {
       DataflowGraphInput g1_i1 = g1.add_input();

-      NodeAddedResult g1_n1_added = g1.add_node({OpenDataflowValue{g1_i1}}, 1);
+      NodeAddedResult g1_n1_added =
+          g1.add_node({OpenDataflowValue{g1_i1}}, 1_n);
       Node g1_n1_node = g1_n1_added.node;
       DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs);

       NodeAddedResult g1_n2_added = g1.add_node(
-          {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1);
+          {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1_n);
       Node g1_n2_node = g1_n2_added.node;

       SUBCASE("one graph is empty") {
@@ -46,11 +47,11 @@ TEST_SUITE(FF_TEST_SUITE) {
       SUBCASE("input graphs are isomorphic") {
         DataflowGraphInput g2_i1 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
-            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1);
+            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n);
         Node g2_n2_node = g2_n2_added.node;

         std::optional correct =
@@ -75,11 +76,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         DataflowGraphInput g2_i1 = g2.add_input();
         DataflowGraphInput g2_i2 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
-            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1);
+            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n);
         Node g2_n2_node = g2_n2_added.node;

         std::optional correct = std::nullopt;
@@ -93,12 +94,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       SUBCASE("input graphs are not isomorphic (different connectivity)") {
         DataflowGraphInput g2_i1 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
             {OpenDataflowValue{g2_n1_output}, OpenDataflowValue{g2_n1_output}},
-            1);
+            1_n);
         Node g2_n2_node = g2_n2_added.node;

         std::optional correct = std::nullopt;
@@ -112,14 +113,14 @@ TEST_SUITE(FF_TEST_SUITE) {
       SUBCASE("input graphs are not isomorphic (different numbers of nodes)") {
         DataflowGraphInput g2_i1 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
-            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1);
+            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n);
         Node g2_n2_node = g2_n2_added.node;

-        NodeAddedResult g2_n3_added = g2.add_node({}, 0);
+        NodeAddedResult g2_n3_added = g2.add_node({}, 0_n);
         Node g2_n3_node = g2_n3_added.node;

         std::optional correct = std::nullopt;
diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc
index ff75e8fe48..fd54b801ce 100644
--- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc
+++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_graph_inputs.cc
@@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     DataflowGraphInput i0 = g.add_input();
     DataflowGraphInput i1 = g.add_input();

-    NodeAddedResult n0_added = g.add_node({}, 1);
+    NodeAddedResult n0_added = g.add_node({}, 1_n);

     std::unordered_set<DataflowGraphInput> result =
         get_open_dataflow_graph_inputs(g);
diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc
index 7496c3009d..c7d294a588 100644
--- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc
+++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_open_dataflow_value_uses.cc
@@ -18,19 +18,19 @@ TEST_SUITE(FF_TEST_SUITE) {
     NodeAddedResult n0_added = g.add_node(
         {OpenDataflowValue{i0}, OpenDataflowValue{i1}, OpenDataflowValue{i0}},
-        1);
+        1_n);
     Node n0 = n0_added.node;
     DataflowOutput o0 = get_only(n0_added.outputs);

     NodeAddedResult n1_added = g.add_node(
         {OpenDataflowValue{i1}, OpenDataflowValue{o0}, OpenDataflowValue{i0}},
-        1);
+        1_n);
     Node n1 = n1_added.node;

     std::unordered_set<DataflowInput> correct = {
-        DataflowInput{n0, 0},
-        DataflowInput{n0, 2},
-        DataflowInput{n1, 2},
+        DataflowInput{n0, 0_n},
+        DataflowInput{n0, 2_n},
+        DataflowInput{n1, 2_n},
     };

     std::unordered_set<DataflowInput> result =
@@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) {

     DataflowGraphInput i0 = g.add_input();

-    NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 2);
+    NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 2_n);
     Node n0 = n0_added.node;
     DataflowOutput o0_0 = n0_added.outputs.at(0);
     DataflowOutput o0_1 = n0_added.outputs.at(1);
@@ -53,16 +53,16 @@ TEST_SUITE(FF_TEST_SUITE) {
     NodeAddedResult n1_added =
         g.add_node({OpenDataflowValue{i0},
                     OpenDataflowValue{o0_1},
                     OpenDataflowValue{o0_0}},
-        1);
+        1_n);
     Node n1 = n1_added.node;

     NodeAddedResult n2_added =
-        g.add_node({OpenDataflowValue{o0_1}, OpenDataflowValue{i0}}, 1);
+        g.add_node({OpenDataflowValue{o0_1}, OpenDataflowValue{i0}}, 1_n);
     Node n2 = n2_added.node;

     std::unordered_set<DataflowInput> correct = {
-        DataflowInput{n1, 1},
-        DataflowInput{n2, 0},
+        DataflowInput{n1, 1_n},
+        DataflowInput{n2, 0_n},
     };

     std::unordered_set<DataflowInput> result =
diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc
index ddd6d74119..e1a2062865 100644
--- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc
+++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/get_unused_open_dataflow_graph_inputs.cc
@@ -13,7 +13,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     DataflowGraphInput g_i2 = g.add_input();
     DataflowGraphInput g_i3 = g.add_input();

-    NodeAddedResult g_n1_added = g.add_node({OpenDataflowValue{g_i2}}, 1);
+    NodeAddedResult g_n1_added = g.add_node({OpenDataflowValue{g_i2}}, 1_n);

     std::unordered_set<DataflowGraphInput> result =
         get_unused_open_dataflow_graph_inputs(g);
@@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     DataflowGraphInput g_i2 = g.add_input();

     NodeAddedResult g_n1_added =
-        g.add_node({OpenDataflowValue{g_i1}, OpenDataflowValue{g_i2}}, 1);
+        g.add_node({OpenDataflowValue{g_i1}, OpenDataflowValue{g_i2}}, 1_n);

     std::unordered_set<DataflowGraphInput> result =
         get_unused_open_dataflow_graph_inputs(g);
diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc
index bdb1bb4814..c53e069f68 100644
--- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc
+++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/open_dataflow_graphs_are_isomorphic.cc
@@ -21,12 +21,13 @@ TEST_SUITE(FF_TEST_SUITE) {
     SUBCASE("input graphs are not empty") {
       DataflowGraphInput g1_i1 = g1.add_input();

-      NodeAddedResult g1_n1_added = g1.add_node({OpenDataflowValue{g1_i1}}, 1);
+      NodeAddedResult g1_n1_added =
+          g1.add_node({OpenDataflowValue{g1_i1}}, 1_n);
       Node g1_n1_node = g1_n1_added.node;
       DataflowOutput g1_n1_output = get_only(g1_n1_added.outputs);

       NodeAddedResult g1_n2_added = g1.add_node(
-          {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1);
+          {OpenDataflowValue{g1_i1}, OpenDataflowValue{g1_n1_output}}, 1_n);
       Node g1_n2_node = g1_n2_added.node;

       SUBCASE("one input graph is empty") {
@@ -39,11 +40,11 @@ TEST_SUITE(FF_TEST_SUITE) {
       SUBCASE("input graphs are isomorphic") {
         DataflowGraphInput g2_i1 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
-            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1);
+            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n);
         Node g2_n2_node = g2_n2_added.node;

         bool correct = true;
@@ -57,11 +58,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         DataflowGraphInput g2_i1 = g2.add_input();
         DataflowGraphInput g2_i2 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
-            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1);
+            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n);
         Node g2_n2_node = g2_n2_added.node;

         bool correct = false;
@@ -73,12 +74,12 @@ TEST_SUITE(FF_TEST_SUITE) {
       SUBCASE("input graphs are not isomorphic (different connectivity)") {
         DataflowGraphInput g2_i1 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
             {OpenDataflowValue{g2_n1_output}, OpenDataflowValue{g2_n1_output}},
-            1);
+            1_n);
         Node g2_n2_node = g2_n2_added.node;

         bool correct = false;
@@ -90,14 +91,14 @@ TEST_SUITE(FF_TEST_SUITE) {
      SUBCASE("input graphs are not isomorphic (different numbers of nodes)") {
         DataflowGraphInput g2_i1 = g2.add_input();
         NodeAddedResult g2_n1_added =
-            g2.add_node({OpenDataflowValue{g2_i1}}, 1);
+            g2.add_node({OpenDataflowValue{g2_i1}}, 1_n);
         Node g2_n1_node = g2_n1_added.node;
         DataflowOutput g2_n1_output = get_only(g2_n1_added.outputs);
         NodeAddedResult g2_n2_added = g2.add_node(
-            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1);
+            {OpenDataflowValue{g2_i1}, OpenDataflowValue{g2_n1_output}}, 1_n);
         Node g2_n2_node = g2_n2_added.node;

-        NodeAddedResult g2_n3_added = g2.add_node({}, 0);
+        NodeAddedResult g2_n3_added = g2.add_node({}, 0_n);
         Node g2_n3_node = g2_n3_added.node;

         bool correct = false;
diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc
index b565e46e67..90682cf0f0 100644
--- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc
+++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_input_ids.cc
@@ -17,11 +17,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     DataflowGraphInput i0 = g.add_input();
     DataflowGraphInput i1 = g.add_input();

-    NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1);
+    NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n);
     Node n0 = n0_added.node;
     DataflowOutput n0_output = get_only(n0_added.outputs);

-    NodeAddedResult n1_added = g.add_node({OpenDataflowValue{n0_output}}, 1);
+    NodeAddedResult n1_added = g.add_node({OpenDataflowValue{n0_output}}, 1_n);
     Node n1 = n1_added.node;
     DataflowOutput n1_output = get_only(n1_added.outputs);

@@ -44,7 +44,7 @@ TEST_SUITE(FF_TEST_SUITE) {
             new_i0,
             DataflowInput{
                 n0,
-                0,
+                0_n,
             },
         },
     },
@@ -52,11 +52,11 @@ TEST_SUITE(FF_TEST_SUITE) {
         DataflowEdge{
             DataflowOutput{
                 n0,
-                0,
+                0_n,
             },
             DataflowInput{
                 n1,
-                0,
+                0_n,
             },
         },
     },
@@ -65,11 +65,11 @@ TEST_SUITE(FF_TEST_SUITE) {
     {
         DataflowOutput{
             n0,
-            0,
+            0_n,
        },
        DataflowOutput{
            n1,
-            0,
+            0_n,
        },
    },
 };
diff --git a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc
index 36bcd16dad..1e7ad87d88 100644
--- a/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc
+++ b/lib/utils/test/src/utils/graph/open_dataflow_graph/algorithms/permute_node_ids.cc
@@ -17,12 +17,12 @@ TEST_SUITE(FF_TEST_SUITE) {

     DataflowGraphInput i0 = g.add_input();

-    NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1);
+    NodeAddedResult n0_added = g.add_node({OpenDataflowValue{i0}}, 1_n);
     Node n0 = n0_added.node;
     DataflowOutput n0_output = get_only(n0_added.outputs);

     NodeAddedResult n1_added =
-        g.add_node({OpenDataflowValue{i0}, OpenDataflowValue{n0_output}}, 1);
+        g.add_node({OpenDataflowValue{i0}, OpenDataflowValue{n0_output}}, 1_n);
     Node n1 = n1_added.node;
     DataflowOutput n1_output = get_only(n1_added.outputs);

@@ -45,7 +45,7 @@ TEST_SUITE(FF_TEST_SUITE) {
             i0,
             DataflowInput{
                 new_node0,
-                0,
+                0_n,
             },
         },
     },
@@ -54,7 +54,7 @@ TEST_SUITE(FF_TEST_SUITE) {
             i0,
             DataflowInput{
                 new_node1,
-                0,
+                0_n,
             },
         },
     },
@@ -62,11 +62,11 @@ TEST_SUITE(FF_TEST_SUITE) {
        DataflowEdge{
            DataflowOutput{
                new_node0,
-                0,
+                0_n,
            },
            DataflowInput{
                new_node1,
-                1,
+                1_n,
            },
        },
    },
@@ -75,11 +75,11 @@ TEST_SUITE(FF_TEST_SUITE) {
    {
        DataflowOutput{
            new_node0,
-            0,
+            0_n,
        },
        DataflowOutput{
            new_node1,
-            0,
+            0_n,
        },
    },
 };
@@ -109,9 +109,9 @@ TEST_SUITE(FF_TEST_SUITE) {
     SUBCASE("check access to old edges") {
       OpenDataflowEdgeQuery query = OpenDataflowEdgeQuery{
           dataflow_input_edge_query_for_edge(
-              DataflowInputEdge{i0, DataflowInput{n0, 0}}),
+              DataflowInputEdge{i0, DataflowInput{n0, 0_n}}),
           dataflow_edge_query_for_edge(
-              DataflowEdge{n0_output, DataflowInput{n1, 1}}),
+              DataflowEdge{n0_output, DataflowInput{n1, 1_n}}),
       };

       std::unordered_set<OpenDataflowEdge> result_nodes =
           result.query_edges(query);
@@ -121,12 +121,12 @@ TEST_SUITE(FF_TEST_SUITE) {
     SUBCASE("check access to new edges") {
       DataflowEdge new_standard_edge = DataflowEdge{
-          DataflowOutput{new_node0, 0},
-          DataflowInput{new_node1, 1},
+          DataflowOutput{new_node0, 0_n},
+          DataflowInput{new_node1, 1_n},
       };
       DataflowInputEdge new_input_edge = DataflowInputEdge{
           i0,
-          DataflowInput{new_node0, 0},
+          DataflowInput{new_node0, 0_n},
       };
       OpenDataflowEdgeQuery query = OpenDataflowEdgeQuery{
           dataflow_input_edge_query_for_edge(new_input_edge),
@@ -159,7 +159,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("check access to new outputs") {
-      DataflowOutput new_output = DataflowOutput{new_node0, 0};
+      DataflowOutput new_output = DataflowOutput{new_node0, 0_n};

       DataflowOutputQuery query =
           dataflow_output_query_for_output(new_output);
diff --git a/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc b/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc
index a62f528bcf..a2f818b5e9 100644
--- a/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc
+++ b/lib/utils/test/src/utils/graph/series_parallel/parallel_reduction.cc
@@ -14,7 +14,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("find_parallel_reduction") {
     MultiDiGraph g = MultiDiGraph::create();
     SUBCASE("base case") {
-      std::vector<Node> n = add_nodes(g, 2);
+      std::vector<Node> n = add_nodes(g, 2_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -28,7 +28,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("does not apply when there is only one edge") {
-      std::vector<Node> n = add_nodes(g, 2);
+      std::vector<Node> n = add_nodes(g, 2_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -40,7 +40,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("requires both ends be the same") {
-      std::vector<Node> n = add_nodes(g, 3);
+      std::vector<Node> n = add_nodes(g, 3_n);
       SUBCASE("branch out") {
         std::vector<MultiDiEdge> e = add_edges(g,
                                                {
@@ -67,7 +67,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("finds one reduction when there are multiple") {
-      std::vector<Node> n = add_nodes(g, 2);
+      std::vector<Node> n = add_nodes(g, 2_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -86,7 +86,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("in larger graph") {
-      std::vector<Node> n = add_nodes(g, 5);
+      std::vector<Node> n = add_nodes(g, 5_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -109,7 +109,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     MultiDiGraph g = MultiDiGraph::create();

     SUBCASE("base case") {
-      std::vector<Node> n = add_nodes(g, 2);
+      std::vector<Node> n = add_nodes(g, 2_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -142,7 +142,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("in larger graph") {
-      std::vector<Node> n = add_nodes(g, 5);
+      std::vector<Node> n = add_nodes(g, 5_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
diff --git a/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc b/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc
index c6b45ec6ce..4bb57aeb0d 100644
--- a/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc
+++ b/lib/utils/test/src/utils/graph/series_parallel/series_reduction.cc
@@ -12,7 +12,7 @@ using namespace ::FlexFlow;
 TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("get_pre/post/center_node") {
     MultiDiGraph g = MultiDiGraph::create();
-    std::vector<Node> n = add_nodes(g, 3);
+    std::vector<Node> n = add_nodes(g, 3_n);
     std::vector<MultiDiEdge> e = add_edges(g,
                                            {
                                                {n.at(0), n.at(1)},
@@ -42,7 +42,7 @@ TEST_SUITE(FF_TEST_SUITE) {
   TEST_CASE("find_series_reduction") {
     MultiDiGraph g = MultiDiGraph::create();
     SUBCASE("base case") {
-      std::vector<Node> n = add_nodes(g, 3);
+      std::vector<Node> n = add_nodes(g, 3_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -57,7 +57,7 @@ TEST_SUITE(FF_TEST_SUITE) {

     SUBCASE("does not find if other edges are involved with center node") {
       SUBCASE("duplicate edge") {
-        std::vector<Node> n = add_nodes(g, 3);
+        std::vector<Node> n = add_nodes(g, 3_n);
         std::vector<MultiDiEdge> e = add_edges(g,
                                                {
                                                    {n.at(0), n.at(1)},
@@ -71,7 +71,7 @@ TEST_SUITE(FF_TEST_SUITE) {
       }

       SUBCASE("misc edge") {
-        std::vector<Node> n = add_nodes(g, 4);
+        std::vector<Node> n = add_nodes(g, 4_n);
         std::vector<MultiDiEdge> e = add_edges(g,
                                                {
                                                    {n.at(0), n.at(1)},
@@ -86,7 +86,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("does find if other edges are involved with non-center node") {
-      std::vector<Node> n = add_nodes(g, 4);
+      std::vector<Node> n = add_nodes(g, 4_n);
       SUBCASE("edge from dst") {
         std::vector<MultiDiEdge> e = add_edges(g,
                                                {
@@ -107,7 +107,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("finds one reduction when there are multiple") {
-      std::vector<Node> n = add_nodes(g, 4);
+      std::vector<Node> n = add_nodes(g, 4_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -125,7 +125,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("in larger graph") {
-      std::vector<Node> n = add_nodes(g, 8);
+      std::vector<Node> n = add_nodes(g, 8_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(2)},
@@ -149,7 +149,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     MultiDiGraph g = MultiDiGraph::create();

     SUBCASE("base case") {
-      std::vector<Node> n = add_nodes(g, 3);
+      std::vector<Node> n = add_nodes(g, 3_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(1)},
@@ -188,7 +188,7 @@ TEST_SUITE(FF_TEST_SUITE) {
     }

     SUBCASE("in larger graph") {
-      std::vector<Node> n = add_nodes(g, 8);
+      std::vector<Node> n = add_nodes(g, 8_n);
       std::vector<MultiDiEdge> e = add_edges(g,
                                              {
                                                  {n.at(0), n.at(2)},
diff --git a/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc
new file mode 100644
index 0000000000..7ac882ff9f
--- /dev/null
+++ b/lib/utils/test/src/utils/nonnegative_int/ceildiv.cc
@@ -0,0 +1,52 @@
+#include "utils/nonnegative_int/ceildiv.h"
+#include <doctest/doctest.h>
+
+using namespace ::FlexFlow;
+
+TEST_SUITE(FF_TEST_SUITE) {
+  TEST_CASE("ceildiv(nonnegative_int, nonnegative_int)") {
+    SUBCASE("divides evenly") {
+      nonnegative_int numerator = 12_n;
+      nonnegative_int denominator = 3_n;
+
+      nonnegative_int result = ceildiv(numerator, denominator);
+      nonnegative_int correct = 4_n;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("does not divide evenly") {
+      nonnegative_int numerator = 17_n;
+      nonnegative_int denominator = 4_n;
+
+      nonnegative_int result = ceildiv(numerator, denominator);
+      nonnegative_int correct = 5_n;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("denominator is zero") {
+      nonnegative_int numerator = 15_n;
+      nonnegative_int denominator = 0_n;
+
+      CHECK_THROWS(ceildiv(numerator, denominator));
+    }
+
+    SUBCASE("numerator is zero") {
+      nonnegative_int numerator = 0_n;
+      nonnegative_int denominator = 1_n;
+
+      nonnegative_int result = ceildiv(numerator, denominator);
+      nonnegative_int correct = 0_n;
+
+      CHECK(result == correct);
+    }
+
+    SUBCASE("denominator and numerator are zero") {
+      nonnegative_int numerator = 0_n;
+      nonnegative_int denominator = 0_n;
+
+      CHECK_THROWS(ceildiv(numerator, denominator));
+    }
+  }
+}
diff --git a/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc
index 73d382d830..dfde11f9bd 100644
--- a/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc
+++ b/lib/utils/test/src/utils/nonnegative_int/nonnegative_int.cc
@@ -198,13 +198,89 @@ TEST_SUITE(FF_TEST_SUITE) {
     }
   }

-  TEST_CASE("nonnegative_int + operation") {
-    nonnegative_int nn_int_1a = nonnegative_int{1};
-    nonnegative_int nn_int_1b = nonnegative_int{1};
-    nonnegative_int nn_int_2 = nonnegative_int{2};
-    SUBCASE("LHS: nonnegative_int, RHS: nonnegative_int") {
-      CHECK(nn_int_1a + nn_int_1b == nn_int_2);
-    }
+  TEST_CASE("nonnegative_int::operator+(nonnegative_int)") {
+    nonnegative_int result = nonnegative_int{1} + nonnegative_int{2};
+    nonnegative_int correct = nonnegative_int{3};
+
+    CHECK(result == correct);
+  }
+
+  TEST_CASE("nonnegative_int::operator++() (pre-increment)") {
+    nonnegative_int input = nonnegative_int{1};
+
+    nonnegative_int result = ++input;
+    nonnegative_int correct = nonnegative_int{2};
+
+    CHECK(result == correct);
+    CHECK(input == correct);
+  }
+
+  TEST_CASE("nonnegative_int::operator++(int) (post-increment)") {
+    nonnegative_int input = nonnegative_int{1};
+
+    nonnegative_int result = input++;
+    nonnegative_int correct_input = nonnegative_int{2};
+    nonnegative_int correct_result = nonnegative_int{1};
+
+    CHECK(result == correct_result);
+    CHECK(input == correct_input);
+  }
+
+  TEST_CASE("nonnegative_int::operator+=(nonnegative_int)") {
+    nonnegative_int result = nonnegative_int{1};
+    result += nonnegative_int{3};
+
+    nonnegative_int correct = nonnegative_int{4};
+
+    CHECK(result == correct);
+  }
+
+  TEST_CASE("nonnegative_int::operator*(nonnegative_int)") {
+    nonnegative_int result = nonnegative_int{2} * nonnegative_int{3};
+    nonnegative_int correct = nonnegative_int{6};
+
+    CHECK(result == correct);
+  }
+
+  TEST_CASE("nonnegative_int::operator*=(nonnegative_int)") {
+    nonnegative_int result = nonnegative_int{3};
+    result *= nonnegative_int{6};
+
+    nonnegative_int correct = nonnegative_int{18};
+
+    CHECK(result == correct);
+  }
+
+  TEST_CASE("nonnegative_int::operator/(nonnegative_int)") {
+    nonnegative_int result = nonnegative_int{5} / nonnegative_int{2};
+    nonnegative_int correct = nonnegative_int{2};
+
+    CHECK(result == correct);
+  }
+
+  TEST_CASE("nonnegative_int::operator/=(nonnegative_int)") {
+    nonnegative_int result = nonnegative_int{13};
+    result /= nonnegative_int{3};
+
+    nonnegative_int correct = nonnegative_int{4};
+
+    CHECK(result == correct);
+  }
+
+  TEST_CASE("nonnegative_int::operator%(nonnegative_int)") {
+    nonnegative_int result = nonnegative_int{5} % nonnegative_int{2};
+    nonnegative_int correct = nonnegative_int{1};
+
+    CHECK(result == correct);
+  }
+
TEST_CASE("nonnegative_int::operator%=(nonnegative_int)") { + nonnegative_int result = nonnegative_int{15}; + result %= nonnegative_int{4}; + + nonnegative_int correct = nonnegative_int{3}; + + CHECK(result == correct); } TEST_CASE("adl_serializer") { diff --git a/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc b/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc new file mode 100644 index 0000000000..db8fca295e --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/nonnegative_range.cc @@ -0,0 +1,42 @@ +#include "utils/nonnegative_int/nonnegative_range.h" +#include "test/utils/doctest/fmt/vector.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("nonnegative_range(nonnegative_int)") { + SUBCASE("bound is greater than zero") { + std::vector result = + nonnegative_range(nonnegative_int{3}); + std::vector correct = { + nonnegative_int{0}, + nonnegative_int{1}, + nonnegative_int{2}, + }; + + CHECK(result == correct); + } + + SUBCASE("bound is zero") { + std::vector result = + nonnegative_range(nonnegative_int{0}); + std::vector correct = {}; + + CHECK(result == correct); + } + } + + TEST_CASE("nonnegative_range(nonnegative_int, nonnegative_int, int)") { + std::vector result = nonnegative_range( + /*start=*/nonnegative_int{7}, + /*end=*/nonnegative_int{3}, + /*step=*/-2); + std::vector correct = { + nonnegative_int{7}, + nonnegative_int{5}, + }; + + CHECK(result == correct); + } +} diff --git a/lib/utils/test/src/utils/nonnegative_int/num_elements.cc b/lib/utils/test/src/utils/nonnegative_int/num_elements.cc new file mode 100644 index 0000000000..0878be0410 --- /dev/null +++ b/lib/utils/test/src/utils/nonnegative_int/num_elements.cc @@ -0,0 +1,15 @@ +#include "utils/nonnegative_int/num_elements.h" +#include + +using namespace ::FlexFlow; + +TEST_SUITE(FF_TEST_SUITE) { + TEST_CASE("num_elements") { + std::vector input = {-1, 3, 3, 1}; + + nonnegative_int result = num_elements(input); + nonnegative_int correct = nonnegative_int{4}; + + CHECK(result == correct); + } +} diff --git a/lib/utils/test/src/utils/random_utils.cc b/lib/utils/test/src/utils/random_utils.cc index 8e7d22138f..fdc48a64dd 100644 --- a/lib/utils/test/src/utils/random_utils.cc +++ b/lib/utils/test/src/utils/random_utils.cc @@ -29,7 +29,7 @@ TEST_SUITE(FF_TEST_SUITE) { SUBCASE("correct distribution") { auto check_probabilities = [](std::vector const &values, std::vector const &weights) { - int num_iterations = 10'000; + nonnegative_int num_iterations = 10'000_n; std::vector trials = repeat( num_iterations, [&]() { return select_random(values, weights); }); @@ -39,8 +39,8 @@ TEST_SUITE(FF_TEST_SUITE) { float expectedProbability = w / sum(weights); int num_occurrences = filter(trials, [&](int c) { return (c == v); }).size(); - float observedProbability = - static_cast(num_occurrences) / num_iterations; + float observedProbability = static_cast(num_occurrences) / + num_iterations.unwrap_nonnegative(); CHECK(observedProbability == doctest::Approx(expectedProbability).epsilon(0.01f)); }