diff --git a/vpr/src/place/net_cost_handler.cpp b/vpr/src/place/net_cost_handler.cpp
index 7cbfed53c53..3e12ef446ba 100644
--- a/vpr/src/place/net_cost_handler.cpp
+++ b/vpr/src/place/net_cost_handler.cpp
@@ -112,6 +112,8 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
     , placer_opts_(placer_opts) {
     const int num_layers = g_vpr_ctx.device().grid.get_num_layers();
 
+    is_multi_layer_ = num_layers > 1;
+
     // Either 3D BB or per layer BB data structure are used, not both.
     if (cube_bb_) {
         ts_bb_edge_new_.resize(num_nets, t_bb());
@@ -145,10 +147,11 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
      * been recomputed.
      */
     bb_update_status_.resize(num_nets, NetUpdateState::NOT_UPDATED_YET);
 
-    alloc_and_load_chan_w_factors_for_place_cost_(placer_opts_.place_cost_exp);
+    alloc_and_load_chan_w_factors_for_place_cost_();
 }
 
-void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_cost_exp) {
+void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
+    const double place_cost_exp = static_cast<double>(placer_opts_.place_cost_exp);
     auto& device_ctx = g_vpr_ctx.device();
     const int grid_height = device_ctx.grid.height();
@@ -190,7 +193,7 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_c
             }
             chanx_place_cost_fac_[high][low] = (high - low + 1.) / chanx_place_cost_fac_[high][low];
-            chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], (double)place_cost_exp);
+            chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], place_cost_exp);
         }
     }
 
@@ -220,16 +223,16 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_c
             }
             chany_place_cost_fac_[high][low] = (high - low + 1.) / chany_place_cost_fac_[high][low];
-            chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], (double)place_cost_exp);
+            chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], place_cost_exp);
         }
     }
 
-    if (device_ctx.grid.get_num_layers() > 1) {
-        alloc_and_load_for_fast_vertical_cost_update_(place_cost_exp);
+    if (is_multi_layer_) {
+        alloc_and_load_for_fast_vertical_cost_update_();
     }
 }
 
-void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp) {
+void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
     const auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;
 
@@ -237,16 +240,35 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_c
     const size_t grid_width = device_ctx.grid.width();
 
-    chanz_place_cost_fac_ = vtr::NdMatrix<float, 4>({grid_width, grid_height, grid_width, grid_height}, 0.);
+    acc_tile_num_inter_die_conn_ = vtr::NdMatrix<float, 2>({grid_width, grid_height}, 0.);
+
+    vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);
 
-    vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);
+    /*
+     * Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
+     * in the device. We count all these edges, regardless of which layers they connect. Then we divide by
+     * the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
+     * what we do for the horizontal and vertical channels where we assume the channel width doesn't change
+     * along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
+     * if someday we have architectures with widely varying connectivity between different layers in a stack.
+     */
+
+    /*
+     * To calculate the cumulative number of inter-die connections, we first need the number of inter-die
+     * connections at each (x,y) location. We get these counts by iterating over the RR graph, so that this also
+     * works when the RR graph is read from a file rather than generated from the architecture description. Once
+     * tile_num_inter_die_conn is populated, we can fill acc_tile_num_inter_die_conn_. First, we populate its
+     * first row and column. Then, for every other location, we add the number of inter-die connections at that
+     * location to the accumulated counts of the locations below it and to its left. Since the accumulated count
+     * of the location to the lower-left is included in both of those, it is added twice and must be subtracted
+     * once.
+     */
     for (const auto& src_rr_node : rr_graph.nodes()) {
-        for (const auto& rr_edge_idx : rr_graph.configurable_edges(src_rr_node)) {
+        for (const auto& rr_edge_idx : rr_graph.edges(src_rr_node)) {
             const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
             if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
                 // We assume that the nodes driving the inter-layer connection or being driven by it
-                // are not streched across multiple tiles
+                // are not stretched across multiple tiles
                 int src_x = rr_graph.node_xhigh(src_rr_node);
                 int src_y = rr_graph.node_yhigh(src_rr_node);
                 VTR_ASSERT(rr_graph.node_xlow(src_rr_node) == src_x && rr_graph.node_ylow(src_rr_node) == src_y);
@@ -254,37 +276,34 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_c
                 tile_num_inter_die_conn[src_x][src_y]++;
             }
         }
+    }
 
-        for (const auto& rr_edge_idx : rr_graph.non_configurable_edges(src_rr_node)) {
-            const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
-            if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
-                int src_x = rr_graph.node_xhigh(src_rr_node);
-                VTR_ASSERT(rr_graph.node_xlow(src_rr_node) == src_x && rr_graph.node_xlow(src_rr_node) == src_x);
-                int src_y = rr_graph.node_yhigh(src_rr_node);
-                VTR_ASSERT(rr_graph.node_ylow(src_rr_node) == src_y && rr_graph.node_ylow(src_rr_node) == src_y);
-                tile_num_inter_die_conn[src_x][src_y]++;
-            }
+    int num_layers = device_ctx.grid.get_num_layers();
+    for (size_t x = 0; x < device_ctx.grid.width(); x++) {
+        for (size_t y = 0; y < device_ctx.grid.height(); y++) {
+            tile_num_inter_die_conn[x][y] /= (num_layers - 1);
         }
     }
 
-    for (int x_high = 0; x_high < (int)device_ctx.grid.width(); x_high++) {
-        for (int y_high = 0; y_high < (int)device_ctx.grid.height(); y_high++) {
-            for (int x_low = 0; x_low <= x_high; x_low++) {
-                for (int y_low = 0; y_low <= y_high; y_low++) {
-                    int num_inter_die_conn = 0;
-                    for (int x = x_low; x <= x_high; x++) {
-                        for (int y = y_low; y <= y_high; y++) {
-                            num_inter_die_conn += tile_num_inter_die_conn[x][y];
-                        }
-                    }
-                    int seen_num_tiles = (x_high - x_low + 1) * (y_high - y_low + 1);
-                    chanz_place_cost_fac_[x_high][y_high][x_low][y_low] = seen_num_tiles / static_cast<double>(num_inter_die_conn);
-
-                    chanz_place_cost_fac_[x_high][y_high][x_low][y_low] = pow(
-                        (double)chanz_place_cost_fac_[x_high][y_high][x_low][y_low],
-                        (double)place_cost_exp);
-                }
-            }
+    // Step 2: Calculate prefix sum of the inter-die connectivity up to and including the channel at (x, y).
+    acc_tile_num_inter_die_conn_[0][0] = tile_num_inter_die_conn[0][0];
+    // Initialize the first row and column
+    for (size_t x = 1; x < device_ctx.grid.width(); x++) {
+        acc_tile_num_inter_die_conn_[x][0] = acc_tile_num_inter_die_conn_[x-1][0] +
+                                             tile_num_inter_die_conn[x][0];
+    }
+
+    for (size_t y = 1; y < device_ctx.grid.height(); y++) {
+        acc_tile_num_inter_die_conn_[0][y] = acc_tile_num_inter_die_conn_[0][y-1] +
+                                             tile_num_inter_die_conn[0][y];
+    }
+
+    for (size_t x_high = 1; x_high < device_ctx.grid.width(); x_high++) {
+        for (size_t y_high = 1; y_high < device_ctx.grid.height(); y_high++) {
+            acc_tile_num_inter_die_conn_[x_high][y_high] = acc_tile_num_inter_die_conn_[x_high-1][y_high] +
+                                                           acc_tile_num_inter_die_conn_[x_high][y_high-1] +
+                                                           tile_num_inter_die_conn[x_high][y_high] -
+                                                           acc_tile_num_inter_die_conn_[x_high-1][y_high-1];
         }
     }
 }
@@ -818,7 +837,7 @@ void NetCostHandler::update_bb_(ClusterNetId net_id,
     }
 
     /* Now account for the layer motion. */
-    if (num_layers > 1) {
+    if (is_multi_layer_) {
         /* We need to update it only if multiple layers are available */
         for (int layer_num = 0; layer_num < num_layers; layer_num++) {
             num_sink_pin_layer_new[layer_num] = curr_num_sink_pin_layer[layer_num];
@@ -1402,8 +1421,6 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
     const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : placer_state_.move().bb_coords[net_id];
 
-    const bool is_multi_layer = (g_vpr_ctx.device().grid.get_num_layers() > 1);
-
     double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
 
     /* Could insert a check for xmin == xmax. In that case, assume *
@@ -1420,12 +1437,14 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
      */
     double ncost;
-    ncost = (bb.xmax - bb.xmin + 1) * crossing * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
-    ncost += (bb.ymax - bb.ymin + 1) * crossing * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
-    if (is_multi_layer) {
-        ncost += (bb.layer_max - bb.layer_min) * crossing * chanz_place_cost_fac_[bb.xmax][bb.ymax][bb.xmin][bb.ymin];
+    ncost = (bb.xmax - bb.xmin + 1) * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
+    ncost += (bb.ymax - bb.ymin + 1) * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
+    if (is_multi_layer_) {
+        ncost += (bb.layer_max - bb.layer_min) * get_chanz_cost_factor_(bb);
     }
+    ncost *= crossing;
+
     return ncost;
 }
@@ -1526,6 +1545,39 @@ double NetCostHandler::get_net_wirelength_from_layer_bb_(ClusterNetId net_id) {
     return ncost;
 }
 
+float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
+    float place_cost_exp = placer_opts_.place_cost_exp;
+
+    int num_inter_dir_conn;
+
+    if (bb.xmin == 0 && bb.ymin == 0) {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax];
+    } else if (bb.xmin == 0) {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1];
+    } else if (bb.ymin == 0) {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax];
+    } else {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1] +
+                             acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymin-1];
+    }
+
+    float z_cost_factor;
+    if (num_inter_dir_conn == 0) {
+        return 1.0f;
+    } else {
+        int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
+        z_cost_factor = bb_num_tiles / static_cast<float>(num_inter_dir_conn);
+        z_cost_factor = pow((double)z_cost_factor, (double)place_cost_exp);
+    }
+
+    return z_cost_factor;
+
+}
+
 double NetCostHandler::recompute_bb_cost_() {
     double cost = 0;
 
diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h
index 3048b7637ea..fd6c7a46767 100644
--- a/vpr/src/place/net_cost_handler.h
+++ b/vpr/src/place/net_cost_handler.h
@@ -123,6 +123,8 @@ class NetCostHandler {
   private:
     ///@brief Specifies whether the bounding box is computed using cube method or per-layer method.
     bool cube_bb_ = false;
+    ///@brief Determines whether the FPGA has multiple dies (layers)
+    bool is_multi_layer_ = false;
     ///@brief A reference to the placer's state to be updated by this object.
     PlacerState& placer_state_;
     ///@brief Contains some parameter that determine how the placement cost is computed.
@@ -196,12 +198,14 @@ class NetCostHandler {
    vtr::NdOffsetMatrix<float, 2> chanx_place_cost_fac_; // [-1...device_ctx.grid.width()-1]
    vtr::NdOffsetMatrix<float, 2> chany_place_cost_fac_; // [-1...device_ctx.grid.height()-1]
    /**
-     @brief This data structure functions similarly to the matrices described above
-     but is applied to 3D connections linking different FPGA layers. It is used in the
-     placement cost function calculation, where the height of the bounding box is divided
-     by the average number of inter-die connections within the bounding box.
+     * @brief The matrix below is used to calculate a chanz_place_cost_fac based on the average channel width in
+     * the cross-die-layer direction over a 2D (x,y) region. We don't assume the inter-die connectivity is the same
+     * at all (x,y) locations, so we can't compute the full chanz_place_cost_fac for all possible
+     * (xlow,ylow)(xhigh,yhigh) combinations without a 4D array, which would be too big: O(n^2) in circuit size.
+     * Instead we compute a prefix sum that stores the number of inter-die connections per layer from (x=0,y=0)
+     * to (x,y). Given this, we can compute the average number of inter-die connections over a (xlow,ylow) to
+     * (xhigh,yhigh) region in O(1) (by adding and subtracting 4 entries).
     */
-    vtr::NdMatrix<float, 4> chanz_place_cost_fac_; // [0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1][0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1]
+    vtr::NdMatrix<float, 2> acc_tile_num_inter_die_conn_; // [0..grid_width-1][0..grid_height-1]
 
   private:
 
@@ -250,23 +254,17 @@ class NetCostHandler {
     * have to bother calling this routine; when using the cost function described above, however, you must always
     * call this routine before you do any placement cost determination. The place_cost_exp factor specifies to
     * what power the width of the channel should be taken -- larger numbers make narrower channels more expensive.
-     *
-     * @param place_cost_exp It is an exponent to which you take the average inverse channel capacity;
-     * a higher value would favour wider channels more over narrower channels during placement (usually we use 1).
     */
-    void alloc_and_load_chan_w_factors_for_place_cost_(float place_cost_exp);
+    void alloc_and_load_chan_w_factors_for_place_cost_();
 
    /**
-     * @brief Allocates and loads the chanz_place_cost_fac array with the inverse of
-     * the average number of inter-die connections between [subhigh] and [sublow].
+     * @brief Allocates and loads acc_tile_num_inter_die_conn_ which contains the cumulative number of inter-die
+     * connections.
     *
    * @details This is only useful for multi-die FPGAs. The place_cost_exp factor specifies to
    * what power the average number of inter-die connections should be taken -- larger numbers make narrower channels more expensive.
-     *
-     * @param place_cost_exp It is an exponent to which you take the average number of inter-die connections;
-     * a higher value would favour areas with more inter-die connections over areas with less of those during placement (usually we use 1).
     */
-    void alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp);
+    void alloc_and_load_for_fast_vertical_cost_update_();
 
    /**
     * @brief Calculate the new connection delay and timing cost of all the
@@ -511,4 +509,16 @@ class NetCostHandler {
     */
    double get_net_wirelength_from_layer_bb_(ClusterNetId net_id);
 
+    /**
+     * @brief Calculate the chanz cost factor based on the inverse of the average number of inter-die connections
+     * in the given bounding box. This cost factor increases the placement cost for blocks that require inter-layer
+     * connections in areas with, on average, fewer inter-die connections. If inter-die connections are evenly
+     * distributed across tiles, the cost factor will be the same for all bounding boxes, but it will still
+     * weight z-directed vs. x- and y-directed connections appropriately.
+     *
+     * @param bounding_box Bounding box of the net whose chanz cost factor is to be calculated
+     * @return ChanZ cost factor
+     */
+    float get_chanz_cost_factor_(const t_bb& bounding_box);
+
 };
diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt
index 905d2f3ba19..291897f65f4 100644
--- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt
+++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt
@@ -1,2 +1,2 @@
 arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time
-k6_frac_N10_40nm.xml stereovision3.v common 1.44 vpr 57.96 MiB -1 -1 0.42 25620 5 0.11 -1 -1 36164 -1 -1 7 10 -1 -1 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 59352 10 2 181 183 1 37 19 5 5 25 clb auto 19.7 MiB 0.05 108 58.0 MiB 0.01 0.00 1.93928 -79.1821 -1.93928 1.93928 0.02 0.000104618 8.1277e-05 0.00610425 0.00496002 24 129 10 485046 377258 28445.8 1137.83 0.13 0.0425172 0.0362734 109 8 74 103 1476 611 2.06938 2.06938 -89.2305 -2.06938 0 0 37126.9 1485.07 0.02 0.01 0.00966903 0.00924379
+k6_frac_N10_40nm.xml stereovision3.v common 1.44 vpr 57.96 MiB -1 -1 0.42 25620 5 0.11 -1 -1 36164 -1 -1 7 10 -1 -1 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 59352 10 2 181 183 1 37 19 5 5 25 clb auto 19.7 MiB 0.05 108 58.0 MiB 0.01 0.00 1.93928 -79.1821 -1.93928 1.93928 0.02 0.000104618 8.1277e-05 0.00610425 0.00496002 26 129 10 485046 377258 34134.96 1365.396 0.13 0.0425172 0.0362734 109 8 74 103 1476 611 2.06938 2.06938 -89.2305 -2.06938 0 0 37126.9 1485.07 0.02 0.01 0.00966903 0.00924379
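A minimal standalone sketch of the 2D prefix-sum ("summed-area table") scheme that alloc_and_load_for_fast_vertical_cost_update_() builds and get_chanz_cost_factor_() queries in the patch above. It is illustrative only: the plain std::vector grid and the build_prefix_sum/region_sum names exist solely in this example and are not part of VPR.

// Illustrative sketch of the prefix-sum scheme (hypothetical helper names, not VPR code).
#include <cassert>
#include <cstdio>
#include <vector>

using Grid = std::vector<std::vector<float>>;

// acc[x][y] holds the sum of conn[0..x][0..y], analogous to acc_tile_num_inter_die_conn_.
static Grid build_prefix_sum(const Grid& conn) {
    size_t w = conn.size();
    size_t h = conn[0].size();
    Grid acc(w, std::vector<float>(h, 0.f));

    acc[0][0] = conn[0][0];
    // The first row and first column accumulate along a single axis.
    for (size_t x = 1; x < w; x++)
        acc[x][0] = acc[x - 1][0] + conn[x][0];
    for (size_t y = 1; y < h; y++)
        acc[0][y] = acc[0][y - 1] + conn[0][y];

    // Interior: add the accumulated counts from the left and from below, then subtract
    // the lower-left corner that was counted twice (inclusion-exclusion).
    for (size_t x = 1; x < w; x++)
        for (size_t y = 1; y < h; y++)
            acc[x][y] = acc[x - 1][y] + acc[x][y - 1] + conn[x][y] - acc[x - 1][y - 1];
    return acc;
}

// Sum of conn over the inclusive rectangle [xlo..xhi] x [ylo..yhi] in O(1),
// mirroring the four boundary cases in get_chanz_cost_factor_().
static float region_sum(const Grid& acc, int xlo, int ylo, int xhi, int yhi) {
    float s = acc[xhi][yhi];
    if (xlo > 0) s -= acc[xlo - 1][yhi];
    if (ylo > 0) s -= acc[xhi][ylo - 1];
    if (xlo > 0 && ylo > 0) s += acc[xlo - 1][ylo - 1];
    return s;
}

int main() {
    // 3x3 grid where every tile has 2 cross-layer edges (already averaged over layer pairs).
    Grid conn(3, std::vector<float>(3, 2.f));
    Grid acc = build_prefix_sum(conn);
    // The 2x2 region spanning (1,1)-(2,2) covers 4 tiles, i.e. 8 connections.
    assert(region_sum(acc, 1, 1, 2, 2) == 8.f);
    std::printf("region sum = %g\n", region_sum(acc, 1, 1, 2, 2));
    return 0;
}

Storing only this 2D table keeps the memory cost proportional to grid_width * grid_height, whereas the 4D chanz_place_cost_fac_ it replaces grew with the square of the tile count.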
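The region sum is then turned into the z-direction cost factor as an inverse average raised to place_cost_exp, and the factor scales the layer span of the bounding box in get_net_cube_bb_cost_(). A small worked example, using made-up numbers rather than values from any real architecture:

// Worked example of the z cost factor (illustrative constants only).
#include <cmath>
#include <cstdio>

int main() {
    // Bounding box spanning 4x3 tiles and 2 dies, with 18 inter-die connections inside it.
    int bb_num_tiles = 4 * 3;
    float num_inter_die_conn = 18.f;
    float place_cost_exp = 1.f;  // usually 1, per the doc comment above

    // Inverse of the average per-tile connectivity: fewer inter-die connections per tile
    // means a larger factor, making it costlier to stretch a net across layers there.
    float z_cost_factor = std::pow(bb_num_tiles / num_inter_die_conn, place_cost_exp); // 12/18 = 0.667

    double crossing = 1.0;   // stands in for wirelength_crossing_count(...)
    int layer_span = 2 - 1;  // bb.layer_max - bb.layer_min
    double z_term = layer_span * z_cost_factor * crossing;
    std::printf("z_cost_factor = %g, z term of ncost = %g\n", z_cost_factor, z_term);
    return 0;
}

When the bounding box contains no inter-die connections at all, get_chanz_cost_factor_() returns 1.0 instead of dividing by zero.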