diff --git a/vpr/src/place/net_cost_handler.cpp b/vpr/src/place/net_cost_handler.cpp
index 7cbfed53c53..3e12ef446ba 100644
--- a/vpr/src/place/net_cost_handler.cpp
+++ b/vpr/src/place/net_cost_handler.cpp
@@ -112,6 +112,8 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
     , placer_opts_(placer_opts) {
     const int num_layers = g_vpr_ctx.device().grid.get_num_layers();
 
+    is_multi_layer_ = num_layers > 1;
+
     // Either 3D BB or per layer BB data structure are used, not both.
     if (cube_bb_) {
         ts_bb_edge_new_.resize(num_nets, t_bb());
@@ -145,10 +147,11 @@ NetCostHandler::NetCostHandler(const t_placer_opts& placer_opts,
      * been recomputed.
      */
     bb_update_status_.resize(num_nets, NetUpdateState::NOT_UPDATED_YET);
 
-    alloc_and_load_chan_w_factors_for_place_cost_(placer_opts_.place_cost_exp);
+    alloc_and_load_chan_w_factors_for_place_cost_();
 }
 
-void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_cost_exp) {
+void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_() {
+    const double place_cost_exp = static_cast<double>(placer_opts_.place_cost_exp);
     auto& device_ctx = g_vpr_ctx.device();
     const int grid_height = device_ctx.grid.height();
@@ -190,7 +193,7 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_c
             }
             chanx_place_cost_fac_[high][low] = (high - low + 1.) / chanx_place_cost_fac_[high][low];
-            chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], (double)place_cost_exp);
+            chanx_place_cost_fac_[high][low] = pow((double)chanx_place_cost_fac_[high][low], place_cost_exp);
         }
     }
 
@@ -220,16 +223,16 @@ void NetCostHandler::alloc_and_load_chan_w_factors_for_place_cost_(float place_c
             }
             chany_place_cost_fac_[high][low] = (high - low + 1.) / chany_place_cost_fac_[high][low];
-            chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], (double)place_cost_exp);
+            chany_place_cost_fac_[high][low] = pow((double)chany_place_cost_fac_[high][low], place_cost_exp);
         }
     }
 
-    if (device_ctx.grid.get_num_layers() > 1) {
-        alloc_and_load_for_fast_vertical_cost_update_(place_cost_exp);
+    if (is_multi_layer_) {
+        alloc_and_load_for_fast_vertical_cost_update_();
     }
 }
 
-void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp) {
+void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_() {
     const auto& device_ctx = g_vpr_ctx.device();
     const auto& rr_graph = device_ctx.rr_graph;
 
@@ -237,16 +240,35 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_c
     const size_t grid_width = device_ctx.grid.width();
 
-    chanz_place_cost_fac_ = vtr::NdMatrix<float, 4>({grid_width, grid_height, grid_width, grid_height}, 0.);
+    acc_tile_num_inter_die_conn_ = vtr::NdMatrix<float, 2>({grid_width, grid_height}, 0.);
+
+    vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);
 
-    vtr::NdMatrix<float, 2> tile_num_inter_die_conn({grid_width, grid_height}, 0.);
+    /*
+     * Step 1: iterate over the rr-graph, recording how many edges go between layers at each (x,y) location
+     * in the device. We count all these edges, regardless of which layers they connect. Then we divide by
+     * the number of layers - 1 to get the average cross-layer edge count per (x,y) location -- this mirrors
+     * what we do for the horizontal and vertical channels where we assume the channel width doesn't change
+     * along the length of the channel. It lets us be more memory-efficient for 3D devices, and could be revisited
+     * if someday we have architectures with widely varying connectivity between different layers in a stack.
+     */
+
+    /*
+     * To calculate the cumulative number of inter-die connections, we first need the number of inter-die
+     * connections at each (x,y) location. We get these counts by iterating over the RR graph, so that this also
+     * works when the RR graph is read from a file rather than generated from the architecture description. Once
+     * tile_num_inter_die_conn is populated, we can fill acc_tile_num_inter_die_conn_. First, we populate its
+     * first row and column. Then, for every other location, we add the number of inter-die connections at that
+     * location to the accumulated counts of the locations below it and to its left. Since the accumulated count
+     * of the location to the lower-left is included in both of those, it is added twice and must be subtracted
+     * once.
+     */
     for (const auto& src_rr_node : rr_graph.nodes()) {
-        for (const auto& rr_edge_idx : rr_graph.configurable_edges(src_rr_node)) {
+        for (const auto& rr_edge_idx : rr_graph.edges(src_rr_node)) {
             const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
             if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
                 // We assume that the nodes driving the inter-layer connection or being driven by it
-                // are not streched across multiple tiles
+                // are not stretched across multiple tiles
                 int src_x = rr_graph.node_xhigh(src_rr_node);
                 int src_y = rr_graph.node_yhigh(src_rr_node);
                 VTR_ASSERT(rr_graph.node_xlow(src_rr_node) == src_x && rr_graph.node_ylow(src_rr_node) == src_y);
@@ -254,37 +276,34 @@ void NetCostHandler::alloc_and_load_for_fast_vertical_cost_update_(float place_c
                 tile_num_inter_die_conn[src_x][src_y]++;
             }
         }
+    }
 
-        for (const auto& rr_edge_idx : rr_graph.non_configurable_edges(src_rr_node)) {
-            const auto& sink_rr_node = rr_graph.edge_sink_node(src_rr_node, rr_edge_idx);
-            if (rr_graph.node_layer(src_rr_node) != rr_graph.node_layer(sink_rr_node)) {
-                int src_x = rr_graph.node_xhigh(src_rr_node);
-                VTR_ASSERT(rr_graph.node_xlow(src_rr_node) == src_x && rr_graph.node_xlow(src_rr_node) == src_x);
-                int src_y = rr_graph.node_yhigh(src_rr_node);
-                VTR_ASSERT(rr_graph.node_ylow(src_rr_node) == src_y && rr_graph.node_ylow(src_rr_node) == src_y);
-                tile_num_inter_die_conn[src_x][src_y]++;
-            }
+    int num_layers = device_ctx.grid.get_num_layers();
+    for (size_t x = 0; x < device_ctx.grid.width(); x++) {
+        for (size_t y = 0; y < device_ctx.grid.height(); y++) {
+            tile_num_inter_die_conn[x][y] /= (num_layers - 1);
         }
     }
 
-    for (int x_high = 0; x_high < (int)device_ctx.grid.width(); x_high++) {
-        for (int y_high = 0; y_high < (int)device_ctx.grid.height(); y_high++) {
-            for (int x_low = 0; x_low <= x_high; x_low++) {
-                for (int y_low = 0; y_low <= y_high; y_low++) {
-                    int num_inter_die_conn = 0;
-                    for (int x = x_low; x <= x_high; x++) {
-                        for (int y = y_low; y <= y_high; y++) {
-                            num_inter_die_conn += tile_num_inter_die_conn[x][y];
-                        }
-                    }
-                    int seen_num_tiles = (x_high - x_low + 1) * (y_high - y_low + 1);
-                    chanz_place_cost_fac_[x_high][y_high][x_low][y_low] = seen_num_tiles / static_cast<double>(num_inter_die_conn);
-
-                    chanz_place_cost_fac_[x_high][y_high][x_low][y_low] = pow(
-                        (double)chanz_place_cost_fac_[x_high][y_high][x_low][y_low],
-                        (double)place_cost_exp);
-                }
-            }
+    // Step 2: Calculate prefix sum of the inter-die connectivity up to and including the channel at (x, y).
+    acc_tile_num_inter_die_conn_[0][0] = tile_num_inter_die_conn[0][0];
+    // Initialize the first row and column
+    for (size_t x = 1; x < device_ctx.grid.width(); x++) {
+        acc_tile_num_inter_die_conn_[x][0] = acc_tile_num_inter_die_conn_[x-1][0] +
+                                             tile_num_inter_die_conn[x][0];
+    }
+
+    for (size_t y = 1; y < device_ctx.grid.height(); y++) {
+        acc_tile_num_inter_die_conn_[0][y] = acc_tile_num_inter_die_conn_[0][y-1] +
+                                             tile_num_inter_die_conn[0][y];
+    }
+
+    for (size_t x_high = 1; x_high < device_ctx.grid.width(); x_high++) {
+        for (size_t y_high = 1; y_high < device_ctx.grid.height(); y_high++) {
+            acc_tile_num_inter_die_conn_[x_high][y_high] = acc_tile_num_inter_die_conn_[x_high-1][y_high] +
+                                                           acc_tile_num_inter_die_conn_[x_high][y_high-1] +
+                                                           tile_num_inter_die_conn[x_high][y_high] -
+                                                           acc_tile_num_inter_die_conn_[x_high-1][y_high-1];
         }
     }
 }
@@ -818,7 +837,7 @@ void NetCostHandler::update_bb_(ClusterNetId net_id,
     }
 
     /* Now account for the layer motion. */
-    if (num_layers > 1) {
+    if (is_multi_layer_) {
         /* We need to update it only if multiple layers are available */
         for (int layer_num = 0; layer_num < num_layers; layer_num++) {
             num_sink_pin_layer_new[layer_num] = curr_num_sink_pin_layer[layer_num];
@@ -1402,8 +1421,6 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
     const t_bb& bb = use_ts ? ts_bb_coord_new_[net_id] : placer_state_.move().bb_coords[net_id];
 
-    const bool is_multi_layer = (g_vpr_ctx.device().grid.get_num_layers() > 1);
-
     double crossing = wirelength_crossing_count(cluster_ctx.clb_nlist.net_pins(net_id).size());
 
     /* Could insert a check for xmin == xmax. In that case, assume *
@@ -1420,12 +1437,14 @@ double NetCostHandler::get_net_cube_bb_cost_(ClusterNetId net_id, bool use_ts) {
      */
     double ncost;
-    ncost = (bb.xmax - bb.xmin + 1) * crossing * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
-    ncost += (bb.ymax - bb.ymin + 1) * crossing * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
-    if (is_multi_layer) {
-        ncost += (bb.layer_max - bb.layer_min) * crossing * chanz_place_cost_fac_[bb.xmax][bb.ymax][bb.xmin][bb.ymin];
+    ncost = (bb.xmax - bb.xmin + 1) * chanx_place_cost_fac_[bb.ymax][bb.ymin - 1];
+    ncost += (bb.ymax - bb.ymin + 1) * chany_place_cost_fac_[bb.xmax][bb.xmin - 1];
+    if (is_multi_layer_) {
+        ncost += (bb.layer_max - bb.layer_min) * get_chanz_cost_factor_(bb);
     }
+    ncost *= crossing;
+
     return ncost;
 }
@@ -1526,6 +1545,39 @@ double NetCostHandler::get_net_wirelength_from_layer_bb_(ClusterNetId net_id) {
     return ncost;
 }
 
+float NetCostHandler::get_chanz_cost_factor_(const t_bb& bb) {
+    float place_cost_exp = placer_opts_.place_cost_exp;
+
+    int num_inter_dir_conn;
+
+    if (bb.xmin == 0 && bb.ymin == 0) {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax];
+    } else if (bb.xmin == 0) {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1];
+    } else if (bb.ymin == 0) {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax];
+    } else {
+        num_inter_dir_conn = acc_tile_num_inter_die_conn_[bb.xmax][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymax] -
+                             acc_tile_num_inter_die_conn_[bb.xmax][bb.ymin-1] +
+                             acc_tile_num_inter_die_conn_[bb.xmin-1][bb.ymin-1];
+    }
+
+    float z_cost_factor;
+    if (num_inter_dir_conn == 0) {
+        return 1.0f;
+    } else {
+        int bb_num_tiles = (bb.xmax - bb.xmin + 1) * (bb.ymax - bb.ymin + 1);
+        z_cost_factor = bb_num_tiles / static_cast<float>(num_inter_dir_conn);
+        z_cost_factor = pow((double)z_cost_factor, (double)place_cost_exp);
+    }
+
+    return z_cost_factor;
+
+}
+
 double NetCostHandler::recompute_bb_cost_() {
     double cost = 0;
 
diff --git a/vpr/src/place/net_cost_handler.h b/vpr/src/place/net_cost_handler.h
index 3048b7637ea..fd6c7a46767 100644
--- a/vpr/src/place/net_cost_handler.h
+++ b/vpr/src/place/net_cost_handler.h
@@ -123,6 +123,8 @@ class NetCostHandler {
   private:
     ///@brief Specifies whether the bounding box is computed using cube method or per-layer method.
     bool cube_bb_ = false;
+    ///@brief Determines whether the FPGA has multiple dies (layers)
+    bool is_multi_layer_ = false;
     ///@brief A reference to the placer's state to be updated by this object.
     PlacerState& placer_state_;
     ///@brief Contains some parameter that determine how the placement cost is computed.
@@ -196,12 +198,14 @@ class NetCostHandler {
    vtr::NdOffsetMatrix<float, 2> chanx_place_cost_fac_; // [-1...device_ctx.grid.width()-1]
    vtr::NdOffsetMatrix<float, 2> chany_place_cost_fac_; // [-1...device_ctx.grid.height()-1]
    /**
-     @brief This data structure functions similarly to the matrices described above
-     but is applied to 3D connections linking different FPGA layers. It is used in the
-     placement cost function calculation, where the height of the bounding box is divided
-     by the average number of inter-die connections within the bounding box.
+     * @brief The matrix below is used to calculate a chanz_place_cost_fac based on the average channel width in
+     * the cross-die-layer direction over a 2D (x,y) region. We don't assume the inter-die connectivity is the same
+     * at all (x,y) locations, so we can't compute the full chanz_place_cost_fac for all possible
+     * (xlow,ylow)(xhigh,yhigh) combinations without a 4D array, which would be too big: O(n^2) in circuit size.
+     * Instead we compute a prefix sum that stores the number of inter-die connections per layer from (x=0,y=0)
+     * to (x,y). Given this, we can compute the average number of inter-die connections over a (xlow,ylow) to
+     * (xhigh,yhigh) region in O(1) (by adding and subtracting 4 entries).
     */
-    vtr::NdMatrix<float, 4> chanz_place_cost_fac_; // [0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1][0...device_ctx.grid.width()-1][0...device_ctx.grid.height()-1]
+    vtr::NdMatrix<float, 2> acc_tile_num_inter_die_conn_; // [0..grid_width-1][0..grid_height-1]
 
   private:
 
@@ -250,23 +254,17 @@ class NetCostHandler {
     * have to bother calling this routine; when using the cost function described above, however, you must always
     * call this routine before you do any placement cost determination. The place_cost_exp factor specifies to
     * what power the width of the channel should be taken -- larger numbers make narrower channels more expensive.
-     *
-     * @param place_cost_exp It is an exponent to which you take the average inverse channel capacity;
-     * a higher value would favour wider channels more over narrower channels during placement (usually we use 1).
     */
-    void alloc_and_load_chan_w_factors_for_place_cost_(float place_cost_exp);
+    void alloc_and_load_chan_w_factors_for_place_cost_();
 
    /**
-     * @brief Allocates and loads the chanz_place_cost_fac array with the inverse of
-     * the average number of inter-die connections between [subhigh] and [sublow].
+     * @brief Allocates and loads acc_tile_num_inter_die_conn_ which contains the cumulative number of inter-die
+     * connections.
     *
    * @details This is only useful for multi-die FPGAs. The place_cost_exp factor specifies to
    * what power the average number of inter-die connections should be taken -- larger numbers make narrower channels more expensive.
-     *
-     * @param place_cost_exp It is an exponent to which you take the average number of inter-die connections;
-     * a higher value would favour areas with more inter-die connections over areas with less of those during placement (usually we use 1).
     */
-    void alloc_and_load_for_fast_vertical_cost_update_(float place_cost_exp);
+    void alloc_and_load_for_fast_vertical_cost_update_();
 
    /**
     * @brief Calculate the new connection delay and timing cost of all the
@@ -511,4 +509,16 @@ class NetCostHandler {
     */
    double get_net_wirelength_from_layer_bb_(ClusterNetId net_id);
 
+    /**
+     * @brief Calculate the chanz cost factor based on the inverse of the average number of inter-die connections
+     * in the given bounding box. This cost factor increases the placement cost for blocks that require inter-layer
+     * connections in areas with, on average, fewer inter-die connections. If inter-die connections are evenly
+     * distributed across tiles, the cost factor will be the same for all bounding boxes, but it will still
+     * weight z-directed vs. x- and y-directed connections appropriately.
+     *
+     * @param bounding_box Bounding box of the net whose chanz cost factor is to be calculated
+     * @return ChanZ cost factor
+     */
+    float get_chanz_cost_factor_(const t_bb& bounding_box);
+
 };
diff --git a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt
index 905d2f3ba19..291897f65f4 100644
--- a/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt
+++ b/vtr_flow/tasks/regression_tests/vtr_reg_strong/strong_manual_annealing/config/golden_results.txt
@@ -1,2 +1,2 @@
 arch circuit script_params vtr_flow_elapsed_time vtr_max_mem_stage vtr_max_mem error odin_synth_time max_odin_mem parmys_synth_time max_parmys_mem abc_depth abc_synth_time abc_cec_time abc_sec_time max_abc_mem ace_time max_ace_mem num_clb num_io num_memories num_mult vpr_status vpr_revision vpr_build_info vpr_compiler vpr_compiled hostname rundir max_vpr_mem num_primary_inputs num_primary_outputs num_pre_packed_nets num_pre_packed_blocks num_netlist_clocks num_post_packed_nets num_post_packed_blocks device_width device_height device_grid_tiles device_limiting_resources device_name pack_mem pack_time placed_wirelength_est place_mem place_time place_quench_time placed_CPD_est placed_setup_TNS_est placed_setup_WNS_est placed_geomean_nonvirtual_intradomain_critical_path_delay_est place_delay_matrix_lookup_time place_quench_timing_analysis_time place_quench_sta_time place_total_timing_analysis_time place_total_sta_time min_chan_width routed_wirelength min_chan_width_route_success_iteration logic_block_area_total logic_block_area_used min_chan_width_routing_area_total min_chan_width_routing_area_per_tile min_chan_width_route_time min_chan_width_total_timing_analysis_time min_chan_width_total_sta_time crit_path_routed_wirelength crit_path_route_success_iteration crit_path_total_nets_routed crit_path_total_connections_routed crit_path_total_heap_pushes crit_path_total_heap_pops critical_path_delay geomean_nonvirtual_intradomain_critical_path_delay setup_TNS setup_WNS hold_TNS hold_WNS crit_path_routing_area_total crit_path_routing_area_per_tile router_lookahead_computation_time crit_path_route_time crit_path_total_timing_analysis_time crit_path_total_sta_time
-k6_frac_N10_40nm.xml stereovision3.v common 1.44 vpr 57.96 MiB -1 -1 0.42 25620 5 0.11 -1 -1 36164 -1 -1 7 10 -1 -1 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 59352 10 2 181 183 1 37 19 5 5 25 clb auto 19.7 MiB 0.05 108 58.0 MiB 0.01 0.00 1.93928 -79.1821 -1.93928 1.93928 0.02 0.000104618 8.1277e-05 0.00610425 0.00496002 24 129 10 485046 377258 28445.8 1137.83 0.13 0.0425172 0.0362734 109 8 74 103 1476 611 2.06938 2.06938 -89.2305 -2.06938 0 0 37126.9 1485.07 0.02 0.01 0.00966903 0.00924379
+k6_frac_N10_40nm.xml stereovision3.v common 1.44 vpr 57.96 MiB -1 -1 0.42 25620 5 0.11 -1 -1 36164 -1 -1 7 10 -1 -1 success v8.0.0-6989-g4a9293e1e-dirty release IPO VTR_ASSERT_LEVEL=3 GNU 11.3.0 on Linux-5.15.0-58-generic x86_64 2023-02-04T01:37:29 dev /home/dev/Desktop/CAS-Atlantic/vtr-verilog-to-routing 59352 10 2 181 183 1 37 19 5 5 25 clb auto 19.7 MiB 0.05 108 58.0 MiB 0.01 0.00 1.93928 -79.1821 -1.93928 1.93928 0.02 0.000104618 8.1277e-05 0.00610425 0.00496002 26 129 10 485046 377258 34134.96 1365.396 0.13 0.0425172 0.0362734 109 8 74 103 1476 611 2.06938 2.06938 -89.2305 -2.06938 0 0 37126.9 1485.07 0.02 0.01 0.00966903 0.00924379
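A minimal standalone sketch of the 2D prefix-sum ("summed-area table") scheme that alloc_and_load_for_fast_vertical_cost_update_() builds and get_chanz_cost_factor_() queries in the patch above. It is illustrative only: the plain std::vector grid and the build_prefix_sum/region_sum names exist solely in this example and are not part of VPR.

// Illustrative sketch of the prefix-sum scheme (hypothetical helper names, not VPR code).
#include <cassert>
#include <cstdio>
#include <vector>

using Grid = std::vector<std::vector<float>>;

// acc[x][y] holds the sum of conn[0..x][0..y], analogous to acc_tile_num_inter_die_conn_.
static Grid build_prefix_sum(const Grid& conn) {
    size_t w = conn.size();
    size_t h = conn[0].size();
    Grid acc(w, std::vector<float>(h, 0.f));

    acc[0][0] = conn[0][0];
    // The first row and first column accumulate along a single axis.
    for (size_t x = 1; x < w; x++)
        acc[x][0] = acc[x - 1][0] + conn[x][0];
    for (size_t y = 1; y < h; y++)
        acc[0][y] = acc[0][y - 1] + conn[0][y];

    // Interior: add the accumulated counts from the left and from below, then subtract
    // the lower-left corner that was counted twice (inclusion-exclusion).
    for (size_t x = 1; x < w; x++)
        for (size_t y = 1; y < h; y++)
            acc[x][y] = acc[x - 1][y] + acc[x][y - 1] + conn[x][y] - acc[x - 1][y - 1];
    return acc;
}

// Sum of conn over the inclusive rectangle [xlo..xhi] x [ylo..yhi] in O(1),
// mirroring the four boundary cases in get_chanz_cost_factor_().
static float region_sum(const Grid& acc, int xlo, int ylo, int xhi, int yhi) {
    float s = acc[xhi][yhi];
    if (xlo > 0) s -= acc[xlo - 1][yhi];
    if (ylo > 0) s -= acc[xhi][ylo - 1];
    if (xlo > 0 && ylo > 0) s += acc[xlo - 1][ylo - 1];
    return s;
}

int main() {
    // 3x3 grid where every tile has 2 cross-layer edges (already averaged over layer pairs).
    Grid conn(3, std::vector<float>(3, 2.f));
    Grid acc = build_prefix_sum(conn);
    // The 2x2 region spanning (1,1)-(2,2) covers 4 tiles, i.e. 8 connections.
    assert(region_sum(acc, 1, 1, 2, 2) == 8.f);
    std::printf("region sum = %g\n", region_sum(acc, 1, 1, 2, 2));
    return 0;
}

Storing only this 2D table keeps the memory cost proportional to grid_width * grid_height, whereas the 4D chanz_place_cost_fac_ it replaces grew with the square of the tile count.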
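The region sum is then turned into the z-direction cost factor as an inverse average raised to place_cost_exp, and the factor scales the layer span of the bounding box in get_net_cube_bb_cost_(). A small worked example, using made-up numbers rather than values from any real architecture:

// Worked example of the z cost factor (illustrative constants only).
#include <cmath>
#include <cstdio>

int main() {
    // Bounding box spanning 4x3 tiles and 2 dies, with 18 inter-die connections inside it.
    int bb_num_tiles = 4 * 3;
    float num_inter_die_conn = 18.f;
    float place_cost_exp = 1.f;  // usually 1, per the doc comment above

    // Inverse of the average per-tile connectivity: fewer inter-die connections per tile
    // means a larger factor, making it costlier to stretch a net across layers there.
    float z_cost_factor = std::pow(bb_num_tiles / num_inter_die_conn, place_cost_exp); // 12/18 = 0.667

    double crossing = 1.0;   // stands in for wirelength_crossing_count(...)
    int layer_span = 2 - 1;  // bb.layer_max - bb.layer_min
    double z_term = layer_span * z_cost_factor * crossing;
    std::printf("z_cost_factor = %g, z term of ncost = %g\n", z_cost_factor, z_term);
    return 0;
}

When the bounding box contains no inter-die connections at all, get_chanz_cost_factor_() returns 1.0 instead of dividing by zero.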