diff --git a/book/api/metrics-generated.md b/book/api/metrics-generated.md index c4b9586052..8573c01013 100644 --- a/book/api/metrics-generated.md +++ b/book/api/metrics-generated.md @@ -43,7 +43,11 @@ | net_​sent_​bytes | `counter` | Total bytes sent (including IP, UDP headers). | | net_​xdp_​rx_​dropped_​ring_​full | `counter` | Number of packets dropped because the RX completion queue was empty. This is only reported for net tile 0, since the measurement is across all RX queues. | | net_​xdp_​rx_​dropped_​other | `counter` | Number of packets dropped for other reasons. This is only reported for net tile 0, since the measurement is across all RX queues. | -| net_​tx_​dropped | `counter` | Number of packets dropped because the TX submission queue was empty. This is reported for all net tiles. | +| net_​tx_​dropped_​interface_​no_​xdp | `counter` | Number of packets dropped because the output interface has no XDP socket. | +| net_​tx_​dropped_​full_​lo | `counter` | Number of packets dropped because the TX ring was full (loopback) | +| net_​tx_​dropped_​full_​main | `counter` | Number of packets dropped because the TX ring was full (main) | +| net_​tx_​dropped_​route_​fail | `counter` | Number of packets dropped due to routing failure | +| net_​tx_​dropped_​neighbor_​fail | `counter` | Number of packets dropped due to missing neighbor/ARP info | ## Quic Tile | Metric | Type | Description | diff --git a/src/app/fdctl/run/tiles/fd_net.c b/src/app/fdctl/run/tiles/fd_net.c index 90d2224078..c7853be9ec 100644 --- a/src/app/fdctl/run/tiles/fd_net.c +++ b/src/app/fdctl/run/tiles/fd_net.c @@ -38,14 +38,16 @@ #include "generated/net_seccomp.h" #include "../../../../disco/metrics/fd_metrics.h" +#include "../../../../disco/netlink/fd_netlink_tile.h" /* neigh4_solicit */ -#include "../../../../waltz/xdp/fd_xdp.h" +#include "../../../../waltz/xdp/fd_xdp_redirect_user.h" /* fd_xsk_activate */ #include "../../../../waltz/xdp/fd_xdp1.h" #include 
"../../../../waltz/xdp/fd_xsk_aio_private.h" #include "../../../../waltz/xdp/fd_xsk_private.h" +#include "../../../../waltz/ip/fd_fib4.h" +#include "../../../../waltz/neigh/fd_neigh4_map.h" #include "../../../../util/log/fd_dtrace.h" #include "../../../../util/net/fd_ip4.h" -#include "../../../../waltz/ip/fd_ip.h" #include #include @@ -84,6 +86,8 @@ typedef struct { uchar frame[ FD_NET_MTU ]; + uint lo_if_idx; + uint main_if_idx; uint src_ip_addr; uchar src_mac_addr[6]; @@ -102,11 +106,17 @@ typedef struct { fd_net_out_ctx_t gossip_out[1]; fd_net_out_ctx_t repair_out[1]; - fd_ip_t * ip; - long ip_next_upd; + fd_fib4_t const * fib_local; + fd_fib4_t const * fib_main; + fd_neigh4_hmap_t neigh4[1]; + fd_netlink_neigh4_solicit_link_t neigh4_solicit[1]; struct { - ulong tx_dropped_cnt; + ulong tx_drops_iface; /* dropped due to route targeting wrong interface */ + ulong tx_drops_xsk_lo; /* dropped due to TX backpressure (loopback) */ + ulong tx_drops_xsk_main; /* dropped due to TX backpressure (main interface) */ + ulong tx_drops_route; /* dropped due to routing table */ + ulong tx_drops_neigh; /* dropped due to neighbor not resolved */ } metrics; } fd_net_ctx_t; @@ -128,7 +138,6 @@ scratch_footprint( fd_topo_tile_t const * tile ) { l = FD_LAYOUT_APPEND( l, fd_xsk_align(), fd_xsk_footprint( FD_NET_MTU, tile->net.xdp_rx_queue_size, tile->net.xdp_rx_queue_size, tile->net.xdp_tx_queue_size, tile->net.xdp_tx_queue_size ) ); l = FD_LAYOUT_APPEND( l, fd_xsk_aio_align(), fd_xsk_aio_footprint( tile->net.xdp_tx_queue_size, tile->net.xdp_aio_depth ) ); } - l = FD_LAYOUT_APPEND( l, fd_ip_align(), fd_ip_footprint( 0UL, 0UL ) ); return FD_LAYOUT_FINI( l, scratch_align() ); } @@ -261,7 +270,11 @@ metrics_write( fd_net_ctx_t * ctx ) { FD_MCNT_SET( NET, SENT_PACKETS, tx_cnt ); FD_MCNT_SET( NET, SENT_BYTES, tx_sz ); - FD_MCNT_SET( NET, TX_DROPPED, ctx->metrics.tx_dropped_cnt ); + FD_MCNT_SET( NET, TX_DROPPED_INTERFACE_NO_XDP, ctx->metrics.tx_drops_iface ); + FD_MCNT_SET( NET, 
TX_DROPPED_FULL_LO, ctx->metrics.tx_drops_xsk_lo ); + FD_MCNT_SET( NET, TX_DROPPED_FULL_MAIN, ctx->metrics.tx_drops_xsk_main ); + FD_MCNT_SET( NET, TX_DROPPED_ROUTE_FAIL, ctx->metrics.tx_drops_route ); + FD_MCNT_SET( NET, TX_DROPPED_NEIGHBOR_FAIL, ctx->metrics.tx_drops_neigh ); } static void @@ -321,44 +334,23 @@ poll_xdp_statistics( fd_net_ctx_t * ctx ) { static void during_housekeeping( fd_net_ctx_t * ctx ) { - long now = fd_log_wallclock(); - if( FD_UNLIKELY( now > ctx->ip_next_upd ) ) { - ctx->ip_next_upd = now + (long)60e9; - fd_ip_arp_fetch( ctx->ip ); - fd_ip_route_fetch( ctx->ip ); - } - /* Only net tile 0 polls the statistics, as they are retrieved for the XDP socket which is shared across all net tiles. */ if( FD_LIKELY( !ctx->round_robin_id ) ) poll_xdp_statistics( ctx ); } -FD_FN_PURE static int -route_loopback( uint tile_ip_addr, - ulong sig ) { - return fd_disco_netmux_sig_dst_ip( sig )==FD_IP4_ADDR(127,0,0,1) || - fd_disco_netmux_sig_dst_ip( sig )==tile_ip_addr; -} - static inline int before_frag( fd_net_ctx_t * ctx, ulong in_idx, ulong seq, ulong sig ) { - (void)in_idx; + (void)ctx; (void)seq; (void)in_idx; ulong proto = fd_disco_netmux_sig_proto( sig ); if( FD_UNLIKELY( proto!=DST_PROTO_OUTGOING ) ) return 1; - /* Round robin by sequence number for now, QUIC should be modified to - echo the net tile index back so we can transmit on the same queue. - - 127.0.0.1 packets for localhost must go out on net tile 0 which - owns the loopback interface XSK, which only has 1 queue. 
*/ - - if( FD_UNLIKELY( route_loopback( ctx->src_ip_addr, sig ) ) ) return ctx->round_robin_id != 0UL; - else return (seq % ctx->round_robin_cnt) != ctx->round_robin_id; + return 0; } static inline void @@ -379,31 +371,6 @@ during_frag( fd_net_ctx_t * ctx, fd_memcpy( ctx->frame, src, sz ); // TODO: Change xsk_aio interface to eliminate this copy } -static void -send_arp_probe( fd_net_ctx_t * ctx, - uint dst_ip_addr, - uint ifindex ) { - uchar arp_buf[FD_IP_ARP_SZ]; - ulong arp_len = 0UL; - - uint src_ip_addr = ctx->src_ip_addr; - uchar * src_mac_addr = ctx->src_mac_addr; - - /* prepare arp table */ - int arp_table_rtn = fd_ip_update_arp_table( ctx->ip, dst_ip_addr, ifindex ); - - if( FD_UNLIKELY( arp_table_rtn == FD_IP_SUCCESS ) ) { - /* generate a probe */ - fd_ip_arp_gen_arp_probe( arp_buf, FD_IP_ARP_SZ, &arp_len, dst_ip_addr, fd_uint_bswap( src_ip_addr ), src_mac_addr ); - - /* send the probe */ - fd_aio_pkt_info_t aio_buf = { .buf = arp_buf, .buf_sz = (ushort)arp_len }; - ulong sent_cnt; - int aio_err = ctx->tx->send_func( ctx->xsk_aio[ 0 ], &aio_buf, 1, &sent_cnt, 1 ); - ctx->metrics.tx_dropped_cnt += aio_err!=FD_AIO_SUCCESS; - } -} - static void after_frag( fd_net_ctx_t * ctx, ulong in_idx, @@ -412,87 +379,64 @@ after_frag( fd_net_ctx_t * ctx, ulong sz, ulong tsorig, fd_stem_context_t * stem ) { - (void)in_idx; - (void)seq; - (void)sig; - (void)tsorig; - (void)stem; + (void)in_idx; (void)seq; (void)sig; (void)tsorig; (void)stem; fd_aio_pkt_info_t aio_buf = { .buf = ctx->frame, .buf_sz = (ushort)sz }; - if( FD_UNLIKELY( route_loopback( ctx->src_ip_addr, sig ) ) ) { + uint dst_ip = fd_uint_bswap( fd_disco_netmux_sig_dst_ip( sig ) ); + ulong sent_cnt; /* dummy */ + + /* Routing */ + + fd_fib4_hop_t hop[2] = {0}; + fd_fib4_lookup( ctx->fib_local, hop+0, dst_ip, 0UL ); + fd_fib4_lookup( ctx->fib_main, hop+1, dst_ip, 0UL ); + fd_fib4_hop_t const * next_hop = fd_fib4_hop_or( hop+0, hop+1 ); + + /* FIXME This assumes that loopback is always interface index 1 */ + 
uint if_idx = next_hop->if_idx; + if( next_hop->rtype==FD_FIB4_RTYPE_LOCAL ) if_idx = 1; + + if( if_idx==1 ) { /* Set Ethernet src and dst address to 00:00:00:00:00:00 */ memset( ctx->frame, 0, 12UL ); - ulong sent_cnt; int aio_err = ctx->lo_tx->send_func( ctx->xsk_aio[ 1 ], &aio_buf, 1, &sent_cnt, 1 ); - ctx->metrics.tx_dropped_cnt += aio_err!=FD_AIO_SUCCESS; - } else { - /* extract dst ip */ - uint dst_ip = fd_uint_bswap( fd_disco_netmux_sig_dst_ip( sig ) ); - - uint next_hop = 0U; - uchar dst_mac[6] = {0}; - uint if_idx = 0; - - /* route the packet */ - /* - * determine the destination: - * same host - * same subnet - * other - * determine the next hop - * localhost - * gateway - * subnet local host - * determine the mac address of the next hop address - * and the local ipv4 and eth addresses */ - int rtn = fd_ip_route_ip_addr( dst_mac, &next_hop, &if_idx, ctx->ip, dst_ip ); - if( FD_UNLIKELY( rtn == FD_IP_PROBE_RQD ) ) { - /* another fd_net instance might have already resolved this address - so simply try another fetch */ - fd_ip_arp_fetch( ctx->ip ); - rtn = fd_ip_route_ip_addr( dst_mac, &next_hop, &if_idx, ctx->ip, dst_ip ); - } + ctx->metrics.tx_drops_xsk_lo += aio_err!=FD_AIO_SUCCESS; /* failed send on the loopback XSK; published as TX_DROPPED_FULL_LO */ + return; + } - long now; - switch( rtn ) { - case FD_IP_PROBE_RQD: - /* TODO possibly buffer some data while waiting for ARPs to complete */ - /* TODO rate limit ARPs */ - /* TODO add caching of ip_dst -> routing info */ - send_arp_probe( ctx, next_hop, if_idx ); - - /* refresh tables */ - now = fd_log_wallclock(); - ctx->ip_next_upd = now + (long)200e3; - break; - case FD_IP_NO_ROUTE: - /* cannot make progress here */ - break; - case FD_IP_SUCCESS: - /* set destination mac address */ - memcpy( ctx->frame, dst_mac, 6UL ); - - /* set source mac address */ - memcpy( ctx->frame + 6UL, ctx->src_mac_addr, 6UL ); - - ulong sent_cnt; - int aio_err = ctx->tx->send_func( ctx->xsk_aio[ 0 ], &aio_buf, 1, &sent_cnt, 1 ); - ctx->metrics.tx_dropped_cnt += aio_err!=FD_AIO_SUCCESS; - break; - 
case FD_IP_RETRY: - /* refresh tables */ - now = fd_log_wallclock(); - ctx->ip_next_upd = now + (long)200e3; - /* TODO consider buffering */ - break; - case FD_IP_MULTICAST: - case FD_IP_BROADCAST: - default: - /* should not occur in current use cases */ - break; - } + if( FD_UNLIKELY( if_idx!=ctx->xsk[ 0 ]->if_idx ) ) { + ctx->metrics.tx_drops_iface++; + return; + } + + if( FD_UNLIKELY( next_hop->rtype!=FD_FIB4_RTYPE_UNICAST ) ) { + ctx->metrics.tx_drops_route++; + return; } + + /* Neighbor resolve */ + + fd_neigh4_hmap_query_t neigh_query[1]; + int neigh_res = fd_neigh4_hmap_query_try( ctx->neigh4, &next_hop->ip4_gw, NULL, neigh_query, 0 ); + if( FD_UNLIKELY( neigh_res!=FD_MAP_SUCCESS ) ) { + /* Neighbor not found */ + fd_netlink_neigh4_solicit( ctx->neigh4_solicit, next_hop->ip4_gw, fd_frag_meta_ts_comp( fd_tickcount() ) ); + ctx->metrics.tx_drops_neigh++; + return; + } + fd_neigh4_entry_t const * neigh = fd_neigh4_hmap_query_ele_const( neigh_query ); + + memcpy( ctx->frame+0, neigh->mac_addr, 6 ); + memcpy( ctx->frame+6, ctx->src_mac_addr, 6 ); + + if( FD_UNLIKELY( fd_neigh4_hmap_query_test( neigh_query ) ) ) { + ctx->metrics.tx_drops_neigh++; + return; + } + + int aio_err = ctx->tx->send_func( ctx->xsk_aio[ 0 ], &aio_buf, 1, &sent_cnt, 1 ); + ctx->metrics.tx_drops_xsk_main += aio_err!=FD_AIO_SUCCESS; } /* init_link_session is part of privileged_init. 
It only runs on net @@ -580,6 +524,7 @@ privileged_init( fd_topo_t * topo, uint lo_idx = if_nametoindex( "lo" ); if( FD_UNLIKELY( !lo_idx ) ) FD_LOG_ERR(( "if_nametoindex(lo) failed" )); + ctx->lo_if_idx = lo_idx; fd_xdp_fds_t lo_fds = fd_xdp_install( lo_idx, tile->net.src_ip_addr, @@ -606,8 +551,6 @@ privileged_init( fd_topo_t * topo, tile->net.xdp_aio_depth ), ctx->xsk[ 1 ] ); if( FD_UNLIKELY( !ctx->xsk_aio[ 1 ] ) ) FD_LOG_ERR(( "fd_xsk_aio_new failed" )); } - - ctx->ip = fd_ip_join( fd_ip_new( FD_SCRATCH_ALLOC_APPEND( l, fd_ip_align(), fd_ip_footprint( 0UL, 0UL ) ), 0UL, 0UL ) ); } static void @@ -635,8 +578,6 @@ unprivileged_init( fd_topo_t * topo, ctx->src_ip_addr = tile->net.src_ip_addr; memcpy( ctx->src_mac_addr, tile->net.src_mac_addr, 6UL ); - ctx->metrics.tx_dropped_cnt = 0UL; - ctx->shred_listen_port = tile->net.shred_listen_port; ctx->quic_transaction_listen_port = tile->net.quic_transaction_listen_port; ctx->legacy_transaction_listen_port = tile->net.legacy_transaction_listen_port; @@ -701,6 +642,11 @@ unprivileged_init( fd_topo_t * topo, ctx->repair_out->mem = topo->workspaces[ topo->objs[ repair_out->dcache_obj_id ].wksp_id ].wksp; ctx->repair_out->wmark = fd_dcache_compact_wmark ( ctx->repair_out->mem, repair_out->dcache, repair_out->mtu ); ctx->repair_out->chunk = ctx->repair_out->chunk0; + } else if( strcmp( out_link->name, "net_netlink" ) == 0 ) { + fd_topo_link_t * netlink_out = out_link; + ctx->neigh4_solicit->mcache = netlink_out->mcache; + ctx->neigh4_solicit->depth = fd_mcache_depth( ctx->neigh4_solicit->mcache ); + ctx->neigh4_solicit->seq = fd_mcache_seq_query( fd_mcache_seq_laddr( ctx->neigh4_solicit->mcache ) ); } else { FD_LOG_ERR(( "unrecognized out link `%s`", out_link->name )); } @@ -719,6 +665,8 @@ unprivileged_init( fd_topo_t * topo, FD_LOG_ERR(( "repair intake port set but no out link was found" )); } else if( FD_UNLIKELY( ctx->repair_serve_listen_port!=0 && ctx->repair_out->mcache==NULL ) ) { FD_LOG_ERR(( "repair serve listen 
port set but no out link was found" )); + } else if( FD_UNLIKELY( ctx->neigh4_solicit->mcache==NULL ) ) { + FD_LOG_ERR(( "netlink out link not found" )); } ulong scratch_top = FD_SCRATCH_ALLOC_FINI( l, 1UL ); @@ -739,8 +687,7 @@ populate_allowed_seccomp( fd_topo_t const * topo, two "allow" FD arguments to the net policy, so we just make them both the same. */ int allow_fd2 = ctx->xsk_cnt>1UL ? ctx->xsk[ 1 ]->xsk_fd : ctx->xsk[ 0 ]->xsk_fd; FD_TEST( ctx->xsk[ 0 ]->xsk_fd >= 0 && allow_fd2 >= 0 ); - int netlink_fd = fd_ip_netlink_get( ctx->ip )->fd; - populate_sock_filter_policy_net( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ]->xsk_fd, (uint)allow_fd2, (uint)netlink_fd ); + populate_sock_filter_policy_net( out_cnt, out, (uint)fd_log_private_logfile_fd(), (uint)ctx->xsk[ 0 ]->xsk_fd, (uint)allow_fd2 ); return sock_filter_policy_net_instr_cnt; } @@ -753,14 +700,13 @@ populate_allowed_fds( fd_topo_t const * topo, FD_SCRATCH_ALLOC_INIT( l, scratch ); fd_net_ctx_t * ctx = FD_SCRATCH_ALLOC_APPEND( l, alignof( fd_net_ctx_t ), sizeof( fd_net_ctx_t ) ); - if( FD_UNLIKELY( out_fds_cnt<7UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt )); + if( FD_UNLIKELY( out_fds_cnt<6UL ) ) FD_LOG_ERR(( "out_fds_cnt %lu", out_fds_cnt )); ulong out_cnt = 0UL; out_fds[ out_cnt++ ] = 2; /* stderr */ if( FD_LIKELY( -1!=fd_log_private_logfile_fd() ) ) out_fds[ out_cnt++ ] = fd_log_private_logfile_fd(); /* logfile */ - out_fds[ out_cnt++ ] = fd_ip_netlink_get( ctx->ip )->fd; out_fds[ out_cnt++ ] = ctx->xsk[ 0 ]->xsk_fd; out_fds[ out_cnt++ ] = ctx->prog_link_fds[ 0 ]; diff --git a/src/app/fdctl/run/tiles/generated/net_seccomp.h b/src/app/fdctl/run/tiles/generated/net_seccomp.h index 01d1b8d7c9..f4e5cbf0f0 100644 --- a/src/app/fdctl/run/tiles/generated/net_seccomp.h +++ b/src/app/fdctl/run/tiles/generated/net_seccomp.h @@ -21,126 +21,92 @@ #else # error "Target architecture is unsupported by seccomp." 
#endif -static const unsigned int sock_filter_policy_net_instr_cnt = 60; +static const unsigned int sock_filter_policy_net_instr_cnt = 43; -static void populate_sock_filter_policy_net( ulong out_cnt, struct sock_filter * out, unsigned int logfile_fd, unsigned int xsk_fd, unsigned int lo_xsk_fd, unsigned int netlink_fd) { - FD_TEST( out_cnt >= 60 ); - struct sock_filter filter[60] = { +static void populate_sock_filter_policy_net( ulong out_cnt, struct sock_filter * out, unsigned int logfile_fd, unsigned int xsk_fd, unsigned int lo_xsk_fd) { + FD_TEST( out_cnt >= 43 ); + struct sock_filter filter[43] = { /* Check: Jump to RET_KILL_PROCESS if the script's arch != the runtime arch */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, arch ) ) ), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 56 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, ARCH_NR, 0, /* RET_KILL_PROCESS */ 39 ), /* loading syscall number in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, ( offsetof( struct seccomp_data, nr ) ) ), /* allow write based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_write, /* check_write */ 6, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_write, /* check_write */ 5, 0 ), /* allow fsync based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_fsync, /* check_fsync */ 9, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_fsync, /* check_fsync */ 8, 0 ), /* allow sendto based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_sendto, /* check_sendto */ 10, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_sendto, /* check_sendto */ 9, 0 ), /* allow recvmsg based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvmsg, /* check_recvmsg */ 31, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvmsg, /* check_recvmsg */ 22, 0 ), /* allow getsockopt based on expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_getsockopt, /* check_getsockopt */ 36, 0 ), - /* allow recvfrom based on 
expression */ - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_recvfrom, /* check_recvfrom */ 41, 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SYS_getsockopt, /* check_getsockopt */ 27, 0 ), /* none of the syscalls matched */ - { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 48 }, + { BPF_JMP | BPF_JA, 0, 0, /* RET_KILL_PROCESS */ 32 }, // check_write: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 47, /* lbl_1 */ 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 2, /* RET_ALLOW */ 31, /* lbl_1 */ 0 ), // lbl_1: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 45, /* RET_KILL_PROCESS */ 44 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 29, /* RET_KILL_PROCESS */ 28 ), // check_fsync: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 43, /* RET_KILL_PROCESS */ 42 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, logfile_fd, /* RET_ALLOW */ 27, /* RET_KILL_PROCESS */ 26 ), // check_sendto: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_3 */ 2, /* lbl_4 */ 0 ), -// lbl_4: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_2 */ 2, /* lbl_3 */ 0 ), +// lbl_3: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, lo_xsk_fd, /* lbl_3 */ 0, /* lbl_2 */ 10 ), -// lbl_3: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, lo_xsk_fd, /* lbl_2 */ 0, /* RET_KILL_PROCESS */ 22 ), +// lbl_2: /* load syscall argument 1 in accumulator */ BPF_STMT( 
BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_5 */ 0, /* lbl_2 */ 8 ), -// lbl_5: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_4 */ 0, /* RET_KILL_PROCESS */ 20 ), +// lbl_4: /* load syscall argument 2 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_6 */ 0, /* lbl_2 */ 6 ), -// lbl_6: - /* load syscall argument 3 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* lbl_7 */ 0, /* lbl_2 */ 4 ), -// lbl_7: - /* load syscall argument 4 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_8 */ 0, /* lbl_2 */ 2 ), -// lbl_8: - /* load syscall argument 5 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 29, /* lbl_2 */ 0 ), -// lbl_2: - /* load syscall argument 0 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, netlink_fd, /* lbl_9 */ 0, /* RET_KILL_PROCESS */ 26 ), -// lbl_9: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_5 */ 0, /* RET_KILL_PROCESS */ 18 ), +// lbl_5: /* load syscall argument 3 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_10 */ 0, /* RET_KILL_PROCESS */ 24 ), -// lbl_10: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* lbl_6 */ 0, /* RET_KILL_PROCESS */ 16 ), +// lbl_6: /* load syscall argument 4 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_11 */ 0, /* RET_KILL_PROCESS */ 22 ), -// lbl_11: + BPF_JUMP( 
BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_7 */ 0, /* RET_KILL_PROCESS */ 14 ), +// lbl_7: /* load syscall argument 5 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 21, /* RET_KILL_PROCESS */ 20 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 13, /* RET_KILL_PROCESS */ 12 ), // check_recvmsg: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_12 */ 2, /* lbl_13 */ 0 ), -// lbl_13: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_8 */ 2, /* lbl_9 */ 0 ), +// lbl_9: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, lo_xsk_fd, /* lbl_12 */ 0, /* RET_KILL_PROCESS */ 16 ), -// lbl_12: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, lo_xsk_fd, /* lbl_8 */ 0, /* RET_KILL_PROCESS */ 8 ), +// lbl_8: /* load syscall argument 2 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* RET_ALLOW */ 15, /* RET_KILL_PROCESS */ 14 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, MSG_DONTWAIT, /* RET_ALLOW */ 7, /* RET_KILL_PROCESS */ 6 ), // check_getsockopt: /* load syscall argument 0 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_14 */ 0, /* RET_KILL_PROCESS */ 12 ), -// lbl_14: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, xsk_fd, /* lbl_10 */ 0, /* RET_KILL_PROCESS */ 4 ), +// lbl_10: /* load syscall argument 1 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[1])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SOL_XDP, /* lbl_15 */ 0, /* RET_KILL_PROCESS */ 10 ), -// lbl_15: + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, SOL_XDP, /* 
lbl_11 */ 0, /* RET_KILL_PROCESS */ 2 ), +// lbl_11: /* load syscall argument 2 in accumulator */ BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[2])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, XDP_STATISTICS, /* RET_ALLOW */ 9, /* RET_KILL_PROCESS */ 8 ), -// check_recvfrom: - /* load syscall argument 0 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[0])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, netlink_fd, /* lbl_16 */ 0, /* RET_KILL_PROCESS */ 6 ), -// lbl_16: - /* load syscall argument 3 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[3])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_17 */ 0, /* RET_KILL_PROCESS */ 4 ), -// lbl_17: - /* load syscall argument 4 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[4])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* lbl_18 */ 0, /* RET_KILL_PROCESS */ 2 ), -// lbl_18: - /* load syscall argument 5 in accumulator */ - BPF_STMT( BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, args[5])), - BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, 0, /* RET_ALLOW */ 1, /* RET_KILL_PROCESS */ 0 ), + BPF_JUMP( BPF_JMP | BPF_JEQ | BPF_K, XDP_STATISTICS, /* RET_ALLOW */ 1, /* RET_KILL_PROCESS */ 0 ), // RET_KILL_PROCESS: /* KILL_PROCESS is placed before ALLOW since it's the fallthrough case. */ BPF_STMT( BPF_RET | BPF_K, SECCOMP_RET_KILL_PROCESS ), diff --git a/src/app/fdctl/run/tiles/net.seccomppolicy b/src/app/fdctl/run/tiles/net.seccomppolicy index 0223559a93..90868f3df5 100644 --- a/src/app/fdctl/run/tiles/net.seccomppolicy +++ b/src/app/fdctl/run/tiles/net.seccomppolicy @@ -7,12 +7,7 @@ # lo_xsk_fd: This is the file descriptor for the kernel XDP socket we # created for the loopback network device. This is currently # needed because Solana sends packets to itself on loopback. 
-# -# netlink_fd: The NET tile uses netlink to talk to the kernel and fetch -# the ARP table to fill in ethernet headers on outgoing -# packets. This is the file descriptor of the netlink -# socket. -unsigned int logfile_fd, unsigned int xsk_fd, unsigned int lo_xsk_fd, unsigned int netlink_fd +unsigned int logfile_fd, unsigned int xsk_fd, unsigned int lo_xsk_fd # logging: all log messages are written to a file and/or pipe # @@ -41,25 +36,12 @@ fsync: (eq (arg 0) logfile_fd) # arg 0 is the file descriptor of the XSK that the kernel should poll # for entries. There are two possible XSKs, since we can send packets # on a network device or the loopback device. - -# netlink: send netlink messages to kernel requesting ARP tables -# -# Because Firedancer does kernel bypass for networking, it needs to populate -# the ethernet headers on UDP packets it sends. This requires consulting the -# kernel ARP table, which is done by calling sendto() on a special netlink -# socket. -# -# arg 0 is the netlink file descriptor to send packets to. sendto: (or (and (or (eq (arg 0) xsk_fd) (eq (arg 0) lo_xsk_fd)) (eq (arg 1) 0) (eq (arg 2) 0) (eq (arg 3) MSG_DONTWAIT) (eq (arg 4) 0) - (eq (arg 5) 0)) - (and (eq (arg 0) netlink_fd) - (eq (arg 3) 0) - (eq (arg 4) 0) (eq (arg 5) 0))) # XDP: We use XDP_USE_NEED_WAKEUP so we must notify the kernel when @@ -75,7 +57,7 @@ sendto: (or (and (or (eq (arg 0) xsk_fd) # arg 0 is the file descriptor of the XSK that the kernel should poll # for entries. There are two possible XSKs, since we can receive # packets on a network device or the loopback device. 
-recvmsg: (and (or (eq (arg 0) xsk_fd) +recvmsg: (and (or (eq (arg 0) xsk_fd) (eq (arg 0) lo_xsk_fd)) (eq (arg 2) MSG_DONTWAIT)) @@ -84,15 +66,3 @@ recvmsg: (and (or (eq (arg 0) xsk_fd) getsockopt: (and (eq (arg 0) xsk_fd) (eq (arg 1) SOL_XDP) (eq (arg 2) XDP_STATISTICS)) - -# netlink: receive netlink messages from kernel for ARP tables -# -# This is the receive side of the above sendto() call, so when we send -# a request for ARP entries, we need to call recvfrom() on the socket -# to read the kernel response. -# -# arg 0 is the netlink file descriptor to receive packets from. -recvfrom: (and (eq (arg 0) netlink_fd) - (eq (arg 3) 0) - (eq (arg 4) 0) - (eq (arg 5) 0)) diff --git a/src/disco/metrics/generated/fd_metrics_net.c b/src/disco/metrics/generated/fd_metrics_net.c index 42bea1485b..5096713d4f 100644 --- a/src/disco/metrics/generated/fd_metrics_net.c +++ b/src/disco/metrics/generated/fd_metrics_net.c @@ -8,5 +8,9 @@ const fd_metrics_meta_t FD_METRICS_NET[FD_METRICS_NET_TOTAL] = { DECLARE_METRIC( NET_SENT_BYTES, COUNTER ), DECLARE_METRIC( NET_XDP_RX_DROPPED_RING_FULL, COUNTER ), DECLARE_METRIC( NET_XDP_RX_DROPPED_OTHER, COUNTER ), - DECLARE_METRIC( NET_TX_DROPPED, COUNTER ), + DECLARE_METRIC( NET_TX_DROPPED_INTERFACE_NO_XDP, COUNTER ), + DECLARE_METRIC( NET_TX_DROPPED_FULL_LO, COUNTER ), + DECLARE_METRIC( NET_TX_DROPPED_FULL_MAIN, COUNTER ), + DECLARE_METRIC( NET_TX_DROPPED_ROUTE_FAIL, COUNTER ), + DECLARE_METRIC( NET_TX_DROPPED_NEIGHBOR_FAIL, COUNTER ), }; diff --git a/src/disco/metrics/generated/fd_metrics_net.h b/src/disco/metrics/generated/fd_metrics_net.h index 82ce610ba3..24a014b1ac 100644 --- a/src/disco/metrics/generated/fd_metrics_net.h +++ b/src/disco/metrics/generated/fd_metrics_net.h @@ -39,11 +39,35 @@ #define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_DESC "Number of packets dropped for other reasons. This is only reported for net tile 0, since the measurement is across all RX queues." 
#define FD_METRICS_COUNTER_NET_XDP_RX_DROPPED_OTHER_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_COUNTER_NET_TX_DROPPED_OFF (22UL) -#define FD_METRICS_COUNTER_NET_TX_DROPPED_NAME "net_tx_dropped" -#define FD_METRICS_COUNTER_NET_TX_DROPPED_TYPE (FD_METRICS_TYPE_COUNTER) -#define FD_METRICS_COUNTER_NET_TX_DROPPED_DESC "Number of packets dropped because the TX submission queue was empty. This is reported for all net tiles." -#define FD_METRICS_COUNTER_NET_TX_DROPPED_CVT (FD_METRICS_CONVERTER_NONE) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_INTERFACE_NO_XDP_OFF (22UL) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_INTERFACE_NO_XDP_NAME "net_tx_dropped_interface_no_xdp" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_INTERFACE_NO_XDP_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_INTERFACE_NO_XDP_DESC "Number of packets dropped because the output interface has no XDP socket." +#define FD_METRICS_COUNTER_NET_TX_DROPPED_INTERFACE_NO_XDP_CVT (FD_METRICS_CONVERTER_NONE) -#define FD_METRICS_NET_TOTAL (7UL) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_LO_OFF (23UL) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_LO_NAME "net_tx_dropped_full_lo" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_LO_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_LO_DESC "Number of packets dropped because the TX ring was full (loopback)" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_LO_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_MAIN_OFF (24UL) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_MAIN_NAME "net_tx_dropped_full_main" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_MAIN_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_MAIN_DESC "Number of packets dropped because the TX ring was full (main)" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_FULL_MAIN_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_NET_TX_DROPPED_ROUTE_FAIL_OFF (25UL) +#define 
FD_METRICS_COUNTER_NET_TX_DROPPED_ROUTE_FAIL_NAME "net_tx_dropped_route_fail" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_ROUTE_FAIL_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_ROUTE_FAIL_DESC "Number of packets dropped due to routing failure" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_ROUTE_FAIL_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_COUNTER_NET_TX_DROPPED_NEIGHBOR_FAIL_OFF (26UL) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_NEIGHBOR_FAIL_NAME "net_tx_dropped_neighbor_fail" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_NEIGHBOR_FAIL_TYPE (FD_METRICS_TYPE_COUNTER) +#define FD_METRICS_COUNTER_NET_TX_DROPPED_NEIGHBOR_FAIL_DESC "Number of packets dropped due to missing neighbor/ARP info" +#define FD_METRICS_COUNTER_NET_TX_DROPPED_NEIGHBOR_FAIL_CVT (FD_METRICS_CONVERTER_NONE) + +#define FD_METRICS_NET_TOTAL (11UL) extern const fd_metrics_meta_t FD_METRICS_NET[FD_METRICS_NET_TOTAL]; diff --git a/src/disco/metrics/metrics.xml b/src/disco/metrics/metrics.xml index 0b9b95e98d..8e5fe170da 100644 --- a/src/disco/metrics/metrics.xml +++ b/src/disco/metrics/metrics.xml @@ -58,7 +58,11 @@ metric introduced. - + + + + +