Skip to content

Commit

Permalink
Netlink tile bug fixes
Browse files Browse the repository at this point in the history
- Fix overrun recovery
- Fix dispatching of neighbor solicit requests
- Log warning for neighbor discovery failures
  • Loading branch information
riptl authored and ripatel-fd committed Feb 13, 2025
1 parent 8c960ff commit 71be7d1
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 31 deletions.
17 changes: 16 additions & 1 deletion src/disco/netlink/fd_netlink_tile.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,6 +300,8 @@ during_housekeeping( fd_netlink_tile_ctx_t * ctx ) {
}
}

/* before_credit is called once per loop iteration */

static void
before_credit( fd_netlink_tile_ctx_t * ctx,
fd_stem_context_t * stem FD_PARAM_UNUSED,
Expand All @@ -320,6 +322,16 @@ before_credit( fd_netlink_tile_ctx_t * ctx,

}

/* after_poll_overrun is the stem AFTER_POLL_OVERRUN callback for the
   netlink tile.  It is called when fd_stem.c was overrun while
   checking for new fragments.  This typically happens when
   before_credit takes too long (e.g. we were in a blocking netlink
   read).  It is not called for overruns detected while a fragment is
   being handled in during_frag.

   ctx is the netlink tile context.  The only effect is to store -1L
   into ctx->idle_cnt; nothing is returned and no other state is
   touched here.

   NOTE(review): idle_cnt is read outside this excerpt.  Presumably
   the -1L value keeps the tile from treating the next iteration as
   idle (and thus from entering another long before_credit path)
   until the pending fragments have been drained -- confirm against
   before_credit. */

static void
after_poll_overrun( fd_netlink_tile_ctx_t * ctx ) {
ctx->idle_cnt = -1L;
}

/* after_frag handles a neighbor solicit request */

static void
Expand Down Expand Up @@ -375,7 +387,9 @@ after_frag( fd_netlink_tile_ctx_t * ctx,
/* Trigger neighbor solicit via netlink */

int netlink_res = fd_neigh4_netlink_solicit( ctx->nl_req, if_idx, ip4_addr );
if( FD_UNLIKELY( netlink_res<0 ) ) {
if( FD_UNLIKELY( netlink_res!=0 ) ) {
FD_LOG_WARNING(( "`ip neigh add " FD_IP4_ADDR_FMT " dev %u use nud incomplete` failed (%i-%s)",
FD_IP4_ADDR_FMT_ARGS( ip4_addr ), if_idx, netlink_res, fd_io_strerror( netlink_res ) ));
ctx->metrics.neigh_solicits_fails++;
return;
}
Expand All @@ -393,6 +407,7 @@ after_frag( fd_netlink_tile_ctx_t * ctx,
#define STEM_CALLBACK_METRICS_WRITE metrics_write
#define STEM_CALLBACK_DURING_HOUSEKEEPING during_housekeeping
#define STEM_CALLBACK_BEFORE_CREDIT before_credit
#define STEM_CALLBACK_AFTER_POLL_OVERRUN after_poll_overrun
#define STEM_CALLBACK_AFTER_FRAG after_frag

#include "../stem/fd_stem.c"
Expand Down
20 changes: 15 additions & 5 deletions src/disco/stem/fd_stem.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,10 @@
be trusted, except for seq which is read atomically.
AFTER_FRAG
Is is called immediately after the DURING_FRAG, along with an
additional check that the reader was not overrun while handling the
frag. If the reader was overrun, the frag is abandoned and this
function is not called. This callback is not invoked if the stem is
Is called immediately after the DURING_FRAG, along with an additional
check that the reader was not overrun while handling the frag. If
the reader was overrun, the frag is abandoned and this function is
not called. This callback is not invoked if the stem is
backpressured, as it would not read a frag in the first place.
in_idx will be the index of the in that the frag was received from.
You should not read the frag data directly here, as it might still
Expand All @@ -126,7 +126,12 @@
number of the fragment that was read from the input mcache. sig,
chunk, sz, and tsorig are the respective fields from the mcache
fragment that was received. If the producer is not respecting flow
control, these may be corrupt or torn and should not be trusted. */
control, these may be corrupt or torn and should not be trusted.
AFTER_POLL_OVERRUN
Is called when an overrun is detected while polling for new frags.
This callback is not called when an overrun is detected in
during_frag. */

#if !FD_HAS_SSE
#error "fd_stem requires SSE"
Expand Down Expand Up @@ -553,7 +558,12 @@ STEM_(run1)( ulong in_cnt,
finish_regime = &metric_regime_ticks[7];
this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_COUNT_OFF ]++;
this_in->accum[ FD_METRICS_COUNTER_LINK_OVERRUN_POLLING_FRAG_COUNT_OFF ] += (uint)(-diff);

#ifdef STEM_CALLBACK_AFTER_POLL_OVERRUN
STEM_CALLBACK_AFTER_POLL_OVERRUN( ctx );
#endif
}

/* Don't bother with spin as polling multiple locations */
*housekeeping_regime += housekeeping_ticks;
*prefrag_regime += prefrag_ticks;
Expand Down
44 changes: 19 additions & 25 deletions src/waltz/neigh/fd_neigh4_netlink.c
Original file line number Diff line number Diff line change
Expand Up @@ -162,7 +162,8 @@ fd_neigh4_netlink_solicit( fd_netlink_t * netlink,
request.nlh = (struct nlmsghdr) {
.nlmsg_type = RTM_NEWNEIGH,
.nlmsg_flags = NLM_F_REQUEST|NLM_F_ACK|NLM_F_EXCL|NLM_F_CREATE,
.nlmsg_seq = seq
.nlmsg_seq = seq,
.nlmsg_len = sizeof(request)
};
request.ndm = (struct ndmsg) {
.ndm_family = AF_INET,
Expand Down Expand Up @@ -192,32 +193,25 @@ fd_neigh4_netlink_solicit( fd_netlink_t * netlink,

/* Get error code */

for( ulong attempt=0UL; attempt<64UL; attempt++ ) {
uchar buf[ 4096 ];
long recv_res = fd_netlink_read_socket( netlink->fd, buf, sizeof(buf) );
if( FD_UNLIKELY( recv_res<0 ) ) {
FD_LOG_WARNING(( "netlink recv failed (%d-%s)", errno, fd_io_strerror( errno ) ));
return errno;
}

struct nlmsghdr const * nlh = fd_type_pun_const( buf );
if( FD_UNLIKELY( nlh->nlmsg_seq != seq ) ) {
/* Should only happen if caller misbehaves */
FD_LOG_WARNING(( "Dropping rtnetlink message type=%u seq=%u", nlh->nlmsg_type, nlh->nlmsg_seq ));
continue;
}
uchar buf[ 4096 ];
long recv_res = fd_netlink_read_socket( netlink->fd, buf, sizeof(buf) );
if( FD_UNLIKELY( recv_res<0 ) ) {
FD_LOG_WARNING(( "netlink recv failed (%d-%s)", errno, fd_io_strerror( errno ) ));
return errno;
}

if( FD_UNLIKELY( nlh->nlmsg_type!=NLMSG_ERROR ) ) {
/* Should never happen */
FD_LOG_WARNING(( "unexpected nlmsg_type %u for RTM_NEWNEIGH request", nlh->nlmsg_type ));
continue;
}
struct nlmsghdr const * nlh = fd_type_pun_const( buf );
if( FD_UNLIKELY( nlh->nlmsg_seq!=seq ) ) {
/* Should only happen if caller misbehaves */
FD_LOG_ERR(( "Unexpected netlink message type=%u seq=%u", nlh->nlmsg_type, nlh->nlmsg_seq ));
}

struct nlmsgerr * err = NLMSG_DATA( nlh );
int nl_err = -err->error;
return nl_err;
if( FD_UNLIKELY( nlh->nlmsg_type!=NLMSG_ERROR ) ) {
/* Should never happen */
FD_LOG_ERR(( "unexpected netlink response nlmsg_type %u for RTM_NEWNEIGH request", nlh->nlmsg_type ));
}

FD_LOG_WARNING(( "Giving up on receiving response code for RTM_NEWNEIGH request" ));
return 0;
struct nlmsgerr * err = NLMSG_DATA( nlh );
int nl_err = -err->error;
return nl_err;
}

0 comments on commit 71be7d1

Please sign in to comment.