Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix warning-sending race #6784

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
15 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -734,7 +734,7 @@ update-mocks: $(ALL_TEST_PROGRAMS:%=update-mocks/%.c)

$(ALL_TEST_PROGRAMS:%=update-mocks/%.c): $(ALL_GEN_HEADERS) $(EXTERNAL_LIBS) libccan.a ccan/ccan/cdump/tools/cdump-enumstr config.vars

update-mocks/%: %
update-mocks/%: % $(ALL_GEN_HEADERS) $(ALL_GEN_SOURCES)
@MAKE=$(MAKE) tools/update-mocks.sh "$*" $(SUPPRESS_OUTPUT)

unittest/%: %
Expand Down
19 changes: 12 additions & 7 deletions channeld/channeld.c
Original file line number Diff line number Diff line change
Expand Up @@ -4437,6 +4437,7 @@ static void check_future_dataloss_fields(struct peer *peer,
take(towire_channeld_fail_fallen_behind(NULL,
remote_current_per_commitment_point)));

sleep(1);
/* We have to send them an error to trigger dropping to chain. */
peer_failed_err(peer->pps, &peer->channel_id,
"Awaiting unilateral close");
Expand Down Expand Up @@ -4841,13 +4842,17 @@ static void peer_reconnect(struct peer *peer,
retransmit_revoke_and_ack = true;
} else if (next_revocation_number < peer->next_index[LOCAL] - 1) {
/* Send a warning here! Because this is what it looks like if peer is
* in the past, and they might still recover. */
peer_failed_warn(peer->pps,
&peer->channel_id,
"bad reestablish revocation_number: %"PRIu64
" vs %"PRIu64,
next_revocation_number,
peer->next_index[LOCAL]);
* in the past, and they might still recover.
*
* We don't disconnect: they might send an error, meaning
* we will force-close the channel for them.
*/
peer_failed_warn_nodisconnect(peer->pps,
&peer->channel_id,
"bad reestablish revocation_number: %"PRIu64
" vs %"PRIu64,
next_revocation_number,
peer->next_index[LOCAL]);
} else if (next_revocation_number > peer->next_index[LOCAL] - 1) {
if (!check_extra_fields)
/* They don't support option_data_loss_protect or
Expand Down
2 changes: 1 addition & 1 deletion common/interactivetx.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@
#include <common/peer_io.h>
#include <common/psbt_internal.h>
#include <common/psbt_open.h>
#include <common/read_peer_msg.h>
#include <common/setup.h>
#include <common/status.h>
#include <common/subdaemon.h>
#include <common/type_to_string.h>
#include <common/wire_error.h>

/*
* BOLT-f53ca2301232db780843e894f55d95d512f297f9 #2:
Expand Down
27 changes: 21 additions & 6 deletions common/peer_failed.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ peer_fatal_continue(const u8 *msg TAKES, const struct per_peer_state *pps)
/* We only support one channel per peer anyway */
static void NORETURN
peer_failed(struct per_peer_state *pps,
bool disconnect,
bool warn,
const struct channel_id *channel_id,
const char *desc)
Expand All @@ -40,9 +41,9 @@ peer_failed(struct per_peer_state *pps,

/* Tell master the error so it can re-xmit. */
msg = towire_status_peer_error(NULL,
disconnect,
desc,
warn,
false,
msg);
peer_billboard(true, desc);
peer_fatal_continue(take(msg), pps);
Expand All @@ -59,7 +60,21 @@ void peer_failed_warn(struct per_peer_state *pps,
desc = tal_vfmt(tmpctx, fmt, ap);
va_end(ap);

peer_failed(pps, true, channel_id, desc);
peer_failed(pps, true, true, channel_id, desc);
}

void peer_failed_warn_nodisconnect(struct per_peer_state *pps,
const struct channel_id *channel_id,
const char *fmt, ...)
{
va_list ap;
const char *desc;

va_start(ap, fmt);
desc = tal_vfmt(tmpctx, fmt, ap);
va_end(ap);

peer_failed(pps, false, true, channel_id, desc);
}

void peer_failed_err(struct per_peer_state *pps,
Expand All @@ -74,17 +89,17 @@ void peer_failed_err(struct per_peer_state *pps,
desc = tal_vfmt(tmpctx, fmt, ap);
va_end(ap);

peer_failed(pps, false, channel_id, desc);
peer_failed(pps, true, false, channel_id, desc);
}

/* We're failing because peer sent us an error/warning message */
void peer_failed_received_errmsg(struct per_peer_state *pps,
const char *desc,
bool abort_restart)
bool disconnect,
const char *desc)
{
u8 *msg;

msg = towire_status_peer_error(NULL, desc, false, abort_restart, NULL);
msg = towire_status_peer_error(NULL, disconnect, desc, false, NULL);
peer_billboard(true, "Received %s", desc);
peer_fatal_continue(take(msg), pps);
}
Expand Down
15 changes: 13 additions & 2 deletions common/peer_failed.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,17 @@ void peer_failed_warn(struct per_peer_state *pps,
const char *fmt, ...)
PRINTF_FMT(3,4) NORETURN;

/**
* peer_failed_warn_nodisconnect - Send a warning msg, don't close.
* @pps: the per-peer state.
* @channel_id: channel with error, or NULL for no particular channel.
* @fmt...: format as per status_failed(STATUS_FAIL_PEER_BAD)
*/
void peer_failed_warn_nodisconnect(struct per_peer_state *pps,
const struct channel_id *channel_id,
const char *fmt, ...)
PRINTF_FMT(3,4) NORETURN;

/**
* peer_failed_err - Send a warning msg and close the channel.
* @pps: the per-peer state.
Expand All @@ -35,8 +46,8 @@ void peer_failed_err(struct per_peer_state *pps,
/* We're failing because peer sent us an error message: NULL
* channel_id means all channels. */
void peer_failed_received_errmsg(struct per_peer_state *pps,
const char *desc,
bool abort_restart)
bool disconnect,
const char *desc)
NORETURN;

/* I/O error */
Expand Down
10 changes: 6 additions & 4 deletions common/peer_status_wire.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@

# An error occurred: if error_for_them, that to go to them.
msgtype,status_peer_error,0xFFF4
# Do we force a disconnect from the peer?
msgdata,status_peer_error,disconnect,bool,
# The error string
msgdata,status_peer_error,desc,wirestring,
# Take a deep breath, then try reconnecting to the precious little snowflake.
# (Means we sent it, since we don't hang up if they send one)
# Actually a warning, not a (fatal!) error.
msgdata,status_peer_error,warning,bool,
# From an abort, no reconnect but restart daemon
msgdata,status_peer_error,abort_do_restart,bool,
# The error to send to them in future if they try to talk to us about
# this channel.
msgdata,status_peer_error,len,u16,
msgdata,status_peer_error,error_for_them,u8,len
18 changes: 1 addition & 17 deletions common/read_peer_msg.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,30 +12,14 @@
#include <wire/peer_wire.h>
#include <wire/wire_sync.h>

const char *is_peer_warning(const tal_t *ctx, const u8 *msg)
{
if (fromwire_peektype(msg) != WIRE_WARNING)
return NULL;
/* connectd demuxes, so we only see it if channel_id is ours. */
return sanitize_error(ctx, msg, NULL);
}

const char *is_peer_error(const tal_t *ctx, const u8 *msg)
{
if (fromwire_peektype(msg) != WIRE_ERROR)
return NULL;
/* connectd demuxes, so we only see it if channel_id is ours. */
return sanitize_error(ctx, msg, NULL);
}

bool handle_peer_error_or_warning(struct per_peer_state *pps,
const u8 *msg TAKES)
{
const char *err;

err = is_peer_error(tmpctx, msg);
if (err)
peer_failed_received_errmsg(pps, err, false);
peer_failed_received_errmsg(pps, true, err);

/* Simply log incoming warnings */
err = is_peer_warning(tmpctx, msg);
Expand Down
19 changes: 0 additions & 19 deletions common/read_peer_msg.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,6 @@ struct crypto_state;
struct channel_id;
struct per_peer_state;

/**
* is_peer_error - if it's an error, describe it.
* @ctx: context to allocate return from.
* @msg: the peer message.
*
* If this returns non-NULL, it's usually passed to
* peer_failed_received_errmsg().
*/
const char *is_peer_error(const tal_t *ctx, const u8 *msg);

/**
* is_peer_warning - if it's a warning, describe it.
* @ctx: context to allocate return from.
* @msg: the peer message.
*
* If this returns non-NULL, it's usually logged.
*/
const char *is_peer_warning(const tal_t *ctx, const u8 *msg);

/**
* handle_peer_error_or_warning - simple handler for errors / warnings
* @pps: per-peer state.
Expand Down
17 changes: 17 additions & 0 deletions common/wire_error.c
Original file line number Diff line number Diff line change
Expand Up @@ -124,3 +124,20 @@ char *sanitize_error(const tal_t *ctx, const u8 *errmsg,
: type_to_string(tmpctx, struct channel_id, channel_id),
(int)tal_count(data), (char *)data);
}

const char *is_peer_warning(const tal_t *ctx, const u8 *msg)
{
if (fromwire_peektype(msg) != WIRE_WARNING)
return NULL;
/* connectd demuxes, so we only see it if channel_id is ours. */
return sanitize_error(ctx, msg, NULL);
}

const char *is_peer_error(const tal_t *ctx, const u8 *msg)
{
if (fromwire_peektype(msg) != WIRE_ERROR)
return NULL;
/* connectd demuxes, so we only see it if channel_id is ours. */
return sanitize_error(ctx, msg, NULL);
}

19 changes: 19 additions & 0 deletions common/wire_error.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,4 +74,23 @@ bool channel_id_is_all(const struct channel_id *channel_id);
char *sanitize_error(const tal_t *ctx, const u8 *errmsg,
struct channel_id *channel_id);

/**
* is_peer_error - if it's an error, describe it.
* @ctx: context to allocate return from.
* @msg: the peer message.
*
* If this returns non-NULL, it's usually passed to
* peer_failed_received_errmsg().
*/
const char *is_peer_error(const tal_t *ctx, const u8 *msg);

/**
* is_peer_warning - if it's a warning, describe it.
* @ctx: context to allocate return from.
* @msg: the peer message.
*
* If this returns non-NULL, it's usually logged.
*/
const char *is_peer_warning(const tal_t *ctx, const u8 *msg);

#endif /* LIGHTNING_COMMON_WIRE_ERROR_H */
4 changes: 3 additions & 1 deletion connectd/connectd.c
Original file line number Diff line number Diff line change
Expand Up @@ -1812,8 +1812,10 @@ static void peer_discard(struct daemon *daemon, const u8 *msg)
/* If it's reconnected already, it will learn soon. */
if (peer->counter != counter)
return;

/* We make sure any final messages from the subds are sent! */
status_peer_debug(&id, "discard_peer");
tal_free(peer);
drain_peer(peer);
}

static void start_shutdown(struct daemon *daemon, const u8 *msg)
Expand Down
2 changes: 2 additions & 0 deletions connectd/connectd_wire.csv
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,8 @@ msgdata,connectd_peer_spoke,id,node_id,
msgdata,connectd_peer_spoke,counter,u64,
msgdata,connectd_peer_spoke,msgtype,u16,
msgdata,connectd_peer_spoke,channel_id,channel_id,
# If msgtype == WIRE_ERROR, this is the string.
msgdata,connectd_peer_spoke,error,?wirestring,

# master -> connectd: peer no longer wanted, you can disconnect.
msgtype,connectd_discard_peer,2015
Expand Down
19 changes: 5 additions & 14 deletions connectd/multiplex.c
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ static void close_subd_timeout(struct subd *subd)
io_close(subd->conn);
}

static void drain_peer(struct peer *peer)
void drain_peer(struct peer *peer)
{
status_debug("drain_peer");
assert(!peer->draining);
Expand Down Expand Up @@ -429,7 +429,8 @@ static struct io_plan *encrypt_and_send(struct peer *peer,
case DEV_DISCONNECT_AFTER:
/* Disallow reads from now on */
peer->dev_read_enabled = false;
next = (void *)io_close_cb;
/* Using io_close here can lose the data we're about to send! */
next = io_sock_shutdown_cb;
break;
case DEV_DISCONNECT_BLACKHOLE:
/* Disable both reads and writes from now on */
Expand All @@ -447,17 +448,6 @@ static struct io_plan *encrypt_and_send(struct peer *peer,

set_urgent_flag(peer, is_urgent(type));

/* We are no longer required to do this, but we do disconnect
* after sending an error or warning. */
if (type == WIRE_ERROR || type == WIRE_WARNING) {
/* Might already be draining... */
if (!peer->draining)
drain_peer(peer);

/* Close as soon as we've sent this. */
next = io_sock_shutdown_cb;
}

/* We free this and the encrypted version in next write_to_peer */
peer->sent_to_peer = cryptomsg_encrypt_msg(peer, &peer->cs, msg);
return io_write(peer->to_peer,
Expand Down Expand Up @@ -1137,7 +1127,8 @@ static struct io_plan *read_body_from_peer_done(struct io_conn *peer_conn,
take(towire_connectd_peer_spoke(NULL, &peer->id,
peer->counter,
t,
&channel_id)));
&channel_id,
is_peer_error(tmpctx, decrypted))));
}

/* Even if we just created it, call this to catch open_channel2 */
Expand Down
4 changes: 4 additions & 0 deletions connectd/multiplex.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,8 @@ void send_custommsg(struct daemon *daemon, const u8 *msg);

/* Lightningd wants to talk to you. */
void peer_connect_subd(struct daemon *daemon, const u8 *msg, int fd);

/* Start shutting down peer. */
void drain_peer(struct peer *peer);

#endif /* LIGHTNING_CONNECTD_MULTIPLEX_H */
21 changes: 12 additions & 9 deletions contrib/pyln-testing/pyln/testing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -594,7 +594,6 @@ def __init__(
self.lightning_dir = lightning_dir
self.port = port
self.cmd_prefix = []
self.disconnect_file = None

self.rpcproxy = bitcoindproxy
self.env['CLN_PLUGIN_LOG'] = "cln_plugin=trace,cln_rpc=trace,cln_grpc=trace,debug"
Expand Down Expand Up @@ -639,8 +638,8 @@ def __init__(

def cleanup(self):
# To force blackhole to exit, disconnect file must be truncated!
if self.disconnect_file:
with open(self.disconnect_file, "w") as f:
if 'dev-disconnect' in self.opts:
with open(self.opts['dev-disconnect'], "w") as f:
f.truncate()

@property
Expand Down Expand Up @@ -763,12 +762,10 @@ def __init__(self, node_id, lightning_dir, bitcoind, executor, valgrind, may_fai
grpc_port=self.grpc_port,
)

# If we have a disconnect string, dump it to a file for daemon.
if disconnect:
self.daemon.disconnect_file = os.path.join(lightning_dir, TEST_NETWORK, "dev_disconnect")
with open(self.daemon.disconnect_file, "w") as f:
f.write("\n".join(disconnect))
self.daemon.opts["dev-disconnect"] = "dev_disconnect"
self.disconnect = disconnect
if self.disconnect:
self.daemon.opts["dev-disconnect"] = os.path.join(lightning_dir, TEST_NETWORK, "dev-disconnect")
# Actual population of that file occurs at start.

# Various developer options let us be more aggressive
self.daemon.opts["dev-fail-on-subdaemon-fail"] = None
Expand Down Expand Up @@ -975,6 +972,12 @@ def is_synced_with_bitcoin(self, info=None):
return 'warning_bitcoind_sync' not in info and 'warning_lightningd_sync' not in info

def start(self, wait_for_bitcoind_sync=True, stderr_redir=False):
# If we have a disconnect string, dump it to a file for daemon.
if 'dev-disconnect' in self.daemon.opts:
with open(self.daemon.opts['dev-disconnect'], "w") as f:
if self.disconnect is not None:
f.write("\n".join(self.disconnect))

self.daemon.start(stderr_redir=stderr_redir)
# Cache `getinfo`, we'll be using it a lot
self.info = self.rpc.getinfo()
Expand Down
Loading
Loading