From d10fe6dfc49ca3470aba5770cc5bee496111aa88 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 21 May 2021 10:06:12 +0930 Subject: [PATCH 1/2] pytest: add test for HTLC timeout on failed htlcs. Signed-off-by: Rusty Russell --- tests/test_connection.py | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/tests/test_connection.py b/tests/test_connection.py index 160755ead58e..aa90dc95fb8f 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -11,7 +11,7 @@ scriptpubkey_addr, EXPERIMENTAL_FEATURES ) -from pyln.testing.utils import SLOW_MACHINE, VALGRIND, EXPERIMENTAL_DUAL_FUND +from pyln.testing.utils import SLOW_MACHINE, VALGRIND, EXPERIMENTAL_DUAL_FUND, FUNDAMOUNT import os import pytest @@ -3284,3 +3284,30 @@ def test_openchannel_init_alternate(node_factory, executor): fut = executor.submit(l2.rpc.openchannel_init, l1.info['id'], '1000000msat', psbt2) with pytest.raises(RpcError): fut.result(10) + + +@pytest.mark.xfail(strict=True) +def test_htlc_failed_noclose(node_factory): + """Test a bug where the htlc timeout would kick in even if the HTLC failed""" + l1, l2 = node_factory.line_graph(2) + + payment_hash = l2.rpc.invoice(1000, "test", "test")['payment_hash'] + routestep = { + 'msatoshi': FUNDAMOUNT * 1000, + 'id': l2.info['id'], + 'delay': 5, + 'channel': '1x1x1' # note: can be bogus for 1-hop direct payments + } + + # This fails at channeld + l1.rpc.sendpay([routestep], payment_hash) + with pytest.raises(RpcError, match="Capacity exceeded"): + l1.rpc.waitsendpay(payment_hash) + + # Send a second one, too: make sure we don't crash. + l1.rpc.sendpay([routestep], payment_hash) + with pytest.raises(RpcError, match="Capacity exceeded"): + l1.rpc.waitsendpay(payment_hash) + + time.sleep(35) + assert l1.rpc.getpeer(l2.info['id'])['connected'] From de3fe42ccfdb59548f86ac94e16786e01013f5fb Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 21 May 2021 11:45:54 +0930 Subject: [PATCH 2/2] lightningd: attach HTLC timeout to htlc itself, fix gratuitous disconnect bug. We set the timeout on first HTLC, but didn't clear it if that HTLC failed. It's saner to have a per-HTLC timeout (since that's what it is!) and also our timer infra is specially coded to scale approximately infinitely so trying to optimize this is vastly premature. Signed-off-by: Rusty Russell Changelog-Fixed: Protocol: We would sometimes gratuitously disconnect 30 seconds after an HTLC failed. --- lightningd/channel.c | 2 -- lightningd/channel.h | 3 --- lightningd/htlc_end.c | 1 + lightningd/htlc_end.h | 3 +++ lightningd/peer_htlcs.c | 29 +++++++++++++++++------------ tests/test_connection.py | 1 - tests/test_misc.py | 2 +- 7 files changed, 22 insertions(+), 19 deletions(-) diff --git a/lightningd/channel.c b/lightningd/channel.c index 094a66eb38ed..b3e25c9adf6b 100644 --- a/lightningd/channel.c +++ b/lightningd/channel.c @@ -214,7 +214,6 @@ struct channel *new_unsaved_channel(struct peer *peer, /* A zero value database id means it's not saved in the database yet */ channel->dbid = 0; channel->error = NULL; - channel->htlc_timeout = NULL; channel->openchannel_signed_cmd = NULL; channel->state = DUALOPEND_OPEN_INIT; channel->owner = NULL; @@ -345,7 +344,6 @@ struct channel *new_channel(struct peer *peer, u64 dbid, channel->dbid = dbid; channel->unsaved_dbid = 0; channel->error = NULL; - channel->htlc_timeout = NULL; channel->open_attempt = NULL; channel->openchannel_signed_cmd = NULL; if (their_shachain) diff --git a/lightningd/channel.h b/lightningd/channel.h index 8d7fb8f21a40..d04746450baf 100644 --- a/lightningd/channel.h +++ b/lightningd/channel.h @@ -132,9 +132,6 @@ struct channel { struct amount_msat msat_to_us_min; struct amount_msat msat_to_us_max; - /* Timer we use in case they don't add an HTLC in a timely manner. */ - struct oneshot *htlc_timeout; - /* Last tx they gave us. */ struct bitcoin_tx *last_tx; enum wallet_tx_type last_tx_type; diff --git a/lightningd/htlc_end.c b/lightningd/htlc_end.c index d2b3c8a1ec25..a1131515444b 100644 --- a/lightningd/htlc_end.c +++ b/lightningd/htlc_end.c @@ -297,6 +297,7 @@ struct htlc_out *new_htlc_out(const tal_t *ctx, hout->failmsg = NULL; hout->failonion = NULL; hout->preimage = NULL; + hout->timeout = NULL; if (blinding) hout->blinding = tal_dup(hout, struct pubkey, blinding); diff --git a/lightningd/htlc_end.h b/lightningd/htlc_end.h index 0c5affa61571..7e2c1da67ae0 100644 --- a/lightningd/htlc_end.h +++ b/lightningd/htlc_end.h @@ -93,6 +93,9 @@ struct htlc_out { /* Blinding to send alongside, if any. */ struct pubkey *blinding; + + /* Timer we use in case they don't add an HTLC in a timely manner. */ + struct oneshot *timeout; }; static inline const struct htlc_key *keyof_htlc_in(const struct htlc_in *in) diff --git a/lightningd/peer_htlcs.c b/lightningd/peer_htlcs.c index 827be6f4ff49..c2ac7cdf7a29 100644 --- a/lightningd/peer_htlcs.c +++ b/lightningd/peer_htlcs.c @@ -600,17 +600,22 @@ static void rcvd_htlc_reply(struct subd *subd, const u8 *msg, const int *fds UNU /* When channeld includes it in commitment, we'll make it persistent. */ } -static void htlc_offer_timeout(struct channel *channel) +static void htlc_offer_timeout(struct htlc_out *out) { - /* Unset this in case we reconnect and start again. */ - channel->htlc_timeout = NULL; + struct channel *channel = out->key.channel; + + out->timeout = NULL; + + /* Otherwise, timer would be removed. */ + assert(out->hstate == SENT_ADD_HTLC); /* If owner died, we should already be taken care of. */ if (!channel->owner || channel->state != CHANNELD_NORMAL) return; log_unusual(channel->owner->log, - "Adding HTLC too slow: killing connection"); + "Adding HTLC %"PRIu64" too slow: killing connection", + out->key.id); tal_free(channel->owner); channel_set_billboard(channel, false, "Adding HTLC timed out: killed connection"); @@ -659,12 +664,14 @@ const u8 *send_htlc_out(const tal_t *ctx, partid, in); tal_add_destructor(*houtp, destroy_hout_subd_died); - /* Give channel 30 seconds to commit (first) htlc. */ - if (!out->htlc_timeout && !IFDEV(out->peer->ld->dev_no_htlc_timeout, 0)) - out->htlc_timeout = new_reltimer(out->peer->ld->timers, - out, time_from_sec(30), + /* Give channel 30 seconds to commit this htlc. */ + if (!IFDEV(out->peer->ld->dev_no_htlc_timeout, 0)) { + (*houtp)->timeout = new_reltimer(out->peer->ld->timers, + *houtp, time_from_sec(30), htlc_offer_timeout, - out); + *houtp); + } + msg = towire_channeld_offer_htlc(out, amount, cltv, payment_hash, onion_routing_packet, blinding); subd_req(out->peer->ld, out->owner, take(msg), -1, 0, rcvd_htlc_reply, @@ -1627,8 +1634,8 @@ static bool update_out_htlc(struct channel *channel, /* First transition into commitment; now it outlives peer. */ if (newstate == SENT_ADD_COMMIT) { tal_del_destructor(hout, destroy_hout_subd_died); + hout->timeout = tal_free(hout->timeout); tal_steal(ld, hout); - } else if (newstate == RCVD_REMOVE_ACK_REVOCATION) { remove_htlc_out(channel, hout); } @@ -1721,8 +1728,6 @@ void peer_sending_commitsig(struct channel *channel, const u8 *msg) struct lightningd *ld = channel->peer->ld; struct penalty_base *pbase; - channel->htlc_timeout = tal_free(channel->htlc_timeout); - if (!fromwire_channeld_sending_commitsig(msg, msg, &commitnum, &pbase, diff --git a/tests/test_connection.py b/tests/test_connection.py index aa90dc95fb8f..2e6049f9713b 100644 --- a/tests/test_connection.py +++ b/tests/test_connection.py @@ -3286,7 +3286,6 @@ def test_openchannel_init_alternate(node_factory, executor): fut.result(10) -@pytest.mark.xfail(strict=True) def test_htlc_failed_noclose(node_factory): """Test a bug where the htlc timeout would kick in even if the HTLC failed""" l1, l2 = node_factory.line_graph(2) diff --git a/tests/test_misc.py b/tests/test_misc.py index 7ec079f89454..b0f7b2a4a424 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -1418,7 +1418,7 @@ def test_htlc_send_timeout(node_factory, bitcoind, compat): assert not l2.daemon.is_in_log(r'{}-.*channeld.*: \[IN\] 0013'.format(l3.info['id'])) assert not l2.daemon.is_in_log(r'{}-.*channeld.*: \[OUT\] 0084'.format(l3.info['id'])) # L2 killed the channel with l3 because it was too slow. - l2.daemon.wait_for_log('{}-.*channeld-.*Adding HTLC too slow: killing connection'.format(l3.info['id'])) + l2.daemon.wait_for_log('{}-.*channeld-.*Adding HTLC 0 too slow: killing connection'.format(l3.info['id'])) def test_ipv4_and_ipv6(node_factory):