Skip to content

Commit 920d96e

Browse files
committed
Unblock channels awaiting monitor update based on ChanMan queue
When we have `ChannelMonitorUpdate`s which are completing both synchronously and asynchronously, we need to consider a channel as unblocked based on the `ChannelManager` monitor update queue, rather than by checking the `update_id`s. Consider the case where a channel is updated, leading to a `ChannelMonitorUpdate` which completes asynchronously. The update completes, but prior to the `ChannelManager` receiving the `MonitorEvent::Completed` it generates a further `ChannelMonitorUpdate`. This second update completes synchronously. As a result, when the `MonitorEvent` is processed, the event's `monitor_update_id` is the first update, but there are no updates queued and the channel should be free to return to be unblocked. Here we fix this by looking only at the `ChannelManager` update queue, rather than the update_id of the `MonitorEvent`. While we don't anticipate many users having both synchronous and asynchronous persists in the same application, there isn't much cost to supporting it, which we do here. Found by the chanmon_consistency target.
1 parent d5019fc commit 920d96e

File tree

2 files changed

+88
-1
lines changed

2 files changed

+88
-1
lines changed

lightning/src/ln/chanmon_update_fail_tests.rs

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3337,6 +3337,93 @@ fn test_durable_preimages_on_closed_channel() {
33373337
do_test_durable_preimages_on_closed_channel(false, false, false);
33383338
}
33393339

3340+
#[test]
3341+
fn test_sync_async_persist_doesnt_hang() {
3342+
// Previously, we checked if a channel was a candidate for making forward progress based on if
3343+
// the `MonitorEvent::Completed` matched the channel's latest monitor update id. However, this
3344+
// could lead to a rare race when `ChannelMonitor`s were being persisted both synchronously and
3345+
// asynchronously leading to channel hangs.
3346+
//
3347+
// To hit this case, we need to generate a `MonitorEvent::Completed` prior to a new channel
3348+
// update, but which is only processed after the channel update.
3349+
let chanmon_cfgs = create_chanmon_cfgs(2);
3350+
let node_cfgs = create_node_cfgs(2, &chanmon_cfgs);
3351+
let node_chanmgrs = create_node_chanmgrs(2, &node_cfgs, &[None, None]);
3352+
let nodes = create_network(2, &node_cfgs, &node_chanmgrs);
3353+
3354+
let chan_id_ab = create_announced_chan_between_nodes(&nodes, 0, 1).2;
3355+
3356+
// Send two payments from A to B, then claim the first, marking the very last
3357+
// ChannelMonitorUpdate as InProgress...
3358+
let (payment_preimage_1, payment_hash_1, ..) = route_payment(&nodes[0], &[&nodes[1]], 1_000_000);
3359+
let (payment_preimage_2, payment_hash_2, ..) = route_payment(&nodes[0], &[&nodes[1]], 1_000_000);
3360+
3361+
nodes[1].node.claim_funds(payment_preimage_1);
3362+
check_added_monitors(&nodes[1], 1);
3363+
expect_payment_claimed!(nodes[1], payment_hash_1, 1_000_000);
3364+
3365+
let bs_updates = get_htlc_update_msgs(&nodes[1], &nodes[0].node.get_our_node_id());
3366+
nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &bs_updates.update_fulfill_htlcs[0]);
3367+
expect_payment_sent(&nodes[0], payment_preimage_1, None, false, false);
3368+
nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bs_updates.commitment_signed);
3369+
check_added_monitors(&nodes[0], 1);
3370+
let (as_raa, as_cs) = get_revoke_commit_msgs!(nodes[0], nodes[1].node.get_our_node_id());
3371+
3372+
nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &as_raa);
3373+
check_added_monitors(&nodes[1], 1);
3374+
nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &as_cs);
3375+
check_added_monitors(&nodes[1], 1);
3376+
3377+
let bs_final_raa = get_event_msg!(nodes[1], MessageSendEvent::SendRevokeAndACK, nodes[0].node.get_our_node_id());
3378+
chanmon_cfgs[0].persister.set_update_ret(ChannelMonitorUpdateStatus::InProgress);
3379+
nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bs_final_raa);
3380+
check_added_monitors(&nodes[0], 1);
3381+
3382+
// Immediately complete the monitor update, but before the ChannelManager has a chance to see
3383+
// the MonitorEvent::Completed, create a channel update by receiving a claim on the second
3384+
// payment.
3385+
let (outpoint, _, ab_update_id) = nodes[0].chain_monitor.latest_monitor_update_id.lock().unwrap().get(&chan_id_ab).unwrap().clone();
3386+
nodes[0].chain_monitor.chain_monitor.channel_monitor_updated(outpoint, ab_update_id).unwrap();
3387+
3388+
nodes[1].node.claim_funds(payment_preimage_2);
3389+
check_added_monitors(&nodes[1], 1);
3390+
expect_payment_claimed!(nodes[1], payment_hash_2, 1_000_000);
3391+
3392+
let bs_updates = get_htlc_update_msgs(&nodes[1], &nodes[0].node.get_our_node_id());
3393+
nodes[0].node.handle_update_fulfill_htlc(&nodes[1].node.get_our_node_id(), &bs_updates.update_fulfill_htlcs[0]);
3394+
nodes[0].node.handle_commitment_signed(&nodes[1].node.get_our_node_id(), &bs_updates.commitment_signed);
3395+
check_added_monitors(&nodes[0], 1);
3396+
3397+
// At this point, we have completed an extra `ChannelMonitorUpdate` but the `ChannelManager`
3398+
// hasn't yet seen our `MonitorEvent::Completed`. When we call
3399+
// `get_and_clear_pending_msg_events` here, the `ChannelManager` finally sees that event and
3400+
// should return the channel to normal operation.
3401+
let (as_raa, as_cs) = get_revoke_commit_msgs!(nodes[0], nodes[1].node.get_our_node_id());
3402+
3403+
// Now that we've completed our test, process the events we have queued up (which we were not
3404+
// able to check until now as they would have caused the `ChannelManager` to look at the
3405+
// pending `MonitorEvent`s).
3406+
let pending_events = nodes[0].node.get_and_clear_pending_events();
3407+
assert_eq!(pending_events.len(), 2);
3408+
if let Event::PaymentPathSuccessful { ref payment_hash, ..} = pending_events[1] {
3409+
assert_eq!(payment_hash.unwrap(), payment_hash_1);
3410+
} else { panic!(); }
3411+
if let Event::PaymentSent { ref payment_hash, ..} = pending_events[0] {
3412+
assert_eq!(*payment_hash, payment_hash_2);
3413+
} else { panic!(); }
3414+
3415+
// Finally, complete the claiming of the second payment
3416+
nodes[1].node.handle_revoke_and_ack(&nodes[0].node.get_our_node_id(), &as_raa);
3417+
check_added_monitors(&nodes[1], 1);
3418+
nodes[1].node.handle_commitment_signed(&nodes[0].node.get_our_node_id(), &as_cs);
3419+
check_added_monitors(&nodes[1], 1);
3420+
3421+
let bs_final_raa = get_event_msg!(nodes[1], MessageSendEvent::SendRevokeAndACK, nodes[0].node.get_our_node_id());
3422+
nodes[0].node.handle_revoke_and_ack(&nodes[1].node.get_our_node_id(), &bs_final_raa);
3423+
check_added_monitors(&nodes[0], 1);
3424+
expect_payment_path_successful!(nodes[0]);
3425+
}
3426+
33403427
fn do_test_reload_mon_update_completion_actions(close_during_reload: bool) {
33413428
// Test that if a `ChannelMonitorUpdate` completes but a `ChannelManager` isn't serialized
33423429
// before restart we run the monitor update completion action on startup.

lightning/src/ln/channelmanager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6613,7 +6613,7 @@ where
66136613
log_trace!(logger, "ChannelMonitor updated to {}. Current highest is {}. {} pending in-flight updates.",
66146614
highest_applied_update_id, channel.context.get_latest_monitor_update_id(),
66156615
remaining_in_flight);
6616-
if !channel.is_awaiting_monitor_update() || channel.context.get_latest_monitor_update_id() != highest_applied_update_id {
6616+
if !channel.is_awaiting_monitor_update() || remaining_in_flight != 0 {
66176617
return;
66186618
}
66196619
handle_monitor_update_completion!(self, peer_state_lock, peer_state, per_peer_state, channel);

0 commit comments

Comments
 (0)