-
Notifications
You must be signed in to change notification settings - Fork 4k
Description
Describe the bug
After upgrading the RabbitMQ server from version 3.12.14 to version 3.13.6 or 3.13.7, we started randomly getting "Restarting crashed queue" errors for some of our queues. Those queues can have a throughput of 300 events/s, with big messages. We have also tested version 4.0.1, and the same issue occurred there as well. Every time we roll back to 3.12.14, the issue no longer happens.
Our topology is quite simple: we have only one RabbitMQ server running on k8s with the Docker management image. Clients publishing to and consuming from the affected queues are written in .NET and use the official .NET RabbitMQ client.
Here are some stack traces from when that happens:
3.13.7
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{rabbit_msg_store,reader_pread_parse,1,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_msg_store.erl"},{line,696}]},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_msg_store,reader_pread,2,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_msg_store.erl"},{line,688}]},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_msg_store,read_from_disk,2,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_msg_store.erl"},{line,648}]},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_msg_store,client_read3,2,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_msg_store.erl"},{line,640}]},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_variable_queue,with_msg_store_state,3,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_variable_queue.erl"},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {line,1348}]},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_variable_queue,read_msg,5,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_variable_queue.erl"},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {line,1583}]},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_variable_queue,fetch,2,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_variable_queue.erl"},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {line,616}]},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_priority_queue,fetch,2,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{file,"rabbit_priority_queue.erl"},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {line,302}]}]}
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> offender: [{pid,<0.585.0>},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {id,rabbit_amqqueue},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {mfargs,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {rabbit_prequeue,start_link,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{amqqueue,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {resource,<<"/">>,queue,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> <<"QUEUE_NAME">>},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> true,false,none,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> [{<<"x-queue-mode">>,longstr,<<"lazy">>}],
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> <0.867.0>,[],[],[],undefined,undefined,[],
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> undefined,live,0,[],<<"/">>,
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> #{user => <<"rmq-internal">>},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> rabbit_classic_queue,#{}},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> recovery,<0.583.0>]}},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {restart_type,transient},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {significant,true},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {shutdown,600000},
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0> {child_type,worker}]
2024-09-21 00:31:38.734074+00:00 [error] <0.584.0>
2024-09-21 00:31:38.852784+00:00 [error] <0.217004.0> Restarting crashed queue 'QUEUE_NAME' in vhost '/'
4.0.1
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> supervisor: {<0.551.0>,rabbit_amqqueue_sup}
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> errorContext: child_terminated
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> reason: {function_clause,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{rabbit_variable_queue,d,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{delta,1053550,3357582,0,4411131}],
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_variable_queue.erl"},{line,988}]},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_variable_queue,maybe_deltas_to_betas,4,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_variable_queue.erl"},{line,2385}]},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_variable_queue,fetch_from_q3,1,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_variable_queue.erl"},{line,2240}]},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_variable_queue,queue_out,1,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_variable_queue.erl"},{line,1262}]},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_variable_queue,collect_by_predicate,3,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_variable_queue.erl"},{line,1559}]},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_variable_queue,remove_by_predicate,2,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_variable_queue.erl"},{line,1497}]},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_variable_queue,dropwhile,2,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_variable_queue.erl"},{line,558}]},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_priority_queue,dropwhile,2,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{file,"rabbit_priority_queue.erl"},{line,246}]}]}
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> offender: [{pid,<0.552.0>},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {id,rabbit_amqqueue},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {mfargs,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {rabbit_amqqueue_process,start_link,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{amqqueue,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {resource,<<"/">>,queue,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> <<"QUEUE_NAME_2">>},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> true,false,none,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> [{<<"x-queue-mode">>,longstr,<<"lazy">>}],
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> <0.1275.0>,[],[],[],undefined,undefined,[],[],
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> stopped,0,[],<<"/">>,
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> #{user => <<"USER_NAME">>},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> rabbit_classic_queue,#{}},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> <0.550.0>]}},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {restart_type,transient},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {significant,true},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {shutdown,600000},
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0> {child_type,worker}]
2024-09-23 10:30:56.018962+00:00 [error] <0.551.0>
2024-09-23 10:30:56.020462+00:00 [error] <0.6365.0> Restarting crashed queue 'QUEUE_NAME_2' in vhost '/'.
Those issues seem very similar to what is described here: #10902
However, that issue is supposed to have been fixed since 3.13.2.
Reproduction steps
Cannot reproduce in a test environment.
Expected behavior
No queue crashing
Additional context
No response