Commit f686b89

Fix refcount issue (#366)
*Issue #, if available:*

- While the connection manager is closing, if the culling task runs it bumps the refcount back up from zero and decrements it again after the task, which lets the cleanup happen twice.
- A bug where acquiring the external refcount also acquired the internal refcount, which it should NOT.

*Description of changes:*

- The culling task now holds an internal refcount, and the task is cancelled when the external refcount drops to zero instead of the internal refcount.

By submitting this pull request, I confirm that my contribution is made under the terms of the Apache 2.0 license.
1 parent ad7d98c commit f686b89
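
To make the refcount interplay concrete, below is a minimal, self-contained sketch of the scheme this commit moves to. It is not the library's code: the toy_manager struct, the synchronous toy cancel, and the absence of locking are simplifications; only the aws-c-common ref_count calls (aws_ref_count_init/acquire/release) mirror what the diff below actually uses. The external count belongs to the public acquire/release API, the internal count tracks outstanding work such as the cull task, and destruction is driven by the external count, so a pending cull task can never resurrect a manager whose refcount already hit zero.

/* Minimal sketch (not aws-c-http's code) of the two-refcount scheme described above.
 * "toy_manager" and the synchronous toy cancel are hypothetical; the real fix cancels
 * the cull task from a destruction task scheduled on the cull event loop. */
#include <aws/common/allocator.h>
#include <aws/common/ref_count.h>

#include <stdbool.h>
#include <stdio.h>

struct toy_manager {
    struct aws_allocator *allocator;
    struct aws_ref_count internal_ref_count; /* held by in-flight work, e.g. the cull task */
    size_t external_ref_count;               /* held by users of the public acquire/release API */
    bool cull_task_scheduled;
};

static void s_toy_finish_destroy(void *arg) {
    struct toy_manager *manager = arg;
    printf("destroying manager exactly once\n");
    aws_mem_release(manager->allocator, manager);
}

static void s_toy_schedule_cull_task(struct toy_manager *manager) {
    manager->cull_task_scheduled = true;
    /* The task keeps the manager alive via an internal reference, never an external one. */
    aws_ref_count_acquire(&manager->internal_ref_count);
}

static void s_toy_cancel_cull_task(struct toy_manager *manager) {
    manager->cull_task_scheduled = false;
    /* The cull task will never run again, so the reference it held is released here. */
    aws_ref_count_release(&manager->internal_ref_count);
}

static void s_toy_release(struct toy_manager *manager) {
    if (--manager->external_ref_count == 0) {
        /* Shutdown is driven by the *external* count; the internal count only tracks
         * outstanding work. Cancel the cull task, then drop the base internal reference,
         * so s_toy_finish_destroy() runs exactly once. */
        s_toy_cancel_cull_task(manager);
        aws_ref_count_release(&manager->internal_ref_count);
    }
}

int main(void) {
    struct aws_allocator *allocator = aws_default_allocator();
    struct toy_manager *manager = aws_mem_calloc(allocator, 1, sizeof(struct toy_manager));
    manager->allocator = allocator;
    manager->external_ref_count = 1;
    aws_ref_count_init(&manager->internal_ref_count, manager, s_toy_finish_destroy);

    s_toy_schedule_cull_task(manager);
    s_toy_release(manager); /* last external release: cancel the task, then destroy once */
    return 0;
}

In the real code the cancellation is not synchronous: the last external release schedules s_final_destruction_task on the cull event loop, and that task drops the internal reference the cull task held.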

File tree

3 files changed: +73 −63 lines changed


source/connection_manager.c

Lines changed: 31 additions & 58 deletions
@@ -176,6 +176,7 @@ struct aws_http_connection_manager {
     /*
      * The number of all established, idle connections. So
      * that we don't have compute the size of a linked list every time.
+     * It doesn't contribute to internal refcount as AWS_HCMCT_OPEN_CONNECTION inclues all idle connections as well.
      */
     size_t idle_connection_count;
 
@@ -218,6 +219,8 @@ struct aws_http_connection_manager {
 
     /*
      * The number of established new HTTP/2 connections we have waiting for SETTINGS from the http layer
+     * It doesn't contribute to internal refcount as AWS_HCMCT_OPEN_CONNECTION inclues all connections waiting for
+     * settings as well.
      */
     size_t pending_settings_count;
 
@@ -570,6 +573,7 @@ static void s_connection_manager_internal_ref_decrease(
     }
 }
 
+/* Only invoked with the lock held */
 static void s_aws_http_connection_manager_build_transaction(struct aws_connection_management_transaction *work) {
     struct aws_http_connection_manager *manager = work->manager;
 
@@ -719,44 +723,20 @@ static void s_aws_http_connection_manager_finish_destroy(struct aws_http_connect
     aws_mem_release(manager->allocator, manager);
 }
 
-/* This is scheduled to run on the cull task's event loop. If there's no cull task we just destroy the
- * manager directly without a cross-thread task. */
+/* This is scheduled to run on the cull task's event loop. Should only be scheduled to run if we have one */
 static void s_final_destruction_task(struct aws_task *task, void *arg, enum aws_task_status status) {
     (void)status;
     struct aws_http_connection_manager *manager = arg;
     struct aws_allocator *allocator = manager->allocator;
 
-    if (manager->cull_task) {
-        AWS_FATAL_ASSERT(manager->cull_event_loop != NULL);
-        aws_event_loop_cancel_task(manager->cull_event_loop, manager->cull_task);
-    }
-
-    s_aws_http_connection_manager_finish_destroy(manager);
+    AWS_FATAL_ASSERT(manager->cull_task != NULL);
+    AWS_FATAL_ASSERT(manager->cull_event_loop != NULL);
 
+    aws_event_loop_cancel_task(manager->cull_event_loop, manager->cull_task);
     aws_mem_release(allocator, task);
-}
 
-static void s_aws_http_connection_manager_begin_destroy(struct aws_http_connection_manager *manager) {
-    if (manager == NULL) {
-        return;
-    }
-
-    /*
-     * If we have a cull task running then we have to cancel it. But you can only cancel tasks within the event
-     * loop that the task is scheduled on. So to solve this case, if there's a cull task, rather than doing
-     * cleanup synchronously, we schedule a final destruction task (on the cull event loop) which cancels the
-     * cull task before going on to release all the memory and notify the shutdown callback.
-     *
-     * If there's no cull task we can just cleanup synchronously.
-     */
-    if (manager->cull_event_loop != NULL) {
-        AWS_FATAL_ASSERT(manager->cull_task);
-        struct aws_task *final_destruction_task = aws_mem_calloc(manager->allocator, 1, sizeof(struct aws_task));
-        aws_task_init(final_destruction_task, s_final_destruction_task, manager, "final_scheduled_destruction");
-        aws_event_loop_schedule_task_now(manager->cull_event_loop, final_destruction_task);
-    } else {
-        s_aws_http_connection_manager_finish_destroy(manager);
-    }
+    /* release the refcount on manager as the culling task will not run again */
+    aws_ref_count_release(&manager->internal_ref_count);
 }
 
 static void s_cull_task(struct aws_task *task, void *arg, enum aws_task_status status);
@@ -767,22 +747,19 @@ static void s_schedule_connection_culling(struct aws_http_connection_manager *ma
 
     if (manager->cull_task == NULL) {
         manager->cull_task = aws_mem_calloc(manager->allocator, 1, sizeof(struct aws_task));
-        if (manager->cull_task == NULL) {
-            return;
-        }
-
         aws_task_init(manager->cull_task, s_cull_task, manager, "cull_idle_connections");
+        /* For the task to properly run and cancel, we need to keep manager alive */
+        aws_ref_count_acquire(&manager->internal_ref_count);
     }
 
     if (manager->cull_event_loop == NULL) {
         manager->cull_event_loop = aws_event_loop_group_get_next_loop(manager->bootstrap->event_loop_group);
     }
-
-    if (manager->cull_event_loop == NULL) {
-        goto on_error;
-    }
+    AWS_FATAL_ASSERT(manager->cull_event_loop != NULL);
 
     uint64_t cull_task_time = 0;
+
+    aws_mutex_lock(&manager->lock);
     const struct aws_linked_list_node *end = aws_linked_list_end(&manager->idle_connections);
     struct aws_linked_list_node *oldest_node = aws_linked_list_begin(&manager->idle_connections);
     if (oldest_node != end) {
@@ -799,23 +776,16 @@ static void s_schedule_connection_culling(struct aws_http_connection_manager *ma
         * culling interval from now.
         */
        uint64_t now = 0;
-       if (manager->system_vtable->get_monotonic_time(&now)) {
-           goto on_error;
-       }
+       manager->system_vtable->get_monotonic_time(&now);
        cull_task_time =
            now + aws_timestamp_convert(
                      manager->max_connection_idle_in_milliseconds, AWS_TIMESTAMP_MILLIS, AWS_TIMESTAMP_NANOS, NULL);
    }
+    aws_mutex_unlock(&manager->lock);
 
    aws_event_loop_schedule_task_future(manager->cull_event_loop, manager->cull_task, cull_task_time);
 
    return;
-
-on_error:
-
-    manager->cull_event_loop = NULL;
-    aws_mem_release(manager->allocator, manager->cull_task);
-    manager->cull_task = NULL;
 }
 
 struct aws_http_connection_manager *aws_http_connection_manager_new(
@@ -857,7 +827,7 @@ struct aws_http_connection_manager *aws_http_connection_manager_new(
     aws_ref_count_init(
         &manager->internal_ref_count,
         manager,
-        (aws_simple_completion_callback *)s_aws_http_connection_manager_begin_destroy);
+        (aws_simple_completion_callback *)s_aws_http_connection_manager_finish_destroy);
 
     aws_linked_list_init(&manager->idle_connections);
     aws_linked_list_init(&manager->pending_acquisitions);
@@ -919,6 +889,7 @@ struct aws_http_connection_manager *aws_http_connection_manager_new(
     manager->max_closed_streams = options->max_closed_streams;
     manager->http2_conn_manual_window_management = options->http2_conn_manual_window_management;
 
+    /* NOTHING can fail after here */
    s_schedule_connection_culling(manager);
 
    AWS_LOGF_INFO(AWS_LS_HTTP_CONNECTION_MANAGER, "id=%p: Successfully created", (void *)manager);
@@ -927,14 +898,13 @@ struct aws_http_connection_manager *aws_http_connection_manager_new(
 
 on_error:
 
-    s_aws_http_connection_manager_begin_destroy(manager);
+    s_aws_http_connection_manager_finish_destroy(manager);
 
     return NULL;
 }
 
 void aws_http_connection_manager_acquire(struct aws_http_connection_manager *manager) {
     aws_mutex_lock(&manager->lock);
-    aws_ref_count_acquire(&manager->internal_ref_count);
     AWS_FATAL_ASSERT(manager->external_ref_count > 0);
     manager->external_ref_count += 1;
     aws_mutex_unlock(&manager->lock);
@@ -958,6 +928,14 @@ void aws_http_connection_manager_release(struct aws_http_connection_manager *man
                 (void *)manager);
             manager->state = AWS_HCMST_SHUTTING_DOWN;
             s_aws_http_connection_manager_build_transaction(&work);
+            if (manager->cull_task != NULL) {
+                /* When manager shutting down, schedule the task to cancel the cull task if exist. */
+                AWS_FATAL_ASSERT(manager->cull_event_loop);
+                struct aws_task *final_destruction_task =
+                    aws_mem_calloc(manager->allocator, 1, sizeof(struct aws_task));
+                aws_task_init(final_destruction_task, s_final_destruction_task, manager, "final_scheduled_destruction");
+                aws_event_loop_schedule_task_now(manager->cull_event_loop, final_destruction_task);
+            }
             aws_ref_count_release(&manager->internal_ref_count);
         }
     } else {
@@ -1187,14 +1165,8 @@ void aws_http_connection_manager_acquire_connection(
 
     aws_mutex_lock(&manager->lock);
 
-    if (manager->state != AWS_HCMST_READY) {
-        AWS_LOGF_ERROR(
-            AWS_LS_HTTP_CONNECTION_MANAGER,
-            "id=%p: Acquire connection called when manager in shut down state",
-            (void *)manager);
-
-        request->error_code = AWS_ERROR_HTTP_CONNECTION_MANAGER_INVALID_STATE_FOR_ACQUIRE;
-    }
+    /* It's a use after free crime, we don't want to handle */
+    AWS_FATAL_ASSERT(manager->state == AWS_HCMST_READY);
 
     aws_linked_list_push_back(&manager->pending_acquisitions, &request->node);
     ++manager->pending_acquisition_count;
@@ -1206,6 +1178,7 @@ void aws_http_connection_manager_acquire_connection(
     s_aws_http_connection_manager_execute_transaction(&work);
 }
 
+/* Only invoke with lock held */
 static int s_idle_connection(struct aws_http_connection_manager *manager, struct aws_http_connection *connection) {
     struct aws_idle_connection *idle_connection =
         aws_mem_calloc(manager->allocator, 1, sizeof(struct aws_idle_connection));
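
One detail worth pausing on in the hunks above: as the comment removed from s_aws_http_connection_manager_begin_destroy noted, a task can only be cancelled from within the event loop it is scheduled on, which is why the release path schedules s_final_destruction_task onto the cull event loop instead of calling aws_event_loop_cancel_task inline. Below is a sketch of that hop-then-cancel pattern in isolation; the cancel_task_cross_thread helper and cancel_ctx struct are illustrative and not part of aws-c-http, while the aws_task and aws_event_loop calls are the same ones visible in the diff.

/* Illustrative helper (not library code): cancel a task owned by another event loop
 * by first scheduling a small task onto that loop and cancelling from its thread. */
#include <aws/common/allocator.h>
#include <aws/common/task.h>
#include <aws/io/event_loop.h>

struct cancel_ctx {
    struct aws_allocator *allocator;
    struct aws_event_loop *loop;
    struct aws_task *task_to_cancel;
};

/* Runs on the target event loop's thread, where cancelling is allowed. */
static void s_cancel_on_loop_thread(struct aws_task *task, void *arg, enum aws_task_status status) {
    (void)status;
    struct cancel_ctx *ctx = arg;
    aws_event_loop_cancel_task(ctx->loop, ctx->task_to_cancel);
    aws_mem_release(ctx->allocator, task);
    aws_mem_release(ctx->allocator, ctx);
}

/* Call from any thread: hop onto the loop that owns `task_to_cancel`, then cancel it there. */
void cancel_task_cross_thread(
    struct aws_allocator *allocator,
    struct aws_event_loop *loop,
    struct aws_task *task_to_cancel) {

    struct cancel_ctx *ctx = aws_mem_calloc(allocator, 1, sizeof(struct cancel_ctx));
    ctx->allocator = allocator;
    ctx->loop = loop;
    ctx->task_to_cancel = task_to_cancel;

    struct aws_task *cancel_task = aws_mem_calloc(allocator, 1, sizeof(struct aws_task));
    aws_task_init(cancel_task, s_cancel_on_loop_thread, ctx, "cancel_task_cross_thread");
    aws_event_loop_schedule_task_now(loop, cancel_task);
}

In the commit itself, s_final_destruction_task plays the role of s_cancel_on_loop_thread and additionally releases the internal reference the cull task was holding.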

tests/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -469,6 +469,7 @@ add_net_test_case(test_connection_manager_proxy_setup_shutdown)
 add_net_test_case(test_connection_manager_idle_culling_single)
 add_net_test_case(test_connection_manager_idle_culling_many)
 add_net_test_case(test_connection_manager_idle_culling_mixture)
+add_net_test_case(test_connection_manager_idle_culling_refcount)
 
 # tests where we establish real connections
 add_net_test_case(test_connection_manager_single_connection)

tests/test_connection_manager.c

Lines changed: 41 additions & 5 deletions
@@ -55,6 +55,7 @@ struct cm_tester_options {
     bool http2;
     struct aws_http2_setting *initial_settings_array;
     size_t num_initial_settings;
+    bool self_lib_init;
 };
 
 struct cm_tester {
@@ -93,6 +94,7 @@ struct cm_tester {
     struct proxy_env_var_settings proxy_ev_settings;
     bool proxy_request_complete;
     bool proxy_request_successful;
+    bool self_lib_init;
 };
 
 static struct cm_tester s_tester;
@@ -134,9 +136,10 @@ static int s_cm_tester_init(struct cm_tester_options *options) {
     struct cm_tester *tester = &s_tester;
 
     AWS_ZERO_STRUCT(*tester);
-
-    aws_http_library_init(options->allocator);
-
+    tester->self_lib_init = options->self_lib_init;
+    if (!tester->self_lib_init) {
+        aws_http_library_init(options->allocator);
+    }
     tester->allocator = options->allocator;
 
     ASSERT_SUCCESS(aws_mutex_init(&tester->lock));
@@ -418,8 +421,9 @@ static int s_cm_tester_clean_up(void) {
     aws_tls_connection_options_clean_up(&tester->tls_connection_options);
     aws_tls_ctx_release(tester->tls_ctx);
 
-    aws_http_library_clean_up();
-
+    if (!tester->self_lib_init) {
+        aws_http_library_clean_up();
+    }
     aws_mutex_clean_up(&tester->lock);
     aws_condition_variable_clean_up(&tester->signal);
 
@@ -1117,6 +1121,38 @@ static int s_test_connection_manager_idle_culling_mixture(struct aws_allocator *
 }
 AWS_TEST_CASE(test_connection_manager_idle_culling_mixture, s_test_connection_manager_idle_culling_mixture);
 
+/**
+ * Once upon time, if the culling test is running while the connection manager is shutting, the refcount will be messed
+ * up (back from zero to one and trigger the destroy to happen twice)
+ */
+static int s_test_connection_manager_idle_culling_refcount(struct aws_allocator *allocator, void *ctx) {
+    (void)ctx;
+
+    aws_http_library_init(allocator);
+    for (size_t i = 0; i < 10; i++) {
+        /* To reproduce that more stable, repeat it 10 times. */
+        struct cm_tester_options options = {
+            .allocator = allocator,
+            .max_connections = 10,
+            .max_connection_idle_in_ms = 10,
+            .self_lib_init = true,
+        };
+
+        ASSERT_SUCCESS(s_cm_tester_init(&options));
+
+        uint64_t ten_ms_in_nanos = aws_timestamp_convert(10, AWS_TIMESTAMP_MILLIS, AWS_TIMESTAMP_NANOS, NULL);
+
+        /* Don't ask me how I got the number. :) */
+        aws_thread_current_sleep(ten_ms_in_nanos - 10000);
+
+        ASSERT_SUCCESS(s_cm_tester_clean_up());
+    }
+    aws_http_library_clean_up();
+    return AWS_OP_SUCCESS;
+}
+
+AWS_TEST_CASE(test_connection_manager_idle_culling_refcount, s_test_connection_manager_idle_culling_refcount);
+
 /**
  * Proxy integration tests. Maybe we should move this to another file. But let's do it later. Someday.
  * AWS_TEST_HTTP_PROXY_HOST - host address of the proxy to use for tests that make open connections to the proxy
