Skip to content

Commit e3d3637

Browse files
committed
AWS peer discovery: ensure consistent hostname path ordering
The AWS EC2 API returns networkInterfaceSet and privateIpAddressesSet in arbitrary order, which makes hostname resolution during peer discovery non-deterministic and leads to inconsistent cluster formation. Changes: (1) sort network interfaces by deviceIndex (0 first, i.e. the primary ENI); (2) sort private IP addresses by their primary flag (primary=true first); (3) add debug logging that shows the selected hostname path and the sorting results; (4) add unit tests for the sorting behavior. With this sorting, the interface with deviceIndex=0 and the IP address with primary=true are always placed first, making peer discovery deterministic across deployments.
1 parent b819507 commit e3d3637

File tree

2 files changed

+190
-13
lines changed

2 files changed

+190
-13
lines changed

deps/rabbitmq_peer_discovery_aws/src/rabbit_peer_discovery_aws.erl

Lines changed: 73 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,10 +351,12 @@ get_hostname_by_tags(Tags) ->
351351
%% Determine the hostname path to use and log it at debug level.
%% Reads `aws_use_private_ip` and `aws_hostname_path` from the backend
%% configuration map. The configured path is returned unchanged, except that a
%% path of exactly ["privateDnsName"] becomes ["privateIpAddress"] when
%% `aws_use_private_ip` is true.
get_hostname_path() ->
    UsePrivateIP = get_config_key(aws_use_private_ip, ?CONFIG_MODULE:config_map(?BACKEND_CONFIG_KEY)),
    ConfiguredPath = get_config_key(aws_hostname_path, ?CONFIG_MODULE:config_map(?BACKEND_CONFIG_KEY)),
    EffectivePath = effective_hostname_path(ConfiguredPath, UsePrivateIP),
    ?LOG_DEBUG("AWS peer discovery using hostname path: ~tp", [EffectivePath]),
    EffectivePath.

%% Swap the private DNS name path for the private IP path when private IPs are
%% requested; every other combination keeps the configured path.
effective_hostname_path(["privateDnsName"], true) -> ["privateIpAddress"];
effective_hostname_path(Path, _UsePrivateIP) -> Path.
358360

359361
-spec get_hostname(path(), props()) -> string().
360362
get_hostname(Path, Props) ->
@@ -370,9 +372,78 @@ get_value(_, []) ->
370372
get_value(Key, Props) when is_integer(Key) ->
371373
{"item", Props2} = lists:nth(Key, Props),
372374
Props2;
375+
get_value("networkInterfaceSet", Props) ->
376+
NetworkInterfaces = proplists:get_value("networkInterfaceSet", Props),
377+
sort_network_interfaces_by_device_index(NetworkInterfaces);
378+
get_value("privateIpAddressesSet", Props) ->
379+
PrivateIpAddresses = proplists:get_value("privateIpAddressesSet", Props),
380+
sort_private_ip_addresses_by_primary(PrivateIpAddresses);
373381
get_value(Key, Props) ->
374382
proplists:get_value(Key, Props).
375383

384+
%% Sort network interfaces by deviceIndex to ensure consistent ENI ordering.
%%
%% Given a list of {"item", Props} entries, returns the same entries ordered by
%% ascending device_index/1 of their Props. Any non-list argument (for example
%% `undefined` when the key is absent from the instance description) is
%% returned unchanged, which is why the spec accepts and returns term().
-spec sort_network_interfaces_by_device_index(term()) -> term().
sort_network_interfaces_by_device_index(NetworkInterfaces) when is_list(NetworkInterfaces) ->
    Sorted = lists:sort(fun({"item", A}, {"item", B}) ->
                                device_index(A) =< device_index(B)
                        end, NetworkInterfaces),
    %% The before/after summaries are built inside the macro arguments so that
    %% they are only computed when debug logging is actually enabled.
    ?LOG_DEBUG("AWS peer discovery sorted network interfaces from ~tp to ~tp",
               [[format_network_interface_info(Props) || {"item", Props} <- NetworkInterfaces],
                [format_network_interface_info(Props) || {"item", Props} <- Sorted]]),
    Sorted;
sort_network_interfaces_by_device_index(Other) ->
    Other.
396+
397+
%% Sort private IP addresses by primary flag to ensure primary=true comes first.
%%
%% Given a list of {"item", Props} entries, returns the same entries with those
%% for which is_primary/1 is true placed ahead of the others. Any non-list
%% argument (for example `undefined` when the key is absent) is returned
%% unchanged, which is why the spec accepts and returns term().
-spec sort_private_ip_addresses_by_primary(term()) -> term().
sort_private_ip_addresses_by_primary(PrivateIpAddresses) when is_list(PrivateIpAddresses) ->
    %% A is ordered before-or-equal to B unless B is primary and A is not.
    %% Entries with equal flags compare as "less than or equal", as the
    %% ordering-function contract of lists:sort/2 requires.
    Sorted = lists:sort(fun({"item", A}, {"item", B}) ->
                                is_primary(A) orelse not is_primary(B)
                        end, PrivateIpAddresses),
    %% The before/after summaries are built inside the macro arguments so that
    %% they are only computed when debug logging is actually enabled.
    ?LOG_DEBUG("AWS peer discovery sorted private IPs from ~tp to ~tp",
               [[format_private_ip_info(Props) || {"item", Props} <- PrivateIpAddresses],
                [format_private_ip_info(Props) || {"item", Props} <- Sorted]]),
    Sorted;
sort_private_ip_addresses_by_primary(Other) ->
    Other.
409+
410+
%% Extract deviceIndex from network interface attachment.
%%
%% Returns the index as an integer; the raw value may be either a decimal
%% string or an integer. Returns `undefined` when the interface has no usable
%% "attachment"/"deviceIndex" (missing, or not a clean integer) instead of
%% crashing inside proplists or on a failed match. In Erlang term order every
%% number compares lower than every atom, so an interface without an index
%% compares greater than all indexed ones under =<.
-spec device_index(props()) -> integer() | undefined.
device_index(Interface) ->
    case proplists:get_value("attachment", Interface) of
        Attachment when is_list(Attachment) ->
            parse_device_index(proplists:get_value("deviceIndex", Attachment));
        _ ->
            undefined
    end.

%% Convert a raw deviceIndex value into an integer, or `undefined` if it is
%% absent or not entirely numeric (trailing characters are rejected).
-spec parse_device_index(term()) -> integer() | undefined.
parse_device_index(Value) when is_integer(Value) ->
    Value;
parse_device_index(Value) when is_list(Value) ->
    case string:to_integer(Value) of
        {Int, []} -> Int;
        _ -> undefined
    end;
parse_device_index(_) ->
    undefined.
421+
422+
%% Tell whether a private IP address entry is flagged as primary.
%% Only the exact string "true" under the "primary" key counts; a missing key
%% or any other value yields false.
-spec is_primary(props()) -> boolean().
is_primary(IpAddress) ->
    proplists:get_value("primary", IpAddress) =:= "true".
429+
430+
%% Render a network interface as "<networkInterfaceId>:<deviceIndex>" for log
%% output. "unknown" stands in for the id when "networkInterfaceId" is absent.
-spec format_network_interface_info(props()) -> string().
format_network_interface_info(Interface) ->
    Args = [proplists:get_value("networkInterfaceId", Interface, "unknown"),
            device_index(Interface)],
    lists:flatten(io_lib:format("~s:~w", Args)).
436+
437+
%% Render a private IP address entry as "<ip>:primary" or "<ip>:secondary" for
%% log output. "unknown" stands in for the address when "privateIpAddress" is
%% absent.
-spec format_private_ip_info(props()) -> string().
format_private_ip_info(IpAddress) ->
    Address = proplists:get_value("privateIpAddress", IpAddress, "unknown"),
    Label = primary_label(is_primary(IpAddress)),
    lists:flatten(io_lib:format("~s:~s", [Address, Label])).

%% Map the primary flag to the word used in the log line.
primary_label(true) -> "primary";
primary_label(false) -> "secondary".
446+
376447
-spec get_tags() -> tags().
377448
get_tags() ->
378449
Tags = get_config_key(aws_ec2_tags, ?CONFIG_MODULE:config_map(?BACKEND_CONFIG_KEY)),

deps/rabbitmq_peer_discovery_aws/test/unit_SUITE.erl

Lines changed: 117 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,9 @@ groups() ->
2323
{unit, [], [
2424
maybe_add_tag_filters,
2525
get_hostname_name_from_reservation_set,
26-
registration_support
26+
registration_support,
27+
network_interface_sorting,
28+
private_ip_address_sorting
2729
]},
2830
{lock, [], [
2931
lock_single_node,
@@ -75,12 +77,93 @@ get_hostname_name_from_reservation_set(_Config) ->
7577
?assertEqual(Expectation,
7678
rabbit_peer_discovery_aws:get_hostname_name_from_reservation_set(
7779
reservation_set(), []))
80+
end},
81+
{"from private IP DNS in network interface",
82+
fun() ->
83+
os:putenv("AWS_HOSTNAME_PATH", "networkInterfaceSet,2,privateIpAddressesSet,1,privateDnsName"),
84+
Expectation = ["ip-10-0-15-100.eu-west-1.compute.internal",
85+
"ip-10-0-16-31.eu-west-1.compute.internal"],
86+
?assertEqual(Expectation,
87+
rabbit_peer_discovery_aws:get_hostname_name_from_reservation_set(
88+
reservation_set(), []))
7889
end}]
7990
}).
8091

8192
%% The AWS backend is expected to report that it does not support registration.
registration_support(_Config) ->
    Supported = rabbit_peer_discovery_aws:supports_registration(),
    ?assertEqual(false, Supported).
8394

95+
%% Test ENI sorting by deviceIndex (DescribeInstances only returns attached ENIs).
%% The input is deliberately out of order: deviceIndex 1, 0, 2.
network_interface_sorting(_Config) ->
    ENI = fun(Id, DeviceIndex) ->
                  {"item", [{"networkInterfaceId", Id},
                            {"attachment", [{"deviceIndex", DeviceIndex}]}]}
          end,
    Unsorted = [ENI("eni-secondary", "1"),
                ENI("eni-primary", "0"),
                ENI("eni-tertiary", "2")],

    Sorted = rabbit_peer_discovery_aws:sort_network_interfaces_by_device_index(Unsorted),

    %% All 3 ENIs must survive the sort
    ?assertEqual(3, length(Sorted)),

    %% deviceIndex 0 (primary) first, then 1 (secondary), then 2 (tertiary)
    Ids = [proplists:get_value("networkInterfaceId", Props) || {"item", Props} <- Sorted],
    ?assertEqual(["eni-primary", "eni-secondary", "eni-tertiary"], Ids).
129+
130+
%% Test private IP address sorting by primary flag.
%% The primary address sits in the middle of the input; after sorting it must
%% come first while the two non-primary addresses keep their relative order.
private_ip_address_sorting(_Config) ->
    Address = fun(IP, DnsName, Primary) ->
                      {"item", [{"privateIpAddress", IP},
                                {"privateDnsName", DnsName},
                                {"primary", Primary}]}
              end,
    Unsorted = [Address("10.0.14.176", "ip-10-0-14-176.us-west-2.compute.internal", "false"),
                Address("10.0.12.112", "ip-10-0-12-112.us-west-2.compute.internal", "true"),
                Address("10.0.15.200", "ip-10-0-15-200.us-west-2.compute.internal", "false")],

    Sorted = rabbit_peer_discovery_aws:sort_private_ip_addresses_by_primary(Unsorted),
    ?assertEqual(3, length(Sorted)),

    %% Compare address and flag of every entry, in order
    Summary = [{proplists:get_value("privateIpAddress", Props),
                proplists:get_value("primary", Props)} || {"item", Props} <- Sorted],
    ?assertEqual([{"10.0.12.112", "true"},
                  {"10.0.14.176", "false"},
                  {"10.0.15.200", "false"}], Summary).
166+
84167
lock_single_node(_Config) ->
85168
LocalNode = node(),
86169
Nodes = [LocalNode],
@@ -141,16 +224,30 @@ reservation_set() ->
141224
{"vpcId","vpc-4fe1562b"},
142225
{"networkInterfaceSet", [
143226
{"item",
144-
[{"association",
145-
[{"publicIp","203.0.113.11"},
146-
{"publicDnsName",
147-
"ec2-203-0-113-11.eu-west-1.compute.amazonaws.com"},
148-
{"ipOwnerId","amazon"}]}]},
149-
{"item",
150-
[{"association",
227+
[{"attachment", [{"deviceIndex", "1"}]},
228+
{"association",
151229
[{"publicIp","203.0.113.12"},
152230
{"publicDnsName",
153231
"ec2-203-0-113-12.eu-west-1.compute.amazonaws.com"},
232+
{"ipOwnerId","amazon"}]},
233+
{"privateIpAddressesSet", [
234+
{"item", [
235+
{"privateIpAddress", "10.0.15.101"},
236+
{"privateDnsName", "ip-10-0-15-101.eu-west-1.compute.internal"},
237+
{"primary", "false"}
238+
]},
239+
{"item", [
240+
{"privateIpAddress", "10.0.15.100"},
241+
{"privateDnsName", "ip-10-0-15-100.eu-west-1.compute.internal"},
242+
{"primary", "true"}
243+
]}
244+
]}]},
245+
{"item",
246+
[{"attachment", [{"deviceIndex", "0"}]},
247+
{"association",
248+
[{"publicIp","203.0.113.11"},
249+
{"publicDnsName",
250+
"ec2-203-0-113-11.eu-west-1.compute.amazonaws.com"},
154251
{"ipOwnerId","amazon"}]}]}]},
155252
{"privateIpAddress","10.0.16.29"}]}]}]},
156253
{"item", [{"reservationId","r-006cfdbf8d04c5f01"},
@@ -171,15 +268,24 @@ reservation_set() ->
171268
{"vpcId","vpc-4fe1562b"},
172269
{"networkInterfaceSet", [
173270
{"item",
174-
[{"association",
271+
[{"attachment", [{"deviceIndex", "0"}]},
272+
{"association",
175273
[{"publicIp","203.0.113.21"},
176274
{"publicDnsName",
177275
"ec2-203-0-113-21.eu-west-1.compute.amazonaws.com"},
178276
{"ipOwnerId","amazon"}]}]},
179277
{"item",
180-
[{"association",
278+
[{"attachment", [{"deviceIndex", "1"}]},
279+
{"association",
181280
[{"publicIp","203.0.113.22"},
182281
{"publicDnsName",
183282
"ec2-203-0-113-22.eu-west-1.compute.amazonaws.com"},
184-
{"ipOwnerId","amazon"}]}]}]},
283+
{"ipOwnerId","amazon"}]},
284+
{"privateIpAddressesSet", [
285+
{"item", [
286+
{"privateIpAddress", "10.0.16.31"},
287+
{"privateDnsName", "ip-10-0-16-31.eu-west-1.compute.internal"},
288+
{"primary", "true"}
289+
]}
290+
]}]}]},
185291
{"privateIpAddress","10.0.16.31"}]}]}]}].

0 commit comments

Comments
 (0)