Skip to content

Commit 0fea07a

Browse files
committed
GEODE-6423 availability checks sometimes immediately initiate removal
Do not loop in trying to form a tcp/ip connection to a suspect unless the next step is to remove the suspect from membership. In this case there will be another invocation of the same method that will take the removal step next. (cherry picked from commit 2e0a893)
1 parent 097353f commit 0fea07a

File tree

2 files changed

+20
-21
lines changed

2 files changed

+20
-21
lines changed

geode-core/src/integrationTest/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitorJUnitTest.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -816,7 +816,7 @@ public void run() {
816816
InternalDistributedMember otherMember =
817817
createInternalDistributedMember(Version.CURRENT_ORDINAL, 0, 1, 1);
818818
long startTime = System.currentTimeMillis();
819-
gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort());
819+
gmsHealthMonitor.doTCPCheckMember(otherMember, mySocket.getLocalPort(), true);
820820
mySocket.close();
821821
serverThread.interrupt();
822822
serverThread.join(getTimeout().getValueInMS());
@@ -902,7 +902,8 @@ public class GMSHealthMonitorTest extends GMSHealthMonitor {
902902
public boolean useBlockingSocket = false;
903903

904904
@Override
905-
boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
905+
boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
906+
boolean retryIfConnectFails) {
906907
if (useGMSHealthMonitorTestClass) {
907908
if (simulateHeartbeatInGMSHealthMonitorTestClass) {
908909
HeartbeatMessage fakeHeartbeat = new HeartbeatMessage();
@@ -911,7 +912,7 @@ boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
911912
}
912913
return false;
913914
}
914-
return super.doTCPCheckMember(suspectMember, port);
915+
return super.doTCPCheckMember(suspectMember, port, retryIfConnectFails);
915916
}
916917

917918
@Override

geode-core/src/main/java/org/apache/geode/distributed/internal/membership/gms/fd/GMSHealthMonitor.java

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -511,7 +511,8 @@ private boolean doCheckMember(InternalDistributedMember member, boolean waitForR
511511
* @param suspectMember member that does not respond to HeartbeatRequestMessage
512512
* @return true if successfully exchanged PING/PONG with TCP connection, otherwise false.
513513
*/
514-
boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
514+
boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port,
515+
boolean retryIfConnectFails) {
515516
Socket clientSocket = null;
516517
// make sure we try to check on the member for the contracted memberTimeout period
517518
// in case a timed socket.connect() returns immediately
@@ -555,7 +556,8 @@ boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
555556
// expected
556557
}
557558
}
558-
} while (!passed && !this.isShutdown() && System.nanoTime() < giveupTime);
559+
} while (retryIfConnectFails && !passed && !this.isShutdown()
560+
&& System.nanoTime() < giveupTime);
559561
return passed;
560562
}
561563

@@ -690,7 +692,6 @@ private void startTcpServer(ServerSocket ssocket) {
690692
if (!ssocket.isClosed()) {
691693
try {
692694
ssocket.close();
693-
logger.info("GMSHealthMonitor server socket closed.");
694695
} catch (IOException e) {
695696
logger.debug("Unexpected exception", e);
696697
}
@@ -857,7 +858,7 @@ protected synchronized void setNextNeighbor(NetView newView, InternalDistributed
857858
}
858859
InternalDistributedMember oldNeighbor = nextNeighbor;
859860
if (oldNeighbor != newNeighbor) {
860-
logger.info("Failure detection is now watching " + newNeighbor);
861+
logger.debug("Failure detection is now watching " + newNeighbor);
861862
nextNeighbor = newNeighbor;
862863
}
863864
}
@@ -928,7 +929,6 @@ private void stopServices() {
928929
if (serverSocket != null && !serverSocket.isClosed()) {
929930
try {
930931
serverSocket.close();
931-
logger.info("GMSHealthMonitor server socket is closed in stopServices().");
932932
} catch (IOException e) {
933933
logger.trace("Unexpected exception", e);
934934
}
@@ -939,8 +939,6 @@ private void stopServices() {
939939
} catch (InterruptedException e) {
940940
Thread.currentThread().interrupt();
941941
}
942-
logger.info("GMSHealthMonitor serverSocketExecutor is "
943-
+ (serverSocketExecutor.isTerminated() ? "terminated" : "not terminated"));
944942
}
945943
}
946944

@@ -1122,9 +1120,6 @@ private void processSuspectMembersRequest(SuspectMembersMessage incomingRequest)
11221120

11231121
NetView cv = currentView;
11241122

1125-
logger.info("Received suspect message {} with current view {}", incomingRequest,
1126-
cv == null ? "<no view>" : cv.getViewId());
1127-
11281123
if (cv == null) {
11291124
return;
11301125
}
@@ -1176,7 +1171,7 @@ private void processSuspectMembersRequest(SuspectMembersMessage incomingRequest)
11761171
synchronized (suspectRequestsInView) {
11771172
recordSuspectRequests(suspectRequests, cv);
11781173
Set<SuspectRequest> suspectsInView = suspectRequestsInView.get(cv);
1179-
logger.info("Current suspects are {}", suspectsInView);
1174+
logger.debug("Current suspects are {}", suspectsInView);
11801175
for (final SuspectRequest sr : suspectsInView) {
11811176
check.remove(sr.getSuspectMember());
11821177
membersToCheck.add(sr);
@@ -1189,10 +1184,10 @@ private void processSuspectMembersRequest(SuspectMembersMessage incomingRequest)
11891184
}
11901185
}
11911186
if (!membersLeaving.isEmpty()) {
1192-
logger.info("Current leave requests are {}", membersLeaving);
1187+
logger.debug("Current leave requests are {}", membersLeaving);
11931188
check.removeAll(membersLeaving);
11941189
}
1195-
logger.info(
1190+
logger.trace(
11961191
"Proposed view with suspects & leaving members removed is {}\nwith coordinator {}\nmy address is {}",
11971192
check,
11981193
check.getCoordinator(), localAddress);
@@ -1308,7 +1303,10 @@ protected boolean inlineCheckIfAvailable(final InternalDistributedMember initiat
13081303
// if we will get heartbeat then it will change the timestamp, which we are
13091304
// checking below in case of tcp check failure..
13101305
doCheckMember(mbr, false);
1311-
pinged = doTCPCheckMember(mbr, port);
1306+
// now, while waiting for a heartbeat, try connecting to the suspect's failure detection
1307+
// port
1308+
final boolean retryIfConnectFails = forceRemovalIfCheckFails;
1309+
pinged = doTCPCheckMember(mbr, port, retryIfConnectFails);
13121310
}
13131311

13141312
if (!pinged && !isStopping) {
@@ -1325,7 +1323,7 @@ protected boolean inlineCheckIfAvailable(final InternalDistributedMember initiat
13251323
} else {
13261324
// if this node can survive an availability check then initiate suspicion about
13271325
// the node that failed the availability check
1328-
if (doTCPCheckMember(localAddress, this.socketPort)) {
1326+
if (doTCPCheckMember(localAddress, this.socketPort, false)) {
13291327
membersInFinalCheck.remove(mbr);
13301328
// tell peers about this member and then perform another availability check
13311329
memberSuspected(localAddress, mbr, reason);
@@ -1335,7 +1333,7 @@ protected boolean inlineCheckIfAvailable(final InternalDistributedMember initiat
13351333
Collections
13361334
.singletonList(new SuspectRequest(mbr, "failed availability check")));
13371335
suspectMembersMessage.setSender(localAddress);
1338-
logger.info("Performing local processing on suspect request");
1336+
logger.debug("Performing local processing on suspect request");
13391337
processSuspectMembersRequest(suspectMembersMessage);
13401338
}
13411339
}
@@ -1398,7 +1396,7 @@ private void sendSuspectRequest(final List<SuspectRequest> requests) {
13981396
recipients = currentView.getMembers();
13991397
}
14001398

1401-
logger.info("Sending suspect messages to {}", recipients);
1399+
logger.trace("Sending suspect messages to {}", recipients);
14021400
SuspectMembersMessage smm = new SuspectMembersMessage(recipients, requests);
14031401
Set<InternalDistributedMember> failedRecipients;
14041402
try {
@@ -1409,7 +1407,7 @@ private void sendSuspectRequest(final List<SuspectRequest> requests) {
14091407
}
14101408

14111409
if (failedRecipients != null && failedRecipients.size() > 0) {
1412-
logger.info("Unable to send suspect message to {}", failedRecipients);
1410+
logger.trace("Unable to send suspect message to {}", failedRecipients);
14131411
}
14141412
}
14151413

0 commit comments

Comments
 (0)