@@ -511,7 +511,8 @@ private boolean doCheckMember(InternalDistributedMember member, boolean waitForR
511511 * @param suspectMember member that does not respond to HeartbeatRequestMessage
512512 * @return true if successfully exchanged PING/PONG with TCP connection, otherwise false.
513513 */
514- boolean doTCPCheckMember (InternalDistributedMember suspectMember , int port ) {
514+ boolean doTCPCheckMember (InternalDistributedMember suspectMember , int port ,
515+ boolean retryIfConnectFails ) {
515516 Socket clientSocket = null ;
516517 // make sure we try to check on the member for the contracted memberTimeout period
517518 // in case a timed socket.connect() returns immediately
@@ -555,7 +556,8 @@ boolean doTCPCheckMember(InternalDistributedMember suspectMember, int port) {
555556 // expected
556557 }
557558 }
558- } while (!passed && !this .isShutdown () && System .nanoTime () < giveupTime );
559+ } while (retryIfConnectFails && !passed && !this .isShutdown ()
560+ && System .nanoTime () < giveupTime );
559561 return passed ;
560562 }
561563
@@ -690,7 +692,6 @@ private void startTcpServer(ServerSocket ssocket) {
690692 if (!ssocket .isClosed ()) {
691693 try {
692694 ssocket .close ();
693- logger .info ("GMSHealthMonitor server socket closed." );
694695 } catch (IOException e ) {
695696 logger .debug ("Unexpected exception" , e );
696697 }
@@ -857,7 +858,7 @@ protected synchronized void setNextNeighbor(NetView newView, InternalDistributed
857858 }
858859 InternalDistributedMember oldNeighbor = nextNeighbor ;
859860 if (oldNeighbor != newNeighbor ) {
860- logger .info ("Failure detection is now watching " + newNeighbor );
861+ logger .debug ("Failure detection is now watching " + newNeighbor );
861862 nextNeighbor = newNeighbor ;
862863 }
863864 }
@@ -928,7 +929,6 @@ private void stopServices() {
928929 if (serverSocket != null && !serverSocket .isClosed ()) {
929930 try {
930931 serverSocket .close ();
931- logger .info ("GMSHealthMonitor server socket is closed in stopServices()." );
932932 } catch (IOException e ) {
933933 logger .trace ("Unexpected exception" , e );
934934 }
@@ -939,8 +939,6 @@ private void stopServices() {
939939 } catch (InterruptedException e ) {
940940 Thread .currentThread ().interrupt ();
941941 }
942- logger .info ("GMSHealthMonitor serverSocketExecutor is "
943- + (serverSocketExecutor .isTerminated () ? "terminated" : "not terminated" ));
944942 }
945943 }
946944
@@ -1122,9 +1120,6 @@ private void processSuspectMembersRequest(SuspectMembersMessage incomingRequest)
11221120
11231121 NetView cv = currentView ;
11241122
1125- logger .info ("Received suspect message {} with current view {}" , incomingRequest ,
1126- cv == null ? "<no view>" : cv .getViewId ());
1127-
11281123 if (cv == null ) {
11291124 return ;
11301125 }
@@ -1176,7 +1171,7 @@ private void processSuspectMembersRequest(SuspectMembersMessage incomingRequest)
11761171 synchronized (suspectRequestsInView ) {
11771172 recordSuspectRequests (suspectRequests , cv );
11781173 Set <SuspectRequest > suspectsInView = suspectRequestsInView .get (cv );
1179- logger .info ("Current suspects are {}" , suspectsInView );
1174+ logger .debug ("Current suspects are {}" , suspectsInView );
11801175 for (final SuspectRequest sr : suspectsInView ) {
11811176 check .remove (sr .getSuspectMember ());
11821177 membersToCheck .add (sr );
@@ -1189,10 +1184,10 @@ private void processSuspectMembersRequest(SuspectMembersMessage incomingRequest)
11891184 }
11901185 }
11911186 if (!membersLeaving .isEmpty ()) {
1192- logger .info ("Current leave requests are {}" , membersLeaving );
1187+ logger .debug ("Current leave requests are {}" , membersLeaving );
11931188 check .removeAll (membersLeaving );
11941189 }
1195- logger .info (
1190+ logger .trace (
11961191 "Proposed view with suspects & leaving members removed is {}\n with coordinator {}\n my address is {}" ,
11971192 check ,
11981193 check .getCoordinator (), localAddress );
@@ -1308,7 +1303,10 @@ protected boolean inlineCheckIfAvailable(final InternalDistributedMember initiat
13081303 // if we will get heartbeat then it will change the timestamp, which we are
13091304 // checking below in case of tcp check failure..
13101305 doCheckMember (mbr , false );
1311- pinged = doTCPCheckMember (mbr , port );
1306+ // now, while waiting for a heartbeat, try connecting to the suspect's failure detection
1307+ // port
1308+ final boolean retryIfConnectFails = forceRemovalIfCheckFails ;
1309+ pinged = doTCPCheckMember (mbr , port , retryIfConnectFails );
13121310 }
13131311
13141312 if (!pinged && !isStopping ) {
@@ -1325,7 +1323,7 @@ protected boolean inlineCheckIfAvailable(final InternalDistributedMember initiat
13251323 } else {
13261324 // if this node can survive an availability check then initiate suspicion about
13271325 // the node that failed the availability check
1328- if (doTCPCheckMember (localAddress , this .socketPort )) {
1326+ if (doTCPCheckMember (localAddress , this .socketPort , false )) {
13291327 membersInFinalCheck .remove (mbr );
13301328 // tell peers about this member and then perform another availability check
13311329 memberSuspected (localAddress , mbr , reason );
@@ -1335,7 +1333,7 @@ protected boolean inlineCheckIfAvailable(final InternalDistributedMember initiat
13351333 Collections
13361334 .singletonList (new SuspectRequest (mbr , "failed availability check" )));
13371335 suspectMembersMessage .setSender (localAddress );
1338- logger .info ("Performing local processing on suspect request" );
1336+ logger .debug ("Performing local processing on suspect request" );
13391337 processSuspectMembersRequest (suspectMembersMessage );
13401338 }
13411339 }
@@ -1398,7 +1396,7 @@ private void sendSuspectRequest(final List<SuspectRequest> requests) {
13981396 recipients = currentView .getMembers ();
13991397 }
14001398
1401- logger .info ("Sending suspect messages to {}" , recipients );
1399+ logger .trace ("Sending suspect messages to {}" , recipients );
14021400 SuspectMembersMessage smm = new SuspectMembersMessage (recipients , requests );
14031401 Set <InternalDistributedMember > failedRecipients ;
14041402 try {
@@ -1409,7 +1407,7 @@ private void sendSuspectRequest(final List<SuspectRequest> requests) {
14091407 }
14101408
14111409 if (failedRecipients != null && failedRecipients .size () > 0 ) {
1412- logger .info ("Unable to send suspect message to {}" , failedRecipients );
1410+ logger .trace ("Unable to send suspect message to {}" , failedRecipients );
14131411 }
14141412 }
14151413
0 commit comments