Skip to content

Commit 80d4a09

Browse files
committed
HBASE-24128 [Flakey Tests] Add retry on thrift cmdline if client fails plus misc debug (#1442)
hbase-server/src/test/java/org/apache/hadoop/hbase/TestClusterPortAssignment.java
Saw a case where the Master failed startup but the failure came out as an IOException, so we did not trip the retry logic.

hbase-server/src/test/java/org/apache/hadoop/hbase/TestInfoServers.java
Add some debug and raise the timeouts. This test fails frequently for me locally.

hbase-server/src/test/java/org/apache/hadoop/hbase/client/locking/TestEntityLocks.java
Raise the wait from 2x 200ms to 10x in case of a pause caused by hardware or GC. This test fails locally and on Jenkins.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestClearRegionBlockCache.java
Debug. Have the assert say what the bad count was.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactingToCellFlatMapMemStore.java
Fails on occasion. Found the count is off by a few. Tricky to debug. HBASE-24129 is to re-enable it.

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionMergeTransactionOnCluster.java
Debug. Add a wait and check before moving on to the assert.

hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftHttpServer.java
Check for null before shutting down; the server can be null if startup failed.

hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftServerCmdLine.java
Add a retry if the client messes up the connection. Fails for me locally.
1 parent e71c442 commit 80d4a09

File tree

9 files changed

+66
-35
lines changed

9 files changed

+66
-35
lines changed

hbase-server/src/test/java/org/apache/hadoop/hbase/TestClusterPortAssignment.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ public void testClusterPortAssignment() throws Exception {
6565
cluster.getRegionServer(0).getRpcServer().getListenerAddress().getPort());
6666
assertEquals("RS info port is incorrect", rsInfoPort,
6767
cluster.getRegionServer(0).getInfoServer().getPort());
68-
} catch (BindException|UnsupportedOperationException e) {
68+
} catch (Exception e) {
6969
if (e instanceof BindException || e.getCause() != null &&
7070
(e.getCause() instanceof BindException || e.getCause().getCause() != null &&
7171
e.getCause().getCause() instanceof BindException)) {

hbase-server/src/test/java/org/apache/hadoop/hbase/TestInfoServers.java

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/**
1+
/*
22
* Licensed to the Apache Software Foundation (ASF) under one
33
* or more contributor license agreements. See the NOTICE file
44
* distributed with this work for additional information
@@ -20,12 +20,12 @@
2020
import static org.junit.Assert.assertEquals;
2121
import static org.junit.Assert.assertFalse;
2222
import static org.junit.Assert.assertTrue;
23-
2423
import java.io.IOException;
2524
import java.io.InputStream;
2625
import java.net.URL;
2726
import org.apache.commons.io.IOUtils;
2827
import org.apache.hadoop.hbase.client.Admin;
28+
import org.apache.hadoop.hbase.master.HMaster;
2929
import org.apache.hadoop.hbase.testclassification.MediumTests;
3030
import org.apache.hadoop.hbase.testclassification.MiscTests;
3131
import org.apache.hadoop.hbase.util.Bytes;
@@ -85,8 +85,8 @@ public void testGetMasterInfoPort() throws Exception {
8585
}
8686

8787
/**
88-
* Ensure when we go to top level index pages that we get redirected to an info-server specific status
89-
* page.
88+
* Ensure when we go to top level index pages that we get redirected to an info-server specific
89+
* status page.
9090
*/
9191
@Test
9292
public void testInfoServersRedirect() throws Exception {
@@ -121,9 +121,10 @@ public void testMasterServerReadOnly() throws Exception {
121121
byte[] cf = Bytes.toBytes("d");
122122
UTIL.createTable(tableName, cf);
123123
UTIL.waitTableAvailable(tableName);
124-
int port = UTIL.getHBaseCluster().getMaster().getInfoServer().getPort();
125-
assertDoesNotContainContent(new URL("http://localhost:" + port + "/table.jsp?name=" + tableName
126-
+ "&action=split&key="), "Table action request accepted");
124+
HMaster master = UTIL.getHBaseCluster().getMaster();
125+
int port = master.getRegionServerInfoPort(master.getServerName());
126+
assertDoesNotContainContent(new URL("http://localhost:" + port + "/table.jsp?name=" +
127+
tableName + "&action=split&key="), "Table action request accepted");
127128
assertDoesNotContainContent(
128129
new URL("http://localhost:" + port + "/table.jsp?name=" + tableName), "Actions:");
129130
}
@@ -143,11 +144,11 @@ private void assertDoesNotContainContent(final URL u, final String expected) thr
143144

144145
private String getUrlContent(URL u) throws IOException {
145146
java.net.URLConnection c = u.openConnection();
146-
c.setConnectTimeout(2000);
147-
c.setReadTimeout(2000);
147+
c.setConnectTimeout(20000);
148+
c.setReadTimeout(20000);
148149
c.connect();
149150
try (InputStream in = c.getInputStream()) {
150-
return IOUtils.toString(in);
151+
return IOUtils.toString(in, HConstants.UTF8_ENCODING);
151152
}
152153
}
153154
}

hbase-server/src/test/java/org/apache/hadoop/hbase/client/locking/TestEntityLocks.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -173,8 +173,8 @@ public void testEntityLockTimeout() throws Exception {
173173
lock.requestLock();
174174
lock.await();
175175
assertTrue(lock.isLocked());
176-
// Should get unlocked in next heartbeat i.e. after workerSleepTime. Wait 2x time.
177-
assertTrue(waitLockTimeOut(lock, 2 * workerSleepTime));
176+
// Should get unlocked in next heartbeat i.e. after workerSleepTime. Wait 10x time to be sure.
177+
assertTrue(waitLockTimeOut(lock, 10 * workerSleepTime));
178178
assertFalse(lock.getWorker().isAlive());
179179
verify(abortable, times(1)).abort(any(), eq(null));
180180
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestCloseRegionWhileRSCrash.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,16 @@
4646
import org.junit.ClassRule;
4747
import org.junit.Test;
4848
import org.junit.experimental.categories.Category;
49+
import org.slf4j.Logger;
50+
import org.slf4j.LoggerFactory;
4951

5052
/**
5153
* Confirm that we will do backoff when retrying on closing a region, to avoid consuming all the
5254
* CPUs.
5355
*/
5456
@Category({ MasterTests.class, MediumTests.class })
5557
public class TestCloseRegionWhileRSCrash {
58+
private static final Logger LOG = LoggerFactory.getLogger(TestCloseRegionWhileRSCrash.class);
5659

5760
@ClassRule
5861
public static final HBaseClassTestRule CLASS_RULE =
@@ -176,6 +179,7 @@ public void testRetryBackoff() throws IOException, InterruptedException {
176179
try {
177180
UTIL.getAdmin().move(region.getEncodedNameAsBytes(), dstRs.getServerName());
178181
} catch (IOException e) {
182+
LOG.info("Failed move of {}", region.getRegionNameAsString(), e);
179183
}
180184
});
181185
t.start();
@@ -185,12 +189,13 @@ public void testRetryBackoff() throws IOException, InterruptedException {
185189
// wait until the timeout value increase three times
186190
ProcedureTestUtil.waitUntilProcedureTimeoutIncrease(UTIL, TransitRegionStateProcedure.class, 3);
187191
// close connection to make sure that we can not finish the TRSP
188-
HMaster master = UTIL.getMiniHBaseCluster().getMaster();
192+
final HMaster master = UTIL.getMiniHBaseCluster().getMaster();
189193
master.getConnection().close();
190194
RESUME.countDown();
191195
UTIL.waitFor(30000, () -> !master.isAlive());
192196
// here we start a new master
193-
UTIL.getMiniHBaseCluster().startMaster();
197+
HMaster master2 = UTIL.getMiniHBaseCluster().startMaster().getMaster();
198+
LOG.info("Master2 {}, joining move thread", master2.getServerName());
194199
t.join();
195200
// Make sure that the region is online, it may not on the original target server, as we will set
196201
// forceNewPlan to true if there is a server crash

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestClearRegionBlockCache.java

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,10 @@ public void testClearBlockCache() throws Exception {
117117
HTU.getNumHFilesForRS(rs2, TABLE_NAME, FAMILY));
118118
clearRegionBlockCache(rs2);
119119

120-
assertEquals(initialBlockCount1, blockCache1.getBlockCount());
121-
assertEquals(initialBlockCount2, blockCache2.getBlockCount());
120+
assertEquals("" + blockCache1.getBlockCount(),
121+
initialBlockCount1, blockCache1.getBlockCount());
122+
assertEquals("" + blockCache2.getBlockCount(),
123+
initialBlockCount2, blockCache2.getBlockCount());
122124
}
123125

124126
@Test

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestCompactingToCellFlatMapMemStore.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -837,7 +837,9 @@ public void testFlatteningToJumboCellChunkMap() throws IOException {
837837
* testForceCopyOfBigCellIntoImmutableSegment checks that the
838838
* ImmutableMemStoreLAB's forceCopyOfBigCellInto does what it's supposed to do.
839839
*/
840-
@Test
840+
@org.junit.Ignore @Test // Flakey. Disabled by HBASE-24128. HBASE-24129 is for reenable.
841+
// TestCompactingToCellFlatMapMemStore.testForceCopyOfBigCellIntoImmutableSegment:902 i=1
842+
// expected:<8389924> but was:<8389992>
841843
public void testForceCopyOfBigCellIntoImmutableSegment() throws IOException {
842844

843845
if (toCellChunkMap == false) {

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestRegionMergeTransactionOnCluster.java

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,13 @@
2121
import static org.junit.Assert.assertFalse;
2222
import static org.junit.Assert.assertTrue;
2323
import static org.junit.Assert.fail;
24-
2524
import java.io.IOException;
2625
import java.util.ArrayList;
26+
import java.util.Arrays;
2727
import java.util.List;
2828
import java.util.Objects;
2929
import java.util.concurrent.atomic.AtomicBoolean;
30+
import java.util.stream.Collectors;
3031
import org.apache.commons.lang3.RandomUtils;
3132
import org.apache.hadoop.conf.Configuration;
3233
import org.apache.hadoop.fs.FileSystem;
@@ -65,6 +66,7 @@
6566
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
6667
import org.apache.hadoop.hbase.util.Pair;
6768
import org.apache.hadoop.hbase.util.PairOfSameType;
69+
import org.apache.hadoop.hbase.util.Threads;
6870
import org.apache.hadoop.util.StringUtils;
6971
import org.apache.zookeeper.KeeperException;
7072
import org.junit.AfterClass;
@@ -76,11 +78,9 @@
7678
import org.junit.rules.TestName;
7779
import org.slf4j.Logger;
7880
import org.slf4j.LoggerFactory;
79-
8081
import org.apache.hbase.thirdparty.com.google.common.base.Joiner;
8182
import org.apache.hbase.thirdparty.com.google.protobuf.RpcController;
8283
import org.apache.hbase.thirdparty.com.google.protobuf.ServiceException;
83-
8484
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.RegionStateTransition.TransitionCode;
8585
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionRequest;
8686
import org.apache.hadoop.hbase.shaded.protobuf.generated.RegionServerStatusProtos.ReportRegionStateTransitionResponse;
@@ -129,7 +129,9 @@ public static void beforeAllTests() throws Exception {
129129
@AfterClass
130130
public static void afterAllTests() throws Exception {
131131
TEST_UTIL.shutdownMiniCluster();
132-
if (ADMIN != null) ADMIN.close();
132+
if (ADMIN != null) {
133+
ADMIN.close();
134+
}
133135
}
134136

135137
@Test
@@ -285,8 +287,19 @@ public void testCleanMergeReference() throws Exception {
285287
// cleaned up by the time we got here making the test sometimes flakey.
286288
assertTrue(cleaned > 0);
287289

288-
mergedRegionResult = MetaTableAccessor.getRegionResult(
289-
TEST_UTIL.getConnection(), mergedRegionInfo.getRegionName());
290+
// Wait around a bit to give stuff a chance to complete.
291+
while (true) {
292+
mergedRegionResult = MetaTableAccessor
293+
.getRegionResult(TEST_UTIL.getConnection(), mergedRegionInfo.getRegionName());
294+
if (MetaTableAccessor.hasMergeRegions(mergedRegionResult.rawCells())) {
295+
LOG.info("Waiting on cleanup of merge columns {}",
296+
Arrays.asList(mergedRegionResult.rawCells()).stream().
297+
map(c -> c.toString()).collect(Collectors.joining(",")));
298+
Threads.sleep(50);
299+
} else {
300+
break;
301+
}
302+
}
290303
assertFalse(MetaTableAccessor.hasMergeRegions(mergedRegionResult.rawCells()));
291304
} finally {
292305
ADMIN.catalogJanitorSwitch(true);

hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftHttpServer.java

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -239,9 +239,13 @@ protected void talkToThriftServer(String url, int customHeaderSize) throws Excep
239239
}
240240

241241
private void stopHttpServerThread() throws Exception {
242-
LOG.debug("Stopping " + " Thrift HTTP server");
243-
thriftServer.stop();
244-
httpServerThread.join();
242+
LOG.debug("Stopping Thrift HTTP server {}", thriftServer);
243+
if (thriftServer != null) {
244+
thriftServer.stop();
245+
}
246+
if (httpServerThread != null) {
247+
httpServerThread.join();
248+
}
245249
if (httpServerException != null) {
246250
LOG.error("Command-line invocation of HBase Thrift server threw an " +
247251
"exception", httpServerException);

hbase-thrift/src/test/java/org/apache/hadoop/hbase/thrift/TestThriftServerCmdLine.java

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -297,14 +297,18 @@ private boolean isBindException(Exception cmdLineException) {
297297
@Test
298298
public void testRunThriftServer() throws Exception {
299299
ThriftServer thriftServer = createBoundServer();
300-
try {
301-
talkToThriftServer();
302-
} catch (Exception ex) {
303-
clientSideException = ex;
304-
LOG.info("Exception", ex);
305-
} finally {
306-
stopCmdLineThread();
307-
thriftServer.stop();
300+
// Add retries in case we see stuff like connection reset
301+
for (int i = 0; i < 10; i++) {
302+
try {
303+
talkToThriftServer();
304+
break;
305+
} catch (Exception ex) {
306+
clientSideException = ex;
307+
LOG.info("Exception", ex);
308+
} finally {
309+
stopCmdLineThread();
310+
thriftServer.stop();
311+
}
308312
}
309313

310314
if (clientSideException != null) {

0 commit comments

Comments
 (0)