Skip to content

Commit 0b6fc5d

Browse files
committed
HBASE-22527 [hbck2] Add a master web ui to show the problematic regions
1 parent e5f05bf commit 0b6fc5d

File tree

3 files changed

+264
-12
lines changed

3 files changed

+264
-12
lines changed

hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon

Lines changed: 82 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,27 +17,98 @@ See the License for the specific language governing permissions and
1717
limitations under the License.
1818
</%doc>
1919
<%import>
20-
org.apache.hadoop.hbase.master.assignment.AssignmentManager;
21-
org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
22-
org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
23-
org.apache.hadoop.hbase.master.RegionState;
20+
java.util.Map;
21+
java.util.Set;
22+
java.util.SortedSet;
23+
java.util.concurrent.atomic.AtomicInteger;
24+
java.util.stream.Collectors;
2425
org.apache.hadoop.conf.Configuration;
2526
org.apache.hadoop.hbase.HBaseConfiguration;
2627
org.apache.hadoop.hbase.HConstants;
28+
org.apache.hadoop.hbase.ServerName;
29+
org.apache.hadoop.hbase.client.RegionInfo;
2730
org.apache.hadoop.hbase.client.RegionInfoDisplay;
28-
java.util.HashSet;
29-
java.util.SortedSet;
30-
java.util.Map;
31-
java.util.concurrent.atomic.AtomicInteger;
31+
org.apache.hadoop.hbase.master.RegionState;
32+
org.apache.hadoop.hbase.master.assignment.AssignmentManager;
33+
org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
34+
org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
35+
org.apache.hadoop.hbase.util.Pair;
3236
</%import>
3337
<%args>
3438
AssignmentManager assignmentManager;
3539
int limit = 100;
3640
</%args>
3741

38-
<%java SortedSet<RegionState> rit = assignmentManager
39-
.getRegionStates().getRegionsInTransitionOrderedByTimestamp();
40-
%>
42+
<%java>
// Regions currently in transition, ordered by timestamp (rendered further down the page).
SortedSet<RegionState> rit = assignmentManager.getRegionStates()
    .getRegionsInTransitionOrderedByTimestamp();
// Region name -> (server recorded by the master, servers that reported the region online).
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = assignmentManager
    .getProblematicRegions();
</%java>

<%if !problematicRegions.isEmpty() %>
<%java>
// Paging: at most 10 regions per tab. The map is non-empty here, so sizePerPage >= 1
// and the division below cannot divide by zero.
int totalSize = problematicRegions.size();
int sizePerPage = Math.min(10, totalSize);
// Integer ceiling division; avoids the round trip through double.
int numOfPages = (totalSize + sizePerPage - 1) / sizePerPage;
// NOTE(review): the anchor name "rit" below looks copied from the Regions in Transition
// section -- confirm it does not collide with that section's anchor.
</%java>
<section>
  <h2><a name="rit">Problematic Regions</a></h2>
  <div class="tabbable">
    <div class="tab-content">
      <%java int recordItr = 0; %>
      <%for Map.Entry<String, Pair<ServerName, Set<ServerName>>> entry : problematicRegions.entrySet() %>
        <%if (recordItr % sizePerPage) == 0 %>
          <%if recordItr == 0 %>
      <div class="tab-pane active" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
          <%else>
      <div class="tab-pane" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
          </%if>
        <table class="table table-striped" style="margin-bottom:0px;">
          <tr>
            <th>Region</th>
            <th>Location in META</th>
            <th>Reported Online Region Servers</th>
          </tr>
        </%if>

          <tr>
            <td><% entry.getKey() %></td>
            <td><% entry.getValue().getFirst() %></td>
            <td><% entry.getValue().getSecond().stream().map(ServerName::getServerName)
              .collect(Collectors.joining(", ")) %></td>
          </tr>
        <%java recordItr++; %>
        <%if (recordItr % sizePerPage) == 0 %>
        </table>
      </div>
        </%if>
      </%for>

      <%if (recordItr % sizePerPage) != 0 %>
        <%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %>
          <tr><td colspan="3" style="height:61px"></td></tr>
        </%for>
        </table>
      </div>
      </%if>

    </div>
    <nav>
      <ul class="nav nav-pills pagination">
      <%for int i = 1 ; i <= numOfPages; i++ %>
        <%if i == 1 %>
        <li class="active">
        <%else>
        <li>
        </%if>
        <a href="#tab_prs<% i %>"><% i %></a></li>
      </%for>
      </ul>
    </nav>
  </div>
</section>
</%if>
41112

42113
<%if !rit.isEmpty() %>
43114
<%java>

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,8 @@ public class AssignmentManager {
158158
private final RegionStates regionStates = new RegionStates();
159159
private final RegionStateStore regionStateStore;
160160

161+
// Most recent set of online region names reported by each regionserver. A new report for a
// server replaces the previous one. Accessed under synchronized (rsReports).
private final Map<ServerName, Set<byte[]>> rsReports = new HashMap<>();
162+
161163
private final boolean shouldAssignRegionsWithFavoredNodes;
162164
private final int assignDispatchWaitQueueMaxSize;
163165
private final int assignDispatchWaitMillis;
@@ -1065,13 +1067,18 @@ public void reportOnlineRegions(ServerName serverName, Set<byte[]> regionNames)
10651067
}
10661068

10671069
ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
1068-
10691070
synchronized (serverNode) {
10701071
if (!serverNode.isInState(ServerState.ONLINE)) {
10711072
LOG.warn("Got a report from a server result in state " + serverNode.getState());
10721073
return;
10731074
}
10741075
}
1076+
1077+
// Track the regionserver reported online regions in memory.
1078+
synchronized (rsReports) {
1079+
rsReports.put(serverName, regionNames);
1080+
}
1081+
10751082
if (regionNames.isEmpty()) {
10761083
// nothing to do if we don't have regions
10771084
LOG.trace("no online region found on {}", serverName);
@@ -2028,4 +2035,51 @@ public List<ServerName> getExcludedServersForSystemTable() {
20282035
/** Returns the {@code master} reference held by this assignment manager. */
MasterServices getMaster() {
20292036
return master;
20302037
}
2038+
2039+
/**
2040+
* Found the potentially problematic opened regions. There are three case:
2041+
* case 1. Master thought this region opened, but no regionserver reported it.
2042+
* case 2. Master thought this region opened on Server1, but regionserver reported Server2
2043+
* case 3. More than one regionservers reported opened this region
2044+
*
2045+
* @return the map of potentially problematic opened regions. The key is the region name. The
2046+
* value is a pair of location in meta and the regionservers which reported opened this region.
2047+
*/
2048+
public Map<String, Pair<ServerName, Set<ServerName>>> getProblematicRegions() {
2049+
Map<String, Set<ServerName>> reportedOnlineRegions = new HashMap<>();
2050+
synchronized (rsReports) {
2051+
for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
2052+
for (byte[] regionName : entry.getValue()) {
2053+
reportedOnlineRegions
2054+
.computeIfAbsent(RegionInfo.getRegionNameAsString(regionName), r -> new HashSet<>())
2055+
.add(entry.getKey());
2056+
}
2057+
}
2058+
}
2059+
2060+
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = new HashMap<>();
2061+
for (RegionState regionState : regionStates.getRegionStates()) {
2062+
if (regionState.isOpened()) {
2063+
String regionName = regionState.getRegion().getRegionNameAsString();
2064+
ServerName serverName = regionState.getServerName();
2065+
if (reportedOnlineRegions.containsKey(regionName)) {
2066+
Set<ServerName> reportedServers = reportedOnlineRegions.get(regionName);
2067+
if (reportedServers.contains(serverName)) {
2068+
if (reportedServers.size() > 1) {
2069+
// More than one regionserver reported opened this region
2070+
problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
2071+
}
2072+
} else {
2073+
// Master thought this region opened on Server1, but regionserver reported Server2
2074+
problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
2075+
}
2076+
} else {
2077+
// Master thought this region opened, but no regionserver reported it.
2078+
problematicRegions.put(regionName, new Pair<>(serverName, new HashSet<>()));
2079+
}
2080+
}
2081+
}
2082+
2083+
return problematicRegions;
2084+
}
20312085
}
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master.assignment;
19+
20+
import static org.junit.Assert.assertEquals;
21+
import static org.junit.Assert.assertFalse;
22+
import static org.junit.Assert.assertTrue;
23+
24+
import java.util.Collections;
25+
import java.util.List;
26+
import java.util.Map;
27+
import java.util.Set;
28+
import java.util.concurrent.Future;
29+
30+
import org.apache.hadoop.hbase.HBaseClassTestRule;
31+
import org.apache.hadoop.hbase.ServerName;
32+
import org.apache.hadoop.hbase.TableName;
33+
import org.apache.hadoop.hbase.client.RegionInfo;
34+
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
35+
import org.apache.hadoop.hbase.testclassification.MasterTests;
36+
import org.apache.hadoop.hbase.testclassification.MediumTests;
37+
import org.apache.hadoop.hbase.util.Pair;
38+
import org.junit.ClassRule;
39+
import org.junit.Test;
40+
import org.junit.experimental.categories.Category;
41+
import org.slf4j.Logger;
42+
import org.slf4j.LoggerFactory;
43+
44+
@Category({ MasterTests.class, MediumTests.class })
45+
public class TestAMProblematicRegions extends TestAssignmentManagerBase {
46+
private static final Logger LOG = LoggerFactory.getLogger(TestAMProblematicRegions.class);
47+
48+
@ClassRule
49+
public static final HBaseClassTestRule CLASS_RULE =
50+
HBaseClassTestRule.forClass(TestAMProblematicRegions.class);
51+
52+
@Test
53+
public void testForMeta() {
54+
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
55+
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
56+
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
57+
assertEquals(NSERVERS, serverNames.size());
58+
59+
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
60+
61+
// Test for case1: Master thought this region opened, but no regionserver reported it.
62+
assertTrue(problematicRegions.containsKey(metaRegionName));
63+
Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(metaRegionName);
64+
ServerName locationInMeta = pair.getFirst();
65+
Set<ServerName> reportedRegionServers = pair.getSecond();
66+
assertTrue(serverNames.contains(locationInMeta));
67+
assertEquals(0, reportedRegionServers.size());
68+
69+
// Reported right region location. Then not in problematic regions.
70+
am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
71+
problematicRegions = am.getProblematicRegions();
72+
assertFalse(problematicRegions.containsKey(metaRegionName));
73+
}
74+
75+
@Test
76+
public void testForUserTable() throws Exception {
77+
TableName tableName = TableName.valueOf("testForUserTable");
78+
RegionInfo hri = createRegionInfo(tableName, 1);
79+
String regionName = hri.getRegionNameAsString();
80+
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
81+
Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
82+
waitOnFuture(future);
83+
84+
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
85+
assertEquals(NSERVERS, serverNames.size());
86+
87+
// Test for case1: Master thought this region opened, but no regionserver reported it.
88+
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
89+
assertTrue(problematicRegions.containsKey(regionName));
90+
Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(regionName);
91+
ServerName locationInMeta = pair.getFirst();
92+
Set<ServerName> reportedRegionServers = pair.getSecond();
93+
assertTrue(serverNames.contains(locationInMeta));
94+
assertEquals(0, reportedRegionServers.size());
95+
96+
// Test for case2: Master thought this region opened on Server1, but regionserver reported
97+
// Server2
98+
final ServerName tempLocationInMeta = locationInMeta;
99+
final ServerName anotherServer =
100+
serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get();
101+
am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
102+
problematicRegions = am.getProblematicRegions();
103+
assertTrue(problematicRegions.containsKey(regionName));
104+
pair = problematicRegions.get(regionName);
105+
locationInMeta = pair.getFirst();
106+
reportedRegionServers = pair.getSecond();
107+
assertEquals(1, reportedRegionServers.size());
108+
assertFalse(reportedRegionServers.contains(locationInMeta));
109+
assertTrue(reportedRegionServers.contains(anotherServer));
110+
111+
// Test for case3: More than one regionservers reported opened this region.
112+
am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
113+
problematicRegions = am.getProblematicRegions();
114+
assertTrue(problematicRegions.containsKey(regionName));
115+
pair = problematicRegions.get(regionName);
116+
locationInMeta = pair.getFirst();
117+
reportedRegionServers = pair.getSecond();
118+
assertEquals(2, reportedRegionServers.size());
119+
assertTrue(reportedRegionServers.contains(locationInMeta));
120+
assertTrue(reportedRegionServers.contains(anotherServer));
121+
122+
// Reported right region location. Then not in problematic regions.
123+
am.reportOnlineRegions(anotherServer, Collections.EMPTY_SET);
124+
problematicRegions = am.getProblematicRegions();
125+
assertFalse(problematicRegions.containsKey(regionName));
126+
}
127+
}

0 commit comments

Comments
 (0)