Skip to content

Commit f0cbdf8

Browse files
committed
HBASE-22527 [hbck2] Add a master web ui to show the problematic regions
1 parent 2b7e33f commit f0cbdf8

File tree

3 files changed

+273
-12
lines changed

3 files changed

+273
-12
lines changed

hbase-server/src/main/jamon/org/apache/hadoop/hbase/tmpl/master/AssignmentManagerStatusTmpl.jamon

Lines changed: 89 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -17,27 +17,105 @@ See the License for the specific language governing permissions and
1717
limitations under the License.
1818
</%doc>
1919
<%import>
20-
org.apache.hadoop.hbase.master.assignment.AssignmentManager;
21-
org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
22-
org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
23-
org.apache.hadoop.hbase.master.RegionState;
20+
java.util.Map;
21+
java.util.Set;
22+
java.util.SortedSet;
23+
java.util.concurrent.atomic.AtomicInteger;
24+
java.util.stream.Collectors;
2425
org.apache.hadoop.conf.Configuration;
2526
org.apache.hadoop.hbase.HBaseConfiguration;
2627
org.apache.hadoop.hbase.HConstants;
28+
org.apache.hadoop.hbase.ServerName;
29+
org.apache.hadoop.hbase.client.RegionInfo;
2730
org.apache.hadoop.hbase.client.RegionInfoDisplay;
28-
java.util.HashSet;
29-
java.util.SortedSet;
30-
java.util.Map;
31-
java.util.concurrent.atomic.AtomicInteger;
31+
org.apache.hadoop.hbase.master.RegionState;
32+
org.apache.hadoop.hbase.master.assignment.AssignmentManager;
33+
org.apache.hadoop.hbase.master.assignment.AssignmentManager.RegionInTransitionStat;
34+
org.apache.hadoop.hbase.master.assignment.RegionStates.RegionFailedOpen;
35+
org.apache.hadoop.hbase.util.Pair;
3236
</%import>
3337
<%args>
3438
AssignmentManager assignmentManager;
3539
int limit = 100;
3640
</%args>
3741

38-
<%java SortedSet<RegionState> rit = assignmentManager
39-
.getRegionStates().getRegionsInTransitionOrderedByTimestamp();
40-
%>
42+
<%java>
43+
SortedSet<RegionState> rit = assignmentManager.getRegionStates()
44+
.getRegionsInTransitionOrderedByTimestamp();
45+
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = assignmentManager
46+
.getProblematicRegions();
47+
</%java>
48+
49+
<%if !problematicRegions.isEmpty() %>
50+
<%java>
51+
int totalSize = problematicRegions.size();
52+
int sizePerPage = Math.min(10, totalSize);
53+
int numOfPages = (int) Math.ceil(totalSize * 1.0 / sizePerPage);
54+
</%java>
55+
<section>
56+
<h2><a name="problem-regions">Problematic Regions</a></h2>
57+
<p>
58+
<span>
59+
<% problematicRegions.size() %> problematic region(s). Notice: the reported online
60+
regionservers may be not right when there are regions in transition. Please check them
61+
in regionserver's web UI.
62+
</span>
63+
</p>
64+
<div class="tabbable">
65+
<div class="tab-content">
66+
<%java int recordItr = 0; %>
67+
<%for Map.Entry<String, Pair<ServerName, Set<ServerName>>> entry : problematicRegions.entrySet() %>
68+
<%if (recordItr % sizePerPage) == 0 %>
69+
<%if recordItr == 0 %>
70+
<div class="tab-pane active" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
71+
<%else>
72+
<div class="tab-pane" id="tab_prs<% (recordItr / sizePerPage) + 1 %>">
73+
</%if>
74+
<table class="table table-striped" style="margin-bottom:0px;">
75+
<tr>
76+
<th>Region</th>
77+
<th>Location in META</th>
78+
<th>Reported Online Region Servers</th>
79+
</tr>
80+
</%if>
81+
82+
<tr>
83+
<td><% entry.getKey() %></td>
84+
<td><% entry.getValue().getFirst() %></td>
85+
<td><% entry.getValue().getSecond().stream().map(ServerName::getServerName)
86+
.collect(Collectors.joining(", ")) %></td>
87+
</tr>
88+
<%java recordItr++; %>
89+
<%if (recordItr % sizePerPage) == 0 %>
90+
</table>
91+
</div>
92+
</%if>
93+
</%for>
94+
95+
<%if (recordItr % sizePerPage) != 0 %>
96+
<%for ; (recordItr % sizePerPage) != 0 ; recordItr++ %>
97+
<tr><td colspan="3" style="height:61px"></td></tr>
98+
</%for>
99+
</table>
100+
</div>
101+
</%if>
102+
103+
</div>
104+
<nav>
105+
<ul class="nav nav-pills pagination">
106+
<%for int i = 1 ; i <= numOfPages; i++ %>
107+
<%if i == 1 %>
108+
<li class="active">
109+
<%else>
110+
<li>
111+
</%if>
112+
<a href="#tab_prs<% i %>"><% i %></a></li>
113+
</%for>
114+
</ul>
115+
</nav>
116+
</div>
117+
</section>
118+
</%if>
41119

42120
<%if !rit.isEmpty() %>
43121
<%java>

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,8 @@ public class AssignmentManager {
158158
private final RegionStates regionStates = new RegionStates();
159159
private final RegionStateStore regionStateStore;
160160

161+
private final Map<ServerName, Set<byte[]>> rsReports = new HashMap<>();
162+
161163
private final boolean shouldAssignRegionsWithFavoredNodes;
162164
private final int assignDispatchWaitQueueMaxSize;
163165
private final int assignDispatchWaitMillis;
@@ -1065,13 +1067,18 @@ public void reportOnlineRegions(ServerName serverName, Set<byte[]> regionNames)
10651067
}
10661068

10671069
ServerStateNode serverNode = regionStates.getOrCreateServer(serverName);
1068-
10691070
synchronized (serverNode) {
10701071
if (!serverNode.isInState(ServerState.ONLINE)) {
10711072
LOG.warn("Got a report from a server result in state " + serverNode.getState());
10721073
return;
10731074
}
10741075
}
1076+
1077+
// Track the regionserver reported online regions in memory.
1078+
synchronized (rsReports) {
1079+
rsReports.put(serverName, regionNames);
1080+
}
1081+
10751082
if (regionNames.isEmpty()) {
10761083
// nothing to do if we don't have regions
10771084
LOG.trace("no online region found on {}", serverName);
@@ -2028,4 +2035,53 @@ public List<ServerName> getExcludedServersForSystemTable() {
20282035
MasterServices getMaster() {
20292036
return master;
20302037
}
2038+
2039+
/**
2040+
* Finds the potentially problematic opened regions. There are three cases:
2041+
* case 1. Master thought this region opened, but no regionserver reported it.
2042+
* case 2. Master thought this region opened on Server1, but regionserver reported Server2
2043+
* case 3. More than one regionserver reported this region as opened.
2044+
*
2045+
* @return the map of potentially problematic opened regions. Key is the region name. Value is
2046+
* a pair of the location in meta and the regionservers which reported this region as opened.
2047+
*/
2048+
public Map<String, Pair<ServerName, Set<ServerName>>> getProblematicRegions() {
2049+
Map<String, Set<ServerName>> reportedOnlineRegions = new HashMap<>();
2050+
synchronized (rsReports) {
2051+
for (Map.Entry<ServerName, Set<byte[]>> entry : rsReports.entrySet()) {
2052+
for (byte[] regionName : entry.getValue()) {
2053+
reportedOnlineRegions
2054+
.computeIfAbsent(RegionInfo.getRegionNameAsString(regionName), r -> new HashSet<>())
2055+
.add(entry.getKey());
2056+
}
2057+
}
2058+
}
2059+
2060+
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = new HashMap<>();
2061+
List<RegionState> rits = regionStates.getRegionsStateInTransition();
2062+
for (RegionState regionState : regionStates.getRegionStates()) {
2063+
// Only consider regions that are opened and not in transition
2064+
if (!rits.contains(regionState) && regionState.isOpened()) {
2065+
String regionName = regionState.getRegion().getRegionNameAsString();
2066+
ServerName serverName = regionState.getServerName();
2067+
if (reportedOnlineRegions.containsKey(regionName)) {
2068+
Set<ServerName> reportedServers = reportedOnlineRegions.get(regionName);
2069+
if (reportedServers.contains(serverName)) {
2070+
if (reportedServers.size() > 1) {
2071+
// More than one regionserver reported this region as opened
2072+
problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
2073+
}
2074+
} else {
2075+
// Master thought this region opened on Server1, but regionserver reported Server2
2076+
problematicRegions.put(regionName, new Pair<>(serverName, reportedServers));
2077+
}
2078+
} else {
2079+
// Master thought this region opened, but no regionserver reported it.
2080+
problematicRegions.put(regionName, new Pair<>(serverName, new HashSet<>()));
2081+
}
2082+
}
2083+
}
2084+
2085+
return problematicRegions;
2086+
}
20312087
}
Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/**
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master.assignment;
19+
20+
import static org.junit.Assert.assertEquals;
21+
import static org.junit.Assert.assertFalse;
22+
import static org.junit.Assert.assertTrue;
23+
24+
import java.util.Collections;
25+
import java.util.List;
26+
import java.util.Map;
27+
import java.util.Set;
28+
import java.util.concurrent.Future;
29+
30+
import org.apache.hadoop.hbase.HBaseClassTestRule;
31+
import org.apache.hadoop.hbase.ServerName;
32+
import org.apache.hadoop.hbase.TableName;
33+
import org.apache.hadoop.hbase.client.RegionInfo;
34+
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
35+
import org.apache.hadoop.hbase.testclassification.MasterTests;
36+
import org.apache.hadoop.hbase.testclassification.MediumTests;
37+
import org.apache.hadoop.hbase.util.Pair;
38+
import org.junit.ClassRule;
39+
import org.junit.Test;
40+
import org.junit.experimental.categories.Category;
41+
import org.slf4j.Logger;
42+
import org.slf4j.LoggerFactory;
43+
44+
@Category({ MasterTests.class, MediumTests.class })
45+
public class TestAMProblematicRegions extends TestAssignmentManagerBase {
46+
private static final Logger LOG = LoggerFactory.getLogger(TestAMProblematicRegions.class);
47+
48+
@ClassRule
49+
public static final HBaseClassTestRule CLASS_RULE =
50+
HBaseClassTestRule.forClass(TestAMProblematicRegions.class);
51+
52+
@Test
53+
public void testForMeta() {
54+
byte[] metaRegionNameAsBytes = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionName();
55+
String metaRegionName = RegionInfoBuilder.FIRST_META_REGIONINFO.getRegionNameAsString();
56+
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
57+
assertEquals(NSERVERS, serverNames.size());
58+
59+
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
60+
61+
// Test for case1: Master thought this region opened, but no regionserver reported it.
62+
assertTrue(problematicRegions.containsKey(metaRegionName));
63+
Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(metaRegionName);
64+
ServerName locationInMeta = pair.getFirst();
65+
Set<ServerName> reportedRegionServers = pair.getSecond();
66+
assertTrue(serverNames.contains(locationInMeta));
67+
assertEquals(0, reportedRegionServers.size());
68+
69+
// Reported right region location. Then not in problematic regions.
70+
am.reportOnlineRegions(locationInMeta, Collections.singleton(metaRegionNameAsBytes));
71+
problematicRegions = am.getProblematicRegions();
72+
assertFalse(problematicRegions.containsKey(metaRegionName));
73+
}
74+
75+
@Test
76+
public void testForUserTable() throws Exception {
77+
TableName tableName = TableName.valueOf("testForUserTable");
78+
RegionInfo hri = createRegionInfo(tableName, 1);
79+
String regionName = hri.getRegionNameAsString();
80+
rsDispatcher.setMockRsExecutor(new GoodRsExecutor());
81+
Future<byte[]> future = submitProcedure(createAssignProcedure(hri));
82+
waitOnFuture(future);
83+
84+
List<ServerName> serverNames = master.getServerManager().getOnlineServersList();
85+
assertEquals(NSERVERS, serverNames.size());
86+
87+
// Test for case1: Master thought this region opened, but no regionserver reported it.
88+
Map<String, Pair<ServerName, Set<ServerName>>> problematicRegions = am.getProblematicRegions();
89+
assertTrue(problematicRegions.containsKey(regionName));
90+
Pair<ServerName, Set<ServerName>> pair = problematicRegions.get(regionName);
91+
ServerName locationInMeta = pair.getFirst();
92+
Set<ServerName> reportedRegionServers = pair.getSecond();
93+
assertTrue(serverNames.contains(locationInMeta));
94+
assertEquals(0, reportedRegionServers.size());
95+
96+
// Test for case2: Master thought this region opened on Server1, but regionserver reported
97+
// Server2
98+
final ServerName tempLocationInMeta = locationInMeta;
99+
final ServerName anotherServer =
100+
serverNames.stream().filter(s -> !s.equals(tempLocationInMeta)).findFirst().get();
101+
am.reportOnlineRegions(anotherServer, Collections.singleton(hri.getRegionName()));
102+
problematicRegions = am.getProblematicRegions();
103+
assertTrue(problematicRegions.containsKey(regionName));
104+
pair = problematicRegions.get(regionName);
105+
locationInMeta = pair.getFirst();
106+
reportedRegionServers = pair.getSecond();
107+
assertEquals(1, reportedRegionServers.size());
108+
assertFalse(reportedRegionServers.contains(locationInMeta));
109+
assertTrue(reportedRegionServers.contains(anotherServer));
110+
111+
// Test for case3: More than one regionserver reported this region as opened.
112+
am.reportOnlineRegions(locationInMeta, Collections.singleton(hri.getRegionName()));
113+
problematicRegions = am.getProblematicRegions();
114+
assertTrue(problematicRegions.containsKey(regionName));
115+
pair = problematicRegions.get(regionName);
116+
locationInMeta = pair.getFirst();
117+
reportedRegionServers = pair.getSecond();
118+
assertEquals(2, reportedRegionServers.size());
119+
assertTrue(reportedRegionServers.contains(locationInMeta));
120+
assertTrue(reportedRegionServers.contains(anotherServer));
121+
122+
// The other server now reports no online regions, so only the location in meta reports the
// region and it is no longer in the problematic regions.
123+
am.reportOnlineRegions(anotherServer, Collections.EMPTY_SET);
124+
problematicRegions = am.getProblematicRegions();
125+
assertFalse(problematicRegions.containsKey(regionName));
126+
}
127+
}

0 commit comments

Comments
 (0)