Skip to content

Commit db7fb06

Browse files
committed
HBASE-23596 HBCKServerCrashProcedure can double assign
Signed-off-by: Duo Zhang <zhangduo@apache.org> Signed-off-by: Lijin Bin <binlijin@apache.org> Signed-off-by: Viraj Jasani <vjasani@apache.org> Change its behavior so it will only look in hbase:meta if the call to the super class turns up zero references. Only then will it search hbase:meta for references to 'Unknown Servers'. Normal operation where we read Master context is usual and sufficient. The scan of hbase:meta is only for case where Master state has been corrupted and we need to clear out 'Unknown Servers'.
1 parent 280b944 commit db7fb06

File tree

5 files changed

+116
-31
lines changed

5 files changed

+116
-31
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1389,7 +1389,7 @@ public static void removeRegionReplicasFromMeta(Set<byte[]> metaRows,
13891389
}
13901390
}
13911391

1392-
private static void addRegionStateToPut(Put put, RegionState.State state) throws IOException {
1392+
private static Put addRegionStateToPut(Put put, RegionState.State state) throws IOException {
13931393
put.add(CellBuilderFactory.create(CellBuilderType.SHALLOW_COPY)
13941394
.setRow(put.getRow())
13951395
.setFamily(HConstants.CATALOG_FAMILY)
@@ -1398,6 +1398,17 @@ private static void addRegionStateToPut(Put put, RegionState.State state) throws
13981398
.setType(Cell.Type.Put)
13991399
.setValue(Bytes.toBytes(state.name()))
14001400
.build());
1401+
return put;
1402+
}
1403+
1404+
/**
1405+
* Update state column in hbase:meta.
1406+
*/
1407+
public static void updateRegionState(Connection connection, RegionInfo ri,
1408+
RegionState.State state) throws IOException {
1409+
Put put = new Put(RegionReplicaUtil.getRegionInfoForDefaultReplica(ri).getRegionName());
1410+
MetaTableAccessor.putsToMetaTable(connection,
1411+
Collections.singletonList(addRegionStateToPut(put, state)));
14011412
}
14021413

14031414
/**

hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateStore.java

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,6 @@
5151
import org.slf4j.Logger;
5252
import org.slf4j.LoggerFactory;
5353

54-
import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesting;
5554
import org.apache.hbase.thirdparty.com.google.common.base.Preconditions;
5655

5756
/**
@@ -132,7 +131,7 @@ private void visitMetaEntry(final RegionStateVisitor visitor, final Result resul
132131
if (regionInfo == null) continue;
133132

134133
final int replicaId = regionInfo.getReplicaId();
135-
final State state = getRegionState(result, replicaId, regionInfo);
134+
final State state = getRegionState(result, regionInfo);
136135

137136
final ServerName lastHost = hrl.getServerName();
138137
final ServerName regionLocation = getRegionServer(result, replicaId);
@@ -343,12 +342,11 @@ private static byte[] getServerNameColumn(int replicaId) {
343342

344343
/**
345344
* Pull the region state from a catalog table {@link Result}.
346-
* @param r Result to pull the region state from
347345
* @return the region state, or null if unknown.
348346
*/
349-
@VisibleForTesting
350-
public static State getRegionState(final Result r, int replicaId, RegionInfo regionInfo) {
351-
Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY, getStateColumn(replicaId));
347+
public static State getRegionState(final Result r, RegionInfo regionInfo) {
348+
Cell cell = r.getColumnLatestCell(HConstants.CATALOG_FAMILY,
349+
getStateColumn(regionInfo.getReplicaId()));
352350
if (cell == null || cell.getValueLength() == 0) {
353351
return null;
354352
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/HBCKServerCrashProcedure.java

Lines changed: 98 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -20,21 +20,31 @@
2020
import java.io.IOException;
2121
import java.util.ArrayList;
2222
import java.util.List;
23+
import java.util.stream.Collectors;
2324

25+
import org.apache.hadoop.hbase.HRegionLocation;
2426
import org.apache.hadoop.hbase.MetaTableAccessor;
27+
import org.apache.hadoop.hbase.RegionLocations;
2528
import org.apache.hadoop.hbase.ServerName;
29+
import org.apache.hadoop.hbase.client.Connection;
2630
import org.apache.hadoop.hbase.client.RegionInfo;
27-
import org.apache.hadoop.hbase.util.Pair;
31+
import org.apache.hadoop.hbase.client.Result;
32+
import org.apache.hadoop.hbase.master.RegionState;
33+
import org.apache.hadoop.hbase.master.assignment.RegionStateStore;
2834
import org.apache.yetus.audience.InterfaceAudience;
2935
import org.slf4j.Logger;
3036
import org.slf4j.LoggerFactory;
3137

3238
/**
33-
* A SCP that differs from default only in how it gets the list of
34-
* Regions hosted on the crashed-server; it also reads hbase:meta directly rather
35-
* than rely solely on Master memory for list of Regions that were on crashed server.
36-
* This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
37-
* scheduleRecoveries). It is for the case where meta has references to 'Unknown Servers',
39+
* Acts like the super class in all cases except when no Regions found in the
40+
* current Master in-memory context. In this latter case, when the call to
41+
* super#getRegionsOnCrashedServer returns nothing, this SCP will scan
42+
* hbase:meta for references to the passed ServerName. If any found, we'll
43+
* clean them up.
44+
*
45+
* <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's
46+
* scheduleRecoveries); the super class is used during normal recovery operations.
47+
* It is for the case where meta has references to 'Unknown Servers',
3848
* servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master
3949
* and hbase:meta content have deviated. It should never happen in normal running
4050
* cluster but if we do drop accounting of servers, we need a means of fix-up.
@@ -65,31 +75,97 @@ public HBCKServerCrashProcedure(final MasterProcedureEnv env, final ServerName s
6575
public HBCKServerCrashProcedure() {}
6676

6777
/**
68-
* Adds Regions found by super method any found scanning hbase:meta.
78+
* If no Regions found in Master context, then we will search hbase:meta for references
79+
* to the passed server. Operator may have passed ServerName because they have found
80+
* references to 'Unknown Servers'. They are using HBCKSCP to clear them out.
6981
*/
7082
@Override
7183
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
7284
justification="FindBugs seems confused on ps in below.")
7385
List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
74-
// Super can return immutable emptyList.
86+
// Super will return an immutable list (empty if nothing on this server).
7587
List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
76-
List<Pair<RegionInfo, ServerName>> ps = null;
88+
if (!ris.isEmpty()) {
89+
return ris;
90+
}
91+
// Nothing in in-master context. Check for Unknown Server! in hbase:meta.
92+
// If super list is empty, then allow that an operator scheduled an SCP because they are trying
93+
// to purge 'Unknown Servers' -- servers that are neither online nor in dead servers
94+
// list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'.
95+
// This mis-accounting does not happen in normal circumstance but may arise in-extremis
96+
// when cluster has been damaged in operation.
97+
UnknownServerVisitor visitor =
98+
new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName());
7799
try {
78-
ps = MetaTableAccessor.getTableRegionsAndLocations(env.getMasterServices().getConnection(),
79-
null, false);
100+
MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(),
101+
visitor, null);
80102
} catch (IOException ioe) {
81-
LOG.warn("Failed get of all regions; continuing", ioe);
82-
}
83-
if (ps == null || ps.isEmpty()) {
84-
LOG.warn("No regions found in hbase:meta");
103+
LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe);
85104
return ris;
86105
}
87-
List<RegionInfo> aggregate = ris == null || ris.isEmpty()?
88-
new ArrayList<>(): new ArrayList<>(ris);
89-
int before = aggregate.size();
90-
ps.stream().filter(p -> p.getSecond() != null && p.getSecond().equals(getServerName())).
91-
forEach(p -> aggregate.add(p.getFirst()));
92-
LOG.info("Found {} mentions of {} in hbase:meta", aggregate.size() - before, getServerName());
93-
return aggregate;
106+
LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}",
107+
visitor.getReassigns().size(), getServerName(),
108+
visitor.getReassigns().stream().map(RegionInfo::getEncodedName).
109+
collect(Collectors.joining(",")));
110+
return visitor.getReassigns();
111+
}
112+
113+
/**
114+
* Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects
115+
* a List of Regions to reassign as 'result'.
116+
*/
117+
private static class UnknownServerVisitor implements MetaTableAccessor.Visitor {
118+
private final List<RegionInfo> reassigns = new ArrayList<>();
119+
private final ServerName unknownServerName;
120+
private final Connection connection;
121+
122+
private UnknownServerVisitor(Connection connection, ServerName unknownServerName) {
123+
this.connection = connection;
124+
this.unknownServerName = unknownServerName;
125+
}
126+
127+
@Override
128+
public boolean visit(Result result) throws IOException {
129+
RegionLocations rls = MetaTableAccessor.getRegionLocations(result);
130+
if (rls == null) {
131+
return true;
132+
}
133+
for (HRegionLocation hrl: rls.getRegionLocations()) {
134+
if (hrl == null) {
135+
continue;
136+
}
137+
if (hrl.getRegion() == null) {
138+
continue;
139+
}
140+
if (hrl.getServerName() == null) {
141+
continue;
142+
}
143+
if (!hrl.getServerName().equals(this.unknownServerName)) {
144+
continue;
145+
}
146+
RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion());
147+
RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName());
148+
if (rs.isClosing()) {
149+
// Move region to CLOSED in hbase:meta.
150+
LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta",
151+
hrl.getRegion().getRegionNameAsString());
152+
try {
153+
MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(),
154+
RegionState.State.CLOSED);
155+
} catch (IOException ioe) {
156+
LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe);
157+
}
158+
} else if (rs.isOpening() || rs.isOpened()) {
159+
this.reassigns.add(hrl.getRegion());
160+
} else {
161+
LOG.info("Passing {}", rs);
162+
}
163+
}
164+
return true;
165+
}
166+
167+
private List<RegionInfo> getReassigns() {
168+
return this.reassigns;
169+
}
94170
}
95171
}

hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseTestingUtility.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3605,8 +3605,7 @@ public boolean evaluate() throws IOException {
36053605
return false;
36063606
}
36073607
}
3608-
if (RegionStateStore.getRegionState(r,
3609-
info.getReplicaId(), info) != RegionState.State.OPEN) {
3608+
if (RegionStateStore.getRegionState(r, info) != RegionState.State.OPEN) {
36103609
return false;
36113610
}
36123611
}

hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestHBCKSCP.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,7 @@ public void test() throws Exception {
112112
master.getServerManager().moveFromOnlineToDeadServers(rsServerName);
113113
master.getServerManager().getDeadServers().finish(rsServerName);
114114
master.getServerManager().getDeadServers().removeDeadServer(rsServerName);
115+
master.getAssignmentManager().getRegionStates().removeServer(rsServerName);
115116
// Kill the server. Nothing should happen since an 'Unknown Server' as far
116117
// as the Master is concerned; i.e. no SCP.
117118
LOG.info("Killing {}", rsServerName);

0 commit comments

Comments
 (0)