|
20 | 20 | import java.io.IOException;
|
21 | 21 | import java.util.ArrayList;
|
22 | 22 | import java.util.List;
|
| 23 | +import java.util.stream.Collectors; |
23 | 24 |
|
| 25 | +import org.apache.hadoop.hbase.HRegionLocation; |
24 | 26 | import org.apache.hadoop.hbase.MetaTableAccessor;
|
| 27 | +import org.apache.hadoop.hbase.RegionLocations; |
25 | 28 | import org.apache.hadoop.hbase.ServerName;
|
| 29 | +import org.apache.hadoop.hbase.client.Connection; |
26 | 30 | import org.apache.hadoop.hbase.client.RegionInfo;
|
27 |
| -import org.apache.hadoop.hbase.util.Pair; |
| 31 | +import org.apache.hadoop.hbase.client.Result; |
| 32 | +import org.apache.hadoop.hbase.master.RegionState; |
| 33 | +import org.apache.hadoop.hbase.master.assignment.RegionStateStore; |
28 | 34 | import org.apache.yetus.audience.InterfaceAudience;
|
29 | 35 | import org.slf4j.Logger;
|
30 | 36 | import org.slf4j.LoggerFactory;
|
31 | 37 |
|
32 | 38 | /**
|
33 |
| - * A SCP that differs from default only in how it gets the list of |
34 |
| - * Regions hosted on the crashed-server; it also reads hbase:meta directly rather |
35 |
| - * than rely solely on Master memory for list of Regions that were on crashed server. |
36 |
| - * This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's |
37 |
| - * scheduleRecoveries). It is for the case where meta has references to 'Unknown Servers', |
| 39 | + * Acts like the super class in all cases except when no Regions found in the |
| 40 | + * current Master in-memory context. In this latter case, when the call to |
| 41 | + * super#getRegionsOnCrashedServer returns nothing, this SCP will scan |
| 42 | + * hbase:meta for references to the passed ServerName. If any found, we'll |
| 43 | + * clean them up. |
| 44 | + * |
| 45 | + * <p>This version of SCP is for external invocation as part of fix-up (e.g. HBCK2's |
| 46 | + * scheduleRecoveries); the super class is used during normal recovery operations. |
| 47 | + * It is for the case where meta has references to 'Unknown Servers', |
38 | 48 | * servers that are in hbase:meta but not in live-server or dead-server lists; i.e. Master
|
39 | 49 | * and hbase:meta content have deviated. It should never happen in normal running
|
40 | 50 | * cluster but if we do drop accounting of servers, we need a means of fix-up.
|
@@ -65,31 +75,97 @@ public HBCKServerCrashProcedure(final MasterProcedureEnv env, final ServerName s
|
65 | 75 | public HBCKServerCrashProcedure() {}
|
66 | 76 |
|
67 | 77 | /**
|
68 |
| - * Adds Regions found by super method any found scanning hbase:meta. |
| 78 | + * If no Regions found in Master context, then we will search hbase:meta for references |
| 79 | + * to the passed server. Operator may have passed ServerName because they have found |
| 80 | + * references to 'Unknown Servers'. They are using HBCKSCP to clear them out. |
69 | 81 | */
|
70 | 82 | @Override
|
71 | 83 | @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
|
72 | 84 | justification="FindBugs seems confused on ps in below.")
|
73 | 85 | List<RegionInfo> getRegionsOnCrashedServer(MasterProcedureEnv env) {
|
74 |
| - // Super can return immutable emptyList. |
| 86 | + // Super will return an immutable list (empty if nothing on this server). |
75 | 87 | List<RegionInfo> ris = super.getRegionsOnCrashedServer(env);
|
76 |
| - List<Pair<RegionInfo, ServerName>> ps = null; |
| 88 | + if (!ris.isEmpty()) { |
| 89 | + return ris; |
| 90 | + } |
| 91 | + // Nothing in in-master context. Check for Unknown Server! in hbase:meta. |
| 92 | + // If super list is empty, then allow that an operator scheduled an SCP because they are trying |
| 93 | + // to purge 'Unknown Servers' -- servers that are neither online nor in dead servers |
| 94 | + // list but that ARE in hbase:meta and so showing as unknown in places like 'HBCK Report'. |
| 95 | + // This mis-accounting does not happen in normal circumstance but may arise in-extremis |
| 96 | + // when cluster has been damaged in operation. |
| 97 | + UnknownServerVisitor visitor = |
| 98 | + new UnknownServerVisitor(env.getMasterServices().getConnection(), getServerName()); |
77 | 99 | try {
|
78 |
| - ps = MetaTableAccessor.getTableRegionsAndLocations(env.getMasterServices().getConnection(), |
79 |
| - null, false); |
| 100 | + MetaTableAccessor.scanMetaForTableRegions(env.getMasterServices().getConnection(), |
| 101 | + visitor, null); |
80 | 102 | } catch (IOException ioe) {
|
81 |
| - LOG.warn("Failed get of all regions; continuing", ioe); |
82 |
| - } |
83 |
| - if (ps == null || ps.isEmpty()) { |
84 |
| - LOG.warn("No regions found in hbase:meta"); |
| 103 | + LOG.warn("Failed scan of hbase:meta for 'Unknown Servers'", ioe); |
85 | 104 | return ris;
|
86 | 105 | }
|
87 |
| - List<RegionInfo> aggregate = ris == null || ris.isEmpty()? |
88 |
| - new ArrayList<>(): new ArrayList<>(ris); |
89 |
| - int before = aggregate.size(); |
90 |
| - ps.stream().filter(p -> p.getSecond() != null && p.getSecond().equals(getServerName())). |
91 |
| - forEach(p -> aggregate.add(p.getFirst())); |
92 |
| - LOG.info("Found {} mentions of {} in hbase:meta", aggregate.size() - before, getServerName()); |
93 |
| - return aggregate; |
| 106 | + LOG.info("Found {} mentions of {} in hbase:meta of OPEN/OPENING Regions: {}", |
| 107 | + visitor.getReassigns().size(), getServerName(), |
| 108 | + visitor.getReassigns().stream().map(RegionInfo::getEncodedName). |
| 109 | + collect(Collectors.joining(","))); |
| 110 | + return visitor.getReassigns(); |
| 111 | + } |
| 112 | + |
| 113 | + /** |
| 114 | + * Visitor for hbase:meta that 'fixes' Unknown Server issues. Collects |
| 115 | + * a List of Regions to reassign as 'result'. |
| 116 | + */ |
| 117 | + private static class UnknownServerVisitor implements MetaTableAccessor.Visitor { |
| 118 | + private final List<RegionInfo> reassigns = new ArrayList<>(); |
| 119 | + private final ServerName unknownServerName; |
| 120 | + private final Connection connection; |
| 121 | + |
| 122 | + private UnknownServerVisitor(Connection connection, ServerName unknownServerName) { |
| 123 | + this.connection = connection; |
| 124 | + this.unknownServerName = unknownServerName; |
| 125 | + } |
| 126 | + |
| 127 | + @Override |
| 128 | + public boolean visit(Result result) throws IOException { |
| 129 | + RegionLocations rls = MetaTableAccessor.getRegionLocations(result); |
| 130 | + if (rls == null) { |
| 131 | + return true; |
| 132 | + } |
| 133 | + for (HRegionLocation hrl: rls.getRegionLocations()) { |
| 134 | + if (hrl == null) { |
| 135 | + continue; |
| 136 | + } |
| 137 | + if (hrl.getRegion() == null) { |
| 138 | + continue; |
| 139 | + } |
| 140 | + if (hrl.getServerName() == null) { |
| 141 | + continue; |
| 142 | + } |
| 143 | + if (!hrl.getServerName().equals(this.unknownServerName)) { |
| 144 | + continue; |
| 145 | + } |
| 146 | + RegionState.State state = RegionStateStore.getRegionState(result, hrl.getRegion()); |
| 147 | + RegionState rs = new RegionState(hrl.getRegion(), state, hrl.getServerName()); |
| 148 | + if (rs.isClosing()) { |
| 149 | + // Move region to CLOSED in hbase:meta. |
| 150 | + LOG.info("Moving {} from CLOSING to CLOSED in hbase:meta", |
| 151 | + hrl.getRegion().getRegionNameAsString()); |
| 152 | + try { |
| 153 | + MetaTableAccessor.updateRegionState(this.connection, hrl.getRegion(), |
| 154 | + RegionState.State.CLOSED); |
| 155 | + } catch (IOException ioe) { |
| 156 | + LOG.warn("Failed moving {} from CLOSING to CLOSED", ioe); |
| 157 | + } |
| 158 | + } else if (rs.isOpening() || rs.isOpened()) { |
| 159 | + this.reassigns.add(hrl.getRegion()); |
| 160 | + } else { |
| 161 | + LOG.info("Passing {}", rs); |
| 162 | + } |
| 163 | + } |
| 164 | + return true; |
| 165 | + } |
| 166 | + |
| 167 | + private List<RegionInfo> getReassigns() { |
| 168 | + return this.reassigns; |
| 169 | + } |
94 | 170 | }
|
95 | 171 | }
|
0 commit comments