Skip to content

Commit ecf3deb

Browse files
authored
HBASE-26913 Replication Observability Framework (#4862)
Signed-off-by: Duo Zhang <zhangduo@apache.org> Signed-off-by: Viraj Jasani <vjasani@apache.org>
1 parent ea4ccf0 commit ecf3deb

File tree

39 files changed

+1971
-82
lines changed

39 files changed

+1971
-82
lines changed

hbase-client/src/main/java/org/apache/hadoop/hbase/slowlog/SlowLogTableAccessor.java

Lines changed: 2 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,10 @@
2121
import java.util.ArrayList;
2222
import java.util.List;
2323
import java.util.concurrent.ThreadLocalRandom;
24-
import org.apache.hadoop.conf.Configuration;
2524
import org.apache.hadoop.hbase.HConstants;
2625
import org.apache.hadoop.hbase.NamespaceDescriptor;
2726
import org.apache.hadoop.hbase.TableName;
2827
import org.apache.hadoop.hbase.client.Connection;
29-
import org.apache.hadoop.hbase.client.ConnectionFactory;
3028
import org.apache.hadoop.hbase.client.Durability;
3129
import org.apache.hadoop.hbase.client.Put;
3230
import org.apache.hadoop.hbase.client.Table;
@@ -48,8 +46,6 @@ public class SlowLogTableAccessor {
4846

4947
private static final Logger LOG = LoggerFactory.getLogger(SlowLogTableAccessor.class);
5048

51-
private static Connection connection;
52-
5349
/**
5450
* hbase:slowlog table name - can be enabled with config -
5551
* hbase.regionserver.slowlog.systable.enabled
@@ -66,10 +62,10 @@ private static void doPut(final Connection connection, final List<Put> puts) thr
6662
/**
6763
* Add slow/large log records to hbase:slowlog table
6864
* @param slowLogPayloads List of SlowLogPayload to process
69-
* @param configuration Configuration to use for connection
65+
* @param connection Connection used to write the records to the hbase:slowlog table
7066
*/
7167
public static void addSlowLogRecords(final List<TooSlowLog.SlowLogPayload> slowLogPayloads,
72-
final Configuration configuration) {
68+
Connection connection) {
7369
List<Put> puts = new ArrayList<>(slowLogPayloads.size());
7470
for (TooSlowLog.SlowLogPayload slowLogPayload : slowLogPayloads) {
7571
final byte[] rowKey = getRowKey(slowLogPayload);
@@ -102,26 +98,12 @@ public static void addSlowLogRecords(final List<TooSlowLog.SlowLogPayload> slowL
10298
puts.add(put);
10399
}
104100
try {
105-
if (connection == null) {
106-
createConnection(configuration);
107-
}
108101
doPut(connection, puts);
109102
} catch (Exception e) {
110103
LOG.warn("Failed to add slow/large log records to hbase:slowlog table.", e);
111104
}
112105
}
113106

114-
private static synchronized void createConnection(Configuration configuration)
115-
throws IOException {
116-
Configuration conf = new Configuration(configuration);
117-
// rpc timeout: 20s
118-
conf.setInt(HConstants.HBASE_RPC_TIMEOUT_KEY, 20000);
119-
// retry count: 5
120-
conf.setInt(HConstants.HBASE_CLIENT_RETRIES_NUMBER, 5);
121-
conf.setInt(HConstants.HBASE_CLIENT_SERVERSIDE_RETRIES_MULTIPLIER, 1);
122-
connection = ConnectionFactory.createConnection(conf);
123-
}
124-
125107
/**
126108
* Create rowKey: currentTime APPEND slowLogPayload.hashcode. Scan on slowlog table should keep
127109
* records with sorted order of time, however records added at the very same time could be in
@@ -140,5 +122,4 @@ private static byte[] getRowKey(final TooSlowLog.SlowLogPayload slowLogPayload)
140122
final long rowKeyLong = Long.parseLong(timeAndHashcode);
141123
return Bytes.toBytes(rowKeyLong);
142124
}
143-
144125
}

hbase-common/src/main/java/org/apache/hadoop/hbase/HConstants.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1643,6 +1643,14 @@ public enum OperationStatusCode {
16431643
"hbase.regionserver.slowlog.systable.enabled";
16441644
public static final boolean DEFAULT_SLOW_LOG_SYS_TABLE_ENABLED_KEY = false;
16451645

1646+
@Deprecated
1647+
// TODO: record the version in which this was deprecated and the version in which it will be removed.
1648+
// Use the hbase.regionserver.named.queue.chore.duration config property instead.
1649+
public static final String SLOW_LOG_SYS_TABLE_CHORE_DURATION_KEY =
1650+
"hbase.slowlog.systable.chore.duration";
1651+
// Default 10 mins.
1652+
public static final int DEFAULT_SLOW_LOG_SYS_TABLE_CHORE_DURATION = 10 * 60 * 1000;
1653+
16461654
public static final String SHELL_TIMESTAMP_FORMAT_EPOCH_KEY =
16471655
"hbase.shell.timestamp.format.epoch";
16481656

hbase-common/src/main/resources/hbase-default.xml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2004,7 +2004,7 @@ possible configurations would overwhelm and obscure the important.
20042004
</property>
20052005
<property>
20062006
<name>hbase.namedqueue.provider.classes</name>
2007-
<value>org.apache.hadoop.hbase.namequeues.impl.SlowLogQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerDecisionQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerRejectionQueueService</value>
2007+
<value>org.apache.hadoop.hbase.namequeues.impl.SlowLogQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerDecisionQueueService,org.apache.hadoop.hbase.namequeues.impl.BalancerRejectionQueueService,org.apache.hadoop.hbase.namequeues.WALEventTrackerQueueService</value>
20082008
<description>
20092009
Default values for NamedQueueService implementors. This comma separated full class names
20102010
represent all implementors of NamedQueueService that we would like to be invoked by
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.namequeues;
19+
20+
import org.apache.hadoop.hbase.metrics.BaseSource;
21+
import org.apache.yetus.audience.InterfaceAudience;
22+
23+
@InterfaceAudience.Private
24+
public interface MetricsWALEventTrackerSource extends BaseSource {
25+
/**
26+
* The name of the metrics
27+
*/
28+
String METRICS_NAME = "WALEventTracker";
29+
30+
/**
31+
* The name of the metrics context that metrics will be under.
32+
*/
33+
String METRICS_CONTEXT = "regionserver";
34+
35+
/**
36+
* Description of the metrics source
37+
*/
38+
String METRICS_DESCRIPTION = "Metrics about HBase RegionServer WALEventTracker";
39+
40+
/**
41+
* The name of the metrics context that metrics will be under in jmx
42+
*/
43+
String METRICS_JMX_CONTEXT = "RegionServer,sub=" + METRICS_NAME;
44+
45+
String NUM_FAILED_PUTS = "numFailedPuts";
46+
String NUM_FAILED_PUTS_DESC = "Number of put requests that failed";
47+
48+
String NUM_RECORDS_FAILED_PUTS = "numRecordsFailedPuts";
49+
String NUM_RECORDS_FAILED_PUTS_DESC = "number of records in failed puts";
50+
51+
/*
52+
* Increment 2 counters, numFailedPuts and numRecordsFailedPuts
53+
*/
54+
void incrFailedPuts(long numRecords);
55+
56+
/*
57+
* Get the failed puts counter.
58+
*/
59+
long getFailedPuts();
60+
61+
/*
62+
* Get the number of records in failed puts.
63+
*/
64+
long getNumRecordsFailedPuts();
65+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Licensed to the Apache Software Foundation (ASF) under one
2+
# or more contributor license agreements. See the NOTICE file
3+
# distributed with this work for additional information
4+
# regarding copyright ownership. The ASF licenses this file
5+
# to you under the Apache License, Version 2.0 (the
6+
# "License"); you may not use this file except in compliance
7+
# with the License. You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing,
12+
# software distributed under the License is distributed on an
13+
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
# KIND, either express or implied. See the License for the
15+
# specific language governing permissions and limitations
16+
# under the License.
17+
#
18+
org.apache.hadoop.hbase.namequeues.MetricsWALEventTrackerSourceImpl
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.namequeues;
19+
20+
import org.apache.hadoop.hbase.metrics.BaseSourceImpl;
21+
import org.apache.hadoop.metrics2.lib.MutableFastCounter;
22+
import org.apache.yetus.audience.InterfaceAudience;
23+
24+
@InterfaceAudience.Private
25+
public class MetricsWALEventTrackerSourceImpl extends BaseSourceImpl
26+
implements MetricsWALEventTrackerSource {
27+
28+
private final MutableFastCounter numFailedPutsCount;
29+
private final MutableFastCounter numRecordsFailedPutsCount;
30+
31+
public MetricsWALEventTrackerSourceImpl() {
32+
this(METRICS_NAME, METRICS_DESCRIPTION, METRICS_CONTEXT, METRICS_JMX_CONTEXT);
33+
}
34+
35+
public MetricsWALEventTrackerSourceImpl(String metricsName, String metricsDescription,
36+
String metricsContext, String metricsJmxContext) {
37+
super(metricsName, metricsDescription, metricsContext, metricsJmxContext);
38+
numFailedPutsCount =
39+
this.getMetricsRegistry().newCounter(NUM_FAILED_PUTS, NUM_FAILED_PUTS_DESC, 0L);
40+
numRecordsFailedPutsCount = this.getMetricsRegistry().newCounter(NUM_RECORDS_FAILED_PUTS,
41+
NUM_RECORDS_FAILED_PUTS_DESC, 0L);
42+
}
43+
44+
@Override
45+
public void incrFailedPuts(long numRecords) {
46+
numFailedPutsCount.incr();
47+
numRecordsFailedPutsCount.incr(numRecords);
48+
}
49+
50+
@Override
51+
public long getFailedPuts() {
52+
return numFailedPutsCount.value();
53+
}
54+
55+
@Override
56+
public long getNumRecordsFailedPuts() {
57+
return numRecordsFailedPutsCount.value();
58+
}
59+
}

hbase-protocol-shaded/src/main/protobuf/WAL.proto

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,3 +182,12 @@ message RegionEventDescriptor {
182182
*/
183183
message WALTrailer {
184184
}
185+
186+
/**
187+
* Special WAL entry for replication marker event.
188+
*/
189+
message ReplicationMarkerDescriptor {
190+
required string region_server_name = 1;
191+
required string wal_name = 2;
192+
required uint64 offset = 3;
193+
}

hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@
172172
import org.apache.hadoop.hbase.master.replication.UpdatePeerConfigProcedure;
173173
import org.apache.hadoop.hbase.master.slowlog.SlowLogMasterService;
174174
import org.apache.hadoop.hbase.master.snapshot.SnapshotManager;
175+
import org.apache.hadoop.hbase.master.waleventtracker.WALEventTrackerTableCreator;
175176
import org.apache.hadoop.hbase.master.zksyncer.MasterAddressSyncer;
176177
import org.apache.hadoop.hbase.master.zksyncer.MetaLocationSyncer;
177178
import org.apache.hadoop.hbase.mob.MobFileCleanerChore;
@@ -214,6 +215,7 @@
214215
import org.apache.hadoop.hbase.replication.master.ReplicationHFileCleaner;
215216
import org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner;
216217
import org.apache.hadoop.hbase.replication.master.ReplicationPeerConfigUpgrader;
218+
import org.apache.hadoop.hbase.replication.master.ReplicationSinkTrackerTableCreator;
217219
import org.apache.hadoop.hbase.replication.regionserver.ReplicationStatus;
218220
import org.apache.hadoop.hbase.security.AccessDeniedException;
219221
import org.apache.hadoop.hbase.security.SecurityConstants;
@@ -1243,6 +1245,10 @@ private void finishActiveMasterInitialization(MonitoredTask status)
12431245
final SlowLogMasterService slowLogMasterService = new SlowLogMasterService(conf, this);
12441246
slowLogMasterService.init();
12451247

1248+
WALEventTrackerTableCreator.createIfNeededAndNotExists(conf, this);
1249+
// Create REPLICATION.SINK_TRACKER table if needed.
1250+
ReplicationSinkTrackerTableCreator.createIfNeededAndNotExists(conf, this);
1251+
12461252
// clear the dead servers with same host name and port of online server because we are not
12471253
// removing dead server with same hostname and port of rs which is trying to check in before
12481254
// master initialization. See HBASE-5916.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
package org.apache.hadoop.hbase.master.waleventtracker;
19+
20+
import static org.apache.hadoop.hbase.HConstants.NO_NONCE;
21+
import static org.apache.hadoop.hbase.namequeues.WALEventTrackerTableAccessor.WAL_EVENT_TRACKER_TABLE_NAME_STR;
22+
23+
import java.io.IOException;
24+
import java.util.concurrent.TimeUnit;
25+
import org.apache.hadoop.conf.Configuration;
26+
import org.apache.hadoop.hbase.HConstants;
27+
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
28+
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
29+
import org.apache.hadoop.hbase.master.MasterServices;
30+
import org.apache.hadoop.hbase.namequeues.WALEventTrackerTableAccessor;
31+
import org.apache.hadoop.hbase.util.Bytes;
32+
import org.apache.yetus.audience.InterfaceAudience;
33+
import org.slf4j.Logger;
34+
import org.slf4j.LoggerFactory;
35+
36+
/**
37+
* WALEventTracker Table creation to be used by HMaster
38+
*/
39+
@InterfaceAudience.Private
40+
public final class WALEventTrackerTableCreator {
41+
private static final Logger LOG = LoggerFactory.getLogger(WALEventTrackerTableCreator.class);
42+
43+
public static final String WAL_EVENT_TRACKER_ENABLED_KEY =
44+
"hbase.regionserver.wal.event.tracker.enabled";
45+
public static final boolean WAL_EVENT_TRACKER_ENABLED_DEFAULT = false;
46+
47+
/** The walEventTracker info family as a string */
48+
private static final String WAL_EVENT_TRACKER_INFO_FAMILY_STR = "info";
49+
50+
/** The walEventTracker info family in array of bytes */
51+
public static final byte[] WAL_EVENT_TRACKER_INFO_FAMILY =
52+
Bytes.toBytes(WAL_EVENT_TRACKER_INFO_FAMILY_STR);
53+
54+
private static final long TTL = TimeUnit.DAYS.toSeconds(365); // 1 year in seconds
55+
56+
private static final TableDescriptorBuilder TABLE_DESCRIPTOR_BUILDER = TableDescriptorBuilder
57+
.newBuilder(WALEventTrackerTableAccessor.WAL_EVENT_TRACKER_TABLE_NAME).setRegionReplication(1)
58+
.setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(WAL_EVENT_TRACKER_INFO_FAMILY)
59+
.setScope(HConstants.REPLICATION_SCOPE_LOCAL).setBlockCacheEnabled(false).setMaxVersions(1)
60+
.setTimeToLive((int) TTL).build());
61+
62+
/* Private default constructor */
63+
private WALEventTrackerTableCreator() {
64+
}
65+
66+
/*
67+
* We will create this table only if hbase.regionserver.wal.event.tracker.enabled is enabled and
68+
* table doesn't exist already.
69+
*/
70+
public static void createIfNeededAndNotExists(Configuration conf, MasterServices masterServices)
71+
throws IOException {
72+
boolean walEventTrackerEnabled =
73+
conf.getBoolean(WAL_EVENT_TRACKER_ENABLED_KEY, WAL_EVENT_TRACKER_ENABLED_DEFAULT);
74+
if (!walEventTrackerEnabled) {
75+
LOG.info("wal event tracker requests logging to table " + WAL_EVENT_TRACKER_TABLE_NAME_STR
76+
+ " is disabled. Quitting.");
77+
return;
78+
}
79+
if (
80+
!masterServices.getTableDescriptors()
81+
.exists(WALEventTrackerTableAccessor.WAL_EVENT_TRACKER_TABLE_NAME)
82+
) {
83+
LOG.info(WAL_EVENT_TRACKER_TABLE_NAME_STR + " table not found. Creating.");
84+
masterServices.createTable(TABLE_DESCRIPTOR_BUILDER.build(), null, 0L, NO_NONCE);
85+
}
86+
}
87+
}

hbase-server/src/main/java/org/apache/hadoop/hbase/namequeues/LogEventHandler.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import java.util.HashMap;
2424
import java.util.Map;
2525
import org.apache.hadoop.conf.Configuration;
26+
import org.apache.hadoop.hbase.client.Connection;
2627
import org.apache.hadoop.hbase.namequeues.request.NamedQueueGetRequest;
2728
import org.apache.hadoop.hbase.namequeues.response.NamedQueueGetResponse;
2829
import org.apache.yetus.audience.InterfaceAudience;
@@ -70,7 +71,8 @@ class LogEventHandler implements EventHandler<RingBufferEnvelope> {
7071
namedQueueServices.put(namedQueueService.getEvent(), namedQueueService);
7172
} catch (InstantiationException | IllegalAccessException | NoSuchMethodException
7273
| InvocationTargetException e) {
73-
LOG.warn("Unable to instantiate/add NamedQueueService implementor {} to service map.", clz);
74+
LOG.warn("Unable to instantiate/add NamedQueueService implementor {} to service map.", clz,
75+
e);
7476
}
7577
}
7678
}
@@ -105,8 +107,8 @@ boolean clearNamedQueue(NamedQueuePayload.NamedQueueEvent namedQueueEvent) {
105107
* Add all in memory queue records to system table. The implementors can use system table or
106108
* direct HDFS file or ZK as persistence system.
107109
*/
108-
void persistAll(NamedQueuePayload.NamedQueueEvent namedQueueEvent) {
109-
namedQueueServices.get(namedQueueEvent).persistAll();
110+
void persistAll(NamedQueuePayload.NamedQueueEvent namedQueueEvent, Connection connection) {
111+
namedQueueServices.get(namedQueueEvent).persistAll(connection);
110112
}
111113

112114
/**

0 commit comments

Comments
 (0)