Commit 643dfd6

HDFS-15842. HDFS mover to emit metrics. (#2738)
1 parent 51991c4 commit 643dfd6

File tree: 6 files changed (+196, -2 lines)

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/Dispatcher.java

Lines changed: 1 addition & 0 deletions
@@ -398,6 +398,7 @@ private void dispatch() {
       LOG.info("Successfully moved " + this);
     } catch (IOException e) {
       LOG.warn("Failed to move " + this, e);
+      nnc.getBlocksFailed().incrementAndGet();
       target.getDDatanode().setHasFailure();
       // Check that the failure is due to block pinning errors.
       if (e instanceof BlockPinningException) {
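The new counter lives on the shared NameNodeConnector, so the AtomicLong increment is safe across concurrent dispatcher threads; the MoverMetrics source added later in this commit exposes it as the BlocksFailed gauge.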

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/balancer/NameNodeConnector.java

Lines changed: 7 additions & 2 deletions
@@ -163,6 +163,7 @@ public static void checkOtherInstanceRunning(boolean toCheck) {
   private final List<Path> targetPaths;
   private final AtomicLong bytesMoved = new AtomicLong();
   private final AtomicLong blocksMoved = new AtomicLong();
+  private final AtomicLong blocksFailed = new AtomicLong();

   private final int maxNotChangedIterations;
   private int notChangedIterations = 0;
@@ -230,14 +231,18 @@ public String getBlockpoolID() {
     return blockpoolID;
   }

-  AtomicLong getBytesMoved() {
+  public AtomicLong getBytesMoved() {
     return bytesMoved;
   }

-  AtomicLong getBlocksMoved() {
+  public AtomicLong getBlocksMoved() {
     return blocksMoved;
   }

+  public AtomicLong getBlocksFailed() {
+    return blocksFailed;
+  }
+
   public void addBytesMoved(long numBytes) {
     bytesMoved.addAndGet(numBytes);
     blocksMoved.incrementAndGet();
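The two existing getters are widened from package-private to public (alongside the new getBlocksFailed) because the MoverMetrics source introduced below lives in org.apache.hadoop.hdfs.server.mover, outside NameNodeConnector's org.apache.hadoop.hdfs.server.balancer package.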

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/Mover.java

Lines changed: 21 additions & 0 deletions
@@ -42,6 +42,8 @@
 import org.apache.hadoop.hdfs.server.protocol.StorageReport;
 import org.apache.hadoop.io.IOUtils;
 import org.apache.hadoop.hdfs.protocol.ErasureCodingPolicy;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.source.JvmMetrics;
 import org.apache.hadoop.net.NetUtils;
 import org.apache.hadoop.net.NetworkTopology;
 import org.apache.hadoop.security.SecurityUtil;
@@ -118,6 +120,8 @@ private List<StorageGroup> getTargetStorages(StorageType t) {
   private final int retryMaxAttempts;
   private final AtomicInteger retryCount;
   private final Map<Long, Set<DatanodeInfo>> excludedPinnedBlocks;
+  private final MoverMetrics metrics;
+  private final NameNodeConnector nnc;

   private final BlockStoragePolicy[] blockStoragePolicies;

@@ -155,6 +159,8 @@ Collections.<String> emptySet(), movedWinWidth, moverThreads, 0,
     this.blockStoragePolicies = new BlockStoragePolicy[1 <<
         BlockStoragePolicySuite.ID_BIT_LENGTH];
     this.excludedPinnedBlocks = excludedPinnedBlocks;
+    this.nnc = nnc;
+    this.metrics = MoverMetrics.create(this);
   }

   void init() throws IOException {
@@ -196,6 +202,10 @@ private ExitStatus run() {
     }
   }

+  public NameNodeConnector getNnc() {
+    return nnc;
+  }
+
   DBlock newDBlock(LocatedBlock lb, List<MLocation> locations,
       ErasureCodingPolicy ecPolicy) {
     Block blk = lb.getBlock().getLocalBlock();
@@ -296,6 +306,7 @@ private boolean isSnapshotPathInCurrent(String path) throws IOException {
    * round
    */
   private Result processNamespace() throws IOException {
+    metrics.setProcessingNamespace(true);
     getSnapshottableDirs();
     Result result = new Result();
     for (Path target : targetPaths) {
@@ -322,6 +333,7 @@ private Result processNamespace() throws IOException {
       retryCount.set(0);
     }
     result.updateHasRemaining(hasFailed);
+    metrics.setProcessingNamespace(false);
     return result;
   }

@@ -374,6 +386,7 @@ private void processRecursively(String parent, HdfsFileStatus status,
         // the full path is a snapshot path but it is also included in the
         // current directory tree, thus ignore it.
         processFile(fullPath, (HdfsLocatedFileStatus) status, result);
+        metrics.incrFilesProcessed();
       }
     } catch (IOException e) {
       LOG.warn("Failed to check the status of " + parent
@@ -521,6 +534,7 @@ boolean chooseTargetInSameNode(DBlock db, Source source,
       final PendingMove pm = source.addPendingMove(db, target);
       if (pm != null) {
         dispatcher.executePendingMove(pm);
+        metrics.incrBlocksScheduled();
         return true;
       }
     }
@@ -539,6 +553,7 @@ boolean chooseTarget(DBlock db, Source source,
       final PendingMove pm = source.addPendingMove(db, target);
       if (pm != null) {
         dispatcher.executePendingMove(pm);
+        metrics.incrBlocksScheduled();
         return true;
       }
     }
@@ -650,6 +665,11 @@ static int run(Map<URI, List<Path>> namenodes, Configuration conf)
     Map<Long, Set<DatanodeInfo>> excludedPinnedBlocks = new HashMap<>();
     LOG.info("namenodes = " + namenodes);

+    DefaultMetricsSystem.initialize("Mover");
+    JvmMetrics.create("Mover",
+        conf.get(DFSConfigKeys.DFS_METRICS_SESSION_ID_KEY),
+        DefaultMetricsSystem.instance());
+
     checkKeytabAndInit(conf);
     List<NameNodeConnector> connectors = Collections.emptyList();
     try {
@@ -818,6 +838,7 @@ public int run(String[] args) throws Exception {
       System.out.println(e + ". Exiting ...");
       return ExitStatus.ILLEGAL_ARGUMENTS.getExitCode();
     } finally {
+      DefaultMetricsSystem.shutdown();
       System.out.format("%-24s ", DateFormat.getDateTimeInstance().format(new Date()));
       System.out.println("Mover took " + StringUtils.formatTime(Time.monotonicNow()-startTime));
     }
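Because run() now initializes the default metrics system under the "Mover" prefix, the source registered by MoverMetrics.create (shown below) should also be reachable through standard metrics2 channels such as JMX. The following is a minimal, hedged sketch of polling the new metrics from outside the process, not part of this commit: it assumes the Mover JVM was started with remote JMX enabled on port 9999 and that metrics2 registers the source as Hadoop:service=Mover,name=Mover-<blockpoolID>, mirroring other Hadoop daemons; the blockpool ID shown is a placeholder.

import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

public class MoverMetricsProbe {
  public static void main(String[] args) throws Exception {
    // Assumes -Dcom.sun.management.jmxremote.port=9999 on the Mover JVM.
    JMXServiceURL url = new JMXServiceURL(
        "service:jmx:rmi:///jndi/rmi://localhost:9999/jmxrmi");
    try (JMXConnector jmxc = JMXConnectorFactory.connect(url)) {
      MBeanServerConnection mbsc = jmxc.getMBeanServerConnection();
      // Placeholder blockpool ID; substitute the real one from the NameNode.
      ObjectName mover = new ObjectName(
          "Hadoop:service=Mover,name=Mover-BP-0000000000-127.0.0.1-0000000000000");
      System.out.println("BytesMoved   = " + mbsc.getAttribute(mover, "BytesMoved"));
      System.out.println("BlocksMoved  = " + mbsc.getAttribute(mover, "BlocksMoved"));
      System.out.println("BlocksFailed = " + mbsc.getAttribute(mover, "BlocksFailed"));
    }
  }
}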
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/MoverMetrics.java (new file)

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.mover;
+
+import org.apache.hadoop.metrics2.annotation.Metric;
+import org.apache.hadoop.metrics2.annotation.Metrics;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
+import org.apache.hadoop.metrics2.lib.MutableCounterLong;
+import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
+
+/**
+ * Metrics for the HDFS Mover of a blockpool.
+ */
+@Metrics(about="Mover metrics", context="dfs")
+final class MoverMetrics {
+
+  private final Mover mover;
+
+  @Metric("Whether the mover is processing the namespace.")
+  private MutableGaugeInt processingNamespace;
+
+  @Metric("Number of blocks scheduled for moving.")
+  private MutableCounterLong blocksScheduled;
+
+  @Metric("Number of files processed.")
+  private MutableCounterLong filesProcessed;
+
+  private MoverMetrics(Mover m) {
+    this.mover = m;
+  }
+
+  public static MoverMetrics create(Mover mover) {
+    MoverMetrics m = new MoverMetrics(mover);
+    return DefaultMetricsSystem.instance().register(
+        m.getName(), null, m);
+  }
+
+  String getName() {
+    return "Mover-" + mover.getNnc().getBlockpoolID();
+  }
+
+  @Metric("Bytes already moved by the mover.")
+  public long getBytesMoved() {
+    return mover.getNnc().getBytesMoved().get();
+  }
+
+  @Metric("Number of blocks successfully moved by the mover.")
+  public long getBlocksMoved() {
+    return mover.getNnc().getBlocksMoved().get();
+  }
+
+  @Metric("Number of blocks that failed to move.")
+  public long getBlocksFailed() {
+    return mover.getNnc().getBlocksFailed().get();
+  }
+
+  void setProcessingNamespace(boolean processingNamespace) {
+    this.processingNamespace.set(processingNamespace ? 1 : 0);
+  }
+
+  void incrBlocksScheduled() {
+    this.blocksScheduled.incr();
+  }
+
+  void incrFilesProcessed() {
+    this.filesProcessed.incr();
+  }
+}
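Note the two styles this source combines: the annotated MutableCounterLong/MutableGaugeInt fields are push-style metrics the mover updates as it runs, while the @Metric-annotated getters are pull-style gauges evaluated against the NameNodeConnector counters only when a snapshot is taken. Below is a minimal illustrative sketch of the same pattern; all names are hypothetical, not taken from this commit.

import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.metrics2.annotation.Metric;
import org.apache.hadoop.metrics2.annotation.Metrics;
import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
import org.apache.hadoop.metrics2.lib.MutableCounterLong;

@Metrics(about = "Example source", context = "dfs")
final class ExampleSource {
  // Shared state owned elsewhere, e.g. by worker threads.
  private final AtomicLong external = new AtomicLong();

  // Push style: the framework instantiates this field on register(),
  // and the application increments it explicitly.
  @Metric("Work items handled.")
  private MutableCounterLong itemsHandled;

  // Pull style: read from shared state only when a snapshot is taken.
  @Metric("Externally tracked value.")
  public long getExternalValue() {
    return external.get();
  }

  static ExampleSource create() {
    return DefaultMetricsSystem.instance()
        .register("ExampleSource", null, new ExampleSource());
  }

  void onItemHandled() {
    itemsHandled.incr();
  }
}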
hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/mover/package-info.java (new file)

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Mover is a data migration tool for tiered storage.
+ * It scans provided paths in HDFS to check
+ * if the block placement satisfies the storage policy.
+ * For the blocks violating the storage policy,
+ * it moves the replicas to a different storage type
+ * in order to fulfill the storage policy requirement.
+ */
+package org.apache.hadoop.hdfs.server.mover;

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/mover/TestMover.java

Lines changed: 57 additions & 0 deletions
@@ -36,6 +36,8 @@
 import static org.apache.hadoop.hdfs.client.HdfsClientConfigKeys.DFS_DATA_TRANSFER_PROTECTION_KEY;
 import static org.apache.hadoop.test.MetricsAsserts.assertCounter;
 import static org.apache.hadoop.test.MetricsAsserts.getMetrics;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;

 import java.io.File;
 import java.io.IOException;
@@ -86,12 +88,15 @@
 import org.apache.hadoop.hdfs.server.namenode.ha.HATestUtil;
 import org.apache.hadoop.http.HttpConfig;
 import org.apache.hadoop.metrics2.MetricsRecordBuilder;
+import org.apache.hadoop.metrics2.MetricsSource;
+import org.apache.hadoop.metrics2.lib.DefaultMetricsSystem;
 import org.apache.hadoop.minikdc.MiniKdc;
 import org.apache.hadoop.security.SecurityUtil;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.security.authentication.util.KerberosName;
 import org.apache.hadoop.security.ssl.KeyStoreTestUtil;
 import org.apache.hadoop.test.GenericTestUtils;
+import org.apache.hadoop.test.MetricsAsserts;
 import org.apache.hadoop.util.ToolRunner;
 import org.junit.Assert;
 import org.junit.Test;
@@ -1235,6 +1240,58 @@ public void testMoverWhenStoragePolicyUnset() throws Exception {
     }
   }

+  @Test(timeout=100000)
+  public void testMoverMetrics() throws Exception {
+    long blockSize = 10*1024*1024;
+    final Configuration conf = new HdfsConfiguration();
+    initConf(conf);
+    conf.setInt(DFSConfigKeys.DFS_MOVER_MOVERTHREADS_KEY, 1);
+    conf.setLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY, blockSize);
+    conf.setLong(DFSConfigKeys.DFS_BYTES_PER_CHECKSUM_KEY, blockSize);
+
+    final MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+        .numDataNodes(2)
+        .storageTypes(
+            new StorageType[][] {{StorageType.DISK, StorageType.DISK},
+                {StorageType.ARCHIVE, StorageType.ARCHIVE}})
+        .build();
+
+    cluster.waitActive();
+    final DistributedFileSystem fs = cluster.getFileSystem();
+
+    final String file = "/testMaxIterationTime.dat";
+    final Path path = new Path(file);
+    short repFactor = 1;
+    int seed = 0xFAFAFA;
+    // write to DISK
+    DFSTestUtil.createFile(fs, path, 4L * blockSize, repFactor, seed);
+
+    // move to ARCHIVE
+    fs.setStoragePolicy(new Path(file), "COLD");
+
+    Map<URI, List<Path>> nnWithPath = new HashMap<>();
+    List<Path> paths = new ArrayList<>();
+    paths.add(path);
+    nnWithPath
+        .put(DFSUtil.getInternalNsRpcUris(conf).iterator().next(), paths);
+
+    Mover.run(nnWithPath, conf);
+
+    final String moverMetricsName = "Mover-"
+        + cluster.getNameNode(0).getNamesystem().getBlockPoolId();
+    MetricsSource moverMetrics =
+        DefaultMetricsSystem.instance().getSource(moverMetricsName);
+    assertNotNull(moverMetrics);
+
+    MetricsRecordBuilder rb = MetricsAsserts.getMetrics(moverMetricsName);
+    // Check metrics
+    assertEquals(4, MetricsAsserts.getLongCounter("BlocksScheduled", rb));
+    assertEquals(1, MetricsAsserts.getLongCounter("FilesProcessed", rb));
+    assertEquals(41943040, MetricsAsserts.getLongGauge("BytesMoved", rb));
+    assertEquals(4, MetricsAsserts.getLongGauge("BlocksMoved", rb));
+    assertEquals(0, MetricsAsserts.getLongGauge("BlocksFailed", rb));
+  }
+
   private void createFileWithFavoredDatanodes(final Configuration conf,
       final MiniDFSCluster cluster, final DistributedFileSystem dfs)
       throws IOException {
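The expected values follow from the test setup: a single file of 4L * blockSize bytes is written with a 10 MiB block size, so the mover schedules and moves its four blocks (BlocksScheduled = BlocksMoved = 4, FilesProcessed = 1), BytesMoved is 4 × 10 × 1024 × 1024 = 41,943,040 bytes, and BlocksFailed stays 0.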
