Skip to content

Commit ced25f8

Browse files
committed
HDFS-16315. Add metrics related to Transfer and NativeCopy to DataNode
1 parent db89a94 commit ced25f8

File tree

5 files changed

+145
-1
lines changed

5 files changed

+145
-1
lines changed

hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -510,6 +510,12 @@ contains tags such as Hostname as additional information along with metrics.
510510
| `WriteIoRateNumOps` | The number of file write io operations within an interval time of metric |
511511
| `WriteIoRateAvgTime` | Mean time of file write io operations in milliseconds |
512512
| `WriteIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file write io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
513+
| `TransferIoRateNumOps` | The number of file transfer io operations within an interval time of metric |
514+
| `TransferIoRateAvgTime` | Mean time of file transfer io operations in milliseconds |
515+
| `TransferIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file transfer io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
516+
| `NativeCopyIoRateNumOps` | The number of file nativeCopy io operations within an interval time of metric |
517+
| `NativeCopyIoRateAvgTime` | Mean time of file nativeCopy io operations in milliseconds |
518+
| `NativeCopyIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file nativeCopy io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default, by watching no intervals. The intervals are specified by `dfs.metrics.percentiles.intervals`. |
513519
| `TotalFileIoErrors` | Total number (monotonically increasing) of file io error operations |
514520
| `FileIoErrorRateNumOps` | The number of file io error operations within an interval time of metric |
515521
| `FileIoErrorRateAvgTime` | It measures the mean time in milliseconds from the start of an operation to hitting a failure |

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ProfilingFileIoEvents.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ public void afterFileIo(@Nullable FsVolumeSpi volume,
116116
case WRITE:
117117
metrics.addWriteIoLatency(latency);
118118
break;
119+
case TRANSFER:
120+
metrics.addTransferIoLatency(latency);
121+
break;
122+
case NATIVE_COPY:
123+
metrics.addNativeCopyIoLatency(latency);
124+
break;
119125
default:
120126
}
121127
}

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/DataNodeVolumeMetrics.java

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ public class DataNodeVolumeMetrics {
7171
private MutableRate writeIoRate;
7272
private MutableQuantiles[] writeIoLatencyQuantiles;
7373

74+
@Metric("file io transfer rate")
75+
private MutableRate transferIoRate;
76+
private MutableQuantiles[] transferIoLatencyQuantiles;
77+
78+
@Metric("file io nativeCopy rate")
79+
private MutableRate nativeCopyIoRate;
80+
private MutableQuantiles[] nativeCopyIoLatencyQuantiles;
81+
7482
@Metric("number of file io errors")
7583
private MutableCounterLong totalFileIoErrors;
7684
@Metric("file io error rate")
@@ -162,6 +170,40 @@ public double getWriteIoStdDev() {
162170
return writeIoRate.lastStat().stddev();
163171
}
164172

173+
// Based on transferIoRate
174+
public long getTransferIoSampleCount() {
175+
return transferIoRate.lastStat().numSamples();
176+
}
177+
178+
public double getTransferIoMean() {
179+
return transferIoRate.lastStat().mean();
180+
}
181+
182+
public double getTransferIoStdDev() {
183+
return transferIoRate.lastStat().stddev();
184+
}
185+
186+
public MutableQuantiles[] getTransferIoQuantiles() {
187+
return transferIoLatencyQuantiles;
188+
}
189+
190+
// Based on nativeCopyIoRate
191+
public long getNativeCopyIoSampleCount() {
192+
return nativeCopyIoRate.lastStat().numSamples();
193+
}
194+
195+
public double getNativeCopyIoMean() {
196+
return nativeCopyIoRate.lastStat().mean();
197+
}
198+
199+
public double getNativeCopyIoStdDev() {
200+
return nativeCopyIoRate.lastStat().stddev();
201+
}
202+
203+
public MutableQuantiles[] getNativeCopyIoQuantiles() {
204+
return nativeCopyIoLatencyQuantiles;
205+
}
206+
165207
public long getTotalFileIoErrors() {
166208
return totalFileIoErrors.value();
167209
}
@@ -193,6 +235,8 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
193235
syncIoLatencyQuantiles = new MutableQuantiles[len];
194236
readIoLatencyQuantiles = new MutableQuantiles[len];
195237
writeIoLatencyQuantiles = new MutableQuantiles[len];
238+
transferIoLatencyQuantiles = new MutableQuantiles[len];
239+
nativeCopyIoLatencyQuantiles = new MutableQuantiles[len];
196240
for (int i = 0; i < len; i++) {
197241
int interval = intervals[i];
198242
metadataOperationLatencyQuantiles[i] = registry.newQuantiles(
@@ -213,6 +257,12 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
213257
writeIoLatencyQuantiles[i] = registry.newQuantiles(
214258
"writeIoLatency" + interval + "s",
215259
"Data write Io Latency in ms", "ops", "latency", interval);
260+
transferIoLatencyQuantiles[i] = registry.newQuantiles(
261+
"transferIoLatency" + interval + "s",
262+
"Data transfer Io Latency in ms", "ops", "latency", interval);
263+
nativeCopyIoLatencyQuantiles[i] = registry.newQuantiles(
264+
"nativeCopyIoLatency" + interval + "s",
265+
"Data nativeCopy Io Latency in ms", "ops", "latency", interval);
216266
}
217267
}
218268

@@ -282,6 +332,20 @@ public void addWriteIoLatency(final long latency) {
282332
}
283333
}
284334

335+
public void addTransferIoLatency(final long latency) {
336+
transferIoRate.add(latency);
337+
for (MutableQuantiles q: transferIoLatencyQuantiles) {
338+
q.add(latency);
339+
}
340+
}
341+
342+
public void addNativeCopyIoLatency(final long latency) {
343+
nativeCopyIoRate.add(latency);
344+
for (MutableQuantiles q: nativeCopyIoLatencyQuantiles) {
345+
q.add(latency);
346+
}
347+
}
348+
285349
public void addFileIoError(final long latency) {
286350
totalFileIoErrors.incr();
287351
fileIoErrorRate.add(latency);

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeMetrics.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ private void verifyDataNodeVolumeMetrics(final FileSystem fs,
151151
LOG.info("MetadataOperationSampleCount : " +
152152
metrics.getMetadataOperationSampleCount());
153153
LOG.info("MetadataOperationMean : " + metrics.getMetadataOperationMean());
154-
LOG.info("MetadataFileIoStdDev : " +
154+
LOG.info("MetadataOperationStdDev : " +
155155
metrics.getMetadataOperationStdDev());
156156

157157
LOG.info("DataFileIoSampleCount : " + metrics.getDataFileIoSampleCount());
@@ -174,6 +174,15 @@ private void verifyDataNodeVolumeMetrics(final FileSystem fs,
174174
LOG.info("writeIoMean : " + metrics.getWriteIoMean());
175175
LOG.info("writeIoStdDev : " + metrics.getWriteIoStdDev());
176176

177+
LOG.info("transferIoSampleCount : " + metrics.getTransferIoSampleCount());
178+
LOG.info("transferIoMean : " + metrics.getTransferIoMean());
179+
LOG.info("transferIoStdDev : " + metrics.getTransferIoStdDev());
180+
181+
LOG.info("nativeCopyIoSampleCount : " +
182+
metrics.getNativeCopyIoSampleCount());
183+
LOG.info("nativeCopyIoMean : " + metrics.getNativeCopyIoMean());
184+
LOG.info("nativeCopyIoStdDev : " + metrics.getNativeCopyIoStdDev());
185+
177186
LOG.info("fileIoErrorSampleCount : "
178187
+ metrics.getFileIoErrorSampleCount());
179188
LOG.info("fileIoErrorMean : " + metrics.getFileIoErrorMean());

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
6363
import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
6464
import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
65+
import org.apache.hadoop.hdfs.server.datanode.fsdataset.DataNodeVolumeMetrics;
6566
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
6667
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi.FsVolumeReferences;
6768
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
@@ -1238,6 +1239,10 @@ public void testMoveBlockSuccess() {
12381239
ReplicaInfo newReplicaInfo = createNewReplicaObj(block, fsDataSetImpl);
12391240
fsDataSetImpl.finalizeNewReplica(newReplicaInfo, block);
12401241

1242+
final FsVolumeSpi volume = fsDataSetImpl.getVolume(block);
1243+
DataNodeVolumeMetrics metrics = volume.getMetrics();
1244+
System.out.println(metrics); // nativecopy
1245+
12411246
} catch (Exception ex) {
12421247
LOG.info("Exception in testMoveBlockSuccess ", ex);
12431248
fail("MoveBlock operation should succeed");
@@ -1830,4 +1835,58 @@ public void testReleaseVolumeRefIfExceptionThrown() throws IOException {
18301835
cluster.shutdown();
18311836
}
18321837
}
1838+
1839+
@Test(timeout = 30000)
1840+
public void testTransferAndNativeCopyMetrics() {
1841+
Configuration config = new HdfsConfiguration();
1842+
config.setInt(
1843+
DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
1844+
100);
1845+
config.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
1846+
"60,300,1500");
1847+
MiniDFSCluster cluster = null;
1848+
try {
1849+
cluster = new MiniDFSCluster.Builder(config)
1850+
.numDataNodes(1)
1851+
.storageTypes(new StorageType[]{StorageType.DISK, StorageType.DISK})
1852+
.storagesPerDatanode(2)
1853+
.build();
1854+
FileSystem fs = cluster.getFileSystem();
1855+
DataNode dataNode = cluster.getDataNodes().get(0);
1856+
1857+
// Create file that has one block with one replica.
1858+
Path filePath = new Path(name.getMethodName());
1859+
DFSTestUtil.createFile(fs, filePath, 100, (short) 1, 0);
1860+
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
1861+
1862+
// Copy a new replica to other volume.
1863+
FsDatasetImpl fsDataSetImpl = (FsDatasetImpl) dataNode.getFSDataset();
1864+
ReplicaInfo newReplicaInfo = createNewReplicaObj(block, fsDataSetImpl);
1865+
fsDataSetImpl.finalizeNewReplica(newReplicaInfo, block);
1866+
1867+
// Get the volume where the original replica resides.
1868+
FsVolumeSpi volume = null;
1869+
for (FsVolumeSpi fsVolumeReference :
1870+
fsDataSetImpl.getFsVolumeReferences()) {
1871+
if (!fsVolumeReference.getStorageID()
1872+
.equals(newReplicaInfo.getStorageUuid())) {
1873+
volume = fsVolumeReference;
1874+
}
1875+
}
1876+
1877+
// Assert metrics.
1878+
DataNodeVolumeMetrics metrics = volume.getMetrics();
1879+
assertEquals(2, metrics.getTransferIoSampleCount());
1880+
assertEquals(3, metrics.getTransferIoQuantiles().length);
1881+
assertEquals(2, metrics.getNativeCopyIoSampleCount());
1882+
assertEquals(3, metrics.getNativeCopyIoQuantiles().length);
1883+
} catch (Exception ex) {
1884+
LOG.info("Exception in testTransferAndNativeCopyMetrics ", ex);
1885+
fail("MoveBlock operation should succeed");
1886+
} finally {
1887+
if (cluster.isClusterUp()) {
1888+
cluster.shutdown();
1889+
}
1890+
}
1891+
}
18331892
}

0 commit comments

Comments
 (0)