Skip to content

Commit c9f95b0

Browse files
authored
HDFS-16315. Add metrics related to Transfer and NativeCopy for DataNode (#3643)
Reviewed-by: Hui Fei <ferhui@apache.org>
Reviewed-by: Ayush Saxena <ayushsaxena@apache.org>
1 parent 89fcbd8 commit c9f95b0

File tree

5 files changed

+132
-1
lines changed

5 files changed

+132
-1
lines changed

hadoop-common-project/hadoop-common/src/site/markdown/Metrics.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -512,6 +512,12 @@ contains tags such as Hostname as additional information along with metrics.
512512
| `WriteIoRateNumOps` | The number of file write io operations within an interval time of metric |
513513
| `WriteIoRateAvgTime` | Mean time of file write io operations in milliseconds |
514514
| `WriteIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file write io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default; it is enabled by configuring monitoring intervals via `dfs.metrics.percentiles.intervals`. |
515+
| `TransferIoRateNumOps` | The number of file transfer io operations within an interval time of metric |
516+
| `TransferIoRateAvgTime` | Mean time of file transfer io operations in milliseconds |
517+
| `TransferIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file transfer io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default; it is enabled by configuring monitoring intervals via `dfs.metrics.percentiles.intervals`. |
518+
| `NativeCopyIoRateNumOps` | The number of file nativeCopy io operations within an interval time of metric |
519+
| `NativeCopyIoRateAvgTime` | Mean time of file nativeCopy io operations in milliseconds |
520+
| `NativeCopyIoLatency`*num*`s(50/75/90/95/99)thPercentileLatency` | The 50/75/90/95/99th percentile of file nativeCopy io operations latency in milliseconds (*num* seconds granularity). Percentile measurement is off by default; it is enabled by configuring monitoring intervals via `dfs.metrics.percentiles.intervals`. |
515521
| `TotalFileIoErrors` | Total number (monotonically increasing) of file io error operations |
516522
| `FileIoErrorRateNumOps` | The number of file io error operations within an interval time of metric |
517523
| `FileIoErrorRateAvgTime` | It measures the mean time in milliseconds from the start of an operation to hitting a failure |

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/ProfilingFileIoEvents.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,12 @@ public void afterFileIo(@Nullable FsVolumeSpi volume,
116116
case WRITE:
117117
metrics.addWriteIoLatency(latency);
118118
break;
119+
case TRANSFER:
120+
metrics.addTransferIoLatency(latency);
121+
break;
122+
case NATIVE_COPY:
123+
metrics.addNativeCopyIoLatency(latency);
124+
break;
119125
default:
120126
}
121127
}

hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/DataNodeVolumeMetrics.java

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,14 @@ public class DataNodeVolumeMetrics {
7171
private MutableRate writeIoRate;
7272
private MutableQuantiles[] writeIoLatencyQuantiles;
7373

74+
@Metric("file io transfer rate")
75+
private MutableRate transferIoRate;
76+
private MutableQuantiles[] transferIoLatencyQuantiles;
77+
78+
@Metric("file io nativeCopy rate")
79+
private MutableRate nativeCopyIoRate;
80+
private MutableQuantiles[] nativeCopyIoLatencyQuantiles;
81+
7482
@Metric("number of file io errors")
7583
private MutableCounterLong totalFileIoErrors;
7684
@Metric("file io error rate")
@@ -162,6 +170,40 @@ public double getWriteIoStdDev() {
162170
return writeIoRate.lastStat().stddev();
163171
}
164172

173+
// Based on transferIoRate
174+
public long getTransferIoSampleCount() {
175+
return transferIoRate.lastStat().numSamples();
176+
}
177+
178+
public double getTransferIoMean() {
179+
return transferIoRate.lastStat().mean();
180+
}
181+
182+
public double getTransferIoStdDev() {
183+
return transferIoRate.lastStat().stddev();
184+
}
185+
186+
public MutableQuantiles[] getTransferIoQuantiles() {
187+
return transferIoLatencyQuantiles;
188+
}
189+
190+
// Based on nativeCopyIoRate
191+
public long getNativeCopyIoSampleCount() {
192+
return nativeCopyIoRate.lastStat().numSamples();
193+
}
194+
195+
public double getNativeCopyIoMean() {
196+
return nativeCopyIoRate.lastStat().mean();
197+
}
198+
199+
public double getNativeCopyIoStdDev() {
200+
return nativeCopyIoRate.lastStat().stddev();
201+
}
202+
203+
public MutableQuantiles[] getNativeCopyIoQuantiles() {
204+
return nativeCopyIoLatencyQuantiles;
205+
}
206+
165207
public long getTotalFileIoErrors() {
166208
return totalFileIoErrors.value();
167209
}
@@ -193,6 +235,8 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
193235
syncIoLatencyQuantiles = new MutableQuantiles[len];
194236
readIoLatencyQuantiles = new MutableQuantiles[len];
195237
writeIoLatencyQuantiles = new MutableQuantiles[len];
238+
transferIoLatencyQuantiles = new MutableQuantiles[len];
239+
nativeCopyIoLatencyQuantiles = new MutableQuantiles[len];
196240
for (int i = 0; i < len; i++) {
197241
int interval = intervals[i];
198242
metadataOperationLatencyQuantiles[i] = registry.newQuantiles(
@@ -213,6 +257,12 @@ public DataNodeVolumeMetrics(final MetricsSystem metricsSystem,
213257
writeIoLatencyQuantiles[i] = registry.newQuantiles(
214258
"writeIoLatency" + interval + "s",
215259
"Data write Io Latency in ms", "ops", "latency", interval);
260+
transferIoLatencyQuantiles[i] = registry.newQuantiles(
261+
"transferIoLatency" + interval + "s",
262+
"Data transfer Io Latency in ms", "ops", "latency", interval);
263+
nativeCopyIoLatencyQuantiles[i] = registry.newQuantiles(
264+
"nativeCopyIoLatency" + interval + "s",
265+
"Data nativeCopy Io Latency in ms", "ops", "latency", interval);
216266
}
217267
}
218268

@@ -282,6 +332,20 @@ public void addWriteIoLatency(final long latency) {
282332
}
283333
}
284334

335+
public void addTransferIoLatency(final long latency) {
336+
transferIoRate.add(latency);
337+
for (MutableQuantiles q: transferIoLatencyQuantiles) {
338+
q.add(latency);
339+
}
340+
}
341+
342+
public void addNativeCopyIoLatency(final long latency) {
343+
nativeCopyIoRate.add(latency);
344+
for (MutableQuantiles q: nativeCopyIoLatencyQuantiles) {
345+
q.add(latency);
346+
}
347+
}
348+
285349
public void addFileIoError(final long latency) {
286350
totalFileIoErrors.incr();
287351
fileIoErrorRate.add(latency);

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/TestDataNodeVolumeMetrics.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,7 @@ private void verifyDataNodeVolumeMetrics(final FileSystem fs,
151151
LOG.info("MetadataOperationSampleCount : " +
152152
metrics.getMetadataOperationSampleCount());
153153
LOG.info("MetadataOperationMean : " + metrics.getMetadataOperationMean());
154-
LOG.info("MetadataFileIoStdDev : " +
154+
LOG.info("MetadataOperationStdDev : " +
155155
metrics.getMetadataOperationStdDev());
156156

157157
LOG.info("DataFileIoSampleCount : " + metrics.getDataFileIoSampleCount());
@@ -174,6 +174,15 @@ private void verifyDataNodeVolumeMetrics(final FileSystem fs,
174174
LOG.info("writeIoMean : " + metrics.getWriteIoMean());
175175
LOG.info("writeIoStdDev : " + metrics.getWriteIoStdDev());
176176

177+
LOG.info("transferIoSampleCount : " + metrics.getTransferIoSampleCount());
178+
LOG.info("transferIoMean : " + metrics.getTransferIoMean());
179+
LOG.info("transferIoStdDev : " + metrics.getTransferIoStdDev());
180+
181+
LOG.info("nativeCopyIoSampleCount : " +
182+
metrics.getNativeCopyIoSampleCount());
183+
LOG.info("nativeCopyIoMean : " + metrics.getNativeCopyIoMean());
184+
LOG.info("nativeCopyIoStdDev : " + metrics.getNativeCopyIoStdDev());
185+
177186
LOG.info("fileIoErrorSampleCount : "
178187
+ metrics.getFileIoErrorSampleCount());
179188
LOG.info("fileIoErrorMean : " + metrics.getFileIoErrorMean());

hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/datanode/fsdataset/impl/TestFsDatasetImpl.java

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@
6262
import org.apache.hadoop.hdfs.server.datanode.ReplicaInfo;
6363
import org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry;
6464
import org.apache.hadoop.hdfs.server.datanode.StorageLocation;
65+
import org.apache.hadoop.hdfs.server.datanode.fsdataset.DataNodeVolumeMetrics;
6566
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi;
6667
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsDatasetSpi.FsVolumeReferences;
6768
import org.apache.hadoop.hdfs.server.datanode.fsdataset.FsVolumeSpi;
@@ -1830,4 +1831,49 @@ public void testReleaseVolumeRefIfExceptionThrown() throws IOException {
18301831
cluster.shutdown();
18311832
}
18321833
}
1834+
1835+
@Test(timeout = 30000)
1836+
public void testTransferAndNativeCopyMetrics() throws IOException {
1837+
Configuration config = new HdfsConfiguration();
1838+
config.setInt(
1839+
DFSConfigKeys.DFS_DATANODE_FILEIO_PROFILING_SAMPLING_PERCENTAGE_KEY,
1840+
100);
1841+
config.set(DFSConfigKeys.DFS_METRICS_PERCENTILES_INTERVALS_KEY,
1842+
"60,300,1500");
1843+
try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(config)
1844+
.numDataNodes(1)
1845+
.storageTypes(new StorageType[]{StorageType.DISK, StorageType.DISK})
1846+
.storagesPerDatanode(2)
1847+
.build()) {
1848+
FileSystem fs = cluster.getFileSystem();
1849+
DataNode dataNode = cluster.getDataNodes().get(0);
1850+
1851+
// Create file that has one block with one replica.
1852+
Path filePath = new Path(name.getMethodName());
1853+
DFSTestUtil.createFile(fs, filePath, 100, (short) 1, 0);
1854+
ExtendedBlock block = DFSTestUtil.getFirstBlock(fs, filePath);
1855+
1856+
// Copy a new replica to other volume.
1857+
FsDatasetImpl fsDataSetImpl = (FsDatasetImpl) dataNode.getFSDataset();
1858+
ReplicaInfo newReplicaInfo = createNewReplicaObj(block, fsDataSetImpl);
1859+
fsDataSetImpl.finalizeNewReplica(newReplicaInfo, block);
1860+
1861+
// Get the volume where the original replica resides.
1862+
FsVolumeSpi volume = null;
1863+
for (FsVolumeSpi fsVolumeReference :
1864+
fsDataSetImpl.getFsVolumeReferences()) {
1865+
if (!fsVolumeReference.getStorageID()
1866+
.equals(newReplicaInfo.getStorageUuid())) {
1867+
volume = fsVolumeReference;
1868+
}
1869+
}
1870+
1871+
// Assert metrics.
1872+
DataNodeVolumeMetrics metrics = volume.getMetrics();
1873+
assertEquals(2, metrics.getTransferIoSampleCount());
1874+
assertEquals(3, metrics.getTransferIoQuantiles().length);
1875+
assertEquals(2, metrics.getNativeCopyIoSampleCount());
1876+
assertEquals(3, metrics.getNativeCopyIoQuantiles().length);
1877+
}
1878+
}
18331879
}

0 commit comments

Comments (0)