Skip to content

Commit 515cba7

Browse files
ahmarsuhail authored and steveloughran committed
HADOOP-18231. S3A prefetching: fix failing tests & drain stream async. (apache#4386)
* adds in new test for prefetching input stream * creates streamStats before opening stream * updates numBlocks calculation method * fixes ITestS3AOpenCost.testOpenFileLongerLength * drains stream async * fixes failing unit test Contributed by Ahmar Suhail
1 parent 3c06960 commit 515cba7

File tree

15 files changed

+373
-174
lines changed

15 files changed

+373
-174
lines changed

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/common/CachingBlockManager.java

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,8 @@
3131
import org.slf4j.Logger;
3232
import org.slf4j.LoggerFactory;
3333

34+
import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
35+
3436
/**
3537
* Provides read access to the underlying file one block at a time.
3638
* Improve read performance by prefetching and locally caching blocks.
@@ -204,7 +206,7 @@ public synchronized void close() {
204206
// Cancel any prefetches in progress.
205207
this.cancelPrefetches();
206208

207-
Io.closeIgnoringIoException(this.cache);
209+
cleanupWithLogger(LOG, this.cache);
208210

209211
this.ops.end(op);
210212
LOG.info(this.ops.getSummary(false));

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/common/Io.java

Lines changed: 0 additions & 45 deletions
This file was deleted.

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1524,7 +1524,8 @@ private FSDataInputStream executeOpen(
15241524
new S3PrefetchingInputStream(
15251525
readContext.build(),
15261526
createObjectAttributes(path, fileStatus),
1527-
createInputStreamCallbacks(auditSpan)));
1527+
createInputStreamCallbacks(auditSpan),
1528+
inputStreamStats));
15281529
} else {
15291530
return new FSDataInputStream(
15301531
new S3AInputStream(

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/read/S3CachingInputStream.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import org.apache.hadoop.fs.s3a.S3AInputStream;
3232
import org.apache.hadoop.fs.s3a.S3AReadOpContext;
3333
import org.apache.hadoop.fs.s3a.S3ObjectAttributes;
34+
import org.apache.hadoop.fs.s3a.statistics.S3AInputStreamStatistics;
3435

3536
/**
3637
* Provides an {@code InputStream} that allows reading from an S3 file.
@@ -53,6 +54,7 @@ public class S3CachingInputStream extends S3InputStream {
5354
* @param context read-specific operation context.
5455
* @param s3Attributes attributes of the S3 object being read.
5556
* @param client callbacks used for interacting with the underlying S3 client.
57+
* @param streamStatistics statistics for this stream.
5658
*
5759
* @throws IllegalArgumentException if context is null.
5860
* @throws IllegalArgumentException if s3Attributes is null.
@@ -61,8 +63,9 @@ public class S3CachingInputStream extends S3InputStream {
6163
public S3CachingInputStream(
6264
S3AReadOpContext context,
6365
S3ObjectAttributes s3Attributes,
64-
S3AInputStream.InputStreamCallbacks client) {
65-
super(context, s3Attributes, client);
66+
S3AInputStream.InputStreamCallbacks client,
67+
S3AInputStreamStatistics streamStatistics) {
68+
super(context, s3Attributes, client, streamStatistics);
6669

6770
this.numBlocksToPrefetch = this.getContext().getPrefetchBlockCount();
6871
int bufferPoolSize = this.numBlocksToPrefetch + 1;

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/read/S3File.java

Lines changed: 126 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,17 @@
1919

2020
package org.apache.hadoop.fs.s3a.read;
2121

22-
import java.io.Closeable;
22+
2323
import java.io.IOException;
2424
import java.io.InputStream;
25-
import java.util.ArrayList;
2625
import java.util.IdentityHashMap;
27-
import java.util.List;
2826
import java.util.Map;
2927

3028
import com.amazonaws.services.s3.model.GetObjectRequest;
3129
import com.amazonaws.services.s3.model.S3Object;
30+
import org.slf4j.Logger;
31+
import org.slf4j.LoggerFactory;
3232

33-
import org.apache.hadoop.fs.common.Io;
3433
import org.apache.hadoop.fs.common.Validate;
3534
import org.apache.hadoop.fs.s3a.Invoker;
3635
import org.apache.hadoop.fs.s3a.S3AInputStream;
@@ -40,30 +39,56 @@
4039
import org.apache.hadoop.fs.s3a.statistics.S3AInputStreamStatistics;
4140
import org.apache.hadoop.fs.statistics.DurationTracker;
4241

42+
import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.invokeTrackingDuration;
43+
import static org.apache.hadoop.io.IOUtils.cleanupWithLogger;
44+
4345
/**
4446
* Encapsulates low level interactions with S3 object on AWS.
4547
*/
46-
public class S3File implements Closeable {
48+
public class S3File {
49+
private static final Logger LOG = LoggerFactory.getLogger(S3File.class);
4750

48-
// Read-specific operation context.
51+
/**
52+
* Read-specific operation context.
53+
*/
4954
private final S3AReadOpContext context;
5055

51-
// S3 object attributes.
56+
/**
57+
* S3 object attributes.
58+
*/
5259
private final S3ObjectAttributes s3Attributes;
5360

54-
// Callbacks used for interacting with the underlying S3 client.
61+
/**
62+
* Callbacks used for interacting with the underlying S3 client.
63+
*/
5564
private final S3AInputStream.InputStreamCallbacks client;
5665

57-
// Used for reporting input stream access statistics.
66+
/**
67+
* Used for reporting input stream access statistics.
68+
*/
5869
private final S3AInputStreamStatistics streamStatistics;
5970

60-
// Enforces change tracking related policies.
71+
/**
72+
* Enforces change tracking related policies.
73+
*/
6174
private final ChangeTracker changeTracker;
6275

63-
// Maps a stream returned by openForRead() to the associated S3 object.
64-
// That allows us to close the object when closing the stream.
76+
/**
77+
* Maps a stream returned by openForRead() to the associated S3 object.
78+
* That allows us to close the object when closing the stream.
79+
*/
6580
private Map<InputStream, S3Object> s3Objects;
6681

82+
/**
83+
* uri of the object being read.
84+
*/
85+
private final String uri;
86+
87+
/**
88+
* size of a buffer to create when draining the stream.
89+
*/
90+
private static final int DRAIN_BUFFER_SIZE = 16384;
91+
6792
/**
6893
* Initializes a new instance of the {@code S3File} class.
6994
*
@@ -97,7 +122,8 @@ public S3File(
97122
this.client = client;
98123
this.streamStatistics = streamStatistics;
99124
this.changeTracker = changeTracker;
100-
this.s3Objects = new IdentityHashMap<InputStream, S3Object>();
125+
this.s3Objects = new IdentityHashMap<>();
126+
this.uri = this.getPath();
101127
}
102128

103129
/**
@@ -169,7 +195,6 @@ public InputStream openForRead(long offset, int size) throws IOException {
169195
.withRange(offset, offset + size - 1);
170196
this.changeTracker.maybeApplyConstraint(request);
171197

172-
String uri = this.getPath();
173198
String operation = String.format(
174199
"%s %s at %d", S3AInputStream.OPERATION_OPEN, uri, offset);
175200
DurationTracker tracker = streamStatistics.initiateGetRequest();
@@ -193,18 +218,7 @@ public InputStream openForRead(long offset, int size) throws IOException {
193218
return stream;
194219
}
195220

196-
/**
197-
* Closes this stream and releases all acquired resources.
198-
*/
199-
@Override
200-
public synchronized void close() {
201-
List<InputStream> streams = new ArrayList<InputStream>(this.s3Objects.keySet());
202-
for (InputStream stream : streams) {
203-
this.close(stream);
204-
}
205-
}
206-
207-
void close(InputStream inputStream) {
221+
void close(InputStream inputStream, int numRemainingBytes) {
208222
S3Object obj;
209223
synchronized (this.s3Objects) {
210224
obj = this.s3Objects.get(inputStream);
@@ -214,7 +228,91 @@ void close(InputStream inputStream) {
214228
this.s3Objects.remove(inputStream);
215229
}
216230

217-
Io.closeIgnoringIoException(inputStream);
218-
Io.closeIgnoringIoException(obj);
231+
if (numRemainingBytes <= this.context.getAsyncDrainThreshold()) {
232+
// don't bother with async io.
233+
drain(false, "close() operation", numRemainingBytes, obj, inputStream);
234+
} else {
235+
LOG.debug("initiating asynchronous drain of {} bytes", numRemainingBytes);
236+
// schedule an async drain/abort with references to the fields so they
237+
// can be reused
238+
client.submit(() -> drain(false, "close() operation", numRemainingBytes, obj, inputStream));
239+
}
240+
}
241+
242+
/**
243+
* drain the stream. This method is intended to be
244+
* used directly or asynchronously, and measures the
245+
* duration of the operation in the stream statistics.
246+
*
247+
* @param shouldAbort force an abort; used if explicitly requested.
248+
* @param reason reason for stream being closed; used in messages
249+
* @param remaining remaining bytes
250+
* @param requestObject http request object;
251+
* @param inputStream stream to close.
252+
* @return was the stream aborted?
253+
*/
254+
private boolean drain(
255+
final boolean shouldAbort,
256+
final String reason,
257+
final long remaining,
258+
final S3Object requestObject,
259+
final InputStream inputStream) {
260+
261+
try {
262+
return invokeTrackingDuration(streamStatistics.initiateInnerStreamClose(shouldAbort),
263+
() -> drainOrAbortHttpStream(shouldAbort, reason, remaining, requestObject, inputStream));
264+
} catch (IOException e) {
265+
// this is only here because invokeTrackingDuration() has it in its
266+
// signature
267+
return shouldAbort;
268+
}
269+
}
270+
271+
/**
272+
* Drain or abort the inner stream.
273+
* Exceptions are swallowed.
274+
* If a close() is attempted and fails, the operation escalates to
275+
* an abort.
276+
*
277+
* @param shouldAbort force an abort; used if explicitly requested.
278+
* @param reason reason for stream being closed; used in messages
279+
* @param remaining remaining bytes
280+
* @param requestObject http request object
281+
* @param inputStream stream to close.
282+
* @return was the stream aborted?
283+
*/
284+
private boolean drainOrAbortHttpStream(
285+
boolean shouldAbort,
286+
final String reason,
287+
final long remaining,
288+
final S3Object requestObject,
289+
final InputStream inputStream) {
290+
291+
if (!shouldAbort && remaining > 0) {
292+
try {
293+
long drained = 0;
294+
byte[] buffer = new byte[DRAIN_BUFFER_SIZE];
295+
while (true) {
296+
final int count = inputStream.read(buffer);
297+
if (count < 0) {
298+
// no more data is left
299+
break;
300+
}
301+
drained += count;
302+
}
303+
LOG.debug("Drained stream of {} bytes", drained);
304+
} catch (Exception e) {
305+
// exception escalates to an abort
306+
LOG.debug("When closing {} stream for {}, will abort the stream", uri, reason, e);
307+
shouldAbort = true;
308+
}
309+
}
310+
cleanupWithLogger(LOG, inputStream);
311+
cleanupWithLogger(LOG, requestObject);
312+
streamStatistics.streamClose(shouldAbort, remaining);
313+
314+
LOG.debug("Stream {} {}: {}; remaining={}", uri, (shouldAbort ? "aborted" : "closed"), reason,
315+
remaining);
316+
return shouldAbort;
219317
}
220318
}

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/read/S3InMemoryInputStream.java

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
import org.apache.hadoop.fs.s3a.S3AInputStream;
3030
import org.apache.hadoop.fs.s3a.S3AReadOpContext;
3131
import org.apache.hadoop.fs.s3a.S3ObjectAttributes;
32+
import org.apache.hadoop.fs.s3a.statistics.S3AInputStreamStatistics;
3233

3334
/**
3435
* Provides an {@code InputStream} that allows reading from an S3 file.
@@ -48,6 +49,7 @@ public class S3InMemoryInputStream extends S3InputStream {
4849
* @param context read-specific operation context.
4950
* @param s3Attributes attributes of the S3 object being read.
5051
* @param client callbacks used for interacting with the underlying S3 client.
52+
* @param streamStatistics statistics for this stream.
5153
*
5254
* @throws IllegalArgumentException if context is null.
5355
* @throws IllegalArgumentException if s3Attributes is null.
@@ -56,8 +58,9 @@ public class S3InMemoryInputStream extends S3InputStream {
5658
public S3InMemoryInputStream(
5759
S3AReadOpContext context,
5860
S3ObjectAttributes s3Attributes,
59-
S3AInputStream.InputStreamCallbacks client) {
60-
super(context, s3Attributes, client);
61+
S3AInputStream.InputStreamCallbacks client,
62+
S3AInputStreamStatistics streamStatistics) {
63+
super(context, s3Attributes, client, streamStatistics);
6164
int fileSize = (int) s3Attributes.getLen();
6265
this.buffer = ByteBuffer.allocate(fileSize);
6366
LOG.debug("Created in-memory input stream for {} (size = {})", this.getName(), fileSize);

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/read/S3InputStream.java

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@
4444
import org.apache.hadoop.fs.statistics.IOStatistics;
4545
import org.apache.hadoop.fs.statistics.IOStatisticsSource;
4646

47+
import static java.util.Objects.requireNonNull;
48+
4749
/**
4850
* Provides an {@link InputStream} that allows reading from an S3 file.
4951
*/
@@ -96,6 +98,7 @@ public abstract class S3InputStream
9698
* @param context read-specific operation context.
9799
* @param s3Attributes attributes of the S3 object being read.
98100
* @param client callbacks used for interacting with the underlying S3 client.
101+
* @param streamStatistics statistics for this stream.
99102
*
100103
* @throws IllegalArgumentException if context is null.
101104
* @throws IllegalArgumentException if s3Attributes is null.
@@ -104,16 +107,13 @@ public abstract class S3InputStream
104107
public S3InputStream(
105108
S3AReadOpContext context,
106109
S3ObjectAttributes s3Attributes,
107-
S3AInputStream.InputStreamCallbacks client) {
108-
109-
Validate.checkNotNull(context, "context");
110-
Validate.checkNotNull(s3Attributes, "s3Attributes");
111-
Validate.checkNotNull(client, "client");
110+
S3AInputStream.InputStreamCallbacks client,
111+
S3AInputStreamStatistics streamStatistics) {
112112

113-
this.context = context;
114-
this.s3Attributes = s3Attributes;
115-
this.client = client;
116-
this.streamStatistics = context.getS3AStatisticsContext().newInputStreamStatistics();
113+
this.context = requireNonNull(context);
114+
this.s3Attributes = requireNonNull(s3Attributes);
115+
this.client = requireNonNull(client);
116+
this.streamStatistics = requireNonNull(streamStatistics);
117117
this.ioStatistics = streamStatistics.getIOStatistics();
118118
this.name = S3File.getPath(s3Attributes);
119119
this.changeTracker = new ChangeTracker(

0 commit comments

Comments
 (0)