Skip to content

Commit 9db61ad

Browse files
committed
HADOOP-16202. Enhanced openFile(): hadoop-aws changes. (#2584/3)
S3A input stream support for the few fs.option.openfile settings. As well as supporting the read policy option and values, if the file length is declared in fs.option.openfile.length then no HEAD request will be issued when opening a file. This can cut a few tens of milliseconds off the operation. The patch adds a new openfile parameter/FS configuration option fs.s3a.input.async.drain.threshold (default: 16000). It declares the number of bytes remaining in the http input stream above which any operation to read and discard the rest of the stream, "draining", is executed asynchronously. This asynchronous draining offers some performance benefit on seek-heavy file IO. Contributed by Steve Loughran. Change-Id: I9b0626bbe635e9fd97ac0f463f5e7167e0111e39
1 parent e123de9 commit 9db61ad

38 files changed

+2065
-375
lines changed

hadoop-tools/hadoop-aws/dev-support/findbugs-exclude.xml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@
2828
<Method name="s3Exists" />
2929
<Bug pattern="RCN_REDUNDANT_NULLCHECK_OF_NONNULL_VALUE" />
3030
</Match>
31+
<!-- we are using completable futures, so ignore the Future which submit() returns -->
32+
<Match>
33+
<Class name="org.apache.hadoop.fs.s3a.S3AFileSystem$InputStreamCallbacksImpl" />
34+
<Bug pattern="RV_RETURN_VALUE_IGNORED_BAD_PRACTICE" />
35+
</Match>
3136

3237
<!--
3338
findbugs gets confused by lambda expressions in synchronized methods

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Constants.java

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
import org.apache.hadoop.classification.InterfaceAudience;
2222
import org.apache.hadoop.classification.InterfaceStability;
23+
import org.apache.hadoop.fs.Options;
2324
import org.apache.hadoop.security.ssl.DelegatingSSLSocketFactory;
2425

2526
import java.util.concurrent.TimeUnit;
@@ -602,37 +603,69 @@ private Constants() {
602603
public static final String READAHEAD_RANGE = "fs.s3a.readahead.range";
603604
public static final long DEFAULT_READAHEAD_RANGE = 64 * 1024;
604605

606+
/**
607+
* The number of bytes remaining in the HTTP input stream above which
608+
* draining the stream (reading and discarding the rest of it) is executed
609+
* asynchronously, accepting the schedule/wait overhead, rather than synchronously.
610+
* Value: {@value}
611+
*/
612+
public static final String ASYNC_DRAIN_THRESHOLD = "fs.s3a.input.async.drain.threshold";
613+
614+
/**
615+
* Default value of {@link #ASYNC_DRAIN_THRESHOLD}.
616+
* This is a number based purely on experimentation in {@code ITestS3AInputStreamPerformance}.
617+
* Value: {@value}
618+
*/
619+
public static final int DEFAULT_ASYNC_DRAIN_THRESHOLD = 16_000;
620+
605621
/**
606622
* Which input strategy to use for buffering, seeking and similar when
607623
* reading data.
608624
* Value: {@value}
609625
*/
610-
@InterfaceStability.Unstable
611626
public static final String INPUT_FADVISE =
612627
"fs.s3a.experimental.input.fadvise";
613628

629+
/**
630+
* The default input policy for this FS,
631+
* which, for S3A, is adaptive.
632+
* Value: {@value}
633+
* @deprecated use the {@link Options.OpenFileOptions} value
634+
* in code which only needs to be compiled against newer hadoop
635+
* releases.
636+
*/
637+
public static final String INPUT_FADV_DEFAULT =
638+
Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_DEFAULT;
639+
614640
/**
615641
* General input. Some seeks, some reads.
642+
* The policy name "default" is standard across different stores,
643+
* and should be preferred.
616644
* Value: {@value}
617645
*/
618-
@InterfaceStability.Unstable
619646
public static final String INPUT_FADV_NORMAL = "normal";
620647

621648
/**
622649
* Optimized for sequential access.
623650
* Value: {@value}
651+
* @deprecated use the {@link Options.OpenFileOptions} value
652+
* in code which only needs to be compiled against newer hadoop
653+
* releases.
624654
*/
625-
@InterfaceStability.Unstable
626-
public static final String INPUT_FADV_SEQUENTIAL = "sequential";
655+
public static final String INPUT_FADV_SEQUENTIAL =
656+
Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_SEQUENTIAL;
627657

628658
/**
629659
* Optimized purely for random seek+read/positionedRead operations;
630660
* The performance of sequential IO may be reduced in exchange for
631661
* more efficient {@code seek()} operations.
632662
* Value: {@value}
663+
* @deprecated use the {@link Options.OpenFileOptions} value
664+
* in code which only needs to be compiled against newer hadoop
665+
* releases.
633666
*/
634-
@InterfaceStability.Unstable
635-
public static final String INPUT_FADV_RANDOM = "random";
667+
public static final String INPUT_FADV_RANDOM =
668+
Options.OpenFileOptions.FS_OPTION_OPENFILE_READ_POLICY_RANDOM;
636669

637670
/**
638671
* Gauge name for the input policy : {@value}.

hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/Invoker.java

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,13 +31,16 @@
3131

3232
import org.apache.commons.lang3.StringUtils;
3333
import org.apache.hadoop.classification.InterfaceAudience;
34+
import org.apache.hadoop.fs.statistics.DurationTracker;
3435
import org.apache.hadoop.io.retry.RetryPolicy;
3536
import org.apache.hadoop.util.DurationInfo;
3637
import org.apache.hadoop.util.functional.CallableRaisingIOE;
3738
import org.apache.hadoop.util.functional.FutureIO;
3839
import org.apache.hadoop.util.functional.InvocationRaisingIOE;
3940
import org.apache.hadoop.util.Preconditions;
4041

42+
import static org.apache.hadoop.fs.statistics.impl.IOStatisticsBinding.invokeTrackingDuration;
43+
4144
/**
4245
* Class to provide lambda expression invocation of AWS operations.
4346
*
@@ -122,6 +125,31 @@ public static <T> T once(String action, String path,
122125
}
123126
}
124127

128+
/**
129+
* Execute a function once, translating any {@code AmazonClientException}
* it raises into an IOException via {@code S3AUtils.translateException}.
* An IOException raised by the operation itself is not caught here and
* propagates to the caller unchanged.
130+
* The supplied duration tracker instance is updated with success/failure.
* The tracker is handed to {@code invokeTrackingDuration}, which wraps the call.
131+
* @param action action to execute (used in error messages)
132+
* @param path path of work (used in error messages)
133+
* @param tracker tracker to update
134+
* @param operation operation to execute
135+
* @param <T> type of return value
136+
* @return the result of the function call
137+
* @throws IOException any IOE raised, or translated exception
138+
*/
139+
@Retries.OnceTranslated
140+
public static <T> T onceTrackingDuration(
141+
final String action,
142+
final String path,
143+
final DurationTracker tracker,
144+
final CallableRaisingIOE<T> operation)
145+
throws IOException {
146+
try {
147+
// single attempt: no retry logic is applied in this method
return invokeTrackingDuration(tracker, operation);
148+
} catch (AmazonClientException e) {
149+
// action and path are passed through so they can appear in the translated error
throw S3AUtils.translateException(action, path, e);
150+
}
151+
}
152+
125153
/**
126154
* Execute an operation with no result.
127155
* @param action action to execute (used in error messages)

0 commit comments

Comments
 (0)