|
19 | 19 |
|
20 | 20 | package org.apache.parquet.hadoop.util; |
21 | 21 |
|
| 22 | +import static org.apache.parquet.hadoop.util.wrapped.io.FutureIO.awaitFuture; |
| 23 | + |
22 | 24 | import java.io.IOException; |
| 25 | +import java.io.InterruptedIOException; |
| 26 | +import java.util.concurrent.CompletableFuture; |
23 | 27 | import org.apache.hadoop.conf.Configuration; |
| 28 | +import org.apache.hadoop.fs.FSDataInputStream; |
24 | 29 | import org.apache.hadoop.fs.FileStatus; |
25 | 30 | import org.apache.hadoop.fs.FileSystem; |
26 | 31 | import org.apache.hadoop.fs.Path; |
|
29 | 34 |
|
30 | 35 | public class HadoopInputFile implements InputFile { |
31 | 36 |
|
| 37 | + /** |
| 38 | + * openFile() option name for setting the read policy: {@value}. |
| 39 | + */ |
| 40 | + private static final String OPENFILE_READ_POLICY_KEY = "fs.option.openfile.read.policy"; |
| 41 | + |
| 42 | + /** |
| 43 | + * Read policy when opening parquet files: {@value}. |
| 44 | + * <p>Policy-aware stores pick the first policy they recognize in the list. |
| 45 | + * Everything recognizes "random"; |
| 46 | + * "vector" came in with Hadoop 3.4.0, while "parquet" came with Hadoop 3.4.1. |
| 47 | + * "parquet" means "this is a Parquet file, so be clever about footers, prefetch, |
| 48 | + * and expect vector and/or random IO". |
| 49 | + * <p>In Hadoop 3.4.1, "parquet" and "vector" are both mapped to "random" for the |
| 50 | + * S3A connector, but as the ABFS and GCS connectors do footer caching, they |
| 51 | + * may use it as a hint to say "fetch the footer and keep it in memory". |
| 52 | + */ |
| 53 | + private static final String PARQUET_READ_POLICY = "parquet, vector, random, adaptive"; |
| 54 | + |
32 | 55 | private final FileSystem fs; |
33 | 56 | private final FileStatus stat; |
34 | 57 | private final Configuration conf; |
@@ -70,9 +93,38 @@ public long getLength() { |
70 | 93 | return stat.getLen(); |
71 | 94 | } |
72 | 95 |
|
| 96 | + /** |
| 97 | + * Open the file. |
| 98 | + * <p>Uses {@code FileSystem.openFile()} so that |
| 99 | + * the existing FileStatus can be passed down: this saves a HEAD request on |
| 100 | + * cloud storage and is ignored everywhere else. |
| 101 | + * |
| 102 | + * @return the input stream. |
| 103 | + * |
| 104 | + * @throws InterruptedIOException if the future was interrupted |
| 105 | + * @throws IOException if the file could not be opened |
| 106 | + * @throws RuntimeException any nested RuntimeException thrown while opening |
| 107 | + */ |
73 | 108 | @Override |
74 | 109 | public SeekableInputStream newStream() throws IOException { |
75 | | - return HadoopStreams.wrap(fs.open(stat.getPath())); |
| 110 | + FSDataInputStream stream; |
| 111 | + try { |
| 112 | + // this method is async so that implementations may do async HEAD |
| 113 | + // requests; not done in S3A/ABFS when a file status is passed down (as is done here) |
| 114 | + final CompletableFuture<FSDataInputStream> future = fs.openFile(stat.getPath()) |
| 115 | + .withFileStatus(stat) |
| 116 | + .opt(OPENFILE_READ_POLICY_KEY, PARQUET_READ_POLICY) |
| 117 | + .build(); |
| 118 | + stream = awaitFuture(future); |
| 119 | + } catch (RuntimeException e) { |
| 120 | + // S3A < 3.3.5 would raise an illegal path exception if the openFile path didn't |
| 121 | + // equal the path in the FileStatus; Hive virtual FS could create this condition. |
| 122 | + // As the path to open is derived from stat.getPath(), this condition seems |
| 123 | + // near-impossible to create, but is handled here for due diligence. |
| 124 | + stream = fs.open(stat.getPath()); |
| 125 | + } |
| 126 | + |
| 127 | + return HadoopStreams.wrap(stream); |
76 | 128 | } |
77 | 129 |
|
78 | 130 | @Override |
|
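For reference, a minimal usage sketch of the new code path (the class name `OpenFileDemo` and the command-line argument are illustrative, not part of this change): calling `newStream()` now routes through `FileSystem.openFile()` with the parquet read policy and the cached `FileStatus`, falling back to `fs.open()` only if the openFile builder raises a `RuntimeException`.

```java
import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.InputFile;
import org.apache.parquet.io.SeekableInputStream;

public class OpenFileDemo {
  public static void main(String[] args) throws Exception {
    // Build an InputFile; fromPath() resolves the FileStatus once, so
    // newStream() can pass it down and skip a HEAD request on cloud stores.
    InputFile inputFile = HadoopInputFile.fromPath(new Path(args[0]), new Configuration());
    try (SeekableInputStream in = inputFile.newStream()) {
      byte[] magic = new byte[4];
      in.readFully(magic); // a valid Parquet file starts with the bytes "PAR1"
      System.out.println(new String(magic, StandardCharsets.US_ASCII));
    }
  }
}
```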