Commit 45ef127

Fixed #5777 - Obtain ORC stripe offsets from writer and avoid opening a written file
1 parent 0dd66f1 commit 45ef127

File tree

2 files changed: +14 -4 lines


orc/src/main/java/org/apache/iceberg/orc/OrcFileAppender.java

+3 -4
@@ -37,7 +37,6 @@
 import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
 import org.apache.iceberg.relocated.com.google.common.collect.Lists;
 import org.apache.orc.OrcFile;
-import org.apache.orc.Reader;
 import org.apache.orc.StripeInformation;
 import org.apache.orc.TypeDescription;
 import org.apache.orc.Writer;
@@ -147,11 +146,11 @@ public long length() {
   @Override
   public List<Long> splitOffsets() {
     Preconditions.checkState(isClosed, "File is not yet closed");
-    try (Reader reader = ORC.newFileReader(file.toInputFile(), conf)) {
-      List<StripeInformation> stripes = reader.getStripes();
+    try {
+      List<StripeInformation> stripes = writer.getStripes();
       return Collections.unmodifiableList(Lists.transform(stripes, StripeInformation::getOffset));
     } catch (IOException e) {
-      throw new RuntimeIOException(e, "Can't close ORC reader %s", file.location());
+      throw new RuntimeIOException(e, "Cannot receive stripe information from writer for %s", file.location());
     }
   }
 
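
For reference, this is roughly how the method reads after the change, assembled from the hunk above. The fields (writer, file, isClosed) and the imports are those of the surrounding OrcFileAppender class; this is an illustrative sketch, not the complete source file.

    // Sketch of OrcFileAppender.splitOffsets() after this commit: stripe metadata
    // now comes from the in-memory org.apache.orc.Writer, so the freshly closed
    // file no longer has to be reopened with a Reader.
    @Override
    public List<Long> splitOffsets() {
      Preconditions.checkState(isClosed, "File is not yet closed");
      try {
        // Writer#getStripes() exposes the stripe information gathered while writing
        List<StripeInformation> stripes = writer.getStripes();
        return Collections.unmodifiableList(Lists.transform(stripes, StripeInformation::getOffset));
      } catch (IOException e) {
        throw new RuntimeIOException(e, "Cannot receive stripe information from writer for %s", file.location());
      }
    }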

spark/v3.2/spark/src/test/java/org/apache/iceberg/spark/data/TestOrcWrite.java

+11
@@ -22,11 +22,17 @@
 
 import java.io.File;
 import java.io.IOException;
+import java.util.stream.Collectors;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
 import org.apache.iceberg.io.FileAppender;
 import org.apache.iceberg.orc.ORC;
 import org.apache.iceberg.types.Types;
+import org.apache.orc.OrcFile;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
 import org.apache.spark.sql.catalyst.InternalRow;
 import org.junit.Assert;
 import org.junit.Rule;
@@ -55,5 +61,10 @@ public void splitOffsets() throws IOException {
     writer.addAll(rows);
     writer.close();
     Assert.assertNotNull("Split offsets not present", writer.splitOffsets());
+    // writer offsets are the same as the ORC reader offsets
+    Reader reader = OrcFile.createReader(new Path(testFile.toURI()), OrcFile.readerOptions(new Configuration()));
+    Assert.assertEquals(reader.getStripes().stream().map(StripeInformation::getOffset).collect(Collectors.toList()),
+        writer.splitOffsets());
+    reader.close();
   }
 }
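
As a usage sketch (not part of this commit), the reader-side half of that cross-check could also be factored into a small helper using the same ORC reader API as the test above; the helper name readerStripeOffsets is hypothetical. It would be called as Assert.assertEquals(readerStripeOffsets(testFile), writer.splitOffsets()).

    // Hypothetical helper (not in this commit): reads the stripe offsets straight
    // from a written ORC file with the same reader calls as the test above.
    // try-with-resources closes the Reader even if a later assertion fails.
    private static List<Long> readerStripeOffsets(File orcFile) throws IOException {
      try (Reader reader = OrcFile.createReader(
          new Path(orcFile.toURI()), OrcFile.readerOptions(new Configuration()))) {
        return reader.getStripes().stream()
            .map(StripeInformation::getOffset)
            .collect(Collectors.toList());
      }
    }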
