 import java.io.File;
 import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.data.GenericRecord;
 import org.apache.iceberg.data.Record;
 import org.apache.iceberg.data.orc.GenericOrcWriter;
 import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.io.OutputFile;
 import org.apache.iceberg.types.Types;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.impl.RecordReaderImpl;
 import org.apache.orc.impl.WriterImpl;
 import org.junit.Assert;
 import org.junit.Rule;
@@ -48,14 +60,16 @@ public void testWriteOption() throws Exception {
     File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());
 
+    OutputFile outFile = Files.localOutput(testFile);
     try (FileAppender<Record> writer =
-        ORC.write(Files.localOutput(testFile))
+        ORC.write(outFile)
             .createWriterFunc(GenericOrcWriter::buildWriter)
             .schema(DATA_SCHEMA)
             .set("write.orc.bloom.filter.columns", "id,name")
             .set("write.orc.bloom.filter.fpp", "0.04")
             .build()) {
 
+      // Validate whether the bloom filter options are set on the underlying ORC writer
       Class clazzOrcFileAppender = Class.forName("org.apache.iceberg.orc.OrcFileAppender");
       Field writerField = clazzOrcFileAppender.getDeclaredField("writer");
       writerField.setAccessible(true);
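For reference, the two .set(...) write properties above are what Iceberg's ORC appender translates into bloom filter settings on the underlying org.apache.orc writer. A minimal standalone sketch of the equivalent direct ORC configuration, not part of this change; the schema string and output path are illustrative and assume DATA_SCHEMA's id/name/price fields:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

public class BloomFilterWriteSketch {
  public static void main(String[] args) throws Exception {
    // Schema mirroring the test's DATA_SCHEMA fields: id (long), name (string), price (double)
    TypeDescription schema =
        TypeDescription.fromString("struct<id:bigint,name:string,price:double>");
    Writer writer =
        OrcFile.createWriter(
            new Path("/tmp/bloom-sketch.orc"), // illustrative output path
            OrcFile.writerOptions(new Configuration())
                .setSchema(schema)
                .bloomFilterColumns("id,name") // counterpart of write.orc.bloom.filter.columns
                .bloomFilterFpp(0.04)); // counterpart of write.orc.bloom.filter.fpp
    writer.close();
  }
}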
@@ -72,7 +86,45 @@ public void testWriteOption() throws Exception {
       Assert.assertTrue(bloomFilterColumns[1]);
       Assert.assertTrue(bloomFilterColumns[2]);
       Assert.assertEquals(0.04, bloomFilterFpp, 1e-15);
+
+      Record recordTemplate = GenericRecord.create(DATA_SCHEMA);
+      Record record1 = recordTemplate.copy("id", 1L, "name", "foo", "price", 1.0);
+      Record record2 = recordTemplate.copy("id", 2L, "name", "bar", "price", 2.0);
+      writer.add(record1);
+      writer.add(record2);
     }
+
+    // Validate whether the bloom filters are written to the ORC file
+    Class clazzFileDump = Class.forName("org.apache.orc.tools.FileDump");
+    Method getFormattedBloomFilters =
+        clazzFileDump.getDeclaredMethod(
+            "getFormattedBloomFilters",
+            int.class,
+            OrcIndex.class,
+            OrcFile.WriterVersion.class,
+            TypeDescription.Category.class,
+            OrcProto.ColumnEncoding.class);
+    getFormattedBloomFilters.setAccessible(true);
+
+    Reader reader =
+        OrcFile.createReader(
+            new Path(outFile.location()), new OrcFile.ReaderOptions(new Configuration()));
+    boolean[] readCols = new boolean[] {false, true, true, false};
+    RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+    OrcIndex indices = rows.readRowIndex(0, null, readCols);
+    StripeInformation stripe = reader.getStripes().get(0);
+    OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+    String bloomFilterString =
+        (String)
+            getFormattedBloomFilters.invoke(
+                null,
+                1,
+                indices,
+                reader.getWriterVersion(),
+                reader.getSchema().findSubtype(1).getCategory(),
+                footer.getColumns(1));
+
+    Assert.assertTrue(bloomFilterString.contains("Bloom filters for column"));
   }
 
   @Test
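The test reaches getFormattedBloomFilters through reflection, presumably because that FileDump helper is not exposed as public API (hence the setAccessible call). A rough alternative sketch that instead checks each stripe footer for bloom filter streams, using the same reader classes the test already imports; not part of this change, and the file path is illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.orc.OrcFile;
import org.apache.orc.OrcProto;
import org.apache.orc.Reader;
import org.apache.orc.StripeInformation;
import org.apache.orc.impl.RecordReaderImpl;

public class BloomFilterCheckSketch {
  public static void main(String[] args) throws Exception {
    Reader reader =
        OrcFile.createReader(
            new Path("/tmp/bloom-sketch.orc"), // illustrative path
            new OrcFile.ReaderOptions(new Configuration()));
    RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
    for (StripeInformation stripe : reader.getStripes()) {
      OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
      // A stripe carries bloom filters when its streams include a
      // BLOOM_FILTER_UTF8 (or, for older writer versions, BLOOM_FILTER) stream
      boolean hasBloomFilter =
          footer.getStreamsList().stream()
              .anyMatch(
                  s ->
                      s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER_UTF8
                          || s.getKind() == OrcProto.Stream.Kind.BLOOM_FILTER);
      System.out.println("Stripe at offset " + stripe.getOffset() + ": " + hasBloomFilter);
    }
    rows.close();
  }
}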