 import java.io.File;
 import java.lang.reflect.Field;
+import java.lang.reflect.Method;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.Path;
 import org.apache.iceberg.Files;
 import org.apache.iceberg.Schema;
+import org.apache.iceberg.data.GenericRecord;
 import org.apache.iceberg.data.Record;
 import org.apache.iceberg.data.orc.GenericOrcWriter;
 import org.apache.iceberg.io.FileAppender;
+import org.apache.iceberg.io.OutputFile;
 import org.apache.iceberg.types.Types;
+import org.apache.orc.OrcFile;
+import org.apache.orc.OrcProto;
+import org.apache.orc.Reader;
+import org.apache.orc.StripeInformation;
+import org.apache.orc.TypeDescription;
+import org.apache.orc.impl.OrcIndex;
+import org.apache.orc.impl.RecordReaderImpl;
 import org.apache.orc.impl.WriterImpl;
 import org.junit.Assert;
 import org.junit.Rule;
@@ -48,8 +60,9 @@ public void testWriteOption() throws Exception {
     File testFile = temp.newFile();
     Assert.assertTrue("Delete should succeed", testFile.delete());
 
+    OutputFile outFile = Files.localOutput(testFile);
     try (FileAppender<Record> writer =
-        ORC.write(Files.localOutput(testFile))
+        ORC.write(outFile)
             .createWriterFunc(GenericOrcWriter::buildWriter)
             .schema(DATA_SCHEMA)
             .set("write.orc.bloom.filter.columns", "id,name")
@@ -69,9 +82,50 @@ public void testWriteOption() throws Exception {
       boolean[] bloomFilterColumns = (boolean[]) bloomFilterColumnsField.get(orcWriter);
       double bloomFilterFpp = (double) bloomFilterFppField.get(orcWriter);
 
+      // Validate that the bloom filter options were applied to the underlying ORC writer
       Assert.assertTrue(bloomFilterColumns[1]);
       Assert.assertTrue(bloomFilterColumns[2]);
       Assert.assertEquals(0.04, bloomFilterFpp, 1e-15);
+
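+      // Write a couple of records so that bloom filter streams are
+      // actually serialized when the writer closes the file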
+      Record recordTemplate = GenericRecord.create(DATA_SCHEMA);
+      Record record1 = recordTemplate.copy("id", 1L, "name", "foo", "price", 1.0);
+      Record record2 = recordTemplate.copy("id", 2L, "name", "bar", "price", 2.0);
+      writer.add(record1);
+      writer.add(record2);
+    }
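+
+    // FileDump#getFormattedBloomFilters is not publicly accessible, hence
+    // the reflective lookup and setAccessible(true) below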
+    Class<?> clazzFileDump = Class.forName("org.apache.orc.tools.FileDump");
+    Method getFormattedBloomFilters =
+        clazzFileDump.getDeclaredMethod(
+            "getFormattedBloomFilters",
+            int.class,
+            OrcIndex.class,
+            OrcFile.WriterVersion.class,
+            TypeDescription.Category.class,
+            OrcProto.ColumnEncoding.class);
+    getFormattedBloomFilters.setAccessible(true);
+
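+    // Re-open the file with the ORC reader and load the row index
+    // (including bloom filter streams) and stripe footer of the first stripe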
+    try (Reader reader =
+        OrcFile.createReader(
+            new Path(outFile.location()), new OrcFile.ReaderOptions(new Configuration()))) {
+      // ORC column 0 is the struct root; columns 1 and 2 are "id" and "name"
+      boolean[] readCols = new boolean[] {false, true, true, false};
+      RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+      OrcIndex indices = rows.readRowIndex(0, null, readCols);
+      StripeInformation stripe = reader.getStripes().get(0);
+      OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+
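+      // Format the bloom filter index for column 1 ("id") the way
+      // orc-tools' FileDump prints it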
+      String bloomFilterString =
+          (String)
+              getFormattedBloomFilters.invoke(
+                  null,
+                  1,
+                  indices,
+                  reader.getWriterVersion(),
+                  reader.getSchema().findSubtype(1).getCategory(),
+                  footer.getColumns(1));
+
+      // Validate whether the bloom filters were written to the ORC file or not
+      Assert.assertTrue(bloomFilterString.contains("Bloom filters for column"));
     }
   }
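For reference, DATA_SCHEMA is defined outside this diff. A minimal sketch that would be consistent with the records written above and with the bloomFilterColumns[1]/[2] assertions (ORC column 0 is the struct root, so the fields in declaration order map to ORC columns 1..3) might look like this; the field ids and types below are assumptions, not the test's actual definition:

// Hypothetical sketch of DATA_SCHEMA (assumed; not shown in this diff)
private static final Schema DATA_SCHEMA =
    new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.required(2, "name", Types.StringType.get()),
        Types.NestedField.required(3, "price", Types.DoubleType.get()));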