Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[#104] Implement regression tests #109

Merged
merged 8 commits into from
Apr 10, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactored regression tests to use Parameterized
  • Loading branch information
piotrszul committed Apr 10, 2019
commit eca9019ac04da69f2d88d990c4af7e26442275bb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package au.csiro.variantspark.test.regression

import java.util.Collection

import scala.collection.JavaConverters.asJavaCollectionConverter

import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.Parameterized
import org.junit.runners.Parameterized.Parameters


/**
 * Regression tests for real-world datasets: each parameter pair names a
 * recorded expected-output file and the importance command line that
 * must reproduce it.
 */
@RunWith(classOf[Parameterized])
class ImportanceDatasetRegressionTest(filenameWithExpected:String, cmdLine:String) extends ImportanceRegressionTest {

  /** Runs the command line and compares its output against the recorded file. */
  @Test
  def testDatasetImportanceOutputMatches() {
    val expectedOutputFile = filenameWithExpected
    runRegression(cmdLine, expectedOutputFile)
  }
}

object ImportanceDatasetRegressionTest {

  /**
   * Test-case parameters for the Parameterized runner: each entry is
   * (expected output file name, importance command line to execute).
   * `${outputFile}` in the command line is substituted at run time.
   */
  @Parameters
  def datasets():Collection[Array[Object]] = {
    val cases = Seq(
      Array[Object](
        "chr22-imp_22_16050408.csv",
        "importance -if data/chr22_1000.vcf -ff data/chr22-labels.csv -fc 22_16050408 -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of ${outputFile}"),
      Array[Object](
        "CNAE-9-imp_category.csv",
        """importance -if data/CNAE-9-wide.csv -it csv -ff data/CNAE-9-labels.csv -fc category -v -ro -rn 100 -rbs 50 -sr 17 -io {"defVariableType":"ORDINAL(10)"} -sp 4 -on 100 -of ${outputFile}"""))
    cases.asJavaCollection
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,29 @@ import org.apache.spark.sql.SparkSession
import org.junit.BeforeClass
import org.apache.commons.lang3.text.StrSubstitutor
import collection.JavaConverters._
import org.junit.runner.RunWith
import org.junit.Ignore


/**
 * Base class for regression tests that compare the importance output for known
 * datasets and parameters against a recorded output assumed to be correct.
 * The expected output can be updated with `dev/test-get-regression-cases.sh`.
 */
abstract class ImportanceRegressionTest {

  import ImportanceRegressionTest._

  /** Path of the recorded expected-output file. */
  def expected(fileName:String):String = new File(ExpectedDir, fileName).getPath
  /** Path of a synthetic dataset file. */
  def synth(fileName:String):String = new File(SynthDataDir, fileName).getPath
  /** Path where the freshly produced output is written. */
  def actual(fileName:String):String = new File(ActualDir, fileName).getPath

  /**
   * Runs the importance command line (with `${outputFile}` substituted with
   * the actual-output path) inside a Spark session built from `sessionBuilder`,
   * then asserts the produced output matches the recorded expected file.
   *
   * NOTE(review): `expextedFileName` is a typo for `expectedFileName`; kept
   * as-is so callers that use named arguments are not broken.
   */
  def runRegression(cmdLine:String, expextedFileName:String, sessionBuilder:SparkSession.Builder = MasterLocal2) {
    withSessionBuilder(sessionBuilder) { _ =>
      val outputFile = actual(expextedFileName)
      val substitutor = new StrSubstitutor(Map("outputFile" -> outputFile).asJava)
      val args = substitutor.replace(cmdLine).split(" ")
      VariantSparkApp.main(args)
      assertSameContent(expected(expextedFileName), outputFile)
    }
  }
}

object ImportanceRegressionTest {

Expand Down Expand Up @@ -44,84 +65,5 @@ object ImportanceRegressionTest {
}
}

class ImportanceRegressionTest {

  import ImportanceRegressionTest._

  /** Path of the recorded expected-output file. */
  def expected(fileName:String):String = new File(ExpectedDir, fileName).getPath
  /** Path of a synthetic dataset file. */
  def synth(fileName:String):String = new File(SynthDataDir, fileName).getPath
  /** Path where the freshly produced output is written. */
  def actual(fileName:String):String = new File(ActualDir, fileName).getPath

  //TODO: Refactor with ParametrizedTest: see: https://www.tutorialspoint.com/junit/junit_parameterized_test.htm
  /**
   * Runs the importance command line (with `${outputFile}` substituted with
   * the actual-output path) and asserts the produced output matches the
   * recorded expected file.
   *
   * NOTE(review): `expextedFileName` is a typo for `expectedFileName`; kept
   * as-is so callers that use named arguments are not broken.
   */
  def runRegression(cmdLine:String, expextedFileName:String, sessionBuilder:SparkSession.Builder = MasterLocal2) {
    // FIX: was `withSessionBuilder(MasterLocal2)`, which silently ignored the
    // `sessionBuilder` parameter and always ran on the default builder.
    withSessionBuilder(sessionBuilder) { _ =>
      val outputFile = actual(expextedFileName)
      val sub = new StrSubstitutor(Map("outputFile" -> outputFile).asJava)
      VariantSparkApp.main(sub.replace(cmdLine).split(" "))
      assertSameContent(expected(expextedFileName), outputFile)
    }
  }

  /**
   * Derives the importance command line from a synthetic case file name,
   * e.g. synth_2000_500_fact_3_0.995-imp_cat2.csv, and runs the regression.
   * Group 4 feeds `ORDINAL(...)` and group 6 is the response column.
   */
  def runSynthRegression(caseFile:String) {
    // FIX: `\.csv` — the extension dot is now matched literally (was an
    // unescaped `.`, which matched any character).
    val caseFileRE = """(synth_([^_]+)_([^_]+)_fact_([^_]+)_([^_]+))-imp_([^_]+)\.csv""".r
    caseFile match {
      case caseFileRE(prefix,_,_,ivo,_,response) => runRegression(s"""importance -if ${synth(prefix)}-wide.csv -ff ${synth(prefix)}-labels.csv -fc ${response} -it csv -io {"defVariableType":"ORDINAL(${ivo})"} -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of $${outputFile}""",
          caseFile)
    }
  }

  @Test
  def testVFCImportance() {
    runRegression("importance -if data/chr22_1000.vcf -ff data/chr22-labels.csv -fc 22_16050408 -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of ${outputFile}",
      "chr22-imp_22_16050408.csv")
  }

  @Test
  def testCNAEImportance() {
    runRegression("""importance -if data/CNAE-9-wide.csv -it csv -ff data/CNAE-9-labels.csv -fc category -v -ro -rn 100 -rbs 50 -sr 17 -io {"defVariableType":"ORDINAL(10)"} -sp 4 -on 100 -of ${outputFile}""",
      "CNAE-9-imp_category.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_995_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_3_0.995-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_995_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_3_0.995-imp_cat10.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_3_0.0-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_3_0.0-imp_cat10.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_995_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_10_0.995-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_995_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_10_0.995-imp_cat10.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_10_0.0-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_10_0.0-imp_cat10.csv")
  }
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package au.csiro.variantspark.test.regression

import java.util.Collection

import scala.collection.JavaConverters.asJavaCollectionConverter

import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.Parameterized
import org.junit.runners.Parameterized.Parameters
import com.google.common.io.PatternFilenameFilter


/**
 * Runs regression tests for synthetic datasets.
 * The datasets are generated with `dev/test-get-synth-data.sh`.
 */
@RunWith(classOf[Parameterized])
class ImportanceSynthRegressionTest(caseFile:String) extends ImportanceRegressionTest {

  import ImportanceSynthRegressionTest.caseFileRE

  /**
   * Destructures the case file name, builds the matching importance command
   * line, and checks its output against the recorded expected file.
   * A non-matching file name fails the test with a MatchError.
   */
  @Test
  def testCaseImportanceOutputMatches() {
    val caseFileRE(prefix, _, _, ivo, _, response) = caseFile
    val cmdLine = s"""importance -if ${synth(prefix)}-wide.csv -ff ${synth(prefix)}-labels.csv -fc ${response} -it csv -io {"defVariableType":"ORDINAL(${ivo})"} -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of $${outputFile}"""
    runRegression(cmdLine, caseFile)
  }
}

object ImportanceSynthRegressionTest {
  import ImportanceRegressionTest._

  /**
   * Matches test case file names such as: synth_2000_500_fact_3_0.995-imp_cat2.csv
   * Group 1 is the dataset prefix, group 4 the ORDINAL(...) level count and
   * group 6 the response column name.
   * FIX: `\.csv` — the extension dot is now matched literally (was an
   * unescaped `.`, which matched any character and so also loosened the
   * file-name filter below).
   */
  val caseFileRE = """(synth_([^_]+)_([^_]+)_fact_([^_]+)_([^_]+))-imp_([^_]+)\.csv""".r

  /** One runner parameter (the file name) per matching expected-output file. */
  @Parameters
  def testCases:Collection[Array[Object]] = ExpectedDir
    .listFiles(new PatternFilenameFilter(caseFileRE.pattern))
    .map(f => Array[Object](f.getName))
    .toList
    .asJavaCollection
}