Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[#104] Implement regression tests #109

Merged
merged 8 commits into from
Apr 10, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Refactored regression tests to use Parameterized
  • Loading branch information
piotrszul committed Apr 10, 2019
commit eca9019ac04da69f2d88d990c4af7e26442275bb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package au.csiro.variantspark.test.regression

import java.util.Collection

import scala.collection.JavaConverters.asJavaCollectionConverter

import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.Parameterized
import org.junit.runners.Parameterized.Parameters


/**
 * Regression tests for real-world datasets: each parameter pair names a
 * recorded expected-output file and the importance command line that
 * must reproduce it.
 */
@RunWith(classOf[Parameterized])
class ImportanceDatasetRegressionTest(filenameWithExpected:String, cmdLine:String) extends ImportanceRegressionTest {

  /** Runs the command line and compares its output against the recorded file. */
  @Test
  def testDatasetImportanceOutputMatches() {
    val expectedOutputFile = filenameWithExpected
    runRegression(cmdLine, expectedOutputFile)
  }
}

object ImportanceDatasetRegressionTest {

  /**
   * Test-case parameters for the Parameterized runner: each entry is
   * (expected output file name, importance command line to execute).
   * `${outputFile}` in the command line is substituted at run time.
   */
  @Parameters
  def datasets():Collection[Array[Object]] = {
    val cases = Seq(
      Array[Object](
        "chr22-imp_22_16050408.csv",
        "importance -if data/chr22_1000.vcf -ff data/chr22-labels.csv -fc 22_16050408 -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of ${outputFile}"),
      Array[Object](
        "CNAE-9-imp_category.csv",
        """importance -if data/CNAE-9-wide.csv -it csv -ff data/CNAE-9-labels.csv -fc category -v -ro -rn 100 -rbs 50 -sr 17 -io {"defVariableType":"ORDINAL(10)"} -sp 4 -on 100 -of ${outputFile}"""))
    cases.asJavaCollection
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,29 @@ import org.apache.spark.sql.SparkSession
import org.junit.BeforeClass
import org.apache.commons.lang3.text.StrSubstitutor
import collection.JavaConverters._
import org.junit.runner.RunWith
import org.junit.Ignore


/**
 * Base class for regression tests that compare the importance output for known
 * datasets and parameters against a recorded output assumed to be correct.
 * The expected output can be updated with `dev/test-get-regression-cases.sh`.
 */
abstract class ImportanceRegressionTest {

  import ImportanceRegressionTest._

  /** Path of the recorded expected-output file. */
  def expected(fileName:String):String = new File(ExpectedDir, fileName).getPath
  /** Path of a synthetic dataset file. */
  def synth(fileName:String):String = new File(SynthDataDir, fileName).getPath
  /** Path where the freshly produced output is written. */
  def actual(fileName:String):String = new File(ActualDir, fileName).getPath

  /**
   * Runs the importance command line (with `${outputFile}` substituted with
   * the actual-output path) inside a Spark session built from `sessionBuilder`,
   * then asserts the produced output matches the recorded expected file.
   *
   * NOTE(review): `expextedFileName` is a typo for `expectedFileName`; kept
   * as-is so callers that use named arguments are not broken.
   */
  def runRegression(cmdLine:String, expextedFileName:String, sessionBuilder:SparkSession.Builder = MasterLocal2) {
    withSessionBuilder(sessionBuilder) { _ =>
      val outputFile = actual(expextedFileName)
      val substitutor = new StrSubstitutor(Map("outputFile" -> outputFile).asJava)
      val args = substitutor.replace(cmdLine).split(" ")
      VariantSparkApp.main(args)
      assertSameContent(expected(expextedFileName), outputFile)
    }
  }
}

object ImportanceRegressionTest {

Expand Down Expand Up @@ -44,84 +65,5 @@ object ImportanceRegressionTest {
}
}

class ImportanceRegressionTest {

  import ImportanceRegressionTest._

  /** Path of the recorded expected-output file. */
  def expected(fileName:String):String = new File(ExpectedDir, fileName).getPath
  /** Path of a synthetic dataset file. */
  def synth(fileName:String):String = new File(SynthDataDir, fileName).getPath
  /** Path where the freshly produced output is written. */
  def actual(fileName:String):String = new File(ActualDir, fileName).getPath

  //TODO: Refactor with ParametrizedTest: see: https://www.tutorialspoint.com/junit/junit_parameterized_test.htm
  /**
   * Runs the importance command line (with `${outputFile}` substituted with
   * the actual-output path) and asserts the produced output matches the
   * recorded expected file.
   *
   * NOTE(review): `expextedFileName` is a typo for `expectedFileName`; kept
   * as-is so callers that use named arguments are not broken.
   */
  def runRegression(cmdLine:String, expextedFileName:String, sessionBuilder:SparkSession.Builder = MasterLocal2) {
    // FIX: was `withSessionBuilder(MasterLocal2)`, which silently ignored the
    // `sessionBuilder` parameter and always ran on the default builder.
    withSessionBuilder(sessionBuilder) { _ =>
      val outputFile = actual(expextedFileName)
      val sub = new StrSubstitutor(Map("outputFile" -> outputFile).asJava)
      VariantSparkApp.main(sub.replace(cmdLine).split(" "))
      assertSameContent(expected(expextedFileName), outputFile)
    }
  }

  /**
   * Derives the importance command line from a synthetic case file name,
   * e.g. synth_2000_500_fact_3_0.995-imp_cat2.csv, and runs the regression.
   * Group 4 feeds `ORDINAL(...)` and group 6 is the response column.
   */
  def runSynthRegression(caseFile:String) {
    // FIX: `\.csv` — the extension dot is now matched literally (was an
    // unescaped `.`, which matched any character).
    val caseFileRE = """(synth_([^_]+)_([^_]+)_fact_([^_]+)_([^_]+))-imp_([^_]+)\.csv""".r
    caseFile match {
      case caseFileRE(prefix,_,_,ivo,_,response) => runRegression(s"""importance -if ${synth(prefix)}-wide.csv -ff ${synth(prefix)}-labels.csv -fc ${response} -it csv -io {"defVariableType":"ORDINAL(${ivo})"} -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of $${outputFile}""",
          caseFile)
    }
  }

  @Test
  def testVFCImportance() {
    runRegression("importance -if data/chr22_1000.vcf -ff data/chr22-labels.csv -fc 22_16050408 -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of ${outputFile}",
      "chr22-imp_22_16050408.csv")
  }

  @Test
  def testCNAEImportance() {
    runRegression("""importance -if data/CNAE-9-wide.csv -it csv -ff data/CNAE-9-labels.csv -fc category -v -ro -rn 100 -rbs 50 -sr 17 -io {"defVariableType":"ORDINAL(10)"} -sp 4 -on 100 -of ${outputFile}""",
      "CNAE-9-imp_category.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_995_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_3_0.995-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_995_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_3_0.995-imp_cat10.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_3_0.0-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_3_0_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_3_0.0-imp_cat10.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_995_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_10_0.995-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_995_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_10_0.995-imp_cat10.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_imp_cat2() {
    runSynthRegression("synth_2000_500_fact_10_0.0-imp_cat2.csv")
  }

  @Test
  def test_synth_2000_500_fact_10_0_imp_cat10() {
    runSynthRegression("synth_2000_500_fact_10_0.0-imp_cat10.csv")
  }
}


Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
package au.csiro.variantspark.test.regression

import java.util.Collection

import scala.collection.JavaConverters.asJavaCollectionConverter

import org.junit.Test
import org.junit.runner.RunWith
import org.junit.runners.Parameterized
import org.junit.runners.Parameterized.Parameters
import com.google.common.io.PatternFilenameFilter


/**
 * Runs regression tests for synthetic datasets.
 * The datasets are generated with `dev/test-get-synth-data.sh`.
 */
@RunWith(classOf[Parameterized])
class ImportanceSynthRegressionTest(caseFile:String) extends ImportanceRegressionTest {

  import ImportanceSynthRegressionTest.caseFileRE

  /**
   * Destructures the case file name, builds the matching importance command
   * line, and checks its output against the recorded expected file.
   * A non-matching file name fails the test with a MatchError.
   */
  @Test
  def testCaseImportanceOutputMatches() {
    val caseFileRE(prefix, _, _, ivo, _, response) = caseFile
    val cmdLine = s"""importance -if ${synth(prefix)}-wide.csv -ff ${synth(prefix)}-labels.csv -fc ${response} -it csv -io {"defVariableType":"ORDINAL(${ivo})"} -v -rn 100 -rbs 50 -ro -sr 17 -on 100 -sp 4 -of $${outputFile}"""
    runRegression(cmdLine, caseFile)
  }
}

object ImportanceSynthRegressionTest {
  import ImportanceRegressionTest._

  /**
   * Matches test case file names such as: synth_2000_500_fact_3_0.995-imp_cat2.csv
   * Group 1 is the dataset prefix, group 4 the ORDINAL(...) level count and
   * group 6 the response column name.
   * FIX: `\.csv` — the extension dot is now matched literally (was an
   * unescaped `.`, which matched any character and so also loosened the
   * file-name filter below).
   */
  val caseFileRE = """(synth_([^_]+)_([^_]+)_fact_([^_]+)_([^_]+))-imp_([^_]+)\.csv""".r

  /** One runner parameter (the file name) per matching expected-output file. */
  @Parameters
  def testCases:Collection[Array[Object]] = ExpectedDir
    .listFiles(new PatternFilenameFilter(caseFileRE.pattern))
    .map(f => Array[Object](f.getName))
    .toList
    .asJavaCollection
}