
code cleaning and fix typos in variable names #3

Merged · 78 commits · Jun 28, 2017

Commits
8cb9da0
Merge remote-tracking branch 'csirobigdata/master'
lynnlangit May 19, 2017
e7a5ca9
updated local copy
lynnlangit May 19, 2017
b24ee23
fix typo in variable name
lynnlangit May 19, 2017
3aee2ee
run code formatting tool
lynnlangit May 19, 2017
a7be1a3
fixed spelling errors
lynnlangit May 19, 2017
92d8e51
update .gitignore for intellij
lynnlangit May 19, 2017
805d272
ignore
lynnlangit May 19, 2017
45039db
ignore
lynnlangit May 19, 2017
b26c38c
ignore
lynnlangit May 19, 2017
38bcb5c
ignore
lynnlangit May 19, 2017
9dba715
ignore
lynnlangit May 19, 2017
c42ef97
ignore
lynnlangit May 19, 2017
fc21e5a
ignore
lynnlangit May 19, 2017
d3823a7
ignore
lynnlangit May 19, 2017
a5413d7
clean code per warnings convert nulls to underscores
lynnlangit May 19, 2017
45ac81d
ignore
lynnlangit May 19, 2017
e42b2a0
i
lynnlangit May 19, 2017
b4eb5d6
i
lynnlangit May 19, 2017
d52aa96
i
lynnlangit May 19, 2017
63b20eb
i
lynnlangit May 19, 2017
b866236
f
lynnlangit May 19, 2017
9206ab8
f
lynnlangit May 19, 2017
6876692
f
lynnlangit May 19, 2017
fee741e
f
lynnlangit May 19, 2017
7c59017
f
lynnlangit May 19, 2017
598788f
f
lynnlangit May 19, 2017
684b58a
f
lynnlangit May 19, 2017
407ba6a
f
lynnlangit May 19, 2017
777ab1c
f
lynnlangit May 19, 2017
bce2e2a
f
lynnlangit May 19, 2017
367fcb5
f
lynnlangit May 19, 2017
874cc66
f
lynnlangit May 19, 2017
56f4293
f
lynnlangit May 19, 2017
38422b6
f
lynnlangit May 19, 2017
698867b
f
lynnlangit May 19, 2017
b3e5250
f
lynnlangit May 19, 2017
f66680d
f
lynnlangit May 19, 2017
715a778
f
lynnlangit May 19, 2017
f987f69
f
lynnlangit May 19, 2017
0c4d031
f
lynnlangit May 19, 2017
99eed1d
f
lynnlangit May 19, 2017
0104064
f
lynnlangit May 19, 2017
535af47
f
lynnlangit May 19, 2017
0ff3bed
f
lynnlangit May 19, 2017
b424db1
f
lynnlangit May 19, 2017
2dc1809
f
lynnlangit May 19, 2017
6171f67
f
lynnlangit May 19, 2017
4d54d6c
f
lynnlangit May 19, 2017
55795ae
f
lynnlangit May 19, 2017
1c9d64f
f
lynnlangit May 19, 2017
342a273
f
lynnlangit May 19, 2017
4ace82c
f
lynnlangit May 19, 2017
0767c32
remove untracked
lynnlangit May 19, 2017
683888e
reformatter
lynnlangit May 19, 2017
04bdbde
remove .iml files
lynnlangit May 19, 2017
54208b8
removed untracked
lynnlangit May 19, 2017
2e5a64f
added method Scaladoc info in widekmeans.scala
lynnlangit May 19, 2017
481d16d
Merge remote-tracking branch 'origin/master'
lynnlangit May 19, 2017
bae0709
removed unused imports
lynnlangit May 19, 2017
543a097
refactor CochranAmeritageTest
lynnlangit May 20, 2017
bc2a3c3
remove comments and refactor method names
lynnlangit May 20, 2017
2afb31f
fixed spelling errors in variable names
lynnlangit May 20, 2017
173a744
removing comments
lynnlangit May 22, 2017
b32afda
remove unused imports project-wide
lynnlangit May 22, 2017
53a7362
fixed spelling of length variable
lynnlangit May 22, 2017
706ff2f
remove unused imports
lynnlangit May 23, 2017
2e8353b
fix more spelling errors
lynnlangit May 23, 2017
9ec0e38
fix spelling and add first new unit test
lynnlangit May 23, 2017
72b0428
remove more unused imports
lynnlangit May 23, 2017
eb11c6f
Worked on readability for the Wide K-Means section
plyte May 26, 2017
73cf2d2
Added more comments
plyte May 26, 2017
93c3a84
temp fix to broken build on k-means
lynnlangit May 28, 2017
bfec349
refactoring ml files for human readability
lynnlangit May 29, 2017
1e30159
fixed variable name
lynnlangit May 31, 2017
c2b0580
Reorganization and Scala Docs added
plyte Jun 1, 2017
4c3a9cf
Continued Additions to Scala docs
plyte Jun 1, 2017
71cddd3
update to fix import issues
lynnlangit Jun 12, 2017
a45fafd
fixed typos from last refactoring in DecisionTrees
lynnlangit Jun 15, 2017
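The commit message "clean code per warnings convert nulls to underscores" is terse; one plausible reading (an assumption, not confirmed by the diff below) is Scala's default-initializer syntax, where a `var` field initialized with `= _` gets its type's default value rather than an explicit `null` literal:

```scala
// Hedged sketch: assumes the "convert nulls to underscores" commit refers to
// Scala 2's default-initializer syntax for var fields. `= _` initializes a
// field to its type's default value (null for reference types, 0 for Int),
// avoiding an explicit `null` literal and the warnings some linters raise.
// The class and field names are illustrative, not the variant-spark API.
class MemberState {
  var oobIndexes: Array[Int] = _ // default-initialized to null, not written as `= null`
  var treeCount: Int = _         // default-initialized to 0
}
```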
Changes from commit a7be1a3fef2fc7bc9eb97a52f30bbe082f630d90 ("fixed spelling errors")
lynnlangit committed May 19, 2017
src/main/scala/au/csiro/variantspark/algo/RandomForest.scala: 64 changes (32 additions, 32 deletions)
@@ -66,18 +66,18 @@ case class VotingAggregator(val nLabels:Int, val nSamples:Int) {


@SerialVersionUID(2l)
case class RandomForestMember[V](val predictor:PredictiveModelWithImportance[V],
-val oobIndexs:Array[Int] = null, val oobPred:Array[Int] = null) {
+val oobIndexes:Array[Int] = null, val oobPred:Array[Int] = null) {
}

@SerialVersionUID(2l)
case class RandomForestModel[V](val members: List[RandomForestMember[V]], val labelCount:Int, val oobErrors:List[Double] = List.empty)(implicit canSplit:CanSplit[V]) {

def size = members.size
def trees = members.map(_.predictor)

def oobError:Double = oobErrors.last

def printout() {
trees.zipWithIndex.foreach {
case (tree, index) =>
@@ -87,40 +87,40 @@ case class RandomForestModel[V](val members: List[RandomForestMember[V]], val la
}

def normalizedVariableImportance(norm:VarImportanceNormalizer = To100ImportanceNormalizer): Map[Long, Double] = norm.normalize(variableImportance)

def variableImportance: Map[Long, Double] = {
// average the importance of each variable over all trees
// if a variable is not used in a tree it's importance for this tree is assumed to be 0
trees.map(_.variableImportanceAsFastMap).foldLeft(new Long2DoubleOpenHashMap())(_.addAll(_))
.asScala.mapValues(_/size)
}

def predict(data: RDD[V])(implicit ct:ClassTag[V]): Array[Int] = predictIndexed(data.zipWithIndex())

def predictIndexed(indexedData: RDD[(V,Long)])(implicit ct:ClassTag[V]): Array[Int] = predictIndexed(indexedData, indexedData.size)

def predictIndexed(indexedData: RDD[(V,Long)], nSamples:Int)(implicit ct:ClassTag[V]): Array[Int] = {
trees.map(_.predictIndexed(indexedData))
.foldLeft(VotingAggregator(labelCount, nSamples))(_.addVote(_)).predictions
}

}

case class RandomForestParams(
oob:Boolean = true,
nTryFraction:Double = Double.NaN,
bootstrap:Boolean = true,
subsample:Double = Double.NaN,
randomizeEquality:Boolean = false,
seed:Long = defRng.nextLong
) {
def resolveDefaults(nSamples:Int, nVariables:Int):RandomForestParams = {
RandomForestParams(
oob = oob,
nTryFraction = if (!nTryFraction.isNaN) nTryFraction else Math.sqrt(nVariables.toDouble)/nVariables,
bootstrap = bootstrap,
subsample = if (!subsample.isNaN) subsample else if (bootstrap) 1.0 else 0.666,
randomizeEquality = randomizeEquality,
seed = seed
)
}
@@ -135,12 +135,12 @@ trait RandomForestCallback {
// TODO (Design): Avoid using type cast change design
trait BatchTreeModel[V] {
def batchTrain(indexedData: RDD[(V, Long)], dataType:VariableType, labels: Array[Int], nTryFraction: Double, samples:Seq[Sample]): Seq[PredictiveModelWithImportance[V]]
def batchPredict(indexedData: RDD[(V, Long)], models: Seq[PredictiveModelWithImportance[V]], indexes:Seq[Array[Int]]): Seq[Array[Int]]
}

object RandomForest {
type ModelBuilderFactory[V] = (DecisionTreeParams, CanSplit[V]) => BatchTreeModel[V]

def wideDecisionTreeBuilder[V](params:DecisionTreeParams, canSplit:CanSplit[V]): BatchTreeModel[V] = {
val decisionTree = new DecisionTree[V](params)(canSplit)
new BatchTreeModel[V]() {
@@ -149,25 +149,25 @@ object RandomForest {
models.asInstanceOf[Seq[DecisionTreeModel[V]]], indexes)(canSplit)
}
}

val defaultBatchSize = 10
}

class RandomForest[V](params:RandomForestParams=RandomForestParams()
,modelBuilderFactory:RandomForest.ModelBuilderFactory[V] = RandomForest.wideDecisionTreeBuilder[V] _
)(implicit canSplit:CanSplit[V]) extends Logging {

// TODO (Design): This seems like an easiest solution but it make this class
// to keep random state ... perhaps this could be externalised to the implicit random

implicit lazy val rng = new XorShift1024StarRandomGenerator(params.seed)

def train(indexedData: RDD[(V, Long)], dataType: VariableType, labels: Array[Int], nTrees: Int)(implicit callback:RandomForestCallback = null): RandomForestModel[V] =
batchTrain(indexedData, dataType, labels, nTrees, RandomForest.defaultBatchSize)

/**
* TODO (Nice): Make a parameter rather then an extra method
* TODO (Func): Add OOB calculation
@@ -177,19 +177,19 @@ class RandomForest[V](params:RandomForestParams=RandomForestParams()
require(nTrees > 0)
val nSamples = labels.length
val nVariables = indexedData.count().toInt
val nLabels = labels.max + 1
logDebug(s"Data: nSamples:${nSamples}, nVariables: ${nVariables}, nLabels:${nLabels}")
val actualParams = params.resolveDefaults(nSamples, nVariables)
Option(callback).foreach(_.onParamsResolved(actualParams))
logDebug(s"Parameters: ${actualParams}")
-logDebug(s"Batch Traning: ${nTrees} with batch size: ${nBatchSize}")
+logDebug(s"Batch Training: ${nTrees} with batch size: ${nBatchSize}")
val oobAggregator = if (actualParams.oob) Option(new VotingAggregator(nLabels,nSamples)) else None

val builder = modelBuilderFactory(DecisionTreeParams(seed = rng.nextLong, randomizeEquality = actualParams.randomizeEquality), canSplit)
val allSamples = Stream.fill(nTrees)(Sample.fraction(nSamples, actualParams.subsample, actualParams.bootstrap))
val (allTrees, errors) = allSamples
.sliding(nBatchSize, nBatchSize)
.flatMap { samplesStream =>
time {
val samples = samplesStream.toList
val predictors = builder.batchTrain(indexedData, dataType, labels, actualParams.nTryFraction, samples)
@@ -200,7 +200,7 @@ class RandomForest[V](params:RandomForestParams=RandomForestParams()
} else predictors.map(RandomForestMember(_))
val oobError = oobAggregator.map { agg =>
members.map { m =>
-agg.addVote(m.oobPred, m.oobIndexs)
+agg.addVote(m.oobPred, m.oobIndexes)
Metrics.classificatoinError(labels, agg.predictions)
}
}.getOrElse(List.fill(predictors.size)(Double.NaN))
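The predictIndexed methods in the diff above fold each tree's per-sample votes into a VotingAggregator and then read off the winning label per sample. A minimal, self-contained sketch of that majority-vote step (the object and parameter names are illustrative, not the variant-spark API):

```scala
// Hedged sketch of random-forest majority voting: each tree contributes one
// predicted label per sample; the ensemble prediction is the label with the
// most votes. Mirrors the VotingAggregator fold in predictIndexed, but with
// plain arrays instead of the library's aggregator class.
object MajorityVote {
  def predictions(perTreeVotes: Seq[Array[Int]], nLabels: Int, nSamples: Int): Array[Int] = {
    // counts(i)(label) = number of trees that voted `label` for sample i
    val counts = Array.ofDim[Int](nSamples, nLabels)
    for (votes <- perTreeVotes; i <- votes.indices) counts(i)(votes(i)) += 1
    // Winning label per sample; ties resolve to the lowest label index.
    counts.map(c => c.indexOf(c.max))
  }
}
```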
@@ -94,7 +94,7 @@ class AnalyzeRFCmd extends ArgsApp with FeatureSourceArgs with Echoable with Log
val samples = featureSource.sampleNames
LoanUtils.withCloseable(CSVWriter.open(outputOobPerTree)) { writer =>
writer.writeRow(samples)
-rfModel.members.map(m => m.oobIndexs.zip(m.oobPred).toMap)
+rfModel.members.map(m => m.oobIndexes.zip(m.oobPred).toMap)
.map(m => Range(0,samples.size).map(i => m.getOrElse(i, null))).foreach(writer.writeRow)
}
}
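The comment inside variableImportance in the RandomForest.scala diff states the aggregation rule: average each variable's importance over all trees, with a tree that does not use the variable contributing 0. A small sketch of that rule using plain Scala collections (the real code folds into a Long2DoubleOpenHashMap; the object and method names here are illustrative):

```scala
// Hedged sketch of per-variable importance averaging across a forest.
// A variable absent from a tree's map is treated as importance 0 for that
// tree, so the divisor is always the total number of trees.
object ImportanceAveraging {
  def averageImportance(perTree: Seq[Map[Long, Double]]): Map[Long, Double] = {
    val nTrees = perTree.size
    // Sum each variable's importance over all trees (missing entries add 0).
    val summed = perTree.foldLeft(Map.empty[Long, Double]) { (acc, treeMap) =>
      treeMap.foldLeft(acc) { case (a, (variable, imp)) =>
        a.updated(variable, a.getOrElse(variable, 0.0) + imp)
      }
    }
    // Divide by the total tree count, not the count of trees using the variable.
    summed.map { case (variable, total) => variable -> total / nTrees }
  }
}
```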