This repository was archived by the owner on May 12, 2021. It is now read-only.

Commit 4addcd5

Initial commit for BIDMach, VW & SPPMI [rolled up to reduce repo size]; excluding libs
1 parent 36fed5b commit 4addcd5

15 files changed: 16,664 additions and 112 deletions

build.sbt

Lines changed: 14 additions & 3 deletions

@@ -3,9 +3,20 @@ name := "org.template.textclassification"
 
 organization := "io.prediction"
 
+scalaVersion := "2.10.5"
+
 libraryDependencies ++= Seq(
-  "io.prediction" %% "core" % pioVersion.value % "provided",
-  "org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
-  "org.apache.spark" %% "spark-mllib" % "1.3.1" % "provided",
+  "io.prediction" % "core_2.10" % pioVersion.value % "provided",
+  "org.apache.spark" %% "spark-core" % "1.4.1" % "provided",
+  "org.apache.spark" %% "spark-mllib" % "1.4.1" % "provided",
+  "com.github.fommil.netlib" % "all" % "1.1.2" pomOnly(),
+  "com.github.johnlangford" % "vw-jni" % "8.0.0",
   "org.xerial.snappy" % "snappy-java" % "1.1.1.7"
 )
+
+mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
+  {
+    case y if y.startsWith("doc") => MergeStrategy.discard
+    case x => old(x)
+  }
+}
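The new mergeStrategy override presupposes the sbt-assembly plugin, which PredictionIO templates conventionally load from project/assembly.sbt; that file is not part of this diff. A minimal sketch of it, assuming a 0.11.x plugin release (the version matching the old-style <<= syntax above is an assumption):

    // project/assembly.sbt (sketch; the exact plugin version is an assumption)
    addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.11.2")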

data/Twitter140sample.txt

Lines changed: 16000 additions & 0 deletions
Large diffs are not rendered by default.

data/import_eventserver.py

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+"""
+Import sample data for classification engine
+"""
+
+import predictionio
+import argparse
+
+def import_events(client, file):
+    f = open(file, 'r')
+    count = 0
+    print "Importing data..."
+    for line in f:
+        data = line.rstrip('\r\n').split(",")
+        plan = data[0]
+        # Not strictly CSV: after the first comma, no longer delimiting
+        text = ",".join(data[1:])
+        client.create_event(
+            event="$set",
+            entity_type="user",
+            entity_id=str(count),  # use the count num as user ID
+            properties={
+                "text": text,
+                "category": plan,
+                "label": int(plan)
+            }
+        )
+        count += 1
+    f.close()
+    print "%s events are imported." % count
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description="Import sample data for classification engine")
+    parser.add_argument('--access_key', default='invald_access_key')
+    parser.add_argument('--url', default="http://localhost:7070")
+    parser.add_argument('--file', default="./data/Twitter140sample.txt")
+
+    args = parser.parse_args()
+    print args
+
+    client = predictionio.EventClient(
+        access_key=args.access_key,
+        url=args.url,
+        threads=5,
+        qsize=500)
+    import_events(client, args.file)
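With an Event Server running on the default port, the importer is invoked along the lines of python data/import_eventserver.py --access_key <your_access_key>. The --access_key default above is a deliberate placeholder, so the key of the registered PredictionIO app must be passed explicitly; --url and --file only need overriding for a non-default Event Server or a different data file.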

engine.json

Lines changed: 12 additions & 5 deletions

@@ -4,20 +4,27 @@
   "engineFactory": "org.template.textclassification.TextClassificationEngine",
   "datasource": {
     "params": {
-      "appName": "MyTextApp"
+      "appName": "smallerData"
     }
   },
   "preparator": {
     "params": {
-      "nGram": 2,
-      "numFeatures": 15000
+      "nGram": 1,
+      "numFeatures": 500,
+      "SPPMI": false
     }
   },
   "algorithms": [
     {
-      "name": "nb",
+      "name": "bid-lr",
       "params": {
-        "lambda": 0.25
+        "maxIter": 1,
+        "regParam": 0.00000005,
+        "stepSize": 5.0,
+        "bitPrecision": 22,
+        "modelName": "model.vw",
+        "namespace": "n",
+        "ngram": 1
       }
     }
   ]
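PredictionIO deserializes each algorithm's params block into that algorithm's Params case class, and "SPPMI": false presumably toggles the shifted positive PMI weighting named in the commit message. Note that the "bid-lr" block carries several VW-flavoured keys even though BIDMachLRAlgorithmParams (defined below) declares only regParam, so the remaining keys appear to be ignored by the BIDMach path. A hypothetical case class that would bind every key in the block; the field names come from the JSON above, while the types and the VW glosses in the comments are assumptions:

    import io.prediction.controller.Params

    // Sketch only: a Params case class mirroring each key under "bid-lr" above.
    // The actual BIDMachLRAlgorithmParams in this commit declares just regParam.
    case class VWStyleParams(
      maxIter: Int,          // SGD passes
      regParam: Double,      // regularization weight
      stepSize: Double,      // SGD learning rate
      bitPrecision: Int,     // feature-hash bits (2^22 buckets here)
      modelName: String,     // on-disk VW model file
      namespace: String,     // VW feature namespace
      ngram: Int             // n-gram expansion
    ) extends Params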

getnativepath.java

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+public class getnativepath {
+    public static void main(String [] args)
+    {
+        String v = System.getProperty("java.library.path");
+        System.out.print(v);
+    }
+}
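This throwaway utility presumably exists to debug native-library loading for the new JNI dependencies (vw-jni, BIDMach's CUDA/MKL backends): it prints java.library.path, the list of directories the JVM searches for native libraries, so one can check where the JVM will look for them.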
BIDMachLRAlgorithm.scala (the file path was not captured in this render; inferred from the package and class names below)

Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
+package org.template.textclassification
+
+import java.io.{InputStreamReader, BufferedReader, ByteArrayInputStream, Serializable}
+
+import BIDMat.{CMat,CSMat,DMat,Dict,FMat,FND,GMat,GDMat,GIMat,GLMat,GSMat,GSDMat,HMat,IDict,Image,IMat,LMat,Mat,SMat,SBMat,SDMat}
+import BIDMat.MatFunctions._
+import BIDMat.SciFunctions._
+import BIDMat.Solvers._
+import BIDMat.Plotting._
+import BIDMach.Learner
+import BIDMach.models.{FM,GLM,KMeans,KMeansw,LDA,LDAgibbs,Model,NMF,SFA,RandomForest}
+import BIDMach.networks.{DNN}
+import BIDMach.datasources.{DataSource,MatDS,FilesDS,SFilesDS}
+import BIDMach.mixins.{CosineSim,Perplexity,Top,L1Regularizer,L2Regularizer}
+import BIDMach.updaters.{ADAGrad,Batch,BatchNorm,IncMult,IncNorm,Telescoping}
+import BIDMach.causal.{IPTW}
+
+import io.prediction.controller.{P2LAlgorithm, Params}
+import org.apache.spark.SparkContext
+import org.apache.spark.ml.classification.LogisticRegression
+import org.apache.spark.mllib.linalg.{DenseVector, SparseVector}
+import org.apache.spark.rdd.RDD
+import org.apache.spark.sql.DataFrame
+
+case class BIDMachLRAlgorithmParams (
+  regParam : Double
+) extends Params
+
+
+class BIDMachLRAlgorithm(
+  val sap: BIDMachLRAlgorithmParams
+) extends P2LAlgorithm[PreparedData, NativeLRModel, Query, PredictedResult] {
+  // Train your model.
+  def train(sc: SparkContext, pd: PreparedData): NativeLRModel = {
+    new BIDMachLRModel(sc, pd, sap.regParam)
+  }
+
+  // Prediction method for trained model.
+  def predict(model: NativeLRModel, query: Query): PredictedResult = {
+    model.predict(query.text)
+  }
+
+}
+
+class BIDMachLRModel (
+  sc : SparkContext,
+  pd : PreparedData,
+  regParam : Double
+) extends Serializable with NativeLRModel {
+
+  private val labels: Seq[Double] = pd.categoryMap.keys.toSeq
+
+  val data = prepareDataFrame(sc, pd, labels)
+
+  private val lrModels = fitLRModels
+
+  def fitLRModels: Seq[(Double, LREstimate)] = {
+
+    Mat.checkMKL
+    Mat.checkCUDA
+    if (Mat.hasCUDA > 0) GPUmem
+
+    // 3. Create a logistic regression model for each class.
+    val lrModels: Seq[(Double, LREstimate)] = labels.map(
+      label => {
+        val lab = label.toInt.toString
+
+        val (categories, features) = getFMatsFromData(lab, data)
+
+        val mm: Learner = trainGLM(features, FMat(categories))
+
+        test(categories, features, mm)
+        val modelmat = FMat(mm.modelmat)
+        val weightSize = size(modelmat)._2 - 1
+
+        val weights = modelmat(1, 0 to weightSize)
+
+        val weightArray = (for (i <- 0 to weightSize - 1) yield weights(0, i).toDouble).toArray
+
+        // Return (label, feature coefficients, intercept term).
+        (label, LREstimate(weightArray, weights(0, weightSize)))
+      }
+    )
+    lrModels
+  }
+
+  def predict(text : String): PredictedResult = {
+    predict(text, pd, lrModels)
+  }
+
+  def trainGLM(traindata: SMat, traincats: FMat): Learner = {
+    //min(traindata, 1, traindata) // the first "traindata" argument is the input, the other is output
+
+    val (mm, mopts) = GLM.learner(traindata, traincats, GLM.logistic)
+    mopts.what
+
+    mopts.lrate = 0.1
+    mopts.reg1weight = regParam
+    mopts.batchSize = 1000
+    mopts.npasses = 250
+    mopts.autoReset = false
+    mopts.addConstFeat = true
+    mm.train
+    mm
+  }
+
+  def getFMatsFromData(lab: String, data: DataFrame): (FMat, SMat) = {
+    val features = data.select(lab, "features")
+
+    val sparseVectorsWithRowIndices = (for (r <- features) yield (r.getAs[SparseVector](1), r.getAs[Double](0))).zipWithIndex
+
+    val triples = for {
+      ((vector, innerLabel), rowIndex) <- sparseVectorsWithRowIndices
+      (index, value) <- vector.indices zip vector.values
+    } yield ((rowIndex.toInt, index, value), innerLabel)
+
+    val catTriples = for {
+      ((vector, innerLabel), rowIndex) <- sparseVectorsWithRowIndices
+    } yield (rowIndex.toInt, innerLabel.toInt, 1.0)
+
+    val cats = catTriples
+    val feats = triples.map(x => x._1)
+
+    val numRows = cats.count().toInt
+
+    val catsMat = loadFMatTxt(cats, numRows)
+
+    val featsMat = loadFMatTxt(feats, numRows)
+
+    println(featsMat)
+
+    (full(catsMat), featsMat)
+  }
+
+  // See https://github.com/BIDData/BIDMat/blob/master/src/main/scala/BIDMat/HMat.scala , method loadDMatTxt
+  def loadFMatTxt(cats: RDD[(Int, Int, Double)], nrows: Int): SMat = {
+
+    val rows = cats.map(x => x._1).collect()
+    val cols = cats.map(x => x._2).collect()
+    val vals = cats.map(x => x._3).collect()
+
+
+    println("LOADING")
+
+    sparse(icol(cols.toList), icol(rows.toList), col(vals.toList))
+  }
+
+  def test(categories: DMat, features: SMat, mm: Learner): Unit = {
+    val testdata = features
+    val testcats = categories
+
+    //min(testdata, 1, testdata)
+
+    val predcats = zeros(testcats.nrows, testcats.ncols)
+
+
+
+    val (nn, nopts) = GLM.predictor(mm.model, testdata, predcats)
+
+
+
+    nopts.addConstFeat = true
+    nn.predict
+
+
+    computeAccuracy(FMat(testcats), predcats)
+  }
+
+  def computeAccuracy(testcats: FMat, predcats: FMat): Unit = {
+    //println(testcats)
+    //println(predcats)
+
+    val lacc = (predcats ∙→ testcats + (1 - predcats) ∙→ (1 - testcats)) / predcats.ncols
+    lacc.t
+    println(mean(lacc))
+  }
+
+}
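fitLRModels is a one-vs-rest scheme: one binary GLM.logistic learner is trained per class label, addConstFeat = true appends a constant feature whose weight serves as the intercept, and each trained model is reduced to an LREstimate of coefficients plus intercept. A minimal scoring sketch under those assumptions; the LREstimate field names and the sigmoid/argmax decision rule are assumptions, and only the (label, LREstimate) pairing comes from the code above:

    // Sketch: one-vs-rest scoring over the per-label estimates from fitLRModels.
    object LRScoringSketch {
      // Assumed shape of LREstimate, matching LREstimate(weightArray, intercept) above.
      case class LREstimate(coefficients: Array[Double], intercept: Double)

      // Sigmoid of the affine margin w.x + b for one binary model.
      def score(x: Array[Double], est: LREstimate): Double = {
        val margin = est.coefficients.zip(x).map { case (w, xi) => w * xi }.sum + est.intercept
        1.0 / (1.0 + math.exp(-margin))
      }

      // Predict the label whose binary model responds most strongly.
      def predict(x: Array[Double], models: Seq[(Double, LREstimate)]): Double =
        models.maxBy { case (_, est) => score(x, est) }._1
    }

computeAccuracy takes the same binary view: predcats ∙→ testcats row-dots the agreeing positives and (1 - predcats) ∙→ (1 - testcats) the agreeing negatives, so dividing by predcats.ncols gives a per-row accuracy whose mean is printed.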

src/main/scala/org/template/textclassification/DataSource.scala

Lines changed: 17 additions & 17 deletions

Most hunks in this file are whitespace-only: this render dropped leading whitespace, so several -/+ pairs below show identical text and differ only in indentation. The substantive changes are the entityType/eventNames filters and the label test in the event-reading code.

@@ -17,9 +17,9 @@ import org.apache.spark.rdd.RDD
 // cross validation.
 
 case class DataSourceParams(
-  appName: String,
-  evalK: Option[Int]
-) extends Params
+  appName: String,
+  evalK: Option[Int]
+) extends Params
 
 
 
@@ -28,8 +28,8 @@ case class DataSourceParams(
 // readEval method.
 
 class DataSource (
-  val dsp : DataSourceParams
-) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, ActualResult] {
+  val dsp : DataSourceParams
+) extends PDataSource[TrainingData, EmptyEvaluationInfo, Query, ActualResult] {
 
   @transient lazy val logger = Logger[this.type]
 
@@ -39,15 +39,15 @@ class DataSource (
     //Get RDD of Events.
     PEventStore.find(
       appName = dsp.appName,
-      entityType = Some("content"), // specify data entity type
-      eventNames = Some(List("e-mail")) // specify data event name
+      entityType = Some("user"), // specify data entity type
+      eventNames = Some(List("$set")) // specify data event name
 
       // Convert collected RDD of events to an RDD of Observation
       // objects.
     )(sc).map(e => {
       val label : String = e.properties.get[String]("label")
       Observation(
-        if (label == "spam") 1.0 else 0.0,
+        if (label == "1") 1.0 else 0.0,
         e.properties.get[String]("text"),
         label
       )
@@ -62,7 +62,7 @@ class DataSource (
       entityType = Some("resource"),
      eventNames = Some(List("stopwords"))
 
-      //Convert collected RDD of strings to a string set.
+      //Convert collected RDD of strings to a string set.
    )(sc)
      .map(e => e.properties.get[String]("word"))
      .collect
@@ -92,7 +92,7 @@ class DataSource (
      val train = new TrainingData(
        data.filter(_._2 % dsp.evalK.get != k).map(_._1),
        readStopWords
-      ((sc)))
+      ((sc)))
 
      // Prepare test data for fold.
      val test = data.filter(_._2 % dsp.evalK.get == k)
@@ -108,17 +108,17 @@ class DataSource (
 // 3. Observation class serving as a wrapper for both our
 // data's class label and document string.
 case class Observation(
-  label : Double,
-  text : String,
-  category : String
-) extends Serializable
+  label : Double,
+  text : String,
+  category : String
+) extends Serializable
 
 // 4. TrainingData class serving as a wrapper for all data
 // read in from the Event Server.
 class TrainingData(
-  val data : RDD[Observation],
-  val stopWords : Set[String]
-) extends Serializable with SanityCheck {
+  val data : RDD[Observation],
+  val stopWords : Set[String]
+) extends Serializable with SanityCheck {
 
 // Sanity check to make sure your data is being fed in correctly.
src/main/scala/org/template/textclassification/Engine.scala

Lines changed: 3 additions & 1 deletion

@@ -40,8 +40,10 @@ object TextClassificationEngine extends EngineFactory {
       classOf[DataSource],
       classOf[Preparator],
       Map(
+        "VWlogisticSGD" -> classOf[VowpalLogisticRegressionWithSGDAlgorithm],
         "nb" -> classOf[NBAlgorithm],
-        "lr" -> classOf[LRAlgorithm]
+        "lr" -> classOf[LRAlgorithm],
+        "bid-lr" -> classOf[BIDMachLRAlgorithm]
       ), classOf[Serving]
     )
   }
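The keys of this Map are what the "name" field in engine.json resolves against, so the "bid-lr" entry is what routes the configuration shown earlier to BIDMachLRAlgorithm. The VowpalLogisticRegressionWithSGDAlgorithm registered under "VWlogisticSGD" presumably lives in one of the changed files not rendered in this view.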
