Skip to content
This repository was archived by the owner on May 12, 2021. It is now read-only.

Commit 55fd981

Browse files
committed
Merge pull request #3 from EmergentOrder/master
Add support for BIDMach & VW LR + SPPMI feature vectorization
2 parents 36fed5b + d9d3e49 commit 55fd981

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

57 files changed

+17560
-111
lines changed

build.sbt

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,20 @@ name := "org.template.textclassification"
33

44
organization := "io.prediction"
55

6+
scalaVersion := "2.10.5"
7+
68
libraryDependencies ++= Seq(
7-
"io.prediction" %% "core" % pioVersion.value % "provided",
8-
"org.apache.spark" %% "spark-core" % "1.3.1" % "provided",
9-
"org.apache.spark" %% "spark-mllib" % "1.3.1" % "provided",
9+
"io.prediction" % "core_2.10" % pioVersion.value % "provided",
10+
"org.apache.spark" %% "spark-core" % "1.4.1" % "provided",
11+
"org.apache.spark" %% "spark-mllib" % "1.4.1" % "provided",
12+
"com.github.fommil.netlib" % "all" % "1.1.2" pomOnly(),
13+
"com.github.johnlangford" % "vw-jni" % "8.0.0",
1014
"org.xerial.snappy" % "snappy-java" % "1.1.1.7"
1115
)
16+
17+
mergeStrategy in assembly <<= (mergeStrategy in assembly) { (old) =>
18+
{
19+
case y if y.startsWith("doc") => MergeStrategy.discard
20+
case x => old(x)
21+
}
22+
}

data/Twitter140sample.txt

Lines changed: 16000 additions & 0 deletions
Large diffs are not rendered by default.

data/import_eventserver.py

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
"""
2+
Import sample data for classification engine
3+
"""
4+
5+
import predictionio
6+
import argparse
7+
8+
def import_events(client, file):
    """Import labelled text samples from *file* through *client*.

    Each data line is expected to look like ``<label>,<text>``. Only the
    first comma is a delimiter; any later commas belong to the text. One
    ``$set`` event is created per data line on entity type ``user``, using
    the running count of imported lines as the entity ID. Blank lines are
    skipped.

    Args:
        client: Object exposing ``create_event(event=..., entity_type=...,
            entity_id=..., properties=...)``.
        file: Path of the sample file to read.

    Raises:
        ValueError: If the label before the first comma is not an integer.
    """
    count = 0
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print("Importing data...")
    # The context manager closes the file even if create_event() or int()
    # raises part-way through the import.
    with open(file, 'r') as f:
        for line in f:
            line = line.rstrip('\r\n')
            if not line.strip():
                # A blank line (e.g. a trailing newline) carries no sample.
                continue
            # Not strictly CSV: only the first comma separates label and text.
            plan, _, text = line.partition(",")
            client.create_event(
                event="$set",
                entity_type="user",
                entity_id=str(count),  # use the count num as user ID
                properties={
                    "text": text,
                    "category": plan,
                    "label": int(plan),
                },
            )
            count += 1
    print("%s events are imported." % count)
30+
31+
if __name__ == '__main__':
    # Command-line entry point: parse the options, build an EventClient from
    # them, and import every line of the chosen file via import_events().
    parser = argparse.ArgumentParser(
        description="Import sample data for classification engine")
    # NOTE(review): the default access key looks like a placeholder (its
    # spelling is kept as-is) -- presumably a real key must be supplied.
    parser.add_argument('--access_key', default='invald_access_key',
                        help="access key passed to the EventClient")
    parser.add_argument('--url', default="http://localhost:7070",
                        help="URL passed to the EventClient")
    parser.add_argument('--file', default="./data/Twitter140sample.txt",
                        help="path of the sample file to import")

    args = parser.parse_args()
    # Single-argument print(...) behaves the same under Python 2 and 3.
    print(args)

    client = predictionio.EventClient(
        access_key=args.access_key,
        url=args.url,
        threads=5,
        qsize=500)
    import_events(client, args.file)

engine.json

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,22 @@
99
},
1010
"preparator": {
1111
"params": {
12-
"nGram": 2,
13-
"numFeatures": 15000
12+
"nGram": 1,
13+
"numFeatures": 500,
14+
"SPPMI": false
1415
}
1516
},
1617
"algorithms": [
1718
{
18-
"name": "nb",
19+
"name": "lr",
1920
"params": {
20-
"lambda": 0.25
21+
"maxIter": 1,
22+
"regParam": 0.00000005,
23+
"stepSize": 5.0,
24+
"bitPrecision": 22,
25+
"modelName": "model.vw",
26+
"namespace": "n",
27+
"ngram": 1
2128
}
2229
}
2330
]

getnativepath.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
/**
 * Small command-line utility that prints the value of the
 * {@code java.library.path} system property to standard output,
 * without a trailing newline.
 */
public class getnativepath {
    /**
     * Prints the {@code java.library.path} system property.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        System.out.print(System.getProperty("java.library.path"));
    }
}

0 commit comments

Comments
 (0)