
Commit 2b46c4b

Merge remote-tracking branch 'origin/master' into countDistinctPartial
2 parents 8ff6402 + 217b5e9


42 files changed (+818, -79 lines)

.travis.yml

Lines changed: 0 additions & 32 deletions
This file was deleted.

core/src/main/scala/org/apache/spark/scheduler/EventLoggingListener.scala

Lines changed: 2 additions & 1 deletion
@@ -54,7 +54,8 @@ private[spark] class EventLoggingListener(
   private val testing = sparkConf.getBoolean("spark.eventLog.testing", false)
   private val outputBufferSize = sparkConf.getInt("spark.eventLog.buffer.kb", 100) * 1024
   private val logBaseDir = sparkConf.get("spark.eventLog.dir", DEFAULT_LOG_DIR).stripSuffix("/")
-  private val name = appName.replaceAll("[ :/]", "-").toLowerCase + "-" + System.currentTimeMillis
+  private val name = appName.replaceAll("[ :/]", "-").replaceAll("[${}'\"]", "_")
+    .toLowerCase + "-" + System.currentTimeMillis
   val logDir = Utils.resolveURI(logBaseDir) + "/" + name.stripSuffix("/")

   protected val logger = new FileLogger(logDir, sparkConf, hadoopConf, outputBufferSize,
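The added replaceAll guards the event-log directory name against shell- and quoting-sensitive characters in the application name. A minimal Python sketch of the same two-pass sanitization (the helper name and sample app name are illustrative, not the actual EventLoggingListener code):

import re
import time

def sanitized_log_name(app_name):
    # Path/URI separators become "-", then $, {, }, ', " become "_",
    # then lowercase plus a millisecond timestamp, as in the Scala change.
    name = re.sub(r"[ :/]", "-", app_name)
    name = re.sub(r"[${}'\"]", "_", name)
    return name.lower() + "-" + str(int(time.time() * 1000))

print sanitized_log_name("My App: ${USER}")  # e.g. my-app--__user_-1407362096000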

examples/src/main/python/als.py

Lines changed: 2 additions & 0 deletions
@@ -97,3 +97,5 @@ def update(i, vec, mat, ratings):
         error = rmse(R, ms, us)
         print "Iteration %d:" % i
         print "\nRMSE: %5.4f\n" % error
+
+    sc.stop()
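This same two-line addition (a blank line plus sc.stop()) repeats across the remaining Python examples below. The point of the pattern, sketched minimally here with an illustrative appName, is that stopping the SparkContext explicitly shuts the driver down cleanly instead of relying on interpreter teardown:

from pyspark import SparkContext

# Minimal lifecycle sketch: create the context, do the work,
# then stop it explicitly before the script exits.
sc = SparkContext(appName="LifecycleSketch")
print sc.parallelize(range(100)).sum()
sc.stop()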

examples/src/main/python/cassandra_inputformat.py

Lines changed: 2 additions & 0 deletions
@@ -77,3 +77,5 @@
     output = cass_rdd.collect()
     for (k, v) in output:
         print (k, v)
+
+    sc.stop()

examples/src/main/python/cassandra_outputformat.py

Lines changed: 2 additions & 0 deletions
@@ -81,3 +81,5 @@
         conf=conf,
         keyConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLKeyConverter",
         valueConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLValueConverter")
+
+    sc.stop()

examples/src/main/python/hbase_inputformat.py

Lines changed: 2 additions & 0 deletions
@@ -71,3 +71,5 @@
     output = hbase_rdd.collect()
     for (k, v) in output:
         print (k, v)
+
+    sc.stop()

examples/src/main/python/hbase_outputformat.py

Lines changed: 2 additions & 0 deletions
@@ -63,3 +63,5 @@
         conf=conf,
         keyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
         valueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter")
+
+    sc.stop()

examples/src/main/python/kmeans.py

Lines changed: 2 additions & 0 deletions
@@ -77,3 +77,5 @@ def closestPoint(p, centers):
         kPoints[x] = y

     print "Final centers: " + str(kPoints)
+
+    sc.stop()

examples/src/main/python/logistic_regression.py

Lines changed: 2 additions & 0 deletions
@@ -80,3 +80,5 @@ def add(x, y):
         w -= points.map(lambda m: gradient(m, w)).reduce(add)

     print "Final w: " + str(w)
+
+    sc.stop()
examples/src/main/python/mllib/correlations.py

Lines changed: 60 additions & 0 deletions
This is a new file (60 lines):

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Correlations using MLlib.
"""

import sys

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.stat import Statistics
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":
    if len(sys.argv) not in [1,2]:
        print >> sys.stderr, "Usage: correlations (<file>)"
        exit(-1)
    sc = SparkContext(appName="PythonCorrelations")
    if len(sys.argv) == 2:
        filepath = sys.argv[1]
    else:
        filepath = 'data/mllib/sample_linear_regression_data.txt'
    corrType = 'pearson'

    points = MLUtils.loadLibSVMFile(sc, filepath)\
        .map(lambda lp: LabeledPoint(lp.label, lp.features.toArray()))

    print
    print 'Summary of data file: ' + filepath
    print '%d data points' % points.count()

    # Statistics (correlations)
    print
    print 'Correlation (%s) between label and each feature' % corrType
    print 'Feature\tCorrelation'
    numFeatures = points.take(1)[0].features.size
    labelRDD = points.map(lambda lp: lp.label)
    for i in range(numFeatures):
        featureRDD = points.map(lambda lp: lp.features[i])
        corr = Statistics.corr(labelRDD, featureRDD, corrType)
        print '%d\t%g' % (i, corr)
    print

    sc.stop()
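For context on the per-feature loop above: Statistics.corr accepts two RDDs of doubles plus a method name, which is exactly what the example calls once per feature column. A minimal sketch (the appName and data are illustrative):

from pyspark import SparkContext
from pyspark.mllib.stat import Statistics

sc = SparkContext(appName="CorrSketch")
x = sc.parallelize([1.0, 2.0, 3.0, 4.0])
y = sc.parallelize([2.0, 4.0, 6.0, 8.0])
# Perfectly linearly related series, so Pearson correlation is 1.0.
print Statistics.corr(x, y, 'pearson')
sc.stop()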

examples/src/main/python/mllib/decision_tree_runner.py

Lines changed: 8 additions & 1 deletion
@@ -17,6 +17,8 @@

 """
 Decision tree classification and regression using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
 """

 import numpy, os, sys
@@ -117,17 +119,22 @@ def usage():
     if len(sys.argv) == 2:
         dataPath = sys.argv[1]
         if not os.path.isfile(dataPath):
+            sc.stop()
             usage()
     points = MLUtils.loadLibSVMFile(sc, dataPath)

     # Re-index class labels if needed.
     (reindexedData, origToNewLabels) = reindexClassLabels(points)

     # Train a classifier.
-    model = DecisionTree.trainClassifier(reindexedData, numClasses=2)
+    categoricalFeaturesInfo={} # no categorical features
+    model = DecisionTree.trainClassifier(reindexedData, numClasses=2,
+                                         categoricalFeaturesInfo=categoricalFeaturesInfo)
     # Print learned tree and stats.
     print "Trained DecisionTree for classification:"
     print "  Model numNodes: %d\n" % model.numNodes()
     print "  Model depth: %d\n" % model.depth()
     print "  Training accuracy: %g\n" % getAccuracy(model, reindexedData)
     print model
+
+    sc.stop()
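The new categoricalFeaturesInfo argument maps a feature's index to its number of categories; the empty dict passed above declares every feature continuous. A hedged sketch with a toy dataset (the data and the {1: 3} arity are illustrative assumptions, not from this commit):

from pyspark import SparkContext
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import DecisionTree

sc = SparkContext(appName="DecisionTreeSketch")
# Feature 0 is continuous; feature 1 is categorical with values {0, 1, 2}.
data = sc.parallelize([
    LabeledPoint(0.0, [0.1, 0.0]),
    LabeledPoint(0.0, [0.2, 1.0]),
    LabeledPoint(1.0, [0.9, 2.0]),
    LabeledPoint(1.0, [0.8, 2.0]),
])
model = DecisionTree.trainClassifier(data, numClasses=2,
                                     categoricalFeaturesInfo={1: 3})
print model
sc.stop()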

examples/src/main/python/mllib/kmeans.py

Lines changed: 1 addition & 0 deletions
@@ -42,3 +42,4 @@ def parseVector(line):
     k = int(sys.argv[2])
     model = KMeans.train(data, k)
     print "Final centers: " + str(model.clusterCenters)
+    sc.stop()

examples/src/main/python/mllib/logistic_regression.py

Lines changed: 1 addition & 0 deletions
@@ -50,3 +50,4 @@ def parsePoint(line):
     model = LogisticRegressionWithSGD.train(points, iterations)
     print "Final weights: " + str(model.weights)
     print "Final intercept: " + str(model.intercept)
+    sc.stop()
examples/src/main/python/mllib/random_rdd_generation.py

Lines changed: 55 additions & 0 deletions
This is a new file (55 lines). The Apache License header is identical to correlations.py above and is condensed here:

# (Apache License, Version 2.0 header, as in correlations.py)

"""
Randomly generated RDDs.
"""

import sys

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs


if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print >> sys.stderr, "Usage: random_rdd_generation"
        exit(-1)

    sc = SparkContext(appName="PythonRandomRDDGeneration")

    numExamples = 10000  # number of examples to generate
    fraction = 0.1  # fraction of data to sample

    # Example: RandomRDDs.normalRDD
    normalRDD = RandomRDDs.normalRDD(sc, numExamples)
    print 'Generated RDD of %d examples sampled from the standard normal distribution'\
        % normalRDD.count()
    print '  First 5 samples:'
    for sample in normalRDD.take(5):
        print '  ' + str(sample)
    print

    # Example: RandomRDDs.normalVectorRDD
    normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
    print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()
    print '  First 5 samples:'
    for sample in normalVectorRDD.take(5):
        print '  ' + str(sample)
    print

    sc.stop()
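RandomRDDs exposes other generators with the same call shape as normalRDD; uniformRDD and poissonRDD are assumed here as a sketch following the pattern above, not something this commit adds:

from pyspark import SparkContext
from pyspark.mllib.random import RandomRDDs

sc = SparkContext(appName="RandomRDDSketch")
u = RandomRDDs.uniformRDD(sc, 100)       # 100 draws from U[0, 1]
p = RandomRDDs.poissonRDD(sc, 2.0, 100)  # 100 draws, Poisson with mean 2.0
print u.mean(), p.mean()
sc.stop()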
examples/src/main/python/mllib/sampled_rdds.py

Lines changed: 86 additions & 0 deletions
This is a new file (86 lines). The Apache License header is again condensed:

# (Apache License, Version 2.0 header, as in correlations.py)

"""
Randomly sampled RDDs.
"""

import sys

from pyspark import SparkContext
from pyspark.mllib.util import MLUtils


if __name__ == "__main__":
    if len(sys.argv) not in [1, 2]:
        print >> sys.stderr, "Usage: sampled_rdds <libsvm data file>"
        exit(-1)
    if len(sys.argv) == 2:
        datapath = sys.argv[1]
    else:
        datapath = 'data/mllib/sample_binary_classification_data.txt'

    sc = SparkContext(appName="PythonSampledRDDs")

    fraction = 0.1  # fraction of data to sample

    examples = MLUtils.loadLibSVMFile(sc, datapath)
    numExamples = examples.count()
    if numExamples == 0:
        print >> sys.stderr, "Error: Data file had no samples to load."
        exit(1)
    print 'Loaded data with %d examples from file: %s' % (numExamples, datapath)

    # Example: RDD.sample() and RDD.takeSample()
    expectedSampleSize = int(numExamples * fraction)
    print 'Sampling RDD using fraction %g. Expected sample size = %d.' \
        % (fraction, expectedSampleSize)
    sampledRDD = examples.sample(withReplacement = True, fraction = fraction)
    print '  RDD.sample(): sample has %d examples' % sampledRDD.count()
    sampledArray = examples.takeSample(withReplacement = True, num = expectedSampleSize)
    print '  RDD.takeSample(): sample has %d examples' % len(sampledArray)

    print

    # Example: RDD.sampleByKey()
    keyedRDD = examples.map(lambda lp: (int(lp.label), lp.features))
    print '  Keyed data using label (Int) as key ==> Orig'
    # Count examples per label in original data.
    keyCountsA = keyedRDD.countByKey()

    # Subsample, and count examples per label in sampled data.
    fractions = {}
    for k in keyCountsA.keys():
        fractions[k] = fraction
    sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)
    keyCountsB = sampledByKeyRDD.countByKey()
    sizeB = sum(keyCountsB.values())
    print '  Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \
        % sizeB

    # Compare samples
    print '  \tFractions of examples with key'
    print 'Key\tOrig\tSample'
    for k in sorted(keyCountsA.keys()):
        fracA = keyCountsA[k] / float(numExamples)
        if sizeB != 0:
            fracB = keyCountsB.get(k, 0) / float(sizeB)
        else:
            fracB = 0
        print '%d\t%g\t%g' % (k, fracA, fracB)

    sc.stop()

examples/src/main/python/pagerank.py

Lines changed: 2 additions & 0 deletions
@@ -68,3 +68,5 @@ def parseNeighbors(urls):
     # Collects all URL ranks and dump them to console.
     for (link, rank) in ranks.collect():
         print "%s has rank: %s." % (link, rank)
+
+    sc.stop()

examples/src/main/python/pi.py

Lines changed: 2 additions & 0 deletions
@@ -37,3 +37,5 @@ def f(_):

     count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
     print "Pi is roughly %f" % (4.0 * count / n)
+
+    sc.stop()

examples/src/main/python/sort.py

Lines changed: 2 additions & 0 deletions
@@ -34,3 +34,5 @@
     output = sortedCount.collect()
     for (num, unitcount) in output:
         print num
+
+    sc.stop()

examples/src/main/python/transitive_closure.py

Lines changed: 2 additions & 0 deletions
@@ -64,3 +64,5 @@ def generateGraph():
             break

     print "TC has %i edges" % tc.count()
+
+    sc.stop()

examples/src/main/python/wordcount.py

Lines changed: 2 additions & 0 deletions
@@ -33,3 +33,5 @@
     output = counts.collect()
     for (word, count) in output:
         print "%s: %i" % (word, count)
+
+    sc.stop()
