Skip to content

Commit 0b7cec3

Browse files
committed
Small updates based on code review. Renamed statistical_summary.py to correlations.py
1 parent ab48f6e commit 0b7cec3

File tree

4 files changed

+32
-29
lines changed

4 files changed

+32
-29
lines changed

examples/src/main/python/mllib/statistical_summary.py renamed to examples/src/main/python/mllib/correlations.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#
1717

1818
"""
19-
Statistical summarization using MLlib.
19+
Correlations using MLlib.
2020
"""
2121

2222
import sys
@@ -29,9 +29,9 @@
2929

3030
if __name__ == "__main__":
3131
if len(sys.argv) not in [1,2]:
32-
print >> sys.stderr, "Usage: statistical_summary (<file>)"
32+
print >> sys.stderr, "Usage: correlations (<file>)"
3333
exit(-1)
34-
sc = SparkContext(appName="PythonStatisticalSummary")
34+
sc = SparkContext(appName="PythonCorrelations")
3535
if len(sys.argv) == 2:
3636
filepath = sys.argv[1]
3737
else:

examples/src/main/python/mllib/random_and_sampled_rdds.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929

3030
if __name__ == "__main__":
3131
if len(sys.argv) not in [1, 2]:
32-
print >> sys.stderr, "Usage: logistic_regression <libsvm data file>"
32+
print >> sys.stderr, "Usage: random_and_sampled_rdds <libsvm data file>"
3333
exit(-1)
3434
if len(sys.argv) == 2:
3535
datapath = sys.argv[1]
@@ -45,22 +45,23 @@
4545

4646
# Example: RandomRDDGenerators
4747
normalRDD = RandomRDDGenerators.normalRDD(sc, numExamples)
48-
print 'Generated RDD of %d examples sampled from a unit normal distribution' % normalRDD.count()
48+
print 'Generated RDD of %d examples sampled from the standard normal distribution'\
49+
% normalRDD.count()
4950
normalVectorRDD = RandomRDDGenerators.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
5051
print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()
5152

52-
print ''
53+
print
5354

5455
# Example: RDD.sample() and RDD.takeSample()
55-
exactSampleSize = int(numExamples * fraction)
56+
expectedSampleSize = int(numExamples * fraction)
5657
print 'Sampling RDD using fraction %g. Expected sample size = %d.' \
57-
% (fraction, exactSampleSize)
58+
% (fraction, expectedSampleSize)
5859
sampledRDD = normalRDD.sample(withReplacement = True, fraction = fraction)
5960
print ' RDD.sample(): sample has %d examples' % sampledRDD.count()
60-
sampledArray = normalRDD.takeSample(withReplacement = True, num = exactSampleSize)
61+
sampledArray = normalRDD.takeSample(withReplacement = True, num = expectedSampleSize)
6162
print ' RDD.takeSample(): sample has %d examples' % len(sampledArray)
6263

63-
print ''
64+
print
6465

6566
# Example: RDD.sampleByKey()
6667
examples = MLUtils.loadLibSVMFile(sc, datapath)
@@ -74,8 +75,7 @@
7475
fractions = {}
7576
for k in keyCountsA.keys():
7677
fractions[k] = fraction
77-
sampledByKeyRDD = \
78-
keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)#, exact = True)
78+
sampledByKeyRDD = keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)
7979
keyCountsB = sampledByKeyRDD.countByKey()
8080
sizeB = sum(keyCountsB.values())
8181
print ' Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \

examples/src/main/scala/org/apache/spark/examples/mllib/RandomAndSampledRDDs.scala

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -32,30 +32,33 @@ import org.apache.spark.SparkContext._
3232
* }}}
3333
* If you use it as a template to create your own app, please use `spark-submit` to submit your app.
3434
*/
35-
object RandomAndSampledRDDs extends App {
35+
object RandomAndSampledRDDs {
3636

3737
case class Params(input: String = "data/mllib/sample_binary_classification_data.txt")
3838

39-
val defaultParams = Params()
39+
def main(args: Array[String]) {
40+
val defaultParams = Params()
4041

41-
val parser = new OptionParser[Params]("RandomAndSampledRDDs") {
42-
head("RandomAndSampledRDDs: an example app for randomly generated and sampled RDDs.")
43-
opt[String]("input")
44-
.text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
45-
.action((x, c) => c.copy(input = x))
46-
note(
47-
"""
42+
val parser = new OptionParser[Params]("RandomAndSampledRDDs") {
43+
head("RandomAndSampledRDDs: an example app for randomly generated and sampled RDDs.")
44+
opt[String]("input")
45+
.text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
46+
.action((x, c) => c.copy(input = x))
47+
note(
48+
"""
4849
|For example, the following command runs this app:
4950
|
5051
| bin/spark-submit --class org.apache.spark.examples.mllib.RandomAndSampledRDDs \
5152
| examples/target/scala-*/spark-examples-*.jar
52-
""".stripMargin)
53-
}
53+
""".
54+
stripMargin)
55+
}
5456

55-
parser.parse(args, defaultParams).map { params =>
56-
run(params)
57-
} getOrElse {
58-
sys.exit(1)
57+
parser.parse(args, defaultParams).map { params =>
58+
run(params)
59+
} getOrElse {
60+
sys.exit(1)
61+
}
5962
}
6063

6164
def run(params: Params) {

python/pyspark/mllib/linalg.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,12 +161,12 @@ def squared_distance(self, other):
161161
j += 1
162162
return result
163163

164-
def toDense(self):
164+
def toArray(self):
165165
"""
166166
Returns a copy of this SparseVector as a 1-dimensional NumPy array.
167167
"""
168168
arr = numpy.zeros(self.size)
169-
for i in range(self.indices.size):
169+
for i in xrange(self.indices.size):
170170
arr[self.indices[i]] = self.values[i]
171171
return arr
172172

0 commit comments

Comments
 (0)