|
| 1 | +Mean and Standard Deviation by Spark's combineByKey() |
| 2 | +===================================================== |
| 3 | + |
| 4 | +```` |
| 5 | +# ./bin/pyspark |
| 6 | +Python 2.7.10 (default, Oct 23 2015, 19:19:21) |
| 7 | +... |
| 8 | +Welcome to |
| 9 | + ____ __ |
| 10 | + / __/__ ___ _____/ /__ |
| 11 | + _\ \/ _ \/ _ `/ __/ '_/ |
| 12 | + /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 |
| 13 | + /_/ |
| 14 | +
|
| 15 | +Using Python version 2.7.10 (default, Oct 23 2015 19:19:21) |
| 16 | +SparkContext available as sc, HiveContext available as sqlContext. |
| 17 | +>>> data = [ |
| 18 | +... ("A", 2.), ("A", 4.), ("A", 9.), |
| 19 | +... ("B", 10.), ("B", 20.), |
| 20 | +... ("Z", 3.), ("Z", 5.), ("Z", 8.), ("Z", 12.) |
| 21 | +... ] |
| 22 | +>>> data |
| 23 | +[ |
| 24 | + ('A', 2.0), |
| 25 | + ('A', 4.0), |
| 26 | + ('A', 9.0), |
| 27 | + ('B', 10.0), |
| 28 | + ('B', 20.0), |
| 29 | + ('Z', 3.0), |
| 30 | + ('Z', 5.0), |
| 31 | + ('Z', 8.0), |
| 32 | + ('Z', 12.0) |
| 33 | +] |
| 34 | +>>> rdd = sc.parallelize( data ) |
| 35 | +>>> rdd.collect() |
| 36 | +[ |
| 37 | + ('A', 2.0), |
| 38 | + ('A', 4.0), |
| 39 | + ('A', 9.0), |
| 40 | + ('B', 10.0), |
| 41 | + ('B', 20.0), |
| 42 | + ('Z', 3.0), |
| 43 | + ('Z', 5.0), |
| 44 | + ('Z', 8.0), |
| 45 | + ('Z', 12.0) |
| 46 | +] |
| 47 | +>>> rdd.count() |
| 48 | +9 |
| 49 | +>>> sumCount = rdd.combineByKey(lambda value: (value, value*value, 1), |
| 50 | +... lambda x, value: (x[0] + value, x[1] + value*value, x[2] + 1), |
| 51 | +... lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]) |
| 52 | +... ) |
| 53 | +
|
| 54 | +>>> sumCount.collect() |
| 55 | +[ |
| 56 | + ('A', (15.0, 101.0, 3)), |
| 57 | + ('Z', (28.0, 242.0, 4)), |
| 58 | + ('B', (30.0, 500.0, 2)) |
| 59 | +] |
| 60 | +
|
| 61 | +>>> import math |
| 62 | +>>> def stdDev( sumX, sumSquared, n ): |
| 63 | +... mean = sumX / n |
| 64 | +... stdDeviation = math.sqrt ((sumSquared - n*mean*mean) /n) |
| 65 | +... return (mean, stdDeviation) |
| 66 | +... ^D |
| 67 | +
|
| 68 | +>>> meanAndStdDev = sumCount.mapValues(lambda x : stdDev(x[0], x[1], x[2])) |
| 69 | +>>> meanAndStdDev.collect() |
| 70 | +[ |
| 71 | + ('A', (5.0, 2.943920288775949)), |
| 72 | + ('Z', (7.0, 3.391164991562634)), |
| 73 | + ('B', (15.0, 5.0)) |
| 74 | +] |
| 75 | +>>> |
| 76 | +```` |
0 commit comments