diff --git a/tutorial/combine-by-key/README.md b/tutorial/combine-by-key/README.md new file mode 100644 index 0000000..dd9e291 --- /dev/null +++ b/tutorial/combine-by-key/README.md @@ -0,0 +1,8 @@ +Spark's combineByKey() Examples and Tutorial +============================================ + +* [Mean Calculation by combineByKey()](./spark-combineByKey.md) +* [Standard Deviation and Mean Calculation by combineByKey()](./standard_deviation_by_combineByKey.md) + + +[![Data Algorithms Book](https://github.com/mahmoudparsian/data-algorithms-book/blob/master/misc/data_algorithms_image.jpg)](http://shop.oreilly.com/product/0636920033950.do) diff --git a/tutorial/combine-by-key/standard_deviation_by_combineByKey.md b/tutorial/combine-by-key/standard_deviation_by_combineByKey.md new file mode 100644 index 0000000..2d1576d --- /dev/null +++ b/tutorial/combine-by-key/standard_deviation_by_combineByKey.md @@ -0,0 +1,76 @@ +Mean and Standard Deviation by Spark's combineByKey() +===================================================== + +```` +# ./bin/pyspark +Python 2.7.10 (default, Oct 23 2015, 19:19:21) +... +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /__ / .__/\_,_/_/ /_/\_\ version 1.6.1 + /_/ + +Using Python version 2.7.10 (default, Oct 23 2015 19:19:21) +SparkContext available as sc, HiveContext available as sqlContext. +>>> data = [ +... ("A", 2.), ("A", 4.), ("A", 9.), +... ("B", 10.), ("B", 20.), +... ("Z", 3.), ("Z", 5.), ("Z", 8.), ("Z", 12.) +... ] +>>> data +[ + ('A', 2.0), + ('A', 4.0), + ('A', 9.0), + ('B', 10.0), + ('B', 20.0), + ('Z', 3.0), + ('Z', 5.0), + ('Z', 8.0), + ('Z', 12.0) +] +>>> rdd = sc.parallelize( data ) +>>> rdd.collect() +[ + ('A', 2.0), + ('A', 4.0), + ('A', 9.0), + ('B', 10.0), + ('B', 20.0), + ('Z', 3.0), + ('Z', 5.0), + ('Z', 8.0), + ('Z', 12.0) +] +>>> rdd.count() +9 +>>> sumCount = rdd.combineByKey(lambda value: (value, value*value, 1), +... lambda x, value: (x[0] + value, x[1] + value*value, x[2] + 1), +... lambda x, y: (x[0] + y[0], x[1] + y[1], x[2] + y[2]) +... ) + +>>> sumCount.collect() +[ + ('A', (15.0, 101.0, 3)), + ('Z', (28.0, 242.0, 4)), + ('B', (30.0, 500.0, 2)) +] + +>>> import math +>>> def stdDev( sumX, sumSquared, n ): +... mean = sumX / n +... stdDeviation = math.sqrt ((sumSquared - n*mean*mean) /n) +... return (mean, stdDeviation) +... ^D + +>>> meanAndStdDev = sumCount.mapValues(lambda x : stdDev(x[0], x[1], x[2])) +>>> meanAndStdDev.collect() +[ + ('A', (5.0, 2.943920288775949)), + ('Z', (7.0, 3.391164991562634)), + ('B', (15.0, 5.0)) +] +>>> +```` \ No newline at end of file