From d413439ed62a5376bb1e0fe7825ad405621d2c20 Mon Sep 17 00:00:00 2001
From: Mahmoud Parsian
Date: Mon, 29 Jan 2018 10:24:12 -0800
Subject: [PATCH] added self-contained word count in PySpark

---
 tutorial/wordcount/README.md          | 13 +++++++++
 tutorial/wordcount/word_count.py      | 36 +++++++++++++++++++++++
 tutorial/wordcount/word_count_ver2.py | 42 +++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)
 create mode 100644 tutorial/wordcount/README.md
 create mode 100644 tutorial/wordcount/word_count.py
 create mode 100644 tutorial/wordcount/word_count_ver2.py

diff --git a/tutorial/wordcount/README.md b/tutorial/wordcount/README.md
new file mode 100644
index 0000000..6c1fb49
--- /dev/null
+++ b/tutorial/wordcount/README.md
@@ -0,0 +1,13 @@
+* word_count.py
+
+A word count solution in PySpark. Note that the input file
+path is hard-coded, which is not good practice; the purpose
+is to show how to read a text file in Spark.
+
+* word_count_ver2.py
+
+The input file is passed as a command-line parameter.
+
+
+best regards,
+Mahmoud Parsian
diff --git a/tutorial/wordcount/word_count.py b/tutorial/wordcount/word_count.py
new file mode 100644
index 0000000..c62d0da
--- /dev/null
+++ b/tutorial/wordcount/word_count.py
@@ -0,0 +1,36 @@
+from __future__ import print_function
+
+import sys
+
+from pyspark.sql import SparkSession
+#-----------------------------------
+
+
+if __name__ == "__main__":
+
+    # create an instance of a SparkSession as spark
+    spark = SparkSession\
+        .builder\
+        .appName("wordcount")\
+        .getOrCreate()
+
+    inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
+
+    # create SparkContext as sc
+    sc = spark.sparkContext
+
+    # create RDD from a text file
+    textfileRDD = sc.textFile(inputPath)
+    print(textfileRDD.collect())
+
+    wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
+    print(wordsRDD.collect())
+
+    pairsRDD = wordsRDD.map(lambda word: (word, 1))
+    print(pairsRDD.collect())
+
+    frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
+    print(frequenciesRDD.collect())
+
+    # done!
+    spark.stop()
diff --git a/tutorial/wordcount/word_count_ver2.py b/tutorial/wordcount/word_count_ver2.py
new file mode 100644
index 0000000..38c613a
--- /dev/null
+++ b/tutorial/wordcount/word_count_ver2.py
@@ -0,0 +1,42 @@
+from __future__ import print_function
+
+import sys
+
+from pyspark.sql import SparkSession
+#-----------------------------------
+
+
+if __name__ == "__main__":
+
+    # create an instance of a SparkSession as spark
+    spark = SparkSession\
+        .builder\
+        .appName("wordcount")\
+        .getOrCreate()
+
+    # inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
+    #
+    # sys.argv[0] is the name of the script.
+    # sys.argv[1] is the first parameter
+    inputPath = sys.argv[1]  # input file
+    print("inputPath: {}".format(inputPath))
+
+
+    # create SparkContext as sc
+    sc = spark.sparkContext
+
+    # create RDD from a text file
+    textfileRDD = sc.textFile(inputPath)
+    print(textfileRDD.collect())
+
+    wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
+    print(wordsRDD.collect())
+
+    pairsRDD = wordsRDD.map(lambda word: (word, 1))
+    print(pairsRDD.collect())
+
+    frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
+    print(frequenciesRDD.collect())
+
+    # done!
+    spark.stop()
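
Note on the scripts above: both versions collect() every intermediate
RDD back to the driver, which is fine for a tutorial-sized sample file
but will not scale to large inputs. The sketch below is a possible
third variant, not part of this patch: it validates the command-line
argument, filters out empty tokens, and prints only a small sample of
the result instead of collecting everything. The script name
word_count_ver3.py and the take(20) sample size are illustrative
assumptions.

    from __future__ import print_function

    import sys

    from pyspark.sql import SparkSession

    if __name__ == "__main__":

        # expect exactly one argument: the input path
        if len(sys.argv) != 2:
            print("Usage: word_count_ver3.py <input-path>", file=sys.stderr)
            sys.exit(1)

        spark = SparkSession\
            .builder\
            .appName("wordcount")\
            .getOrCreate()

        sc = spark.sparkContext

        inputPath = sys.argv[1]

        # same pipeline as the scripts above, plus a filter for
        # empty tokens produced by consecutive spaces
        frequenciesRDD = sc.textFile(inputPath)\
            .flatMap(lambda line: line.split(" "))\
            .filter(lambda word: len(word) > 0)\
            .map(lambda word: (word, 1))\
            .reduceByKey(lambda a, b: a + b)

        # show a small sample instead of collect()-ing the whole RDD
        for word, count in frequenciesRDD.take(20):
            print(word, count)

        spark.stop()

A possible invocation, with a hypothetical input path:

    spark-submit word_count_ver3.py file:///tmp/sample.txt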