Commit
added self-contained word count in PySpark
pyspark-in-action committed Jan 29, 2018
1 parent 38f877c commit d413439
Showing 3 changed files with 91 additions and 0 deletions.
13 changes: 13 additions & 0 deletions tutorial/wordcount/README.md
* word_count.py

Word count solution in PySpark. Note that the input file
path is hard-coded, which is not good practice; the purpose
here is simply to show how to read a file in Spark.

* word_count_ver2.py

Here the input file is passed in as a command-line parameter instead.
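
To run the two scripts (a minimal sketch: it assumes spark-submit is on
your PATH; the sample file path below is an assumption, and word_count.py
only finds its hard-coded input on the author's machine):

    $ spark-submit word_count.py
    $ spark-submit word_count_ver2.py file:///tmp/sample.txt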


best regards,
Mahmoud Parsian
36 changes: 36 additions & 0 deletions tutorial/wordcount/word_count.py
from __future__ import print_function

from pyspark.sql import SparkSession
#-----------------------------------


if __name__ == "__main__":

# create an instance of a SparkSession as spark
spark = SparkSession\
.builder\
.appName("wordcount")\
.getOrCreate()

inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"

    # get the SparkContext from the SparkSession as sc
sc = spark.sparkContext

# create RDD from a text file
textfileRDD = sc.textFile(inputPath)
print(textfileRDD.collect())

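    # split each line into words; flatMap flattens the per-line word lists into one RDD of words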
wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
print(wordsRDD.collect())

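    # map each word to a (word, 1) pair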
pairsRDD = wordsRDD.map(lambda word: (word, 1))
print(pairsRDD.collect())

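    # sum the 1s per word to compute each word's frequency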
frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
print(frequenciesRDD.collect())

# done!
spark.stop()
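
The collect() calls above pull every element of each RDD back to the driver, which is fine for a small sample file but will not scale to large inputs. Here is a minimal sketch of the same pipeline that writes the word frequencies to disk instead; the input and output paths are assumptions, and the output directory must not already exist:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("wordcount-to-disk").getOrCreate()
sc = spark.sparkContext

# the same transformations as above, chained, with no intermediate collect() calls
frequenciesRDD = sc.textFile("file:///tmp/sample.txt") \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)

# saveAsTextFile() writes one part-* file per partition under the target directory
frequenciesRDD.saveAsTextFile("file:///tmp/wordcount_output")

spark.stop()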
42 changes: 42 additions & 0 deletions tutorial/wordcount/word_count_ver2.py
from __future__ import print_function

import sys

from pyspark.sql import SparkSession
#-----------------------------------


if __name__ == "__main__":

# create an instance of a SparkSession as spark
spark = SparkSession\
.builder\
.appName("wordcount")\
.getOrCreate()

# inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
#
# sys.argv[0] is the name of the script.
# sys.argv[1] is the first parameter
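    # fail fast with a usage message if no input path was given
    if len(sys.argv) != 2:
        print("Usage: word_count_ver2.py <input-path>", file=sys.stderr)
        sys.exit(-1)
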
inputPath = sys.argv[1] # input file
print("inputPath: {}".format(inputPath))


    # get the SparkContext from the SparkSession as sc
sc = spark.sparkContext

# create RDD from a text file
textfileRDD = sc.textFile(inputPath)
print(textfileRDD.collect())

wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
print(wordsRDD.collect())

pairsRDD = wordsRDD.map(lambda word: (word, 1))
print(pairsRDD.collect())

frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
print(frequenciesRDD.collect())

# done!
spark.stop()
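
A minimal run, assuming a local Spark installation with spark-submit on the PATH; the sample file name and contents are assumptions:

$ echo "a b a" > /tmp/sample.txt
$ spark-submit word_count_ver2.py file:///tmp/sample.txt

The final print(frequenciesRDD.collect()) should then show [('a', 2), ('b', 1)], though the order of the pairs may vary.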
