From d413439ed62a5376bb1e0fe7825ad405621d2c20 Mon Sep 17 00:00:00 2001
From: Mahmoud Parsian
Date: Mon, 29 Jan 2018 10:24:12 -0800
Subject: [PATCH] added self-contained word count in PySpark

---
 tutorial/wordcount/README.md          | 13 +++++++++
 tutorial/wordcount/word_count.py      | 36 +++++++++++++++++++++++
 tutorial/wordcount/word_count_ver2.py | 42 +++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)
 create mode 100644 tutorial/wordcount/README.md
 create mode 100644 tutorial/wordcount/word_count.py
 create mode 100644 tutorial/wordcount/word_count_ver2.py

diff --git a/tutorial/wordcount/README.md b/tutorial/wordcount/README.md
new file mode 100644
index 0000000..6c1fb49
--- /dev/null
+++ b/tutorial/wordcount/README.md
@@ -0,0 +1,13 @@
+* word_count.py
+
+A word count solution in PySpark. Note that the input file
+path is hard-coded, which is not good practice; the purpose
+is to show how to read a text file in Spark.
+
+* word_count_ver2.py
+
+The input file is passed as a command-line parameter.
+
+
+best regards,
+Mahmoud Parsian
diff --git a/tutorial/wordcount/word_count.py b/tutorial/wordcount/word_count.py
new file mode 100644
index 0000000..c62d0da
--- /dev/null
+++ b/tutorial/wordcount/word_count.py
@@ -0,0 +1,36 @@
+from __future__ import print_function
+
+import sys
+
+from pyspark.sql import SparkSession
+#-----------------------------------
+
+
+if __name__ == "__main__":
+
+    # create an instance of a SparkSession as spark
+    spark = SparkSession\
+        .builder\
+        .appName("wordcount")\
+        .getOrCreate()
+
+    inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
+
+    # create SparkContext as sc
+    sc = spark.sparkContext
+
+    # create RDD from a text file
+    textfileRDD = sc.textFile(inputPath)
+    print(textfileRDD.collect())
+
+    wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
+    print(wordsRDD.collect())
+
+    pairsRDD = wordsRDD.map(lambda word: (word, 1))
+    print(pairsRDD.collect())
+
+    frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
+    print(frequenciesRDD.collect())
+
+    # done!
+    spark.stop()
diff --git a/tutorial/wordcount/word_count_ver2.py b/tutorial/wordcount/word_count_ver2.py
new file mode 100644
index 0000000..38c613a
--- /dev/null
+++ b/tutorial/wordcount/word_count_ver2.py
@@ -0,0 +1,42 @@
+from __future__ import print_function
+
+import sys
+
+from pyspark.sql import SparkSession
+#-----------------------------------
+
+
+if __name__ == "__main__":
+
+    # create an instance of a SparkSession as spark
+    spark = SparkSession\
+        .builder\
+        .appName("wordcount")\
+        .getOrCreate()
+
+    # inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
+    #
+    # sys.argv[0] is the name of the script.
+    # sys.argv[1] is the first parameter
+    inputPath = sys.argv[1]  # input file
+    print("inputPath: {}".format(inputPath))
+
+
+    # create SparkContext as sc
+    sc = spark.sparkContext
+
+    # create RDD from a text file
+    textfileRDD = sc.textFile(inputPath)
+    print(textfileRDD.collect())
+
+    wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
+    print(wordsRDD.collect())
+
+    pairsRDD = wordsRDD.map(lambda word: (word, 1))
+    print(pairsRDD.collect())
+
+    frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
+    print(frequenciesRDD.collect())
+
+    # done!
+    spark.stop()
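
Note on the scripts above: both versions collect() every intermediate
RDD back to the driver, which is fine for a tutorial-sized sample file
but will not scale to large inputs. The sketch below is a possible
third variant, not part of this patch: it validates the command-line
argument, filters out empty tokens, and prints only a small sample of
the result instead of collecting everything. The script name
word_count_ver3.py and the take(20) sample size are illustrative
assumptions.

    from __future__ import print_function

    import sys

    from pyspark.sql import SparkSession

    if __name__ == "__main__":

        # expect exactly one argument: the input path
        if len(sys.argv) != 2:
            print("Usage: word_count_ver3.py <input-path>", file=sys.stderr)
            sys.exit(1)

        spark = SparkSession\
            .builder\
            .appName("wordcount")\
            .getOrCreate()

        sc = spark.sparkContext

        inputPath = sys.argv[1]

        # same pipeline as the scripts above, plus a filter for
        # empty tokens produced by consecutive spaces
        frequenciesRDD = sc.textFile(inputPath)\
            .flatMap(lambda line: line.split(" "))\
            .filter(lambda word: len(word) > 0)\
            .map(lambda word: (word, 1))\
            .reduceByKey(lambda a, b: a + b)

        # show a small sample instead of collect()-ing the whole RDD
        for word, count in frequenciesRDD.take(20):
            print(word, count)

        spark.stop()

A possible invocation, with a hypothetical input path:

    spark-submit word_count_ver3.py file:///tmp/sample.txt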