Commit
added self-contained word count in PySpark
pyspark-in-action committed Jan 29, 2018
1 parent 38f877c commit d413439
Showing 3 changed files with 91 additions and 0 deletions.
13 changes: 13 additions & 0 deletions tutorial/wordcount/README.md
* word_count.py

Word count solution in PySpark. Note that the input file
path is hard-coded, which is not good practice; the purpose
here is simply to show how to read a file in Spark.

* word_count_ver2.py

Here the input file is passed in as a command-line parameter instead.
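
To run the two scripts (a minimal sketch: it assumes spark-submit is on
your PATH; the sample file path below is an assumption, and word_count.py
only finds its hard-coded input on the author's machine):

    $ spark-submit word_count.py
    $ spark-submit word_count_ver2.py file:///tmp/sample.txt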


best regards,
Mahmoud Parsian
36 changes: 36 additions & 0 deletions tutorial/wordcount/word_count.py
from __future__ import print_function

from pyspark.sql import SparkSession
#-----------------------------------


if __name__ == "__main__":

# create an instance of a SparkSession as spark
spark = SparkSession\
.builder\
.appName("wordcount")\
.getOrCreate()

inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"

    # get the SparkContext from the SparkSession as sc
sc = spark.sparkContext

# create RDD from a text file
textfileRDD = sc.textFile(inputPath)
print(textfileRDD.collect())

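    # split each line into words; flatMap flattens the per-line word lists into one RDD of words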
wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
print(wordsRDD.collect())

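    # map each word to a (word, 1) pair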
pairsRDD = wordsRDD.map(lambda word: (word, 1))
print(pairsRDD.collect())

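    # sum the 1s per word to compute each word's frequency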
frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
print(frequenciesRDD.collect())

# done!
spark.stop()
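
The collect() calls above pull every element of each RDD back to the driver, which is fine for a small sample file but will not scale to large inputs. Here is a minimal sketch of the same pipeline that writes the word frequencies to disk instead; the input and output paths are assumptions, and the output directory must not already exist:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("wordcount-to-disk").getOrCreate()
sc = spark.sparkContext

# the same transformations as above, chained, with no intermediate collect() calls
frequenciesRDD = sc.textFile("file:///tmp/sample.txt") \
    .flatMap(lambda line: line.split(" ")) \
    .map(lambda word: (word, 1)) \
    .reduceByKey(lambda a, b: a + b)

# saveAsTextFile() writes one part-* file per partition under the target directory
frequenciesRDD.saveAsTextFile("file:///tmp/wordcount_output")

spark.stop()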
42 changes: 42 additions & 0 deletions tutorial/wordcount/word_count_ver2.py
from __future__ import print_function

import sys

from pyspark.sql import SparkSession
#-----------------------------------


if __name__ == "__main__":

# create an instance of a SparkSession as spark
spark = SparkSession\
.builder\
.appName("wordcount")\
.getOrCreate()

# inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
#
# sys.argv[0] is the name of the script.
# sys.argv[1] is the first parameter
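    # fail fast with a usage message if no input path was given
    if len(sys.argv) != 2:
        print("Usage: word_count_ver2.py <input-path>", file=sys.stderr)
        sys.exit(-1)
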
inputPath = sys.argv[1] # input file
print("inputPath: {}".format(inputPath))


    # get the SparkContext from the SparkSession as sc
sc = spark.sparkContext

# create RDD from a text file
textfileRDD = sc.textFile(inputPath)
print(textfileRDD.collect())

wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
print(wordsRDD.collect())

pairsRDD = wordsRDD.map(lambda word: (word, 1))
print(pairsRDD.collect())

frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
print(frequenciesRDD.collect())

# done!
spark.stop()
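
A minimal run, assuming a local Spark installation with spark-submit on the PATH; the sample file name and contents are assumptions:

$ echo "a b a" > /tmp/sample.txt
$ spark-submit word_count_ver2.py file:///tmp/sample.txt

The final print(frequenciesRDD.collect()) should then show [('a', 2), ('b', 1)], though the order of the pairs may vary.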
