forked from mahmoudparsian/pyspark-tutorial
added self-contained word count in PySpark
1 parent 38f877c · commit d413439
Showing 3 changed files with 91 additions and 0 deletions.
@@ -0,0 +1,13 @@
* word_count.py

  Word Count solution in PySpark. Note that the input file is
  hard-coded: not a very good practice. The purpose is to
  show how to read files in Spark.

* word_count_ver2.py

  I pass the input file as a parameter.

best regards,
Mahmoud Parsian
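
One possible way to launch the two scripts (a sketch, assuming a local Spark installation with spark-submit on the PATH; the sample.txt path is only the one hard-coded in word_count.py and stands in for any input file):

    spark-submit word_count.py
    spark-submit word_count_ver2.py file:///Users/mparsian/spark-2.2.1/zbin/sample.txt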
@@ -0,0 +1,36 @@
from __future__ import print_function

import sys

from pyspark.sql import SparkSession
#-----------------------------------


if __name__ == "__main__":

    # create an instance of a SparkSession as spark
    spark = SparkSession\
        .builder\
        .appName("wordcount")\
        .getOrCreate()

    inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"

    # create SparkContext as sc
    sc = spark.sparkContext

    # create RDD from a text file
    textfileRDD = sc.textFile(inputPath)
    print(textfileRDD.collect())

    # split each line into words
    wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
    print(wordsRDD.collect())

    # map each word to a (word, 1) pair
    pairsRDD = wordsRDD.map(lambda word: (word, 1))
    print(pairsRDD.collect())

    # sum the counts for each word
    frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
    print(frequenciesRDD.collect())

    # done!
    spark.stop()
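
For illustration only (this walkthrough is not part of the commit): if sample.txt held the two lines "fox jumped" and "fox ran", the four print statements above would show

    ['fox jumped', 'fox ran']
    ['fox', 'jumped', 'fox', 'ran']
    [('fox', 1), ('jumped', 1), ('fox', 1), ('ran', 1)]
    [('fox', 2), ('jumped', 1), ('ran', 1)]

with the last list possibly coming back in a different order, since reduceByKey gives no ordering guarantee.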
@@ -0,0 +1,42 @@
from __future__ import print_function

import sys

from pyspark.sql import SparkSession
#-----------------------------------


if __name__ == "__main__":

    # create an instance of a SparkSession as spark
    spark = SparkSession\
        .builder\
        .appName("wordcount")\
        .getOrCreate()

    # inputPath = "file:///Users/mparsian/spark-2.2.1/zbin/sample.txt"
    #
    # sys.argv[0] is the name of the script.
    # sys.argv[1] is the first parameter
    inputPath = sys.argv[1]  # input file
    print("inputPath: {}".format(inputPath))

    # create SparkContext as sc
    sc = spark.sparkContext

    # create RDD from a text file
    textfileRDD = sc.textFile(inputPath)
    print(textfileRDD.collect())

    # split each line into words
    wordsRDD = textfileRDD.flatMap(lambda line: line.split(" "))
    print(wordsRDD.collect())

    # map each word to a (word, 1) pair
    pairsRDD = wordsRDD.map(lambda word: (word, 1))
    print(pairsRDD.collect())

    # sum the counts for each word
    frequenciesRDD = pairsRDD.reduceByKey(lambda a, b: a + b)
    print(frequenciesRDD.collect())

    # done!
    spark.stop()
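
Since word_count_ver2.py reads sys.argv[1] directly, running it without an argument raises an IndexError. A minimal guard (a sketch for illustration, not part of this commit) could be placed just before the argument is read:

    if len(sys.argv) != 2:
        # print_function is already imported, so file=sys.stderr is valid
        print("usage: word_count_ver2.py <input-path>", file=sys.stderr)
        sys.exit(-1)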