From c04f3b55a01e32f9431ead480a6cd42443796d33 Mon Sep 17 00:00:00 2001 From: Mahmoud Parsian Date: Fri, 31 Dec 2021 20:40:51 -0800 Subject: [PATCH] added pyspark UDF example --- tutorial/.DS_Store | Bin tutorial/add-indices/add-indices.txt | 0 tutorial/basic-average/basic-average.txt | 0 tutorial/basic-filter/basic-filter.txt | 0 tutorial/basic-join/basicjoin.txt | 0 tutorial/basic-map/basic-map.txt | 0 tutorial/basic-multiply/basic-multiply.txt | 0 tutorial/basic-sort/sort-by-key.txt | 0 tutorial/basic-sum/basic-sum.txt | 0 tutorial/basic-union/basic-union.txt | 0 tutorial/bigrams/bigrams.txt | 0 tutorial/cartesian/cartesian.txt | 0 tutorial/combine-by-key/README.md | 0 tutorial/combine-by-key/combine-by-key.txt | 0 ...ng_with_spark_by_Javier_Santos_Paniego.pdf | Bin tutorial/combine-by-key/spark-combineByKey.md | 0 .../combine-by-key/spark-combineByKey.txt | 0 .../standard_deviation_by_combineByKey.md | 0 tutorial/dna-basecount/README.md | 0 tutorial/dna-basecount/dna-basecount.md | 0 tutorial/dna-basecount/dna-basecount2.md | 0 tutorial/dna-basecount/dna-basecount3.md | 0 tutorial/dna-basecount/dna_seq.txt | 0 tutorial/map-partitions/README.md | 0 tutorial/pyspark-udf/pyspark_udf_maptype.txt | 57 ++++++++++++++++++ tutorial/split-function/README.md | 0 tutorial/top-N/top-N.txt | 0 tutorial/wordcount/README.md | 0 tutorial/wordcount/word_count.py | 0 tutorial/wordcount/word_count_ver2.py | 0 tutorial/wordcount/wordcount-shorthand.txt | 0 tutorial/wordcount/wordcount.txt | 0 32 files changed, 57 insertions(+) mode change 100644 => 100755 tutorial/.DS_Store mode change 100644 => 100755 tutorial/add-indices/add-indices.txt mode change 100644 => 100755 tutorial/basic-average/basic-average.txt mode change 100644 => 100755 tutorial/basic-filter/basic-filter.txt mode change 100644 => 100755 tutorial/basic-join/basicjoin.txt mode change 100644 => 100755 tutorial/basic-map/basic-map.txt mode change 100644 => 100755 tutorial/basic-multiply/basic-multiply.txt mode change 100644 => 100755 tutorial/basic-sort/sort-by-key.txt mode change 100644 => 100755 tutorial/basic-sum/basic-sum.txt mode change 100644 => 100755 tutorial/basic-union/basic-union.txt mode change 100644 => 100755 tutorial/bigrams/bigrams.txt mode change 100644 => 100755 tutorial/cartesian/cartesian.txt mode change 100644 => 100755 tutorial/combine-by-key/README.md mode change 100644 => 100755 tutorial/combine-by-key/combine-by-key.txt mode change 100644 => 100755 tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf mode change 100644 => 100755 tutorial/combine-by-key/spark-combineByKey.md mode change 100644 => 100755 tutorial/combine-by-key/spark-combineByKey.txt mode change 100644 => 100755 tutorial/combine-by-key/standard_deviation_by_combineByKey.md mode change 100644 => 100755 tutorial/dna-basecount/README.md mode change 100644 => 100755 tutorial/dna-basecount/dna-basecount.md mode change 100644 => 100755 tutorial/dna-basecount/dna-basecount2.md mode change 100644 => 100755 tutorial/dna-basecount/dna-basecount3.md mode change 100644 => 100755 tutorial/dna-basecount/dna_seq.txt mode change 100644 => 100755 tutorial/map-partitions/README.md create mode 100644 tutorial/pyspark-udf/pyspark_udf_maptype.txt mode change 100644 => 100755 tutorial/split-function/README.md mode change 100644 => 100755 tutorial/top-N/top-N.txt mode change 100644 => 100755 tutorial/wordcount/README.md mode change 100644 => 100755 tutorial/wordcount/word_count.py mode change 100644 => 100755 tutorial/wordcount/word_count_ver2.py mode change 100644 => 100755 tutorial/wordcount/wordcount-shorthand.txt mode change 100644 => 100755 tutorial/wordcount/wordcount.txt diff --git a/tutorial/.DS_Store b/tutorial/.DS_Store old mode 100644 new mode 100755 diff --git a/tutorial/add-indices/add-indices.txt b/tutorial/add-indices/add-indices.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-average/basic-average.txt b/tutorial/basic-average/basic-average.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-filter/basic-filter.txt b/tutorial/basic-filter/basic-filter.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-join/basicjoin.txt b/tutorial/basic-join/basicjoin.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-map/basic-map.txt b/tutorial/basic-map/basic-map.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-multiply/basic-multiply.txt b/tutorial/basic-multiply/basic-multiply.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-sort/sort-by-key.txt b/tutorial/basic-sort/sort-by-key.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-sum/basic-sum.txt b/tutorial/basic-sum/basic-sum.txt old mode 100644 new mode 100755 diff --git a/tutorial/basic-union/basic-union.txt b/tutorial/basic-union/basic-union.txt old mode 100644 new mode 100755 diff --git a/tutorial/bigrams/bigrams.txt b/tutorial/bigrams/bigrams.txt old mode 100644 new mode 100755 diff --git a/tutorial/cartesian/cartesian.txt b/tutorial/cartesian/cartesian.txt old mode 100644 new mode 100755 diff --git a/tutorial/combine-by-key/README.md b/tutorial/combine-by-key/README.md old mode 100644 new mode 100755 diff --git a/tutorial/combine-by-key/combine-by-key.txt b/tutorial/combine-by-key/combine-by-key.txt old mode 100644 new mode 100755 diff --git a/tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf b/tutorial/combine-by-key/distributed_computing_with_spark_by_Javier_Santos_Paniego.pdf old mode 100644 new mode 100755 diff --git a/tutorial/combine-by-key/spark-combineByKey.md b/tutorial/combine-by-key/spark-combineByKey.md old mode 100644 new mode 100755 diff --git a/tutorial/combine-by-key/spark-combineByKey.txt b/tutorial/combine-by-key/spark-combineByKey.txt old mode 100644 new mode 100755 diff --git a/tutorial/combine-by-key/standard_deviation_by_combineByKey.md b/tutorial/combine-by-key/standard_deviation_by_combineByKey.md old mode 100644 new mode 100755 diff --git a/tutorial/dna-basecount/README.md b/tutorial/dna-basecount/README.md old mode 100644 new mode 100755 diff --git a/tutorial/dna-basecount/dna-basecount.md b/tutorial/dna-basecount/dna-basecount.md old mode 100644 new mode 100755 diff --git a/tutorial/dna-basecount/dna-basecount2.md b/tutorial/dna-basecount/dna-basecount2.md old mode 100644 new mode 100755 diff --git a/tutorial/dna-basecount/dna-basecount3.md b/tutorial/dna-basecount/dna-basecount3.md old mode 100644 new mode 100755 diff --git a/tutorial/dna-basecount/dna_seq.txt b/tutorial/dna-basecount/dna_seq.txt old mode 100644 new mode 100755 diff --git a/tutorial/map-partitions/README.md b/tutorial/map-partitions/README.md old mode 100644 new mode 100755 diff --git a/tutorial/pyspark-udf/pyspark_udf_maptype.txt b/tutorial/pyspark-udf/pyspark_udf_maptype.txt new file mode 100644 index 0000000..5994c9f --- /dev/null +++ b/tutorial/pyspark-udf/pyspark_udf_maptype.txt @@ -0,0 +1,57 @@ +$SPARK_HOME/bin/pyspark +Python 3.8.9 (default, Nov 9 2021, 04:26:29) +Welcome to + ____ __ + / __/__ ___ _____/ /__ + _\ \/ _ \/ _ `/ __/ '_/ + /__ / .__/\_,_/_/ /_/\_\ version 3.2.0 + /_/ + +Using Python version 3.8.9 (default, Nov 9 2021 04:26:29) +Spark context Web UI available at http://10.0.0.232:4040 +Spark context available as 'sc' (master = local[*], app id = local-1641011178190). +SparkSession available as 'spark'. + +>>> from pyspark.sql import Row + +>>> data = spark.createDataFrame( +... [Row(zip_code='94087', city='Sunnyvale'), +... Row(zip_code='94088', city='Cupertino'), +... Row(zip_code='95055', city='Santa Clara'), +... Row(zip_code='95054', city='Palo Alto')]) + +>>> +>>> data.show() ++--------+-----------+ +|zip_code| city| ++--------+-----------+ +| 94087| Sunnyvale| +| 94088| Cupertino| +| 95055|Santa Clara| +| 95054| Palo Alto| ++--------+-----------+ + +>>> from pyspark.sql.functions import udf +>>> from pyspark.sql import types as T +>>> +>>> @udf(T.MapType(T.StringType(), T.StringType())) +... def create_structure(zip_code, city): +... return {zip_code: city} +... +>>> data.withColumn('structure', create_structure(data.zip_code, data.city)).toJSON().collect() +[ + '{"zip_code":"94087","city":"Sunnyvale","structure":{"94087":"Sunnyvale"}}', + '{"zip_code":"94088","city":"Cupertino","structure":{"94088":"Cupertino"}}', + '{"zip_code":"95055","city":"Santa Clara","structure":{"95055":"Santa Clara"}}', + '{"zip_code":"95054","city":"Palo Alto","structure":{"95054":"Palo Alto"}}' +] + +>>> data.withColumn('structure', create_structure(data.zip_code, data.city)).show(truncate=False) ++--------+-----------+----------------------+ +|zip_code|city |structure | ++--------+-----------+----------------------+ +|94087 |Sunnyvale |{94087 -> Sunnyvale} | +|94088 |Cupertino |{94088 -> Cupertino} | +|95055 |Santa Clara|{95055 -> Santa Clara}| +|95054 |Palo Alto |{95054 -> Palo Alto} | ++--------+-----------+----------------------+ diff --git a/tutorial/split-function/README.md b/tutorial/split-function/README.md old mode 100644 new mode 100755 diff --git a/tutorial/top-N/top-N.txt b/tutorial/top-N/top-N.txt old mode 100644 new mode 100755 diff --git a/tutorial/wordcount/README.md b/tutorial/wordcount/README.md old mode 100644 new mode 100755 diff --git a/tutorial/wordcount/word_count.py b/tutorial/wordcount/word_count.py old mode 100644 new mode 100755 diff --git a/tutorial/wordcount/word_count_ver2.py b/tutorial/wordcount/word_count_ver2.py old mode 100644 new mode 100755 diff --git a/tutorial/wordcount/wordcount-shorthand.txt b/tutorial/wordcount/wordcount-shorthand.txt old mode 100644 new mode 100755 diff --git a/tutorial/wordcount/wordcount.txt b/tutorial/wordcount/wordcount.txt old mode 100644 new mode 100755