|
| 1 | +# |
| 2 | +# Licensed to the Apache Software Foundation (ASF) under one or more |
| 3 | +# contributor license agreements. See the NOTICE file distributed with |
| 4 | +# this work for additional information regarding copyright ownership. |
| 5 | +# The ASF licenses this file to You under the Apache License, Version 2.0 |
| 6 | +# (the "License"); you may not use this file except in compliance with |
| 7 | +# the License. You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | +# |
| 17 | + |
| 18 | +""" |
| 19 | +A collections of builtin avro functions |
| 20 | +""" |
| 21 | + |
| 22 | + |
| 23 | +from pyspark import since, SparkContext |
| 24 | +from pyspark.sql.column import Column, _to_java_column |
| 25 | +from pyspark.util import _print_missing_jar |
| 26 | + |
| 27 | + |
@since(3.0)
def from_avro(col, jsonFormatSchema, options=None):
    """
    Converts a binary column of avro format into its corresponding catalyst value. The specified
    schema must match the read data, otherwise the behavior is undefined: it may fail or return
    arbitrary result.

    Avro is built-in but external data source module since Spark 2.4. Please deploy the application
    as per the deployment section of "Apache Avro Data Source Guide".

    :param col: the binary column.
    :param jsonFormatSchema: the avro schema in JSON string format.
    :param options: options to control how the Avro record is parsed.

    >>> from pyspark.sql import Row
    >>> from pyspark.sql.avro.functions import from_avro, to_avro
    >>> data = [(1, Row(name='Alice', age=2))]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> avroDf = df.select(to_avro(df.value).alias("avro"))
    >>> avroDf.collect()
    [Row(avro=bytearray(b'\\x00\\x00\\x04\\x00\\nAlice'))]
    >>> jsonFormatSchema = '''{"type":"record","name":"topLevelRecord","fields":
    ...     [{"name":"avro","type":[{"type":"record","name":"value","namespace":"topLevelRecord",
    ...     "fields":[{"name":"age","type":["long","null"]},
    ...     {"name":"name","type":["string","null"]}]},"null"]}]}'''
    >>> avroDf.select(from_avro(avroDf.avro, jsonFormatSchema).alias("value")).collect()
    [Row(value=Row(avro=Row(age=2, name=u'Alice')))]
    """

    sc = SparkContext._active_spark_context
    try:
        # Normalize the default here instead of using a mutable default argument.
        jc = sc._jvm.org.apache.spark.sql.avro.functions.from_avro(
            _to_java_column(col), jsonFormatSchema, options or {})
    except TypeError as e:
        # The JVM-side function resolves to a plain JavaPackage when the
        # spark-avro jar is not on the classpath; explain how to add it.
        if str(e) == "'JavaPackage' object is not callable":
            _print_missing_jar("Avro", "avro", "avro", sc.version)
        raise
    return Column(jc)
| 66 | + |
| 67 | + |
@since(3.0)
def to_avro(col):
    """
    Converts a column into binary of avro format.

    Avro is built-in but external data source module since Spark 2.4. Please deploy the application
    as per the deployment section of "Apache Avro Data Source Guide".

    :param col: the data column.

    >>> from pyspark.sql import Row
    >>> from pyspark.sql.avro.functions import to_avro
    >>> data = [(1, Row(name='Alice', age=2))]
    >>> df = spark.createDataFrame(data, ("key", "value"))
    >>> df.select(to_avro(df.value).alias("avro")).collect()
    [Row(avro=bytearray(b'\\x00\\x00\\x04\\x00\\nAlice'))]
    """

    sc = SparkContext._active_spark_context
    try:
        jc = sc._jvm.org.apache.spark.sql.avro.functions.to_avro(_to_java_column(col))
    except TypeError as e:
        # The JVM-side function resolves to a plain JavaPackage when the
        # spark-avro jar is not on the classpath; explain how to add it.
        if str(e) == "'JavaPackage' object is not callable":
            _print_missing_jar("Avro", "avro", "avro", sc.version)
        raise
    return Column(jc)
| 94 | + |
| 95 | + |
def _test():
    """Run this module's doctests against a local SparkSession.

    Skips (exit code 0) when the optional spark-avro jar has not been built;
    otherwise injects the jar via PYSPARK_SUBMIT_ARGS before starting Spark.
    """
    import os
    import sys
    from pyspark.testing.utils import search_jar

    jar_path = search_jar("external/avro", "spark-avro")
    if jar_path is None:
        # The Avro module is optional; without its jar the doctests cannot run.
        print(
            "Skipping all Avro Python tests as the optional Avro project was "
            "not compiled into a JAR. To run these tests, "
            "you need to build Spark with 'build/sbt -Pavro package' or "
            "'build/mvn -Pavro package' before running this test.")
        sys.exit(0)
    else:
        # Prepend the jar to whatever submit args are already configured.
        submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS", "pyspark-shell")
        os.environ["PYSPARK_SUBMIT_ARGS"] = " ".join(
            ["--jars %s" % jar_path, submit_args])

    import doctest
    from pyspark.sql import Row, SparkSession
    import pyspark.sql.avro.functions

    globs = pyspark.sql.avro.functions.__dict__.copy()
    spark = (SparkSession.builder
             .master("local[4]")
             .appName("sql.avro.functions tests")
             .getOrCreate())
    globs['sc'] = spark.sparkContext
    globs['spark'] = spark
    globs['df'] = spark.createDataFrame(
        [Row(name='Alice', age=2), Row(name='Bob', age=5)])
    failure_count, test_count = doctest.testmod(
        pyspark.sql.avro.functions, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    spark.stop()
    if failure_count:
        sys.exit(-1)
| 131 | + |
| 132 | + |
| 133 | +if __name__ == "__main__": |
| 134 | + _test() |