@@ -375,6 +375,49 @@ fallback for type mismatches, leading to potential ambiguity and data loss. Addi
375375and tuples to strings can yield ambiguous results. Arrow Python UDFs, on the other hand, leverage Arrow's
376376capabilities to standardize type coercion and address these issues effectively.
377377
378+ A note on Arrow Python UDF type coercion: In Spark 4.1, unnecessary conversion to pandas instances is removed in the serializer
379+ when ``spark.sql.execution.pythonUDF.arrow.enabled `` is enabled. As a result, the type coercion changes
380+ when the produced output has a schema different from the specified schema. To restore the previous behavior,
381+ enable ``spark.sql.legacy.execution.pythonUDF.pandas.conversion.enabled ``. The behavior difference is summarized in the tables below.
382+
383+ Legacy type coercion:
384+
385+ .. csv-table ::
386+ :header: "SQL Type", "None", "True", "1", "a", "date", "datetime", "1.0", "array", "[1]", "(1,)", "bytearray", "Decimal", "dict"
387+ :widths: 12, 6, 6, 6, 6, 10, 12, 6, 8, 6, 6, 10, 8, 8
388+
389+ "boolean", "None", "True", "True", "X", "X", "X", "True", "X", "X", "X", "X", "X", "X"
390+ "tinyint", "None", "1", "1", "X", "X", "X", "1", "X", "X", "X", "X", "1", "X"
391+ "smallint", "None", "1", "1", "X", "X", "X", "1", "X", "X", "X", "X", "1", "X"
392+ "int", "None", "1", "1", "X", "0", "X", "1", "X", "X", "X", "X", "1", "X"
393+ "bigint", "None", "1", "1", "X", "X", "0", "1", "X", "X", "X", "X", "1", "X"
394+ "string", "None", "'True'", "'1'", "'a'", "'1970-01-01'", "'1970-01-01 00:00...'", "'1.0'", "\" array('i', [1])\" ", "'[1]'", "'(1,)'", "\" bytearray(b'ABC')\" ", "'1'", "\" {'a': 1}\" "
395+ "date", "None", "X", "X", "X", "datetime.date(197...)", "datetime.date(197...)", "X", "X", "X", "X", "X", "datetime.date(197...)", "X"
396+ "timestamp", "None", "X", "datetime.datetime...", "X", "X", "datetime.datetime...", "X", "X", "X", "X", "X", "datetime.datetime...", "X"
397+ "float", "None", "1.0", "1.0", "X", "X", "X", "1.0", "X", "X", "X", "X", "1.0", "X"
398+ "double", "None", "1.0", "1.0", "X", "X", "X", "1.0", "X", "X", "X", "X", "1.0", "X"
399+ "binary", "None", "bytearray(b'\\ x00')", "bytearray(b'\\ x00')", "X", "X", "X", "X", "bytearray(b'\\ x01\\ ...", "bytearray(b'\\ x01')", "bytearray(b'\\ x01')", "bytearray(b'ABC')", "X", "X"
400+ "decimal(10,0)", "None", "X", "X", "X", "X", "X", "Decimal('1')", "X", "X", "X", "X", "Decimal('1')", "X"
401+
402+ New type coercion:
403+
404+ .. csv-table ::
405+ :header: "SQL Type", "None", "True", "1", "a", "date", "datetime", "1.0", "array", "[1]", "(1,)", "bytearray", "Decimal", "dict"
406+ :widths: 12, 6, 6, 6, 6, 10, 12, 6, 8, 6, 6, 10, 8, 8
407+
408+ "boolean", "None", "True", "True", "X", "X", "X", "True", "X", "X", "X", "X", "X", "X"
409+ "tinyint", "None", "X", "1", "X", "X", "X", "1", "X", "X", "X", "X", "1", "X"
410+ "smallint", "None", "X", "1", "X", "X", "X", "1", "X", "X", "X", "X", "1", "X"
411+ "int", "None", "X", "1", "X", "0", "X", "1", "X", "X", "X", "X", "1", "X"
412+ "bigint", "None", "X", "1", "X", "X", "0", "1", "X", "X", "X", "X", "1", "X"
413+ "string", "None", "'true'", "'1'", "'a'", "'1970-01-01'", "'1970-01-01 00:00...'", "'1.0'", "\" array('i', [1])\" ", "'[1]'", "'(1,)'", "\" bytearray(b'ABC')\" ", "'1'", "\" {'a': 1}\" "
414+ "date", "None", "X", "datetime.date(197...)", "X", "datetime.date(197...)", "datetime.date(197...)", "datetime.date(197...)", "X", "X", "X", "X", "datetime.date(197...)", "X"
415+ "timestamp", "None", "X", "X", "X", "X", "datetime.datetime...", "X", "X", "X", "X", "X", "X", "X"
416+ "float", "None", "1.0", "1.0", "X", "X", "X", "1.0", "X", "X", "X", "X", "1.0", "X"
417+ "double", "None", "1.0", "1.0", "X", "X", "X", "1.0", "X", "X", "X", "X", "1.0", "X"
418+ "binary", "None", "X", "X", "X", "X", "X", "X", "X", "X", "X", "bytearray(b'ABC')", "X", "X"
419+ "decimal(10,0)", "None", "X", "X", "X", "X", "X", "X", "X", "X", "X", "X", "Decimal('1')", "X"
420+
378421Usage Notes
379422-----------
380423
0 commit comments