Commit 10bc597

Davies Liu authored and markhamstra committed
[SPARK-5722] [SQL] [PySpark] infer int as LongType in Python (for 1.2 branch)
This PR changes inferSchema() to use LongType for int in Python, because IntegerType in SQL is not large enough for Python's int (which is 64-bit on 64-bit machines).

Closes apache#4521

cc dondrake marmbrus

Author: Davies Liu <davies@databricks.com>

Closes apache#4681 from davies/long2 and squashes the following commits:

05ef1c8 [Davies Liu] infer LongType for int in Python
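To see the motivation concretely, here is a minimal sketch (not part of the patch) of why a Python int can overflow SQL's 32-bit IntegerType while fitting in the 64-bit LongType:

```python
# An ordinary Python int on a 64-bit machine can exceed 32 bits.
n = 100000000000000        # the value used in the new regression test below

print(n > 2**31 - 1)       # True: too large for a 32-bit IntegerType
print(n <= 2**63 - 1)      # True: fits in a 64-bit LongType
```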
1 parent d4ab71a commit 10bc597

File tree

4 files changed (+28 lines, -5 lines)

python/pyspark/sql.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -577,7 +577,7 @@ def _parse_datatype_json_value(json_value):
 _type_mappings = {
     type(None): NullType,
     bool: BooleanType,
-    int: IntegerType,
+    int: LongType,
     long: LongType,
     float: DoubleType,
     str: StringType,
```
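For context, here is a simplified, self-contained sketch (assumed shape, not the exact 1.2 source) of how a `_type_mappings`-style dict drives inference: look up the value's Python type and return the mapped SQL type, so changing the one `int` entry retargets every inferred int:

```python
# Simplified sketch of _type_mappings-driven inference; string names stand
# in for pyspark.sql's real type classes (an assumption for self-containment).
_type_mappings = {
    type(None): "NullType",
    bool: "BooleanType",
    int: "LongType",      # this commit's change: int now infers as LongType
    float: "DoubleType",
    str: "StringType",
}

def infer_type_name(obj):
    """Look up the SQL type name for a Python value's exact type."""
    name = _type_mappings.get(type(obj))
    if name is None:
        raise TypeError("not supported type: %s" % type(obj))
    return name

print(infer_type_name(2**40))   # LongType
```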
```diff
@@ -926,11 +926,11 @@ def _infer_schema_type(obj, dataType):
     >>> schema = _parse_schema_abstract("a b c d")
     >>> row = (1, 1.0, "str", datetime.date(2014, 10, 10))
     >>> _infer_schema_type(row, schema)
-    StructType...IntegerType...DoubleType...StringType...DateType...
+    StructType...LongType...DoubleType...StringType...DateType...
     >>> row = [[1], {"key": (1, 2.0)}]
     >>> schema = _parse_schema_abstract("a[] b{c d}")
     >>> _infer_schema_type(row, schema)
-    StructType...a,ArrayType...b,MapType(StringType,...c,IntegerType...
+    StructType...a,ArrayType...b,MapType(StringType,...c,LongType...
     """
     if dataType is None:
         return _infer_type(obj)
```
```diff
@@ -985,7 +985,7 @@ def _verify_type(obj, dataType):
 
     >>> _verify_type(None, StructType([]))
     >>> _verify_type("", StringType())
-    >>> _verify_type(0, IntegerType())
+    >>> _verify_type(0, LongType())
     >>> _verify_type(range(3), ArrayType(ShortType()))
     >>> _verify_type(set(), ArrayType(StringType())) # doctest: +IGNORE_EXCEPTION_DETAIL
     Traceback (most recent call last):
```
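The doctest update reflects that a plain 0 now verifies against LongType. A hedged, self-contained sketch of this kind of check (names assumed; the real `_verify_type` in pyspark.sql handles nested types and more cases):

```python
# Minimal sketch of a _verify_type-style check: confirm a Python value is
# acceptable for a declared SQL type before it is serialized.
_acceptable_python_types = {
    "LongType": (int,),        # after this commit, a plain int verifies as LongType
    "DoubleType": (float,),
    "StringType": (str,),
}

def verify_type(obj, sql_type_name):
    if obj is None:
        return  # None is acceptable for any nullable field
    if not isinstance(obj, _acceptable_python_types[sql_type_name]):
        raise TypeError("%r is not an instance of %s" % (obj, sql_type_name))

verify_type(0, "LongType")     # passes, mirroring the updated doctest
verify_type("", "StringType")  # passes
```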

python/pyspark/tests.py

Lines changed: 22 additions & 1 deletion
```diff
@@ -51,7 +51,7 @@
     CloudPickleSerializer, CompressedSerializer
 from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter
 from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \
-    UserDefinedType, DoubleType
+    UserDefinedType, DoubleType, LongType, _infer_type
 from pyspark import shuffle
 
 _have_scipy = False
@@ -985,6 +985,27 @@ def test_parquet_with_udt(self):
         point = srdd1.first().point
         self.assertEquals(point, ExamplePoint(1.0, 2.0))
 
+    def test_infer_long_type(self):
+        longrow = [Row(f1='a', f2=100000000000000)]
+        rdd = self.sc.parallelize(longrow)
+        srdd = self.sqlCtx.inferSchema(rdd)
+        self.assertEqual(srdd.schema().fields[1].dataType, LongType())
+
+        # this saving as Parquet caused issues as well.
+        output_dir = os.path.join(self.tempdir.name, "infer_long_type")
+        srdd.saveAsParquetFile(output_dir)
+        df1 = self.sqlCtx.parquetFile(output_dir)
+        self.assertEquals('a', df1.first().f1)
+        self.assertEquals(100000000000000, df1.first().f2)
+
+        self.assertEqual(_infer_type(1), LongType())
+        self.assertEqual(_infer_type(2**10), LongType())
+        self.assertEqual(_infer_type(2**20), LongType())
+        self.assertEqual(_infer_type(2**31 - 1), LongType())
+        self.assertEqual(_infer_type(2**31), LongType())
+        self.assertEqual(_infer_type(2**61), LongType())
+        self.assertEqual(_infer_type(2**71), LongType())
+
 
 class InputFormatTests(ReusedPySparkTestCase):
 
```
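One detail worth noting from the assertions above: inference is purely type-based, so even 2**71, which does not fit in a signed 64-bit long, still infers as LongType. The quick bound check below is an observation, not part of the patch:

```python
# 2**71 is inferred as LongType by type lookup alone, even though it
# exceeds the signed 64-bit range that LongType can actually store.
print(2**71 > 2**63 - 1)   # True
```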

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 1 addition & 0 deletions
```diff
@@ -479,6 +479,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
     case ByteType => true
     case ShortType => true
     case FloatType => true
+    case LongType => true
     case DateType => true
     case TimestampType => true
     case ArrayType(_, _) => true
```

sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala

Lines changed: 1 addition & 0 deletions
```diff
@@ -187,6 +187,7 @@ object EvaluatePython {
     case (c: Int, ShortType) => c.toShort
     case (c: Long, ShortType) => c.toShort
     case (c: Long, IntegerType) => c.toInt
+    case (c: Int, LongType) => c.toLong
     case (c: Double, FloatType) => c.toFloat
     case (c, StringType) if !c.isInstanceOf[String] => c.toString
```
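The new `case (c: Int, LongType) => c.toLong` matters because the JVM side can receive either a 32-bit Int or a 64-bit Long for a column now inferred as LongType: pickle encodes small and large Python ints with different opcodes, and the unpickler on the JVM side maps them to different Java types (my reading of the pickling path; this mapping is an assumption, not stated in the commit). A quick way to see the two encodings from Python:

```python
# Small ints pickle as BININT* opcodes (32-bit payloads), large ones as
# LONG1, so the same LongType column can surface as Int or Long on the
# JVM side (assumed mapping by the JVM-side unpickler).
import pickle
import pickletools

for n in (1, 2**40):
    ops = [op.name for op, arg, pos in pickletools.genops(pickle.dumps(n, 2))]
    print(n, ops)
# 1             ['PROTO', 'BININT1', 'STOP']
# 1099511627776 ['PROTO', 'LONG1', 'STOP']
```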
