Skip to content

Commit 05ef1c8

Browse files
author
Davies Liu
committed
infer LongType for int in Python
1 parent 36e15b4 commit 05ef1c8

File tree

4 files changed

+28
-5
lines changed

4 files changed

+28
-5
lines changed

python/pyspark/sql.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -577,7 +577,7 @@ def _parse_datatype_json_value(json_value):
577577
_type_mappings = {
578578
type(None): NullType,
579579
bool: BooleanType,
580-
int: IntegerType,
580+
int: LongType,
581581
long: LongType,
582582
float: DoubleType,
583583
str: StringType,
@@ -926,11 +926,11 @@ def _infer_schema_type(obj, dataType):
926926
>>> schema = _parse_schema_abstract("a b c d")
927927
>>> row = (1, 1.0, "str", datetime.date(2014, 10, 10))
928928
>>> _infer_schema_type(row, schema)
929-
StructType...IntegerType...DoubleType...StringType...DateType...
929+
StructType...LongType...DoubleType...StringType...DateType...
930930
>>> row = [[1], {"key": (1, 2.0)}]
931931
>>> schema = _parse_schema_abstract("a[] b{c d}")
932932
>>> _infer_schema_type(row, schema)
933-
StructType...a,ArrayType...b,MapType(StringType,...c,IntegerType...
933+
StructType...a,ArrayType...b,MapType(StringType,...c,LongType...
934934
"""
935935
if dataType is None:
936936
return _infer_type(obj)
@@ -985,7 +985,7 @@ def _verify_type(obj, dataType):
985985
986986
>>> _verify_type(None, StructType([]))
987987
>>> _verify_type("", StringType())
988-
>>> _verify_type(0, IntegerType())
988+
>>> _verify_type(0, LongType())
989989
>>> _verify_type(range(3), ArrayType(ShortType()))
990990
>>> _verify_type(set(), ArrayType(StringType())) # doctest: +IGNORE_EXCEPTION_DETAIL
991991
Traceback (most recent call last):

python/pyspark/tests.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@
5151
CloudPickleSerializer, CompressedSerializer
5252
from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger, ExternalSorter
5353
from pyspark.sql import SQLContext, IntegerType, Row, ArrayType, StructType, StructField, \
54-
UserDefinedType, DoubleType
54+
UserDefinedType, DoubleType, LongType, _infer_type
5555
from pyspark import shuffle
5656

5757
_have_scipy = False
@@ -985,6 +985,27 @@ def test_parquet_with_udt(self):
985985
point = srdd1.first().point
986986
self.assertEquals(point, ExamplePoint(1.0, 2.0))
987987

988+
def test_infer_long_type(self):
989+
longrow = [Row(f1='a', f2=100000000000000)]
990+
rdd = self.sc.parallelize(longrow)
991+
srdd = self.sqlCtx.inferSchema(rdd)
992+
self.assertEqual(srdd.schema().fields[1].dataType, LongType())
993+
994+
# this saving as Parquet caused issues as well.
995+
output_dir = os.path.join(self.tempdir.name, "infer_long_type")
996+
srdd.saveAsParquetFile(output_dir)
997+
df1 = self.sqlCtx.parquetFile(output_dir)
998+
self.assertEquals('a', df1.first().f1)
999+
self.assertEquals(100000000000000, df1.first().f2)
1000+
1001+
self.assertEqual(_infer_type(1), LongType())
1002+
self.assertEqual(_infer_type(2**10), LongType())
1003+
self.assertEqual(_infer_type(2**20), LongType())
1004+
self.assertEqual(_infer_type(2**31 - 1), LongType())
1005+
self.assertEqual(_infer_type(2**31), LongType())
1006+
self.assertEqual(_infer_type(2**61), LongType())
1007+
self.assertEqual(_infer_type(2**71), LongType())
1008+
9881009

9891010
class InputFormatTests(ReusedPySparkTestCase):
9901011

sql/core/src/main/scala/org/apache/spark/sql/SQLContext.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -479,6 +479,7 @@ class SQLContext(@transient val sparkContext: SparkContext)
479479
case ByteType => true
480480
case ShortType => true
481481
case FloatType => true
482+
case LongType => true
482483
case DateType => true
483484
case TimestampType => true
484485
case ArrayType(_, _) => true

sql/core/src/main/scala/org/apache/spark/sql/execution/pythonUdfs.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -187,6 +187,7 @@ object EvaluatePython {
187187
case (c: Int, ShortType) => c.toShort
188188
case (c: Long, ShortType) => c.toShort
189189
case (c: Long, IntegerType) => c.toInt
190+
case (c: Int, LongType) => c.toLong
190191
case (c: Double, FloatType) => c.toFloat
191192
case (c, StringType) if !c.isInstanceOf[String] => c.toString
192193

0 commit comments

Comments
 (0)