Skip to content

Commit db06a81

Browse files
committed
[PySpark] [SPARK-2954] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 Fixes
- Modify python/run-tests to test with Python 2.6.
- Use unittest2 when running on Python 2.6.
- Fix issue with namedtuple.
- Skip TestOutputFormat.test_newhadoop on Python 2.6 until SPARK-2951 is fixed.
- Fix MLlib _deserialize_double on Python 2.6.

Closes #1868. Closes #1042.

Author: Josh Rosen <joshrosen@apache.org>

Closes #1874 from JoshRosen/python2.6 and squashes the following commits:

983d259 [Josh Rosen] [SPARK-2954] Fix MLlib _deserialize_double on Python 2.6.
5d18fd7 [Josh Rosen] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 fixes
1 parent ba28a8f commit db06a81

File tree

5 files changed

+36
-7
lines changed

5 files changed

+36
-7
lines changed

python/pyspark/mllib/_common.py

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -16,6 +16,7 @@
1616
#
1717

1818
import struct
19+
import sys
1920
import numpy
2021
from numpy import ndarray, float64, int64, int32, array_equal, array
2122
from pyspark import SparkContext, RDD
@@ -78,6 +79,14 @@
7879
LABELED_POINT_MAGIC = 4
7980

8081

82+
# Workaround for SPARK-2954: before Python 2.7, struct.unpack couldn't unpack bytearray()s.
83+
if sys.version_info[:2] <= (2, 6):
84+
def _unpack(fmt, string):
85+
return struct.unpack(fmt, buffer(string))
86+
else:
87+
_unpack = struct.unpack
88+
89+
8190
def _deserialize_numpy_array(shape, ba, offset, dtype=float64):
8291
"""
8392
Deserialize a numpy array of the given type from an offset in
@@ -191,7 +200,7 @@ def _deserialize_double(ba, offset=0):
191200
raise TypeError("_deserialize_double called on a %s; wanted bytearray" % type(ba))
192201
if len(ba) - offset != 8:
193202
raise TypeError("_deserialize_double called on a %d-byte array; wanted 8 bytes." % nb)
194-
return struct.unpack("d", ba[offset:])[0]
203+
return _unpack("d", ba[offset:])[0]
195204

196205

197206
def _deserialize_double_vector(ba, offset=0):

python/pyspark/mllib/tests.py

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -19,8 +19,13 @@
1919
Fuller unit tests for Python MLlib.
2020
"""
2121

22+
import sys
2223
from numpy import array, array_equal
23-
import unittest
24+
25+
if sys.version_info[:2] <= (2, 6):
26+
import unittest2 as unittest
27+
else:
28+
import unittest
2429

2530
from pyspark.mllib._common import _convert_vector, _serialize_double_vector, \
2631
_deserialize_double_vector, _dot, _squared_distance

python/pyspark/serializers.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -314,8 +314,8 @@ def _copy_func(f):
314314

315315
_old_namedtuple = _copy_func(collections.namedtuple)
316316

317-
def namedtuple(name, fields, verbose=False, rename=False):
318-
cls = _old_namedtuple(name, fields, verbose, rename)
317+
def namedtuple(*args, **kwargs):
318+
cls = _old_namedtuple(*args, **kwargs)
319319
return _hack_namedtuple(cls)
320320

321321
# replace namedtuple with new one

python/pyspark/tests.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -29,9 +29,14 @@
2929
import sys
3030
import tempfile
3131
import time
32-
import unittest
3332
import zipfile
3433

34+
if sys.version_info[:2] <= (2, 6):
35+
import unittest2 as unittest
36+
else:
37+
import unittest
38+
39+
3540
from pyspark.context import SparkContext
3641
from pyspark.files import SparkFiles
3742
from pyspark.serializers import read_int
@@ -605,6 +610,7 @@ def test_oldhadoop(self):
605610
conf=input_conf).collect())
606611
self.assertEqual(old_dataset, dict_data)
607612

613+
@unittest.skipIf(sys.version_info[:2] <= (2, 6), "Skipped on 2.6 until SPARK-2951 is fixed")
608614
def test_newhadoop(self):
609615
basepath = self.tempdir.name
610616
# use custom ArrayWritable types and converters to handle arrays
@@ -905,8 +911,9 @@ def createFileInZip(self, name, content):
905911
pattern = re.compile(r'^ *\|', re.MULTILINE)
906912
content = re.sub(pattern, '', content.strip())
907913
path = os.path.join(self.programDir, name + ".zip")
908-
with zipfile.ZipFile(path, 'w') as zip:
909-
zip.writestr(name, content)
914+
zip = zipfile.ZipFile(path, 'w')
915+
zip.writestr(name, content)
916+
zip.close()
910917
return path
911918

912919
def test_single_script(self):

python/run-tests

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -48,6 +48,14 @@ function run_test() {
4848

4949
echo "Running PySpark tests. Output is in python/unit-tests.log."
5050

51+
# Try to test with Python 2.6, since that's the minimum version that we support:
52+
if [ $(which python2.6) ]; then
53+
export PYSPARK_PYTHON="python2.6"
54+
fi
55+
56+
echo "Testing with Python version:"
57+
$PYSPARK_PYTHON --version
58+
5159
run_test "pyspark/rdd.py"
5260
run_test "pyspark/context.py"
5361
run_test "pyspark/conf.py"

0 commit comments

Comments (0)