
Commit 9c21316

cocoatomo authored and Andrew Or committed
[PySpark] Fix tests with Python 2.6 in 1.0 branch
[SPARK-2951] [PySpark] support unpickle array.array for Python 2.6

Pyrolite cannot unpickle an array.array pickled by Python 2.6; this patch fixes that by extending Pyrolite. There is also a bug in Pyrolite when unpickling arrays of float/double; this patch works around it by reversing the endianness for float/double. The workaround should be removed once Pyrolite ships a release that fixes the issue.

[PySpark] [SPARK-2954] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 Fixes

- Modify python/run-tests to test with Python 2.6.
- Use unittest2 when running on Python 2.6.
- Fix an issue with namedtuple.
- Skip TestOutputFormat.test_newhadoop on Python 2.6 until SPARK-2951 is fixed.
- Fix MLlib _deserialize_double on Python 2.6.

[SPARK-3867][PySpark] ./python/run-tests failed when it run with Python 2.6 and unittest2 is not installed

./python/run-tests searches for a Python 2.6 executable on the PATH and uses it if available. When running under Python 2.6, the tests import the unittest2 module, which is not part of the Python 2.6 standard library, so they fail with an ImportError.

Author: cocoatomo <cocoatomo77@gmail.com>
Author: Josh Rosen <joshrosen@apache.org>
Author: Davies Liu <davies.liu@gmail.com>
Author: Davies Liu <davies@databricks.com>

Closes #3668 from davies/port_2365 and squashes the following commits:

b32583d [Davies Liu] rollback _common.py
bda1c72 [cocoatomo] [SPARK-3867][PySpark] ./python/run-tests failed when it run with Python 2.6 and unittest2 is not installed
14ad3d9 [Josh Rosen] [PySpark] [SPARK-2954] [SPARK-2948] [SPARK-2910] [SPARK-2101] Python 2.6 Fixes
7c55cff [Davies Liu] [SPARK-2951] [PySpark] support unpickle array.array for Python 2.6
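For context on the SPARK-2951 part: Python 2.6 and 2.7 reduce array.array to different pickle payloads, and Pyrolite at the time only understood the 2.7 shape. A minimal sketch of the difference (illustration only, not part of the commit; the exact __reduce__ output shown in the comments is recalled CPython behavior, so treat it as an assumption):

    import array

    a = array.array('d', [1.0, 2.0])
    print(a.__reduce__())
    # Python 2.6: (<type 'array.array'>, ('d', '<16 raw native-endian bytes>'))
    # Python 2.7: (<type 'array.array'>, ('d', [1.0, 2.0]))

Because the 2.6 payload is a raw byte string with no endianness marker, the unpickler has to know the host's byte order, which is exactly what the new ArrayConstructor below supplies.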
1 parent 3425ba8 commit 9c21316

File tree

core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala
python/pyspark/context.py
python/pyspark/mllib/tests.py
python/pyspark/tests.py
python/run-tests

5 files changed: +115 -4 lines changed

core/src/main/scala/org/apache/spark/api/python/SerDeUtil.scala

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.python
+
+import java.nio.ByteOrder
+
+import scala.collection.JavaConversions._
+import scala.util.Failure
+import scala.util.Try
+
+import net.razorvine.pickle.{Unpickler, Pickler}
+
+import org.apache.spark.{Logging, SparkException}
+import org.apache.spark.rdd.RDD
+
+/** Utilities for serialization / deserialization between Python and Java, using Pickle. */
+private[python] object SerDeUtil extends Logging {
+  // Unpickle array.array generated by Python 2.6
+  class ArrayConstructor extends net.razorvine.pickle.objects.ArrayConstructor {
+    //  /* Description of types */
+    //  static struct arraydescr descriptors[] = {
+    //    {'c', sizeof(char), c_getitem, c_setitem},
+    //    {'b', sizeof(char), b_getitem, b_setitem},
+    //    {'B', sizeof(char), BB_getitem, BB_setitem},
+    //  #ifdef Py_USING_UNICODE
+    //    {'u', sizeof(Py_UNICODE), u_getitem, u_setitem},
+    //  #endif
+    //    {'h', sizeof(short), h_getitem, h_setitem},
+    //    {'H', sizeof(short), HH_getitem, HH_setitem},
+    //    {'i', sizeof(int), i_getitem, i_setitem},
+    //    {'I', sizeof(int), II_getitem, II_setitem},
+    //    {'l', sizeof(long), l_getitem, l_setitem},
+    //    {'L', sizeof(long), LL_getitem, LL_setitem},
+    //    {'f', sizeof(float), f_getitem, f_setitem},
+    //    {'d', sizeof(double), d_getitem, d_setitem},
+    //    {'\0', 0, 0, 0} /* Sentinel */
+    //  };
+    // TODO: support Py_UNICODE with 2 bytes
+    // FIXME: unpickle array of float is wrong in Pyrolite, so we reverse the
+    // machine code for float/double here to workaround it.
+    // we should fix this after Pyrolite fix them
+    val machineCodes: Map[Char, Int] = if (ByteOrder.nativeOrder().equals(ByteOrder.BIG_ENDIAN)) {
+      Map('c' -> 1, 'B' -> 0, 'b' -> 1, 'H' -> 3, 'h' -> 5, 'I' -> 7, 'i' -> 9,
+        'L' -> 11, 'l' -> 13, 'f' -> 14, 'd' -> 16, 'u' -> 21
+      )
+    } else {
+      Map('c' -> 1, 'B' -> 0, 'b' -> 1, 'H' -> 2, 'h' -> 4, 'I' -> 6, 'i' -> 8,
+        'L' -> 10, 'l' -> 12, 'f' -> 15, 'd' -> 17, 'u' -> 20
+      )
+    }
+    override def construct(args: Array[Object]): Object = {
+      if (args.length == 1) {
+        construct(args ++ Array(""))
+      } else if (args.length == 2 && args(1).isInstanceOf[String]) {
+        val typecode = args(0).asInstanceOf[String].charAt(0)
+        val data: String = args(1).asInstanceOf[String]
+        construct(typecode, machineCodes(typecode), data.getBytes("ISO-8859-1"))
+      } else {
+        super.construct(args)
+      }
+    }
+  }
+
+  def initialize() = {
+    Unpickler.registerConstructor("array", "array", new ArrayConstructor())
+  }
+}
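A rough Python analogue of what the overridden construct() does with the 2.6-style (typecode, string) arguments (hypothetical helper for illustration; construct_from_py26 is not part of the commit):

    import array

    def construct_from_py26(typecode, data_str):
        # The string carries raw machine bytes; Python 2's fromstring()
        # reads them back in native byte order, much as the Scala code does
        # after decoding the string with the byte-preserving ISO-8859-1 charset.
        arr = array.array(str(typecode))
        arr.fromstring(data_str)  # Python 2 API; frombytes() in Python 3
        return arr

Note also that machineCodes intentionally assigns the opposite-endian machine codes to 'f' and 'd' in each branch; that is the FIXME's workaround for Pyrolite reading float/double arrays with reversed byte order.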

python/pyspark/context.py

Lines changed: 1 addition & 0 deletions

@@ -190,6 +190,7 @@ def _ensure_initialized(cls, instance=None, gateway=None):
         SparkContext._gateway = gateway or launch_gateway()
         SparkContext._jvm = SparkContext._gateway.jvm
         SparkContext._writeToFile = SparkContext._jvm.PythonRDD.writeToFile
+        SparkContext._jvm.SerDeUtil.initialize()
 
         if instance:
             if SparkContext._active_spark_context and SparkContext._active_spark_context != instance:
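The single added line above is what activates the fix end to end: as soon as the Py4J gateway is up, the JVM-side SerDeUtil.initialize() registers the custom ArrayConstructor with Pyrolite's Unpickler, before any pickled data from Python workers reaches the JVM. A hedged sketch of the equivalent call made directly against a Py4J gateway (assumes a Spark JVM with a Py4J server is already listening; PySpark's own gateway imports the class, which is why the short SerDeUtil name works in the diff above):

    from py4j.java_gateway import JavaGateway

    gateway = JavaGateway()  # assumption: connects to an already-running JVM
    gateway.jvm.org.apache.spark.api.python.SerDeUtil.initialize()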

python/pyspark/mllib/tests.py

Lines changed: 10 additions & 1 deletion

@@ -19,8 +19,17 @@
 Fuller unit tests for Python MLlib.
 """
 
+import sys
 from numpy import array, array_equal
-import unittest
+
+if sys.version_info[:2] <= (2, 6):
+    try:
+        import unittest2 as unittest
+    except ImportError:
+        sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier')
+        sys.exit(1)
+else:
+    import unittest
 
 from pyspark.mllib._common import _convert_vector, _serialize_double_vector, \
     _deserialize_double_vector, _dot, _squared_distance
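The unittest2 shim above also enables the skip decorators the commit message mentions (skipping TestOutputFormat.test_newhadoop on 2.6): the stdlib unittest only gained @skipIf and friends in Python 2.7, and unittest2 backports them. A hypothetical test using the shim (illustration only, not from the commit):

    import sys

    class SerDeTests(unittest.TestCase):  # unittest is the shimmed import above
        @unittest.skipIf(sys.version_info[:2] <= (2, 6),
                         "array.array round-trip needs the SPARK-2951 fix")
        def test_placeholder(self):
            self.assertTrue(True)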

python/pyspark/tests.py

Lines changed: 13 additions & 3 deletions

@@ -28,9 +28,18 @@
 import sys
 import tempfile
 import time
-import unittest
 import zipfile
 
+if sys.version_info[:2] <= (2, 6):
+    try:
+        import unittest2 as unittest
+    except ImportError:
+        sys.stderr.write('Please install unittest2 to test with Python 2.6 or earlier')
+        sys.exit(1)
+else:
+    import unittest
+
+
 from pyspark.context import SparkContext
 from pyspark.files import SparkFiles
 from pyspark.serializers import read_int

@@ -291,8 +300,9 @@ def createFileInZip(self, name, content):
         pattern = re.compile(r'^ *\|', re.MULTILINE)
         content = re.sub(pattern, '', content.strip())
         path = os.path.join(self.programDir, name + ".zip")
-        with zipfile.ZipFile(path, 'w') as zip:
-            zip.writestr(name, content)
+        zip = zipfile.ZipFile(path, 'w')
+        zip.writestr(name, content)
+        zip.close()
         return path
 
     def test_single_script(self):
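The createFileInZip change above is not stylistic: zipfile.ZipFile only gained context-manager support (__enter__/__exit__) in Python 2.7, so the with form fails on 2.6. A slightly more defensive sketch of the same fix, guaranteeing the close even if the write raises (illustration, not the commit's code):

    import zipfile

    def create_zip(path, name, content):
        zf = zipfile.ZipFile(path, 'w')
        try:
            zf.writestr(name, content)  # same write the test helper performs
        finally:
            zf.close()  # explicit close; 'with zipfile.ZipFile(...)' needs Python >= 2.7
        return path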

python/run-tests

Lines changed: 8 additions & 0 deletions

@@ -49,6 +49,14 @@ function run_test() {
 
 echo "Running PySpark tests. Output is in python/unit-tests.log."
 
+# Try to test with Python 2.6, since that's the minimum version that we support:
+if [ $(which python2.6) ]; then
+    export PYSPARK_PYTHON="python2.6"
+fi
+
+echo "Testing with Python version:"
+$PYSPARK_PYTHON --version
+
 run_test "pyspark/rdd.py"
 run_test "pyspark/context.py"
 run_test "pyspark/conf.py"
