Description
Calling `read_pickle` to load a DataFrame with MultiIndex columns from a pickle file created in py27 throws an error saying:
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd9 in position 0: ordinal not in range(128)
This issue was introduced by #28645, more specifically by this change:
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index adf0aa961..8f9bae0f7 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -142,18 +142,24 @@ def read_pickle(path, compression="infer"):
# 1) try standard library Pickle
# 2) try pickle_compat (older pandas version) to handle subclass changes
- # 3) try pickle_compat with latin1 encoding
+
+ excs_to_catch = (AttributeError, ImportError)
+ if PY36:
+ excs_to_catch += (ModuleNotFoundError,)
try:
with warnings.catch_warnings(record=True):
# We want to silence any warnings about, e.g. moved modules.
warnings.simplefilter("ignore", Warning)
return pickle.load(f)
- except Exception:
- try:
- return pc.load(f, encoding=None)
- except Exception:
- return pc.load(f, encoding="latin1")
+ except excs_to_catch:
+ # e.g.
+ # "No module named 'pandas.core.sparse.series'"
+ # "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
+ return pc.load(f, encoding=None)
+ except UnicodeDecodeError:
+ # e.g. can occur for files written in py27; see GH#28645
+ return pc.load(f, encoding="latin-1")
finally:
f.close()
for _f in fh:
Note how previously, when the builtin `pickle.load` function raised an Exception while loading the file, it would try to load the file using `pickle_compat` with `encoding=None`, and if that call also threw an Exception, it would then fall back to loading with `encoding="latin-1"`. With the change from #28645, the fallback to `encoding="latin-1"` is only in the except block of the initial `pickle.load` call, not the second one that tries to use `pickle_compat` with `encoding=None`.
This became an issue for py27 pickles with MultiIndex columns after `FrozenNDArray` was removed by #29840, as `pickle.load(f)` throws an `AttributeError` for `FrozenNDArray` and then `pc.load(f, encoding=None)` throws a `UnicodeDecodeError`.
Here is a full stack trace:
In [1]: import pandas as pd
...: df = pd.read_pickle('test_mi_py27.pkl')
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~/anaconda3/envs/pandas-pickle/lib/python3.7/site-packages/pandas/io/pickle.py in read_pickle(filepath_or_buffer, compression)
180 warnings.simplefilter("ignore", Warning)
--> 181 return pickle.load(f)
182 except excs_to_catch:
AttributeError: Can't get attribute 'FrozenNDArray' on <module 'pandas.core.indexes.frozen' from '~/anaconda3/envs/pandas-pickle/lib/python3.7/site-packages/pandas/core/indexes/frozen.py'>
During handling of the above exception, another exception occurred:
UnicodeDecodeError Traceback (most recent call last)
<ipython-input-1-8c3800ea96cb> in <module>
1 import pandas as pd
----> 2 df = pd.read_pickle('test_mi_py27.pkl')
~/anaconda3/envs/pandas-pickle/lib/python3.7/site-packages/pandas/io/pickle.py in read_pickle(filepath_or_buffer, compression)
184 # "No module named 'pandas.core.sparse.series'"
185 # "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
--> 186 return pc.load(f, encoding=None)
187 except UnicodeDecodeError:
188 # e.g. can occur for files written in py27; see GH#28645
~/anaconda3/envs/pandas-pickle/lib/python3.7/site-packages/pandas/compat/pickle_compat.py in load(fh, encoding, is_verbose)
239 up.is_verbose = is_verbose
240
--> 241 return up.load()
242 except (ValueError, TypeError):
243 raise
~/anaconda3/envs/pandas-pickle/lib/python3.7/pickle.py in load(self)
1086 raise EOFError
1087 assert isinstance(key, bytes_types)
-> 1088 dispatch[key[0]](self)
1089 except _Stop as stopinst:
1090 return stopinst.value
~/anaconda3/envs/pandas-pickle/lib/python3.7/pickle.py in load_short_binstring(self)
1262 len = self.read(1)[0]
1263 data = self.read(len)
-> 1264 self.append(self._decode_string(data))
1265 dispatch[SHORT_BINSTRING[0]] = load_short_binstring
1266
~/anaconda3/envs/pandas-pickle/lib/python3.7/pickle.py in _decode_string(self, value)
1202 return value
1203 else:
-> 1204 return value.decode(self.encoding, self.errors)
1205
1206 def load_string(self):
UnicodeDecodeError: 'ascii' codec can't decode byte 0xd9 in position 0: ordinal not in range(128)
I believe the fix for this specific issue can be as simple as:
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index e51f24b55..5c4f2d8c4 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -183,7 +183,11 @@ def read_pickle(
# e.g.
# "No module named 'pandas.core.sparse.series'"
# "Can't get attribute '__nat_unpickle' on <module 'pandas._libs.tslib"
- return pc.load(f, encoding=None)
+ try:
+ return pc.load(f, encoding=None)
+ except UnicodeDecodeError:
+ # e.g. can occur for files written in py27;
+ return pc.load(f, encoding="latin-1")
except UnicodeDecodeError:
# e.g. can occur for files written in py27; see GH#28645
return pc.load(f, encoding="latin-1")
I will open a PR with the fix above over the weekend.
Thanks!