pandas-dev · jreback · Apr 5, 2019 · Apr 2, 2019
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -355,6 +355,7 @@ I/O
 - Improved performance in :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` when converting columns that have missing values (:issue:`25772`)
 - Bug in :func:`read_hdf` not properly closing store after a ``KeyError`` is raised (:issue:`25766`)
 - Bug in ``read_csv`` which would not raise ``ValueError`` if a column index in ``usecols`` was out of bounds (:issue:`25623`)
+- Improved the error message raised when value labels are repeated in Stata dta files so that it explains the failure and suggests work-arounds (:issue:`25772`)
 - Improved :meth:`pandas.read_stata` and :class:`pandas.io.stata.StataReader` to read incorrectly formatted 118 format files saved by Stata (:issue:`25960`)
 
 Plotting

diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -1719,10 +1719,19 @@ def _do_convert_categoricals(self, data, value_label_dict, lbllist,
                     vc = Series(categories).value_counts()
                     repeats = list(vc.index[vc > 1])
                     repeats = '-' * 80 + '\n' + '\n'.join(repeats)
-                    raise ValueError('Value labels for column {col} are not '
-                                     'unique. The repeated labels are:\n'
-                                     '{repeats}'
-                                     .format(col=col, repeats=repeats))
+                    # GH 25772
+                    msg = """
+Value labels for column {col} are not unique. These cannot be converted to
+pandas categoricals.
+
+Either read the file with `convert_categoricals` set to False or use the
+low level interface in `StataReader` to separately read the values and the
+value_labels.
+
+The repeated labels are:
+{repeats}
+"""
+                    raise ValueError(msg.format(col=col, repeats=repeats))
                 # TODO: is the next line needed above in the data(...) method?
                 cat_data = Series(cat_data, index=data.index)
                 cat_converted_data.append((col, cat_data))

diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py
@@ -1311,9 +1311,17 @@ def test_unsupported_datetype(self):
                 original.to_stata(path)
 
     def test_repeated_column_labels(self):
-        # GH 13923
-        msg = (r"Value labels for column ethnicsn are not unique\. The"
-               r" repeated labels are:\n-+\nwolof")
+        # GH 13923, 25772
+        msg = """
+Value labels for column ethnicsn are not unique. These cannot be converted to
+pandas categoricals.
+
+Either read the file with `convert_categoricals` set to False or use the
+low level interface in `StataReader` to separately read the values and the
+value_labels.
+
+The repeated labels are:\n-+\nwolof
+"""
         with pytest.raises(ValueError, match=msg):
             read_stata(self.dta23, convert_categoricals=True)