API: rename labels to codes (#29509)

topper-123 · jreback · commit e56129323cfe · 2019-11-11T09:32:45.000-05:00
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -3,7 +3,7 @@
 intended for public consumption
 """
 from textwrap import dedent
-from typing import Dict
+from typing import Dict, Optional, Tuple, Union
 from warnings import catch_warnings, simplefilter, warn
 
 import numpy as np
@@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
 
     Returns
     -------
-    labels : ndarray
+    codes : ndarray
         An integer ndarray that's an indexer into `uniques`.
-        ``uniques.take(labels)`` will have the same values as `values`.
+        ``uniques.take(codes)`` will have the same values as `values`.
     uniques : ndarray, Index, or Categorical
         The unique valid values. When `values` is Categorical, `uniques`
         is a Categorical. When `values` is some other pandas object, an
@@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
     ``pd.factorize(values)``. The results are identical for methods like
     :meth:`Series.factorize`.
 
-    >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
-    >>> labels
+    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
+    >>> codes
     array([0, 0, 1, 2, 0])
     >>> uniques
     array(['b', 'a', 'c'], dtype=object)
 
-    With ``sort=True``, the `uniques` will be sorted, and `labels` will be
+    With ``sort=True``, the `uniques` will be sorted, and `codes` will be
     shuffled so that the relationship is the maintained.
 
-    >>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
-    >>> labels
+    >>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
+    >>> codes
     array([1, 1, 0, 2, 1])
     >>> uniques
     array(['a', 'b', 'c'], dtype=object)
 
-    Missing values are indicated in `labels` with `na_sentinel`
+    Missing values are indicated in `codes` with `na_sentinel`
     (``-1`` by default). Note that missing values are never
     included in `uniques`.
 
-    >>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
-    >>> labels
+    >>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
+    >>> codes
     array([ 0, -1,  1,  2,  0])
     >>> uniques
     array(['b', 'a', 'c'], dtype=object)
@@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
     will differ. For Categoricals, a `Categorical` is returned.
 
     >>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
-    >>> labels, uniques = pd.factorize(cat)
-    >>> labels
+    >>> codes, uniques = pd.factorize(cat)
+    >>> codes
     array([0, 0, 1])
     >>> uniques
     [a, c]
@@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
     returned.
 
     >>> cat = pd.Series(['a', 'a', 'c'])
-    >>> labels, uniques = pd.factorize(cat)
-    >>> labels
+    >>> codes, uniques = pd.factorize(cat)
+    >>> codes
     array([0, 0, 1])
     >>> uniques
     Index(['a', 'c'], dtype='object')
@@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
     sort=dedent(
         """\
     sort : bool, default False
-        Sort `uniques` and shuffle `labels` to maintain the
+        Sort `uniques` and shuffle `codes` to maintain the
         relationship.
     """
     ),
@@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
 )
 @Appender(_shared_docs["factorize"])
 @deprecate_kwarg(old_arg_name="order", new_arg_name=None)
-def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None):
+def factorize(
+    values,
+    sort: bool = False,
+    order=None,
+    na_sentinel: int = -1,
+    size_hint: Optional[int] = None,
+) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
     # Implementation notes: This method is responsible for 3 things
     # 1.) coercing data to array-like (ndarray, Index, extension array)
-    # 2.) factorizing labels and uniques
-    # 3.) Maybe boxing the output in an Index
+    # 2.) factorizing codes and uniques
+    # 3.) Maybe boxing the uniques in an Index
     #
     # Step 2 is dispatched to extension types (like Categorical). They are
     # responsible only for factorization. All data coercion, sorting and boxing
@@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
 
     if is_extension_array_dtype(values):
         values = extract_array(values)
-        labels, uniques = values.factorize(na_sentinel=na_sentinel)
+        codes, uniques = values.factorize(na_sentinel=na_sentinel)
         dtype = original.dtype
     else:
         values, dtype = _ensure_data(values)
@@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
         else:
             na_value = None
 
-        labels, uniques = _factorize_array(
+        codes, uniques = _factorize_array(
             values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
         )
 
     if sort and len(uniques) > 0:
-        uniques, labels = safe_sort(
-            uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
+        uniques, codes = safe_sort(
+            uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
 
     uniques = _reconstruct_data(uniques, dtype, original)
@@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
 
         uniques = Index(uniques)
 
-    return labels, uniques
+    return codes, uniques
 
 
 def value_counts(
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -690,11 +690,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
         Parameters
         ----------
         na_sentinel : int, default -1
-            Value to use in the `labels` array to indicate missing values.
+            Value to use in the `codes` array to indicate missing values.
 
         Returns
         -------
-        labels : ndarray
+        codes : ndarray
             An integer NumPy array that's an indexer into the original
             ExtensionArray.
         uniques : ExtensionArray
@@ -724,12 +724,12 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
         #    Complete control over factorization.
         arr, na_value = self._values_for_factorize()
 
-        labels, uniques = _factorize_array(
+        codes, uniques = _factorize_array(
             arr, na_sentinel=na_sentinel, na_value=na_value
         )
 
         uniques = self._from_factorized(uniques, self)
-        return labels, uniques
+        return codes, uniques
 
     _extension_array_shared_docs[
         "repeat"
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -710,11 +710,11 @@ def factorize(self, na_sentinel=-1):
         # Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
         # The sparsity on this is backwards from what Sparse would want. Want
         # ExtensionArray.factorize -> Tuple[EA, EA]
-        # Given that we have to return a dense array of labels, why bother
+        # Given that we have to return a dense array of codes, why bother
         # implementing an efficient factorize?
-        labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
+        codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
         uniques = SparseArray(uniques, dtype=self.dtype)
-        return labels, uniques
+        return codes, uniques
 
     def value_counts(self, dropna=True):
         """
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -1518,7 +1518,7 @@ def memory_usage(self, deep=False):
         sort=textwrap.dedent(
             """\
             sort : bool, default False
-                Sort `uniques` and shuffle `labels` to maintain the
+                Sort `uniques` and shuffle `codes` to maintain the
                 relationship.
             """
         ),
diff --git a/pandas/tests/arrays/categorical/test_algos.py b/pandas/tests/arrays/categorical/test_algos.py
@@ -11,23 +11,23 @@ def test_factorize(categories, ordered):
     cat = pd.Categorical(
         ["b", "b", "a", "c", None], categories=categories, ordered=ordered
     )
-    labels, uniques = pd.factorize(cat)
-    expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
+    codes, uniques = pd.factorize(cat)
+    expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
     expected_uniques = pd.Categorical(
         ["b", "a", "c"], categories=categories, ordered=ordered
     )
 
-    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_numpy_array_equal(codes, expected_codes)
     tm.assert_categorical_equal(uniques, expected_uniques)
 
 
 def test_factorized_sort():
     cat = pd.Categorical(["b", "b", None, "a"])
-    labels, uniques = pd.factorize(cat, sort=True)
-    expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
+    codes, uniques = pd.factorize(cat, sort=True)
+    expected_codes = np.array([1, 1, -1, 0], dtype=np.intp)
     expected_uniques = pd.Categorical(["a", "b"])
 
-    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_numpy_array_equal(codes, expected_codes)
     tm.assert_categorical_equal(uniques, expected_uniques)
 
 
@@ -36,13 +36,13 @@ def test_factorized_sort_ordered():
         ["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
     )
 
-    labels, uniques = pd.factorize(cat, sort=True)
-    expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
+    codes, uniques = pd.factorize(cat, sort=True)
+    expected_codes = np.array([0, 0, -1, 1], dtype=np.intp)
     expected_uniques = pd.Categorical(
         ["b", "a"], categories=["c", "b", "a"], ordered=True
     )
 
-    tm.assert_numpy_array_equal(labels, expected_labels)
+    tm.assert_numpy_array_equal(codes, expected_codes)
     tm.assert_categorical_equal(uniques, expected_uniques)
 
 
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -113,29 +113,29 @@ def test_unique(self, data, box, method):
 
     @pytest.mark.parametrize("na_sentinel", [-1, -2])
     def test_factorize(self, data_for_grouping, na_sentinel):
-        labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
-        expected_labels = np.array(
+        codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
+        expected_codes = np.array(
             [0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
         )
         expected_uniques = data_for_grouping.take([0, 4, 7])
 
-        tm.assert_numpy_array_equal(labels, expected_labels)
+        tm.assert_numpy_array_equal(codes, expected_codes)
         self.assert_extension_array_equal(uniques, expected_uniques)
 
     @pytest.mark.parametrize("na_sentinel", [-1, -2])
     def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
-        l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
-        l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
+        codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
+        codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
 
-        tm.assert_numpy_array_equal(l1, l2)
-        self.assert_extension_array_equal(u1, u2)
+        tm.assert_numpy_array_equal(codes_1, codes_2)
+        self.assert_extension_array_equal(uniques_1, uniques_2)
 
     def test_factorize_empty(self, data):
-        labels, uniques = pd.factorize(data[:0])
-        expected_labels = np.array([], dtype=np.intp)
+        codes, uniques = pd.factorize(data[:0])
+        expected_codes = np.array([], dtype=np.intp)
         expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)
 
-        tm.assert_numpy_array_equal(labels, expected_labels)
+        tm.assert_numpy_array_equal(codes, expected_codes)
         self.assert_extension_array_equal(uniques, expected_uniques)
 
     def test_fillna_copy_frame(self, data_missing):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py