Skip to content

Commit

Permalink
API: rename labels to codes (#29509)
Browse files Browse the repository at this point in the history
  • Loading branch information
topper-123 authored and jreback committed Nov 11, 2019
1 parent bcbc468 commit e561293
Show file tree
Hide file tree
Showing 8 changed files with 133 additions and 127 deletions.
54 changes: 30 additions & 24 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
intended for public consumption
"""
from textwrap import dedent
from typing import Dict
from typing import Dict, Optional, Tuple, Union
from warnings import catch_warnings, simplefilter, warn

import numpy as np
Expand Down Expand Up @@ -501,9 +501,9 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
Returns
-------
labels : ndarray
codes : ndarray
An integer ndarray that's an indexer into `uniques`.
``uniques.take(labels)`` will have the same values as `values`.
``uniques.take(codes)`` will have the same values as `values`.
uniques : ndarray, Index, or Categorical
The unique valid values. When `values` is Categorical, `uniques`
is a Categorical. When `values` is some other pandas object, an
Expand All @@ -525,27 +525,27 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
``pd.factorize(values)``. The results are identical for methods like
:meth:`Series.factorize`.
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
>>> labels
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'])
>>> codes
array([0, 0, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)
With ``sort=True``, the `uniques` will be sorted, and `labels` will be
With ``sort=True``, the `uniques` will be sorted, and `codes` will be
shuffled so that the relationship is the maintained.
>>> labels, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> labels
>>> codes, uniques = pd.factorize(['b', 'b', 'a', 'c', 'b'], sort=True)
>>> codes
array([1, 1, 0, 2, 1])
>>> uniques
array(['a', 'b', 'c'], dtype=object)
Missing values are indicated in `labels` with `na_sentinel`
Missing values are indicated in `codes` with `na_sentinel`
(``-1`` by default). Note that missing values are never
included in `uniques`.
>>> labels, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
>>> labels
>>> codes, uniques = pd.factorize(['b', None, 'a', 'c', 'b'])
>>> codes
array([ 0, -1, 1, 2, 0])
>>> uniques
array(['b', 'a', 'c'], dtype=object)
Expand All @@ -555,8 +555,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
will differ. For Categoricals, a `Categorical` is returned.
>>> cat = pd.Categorical(['a', 'a', 'c'], categories=['a', 'b', 'c'])
>>> labels, uniques = pd.factorize(cat)
>>> labels
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
[a, c]
Expand All @@ -569,8 +569,8 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
returned.
>>> cat = pd.Series(['a', 'a', 'c'])
>>> labels, uniques = pd.factorize(cat)
>>> labels
>>> codes, uniques = pd.factorize(cat)
>>> codes
array([0, 0, 1])
>>> uniques
Index(['a', 'c'], dtype='object')
Expand All @@ -596,7 +596,7 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
sort=dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `labels` to maintain the
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
Expand All @@ -609,11 +609,17 @@ def _factorize_array(values, na_sentinel: int = -1, size_hint=None, na_value=Non
)
@Appender(_shared_docs["factorize"])
@deprecate_kwarg(old_arg_name="order", new_arg_name=None)
def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=None):
def factorize(
values,
sort: bool = False,
order=None,
na_sentinel: int = -1,
size_hint: Optional[int] = None,
) -> Tuple[np.ndarray, Union[np.ndarray, ABCIndex]]:
# Implementation notes: This method is responsible for 3 things
# 1.) coercing data to array-like (ndarray, Index, extension array)
# 2.) factorizing labels and uniques
# 3.) Maybe boxing the output in an Index
# 2.) factorizing codes and uniques
# 3.) Maybe boxing the uniques in an Index
#
# Step 2 is dispatched to extension types (like Categorical). They are
# responsible only for factorization. All data coercion, sorting and boxing
Expand All @@ -624,7 +630,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=

if is_extension_array_dtype(values):
values = extract_array(values)
labels, uniques = values.factorize(na_sentinel=na_sentinel)
codes, uniques = values.factorize(na_sentinel=na_sentinel)
dtype = original.dtype
else:
values, dtype = _ensure_data(values)
Expand All @@ -634,13 +640,13 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=
else:
na_value = None

labels, uniques = _factorize_array(
codes, uniques = _factorize_array(
values, na_sentinel=na_sentinel, size_hint=size_hint, na_value=na_value
)

if sort and len(uniques) > 0:
uniques, labels = safe_sort(
uniques, labels, na_sentinel=na_sentinel, assume_unique=True, verify=False
uniques, codes = safe_sort(
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
)

uniques = _reconstruct_data(uniques, dtype, original)
Expand All @@ -653,7 +659,7 @@ def factorize(values, sort: bool = False, order=None, na_sentinel=-1, size_hint=

uniques = Index(uniques)

return labels, uniques
return codes, uniques


def value_counts(
Expand Down
8 changes: 4 additions & 4 deletions pandas/core/arrays/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -690,11 +690,11 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
Parameters
----------
na_sentinel : int, default -1
Value to use in the `labels` array to indicate missing values.
Value to use in the `codes` array to indicate missing values.
Returns
-------
labels : ndarray
codes : ndarray
An integer NumPy array that's an indexer into the original
ExtensionArray.
uniques : ExtensionArray
Expand Down Expand Up @@ -724,12 +724,12 @@ def factorize(self, na_sentinel: int = -1) -> Tuple[np.ndarray, ABCExtensionArra
# Complete control over factorization.
arr, na_value = self._values_for_factorize()

labels, uniques = _factorize_array(
codes, uniques = _factorize_array(
arr, na_sentinel=na_sentinel, na_value=na_value
)

uniques = self._from_factorized(uniques, self)
return labels, uniques
return codes, uniques

_extension_array_shared_docs[
"repeat"
Expand Down
6 changes: 3 additions & 3 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,11 +710,11 @@ def factorize(self, na_sentinel=-1):
# Currently, ExtensionArray.factorize -> Tuple[ndarray, EA]
# The sparsity on this is backwards from what Sparse would want. Want
# ExtensionArray.factorize -> Tuple[EA, EA]
# Given that we have to return a dense array of labels, why bother
# Given that we have to return a dense array of codes, why bother
# implementing an efficient factorize?
labels, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
codes, uniques = algos.factorize(np.asarray(self), na_sentinel=na_sentinel)
uniques = SparseArray(uniques, dtype=self.dtype)
return labels, uniques
return codes, uniques

def value_counts(self, dropna=True):
"""
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -1518,7 +1518,7 @@ def memory_usage(self, deep=False):
sort=textwrap.dedent(
"""\
sort : bool, default False
Sort `uniques` and shuffle `labels` to maintain the
Sort `uniques` and shuffle `codes` to maintain the
relationship.
"""
),
Expand Down
18 changes: 9 additions & 9 deletions pandas/tests/arrays/categorical/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,23 @@ def test_factorize(categories, ordered):
cat = pd.Categorical(
["b", "b", "a", "c", None], categories=categories, ordered=ordered
)
labels, uniques = pd.factorize(cat)
expected_labels = np.array([0, 0, 1, 2, -1], dtype=np.intp)
codes, uniques = pd.factorize(cat)
expected_codes = np.array([0, 0, 1, 2, -1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a", "c"], categories=categories, ordered=ordered
)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


def test_factorized_sort():
cat = pd.Categorical(["b", "b", None, "a"])
labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([1, 1, -1, 0], dtype=np.intp)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([1, 1, -1, 0], dtype=np.intp)
expected_uniques = pd.Categorical(["a", "b"])

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


Expand All @@ -36,13 +36,13 @@ def test_factorized_sort_ordered():
["b", "b", None, "a"], categories=["c", "b", "a"], ordered=True
)

labels, uniques = pd.factorize(cat, sort=True)
expected_labels = np.array([0, 0, -1, 1], dtype=np.intp)
codes, uniques = pd.factorize(cat, sort=True)
expected_codes = np.array([0, 0, -1, 1], dtype=np.intp)
expected_uniques = pd.Categorical(
["b", "a"], categories=["c", "b", "a"], ordered=True
)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
tm.assert_categorical_equal(uniques, expected_uniques)


Expand Down
20 changes: 10 additions & 10 deletions pandas/tests/extension/base/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,29 +113,29 @@ def test_unique(self, data, box, method):

@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize(self, data_for_grouping, na_sentinel):
labels, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
expected_labels = np.array(
codes, uniques = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
expected_codes = np.array(
[0, 0, na_sentinel, na_sentinel, 1, 1, 0, 2], dtype=np.intp
)
expected_uniques = data_for_grouping.take([0, 4, 7])

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
self.assert_extension_array_equal(uniques, expected_uniques)

@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize_equivalence(self, data_for_grouping, na_sentinel):
l1, u1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
l2, u2 = data_for_grouping.factorize(na_sentinel=na_sentinel)
codes_1, uniques_1 = pd.factorize(data_for_grouping, na_sentinel=na_sentinel)
codes_2, uniques_2 = data_for_grouping.factorize(na_sentinel=na_sentinel)

tm.assert_numpy_array_equal(l1, l2)
self.assert_extension_array_equal(u1, u2)
tm.assert_numpy_array_equal(codes_1, codes_2)
self.assert_extension_array_equal(uniques_1, uniques_2)

def test_factorize_empty(self, data):
labels, uniques = pd.factorize(data[:0])
expected_labels = np.array([], dtype=np.intp)
codes, uniques = pd.factorize(data[:0])
expected_codes = np.array([], dtype=np.intp)
expected_uniques = type(data)._from_sequence([], dtype=data[:0].dtype)

tm.assert_numpy_array_equal(labels, expected_labels)
tm.assert_numpy_array_equal(codes, expected_codes)
self.assert_extension_array_equal(uniques, expected_uniques)

def test_fillna_copy_frame(self, data_missing):
Expand Down
Loading

0 comments on commit e561293

Please sign in to comment.