Skip to content

ENH: Preserve Series index on json_normalize #57422

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Feb 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ Other enhancements
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
- Allow dictionaries to be passed to :meth:`pandas.Series.str.replace` via ``pat`` parameter (:issue:`51748`)
- Support passing a :class:`Series` input to :func:`json_normalize` that retains the :class:`Series` :class:`Index` (:issue:`51452`)
-

.. ---------------------------------------------------------------------------
Expand Down
40 changes: 35 additions & 5 deletions pandas/io/json/_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@
from pandas._libs.writers import convert_json_to_lines

import pandas as pd
from pandas import DataFrame
from pandas import (
DataFrame,
Series,
)

if TYPE_CHECKING:
from collections.abc import Iterable
Expand Down Expand Up @@ -266,7 +269,7 @@ def _simple_json_normalize(


def json_normalize(
data: dict | list[dict],
data: dict | list[dict] | Series,
record_path: str | list | None = None,
meta: str | list[str | list[str]] | None = None,
meta_prefix: str | None = None,
Expand All @@ -280,7 +283,7 @@ def json_normalize(

Parameters
----------
data : dict or list of dicts
data : dict, list of dicts, or Series of dicts
Unserialized JSON objects.
record_path : str or list of str, default None
Path in each object to list of records. If not passed, data will be
Expand Down Expand Up @@ -365,6 +368,26 @@ def json_normalize(
1 NaN Mark Reg 130 60
2 2.0 Faye Raker 130 60

>>> data = [
... {
... "id": 1,
... "name": "Cole Volk",
... "fitness": {"height": 130, "weight": 60},
... },
... {"name": "Mark Reg", "fitness": {"height": 130, "weight": 60}},
... {
... "id": 2,
... "name": "Faye Raker",
... "fitness": {"height": 130, "weight": 60},
... },
... ]
>>> series = pd.Series(data, index=pd.Index(["a", "b", "c"]))
>>> pd.json_normalize(series)
id name fitness.height fitness.weight
a 1.0 Cole Volk 130 60
b NaN Mark Reg 130 60
c 2.0 Faye Raker 130 60

>>> data = [
... {
... "state": "Florida",
Expand Down Expand Up @@ -455,6 +478,11 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
)
return result

if isinstance(data, Series):
index = data.index
else:
index = None

if isinstance(data, list) and not data:
return DataFrame()
elif isinstance(data, dict):
Expand All @@ -477,7 +505,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
and record_prefix is None
and max_level is None
):
return DataFrame(_simple_json_normalize(data, sep=sep))
return DataFrame(_simple_json_normalize(data, sep=sep), index=index)

if record_path is None:
if any([isinstance(x, dict) for x in y.values()] for y in data):
Expand All @@ -489,7 +517,7 @@ def _pull_records(js: dict[str, Any], spec: list | str) -> list:
# TODO: handle record values that are lists, or at least raise a
# reasonable error
data = nested_to_record(data, sep=sep, max_level=max_level)
return DataFrame(data)
return DataFrame(data, index=index)
elif not isinstance(record_path, list):
record_path = [record_path]

Expand Down Expand Up @@ -564,4 +592,6 @@ def _recursive_extract(data, path, seen_meta, level: int = 0) -> None:
values[i] = val

result[k] = values.repeat(lengths)
if index is not None:
result.index = index.repeat(lengths)
return result
11 changes: 10 additions & 1 deletion pandas/tests/io/json/test_normalize.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,14 @@ def test_top_column_with_leading_underscore(self):

tm.assert_frame_equal(result, expected)

def test_series_index(self, state_data):
    """Check that ``json_normalize`` keeps the index of a Series input.

    Two calls are exercised:

    * without ``record_path`` the result index must equal the Series
      index unchanged;
    * with ``record_path="counties"`` the result index must equal the
      Series index with its labels repeated ``[3, 2]`` times.
    """
    # GH 51452
    labels = Index([7, 8])
    records = Series(state_data, index=labels)

    # Plain normalization: the input labels are carried over as-is.
    tm.assert_index_equal(json_normalize(records).index, labels)

    # Record-path normalization: labels are repeated per the expected
    # [3, 2] record counts asserted below.
    expected_labels = labels.repeat([3, 2])
    exploded = json_normalize(records, "counties")
    tm.assert_index_equal(exploded.index, expected_labels)


class TestNestedToRecord:
def test_flat_stays_flat(self):
Expand Down Expand Up @@ -891,6 +899,7 @@ def test_series_non_zero_index(self):
"elements.a": [1.0, np.nan, np.nan],
"elements.b": [np.nan, 2.0, np.nan],
"elements.c": [np.nan, np.nan, 3.0],
}
},
index=[1, 2, 3],
)
tm.assert_frame_equal(result, expected)