CLN: reorg pandas/io/json to sub-dirs
xref pandas-dev#14904

Author: Jeff Reback <jeff@reback.net>

Closes pandas-dev#15322 from jreback/json and squashes the following commits:

0c2da60 [Jeff Reback] DOC: whatsnew update
fa3deef [Jeff Reback] CLN: reorg pandas/io/json to sub-dirs
jreback committed Feb 6, 2017
1 parent f93714b commit 34cdfa4
Showing 6 changed files with 259 additions and 246 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
@@ -96,6 +96,9 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).

.. _whatsnew_0200.enhancements.uint64_support:

UInt64 Support Improved
^^^^^^^^^^^^^^^^^^^^^^^

Pandas has significantly improved support for operations involving unsigned,
or purely non-negative, integers. Previously, handling these integers would
result in improper rounding or data-type casting, leading to incorrect results.
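For context, the improvement this whatsnew note describes can be sketched as follows (an illustrative snippet, not part of the diff):

import pandas as pd

# Values above int64's maximum (2**63 - 1) previously risked lossy
# coercion to float or object dtype; with the improved support they
# stay as unsigned 64-bit integers.
s = pd.Series([2**64 - 1, 0], dtype='uint64')
print(s.dtype)  # uint64
print(s.max())  # 18446744073709551615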
4 changes: 4 additions & 0 deletions pandas/io/json/__init__.py
@@ -0,0 +1,4 @@
from .json import to_json, read_json, loads, dumps # noqa
from .normalize import json_normalize # noqa

del json, normalize # noqa
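The new package keeps the public import surface unchanged; a quick usage sketch of the re-exported names (not part of the diff):

from pandas.io.json import read_json, to_json, loads, dumps, json_normalize

# loads/dumps are the ujson-backed helpers re-exported above;
# a round-trip sanity check:
payload = dumps({'a': 1})
assert loads(payload) == {'a': 1}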
246 changes: 1 addition & 245 deletions pandas/io/json.py → pandas/io/json/json.py
@@ -1,8 +1,6 @@
# pylint: disable-msg=E1101,W0613,W0603

import os
import copy
from collections import defaultdict
import numpy as np

import pandas.json as _json
@@ -13,6 +11,7 @@
from pandas.io.common import get_filepath_or_buffer, _get_handle
from pandas.core.common import AbstractMethodError
from pandas.formats.printing import pprint_thing
from .normalize import _convert_to_line_delimits

loads = _json.loads
dumps = _json.dumps
@@ -641,246 +640,3 @@ def is_ok(col):
lambda col, c: self._try_convert_to_date(c),
lambda col, c: ((self.keep_default_dates and is_ok(col)) or
col in convert_dates))

# ---------------------------------------------------------------------
# JSON normalization routines


def _convert_to_line_delimits(s):
"""Helper function that converts json lists to line delimited json."""

# Only a JSON list can be converted to line-delimited form; otherwise
# return the JSON object unchanged.
if not (s[0] == '[' and s[-1] == ']'):
return s
s = s[1:-1]

from pandas.lib import convert_json_to_lines
return convert_json_to_lines(s)
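For intuition, the transformation can be sketched in plain Python (illustrative only; the helper above instead delegates to the C-level pandas.lib.convert_json_to_lines, which rewrites the raw string without re-parsing it):

import json

def to_line_delimited(s):
    # Parse the JSON list and emit one compact object per line.
    objs = json.loads(s)
    return '\n'.join(json.dumps(obj) for obj in objs)

print(to_line_delimited('[{"a": 1}, {"a": 2}]'))
# {"a": 1}
# {"a": 2}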


def nested_to_record(ds, prefix="", level=0):
"""a simplified json_normalize
converts a nested dict into a flat dict ("record"); unlike json_normalize,
it does not attempt to extract a subset of the data.
Parameters
----------
ds : dict or list of dicts
prefix: the prefix, optional, default: ""
level: the number of levels in the json string, optional, default: 0
Returns
-------
d - dict or list of dicts, matching `ds`
Examples
--------
IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
nested=dict(e=dict(c=1,d=2),d=2)))
Out[52]:
{'dict1.c': 1,
'dict1.d': 2,
'flat1': 1,
'nested.d': 2,
'nested.e.c': 1,
'nested.e.d': 2}
"""
singleton = False
if isinstance(ds, dict):
ds = [ds]
singleton = True

new_ds = []
for d in ds:

new_d = copy.deepcopy(d)
for k, v in d.items():
# each key gets renamed with prefix
if not isinstance(k, compat.string_types):
k = str(k)
if level == 0:
newkey = k
else:
newkey = prefix + '.' + k

# only dicts get recursively flattened
# only at level>1 do we rename the rest of the keys
if not isinstance(v, dict):
if level != 0: # so we skip copying for top level, common case
v = new_d.pop(k)
new_d[newkey] = v
continue
else:
v = new_d.pop(k)
new_d.update(nested_to_record(v, newkey, level + 1))
new_ds.append(new_d)

if singleton:
return new_ds[0]
return new_ds
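A usage sketch (assuming the import path created by this commit's move of the function into pandas/io/json/normalize.py):

from pandas.io.json.normalize import nested_to_record

# A single dict flattens to a single flat dict; a list of dicts
# flattens element-wise, matching the input's shape.
nested_to_record({'flat1': 1, 'dict1': {'c': 1, 'd': 2}})
# {'flat1': 1, 'dict1.c': 1, 'dict1.d': 2}
nested_to_record([{'a': {'b': 1}}, {'a': {'b': 2}}])
# [{'a.b': 1}, {'a.b': 2}]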


def json_normalize(data, record_path=None, meta=None,
meta_prefix=None,
record_prefix=None,
errors='raise'):

"""
"Normalize" semi-structured JSON data into a flat table
Parameters
----------
data : dict or list of dicts
Unserialized JSON objects
record_path : string or list of strings, default None
Path in each object to list of records. If not passed, data will be
assumed to be an array of records
meta : list of paths (string or list of strings), default None
Fields to use as metadata for each record in resulting table
record_prefix : string, default None
If not None, prefix records with dotted path, e.g. foo.bar.field if
path to records is ['foo', 'bar']
meta_prefix : string, default None
If not None, prefix to apply to the names of the meta columns
errors : {'raise', 'ignore'}, default 'raise'
* ignore : will ignore KeyError if keys listed in meta are not
always present
* raise : will raise KeyError if keys listed in meta are not
always present
.. versionadded:: 0.20.0
Returns
-------
frame : DataFrame
Examples
--------
>>> data = [{'state': 'Florida',
... 'shortname': 'FL',
... 'info': {
... 'governor': 'Rick Scott'
... },
... 'counties': [{'name': 'Dade', 'population': 12345},
... {'name': 'Broward', 'population': 40000},
... {'name': 'Palm Beach', 'population': 60000}]},
... {'state': 'Ohio',
... 'shortname': 'OH',
... 'info': {
... 'governor': 'John Kasich'
... },
... 'counties': [{'name': 'Summit', 'population': 1234},
... {'name': 'Cuyahoga', 'population': 1337}]}]
>>> from pandas.io.json import json_normalize
>>> result = json_normalize(data, 'counties', ['state', 'shortname',
... ['info', 'governor']])
>>> result
name population info.governor state shortname
0 Dade 12345 Rick Scott Florida FL
1 Broward 40000 Rick Scott Florida FL
2 Palm Beach 60000 Rick Scott Florida FL
3 Summit 1234 John Kasich Ohio OH
4 Cuyahoga 1337 John Kasich Ohio OH
"""
def _pull_field(js, spec):
result = js
if isinstance(spec, list):
for field in spec:
result = result[field]
else:
result = result[spec]

return result

# A bit of a hackjob
if isinstance(data, dict):
data = [data]

if record_path is None:
if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
# naive normalization; this is idempotent for flat records
# and potentially will inflate the data considerably for
# deeply nested structures:
# {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
#
# TODO: handle record values which are lists, at least error
# reasonably
data = nested_to_record(data)
return DataFrame(data)
elif not isinstance(record_path, list):
record_path = [record_path]

if meta is None:
meta = []
elif not isinstance(meta, list):
meta = [meta]

for i, x in enumerate(meta):
if not isinstance(x, list):
meta[i] = [x]

# Disastrously inefficient for now
records = []
lengths = []

meta_vals = defaultdict(list)
meta_keys = ['.'.join(val) for val in meta]

def _recursive_extract(data, path, seen_meta, level=0):
if len(path) > 1:
for obj in data:
for val, key in zip(meta, meta_keys):
if level + 1 == len(val):
seen_meta[key] = _pull_field(obj, val[-1])

_recursive_extract(obj[path[0]], path[1:],
seen_meta, level=level + 1)
else:
for obj in data:
recs = _pull_field(obj, path[0])

# For repeating the metadata later
lengths.append(len(recs))

for val, key in zip(meta, meta_keys):
if level + 1 > len(val):
meta_val = seen_meta[key]
else:
try:
meta_val = _pull_field(obj, val[level:])
except KeyError as e:
if errors == 'ignore':
meta_val = np.nan
else:
raise KeyError("Try running with errors='ignore' "
"as key %s is not always present" % e)
meta_vals[key].append(meta_val)

records.extend(recs)

_recursive_extract(data, record_path, {}, level=0)

result = DataFrame(records)

if record_prefix is not None:
result.rename(columns=lambda x: record_prefix + x, inplace=True)

# Data types, a problem
for k, v in compat.iteritems(meta_vals):
if meta_prefix is not None:
k = meta_prefix + k

if k in result:
raise ValueError('Conflicting metadata name %s, '
'need distinguishing prefix ' % k)

result[k] = np.array(v).repeat(lengths)

return result
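The errors keyword documented above is new in 0.20.0; a short sketch of the 'ignore' path (illustrative data shaped like the docstring example):

from pandas.io.json import json_normalize

data = [{'state': 'Florida',
         'info': {'governor': 'Rick Scott'},
         'counties': [{'name': 'Dade', 'population': 12345}]},
        {'state': 'Ohio',  # note: no 'info' key in this record
         'counties': [{'name': 'Summit', 'population': 1234}]}]

# With the default errors='raise', the missing ['info', 'governor']
# path in the Ohio record raises a KeyError; errors='ignore' fills
# the meta column with NaN instead.
result = json_normalize(data, 'counties',
                        ['state', ['info', 'governor']],
                        errors='ignore')
# result has columns name, population, state, info.governor,
# with NaN for Ohio's info.governor.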