diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 16caef57673f7..1a32498d53c23 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -96,6 +96,9 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`).
 
 .. _whatsnew_0200.enhancements.uint64_support:
 
+UInt64 Support Improved
+^^^^^^^^^^^^^^^^^^^^^^^
+
 Pandas has significantly improved support for operations involving unsigned,
 or purely non-negative, integers. Previously, handling these integers would
 result in improper rounding or data-type casting, leading to incorrect results.
diff --git a/pandas/io/json/__init__.py b/pandas/io/json/__init__.py
new file mode 100644
index 0000000000000..a9390a04cc2cd
--- /dev/null
+++ b/pandas/io/json/__init__.py
@@ -0,0 +1,4 @@
+from .json import to_json, read_json, loads, dumps  # noqa
+from .normalize import json_normalize  # noqa
+
+del json, normalize  # noqa
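The new `__init__.py` re-exports the public names and then deletes the
`json` and `normalize` submodule bindings from the package namespace, so
call sites written against the old flat `pandas/io/json.py` module keep
working. A minimal sketch of the preserved import surface (the data
below is made up for illustration):

    from pandas.io.json import dumps, loads, json_normalize

    raw = dumps([{'id': 1}, {'id': 2}])       # ujson-backed, as before
    print(loads(raw))                         # [{'id': 1}, {'id': 2}]
    print(json_normalize([{'a': {'b': 1}}]))  # one flat column: 'a.b'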
diff --git a/pandas/io/json.py b/pandas/io/json/json.py
similarity index 73%
rename from pandas/io/json.py
rename to pandas/io/json/json.py
index 767a2212d92da..d29f4a371dd4d 100644
--- a/pandas/io/json.py
+++ b/pandas/io/json/json.py
@@ -1,8 +1,6 @@
 # pylint: disable-msg=E1101,W0613,W0603
 import os
-import copy
-from collections import defaultdict
 
 import numpy as np
 
 import pandas.json as _json
@@ -13,6 +11,7 @@
 from pandas.io.common import get_filepath_or_buffer, _get_handle
 from pandas.core.common import AbstractMethodError
 from pandas.formats.printing import pprint_thing
+from .normalize import _convert_to_line_delimits
 
 loads = _json.loads
 dumps = _json.dumps
@@ -641,246 +640,3 @@ def is_ok(col):
                 lambda col, c: self._try_convert_to_date(c),
                 lambda col, c: ((self.keep_default_dates and is_ok(col)) or
                                 col in convert_dates))
-
-# ---------------------------------------------------------------------
-# JSON normalization routines
-
-
-def _convert_to_line_delimits(s):
-    """Helper function that converts json lists to line delimited json."""
-
-    # Determine we have a JSON list to turn to lines otherwise just return the
-    # json object, only lists can
-    if not s[0] == '[' and s[-1] == ']':
-        return s
-    s = s[1:-1]
-
-    from pandas.lib import convert_json_to_lines
-    return convert_json_to_lines(s)
-
-
-def nested_to_record(ds, prefix="", level=0):
-    """a simplified json_normalize
-
-    converts a nested dict into a flat dict ("record"), unlike json_normalize,
-    it does not attempt to extract a subset of the data.
-
-    Parameters
-    ----------
-    ds : dict or list of dicts
-    prefix: the prefix, optional, default: ""
-    level: the number of levels in the jason string, optional, default: 0
-
-    Returns
-    -------
-    d - dict or list of dicts, matching `ds`
-
-    Examples
-    --------
-
-    IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2),
-                                  nested=dict(e=dict(c=1,d=2),d=2)))
-    Out[52]:
-    {'dict1.c': 1,
-     'dict1.d': 2,
-     'flat1': 1,
-     'nested.d': 2,
-     'nested.e.c': 1,
-     'nested.e.d': 2}
-    """
-    singleton = False
-    if isinstance(ds, dict):
-        ds = [ds]
-        singleton = True
-
-    new_ds = []
-    for d in ds:
-
-        new_d = copy.deepcopy(d)
-        for k, v in d.items():
-            # each key gets renamed with prefix
-            if not isinstance(k, compat.string_types):
-                k = str(k)
-            if level == 0:
-                newkey = k
-            else:
-                newkey = prefix + '.' + k
-
-            # only dicts gets recurse-flattend
-            # only at level>1 do we rename the rest of the keys
-            if not isinstance(v, dict):
-                if level != 0:  # so we skip copying for top level, common case
-                    v = new_d.pop(k)
-                    new_d[newkey] = v
-                continue
-            else:
-                v = new_d.pop(k)
-                new_d.update(nested_to_record(v, newkey, level + 1))
-        new_ds.append(new_d)
-
-    if singleton:
-        return new_ds[0]
-    return new_ds
-
-
-def json_normalize(data, record_path=None, meta=None,
-                   meta_prefix=None,
-                   record_prefix=None,
-                   errors='raise'):
-
-    """
-    "Normalize" semi-structured JSON data into a flat table
-
-    Parameters
-    ----------
-    data : dict or list of dicts
-        Unserialized JSON objects
-    record_path : string or list of strings, default None
-        Path in each object to list of records. If not passed, data will be
-        assumed to be an array of records
-    meta : list of paths (string or list of strings), default None
-        Fields to use as metadata for each record in resulting table
-    record_prefix : string, default None
-        If True, prefix records with dotted (?) path, e.g. foo.bar.field if
-        path to records is ['foo', 'bar']
-    meta_prefix : string, default None
-    errors : {'raise', 'ignore'}, default 'raise'
-
-        * ignore : will ignore KeyError if keys listed in meta are not
-          always present
-        * raise : will raise KeyError if keys listed in meta are not
-          always present
-
-        .. versionadded:: 0.20.0
-
-    Returns
-    -------
-    frame : DataFrame
-
-    Examples
-    --------
-
-    >>> data = [{'state': 'Florida',
-    ...          'shortname': 'FL',
-    ...          'info': {
-    ...               'governor': 'Rick Scott'
-    ...          },
-    ...          'counties': [{'name': 'Dade', 'population': 12345},
-    ...                       {'name': 'Broward', 'population': 40000},
-    ...                       {'name': 'Palm Beach', 'population': 60000}]},
-    ...         {'state': 'Ohio',
-    ...          'shortname': 'OH',
-    ...          'info': {
-    ...               'governor': 'John Kasich'
-    ...          },
-    ...          'counties': [{'name': 'Summit', 'population': 1234},
-    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
-    >>> from pandas.io.json import json_normalize
-    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
-    ...                                            ['info', 'governor']])
-    >>> result
-             name  population info.governor    state shortname
-    0        Dade       12345    Rick Scott  Florida        FL
-    1     Broward       40000    Rick Scott  Florida        FL
-    2  Palm Beach       60000    Rick Scott  Florida        FL
-    3      Summit        1234   John Kasich     Ohio        OH
-    4    Cuyahoga        1337   John Kasich     Ohio        OH
-
-    """
-    def _pull_field(js, spec):
-        result = js
-        if isinstance(spec, list):
-            for field in spec:
-                result = result[field]
-        else:
-            result = result[spec]
-
-        return result
-
-    # A bit of a hackjob
-    if isinstance(data, dict):
-        data = [data]
-
-    if record_path is None:
-        if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
-            # naive normalization, this is idempotent for flat records
-            # and potentially will inflate the data considerably for
-            # deeply nested structures:
-            #  {VeryLong: { b: 1,c:2}} -> {VeryLong.b:1 ,VeryLong.c:@}
-            #
-            # TODO: handle record value which are lists, at least error
-            #       reasonably
-            data = nested_to_record(data)
-        return DataFrame(data)
-    elif not isinstance(record_path, list):
-        record_path = [record_path]
-
-    if meta is None:
-        meta = []
-    elif not isinstance(meta, list):
-        meta = [meta]
-
-    for i, x in enumerate(meta):
-        if not isinstance(x, list):
-            meta[i] = [x]
-
-    # Disastrously inefficient for now
-    records = []
-    lengths = []
-
-    meta_vals = defaultdict(list)
-    meta_keys = ['.'.join(val) for val in meta]
-
-    def _recursive_extract(data, path, seen_meta, level=0):
-        if len(path) > 1:
-            for obj in data:
-                for val, key in zip(meta, meta_keys):
-                    if level + 1 == len(val):
-                        seen_meta[key] = _pull_field(obj, val[-1])
-
-                _recursive_extract(obj[path[0]], path[1:],
-                                   seen_meta, level=level + 1)
-        else:
-            for obj in data:
-                recs = _pull_field(obj, path[0])
-
-                # For repeating the metadata later
-                lengths.append(len(recs))
-
-                for val, key in zip(meta, meta_keys):
-                    if level + 1 > len(val):
-                        meta_val = seen_meta[key]
-                    else:
-                        try:
-                            meta_val = _pull_field(obj, val[level:])
-                        except KeyError as e:
-                            if errors == 'ignore':
-                                meta_val = np.nan
-                            else:
-                                raise \
-                                    KeyError("Try running with "
-                                             "errors='ignore' as key "
-                                             "%s is not always present", e)
-                    meta_vals[key].append(meta_val)
-
-                records.extend(recs)
-
-    _recursive_extract(data, record_path, {}, level=0)
-
-    result = DataFrame(records)
-
-    if record_prefix is not None:
-        result.rename(columns=lambda x: record_prefix + x, inplace=True)
-
-    # Data types, a problem
-    for k, v in compat.iteritems(meta_vals):
-        if meta_prefix is not None:
-            k = meta_prefix + k
-
-        if k in result:
-            raise ValueError('Conflicting metadata name %s, '
-                             'need distinguishing prefix ' % k)
-
-        result[k] = np.array(v).repeat(lengths)
-
-    return result
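`json/json.py` now imports `_convert_to_line_delimits` back from the new
`normalize` module; that helper backs the line-delimited JSON output
path. A small sketch of the behavior it supports, assuming the `lines`
keyword available since 0.19.0 (the frame is made-up sample data):

    import pandas as pd

    df = pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']})
    # orient='records', lines=True emits one JSON object per line
    print(df.to_json(orient='records', lines=True))
    # {"a":1,"b":"x"}
    # {"a":2,"b":"y"}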
diff --git a/pandas/io/json/normalize.py b/pandas/io/json/normalize.py
new file mode 100644
index 0000000000000..aa80954233682
--- /dev/null
+++ b/pandas/io/json/normalize.py
@@ -0,0 +1,248 @@
+# ---------------------------------------------------------------------
+# JSON normalization routines
+
+import copy
+from collections import defaultdict
+import numpy as np
+
+from pandas.lib import convert_json_to_lines
+from pandas import compat, DataFrame
+
+
+def _convert_to_line_delimits(s):
+    """Helper function that converts JSON lists to line-delimited JSON."""
+
+    # Determine if we have a JSON list to turn into lines; otherwise just
+    # return the JSON object as-is, since only lists can be converted.
+    if not (s[0] == '[' and s[-1] == ']'):
+        return s
+    s = s[1:-1]
+
+    return convert_json_to_lines(s)
+
+
+def nested_to_record(ds, prefix="", level=0):
+    """A simplified json_normalize.
+
+    Converts a nested dict into a flat dict ("record"); unlike
+    json_normalize, it does not attempt to extract a subset of the data.
+
+    Parameters
+    ----------
+    ds : dict or list of dicts
+    prefix : the prefix, optional, default: ""
+    level : the number of levels in the json string, optional, default: 0
+
+    Returns
+    -------
+    d - dict or list of dicts, matching `ds`
+
+    Examples
+    --------
+
+    In [52]: nested_to_record(dict(flat1=1, dict1=dict(c=1, d=2),
+                                   nested=dict(e=dict(c=1, d=2), d=2)))
+    Out[52]:
+    {'dict1.c': 1,
+     'dict1.d': 2,
+     'flat1': 1,
+     'nested.d': 2,
+     'nested.e.c': 1,
+     'nested.e.d': 2}
+    """
+    singleton = False
+    if isinstance(ds, dict):
+        ds = [ds]
+        singleton = True
+
+    new_ds = []
+    for d in ds:
+
+        new_d = copy.deepcopy(d)
+        for k, v in d.items():
+            # each key gets renamed with prefix
+            if not isinstance(k, compat.string_types):
+                k = str(k)
+            if level == 0:
+                newkey = k
+            else:
+                newkey = prefix + '.' + k
+
+            # only dicts get recursively flattened;
+            # only at level>1 do we rename the rest of the keys
+            if not isinstance(v, dict):
+                if level != 0:  # so we skip copying for top level, common case
+                    v = new_d.pop(k)
+                    new_d[newkey] = v
+                continue
+            else:
+                v = new_d.pop(k)
+                new_d.update(nested_to_record(v, newkey, level + 1))
+        new_ds.append(new_d)
+
+    if singleton:
+        return new_ds[0]
+    return new_ds
+
+
+def json_normalize(data, record_path=None, meta=None,
+                   meta_prefix=None,
+                   record_prefix=None,
+                   errors='raise'):
+
+    """
+    "Normalize" semi-structured JSON data into a flat table.
+
+    Parameters
+    ----------
+    data : dict or list of dicts
+        Unserialized JSON objects
+    record_path : string or list of strings, default None
+        Path in each object to list of records. If not passed, data will be
+        assumed to be an array of records
+    meta : list of paths (string or list of strings), default None
+        Fields to use as metadata for each record in resulting table
+    record_prefix : string, default None
+        If not None, prefix records with dotted path, e.g. foo.bar.field if
+        path to records is ['foo', 'bar']
+    meta_prefix : string, default None
+        If not None, prefix the metadata columns with this string
+    errors : {'raise', 'ignore'}, default 'raise'
+
+        * ignore : will ignore KeyError if keys listed in meta are not
+          always present
+        * raise : will raise KeyError if keys listed in meta are not
+          always present
+
+        .. versionadded:: 0.20.0
+
+    Returns
+    -------
+    frame : DataFrame
+
+    Examples
+    --------
+
+    >>> data = [{'state': 'Florida',
+    ...          'shortname': 'FL',
+    ...          'info': {
+    ...               'governor': 'Rick Scott'
+    ...          },
+    ...          'counties': [{'name': 'Dade', 'population': 12345},
+    ...                       {'name': 'Broward', 'population': 40000},
+    ...                       {'name': 'Palm Beach', 'population': 60000}]},
+    ...         {'state': 'Ohio',
+    ...          'shortname': 'OH',
+    ...          'info': {
+    ...               'governor': 'John Kasich'
+    ...          },
+    ...          'counties': [{'name': 'Summit', 'population': 1234},
+    ...                       {'name': 'Cuyahoga', 'population': 1337}]}]
+    >>> from pandas.io.json import json_normalize
+    >>> result = json_normalize(data, 'counties', ['state', 'shortname',
+    ...                                            ['info', 'governor']])
+    >>> result
+             name  population info.governor    state shortname
+    0        Dade       12345    Rick Scott  Florida        FL
+    1     Broward       40000    Rick Scott  Florida        FL
+    2  Palm Beach       60000    Rick Scott  Florida        FL
+    3      Summit        1234   John Kasich     Ohio        OH
+    4    Cuyahoga        1337   John Kasich     Ohio        OH
+
+    """
+    def _pull_field(js, spec):
+        result = js
+        if isinstance(spec, list):
+            for field in spec:
+                result = result[field]
+        else:
+            result = result[spec]
+
+        return result
+
+    # A bit of a hackjob
+    if isinstance(data, dict):
+        data = [data]
+
+    if record_path is None:
+        if any([isinstance(x, dict) for x in compat.itervalues(data[0])]):
+            # naive normalization, this is idempotent for flat records
+            # and potentially will inflate the data considerably for
+            # deeply nested structures:
+            #  {VeryLong: {b: 1, c: 2}} -> {VeryLong.b: 1, VeryLong.c: 2}
+            #
+            # TODO: handle record values which are lists, at least error
+            #       reasonably
+            data = nested_to_record(data)
+        return DataFrame(data)
+    elif not isinstance(record_path, list):
+        record_path = [record_path]
+
+    if meta is None:
+        meta = []
+    elif not isinstance(meta, list):
+        meta = [meta]
+
+    for i, x in enumerate(meta):
+        if not isinstance(x, list):
+            meta[i] = [x]
+
+    # Disastrously inefficient for now
+    records = []
+    lengths = []
+
+    meta_vals = defaultdict(list)
+    meta_keys = ['.'.join(val) for val in meta]
+
+    def _recursive_extract(data, path, seen_meta, level=0):
+        if len(path) > 1:
+            for obj in data:
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 == len(val):
+                        seen_meta[key] = _pull_field(obj, val[-1])
+
+                _recursive_extract(obj[path[0]], path[1:],
+                                   seen_meta, level=level + 1)
+        else:
+            for obj in data:
+                recs = _pull_field(obj, path[0])
+
+                # For repeating the metadata later
+                lengths.append(len(recs))
+
+                for val, key in zip(meta, meta_keys):
+                    if level + 1 > len(val):
+                        meta_val = seen_meta[key]
+                    else:
+                        try:
+                            meta_val = _pull_field(obj, val[level:])
+                        except KeyError as e:
+                            if errors == 'ignore':
+                                meta_val = np.nan
+                            else:
+                                raise KeyError("Try running with "
+                                               "errors='ignore' as key "
+                                               "%s is not always "
+                                               "present" % e)
+                    meta_vals[key].append(meta_val)
+
+                records.extend(recs)
+
+    _recursive_extract(data, record_path, {}, level=0)
+
+    result = DataFrame(records)
+
+    if record_prefix is not None:
+        result.rename(columns=lambda x: record_prefix + x, inplace=True)
+
+    # Data types, a problem
+    for k, v in compat.iteritems(meta_vals):
+        if meta_prefix is not None:
+            k = meta_prefix + k
+
+        if k in result:
+            raise ValueError('Conflicting metadata name %s, '
+                             'need distinguishing prefix ' % k)
+
+        result[k] = np.array(v).repeat(lengths)
+
+    return result
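The moved code is unchanged apart from small cleanups; the user-visible
addition is the `errors` keyword (`versionadded:: 0.20.0`). A short
sketch of the `errors='ignore'` path, using made-up records where one
object lacks the `info` key:

    from pandas.io.json import json_normalize

    data = [{'state': 'Ohio',   # no 'info' key in this record
             'counties': [{'name': 'Summit', 'population': 1234}]},
            {'state': 'Texas',
             'info': {'governor': 'Greg Abbott'},
             'counties': [{'name': 'Travis', 'population': 999}]}]

    result = json_normalize(data, 'counties',
                            ['state', ['info', 'governor']],
                            errors='ignore')
    # The Ohio row gets NaN under 'info.governor'; with the default
    # errors='raise', the missing key would raise a KeyError instead.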
("record"), unlike json_normalize, + it does not attempt to extract a subset of the data. + + Parameters + ---------- + ds : dict or list of dicts + prefix: the prefix, optional, default: "" + level: the number of levels in the jason string, optional, default: 0 + + Returns + ------- + d - dict or list of dicts, matching `ds` + + Examples + -------- + + IN[52]: nested_to_record(dict(flat1=1,dict1=dict(c=1,d=2), + nested=dict(e=dict(c=1,d=2),d=2))) + Out[52]: + {'dict1.c': 1, + 'dict1.d': 2, + 'flat1': 1, + 'nested.d': 2, + 'nested.e.c': 1, + 'nested.e.d': 2} + """ + singleton = False + if isinstance(ds, dict): + ds = [ds] + singleton = True + + new_ds = [] + for d in ds: + + new_d = copy.deepcopy(d) + for k, v in d.items(): + # each key gets renamed with prefix + if not isinstance(k, compat.string_types): + k = str(k) + if level == 0: + newkey = k + else: + newkey = prefix + '.' + k + + # only dicts gets recurse-flattend + # only at level>1 do we rename the rest of the keys + if not isinstance(v, dict): + if level != 0: # so we skip copying for top level, common case + v = new_d.pop(k) + new_d[newkey] = v + continue + else: + v = new_d.pop(k) + new_d.update(nested_to_record(v, newkey, level + 1)) + new_ds.append(new_d) + + if singleton: + return new_ds[0] + return new_ds + + +def json_normalize(data, record_path=None, meta=None, + meta_prefix=None, + record_prefix=None, + errors='raise'): + + """ + "Normalize" semi-structured JSON data into a flat table + + Parameters + ---------- + data : dict or list of dicts + Unserialized JSON objects + record_path : string or list of strings, default None + Path in each object to list of records. If not passed, data will be + assumed to be an array of records + meta : list of paths (string or list of strings), default None + Fields to use as metadata for each record in resulting table + record_prefix : string, default None + If True, prefix records with dotted (?) path, e.g. foo.bar.field if + path to records is ['foo', 'bar'] + meta_prefix : string, default None + errors : {'raise', 'ignore'}, default 'raise' + + * ignore : will ignore KeyError if keys listed in meta are not + always present + * raise : will raise KeyError if keys listed in meta are not + always present + + .. versionadded:: 0.20.0 + + Returns + ------- + frame : DataFrame + + Examples + -------- + + >>> data = [{'state': 'Florida', + ... 'shortname': 'FL', + ... 'info': { + ... 'governor': 'Rick Scott' + ... }, + ... 'counties': [{'name': 'Dade', 'population': 12345}, + ... {'name': 'Broward', 'population': 40000}, + ... {'name': 'Palm Beach', 'population': 60000}]}, + ... {'state': 'Ohio', + ... 'shortname': 'OH', + ... 'info': { + ... 'governor': 'John Kasich' + ... }, + ... 'counties': [{'name': 'Summit', 'population': 1234}, + ... {'name': 'Cuyahoga', 'population': 1337}]}] + >>> from pandas.io.json import json_normalize + >>> result = json_normalize(data, 'counties', ['state', 'shortname', + ... 
diff --git a/setup.py b/setup.py
index 93a044bc3cc7d..4d6bb76fd6b7c 100755
--- a/setup.py
+++ b/setup.py
@@ -631,6 +631,7 @@ def pxd(name):
           'pandas.core',
           'pandas.indexes',
           'pandas.io',
+          'pandas.io.json',
           'pandas.io.sas',
           'pandas.formats',
           'pandas.sparse',
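Because `pandas/io/json` is now a package rather than a single module,
it has to be listed in `packages=` or built sdists/wheels would silently
omit the new directory. A hypothetical post-install smoke test:

    import importlib

    for mod in ('pandas.io.json', 'pandas.io.json.json',
                'pandas.io.json.normalize'):
        importlib.import_module(mod)
    print('pandas.io.json subpackage is importable')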