Dataset.encoding and unlimited dimensions for to_netcdf #1170

Merged Jan 24, 2017 · 19 commits
1 change: 1 addition & 0 deletions doc/api.rst
@@ -46,6 +46,7 @@ Attributes
    Dataset.data_vars
    Dataset.coords
    Dataset.attrs
+   Dataset.encoding
    Dataset.indexes
    Dataset.get_index
4 changes: 4 additions & 0 deletions doc/whats-new.rst
@@ -202,6 +202,10 @@ Enhancements
   and attributes. The method prints to a buffer (e.g. ``stdout``) with output
   similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`).
   By `Joe Hamman <https://github.com/jhamman>`_.
+- Added the ability to write unlimited netCDF dimensions with the ``scipy`` and
+  ``netcdf4`` backends via the new :py:attr:`~xray.Dataset.encoding` attribute
+  or via the ``unlimited_dims`` argument to :py:meth:`~xray.Dataset.to_netcdf`.
+  By `Joe Hamman <https://github.com/jhamman>`_.
 - New :py:meth:`~DataArray.quantile` method to calculate quantiles from
   DataArray objects (:issue:`1187`).
   By `Joe Hamman <https://github.com/jhamman>`_.
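A short usage sketch of the feature this entry describes (file names are hypothetical; either spelling below should work per the entry above):

import numpy as np
import xarray as xr

ds = xr.Dataset({'temp': (('time', 'x'), np.zeros((4, 3)))})

# Spelling 1: pass unlimited_dims directly when writing.
ds.to_netcdf('direct.nc', unlimited_dims=['time'])

# Spelling 2: record it in the new dataset-level encoding attribute;
# to_netcdf picks it up when the argument is omitted.
ds.encoding['unlimited_dims'] = {'time'}
ds.to_netcdf('via_encoding.nc')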
8 changes: 6 additions & 2 deletions xarray/backends/api.py
@@ -522,7 +522,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,


 def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
-              engine=None, writer=None, encoding=None):
+              engine=None, writer=None, encoding=None, unlimited_dims=None):
     """This function creates an appropriate datastore for writing a dataset to
     disk as a netCDF file

@@ -561,8 +561,12 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
     sync = writer is None

     store = store_cls(path, mode, format, group, writer)
+
+    if unlimited_dims is None:
+        unlimited_dims = dataset.encoding.get('unlimited_dims', None)
     try:
-        dataset.dump_to_store(store, sync=sync, encoding=encoding)
+        dataset.dump_to_store(store, sync=sync, encoding=encoding,
+                              unlimited_dims=unlimited_dims)
         if isinstance(path, BytesIO):
             return path.getvalue()
     finally:
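The hunk above gives an explicit ``unlimited_dims`` argument precedence over the dataset-level encoding. A minimal standalone restatement of that resolution rule (the helper name is hypothetical, not xarray API):

# The keyword argument wins; the dataset encoding is the fallback.
def _resolve_unlimited_dims(encoding, unlimited_dims=None):
    if unlimited_dims is None:
        unlimited_dims = encoding.get('unlimited_dims', None)
    return unlimited_dims

assert _resolve_unlimited_dims({'unlimited_dims': {'time'}}) == {'time'}
assert _resolve_unlimited_dims({'unlimited_dims': {'time'}}, {'x'}) == {'x'}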
36 changes: 25 additions & 11 deletions xarray/backends/common.py
@@ -5,7 +5,6 @@
 import logging
 import time
 import traceback
-import threading
 from collections import Mapping
 from distutils.version import StrictVersion
@@ -84,6 +83,9 @@ def get_attrs(self):  # pragma: no cover
     def get_variables(self):  # pragma: no cover
         raise NotImplementedError

+    def get_encoding(self):
+        return {}
+
     def load(self):
         """
         This loads the variables and attributes simultaneously.
@@ -105,8 +107,8 @@ def load(self):
         This function will be called anytime variables or attributes
         are requested, so care should be taken to make sure its fast.
         """
-        variables = FrozenOrderedDict((_decode_variable_name(k), v)
-                                      for k, v in iteritems(self.get_variables()))
+        variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in
+                                      iteritems(self.get_variables()))
         attributes = FrozenOrderedDict(self.get_attrs())
         return variables, attributes
@@ -152,7 +154,11 @@ def add(self, source, target):
             self.sources.append(source)
             self.targets.append(target)
         else:
-            target[...] = source
+            try:
+                target[...] = source
+            except TypeError:
+                # workaround for GH: scipy/scipy#6880
+                target[:] = source

     def sync(self):
         if self.sources:
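For context, a sketch of the failure mode the try/except above guards against: an array-like whose __setitem__ rejects Ellipsis keys, as scipy.io.netcdf variables did at the time (scipy/scipy#6880). The class here is a stand-in, not scipy's:

import numpy as np

class EllipsisRejectingArray:
    # Stand-in for a variable type that raises TypeError on
    # target[...] = source but accepts target[:] = source.
    def __init__(self, n):
        self.data = np.zeros(n)

    def __setitem__(self, key, value):
        if key is Ellipsis:
            raise TypeError('Ellipsis indexing not supported')
        self.data[key] = value

target = EllipsisRejectingArray(3)
source = np.arange(3.0)
try:
    target[...] = source   # fails on the legacy type
except TypeError:
    target[:] = source     # the fallback used by ArrayWriter.add above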
@@ -191,34 +197,42 @@ def store_dataset(self, dataset):
         # dataset.variables
         self.store(dataset, dataset.attrs)

-    def store(self, variables, attributes, check_encoding_set=frozenset()):
+    def store(self, variables, attributes, check_encoding_set=frozenset(),
+              unlimited_dims=None):
         self.set_attributes(attributes)
-        self.set_variables(variables, check_encoding_set)
+        self.set_variables(variables, check_encoding_set,
+                           unlimited_dims=unlimited_dims)

     def set_attributes(self, attributes):
         for k, v in iteritems(attributes):
             self.set_attribute(k, v)

-    def set_variables(self, variables, check_encoding_set):
+    def set_variables(self, variables, check_encoding_set,
+                      unlimited_dims=None):
         for vn, v in iteritems(variables):
             name = _encode_variable_name(vn)
             check = vn in check_encoding_set
-            target, source = self.prepare_variable(name, v, check)
+            target, source = self.prepare_variable(
+                name, v, check, unlimited_dims=unlimited_dims)
             self.writer.add(source, target)

-    def set_necessary_dimensions(self, variable):
+    def set_necessary_dimensions(self, variable, unlimited_dims=None):
+        if unlimited_dims is None:
+            unlimited_dims = set()
         for d, l in zip(variable.dims, variable.shape):
             if d not in self.dimensions:
+                if d in unlimited_dims:
+                    l = None
                 self.set_dimension(d, l)


 class WritableCFDataStore(AbstractWritableDataStore):
-    def store(self, variables, attributes, check_encoding_set=frozenset()):
+    def store(self, variables, attributes, *args, **kwargs):
         # All NetCDF files get CF encoded by default, without this attempting
         # to write times, for example, would fail.
         cf_variables, cf_attrs = cf_encoder(variables, attributes)
         AbstractWritableDataStore.store(self, cf_variables, cf_attrs,
-                                        check_encoding_set)
+                                        *args, **kwargs)


 class DataStorePickleMixin(object):
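The key convention in set_necessary_dimensions above is that a dimension slated as unlimited is created with length None. A standalone sketch of that mapping (the helper name is hypothetical, not xarray code):

# Dimensions named in unlimited_dims get length None, the spelling the
# netCDF backends interpret as "unlimited" when creating dimensions.
def planned_dimensions(dims, shape, unlimited_dims=frozenset()):
    return {d: (None if d in unlimited_dims else l)
            for d, l in zip(dims, shape)}

print(planned_dimensions(('time', 'x'), (10, 3), unlimited_dims={'time'}))
# {'time': None, 'x': 3}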
16 changes: 11 additions & 5 deletions xarray/backends/h5netcdf_.py
@@ -2,15 +2,16 @@
 from __future__ import division
 from __future__ import print_function
 import functools
+import warnings

 from .. import Variable
 from ..core import indexing
 from ..core.utils import FrozenOrderedDict, close_on_error, Frozen
 from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict

 from .common import WritableCFDataStore, DataStorePickleMixin
-from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, _extract_nc4_encoding,
-                       BaseNetCDF4Array)
+from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype,
+                       _extract_nc4_variable_encoding, BaseNetCDF4Array)

@@ -33,7 +34,7 @@ def _read_attributes(h5netcdf_var):
     return attrs


-_extract_h5nc_encoding = functools.partial(_extract_nc4_encoding,
+_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding,
                                            lsd_okay=False, backend='h5netcdf')

@@ -92,15 +93,20 @@ def set_dimension(self, name, length):
     def set_attribute(self, key, value):
         self.ds.setncattr(key, value)

-    def prepare_variable(self, name, variable, check_encoding=False):
+    def prepare_variable(self, name, variable, check_encoding=False,
+                         unlimited_dims=None):
         import h5py

         attrs = variable.attrs.copy()
         variable, dtype = _nc4_values_and_dtype(variable)
         if dtype is str:
             dtype = h5py.special_dtype(vlen=unicode_type)

-        self.set_necessary_dimensions(variable)
+        if unlimited_dims is not None:
+            warnings.warn('h5netcdf does not support unlimited dimensions, '
+                          'got: %s.' % unlimited_dims)
+            unlimited_dims = None
+        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)

         fill_value = attrs.pop('_FillValue', None)
         if fill_value in ['\x00']:
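Per the hunk above, asking the h5netcdf backend (as of this PR) for unlimited dimensions only produces a warning and the request is dropped. A hedged sketch of what that looks like, assuming h5netcdf is installed (file name hypothetical):

import warnings

import numpy as np
import xarray as xr

ds = xr.Dataset({'v': ('time', np.arange(3.0))})
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    ds.to_netcdf('h5_demo.nc', engine='h5netcdf', unlimited_dims=['time'])
print(caught[-1].message)
# h5netcdf does not support unlimited dimensions, got: ['time'].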
2 changes: 1 addition & 1 deletion xarray/backends/memory.py
@@ -29,7 +29,7 @@ def get_attrs(self):
     def get_variables(self):
         return self._variables

-    def prepare_variable(self, k, v, check_encoding=False):
+    def prepare_variable(self, k, v, *args, **kwargs):
         new_var = Variable(v.dims, np.empty_like(v), v.attrs)
         # we copy the variable and stuff all encodings in the
         # attributes to imitate what happens when writing to disk.
30 changes: 18 additions & 12 deletions xarray/backends/netCDF4_.py
@@ -7,7 +7,7 @@
 import numpy as np

 from .. import Variable
-from ..conventions import pop_to, cf_encoder
+from ..conventions import pop_to
 from ..core import indexing
 from ..core.utils import (FrozenOrderedDict, NDArrayMixin,
                           close_on_error, is_remote_uri)

@@ -138,13 +138,13 @@ def _force_native_endianness(var):
     # check to see if encoding has a value for endian its 'native'
     if not var.encoding.get('endian', 'native') is 'native':
         raise NotImplementedError("Attempt to write non-native endian type, "
-                                  "this is not supported by the netCDF4 python "
-                                  "library.")
+                                  "this is not supported by the netCDF4 "
+                                  "python library.")
     return var


-def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True,
-                          backend='netCDF4'):
+def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
+                                   lsd_okay=True, backend='netCDF4'):
     encoding = variable.encoding.copy()

     safe_to_drop = set(['source', 'original_shape'])

@@ -154,9 +154,8 @@ def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True,
         valid_encodings.add('least_significant_digit')

     if (encoding.get('chunksizes') is not None and
-            (encoding.get('original_shape', variable.shape)
-             != variable.shape) and
-            not raise_on_invalid):
+            (encoding.get('original_shape', variable.shape) !=
+             variable.shape) and not raise_on_invalid):
         del encoding['chunksizes']

     for k in safe_to_drop:

@@ -251,6 +250,12 @@ def get_dimensions(self):
         return FrozenOrderedDict((k, len(v))
                                  for k, v in iteritems(self.ds.dimensions))

+    def get_encoding(self):
+        encoding = {}
+        encoding['unlimited_dims'] = {
+            k for k, v in self.ds.dimensions.items() if v.isunlimited()}
+        return encoding
+
     def set_dimension(self, name, length):
         self.ds.createDimension(name, size=length)

[Review thread on get_encoding]
Member: I would lean slightly toward just creating a get_unlimited_dims method rather than get_encoding, unless we can think of other Dataset-wide encodings we might possibly add in the future.
Member (author): The other encoding value that comes to mind is the dataset format (e.g. NETCDF4 vs. NETCDF3). Maybe there are others as well, but nothing comes to mind.

@@ -259,7 +264,8 @@ def set_attribute(self, key, value):
         value = encode_nc3_attr_value(value)
         self.ds.setncattr(key, value)

-    def prepare_variable(self, name, variable, check_encoding=False):
+    def prepare_variable(self, name, variable, check_encoding=False,
+                         unlimited_dims=None):
         attrs = variable.attrs.copy()

         variable = _force_native_endianness(variable)

@@ -270,16 +276,16 @@ def prepare_variable(self, name, variable, check_encoding=False):
         variable = encode_nc3_variable(variable)
         datatype = variable.dtype

-        self.set_necessary_dimensions(variable)
+        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)

         fill_value = attrs.pop('_FillValue', None)
         if fill_value in ['', '\x00']:
             # these are equivalent to the default FillValue, but netCDF4
             # doesn't like setting fill_value to an empty string
             fill_value = None

-        encoding = _extract_nc4_encoding(variable,
-                                         raise_on_invalid=check_encoding)
+        encoding = _extract_nc4_variable_encoding(
+            variable, raise_on_invalid=check_encoding)
         nc4_var = self.ds.createVariable(
             varname=name,
             datatype=datatype,
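What get_encoding reads here can be checked against netCDF4-python directly. A small sketch assuming the netCDF4 library is installed and a file with an unlimited dimension exists (e.g. the one written in the earlier usage sketch):

import netCDF4

# Mirror of the set comprehension in get_encoding above: collect the
# names of dimensions the file marks as unlimited.
with netCDF4.Dataset('direct.nc') as nc:
    unlimited = {name for name, dim in nc.dimensions.items()
                 if dim.isunlimited()}
print(unlimited)  # expect {'time'} for the file written earlier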
6 changes: 6 additions & 0 deletions xarray/backends/pynio_.py
@@ -57,5 +57,11 @@ def get_attrs(self):
     def get_dimensions(self):
         return Frozen(self.ds.dimensions)

+    def get_encoding(self):
+        encoding = {}
+        encoding['unlimited_dims'] = set(
+            [k for k in self.ds.dimensions if self.ds.unlimited(k)])
+        return encoding
+
     def close(self):
         self.ds.close()

[Review thread on get_encoding]
Member: I don't think DAP can represent unlimited dimensions:
http://docs.opendap.org/index.php/DAP4:_Specification_Volume_1#Dimensions
Member (author): Agreed, but this is pynio, which does: https://www.pyngl.ucar.edu/whatsnew.shtml#Version1.4.1
16 changes: 13 additions & 3 deletions xarray/backends/scipy_.py
@@ -8,7 +8,7 @@
 import warnings

 from .. import Variable
-from ..core.pycompat import iteritems, basestring, OrderedDict
+from ..core.pycompat import iteritems, OrderedDict
 from ..core.utils import Frozen, FrozenOrderedDict
 from ..core.indexing import NumpyIndexingAdapter

@@ -119,6 +119,12 @@ def get_attrs(self):
     def get_dimensions(self):
         return Frozen(self.ds.dimensions)

+    def get_encoding(self):
+        encoding = {}
+        encoding['unlimited_dims'] = {
+            k for k, v in self.ds.dimensions.items() if v is None}
+        return encoding
+
     def set_dimension(self, name, length):
         if name in self.dimensions:
             raise ValueError('%s does not support modifying dimensions'

@@ -134,13 +140,17 @@ def set_attribute(self, key, value):
         value = encode_nc3_attr_value(value)
         setattr(self.ds, key, value)

-    def prepare_variable(self, name, variable, check_encoding=False):
+    def prepare_variable(self, name, variable, check_encoding=False,
+                         unlimited_dims=None):
         variable = encode_nc3_variable(variable)
         if check_encoding and variable.encoding:
             raise ValueError('unexpected encoding for scipy backend: %r'
                              % list(variable.encoding))

-        self.set_necessary_dimensions(variable)
+        if unlimited_dims is not None and len(unlimited_dims) > 1:
+            raise ValueError('NETCDF3 only supports one unlimited dimension')
+        self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)

         data = variable.data
         # nb. this still creates a numpy array in all memory, even though we
         # don't write the data yet; scipy.io.netcdf does not support
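The new guard above enforces the NETCDF3 limit of a single unlimited dimension. A sketch of the error a caller would see (file name hypothetical; assumes scipy is installed):

import numpy as np
import xarray as xr

ds = xr.Dataset({'v': (('t', 'y'), np.zeros((2, 2)))})
try:
    # Two unlimited dimensions are rejected by the scipy (NETCDF3) backend.
    ds.to_netcdf('nc3_demo.nc', engine='scipy', unlimited_dims=['t', 'y'])
except ValueError as err:
    print(err)  # NETCDF3 only supports one unlimited dimension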
4 changes: 4 additions & 0 deletions xarray/conventions.py
@@ -937,10 +937,12 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
         attrs = obj.attrs
         extra_coords = set(obj.coords)
         file_obj = obj._file_obj
+        encoding = obj.encoding
     elif isinstance(obj, AbstractDataStore):
         vars, attrs = obj.load()
         extra_coords = set()
         file_obj = obj
+        encoding = obj.get_encoding()
     else:
         raise TypeError('can only decode Dataset or DataStore objects')

@@ -950,6 +952,8 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
     ds = Dataset(vars, attrs=attrs)
     ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
     ds._file_obj = file_obj
+    ds.encoding = encoding

     return ds
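With encoding now propagated through decode_cf, unlimited dimensions should survive a round trip. A hedged sketch (file name hypothetical; assumes a netCDF backend is installed):

import numpy as np
import xarray as xr

ds = xr.Dataset({'v': ('time', np.arange(3.0))})
ds.to_netcdf('roundtrip.nc', unlimited_dims=['time'])

# Reopening the file surfaces the unlimited dimension through the new
# dataset-level encoding attribute.
with xr.open_dataset('roundtrip.nc') as reopened:
    print(reopened.encoding.get('unlimited_dims'))  # expect {'time'}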
6 changes: 3 additions & 3 deletions xarray/core/common.py
@@ -4,8 +4,7 @@
 import numpy as np
 import pandas as pd

-from .pycompat import (basestring, iteritems, suppress, dask_array_type,
-                       OrderedDict)
+from .pycompat import (basestring, suppress, dask_array_type, OrderedDict)
 from . import formatting
 from .utils import SortedKeysDict, not_implemented, Frozen

@@ -751,7 +750,8 @@ def full_like(other, fill_value, dtype=None):
     elif isinstance(other, DataArray):
         return DataArray(
             _full_like_variable(other.variable, fill_value, dtype),
-            dims=other.dims, coords=other.coords, attrs=other.attrs, name=other.name)
+            dims=other.dims, coords=other.coords, attrs=other.attrs,
+            name=other.name)
     elif isinstance(other, Variable):
         return _full_like_variable(other, fill_value, dtype)
     else: