Skip to content

Commit 6d5ad44

Browse files
author
Joe Hamman
authored
Dataset.encoding and unlimited dimensions for to_netcdf (#1170)
* initial hack at enabling unlimited dims in to_netcdf * unlimited dims for netcdf4, still working on scipy * fix two bugs in h5netcdf tests * fix failing tests, try workaround for scipy/scipy#6880 * cleanup * simple slice in scipy workaround * initial fixes after @shoyer's review * fix failing test by passing unlimited_dims through to in memory store * remove encoding from dataset constructor * more tests for unlimited_dims and update whats-new * refactor unlimited dimensions / dataset encoding to avoid using DataStore statespace, respond to a few of @shoyer's comments * raise user warning if unlimited dims is used with h5netcdf * cleanup backends after unlimited_dims changes
1 parent c5146e8 commit 6d5ad44

File tree

14 files changed

+151
-52
lines changed

14 files changed

+151
-52
lines changed

doc/api.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ Attributes
4646
Dataset.data_vars
4747
Dataset.coords
4848
Dataset.attrs
49+
Dataset.encoding
4950
Dataset.indexes
5051
Dataset.get_index
5152

doc/whats-new.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,10 @@ Enhancements
209209
and attributes. The method prints to a buffer (e.g. ``stdout``) with output
210210
similar to what the command line utility ``ncdump -h`` produces (:issue:`1150`).
211211
By `Joe Hamman <https://github.com/jhamman>`_.
212+
- Added the ability to write unlimited netCDF dimensions with the ``scipy`` and
213+
``netcdf4`` backends via the new :py:attr:`~xray.Dataset.encoding` attribute
214+
or via the ``unlimited_dims`` argument to :py:meth:`~xray.Dataset.to_netcdf`.
215+
By `Joe Hamman <https://github.com/jhamman>`_.
212216
- New :py:meth:`~DataArray.quantile` method to calculate quantiles from
213217
DataArray objects (:issue:`1187`).
214218
By `Joe Hamman <https://github.com/jhamman>`_.

xarray/backends/api.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ def open_mfdataset(paths, chunks=None, concat_dim=_CONCAT_DIM_DEFAULT,
522522

523523

524524
def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
525-
engine=None, writer=None, encoding=None):
525+
engine=None, writer=None, encoding=None, unlimited_dims=None):
526526
"""This function creates an appropriate datastore for writing a dataset to
527527
disk as a netCDF file
528528
@@ -561,8 +561,12 @@ def to_netcdf(dataset, path=None, mode='w', format=None, group=None,
561561
sync = writer is None
562562

563563
store = store_cls(path, mode, format, group, writer)
564+
565+
if unlimited_dims is None:
566+
unlimited_dims = dataset.encoding.get('unlimited_dims', None)
564567
try:
565-
dataset.dump_to_store(store, sync=sync, encoding=encoding)
568+
dataset.dump_to_store(store, sync=sync, encoding=encoding,
569+
unlimited_dims=unlimited_dims)
566570
if isinstance(path, BytesIO):
567571
return path.getvalue()
568572
finally:

xarray/backends/common.py

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
import logging
66
import time
77
import traceback
8-
import threading
98
from collections import Mapping
109
from distutils.version import StrictVersion
1110

@@ -84,6 +83,9 @@ def get_attrs(self): # pragma: no cover
8483
def get_variables(self): # pragma: no cover
8584
raise NotImplementedError
8685

86+
def get_encoding(self):
87+
return {}
88+
8789
def load(self):
8890
"""
8991
This loads the variables and attributes simultaneously.
@@ -105,8 +107,8 @@ def load(self):
105107
This function will be called anytime variables or attributes
106108
are requested, so care should be taken to make sure its fast.
107109
"""
108-
variables = FrozenOrderedDict((_decode_variable_name(k), v)
109-
for k, v in iteritems(self.get_variables()))
110+
variables = FrozenOrderedDict((_decode_variable_name(k), v) for k, v in
111+
iteritems(self.get_variables()))
110112
attributes = FrozenOrderedDict(self.get_attrs())
111113
return variables, attributes
112114

@@ -152,7 +154,11 @@ def add(self, source, target):
152154
self.sources.append(source)
153155
self.targets.append(target)
154156
else:
155-
target[...] = source
157+
try:
158+
target[...] = source
159+
except TypeError:
160+
# workaround for GH: scipy/scipy#6880
161+
target[:] = source
156162

157163
def sync(self):
158164
if self.sources:
@@ -191,34 +197,42 @@ def store_dataset(self, dataset):
191197
# dataset.variables
192198
self.store(dataset, dataset.attrs)
193199

194-
def store(self, variables, attributes, check_encoding_set=frozenset()):
200+
def store(self, variables, attributes, check_encoding_set=frozenset(),
201+
unlimited_dims=None):
195202
self.set_attributes(attributes)
196-
self.set_variables(variables, check_encoding_set)
203+
self.set_variables(variables, check_encoding_set,
204+
unlimited_dims=unlimited_dims)
197205

198206
def set_attributes(self, attributes):
199207
for k, v in iteritems(attributes):
200208
self.set_attribute(k, v)
201209

202-
def set_variables(self, variables, check_encoding_set):
210+
def set_variables(self, variables, check_encoding_set,
211+
unlimited_dims=None):
203212
for vn, v in iteritems(variables):
204213
name = _encode_variable_name(vn)
205214
check = vn in check_encoding_set
206-
target, source = self.prepare_variable(name, v, check)
215+
target, source = self.prepare_variable(
216+
name, v, check, unlimited_dims=unlimited_dims)
207217
self.writer.add(source, target)
208218

209-
def set_necessary_dimensions(self, variable):
219+
def set_necessary_dimensions(self, variable, unlimited_dims=None):
220+
if unlimited_dims is None:
221+
unlimited_dims = set()
210222
for d, l in zip(variable.dims, variable.shape):
211223
if d not in self.dimensions:
224+
if d in unlimited_dims:
225+
l = None
212226
self.set_dimension(d, l)
213227

214228

215229
class WritableCFDataStore(AbstractWritableDataStore):
216-
def store(self, variables, attributes, check_encoding_set=frozenset()):
230+
def store(self, variables, attributes, *args, **kwargs):
217231
# All NetCDF files get CF encoded by default, without this attempting
218232
# to write times, for example, would fail.
219233
cf_variables, cf_attrs = cf_encoder(variables, attributes)
220234
AbstractWritableDataStore.store(self, cf_variables, cf_attrs,
221-
check_encoding_set)
235+
*args, **kwargs)
222236

223237

224238
class DataStorePickleMixin(object):

xarray/backends/h5netcdf_.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,15 +2,16 @@
22
from __future__ import division
33
from __future__ import print_function
44
import functools
5+
import warnings
56

67
from .. import Variable
78
from ..core import indexing
89
from ..core.utils import FrozenOrderedDict, close_on_error, Frozen
910
from ..core.pycompat import iteritems, bytes_type, unicode_type, OrderedDict
1011

1112
from .common import WritableCFDataStore, DataStorePickleMixin
12-
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype, _extract_nc4_encoding,
13-
BaseNetCDF4Array)
13+
from .netCDF4_ import (_nc4_group, _nc4_values_and_dtype,
14+
_extract_nc4_variable_encoding, BaseNetCDF4Array)
1415

1516

1617
def maybe_decode_bytes(txt):
@@ -33,7 +34,7 @@ def _read_attributes(h5netcdf_var):
3334
return attrs
3435

3536

36-
_extract_h5nc_encoding = functools.partial(_extract_nc4_encoding,
37+
_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding,
3738
lsd_okay=False, backend='h5netcdf')
3839

3940

@@ -92,15 +93,20 @@ def set_dimension(self, name, length):
9293
def set_attribute(self, key, value):
9394
self.ds.setncattr(key, value)
9495

95-
def prepare_variable(self, name, variable, check_encoding=False):
96+
def prepare_variable(self, name, variable, check_encoding=False,
97+
unlimited_dims=None):
9698
import h5py
9799

98100
attrs = variable.attrs.copy()
99101
variable, dtype = _nc4_values_and_dtype(variable)
100102
if dtype is str:
101103
dtype = h5py.special_dtype(vlen=unicode_type)
102104

103-
self.set_necessary_dimensions(variable)
105+
if unlimited_dims is not None:
106+
warnings.warn('h5netcdf does not support unlimited dimensions, '
107+
'got: %s.' % unlimited_dims)
108+
unlimited_dims = None
109+
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
104110

105111
fill_value = attrs.pop('_FillValue', None)
106112
if fill_value in ['\x00']:

xarray/backends/memory.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ def get_attrs(self):
2929
def get_variables(self):
3030
return self._variables
3131

32-
def prepare_variable(self, k, v, check_encoding=False):
32+
def prepare_variable(self, k, v, *args, **kwargs):
3333
new_var = Variable(v.dims, np.empty_like(v), v.attrs)
3434
# we copy the variable and stuff all encodings in the
3535
# attributes to imitate what happens when writing to disk.

xarray/backends/netCDF4_.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import numpy as np
88

99
from .. import Variable
10-
from ..conventions import pop_to, cf_encoder
10+
from ..conventions import pop_to
1111
from ..core import indexing
1212
from ..core.utils import (FrozenOrderedDict, NDArrayMixin,
1313
close_on_error, is_remote_uri)
@@ -138,13 +138,13 @@ def _force_native_endianness(var):
138138
# check to see if encoding has a value for endian its 'native'
139139
if not var.encoding.get('endian', 'native') is 'native':
140140
raise NotImplementedError("Attempt to write non-native endian type, "
141-
"this is not supported by the netCDF4 python "
142-
"library.")
141+
"this is not supported by the netCDF4 "
142+
"python library.")
143143
return var
144144

145145

146-
def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True,
147-
backend='netCDF4'):
146+
def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
147+
lsd_okay=True, backend='netCDF4'):
148148
encoding = variable.encoding.copy()
149149

150150
safe_to_drop = set(['source', 'original_shape'])
@@ -154,9 +154,8 @@ def _extract_nc4_encoding(variable, raise_on_invalid=False, lsd_okay=True,
154154
valid_encodings.add('least_significant_digit')
155155

156156
if (encoding.get('chunksizes') is not None and
157-
(encoding.get('original_shape', variable.shape)
158-
!= variable.shape) and
159-
not raise_on_invalid):
157+
(encoding.get('original_shape', variable.shape) !=
158+
variable.shape) and not raise_on_invalid):
160159
del encoding['chunksizes']
161160

162161
for k in safe_to_drop:
@@ -251,6 +250,12 @@ def get_dimensions(self):
251250
return FrozenOrderedDict((k, len(v))
252251
for k, v in iteritems(self.ds.dimensions))
253252

253+
def get_encoding(self):
254+
encoding = {}
255+
encoding['unlimited_dims'] = {
256+
k for k, v in self.ds.dimensions.items() if v.isunlimited()}
257+
return encoding
258+
254259
def set_dimension(self, name, length):
255260
self.ds.createDimension(name, size=length)
256261

@@ -259,7 +264,8 @@ def set_attribute(self, key, value):
259264
value = encode_nc3_attr_value(value)
260265
self.ds.setncattr(key, value)
261266

262-
def prepare_variable(self, name, variable, check_encoding=False):
267+
def prepare_variable(self, name, variable, check_encoding=False,
268+
unlimited_dims=None):
263269
attrs = variable.attrs.copy()
264270

265271
variable = _force_native_endianness(variable)
@@ -270,16 +276,16 @@ def prepare_variable(self, name, variable, check_encoding=False):
270276
variable = encode_nc3_variable(variable)
271277
datatype = variable.dtype
272278

273-
self.set_necessary_dimensions(variable)
279+
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
274280

275281
fill_value = attrs.pop('_FillValue', None)
276282
if fill_value in ['', '\x00']:
277283
# these are equivalent to the default FillValue, but netCDF4
278284
# doesn't like setting fill_value to an empty string
279285
fill_value = None
280286

281-
encoding = _extract_nc4_encoding(variable,
282-
raise_on_invalid=check_encoding)
287+
encoding = _extract_nc4_variable_encoding(
288+
variable, raise_on_invalid=check_encoding)
283289
nc4_var = self.ds.createVariable(
284290
varname=name,
285291
datatype=datatype,

xarray/backends/pynio_.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,5 +57,11 @@ def get_attrs(self):
5757
def get_dimensions(self):
5858
return Frozen(self.ds.dimensions)
5959

60+
def get_encoding(self):
61+
encoding = {}
62+
encoding['unlimited_dims'] = set(
63+
[k for k in self.ds.dimensions if self.ds.unlimited(k)])
64+
return encoding
65+
6066
def close(self):
6167
self.ds.close()

xarray/backends/scipy_.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
import warnings
99

1010
from .. import Variable
11-
from ..core.pycompat import iteritems, basestring, OrderedDict
11+
from ..core.pycompat import iteritems, OrderedDict
1212
from ..core.utils import Frozen, FrozenOrderedDict
1313
from ..core.indexing import NumpyIndexingAdapter
1414

@@ -119,6 +119,12 @@ def get_attrs(self):
119119
def get_dimensions(self):
120120
return Frozen(self.ds.dimensions)
121121

122+
def get_encoding(self):
123+
encoding = {}
124+
encoding['unlimited_dims'] = {
125+
k for k, v in self.ds.dimensions.items() if v is None}
126+
return encoding
127+
122128
def set_dimension(self, name, length):
123129
if name in self.dimensions:
124130
raise ValueError('%s does not support modifying dimensions'
@@ -134,13 +140,17 @@ def set_attribute(self, key, value):
134140
value = encode_nc3_attr_value(value)
135141
setattr(self.ds, key, value)
136142

137-
def prepare_variable(self, name, variable, check_encoding=False):
143+
def prepare_variable(self, name, variable, check_encoding=False,
144+
unlimited_dims=None):
138145
variable = encode_nc3_variable(variable)
139146
if check_encoding and variable.encoding:
140147
raise ValueError('unexpected encoding for scipy backend: %r'
141148
% list(variable.encoding))
142149

143-
self.set_necessary_dimensions(variable)
150+
if unlimited_dims is not None and len(unlimited_dims) > 1:
151+
raise ValueError('NETCDF3 only supports one unlimited dimension')
152+
self.set_necessary_dimensions(variable, unlimited_dims=unlimited_dims)
153+
144154
data = variable.data
145155
# nb. this still creates a numpy array in all memory, even though we
146156
# don't write the data yet; scipy.io.netcdf does not not support

xarray/conventions.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -937,10 +937,12 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
937937
attrs = obj.attrs
938938
extra_coords = set(obj.coords)
939939
file_obj = obj._file_obj
940+
encoding = obj.encoding
940941
elif isinstance(obj, AbstractDataStore):
941942
vars, attrs = obj.load()
942943
extra_coords = set()
943944
file_obj = obj
945+
encoding = obj.get_encoding()
944946
else:
945947
raise TypeError('can only decode Dataset or DataStore objects')
946948

@@ -950,6 +952,8 @@ def decode_cf(obj, concat_characters=True, mask_and_scale=True,
950952
ds = Dataset(vars, attrs=attrs)
951953
ds = ds.set_coords(coord_names.union(extra_coords).intersection(vars))
952954
ds._file_obj = file_obj
955+
ds.encoding = encoding
956+
953957
return ds
954958

955959

0 commit comments

Comments
 (0)