Skip to content

Commit d5ad4a0

Browse files
committed
Merge branch 'master' into rolling_window
2 parents aece1c4 + 04974b9 commit d5ad4a0

File tree

10 files changed

+301
-226
lines changed

10 files changed

+301
-226
lines changed

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
- [ ] Closes #xxxx (remove if there is no corresponding issue, which should only be the case for minor changes)
22
- [ ] Tests added (for all bug fixes or enhancements)
33
- [ ] Tests passed (for all non-documentation changes)
4-
- [ ] Passes ``git diff upstream/master **/*py | flake8 --diff`` (remove if you did not edit any Python files)
54
- [ ] Fully documented, including `whats-new.rst` for all changes and `api.rst` for new API (remove if this change should not be visible to users, e.g., if it is an internal clean-up, or if this is part of a larger project that will be documented later)

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ nosetests.xml
3434
.cache
3535
.ropeproject/
3636

37+
# asv environments
38+
.asv
39+
3740
# Translations
3841
*.mo
3942

asv_bench/benchmarks/__init__.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
import numpy as np
88

9-
np.random.seed(10)
109
_counter = itertools.count()
1110

1211

@@ -25,15 +24,27 @@ def requires_dask():
2524
raise NotImplementedError
2625

2726

28-
def randn(shape, frac_nan=None, chunks=None, seed=0):
    """Return a reproducible array of standard-normal values.

    Parameters
    ----------
    shape : int or tuple of int
        Shape of the returned array.
    frac_nan : float, optional
        Fraction of elements to replace with NaN; exactly
        ``int(x.size * frac_nan)`` elements are set.
    chunks : int or tuple, optional
        If given, return a dask array with these chunks instead of a
        NumPy array.
    seed : int, optional
        Seed for the random generator, so benchmark inputs are stable
        across runs.
    """
    rng = np.random.RandomState(seed)
    if chunks is None:
        x = rng.standard_normal(shape)
    else:
        import dask.array as da
        rng = da.random.RandomState(seed)
        x = rng.standard_normal(shape, chunks=chunks)

    if frac_nan is not None:
        # Sample flat indices *without* replacement so that exactly
        # int(x.size * frac_nan) elements become NaN; sampling with
        # replacement (the previous rng.choice call) silently produced
        # fewer NaNs whenever duplicate indices were drawn.
        # Use a NumPy RandomState for the indices even on the dask path,
        # because dask's RandomState.choice does not support replace=False.
        inds = np.random.RandomState(seed).choice(
            x.size, int(x.size * frac_nan), replace=False)
        # NOTE(review): dask arrays do not support `.flat` item assignment,
        # so the chunked path combined with frac_nan looks unexercised --
        # confirm against the benchmark callers.
        x.flat[inds] = np.nan

    return x
41+
42+
43+
def randint(low, high=None, size=None, frac_minus=None, seed=0):
    """Return a reproducible array of random integers in ``[low, high)``.

    Parameters
    ----------
    low, high, size :
        Passed straight through to ``RandomState.randint``.
    frac_minus : float, optional
        Fraction of elements to overwrite with -1; exactly
        ``int(x.size * frac_minus)`` elements are set.
    seed : int, optional
        Seed for the random generator, so benchmark inputs are stable
        across runs.
    """
    rng = np.random.RandomState(seed)
    x = rng.randint(low, high, size)
    if frac_minus is not None:
        # Sample without replacement so the -1 fraction is exact; the
        # previous rng.choice(range(x.size), n) sampled with replacement
        # (duplicates reduced the count) and also materialized a Python
        # range needlessly -- passing the int size is equivalent.
        inds = rng.choice(x.size, int(x.size * frac_minus), replace=False)
        x.flat[inds] = -1

    return x

asv_bench/benchmarks/indexing.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import xarray as xr

from . import randn, randint, requires_dask


# Dimension sizes of the Dataset shared by all indexing benchmarks below.
nx = 3000
ny = 2000
nt = 1000

# Basic (slice / scalar) indexers, keyed by the asv parameter name.
basic_indexes = {
    '1slice': {'x': slice(0, 3)},
    '1slice-1scalar': {'x': 0, 'y': slice(None, None, 3)},
    '2slicess-1scalar': {'x': slice(3, -3, 3), 'y': 1, 't': slice(None, -3, 3)}
}

# Right-hand sides for the basic assignment benchmarks; each value's shape
# matches what the corresponding indexer in `basic_indexes` selects.
basic_assignment_values = {
    '1slice': xr.DataArray(randn((3, ny), frac_nan=0.1), dims=['x', 'y']),
    '1slice-1scalar': xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1),
                                   dims=['y']),
    '2slicess-1scalar': xr.DataArray(randn(int((nx - 6) / 3), frac_nan=0.1),
                                     dims=['x'])
}

# Outer (integer-array) indexers.
outer_indexes = {
    '1d': {'x': randint(0, nx, 400)},
    '2d': {'x': randint(0, nx, 500), 'y': randint(0, ny, 400)},
    '2d-1scalar': {'x': randint(0, nx, 100), 'y': 1, 't': randint(0, nt, 400)}
}

# Right-hand sides for the outer assignment benchmarks.
outer_assignment_values = {
    '1d': xr.DataArray(randn((400, ny), frac_nan=0.1), dims=['x', 'y']),
    '2d': xr.DataArray(randn((500, 400), frac_nan=0.1), dims=['x', 'y']),
    '2d-1scalar': xr.DataArray(randn(100, frac_nan=0.1), dims=['x'])
}

# Vectorized (DataArray-valued) indexers; shared dims 'a'/'b' trigger
# pointwise indexing rather than outer indexing.
vectorized_indexes = {
    '1-1d': {'x': xr.DataArray(randint(0, nx, 400), dims='a')},
    '2-1d': {'x': xr.DataArray(randint(0, nx, 400), dims='a'),
             'y': xr.DataArray(randint(0, ny, 400), dims='a')},
    '3-2d': {'x': xr.DataArray(randint(0, nx, 400).reshape(4, 100),
                               dims=['a', 'b']),
             'y': xr.DataArray(randint(0, ny, 400).reshape(4, 100),
                               dims=['a', 'b']),
             't': xr.DataArray(randint(0, nt, 400).reshape(4, 100),
                               dims=['a', 'b'])},
}

# Right-hand sides for the vectorized assignment benchmarks.
vectorized_assignment_values = {
    '1-1d': xr.DataArray(randn((400, 2000)), dims=['a', 'y'],
                         coords={'a': randn(400)}),
    '2-1d': xr.DataArray(randn(400), dims=['a', ], coords={'a': randn(400)}),
    '3-2d': xr.DataArray(randn((4, 100)), dims=['a', 'b'],
                         coords={'a': randn(4), 'b': randn(100)})
}
61+
62+
63+
class Base(object):
    """Shared asv fixture: builds the Dataset the benchmarks index into."""

    def setup(self, key):
        # Called by asv before each timed run; `key` is the benchmark
        # parameter (unused here, but part of asv's setup signature).
        self.ds = xr.Dataset(
            {'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)),
             'var2': (('x', 't'), randn((nx, nt))),
             'var3': (('t', ), randn(nt))},
            coords={'x': np.arange(nx),
                    'y': np.linspace(0, 1, ny),
                    't': pd.date_range('1970-01-01', periods=nt, freq='D'),
                    'x_coords': ('x', np.linspace(1.1, 2.1, nx))})
73+
74+
75+
class Indexing(Base):
    """Time ``isel`` with basic, outer and vectorized indexers.

    asv reads the ``params`` / ``param_names`` attributes attached to each
    timing method to parametrize it over the indexer dicts defined above.
    ``.load()`` forces evaluation so lazy (dask) subclasses are timed fairly.
    """

    def time_indexing_basic(self, key):
        self.ds.isel(**basic_indexes[key]).load()

    time_indexing_basic.param_names = ['key']
    time_indexing_basic.params = [list(basic_indexes.keys())]

    def time_indexing_outer(self, key):
        self.ds.isel(**outer_indexes[key]).load()

    time_indexing_outer.param_names = ['key']
    time_indexing_outer.params = [list(outer_indexes.keys())]

    def time_indexing_vectorized(self, key):
        self.ds.isel(**vectorized_indexes[key]).load()

    time_indexing_vectorized.param_names = ['key']
    time_indexing_vectorized.params = [list(vectorized_indexes.keys())]
93+
94+
95+
class Assignment(Base):
    """Time item assignment (``__setitem__``) on a DataArray.

    Each method looks up the indexer and a matching right-hand-side value
    by the asv parameter `key`, then assigns positionally on the first two
    dimensions ('x', 'y'), defaulting a missing axis to a full slice.
    """

    def time_assignment_basic(self, key):
        ind = basic_indexes[key]
        val = basic_assignment_values[key]
        self.ds['var1'][ind.get('x', slice(None)),
                        ind.get('y', slice(None))] = val

    time_assignment_basic.param_names = ['key']
    time_assignment_basic.params = [list(basic_indexes.keys())]

    def time_assignment_outer(self, key):
        ind = outer_indexes[key]
        val = outer_assignment_values[key]
        self.ds['var1'][ind.get('x', slice(None)),
                        ind.get('y', slice(None))] = val

    time_assignment_outer.param_names = ['key']
    time_assignment_outer.params = [list(outer_indexes.keys())]

    def time_assignment_vectorized(self, key):
        ind = vectorized_indexes[key]
        val = vectorized_assignment_values[key]
        # NOTE(review): any 't' entry in the indexer dict is ignored here,
        # since only 'x' and 'y' are used -- confirm this is intentional.
        self.ds['var1'][ind.get('x', slice(None)),
                        ind.get('y', slice(None))] = val

    time_assignment_vectorized.param_names = ['key']
    time_assignment_vectorized.params = [list(vectorized_indexes.keys())]
122+
123+
124+
class IndexingDask(Indexing):
    """Same benchmarks as `Indexing`, but against a chunked (dask) Dataset."""

    def setup(self, key):
        requires_dask()  # asv skips the benchmark when dask is unavailable
        super(IndexingDask, self).setup(key)
        self.ds = self.ds.chunk({'x': 100, 'y': 50, 't': 50})

doc/examples/weather-data.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,27 @@ not show any seasonal cycle.
9393
@savefig examples_anomalies_plot.png
9494
anomalies.mean('location').to_dataframe()[['tmin', 'tmax']].plot()
9595
96+
.. _standardized monthly anomalies:
97+
98+
Calculate standardized monthly anomalies
99+
----------------------------------------
100+
101+
You can create standardized anomalies where the difference between the
102+
observations and the climatological monthly mean is
103+
divided by the climatological standard deviation.
104+
105+
.. ipython:: python
106+
107+
climatology_mean = ds.groupby('time.month').mean('time')
108+
climatology_std = ds.groupby('time.month').std('time')
109+
stand_anomalies = xr.apply_ufunc(
110+
lambda x, m, s: (x - m) / s,
111+
ds.groupby('time.month'),
112+
climatology_mean, climatology_std)
113+
114+
@savefig examples_standardized_anomalies_plot.png
115+
stand_anomalies.mean('location').to_dataframe()[['tmin', 'tmax']].plot()
116+
96117
.. _fill with climatology:
97118

98119
Fill missing values with climatology

doc/whats-new.rst

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ v0.10.1 (unreleased)
2121
Documentation
2222
~~~~~~~~~~~~~
2323

24+
- Added apply_ufunc example to toy weather data page (:issue:`1844`).
25+
By `Liam Brannigan <https://github.com/braaannigan>`_.
2426
- New entry `Why don’t aggregations return Python scalars?` in the
2527
:doc:`faq` (:issue:`1726`).
2628
By `0x0L <https://github.com/0x0L>`_.
@@ -61,6 +63,11 @@ Enhancements
6163
- :py:func:`~plot.line()` learned to draw multiple lines if provided with a
6264
2D variable.
6365
By `Deepak Cherian <https://github.com/dcherian>`_.
66+
- Reduce memory usage when decoding a variable with a scale_factor, by
67+
converting 8-bit and 16-bit integers to float32 instead of float64
68+
(:pull:`1840`), and keeping float16 and float32 as float32 (:issue:`1842`).
69+
Correspondingly, encoded variables may also be saved with a smaller dtype.
70+
By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
6471

6572
.. _Zarr: http://zarr.readthedocs.io/
6673

@@ -84,11 +91,12 @@ Bug fixes
8491
- Fixed encoding of multi-dimensional coordinates in
8592
:py:meth:`~Dataset.to_netcdf` (:issue:`1763`).
8693
By `Mike Neish <https://github.com/neishm>`_.
87-
94+
- Fixed chunking with non-file-based rasterio datasets (:issue:`1816`) and
95+
refactored rasterio test suite.
96+
By `Ryan Abernathey <https://github.com/rabernat>`_.
8897
- Bug fix in open_dataset(engine='pydap') (:issue:`1775`)
8998
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
90-
91-
- Bug fix in vectorized assignment (:issue:`1743`, `1744`).
99+
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
92100
Now item assignment to :py:meth:`~DataArray.__setitem__` checks
93101
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
94102
Now item assignment to :py:meth:`DataArray.__setitem__` checks

xarray/backends/rasterio_.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,11 @@ def open_rasterio(filename, chunks=None, cache=None, lock=None):
222222
if chunks is not None:
223223
from dask.base import tokenize
224224
# augment the token with the file modification time
225-
mtime = os.path.getmtime(filename)
225+
try:
226+
mtime = os.path.getmtime(filename)
227+
except OSError:
228+
# the filename is probably an s3 bucket rather than a regular file
229+
mtime = None
226230
token = tokenize(filename, mtime, chunks)
227231
name_prefix = 'open_rasterio-%s' % token
228232
if lock is None:

xarray/coding/variables.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,25 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
205205
return data
206206

207207

208+
def _choose_float_dtype(dtype, has_offset):
209+
"""Return a float dtype that can losslessly represent `dtype` values."""
210+
# Keep float32 as-is. Upcast half-precision to single-precision,
211+
# because float16 is "intended for storage but not computation"
212+
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
213+
return np.float32
214+
# float32 can exactly represent all integers up to 24 bits
215+
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
216+
# A scale factor is entirely safe (vanishing into the mantissa),
217+
# but a large integer offset could lead to loss of precision.
218+
# Sensitivity analysis can be tricky, so we just use a float64
219+
# if there's any offset at all - better unoptimised than wrong!
220+
if not has_offset:
221+
return np.float32
222+
# For all other types and circumstances, we just use float64.
223+
# (safe because eg. complex numbers are not supported in NetCDF)
224+
return np.float64
225+
226+
208227
class CFScaleOffsetCoder(VariableCoder):
209228
"""Scale and offset variables according to CF conventions.
210229
@@ -216,7 +235,8 @@ def encode(self, variable, name=None):
216235
dims, data, attrs, encoding = unpack_for_encoding(variable)
217236

218237
if 'scale_factor' in encoding or 'add_offset' in encoding:
219-
data = data.astype(dtype=np.float64, copy=True)
238+
dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
239+
data = data.astype(dtype=dtype, copy=True)
220240
if 'add_offset' in encoding:
221241
data -= pop_to(encoding, attrs, 'add_offset', name=name)
222242
if 'scale_factor' in encoding:
@@ -230,7 +250,7 @@ def decode(self, variable, name=None):
230250
if 'scale_factor' in attrs or 'add_offset' in attrs:
231251
scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
232252
add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
233-
dtype = np.float64
253+
dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs)
234254
transform = partial(_scale_offset_decoding,
235255
scale_factor=scale_factor,
236256
add_offset=add_offset,

0 commit comments

Comments
 (0)