Skip to content

Commit d5ad4a0

Browse files
committed
Merge branch 'master' into rolling_window
2 parents aece1c4 + 04974b9 commit d5ad4a0

File tree

10 files changed

+301
-226
lines changed

10 files changed

+301
-226
lines changed

.github/PULL_REQUEST_TEMPLATE.md

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
- [ ] Closes #xxxx (remove if there is no corresponding issue, which should only be the case for minor changes)
22
- [ ] Tests added (for all bug fixes or enhancements)
33
- [ ] Tests passed (for all non-documentation changes)
4-
- [ ] Passes ``git diff upstream/master **/*py | flake8 --diff`` (remove if you did not edit any Python files)
54
- [ ] Fully documented, including `whats-new.rst` for all changes and `api.rst` for new API (remove if this change should not be visible to users, e.g., if it is an internal clean-up, or if this is part of a larger project that will be documented later)

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ nosetests.xml
3434
.cache
3535
.ropeproject/
3636

37+
# asv environments
38+
.asv
39+
3740
# Translations
3841
*.mo
3942

asv_bench/benchmarks/__init__.py

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66

77
import numpy as np
88

9-
np.random.seed(10)
109
_counter = itertools.count()
1110

1211

@@ -25,15 +24,27 @@ def requires_dask():
2524
raise NotImplementedError
2625

2726

28-
def randn(shape, frac_nan=None, chunks=None, seed=0):
    """Return a reproducible array of standard-normal values.

    Parameters
    ----------
    shape : int or tuple of int
        Shape of the returned array.
    frac_nan : float, optional
        Fraction of elements to replace with NaN; exactly
        ``int(x.size * frac_nan)`` elements are set.
    chunks : int or tuple, optional
        If given, return a dask array with these chunks instead of a
        NumPy array.
    seed : int, optional
        Seed for the random generator, so benchmark inputs are stable
        across runs.
    """
    rng = np.random.RandomState(seed)
    if chunks is None:
        x = rng.standard_normal(shape)
    else:
        import dask.array as da
        rng = da.random.RandomState(seed)
        x = rng.standard_normal(shape, chunks=chunks)

    if frac_nan is not None:
        # Sample flat indices *without* replacement so that exactly
        # int(x.size * frac_nan) elements become NaN; sampling with
        # replacement (the previous rng.choice call) silently produced
        # fewer NaNs whenever duplicate indices were drawn.
        # Use a NumPy RandomState for the indices even on the dask path,
        # because dask's RandomState.choice does not support replace=False.
        inds = np.random.RandomState(seed).choice(
            x.size, int(x.size * frac_nan), replace=False)
        # NOTE(review): dask arrays do not support `.flat` item assignment,
        # so the chunked path combined with frac_nan looks unexercised --
        # confirm against the benchmark callers.
        x.flat[inds] = np.nan

    return x
41+
42+
43+
def randint(low, high=None, size=None, frac_minus=None, seed=0):
    """Return a reproducible array of random integers in ``[low, high)``.

    Parameters
    ----------
    low, high, size :
        Passed straight through to ``RandomState.randint``.
    frac_minus : float, optional
        Fraction of elements to overwrite with -1; exactly
        ``int(x.size * frac_minus)`` elements are set.
    seed : int, optional
        Seed for the random generator, so benchmark inputs are stable
        across runs.
    """
    rng = np.random.RandomState(seed)
    x = rng.randint(low, high, size)
    if frac_minus is not None:
        # Sample without replacement so the -1 fraction is exact; the
        # previous rng.choice(range(x.size), n) sampled with replacement
        # (duplicates reduced the count) and also materialized a Python
        # range needlessly -- passing the int size is equivalent.
        inds = rng.choice(x.size, int(x.size * frac_minus), replace=False)
        x.flat[inds] = -1

    return x

asv_bench/benchmarks/indexing.py

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,128 @@
1+
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import pandas as pd
import xarray as xr

from . import randn, randint, requires_dask


# Dimension sizes of the Dataset shared by all indexing benchmarks below.
nx = 3000
ny = 2000
nt = 1000

# Basic (slice / scalar) indexers, keyed by the asv parameter name.
basic_indexes = {
    '1slice': {'x': slice(0, 3)},
    '1slice-1scalar': {'x': 0, 'y': slice(None, None, 3)},
    '2slicess-1scalar': {'x': slice(3, -3, 3), 'y': 1, 't': slice(None, -3, 3)}
}

# Right-hand sides for the basic assignment benchmarks; each value's shape
# matches what the corresponding indexer in `basic_indexes` selects.
basic_assignment_values = {
    '1slice': xr.DataArray(randn((3, ny), frac_nan=0.1), dims=['x', 'y']),
    '1slice-1scalar': xr.DataArray(randn(int(ny / 3) + 1, frac_nan=0.1),
                                   dims=['y']),
    '2slicess-1scalar': xr.DataArray(randn(int((nx - 6) / 3), frac_nan=0.1),
                                     dims=['x'])
}

# Outer (integer-array) indexers.
outer_indexes = {
    '1d': {'x': randint(0, nx, 400)},
    '2d': {'x': randint(0, nx, 500), 'y': randint(0, ny, 400)},
    '2d-1scalar': {'x': randint(0, nx, 100), 'y': 1, 't': randint(0, nt, 400)}
}

# Right-hand sides for the outer assignment benchmarks.
outer_assignment_values = {
    '1d': xr.DataArray(randn((400, ny), frac_nan=0.1), dims=['x', 'y']),
    '2d': xr.DataArray(randn((500, 400), frac_nan=0.1), dims=['x', 'y']),
    '2d-1scalar': xr.DataArray(randn(100, frac_nan=0.1), dims=['x'])
}

# Vectorized (DataArray-valued) indexers; shared dims 'a'/'b' trigger
# pointwise indexing rather than outer indexing.
vectorized_indexes = {
    '1-1d': {'x': xr.DataArray(randint(0, nx, 400), dims='a')},
    '2-1d': {'x': xr.DataArray(randint(0, nx, 400), dims='a'),
             'y': xr.DataArray(randint(0, ny, 400), dims='a')},
    '3-2d': {'x': xr.DataArray(randint(0, nx, 400).reshape(4, 100),
                               dims=['a', 'b']),
             'y': xr.DataArray(randint(0, ny, 400).reshape(4, 100),
                               dims=['a', 'b']),
             't': xr.DataArray(randint(0, nt, 400).reshape(4, 100),
                               dims=['a', 'b'])},
}

# Right-hand sides for the vectorized assignment benchmarks.
vectorized_assignment_values = {
    '1-1d': xr.DataArray(randn((400, 2000)), dims=['a', 'y'],
                         coords={'a': randn(400)}),
    '2-1d': xr.DataArray(randn(400), dims=['a', ], coords={'a': randn(400)}),
    '3-2d': xr.DataArray(randn((4, 100)), dims=['a', 'b'],
                         coords={'a': randn(4), 'b': randn(100)})
}
61+
62+
63+
class Base(object):
    """Shared asv fixture: builds the Dataset the benchmarks index into."""

    def setup(self, key):
        # Called by asv before each timed run; `key` is the benchmark
        # parameter (unused here, but part of asv's setup signature).
        self.ds = xr.Dataset(
            {'var1': (('x', 'y'), randn((nx, ny), frac_nan=0.1)),
             'var2': (('x', 't'), randn((nx, nt))),
             'var3': (('t', ), randn(nt))},
            coords={'x': np.arange(nx),
                    'y': np.linspace(0, 1, ny),
                    't': pd.date_range('1970-01-01', periods=nt, freq='D'),
                    'x_coords': ('x', np.linspace(1.1, 2.1, nx))})
73+
74+
75+
class Indexing(Base):
    """Time ``isel`` with basic, outer and vectorized indexers.

    asv reads the ``params`` / ``param_names`` attributes attached to each
    timing method to parametrize it over the indexer dicts defined above.
    ``.load()`` forces evaluation so lazy (dask) subclasses are timed fairly.
    """

    def time_indexing_basic(self, key):
        self.ds.isel(**basic_indexes[key]).load()

    time_indexing_basic.param_names = ['key']
    time_indexing_basic.params = [list(basic_indexes.keys())]

    def time_indexing_outer(self, key):
        self.ds.isel(**outer_indexes[key]).load()

    time_indexing_outer.param_names = ['key']
    time_indexing_outer.params = [list(outer_indexes.keys())]

    def time_indexing_vectorized(self, key):
        self.ds.isel(**vectorized_indexes[key]).load()

    time_indexing_vectorized.param_names = ['key']
    time_indexing_vectorized.params = [list(vectorized_indexes.keys())]
93+
94+
95+
class Assignment(Base):
    """Time item assignment (``__setitem__``) on a DataArray.

    Each method looks up the indexer and a matching right-hand-side value
    by the asv parameter `key`, then assigns positionally on the first two
    dimensions ('x', 'y'), defaulting a missing axis to a full slice.
    """

    def time_assignment_basic(self, key):
        ind = basic_indexes[key]
        val = basic_assignment_values[key]
        self.ds['var1'][ind.get('x', slice(None)),
                        ind.get('y', slice(None))] = val

    time_assignment_basic.param_names = ['key']
    time_assignment_basic.params = [list(basic_indexes.keys())]

    def time_assignment_outer(self, key):
        ind = outer_indexes[key]
        val = outer_assignment_values[key]
        self.ds['var1'][ind.get('x', slice(None)),
                        ind.get('y', slice(None))] = val

    time_assignment_outer.param_names = ['key']
    time_assignment_outer.params = [list(outer_indexes.keys())]

    def time_assignment_vectorized(self, key):
        ind = vectorized_indexes[key]
        val = vectorized_assignment_values[key]
        # NOTE(review): any 't' entry in the indexer dict is ignored here,
        # since only 'x' and 'y' are used -- confirm this is intentional.
        self.ds['var1'][ind.get('x', slice(None)),
                        ind.get('y', slice(None))] = val

    time_assignment_vectorized.param_names = ['key']
    time_assignment_vectorized.params = [list(vectorized_indexes.keys())]
122+
123+
124+
class IndexingDask(Indexing):
    """Same benchmarks as `Indexing`, but against a chunked (dask) Dataset."""

    def setup(self, key):
        requires_dask()  # asv skips the benchmark when dask is unavailable
        super(IndexingDask, self).setup(key)
        self.ds = self.ds.chunk({'x': 100, 'y': 50, 't': 50})

doc/examples/weather-data.rst

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,27 @@ not show any seasonal cycle.
9393
@savefig examples_anomalies_plot.png
9494
anomalies.mean('location').to_dataframe()[['tmin', 'tmax']].plot()
9595
96+
.. _standardized monthly anomalies:
97+
98+
Calculate standardized monthly anomalies
99+
----------------------------------------
100+
101+
You can create standardized anomalies where the difference between the
102+
observations and the climatological monthly mean is
103+
divided by the climatological standard deviation.
104+
105+
.. ipython:: python
106+
107+
climatology_mean = ds.groupby('time.month').mean('time')
108+
climatology_std = ds.groupby('time.month').std('time')
109+
stand_anomalies = xr.apply_ufunc(
110+
lambda x, m, s: (x - m) / s,
111+
ds.groupby('time.month'),
112+
climatology_mean, climatology_std)
113+
114+
@savefig examples_standardized_anomalies_plot.png
115+
stand_anomalies.mean('location').to_dataframe()[['tmin', 'tmax']].plot()
116+
96117
.. _fill with climatology:
97118

98119
Fill missing values with climatology

doc/whats-new.rst

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ v0.10.1 (unreleased)
2121
Documentation
2222
~~~~~~~~~~~~~
2323

24+
- Added apply_ufunc example to toy weather data page (:issue:`1844`).
25+
By `Liam Brannigan <https://github.com/braaannigan>`_.
2426
- New entry `Why don’t aggregations return Python scalars?` in the
2527
:doc:`faq` (:issue:`1726`).
2628
By `0x0L <https://github.com/0x0L>`_.
@@ -61,6 +63,11 @@ Enhancements
6163
- :py:func:`~plot.line()` learned to draw multiple lines if provided with a
6264
2D variable.
6365
By `Deepak Cherian <https://github.com/dcherian>`_.
66+
- Reduce memory usage when decoding a variable with a scale_factor, by
67+
converting 8-bit and 16-bit integers to float32 instead of float64
68+
(:pull:`1840`), and keeping float16 and float32 as float32 (:issue:`1842`).
69+
Correspondingly, encoded variables may also be saved with a smaller dtype.
70+
By `Zac Hatfield-Dodds <https://github.com/Zac-HD>`_.
6471

6572
.. _Zarr: http://zarr.readthedocs.io/
6673

@@ -84,11 +91,12 @@ Bug fixes
8491
- Fixed encoding of multi-dimensional coordinates in
8592
:py:meth:`~Dataset.to_netcdf` (:issue:`1763`).
8693
By `Mike Neish <https://github.com/neishm>`_.
87-
94+
- Fixed chunking with non-file-based rasterio datasets (:issue:`1816`) and
95+
refactored rasterio test suite.
96+
By `Ryan Abernathey <https://github.com/rabernat>`_.
8897
- Bug fix in open_dataset(engine='pydap') (:issue:`1775`)
8998
By `Keisuke Fujii <https://github.com/fujiisoup>`_.
90-
91-
- Bug fix in vectorized assignment (:issue:`1743`, `1744`).
99+
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
92100
Now item assignment to :py:meth:`~DataArray.__setitem__` checks
93101
- Bug fix in vectorized assignment (:issue:`1743`, :issue:`1744`).
94102
Now item assignment to :py:meth:`DataArray.__setitem__` checks

xarray/backends/rasterio_.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,11 @@ def open_rasterio(filename, chunks=None, cache=None, lock=None):
222222
if chunks is not None:
223223
from dask.base import tokenize
224224
# augment the token with the file modification time
225-
mtime = os.path.getmtime(filename)
225+
try:
226+
mtime = os.path.getmtime(filename)
227+
except OSError:
228+
# the filename is probably an s3 bucket rather than a regular file
229+
mtime = None
226230
token = tokenize(filename, mtime, chunks)
227231
name_prefix = 'open_rasterio-%s' % token
228232
if lock is None:

xarray/coding/variables.py

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,25 @@ def _scale_offset_decoding(data, scale_factor, add_offset, dtype):
205205
return data
206206

207207

208+
def _choose_float_dtype(dtype, has_offset):
209+
"""Return a float dtype that can losslessly represent `dtype` values."""
210+
# Keep float32 as-is. Upcast half-precision to single-precision,
211+
# because float16 is "intended for storage but not computation"
212+
if dtype.itemsize <= 4 and np.issubdtype(dtype, np.floating):
213+
return np.float32
214+
# float32 can exactly represent all integers up to 24 bits
215+
if dtype.itemsize <= 2 and np.issubdtype(dtype, np.integer):
216+
# A scale factor is entirely safe (vanishing into the mantissa),
217+
# but a large integer offset could lead to loss of precision.
218+
# Sensitivity analysis can be tricky, so we just use a float64
219+
# if there's any offset at all - better unoptimised than wrong!
220+
if not has_offset:
221+
return np.float32
222+
# For all other types and circumstances, we just use float64.
223+
# (safe because eg. complex numbers are not supported in NetCDF)
224+
return np.float64
225+
226+
208227
class CFScaleOffsetCoder(VariableCoder):
209228
"""Scale and offset variables according to CF conventions.
210229
@@ -216,7 +235,8 @@ def encode(self, variable, name=None):
216235
dims, data, attrs, encoding = unpack_for_encoding(variable)
217236

218237
if 'scale_factor' in encoding or 'add_offset' in encoding:
219-
data = data.astype(dtype=np.float64, copy=True)
238+
dtype = _choose_float_dtype(data.dtype, 'add_offset' in encoding)
239+
data = data.astype(dtype=dtype, copy=True)
220240
if 'add_offset' in encoding:
221241
data -= pop_to(encoding, attrs, 'add_offset', name=name)
222242
if 'scale_factor' in encoding:
@@ -230,7 +250,7 @@ def decode(self, variable, name=None):
230250
if 'scale_factor' in attrs or 'add_offset' in attrs:
231251
scale_factor = pop_to(attrs, encoding, 'scale_factor', name=name)
232252
add_offset = pop_to(attrs, encoding, 'add_offset', name=name)
233-
dtype = np.float64
253+
dtype = _choose_float_dtype(data.dtype, 'add_offset' in attrs)
234254
transform = partial(_scale_offset_decoding,
235255
scale_factor=scale_factor,
236256
add_offset=add_offset,

0 commit comments

Comments
 (0)