From 6bfbaede69eb73810cb63672a8161bd1fc147594 Mon Sep 17 00:00:00 2001 From: keewis Date: Sun, 25 Apr 2021 12:54:33 +0200 Subject: [PATCH] more tutorial refactoring (#5074) * split out open_rasterio again * remove engine from the explicit signature * fix the lists of available datasets * explicitly pass the temporary cache_dir * use open_rasterio instead of open_dataset * update the description of "tiny" * generate a API page for tutorial.open_rasterio [skip-ci] * add a typespec alias for path-like * use tutorial.open_rasterio instead of downloading manually * back to downloading manually * update whats-new.rst * add the shade dataset * add a description for RGB.byte * reference a tag to make having dead links less likely * [test-upstream] * rename to arr because open_rasterio returns DataArray objects * try to fix the docs [skip-ci] * fix the links [skip-ci] * add descriptions for all except the era5 grib file [skip-ci] * also add a description of the ERA5 data * move the credits to the bottom of the list of available datasets [skip-ci] * adjust the log level of pooch's logger --- doc/api.rst | 1 + doc/conf.py | 1 + doc/examples/visualization_gallery.ipynb | 4 +- doc/whats-new.rst | 6 +- xarray/backends/rasterio_.py | 2 +- xarray/tests/test_tutorial.py | 31 +++--- xarray/tutorial.py | 131 ++++++++++++++++------- 7 files changed, 119 insertions(+), 57 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 82304f223a1..85a0d75f56a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -861,6 +861,7 @@ Tutorial :toctree: generated/ tutorial.open_dataset + tutorial.open_rasterio tutorial.load_dataset Testing diff --git a/doc/conf.py b/doc/conf.py index 86ff3768411..a87713fb293 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -125,6 +125,7 @@ "callable": ":py:func:`callable`", "dict_like": ":term:`dict-like `", "dict-like": ":term:`dict-like `", + "path-like": ":term:`path-like `", "mapping": ":term:`mapping`", "file-like": ":term:`file-like `", # special terms diff --git 
a/doc/examples/visualization_gallery.ipynb b/doc/examples/visualization_gallery.ipynb index 249c1b7ee94..3f2973dbdb4 100644 --- a/doc/examples/visualization_gallery.ipynb +++ b/doc/examples/visualization_gallery.ipynb @@ -209,7 +209,7 @@ "metadata": {}, "outputs": [], "source": [ - "da = xr.tutorial.open_dataset(\"RGB.byte\").data\n", + "da = xr.tutorial.open_rasterio(\"RGB.byte\")\n", "\n", "# The data is in UTM projection. We have to set it manually until\n", "# https://github.com/SciTools/cartopy/issues/813 is implemented\n", @@ -245,7 +245,7 @@ "from rasterio.warp import transform\n", "import numpy as np\n", "\n", - "da = xr.tutorial.open_dataset(\"RGB.byte\").data\n", + "da = xr.tutorial.open_rasterio(\"RGB.byte\")\n", "\n", "# Compute the lon/lat coordinates with rasterio.warp.transform\n", "ny, nx = len(da['y']), len(da['x'])\n", diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 029231a3753..403227b1b6b 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -64,11 +64,15 @@ New Features :py:class:`~core.groupby.DataArrayGroupBy`, inspired by pandas' :py:meth:`~pandas.core.groupby.GroupBy.get_group`. By `Deepak Cherian `_. +- Switch the tutorial functions to use `pooch `_ + (which is now an optional dependency) and add :py:func:`tutorial.open_rasterio` as a + way to open example rasterio files (:issue:`3986`, :pull:`4102`, :pull:`5074`). + By `Justus Magin `_. - Add typing information to unary and binary arithmetic operators operating on :py:class:`~core.dataset.Dataset`, :py:class:`~core.dataarray.DataArray`, :py:class:`~core.variable.Variable`, :py:class:`~core.groupby.DatasetGroupBy` or :py:class:`~core.groupby.DataArrayGroupBy` (:pull:`4904`). - By `Richard Kleijn `_ . + By `Richard Kleijn `_. - Add a ``combine_attrs`` parameter to :py:func:`open_mfdataset` (:pull:`4971`). By `Justus Magin `_. 
- Enable passing arrays with a subset of dimensions to diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py index 06b964fdc46..f5d9b7bf900 100644 --- a/xarray/backends/rasterio_.py +++ b/xarray/backends/rasterio_.py @@ -176,7 +176,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc >>> from affine import Affine >>> da = xr.open_rasterio( - ... "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif" + ... "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif" ... ) >>> da diff --git a/xarray/tests/test_tutorial.py b/xarray/tests/test_tutorial.py index 9b9dfe83867..225fda08f68 100644 --- a/xarray/tests/test_tutorial.py +++ b/xarray/tests/test_tutorial.py @@ -1,5 +1,3 @@ -import os - import pytest from xarray import DataArray, tutorial @@ -13,25 +11,30 @@ class TestLoadDataset: def setUp(self): self.testfile = "tiny" - def test_download_from_github(self, tmp_path, monkeypatch): - monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path)) - - ds = tutorial.open_dataset(self.testfile).load() + def test_download_from_github(self, tmp_path): + cache_dir = tmp_path / tutorial._default_cache_dir_name + ds = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load() tiny = DataArray(range(5), name="tiny").to_dataset() assert_identical(ds, tiny) def test_download_from_github_load_without_cache(self, tmp_path, monkeypatch): - monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path)) + cache_dir = tmp_path / tutorial._default_cache_dir_name - ds_nocache = tutorial.open_dataset(self.testfile, cache=False).load() - ds_cache = tutorial.open_dataset(self.testfile).load() + ds_nocache = tutorial.open_dataset( + self.testfile, cache=False, cache_dir=cache_dir + ).load() + ds_cache = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load() assert_identical(ds_cache, ds_nocache) def test_download_rasterio_from_github_load_without_cache( self, tmp_path, monkeypatch ): - 
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path)) - - ds_nocache = tutorial.open_dataset("RGB.byte", cache=False).load() - ds_cache = tutorial.open_dataset("RGB.byte", cache=True).load() - assert_identical(ds_cache, ds_nocache) + cache_dir = tmp_path / tutorial._default_cache_dir_name + + arr_nocache = tutorial.open_rasterio( + "RGB.byte", cache=False, cache_dir=cache_dir + ).load() + arr_cache = tutorial.open_rasterio( + "RGB.byte", cache=True, cache_dir=cache_dir + ).load() + assert_identical(arr_cache, arr_nocache) diff --git a/xarray/tutorial.py b/xarray/tutorial.py index c7b9ac55f25..80c5e22513d 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ -11,37 +11,36 @@ import numpy as np from .backends.api import open_dataset as _open_dataset -from .backends.rasterio_ import open_rasterio +from .backends.rasterio_ import open_rasterio as _open_rasterio from .core.dataarray import DataArray from .core.dataset import Dataset - -def _open_rasterio(path, engine=None, **kwargs): - data = open_rasterio(path, **kwargs) - name = data.name if data.name is not None else "data" - return data.to_dataset(name=name) - - _default_cache_dir_name = "xarray_tutorial_data" base_url = "https://github.com/pydata/xarray-data" version = "master" -external_urls = { - "RGB.byte": ( - "rasterio", - "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif", - ), -} -overrides = { - "rasterio": _open_rasterio, +def _construct_cache_dir(path): + import pooch + + if isinstance(path, pathlib.Path): + path = os.fspath(path) + elif path is None: + path = pooch.os_cache(_default_cache_dir_name) + + return path + + +external_urls = {} # type: dict +external_rasterio_urls = { + "RGB.byte": "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif", + "shade": "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/shade.tif", } # idea borrowed from Seaborn def open_dataset( name, - engine=None, cache=True, cache_dir=None, **kws, @@ -51,13 +50,20 @@ def 
open_dataset( If a local copy is found then always use that to avoid network traffic. + Available datasets: + + * ``"air_temperature"``: NCEP reanalysis subset + * ``"rasm"``: Output of the Regional Arctic System Model (RASM) + * ``"ROMS_example"``: Regional Ocean Model System (ROMS) output + * ``"tiny"``: small synthetic dataset with a 1D data variable + * ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK + * ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data + Parameters ---------- name : str Name of the file containing the dataset. e.g. 'air_temperature' - engine : str, optional - The engine to use. cache_dir : path-like, optional The directory in which to search for and write cached data. cache : bool, optional @@ -65,17 +71,6 @@ def open_dataset( **kws : dict, optional Passed to xarray.open_dataset - Notes - ----- - Available datasets: - - * ``"air_temperature"`` - * ``"rasm"`` - * ``"ROMS_example"`` - * ``"tiny"`` - * ``"era5-2mt-2019-03-uk.grib"`` - * ``"RGB.byte"``: example rasterio file from https://github.com/mapbox/rasterio - See Also -------- xarray.open_dataset @@ -85,15 +80,12 @@ def open_dataset( except ImportError: raise ImportError("using the tutorial data requires pooch") - if isinstance(cache_dir, pathlib.Path): - cache_dir = os.fspath(cache_dir) - elif cache_dir is None: - cache_dir = pooch.os_cache(_default_cache_dir_name) + logger = pooch.get_logger() + logger.setLevel("WARNING") + cache_dir = _construct_cache_dir(cache_dir) if name in external_urls: - engine_, url = external_urls[name] - if engine is None: - engine = engine_ + url = external_urls[name] else: # process the name default_extension = ".nc" @@ -103,10 +95,9 @@ def open_dataset( url = f"{base_url}/raw/{version}/{path.name}" - _open = overrides.get(engine, _open_dataset) # retrieve the file filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir) - ds = _open(filepath, engine=engine, **kws) + ds = 
_open_dataset(filepath, **kws) if not cache: ds = ds.load() pathlib.Path(filepath).unlink() @@ -114,6 +105,68 @@ def open_dataset( return ds +def open_rasterio( + name, + engine=None, + cache=True, + cache_dir=None, + **kws, +): + """ + Open a rasterio dataset from the online repository (requires internet). + + If a local copy is found then always use that to avoid network traffic. + + Available datasets: + + * ``"RGB.byte"``: TIFF file derived from USGS Landsat 7 ETM imagery. + * ``"shade"``: TIFF file derived from USGS SRTM 90 data + + ``RGB.byte`` and ``shade`` are downloaded from the ``rasterio`` repository [1]_. + + Parameters + ---------- + name : str + Name of the file containing the dataset. + e.g. 'RGB.byte' + cache_dir : path-like, optional + The directory in which to search for and write cached data. + cache : bool, optional + If True, then cache data locally for use on subsequent calls + **kws : dict, optional + Passed to xarray.open_rasterio + + See Also + -------- + xarray.open_rasterio + + References + ---------- + .. [1] https://github.com/mapbox/rasterio + """ + try: + import pooch + except ImportError: + raise ImportError("using the tutorial data requires pooch") + + logger = pooch.get_logger() + logger.setLevel("WARNING") + + cache_dir = _construct_cache_dir(cache_dir) + url = external_rasterio_urls.get(name) + if url is None: + raise ValueError(f"unknown rasterio dataset: {name}") + + # retrieve the file + filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir) + arr = _open_rasterio(filepath, **kws) + if not cache: + arr = arr.load() + pathlib.Path(filepath).unlink() + + return arr + + def load_dataset(*args, **kwargs): """ Open, load into memory, and close a dataset from the online repository