Skip to content

Commit

Permalink
more tutorial refactoring (#5074)
Browse files Browse the repository at this point in the history
* split out open_rasterio again

* remove engine from the explicit signature

* fix the lists of available datasets

* explicitly pass the temporary cache_dir

* use open_rasterio instead of open_dataset

* update the description of "tiny"

* generate an API page for tutorial.open_rasterio [skip-ci]

* add a typespec alias for path-like

* use tutorial.open_rasterio instead of downloading manually

* back to downloading manually

* update whats-new.rst

* add the shade dataset

* add a description for RGB.byte

* reference a tag to make having dead links less likely

* [test-upstream]

* rename to arr because open_rasterio returns DataArray objects

* try to fix the docs [skip-ci]

* fix the links [skip-ci]

* add descriptions for all except the era5 grib file [skip-ci]

* also add a description of the ERA5 data

* move the credits to the bottom of the list of available datasets [skip-ci]

* adjust the log level of pooch's logger
  • Loading branch information
keewis authored Apr 25, 2021
1 parent 24c357f commit 6bfbaed
Show file tree
Hide file tree
Showing 7 changed files with 119 additions and 57 deletions.
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -861,6 +861,7 @@ Tutorial
:toctree: generated/

tutorial.open_dataset
tutorial.open_rasterio
tutorial.load_dataset

Testing
Expand Down
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@
"callable": ":py:func:`callable`",
"dict_like": ":term:`dict-like <mapping>`",
"dict-like": ":term:`dict-like <mapping>`",
"path-like": ":term:`path-like <path-like object>`",
"mapping": ":term:`mapping`",
"file-like": ":term:`file-like <file-like object>`",
# special terms
Expand Down
4 changes: 2 additions & 2 deletions doc/examples/visualization_gallery.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@
"metadata": {},
"outputs": [],
"source": [
"da = xr.tutorial.open_dataset(\"RGB.byte\").data\n",
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
"\n",
"# The data is in UTM projection. We have to set it manually until\n",
"# https://github.com/SciTools/cartopy/issues/813 is implemented\n",
Expand Down Expand Up @@ -245,7 +245,7 @@
"from rasterio.warp import transform\n",
"import numpy as np\n",
"\n",
"da = xr.tutorial.open_dataset(\"RGB.byte\").data\n",
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
"\n",
"# Compute the lon/lat coordinates with rasterio.warp.transform\n",
"ny, nx = len(da['y']), len(da['x'])\n",
Expand Down
6 changes: 5 additions & 1 deletion doc/whats-new.rst
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,15 @@ New Features
:py:class:`~core.groupby.DataArrayGroupBy`, inspired by pandas'
:py:meth:`~pandas.core.groupby.GroupBy.get_group`.
By `Deepak Cherian <https://github.com/dcherian>`_.
- Switch the tutorial functions to use `pooch <https://github.com/fatiando/pooch>`_
(which is now an optional dependency) and add :py:func:`tutorial.open_rasterio` as a
way to open example rasterio files (:issue:`3986`, :pull:`4102`, :pull:`5074`).
By `Justus Magin <https://github.com/keewis>`_.
- Add typing information to unary and binary arithmetic operators operating on
:py:class:`~core.dataset.Dataset`, :py:class:`~core.dataarray.DataArray`,
:py:class:`~core.variable.Variable`, :py:class:`~core.groupby.DatasetGroupBy` or
:py:class:`~core.groupby.DataArrayGroupBy` (:pull:`4904`).
By `Richard Kleijn <https://github.com/rhkleijn>`_ .
By `Richard Kleijn <https://github.com/rhkleijn>`_.
- Add a ``combine_attrs`` parameter to :py:func:`open_mfdataset` (:pull:`4971`).
By `Justus Magin <https://github.com/keewis>`_.
- Enable passing arrays with a subset of dimensions to
Expand Down
2 changes: 1 addition & 1 deletion xarray/backends/rasterio_.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc
>>> from affine import Affine
>>> da = xr.open_rasterio(
... "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif"
... "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif"
... )
>>> da
<xarray.DataArray (band: 3, y: 718, x: 791)>
Expand Down
31 changes: 17 additions & 14 deletions xarray/tests/test_tutorial.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import os

import pytest

from xarray import DataArray, tutorial
Expand All @@ -13,25 +11,30 @@ class TestLoadDataset:
def setUp(self):
self.testfile = "tiny"

def test_download_from_github(self, tmp_path, monkeypatch):
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))

ds = tutorial.open_dataset(self.testfile).load()
def test_download_from_github(self, tmp_path):
cache_dir = tmp_path / tutorial._default_cache_dir_name
ds = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load()
tiny = DataArray(range(5), name="tiny").to_dataset()
assert_identical(ds, tiny)

def test_download_from_github_load_without_cache(self, tmp_path, monkeypatch):
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))
cache_dir = tmp_path / tutorial._default_cache_dir_name

ds_nocache = tutorial.open_dataset(self.testfile, cache=False).load()
ds_cache = tutorial.open_dataset(self.testfile).load()
ds_nocache = tutorial.open_dataset(
self.testfile, cache=False, cache_dir=cache_dir
).load()
ds_cache = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load()
assert_identical(ds_cache, ds_nocache)

def test_download_rasterio_from_github_load_without_cache(
self, tmp_path, monkeypatch
):
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))

ds_nocache = tutorial.open_dataset("RGB.byte", cache=False).load()
ds_cache = tutorial.open_dataset("RGB.byte", cache=True).load()
assert_identical(ds_cache, ds_nocache)
cache_dir = tmp_path / tutorial._default_cache_dir_name

arr_nocache = tutorial.open_rasterio(
"RGB.byte", cache=False, cache_dir=cache_dir
).load()
arr_cache = tutorial.open_rasterio(
"RGB.byte", cache=True, cache_dir=cache_dir
).load()
assert_identical(arr_cache, arr_nocache)
131 changes: 92 additions & 39 deletions xarray/tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,36 @@
import numpy as np

from .backends.api import open_dataset as _open_dataset
from .backends.rasterio_ import open_rasterio
from .backends.rasterio_ import open_rasterio as _open_rasterio
from .core.dataarray import DataArray
from .core.dataset import Dataset


def _open_rasterio(path, engine=None, **kwargs):
data = open_rasterio(path, **kwargs)
name = data.name if data.name is not None else "data"
return data.to_dataset(name=name)


_default_cache_dir_name = "xarray_tutorial_data"
base_url = "https://github.com/pydata/xarray-data"
version = "master"


external_urls = {
"RGB.byte": (
"rasterio",
"https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif",
),
}
overrides = {
"rasterio": _open_rasterio,
def _construct_cache_dir(path):
import pooch

if isinstance(path, pathlib.Path):
path = os.fspath(path)
elif path is None:
path = pooch.os_cache(_default_cache_dir_name)

return path


external_urls = {} # type: dict
external_rasterio_urls = {
"RGB.byte": "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/RGB.byte.tif",
"shade": "https://github.com/mapbox/rasterio/raw/1.2.1/tests/data/shade.tif",
}


# idea borrowed from Seaborn
def open_dataset(
name,
engine=None,
cache=True,
cache_dir=None,
**kws,
Expand All @@ -51,31 +50,27 @@ def open_dataset(
If a local copy is found then always use that to avoid network traffic.
Available datasets:
* ``"air_temperature"``: NCEP reanalysis subset
* ``"rasm"``: Output of the Regional Arctic System Model (RASM)
* ``"ROMS_example"``: Regional Ocean Model System (ROMS) output
* ``"tiny"``: small synthetic dataset with a 1D data variable
* ``"era5-2mt-2019-03-uk.grib"``: ERA5 temperature data over the UK
* ``"eraint_uvz"``: data from ERA-Interim reanalysis, monthly averages of upper level data
Parameters
----------
name : str
Name of the file containing the dataset.
e.g. 'air_temperature'
engine : str, optional
The engine to use.
cache_dir : path-like, optional
The directory in which to search for and write cached data.
cache : bool, optional
If True, then cache data locally for use on subsequent calls
**kws : dict, optional
Passed to xarray.open_dataset
Notes
-----
Available datasets:
* ``"air_temperature"``
* ``"rasm"``
* ``"ROMS_example"``
* ``"tiny"``
* ``"era5-2mt-2019-03-uk.grib"``
* ``"RGB.byte"``: example rasterio file from https://github.com/mapbox/rasterio
See Also
--------
xarray.open_dataset
Expand All @@ -85,15 +80,12 @@ def open_dataset(
except ImportError:
raise ImportError("using the tutorial data requires pooch")

if isinstance(cache_dir, pathlib.Path):
cache_dir = os.fspath(cache_dir)
elif cache_dir is None:
cache_dir = pooch.os_cache(_default_cache_dir_name)
logger = pooch.get_logger()
logger.setLevel("WARNING")

cache_dir = _construct_cache_dir(cache_dir)
if name in external_urls:
engine_, url = external_urls[name]
if engine is None:
engine = engine_
url = external_urls[name]
else:
# process the name
default_extension = ".nc"
Expand All @@ -103,17 +95,78 @@ def open_dataset(

url = f"{base_url}/raw/{version}/{path.name}"

_open = overrides.get(engine, _open_dataset)
# retrieve the file
filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
ds = _open(filepath, engine=engine, **kws)
ds = _open_dataset(filepath, **kws)
if not cache:
ds = ds.load()
pathlib.Path(filepath).unlink()

return ds


def open_rasterio(
    name,
    engine=None,
    cache=True,
    cache_dir=None,
    **kws,
):
    """
    Open a rasterio dataset from the online repository (requires internet).

    If a local copy is found then always use that to avoid network traffic.

    Available datasets:

    * ``"RGB.byte"``: TIFF file derived from USGS Landsat 7 ETM imagery.
    * ``"shade"``: TIFF file derived from USGS SRTM 90 data

    ``RGB.byte`` and ``shade`` are downloaded from the ``rasterio`` repository [1]_.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset.
        e.g. 'RGB.byte'
    engine : optional
        Currently unused: the function body never reads it.
    cache_dir : path-like, optional
        The directory in which to search for and write cached data.
    cache : bool, optional
        If True, then cache data locally for use on subsequent calls.
        If False, the data is loaded into memory and the downloaded file
        is deleted afterwards.
    **kws : dict, optional
        Passed to xarray.open_rasterio

    Raises
    ------
    ImportError
        If ``pooch`` cannot be imported.
    ValueError
        If ``name`` is not one of the available datasets.

    See Also
    --------
    xarray.open_rasterio

    References
    ----------
    .. [1] https://github.com/mapbox/rasterio
    """
    try:
        import pooch
    except ImportError as err:
        # Chain the original error so the underlying import failure stays visible.
        raise ImportError("using the tutorial data requires pooch") from err

    # Raise the threshold of pooch's logger to WARNING.
    logger = pooch.get_logger()
    logger.setLevel("WARNING")

    cache_dir = _construct_cache_dir(cache_dir)
    url = external_rasterio_urls.get(name)
    if url is None:
        raise ValueError(f"unknown rasterio dataset: {name}")

    # retrieve the file
    filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
    arr = _open_rasterio(filepath, **kws)
    if not cache:
        # Load eagerly because the backing file is about to be removed; the
        # ``finally`` makes sure it is removed even when loading fails.
        try:
            arr = arr.load()
        finally:
            pathlib.Path(filepath).unlink()

    return arr


def load_dataset(*args, **kwargs):
"""
Open, load into memory, and close a dataset from the online repository
Expand Down

0 comments on commit 6bfbaed

Please sign in to comment.