Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

more tutorial refactoring #5074

Merged
merged 24 commits into from
Apr 25, 2021
Merged
Show file tree
Hide file tree
Changes from 9 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
797dc50
split out open_rasterio again
keewis Mar 24, 2021
d1ee9ee
remove engine from the explicit signature
keewis Mar 24, 2021
aa6c5a1
fix the lists of available datasets
keewis Mar 24, 2021
e07bdf9
explicitly pass the temporary cache_dir
keewis Mar 24, 2021
734d898
use open_rasterio instead of open_dataset
keewis Mar 24, 2021
17795da
update the description of "tiny"
keewis Mar 24, 2021
932cde6
generate an API page for tutorial.open_rasterio [skip-ci]
keewis Mar 25, 2021
1211f2e
add a typespec alias for path-like
keewis Mar 25, 2021
626a7d9
use tutorial.open_rasterio instead of downloading manually
keewis Mar 25, 2021
eac4698
back to downloading manually
keewis Mar 27, 2021
baffa37
update whats-new.rst
keewis Mar 27, 2021
e0ab011
add the shade dataset
keewis Mar 27, 2021
7e43294
add a description for RGB.byte
keewis Mar 27, 2021
9146fca
reference a tag to make having dead links less likely
keewis Mar 27, 2021
71a87db
[test-upstream]
keewis Mar 27, 2021
59a860f
rename to arr because open_rasterio returns DataArray objects
keewis Mar 27, 2021
906a011
try to fix the docs [skip-ci]
keewis Mar 27, 2021
98b230d
fix the links [skip-ci]
keewis Mar 28, 2021
fde6e05
add descriptions for all except the era5 grib file [skip-ci]
keewis Apr 3, 2021
28082aa
Merge branch 'master' into refactor-tutorial
keewis Apr 3, 2021
8e32480
also add a description of the ERA5 data
keewis Apr 3, 2021
05c1e2b
move the credits to the bottom of the list of available datasets [ski…
keewis Apr 3, 2021
4824832
Merge branch 'master' into refactor-tutorial
keewis Apr 17, 2021
ff65a02
adjust the log level of pooch's logger
keewis Apr 18, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,7 @@ Tutorial
:toctree: generated/

tutorial.open_dataset
tutorial.open_rasterio
tutorial.load_dataset

Testing
Expand Down
1 change: 1 addition & 0 deletions doc/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@
"callable": ":py:func:`callable`",
"dict_like": ":term:`dict-like <mapping>`",
"dict-like": ":term:`dict-like <mapping>`",
"path-like": ":term:`path-like <path-like object>`",
"mapping": ":term:`mapping`",
"file-like": ":term:`file-like <file-like object>`",
# special terms
Expand Down
4 changes: 2 additions & 2 deletions doc/examples/visualization_gallery.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@
"metadata": {},
"outputs": [],
"source": [
"da = xr.tutorial.open_dataset(\"RGB.byte\").data\n",
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
"\n",
"# The data is in UTM projection. We have to set it manually until\n",
"# https://github.com/SciTools/cartopy/issues/813 is implemented\n",
Expand Down Expand Up @@ -245,7 +245,7 @@
"from rasterio.warp import transform\n",
"import numpy as np\n",
"\n",
"da = xr.tutorial.open_dataset(\"RGB.byte\").data\n",
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
"\n",
"# Compute the lon/lat coordinates with rasterio.warp.transform\n",
"ny, nx = len(da['y']), len(da['x'])\n",
Expand Down
4 changes: 1 addition & 3 deletions xarray/backends/rasterio_.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,9 +175,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc
You can generate 2D coordinates from the file's attributes with::

>>> from affine import Affine
>>> da = xr.open_rasterio(
... "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif"
... )
>>> da = xr.tutorial.open_rasterio("RGB.byte")
keewis marked this conversation as resolved.
Show resolved Hide resolved
>>> da
<xarray.DataArray (band: 3, y: 718, x: 791)>
[1703814 values with dtype=uint8]
Expand Down
31 changes: 17 additions & 14 deletions xarray/tests/test_tutorial.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import os

import pytest

from xarray import DataArray, tutorial
Expand All @@ -13,25 +11,30 @@ class TestLoadDataset:
def setUp(self):
self.testfile = "tiny"

def test_download_from_github(self, tmp_path, monkeypatch):
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))

ds = tutorial.open_dataset(self.testfile).load()
def test_download_from_github(self, tmp_path):
cache_dir = tmp_path / tutorial._default_cache_dir_name
ds = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load()
tiny = DataArray(range(5), name="tiny").to_dataset()
assert_identical(ds, tiny)

def test_download_from_github_load_without_cache(self, tmp_path, monkeypatch):
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))
cache_dir = tmp_path / tutorial._default_cache_dir_name

ds_nocache = tutorial.open_dataset(self.testfile, cache=False).load()
ds_cache = tutorial.open_dataset(self.testfile).load()
ds_nocache = tutorial.open_dataset(
self.testfile, cache=False, cache_dir=cache_dir
).load()
ds_cache = tutorial.open_dataset(self.testfile, cache_dir=cache_dir).load()
assert_identical(ds_cache, ds_nocache)

def test_download_rasterio_from_github_load_without_cache(
self, tmp_path, monkeypatch
):
monkeypatch.setenv("XDG_CACHE_DIR", os.fspath(tmp_path))

ds_nocache = tutorial.open_dataset("RGB.byte", cache=False).load()
ds_cache = tutorial.open_dataset("RGB.byte", cache=True).load()
assert_identical(ds_cache, ds_nocache)
cache_dir = tmp_path / tutorial._default_cache_dir_name

arr_nocache = tutorial.open_rasterio(
"RGB.byte", cache=False, cache_dir=cache_dir
).load()
arr_cache = tutorial.open_rasterio(
"RGB.byte", cache=True, cache_dir=cache_dir
).load()
assert_identical(arr_cache, arr_nocache)
123 changes: 82 additions & 41 deletions xarray/tutorial.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,37 +11,35 @@
import numpy as np

from .backends.api import open_dataset as _open_dataset
from .backends.rasterio_ import open_rasterio
from .backends.rasterio_ import open_rasterio as _open_rasterio
from .core.dataarray import DataArray
from .core.dataset import Dataset


def _open_rasterio(path, engine=None, **kwargs):
data = open_rasterio(path, **kwargs)
name = data.name if data.name is not None else "data"
return data.to_dataset(name=name)


_default_cache_dir_name = "xarray_tutorial_data"
base_url = "https://github.com/pydata/xarray-data"
version = "master"


external_urls = {
"RGB.byte": (
"rasterio",
"https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif",
),
}
overrides = {
"rasterio": _open_rasterio,
def _construct_cache_dir(path):
    """Normalize ``path`` into the cache directory handed to pooch.

    Parameters
    ----------
    path : path-like or None
        Directory in which cached tutorial files are looked up and stored.
        If None, the location is delegated to
        ``pooch.os_cache(_default_cache_dir_name)``.

    Returns
    -------
    path
        ``os.fspath(path)`` for ``os.PathLike`` objects (which includes
        ``pathlib.Path``), the value returned by ``pooch.os_cache`` when
        ``path`` is None, and ``path`` unchanged otherwise (e.g. a plain
        ``str``).

    Raises
    ------
    ImportError
        If ``path`` is None and pooch is not installed.
    """
    if path is None:
        # pooch is an optional dependency: import it lazily, and only on the
        # branch that needs it, so an explicit ``path`` works without pooch.
        import pooch

        return pooch.os_cache(_default_cache_dir_name)

    # Accept every os.PathLike implementation, not just pathlib.Path.
    if isinstance(path, os.PathLike):
        return os.fspath(path)

    return path


external_urls = {} # type: dict
external_rasterio_urls = {
"RGB.byte": "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif",
}


# idea borrowed from Seaborn
def open_dataset(
name,
engine=None,
cache=True,
cache_dir=None,
**kws,
Expand All @@ -51,31 +49,29 @@ def open_dataset(

If a local copy is found then always use that to avoid network traffic.

Available datasets:

* ``"air_temperature"``: float32 (time x lat x lon)
* ``"rasm"``: float64 (time × y × x)
* ``"ROMS_example"``:

- float64 (ocean_time × s_rho × eta_rho × xi_rho)
- float64 (ocean_time × eta_rho × xi_rho)
* ``"tiny"``: small synthetic dataset with a 1D data variable
* ``"era5-2mt-2019-03-uk.grib"``: float32 (time × latitude × longitude)

Parameters
----------
name : str
Name of the file containing the dataset.
e.g. 'air_temperature'
engine : str, optional
The engine to use.
cache_dir : path-like, optional
The directory in which to search for and write cached data.
cache : bool, optional
If True, then cache data locally for use on subsequent calls
kws : dict, optional
**kws : dict, optional
Passed to xarray.open_dataset

Notes
-----
Available datasets:

* ``"air_temperature"``
* ``"rasm"``
* ``"ROMS_example"``
* ``"tiny"``
* ``"era5-2mt-2019-03-uk.grib"``
* ``"RGB.byte"``: example rasterio file from https://github.com/mapbox/rasterio

See Also
--------
xarray.open_dataset
Expand All @@ -85,15 +81,9 @@ def open_dataset(
except ImportError:
raise ImportError("using the tutorial data requires pooch")
keewis marked this conversation as resolved.
Show resolved Hide resolved

if isinstance(cache_dir, pathlib.Path):
cache_dir = os.fspath(cache_dir)
elif cache_dir is None:
cache_dir = pooch.os_cache(_default_cache_dir_name)

cache_dir = _construct_cache_dir(cache_dir)
if name in external_urls:
engine_, url = external_urls[name]
if engine is None:
engine = engine_
url = external_urls[name]
else:
# process the name
default_extension = ".nc"
Expand All @@ -103,10 +93,61 @@ def open_dataset(

url = f"{base_url}/raw/{version}/{path.name}"

_open = overrides.get(engine, _open_dataset)
# retrieve the file
filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
keewis marked this conversation as resolved.
Show resolved Hide resolved
ds = _open(filepath, engine=engine, **kws)
ds = _open_dataset(filepath, **kws)
if not cache:
ds = ds.load()
pathlib.Path(filepath).unlink()

return ds


def open_rasterio(
name,
engine=None,
cache=True,
cache_dir=None,
**kws,
):
"""
Open a rasterio dataset from the online repository (requires internet).

If a local copy is found then always use that to avoid network traffic.

Available datasets:

* ``"RGB.byte"``: uint8 (band × y × x)

Parameters
----------
name : str
Name of the file containing the dataset.
e.g. 'RGB.byte'
cache_dir : path-like, optional
The directory in which to search for and write cached data.
cache : bool, optional
If True, then cache data locally for use on subsequent calls
**kws : dict, optional
Passed to xarray.open_rasterio

See Also
--------
xarray.open_rasterio
"""
try:
import pooch
except ImportError:
raise ImportError("using the tutorial data requires pooch")

cache_dir = _construct_cache_dir(cache_dir)
url = external_rasterio_urls.get(name)
if url is None:
raise ValueError(f"unknown rasterio dataset: {name}")

# retrieve the file
filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
ds = _open_rasterio(filepath, **kws)
if not cache:
ds = ds.load()
pathlib.Path(filepath).unlink()
Expand Down