cache rasterio example files #4102

Merged Mar 24, 2021 · 58 commits
Changes from 10 commits
a8b0022
add a open_rasterio function to tutorial
keewis May 27, 2020
3997c27
put the cache directory into .cache if that exists
keewis May 27, 2020
b45e28d
raise an error if the status code is not 200
keewis May 27, 2020
e972d84
use the cached file if possible
keewis May 27, 2020
7b5e6d5
add a test to check that the caching does not affect the result
keewis May 27, 2020
e435103
use the new tutorial function in the visualization gallery
keewis May 27, 2020
75dac85
Merge branch 'master' into refactor-tutorial
keewis Jun 24, 2020
8fe0c92
fix the temporary directory creation
keewis Jun 27, 2020
783baeb
rewrite open_dataset to use the same functions as open_rasterio
keewis Jun 27, 2020
d6623ff
make sure the context manager on a pathlib object always works
keewis Jun 27, 2020
f834883
require requests
keewis Jun 28, 2020
5ddadab
add requests to most CI
keewis Jun 28, 2020
21f090d
split into two context managers
keewis Jun 29, 2020
068b660
use is_dir instead of exists to check for .cache
keewis Jun 29, 2020
c5d1490
reword a few comments
keewis Jun 29, 2020
b457c19
properly credit the SO answer
keewis Jun 29, 2020
c9faac1
add a pseudo-atomic open function
keewis Jun 29, 2020
f16a15f
add a random part to the file so concurrent calls are not an issue
keewis Jun 29, 2020
f9abf31
vendor appdirs.user_cache_dir and use it to determine the default cache
keewis Jun 30, 2020
c276876
properly vendor appdirs
keewis Jun 30, 2020
fdb9b11
suppress FileNotFoundErrors while removing a file
keewis Jun 30, 2020
427b4cf
silence mypy
keewis Jun 30, 2020
bd2cafc
make sure to convert string paths to pathlib.Path
keewis Jun 30, 2020
9d4bce3
convert the result of appdirs.user_cache_dir to pathlib.Path
keewis Jun 30, 2020
4297920
add the comment about switching to unlink(missing_ok=True)
keewis Jun 30, 2020
9fde5d6
Merge branch 'master' into refactor-tutorial
keewis Jun 30, 2020
adfce27
use requests.codes.ok instead of the numeric value
keewis Jul 16, 2020
ea9d4dc
remove the md5 checking code
keewis Jul 16, 2020
d828720
Merge branch 'master' into refactor-tutorial
keewis Jul 16, 2020
745de23
try to make the comment clearer
keewis Jul 16, 2020
c1229ae
Merge branch 'master' into refactor-tutorial
keewis Jul 26, 2020
29b78ed
typo
keewis Jul 26, 2020
8d04bba
isort
keewis Jul 26, 2020
dd08972
Merge branch 'master' into refactor-tutorial
keewis Nov 6, 2020
77e2487
remove all code related to the detection of the application directory
keewis Dec 9, 2020
e8b4a00
Merge branch 'master' into refactor-tutorial
keewis Dec 9, 2020
f730a85
Merge branch 'master' into refactor-tutorial
keewis Jan 12, 2021
39e7d77
use pooch for caching and fetching the files
keewis Jan 15, 2021
9134d7a
remove requests from the CI environments
keewis Jan 15, 2021
fa89822
add pooch to the environment used by the py38-flaky CI
keewis Jan 15, 2021
56d7e49
remove the install_requires on requests and the vendor mypy ignore [s…
keewis Jan 15, 2021
b848c66
add pooch to the doc environment [skip-ci]
keewis Jan 15, 2021
dab96de
Merge branch 'master' into refactor-tutorial
keewis Mar 20, 2021
405a90e
ignore missing type hints for pooch
keewis Mar 20, 2021
cc6cade
Merge branch 'master' into refactor-tutorial
keewis Mar 22, 2021
817a6ba
add a mapping of external urls
keewis Mar 23, 2021
0b60eb0
remove tutorial.open_rasterio
keewis Mar 23, 2021
d82b816
remove the github_url and branch PRs
keewis Mar 23, 2021
719765c
allow opening rasterio files using open_dataset
keewis Mar 23, 2021
0a489c4
remove the reference to xarray.tutorial.open_dataset
keewis Mar 23, 2021
0335473
rename engine_overrides to overrides
keewis Mar 23, 2021
cc3eb3c
update the docstring
keewis Mar 23, 2021
04e15fb
update the rasterio test
keewis Mar 23, 2021
6f02db4
use explicitly passed values for engine
keewis Mar 23, 2021
653e5fd
use open_dataset instead of open_rasterio
keewis Mar 23, 2021
b250b38
convert back to a data array [skip-ci]
keewis Mar 23, 2021
78c11f5
write the files to a temporary cache directory
keewis Mar 23, 2021
b532879
typo
keewis Mar 23, 2021
6 changes: 2 additions & 4 deletions doc/examples/visualization_gallery.ipynb
@@ -209,8 +209,7 @@
"metadata": {},
"outputs": [],
"source": [
"url = 'https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif'\n",
"da = xr.open_rasterio(url)\n",
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
"\n",
"# The data is in UTM projection. We have to set it manually until\n",
"# https://github.com/SciTools/cartopy/issues/813 is implemented\n",
@@ -246,8 +245,7 @@
"from rasterio.warp import transform\n",
"import numpy as np\n",
"\n",
"url = 'https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif'\n",
"da = xr.open_rasterio(url)\n",
"da = xr.tutorial.open_rasterio(\"RGB.byte\")\n",
"\n",
"# Compute the lon/lat coordinates with rasterio.warp.transform\n",
"ny, nx = len(da['y']), len(da['x'])\n",
9 changes: 6 additions & 3 deletions xarray/tests/test_tutorial.py
@@ -13,9 +13,7 @@ class TestLoadDataset:
@pytest.fixture(autouse=True)
def setUp(self):
self.testfile = "tiny"
self.testfilepath = os.path.expanduser(
os.sep.join(("~", ".xarray_tutorial_data", self.testfile))
)
self.testfilepath = tutorial._default_cache_dir / self.testfile
with suppress(OSError):
os.remove(f"{self.testfilepath}.nc")
with suppress(OSError):
@@ -30,3 +28,8 @@ def test_download_from_github_load_without_cache(self):
ds_nocache = tutorial.open_dataset(self.testfile, cache=False).load()
ds_cache = tutorial.open_dataset(self.testfile).load()
assert_identical(ds_cache, ds_nocache)

def test_download_rasterio_from_github_load_without_cache(self):
ds_nocache = tutorial.open_rasterio("RGB.byte", cache=False).load()
ds_cache = tutorial.open_rasterio("RGB.byte", cache=True).load()
assert_identical(ds_cache, ds_nocache)
138 changes: 97 additions & 41 deletions xarray/tutorial.py
@@ -6,23 +6,78 @@

"""
import hashlib
import os as _os
from urllib.request import urlretrieve
import pathlib
import shutil
import tempfile

import numpy as np
import requests

from .backends.api import open_dataset as _open_dataset
from .backends.rasterio_ import open_rasterio as _open_rasterio
from .core.dataarray import DataArray
from .core.dataset import Dataset

_default_cache_dir = _os.sep.join(("~", ".xarray_tutorial_data"))
_cache_name = "xarray_tutorial_data"
_cache_dir = pathlib.Path.home() / ".cache"
# TODO: I/O on import. Might not be a good idea.
if _cache_dir.exists():
_default_cache_dir = _cache_dir / _cache_name
else:
_default_cache_dir = pathlib.Path.home() / f".{_cache_name}"


def file_md5_checksum(fname):
hash_md5 = hashlib.md5()
with open(fname, "rb") as f:
hash_md5.update(f.read())
return hash_md5.hexdigest()
def check_md5sum(content, checksum):
md5 = hashlib.md5()
md5.update(content)
md5sum = md5.hexdigest()

return md5sum == checksum


def download_to(url, path):
with requests.get(url, stream=True) as r, path.open("wb") as f:

Member:
nit: I usually find it a little easier to read when you nest context managers:

Suggested change
-    with requests.get(url, stream=True) as r, path.open("wb") as f:
+    with requests.get(url, stream=True) as r:
+        with path.open("wb") as f:
+            ...


keewis (Collaborator, Author):
done, although that has the disadvantage of an additional indentation level and it suggests that the order in which the context managers are entered is important.
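As an aside on the two spellings: the comma-separated form is equivalent to literal nesting — managers are entered left to right and exited in reverse order. A minimal, self-contained sketch (the `tag` helper is illustrative, not from the PR) demonstrating that the order is identical:

```python
import contextlib


@contextlib.contextmanager
def tag(name, log):
    # record entry and exit so the ordering is observable
    log.append(f"enter {name}")
    try:
        yield
    finally:
        log.append(f"exit {name}")


log = []
# comma form: entered left to right, exited in reverse
with tag("a", log), tag("b", log):
    pass
assert log == ["enter a", "enter b", "exit b", "exit a"]

log.clear()
# literal nesting produces exactly the same sequence
with tag("a", log):
    with tag("b", log):
        pass
assert log == ["enter a", "enter b", "exit b", "exit a"]
```

So the choice really is purely about readability, plus the (cosmetic) hint about ordering mentioned above.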

if r.status_code != 200:
raise OSError(f"download failed: {r.reason}")

r.raw.decode_content = True
shutil.copyfileobj(r.raw, f)


def open_rasterio(
name,
cache=True,
cache_dir=_default_cache_dir,
github_url="https://github.com/mapbox/rasterio",
branch="master",
**kws,
):
if not cache_dir.is_dir():
cache_dir.mkdir()

default_extension = ".tif"

if cache:
path = cache_dir / name
# need to always do that, otherwise the context manager might fail
cache_dir = pathlib.Path(cache_dir)
else:
cache_dir = tempfile.TemporaryDirectory()
path = pathlib.Path(cache_dir.name) / name

if not path.suffix:
path = path.with_suffix(default_extension)
elif path.suffix == ".byte":
path = path.with_name(name + default_extension)

if cache and path.is_file():
return _open_rasterio(path, **kws)

url = f"{github_url}/raw/{branch}/tests/data/{path.name}"
# make sure the directory is deleted afterwards

Member:
I don't think this is currently done? should this comment be removed then?


keewis (Collaborator, Author), Jun 29, 2020:
it is, if cache_dir was created using TemporaryDirectory, using it as a context manager deletes the directory at the end of the block (if it is a pathlib object, we can't do I/O with it after the block). So I guess the comment is just not accurate?

with cache_dir:

Member:
I feel like I'm missing something -- what does using a pathlib.Path object as a context manager do?

I don't think it automatically deletes the files after using it?


keewis (Collaborator, Author), Jul 1, 2020:
no, of course not. cache_dir can be either a pathlib.Path instance or a TemporaryDirectory instance. Using a Path object as a context manager will "close" it so you can't use it for I/O anymore (unlink, read_text, etc. won't work anymore) while a TemporaryDirectory instance will delete the temporary directory it created.

Not sure if there's a way to make that easier to read.
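The `TemporaryDirectory` half of this behaviour is easy to verify: using the instance as a context manager (as the PR does with `with cache_dir:`) removes the directory and everything in it when the block exits. A standalone sketch, not PR code:

```python
import pathlib
import tempfile

tmpdir = tempfile.TemporaryDirectory()
with tmpdir:
    # the directory name lives on the instance, mirroring
    # `pathlib.Path(cache_dir.name) / name` in the PR
    path = pathlib.Path(tmpdir.name) / "RGB.byte.tif"
    path.write_bytes(b"fake tif payload")
    assert path.is_file()

# leaving the block deleted the directory and its contents
assert not pathlib.Path(tmpdir.name).exists()
```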

download_to(url, path)

Member:
The reason why the MD5 hash is checked below is to guard against incomplete / failed downloads. This is actually a pretty common scenario, e.g., if a user hits Ctrl+C because downloading is taking too long (e.g., if they have a poor internet connection). So I think this would be important to include for rasterio as well.

That said, I'm not sure we really need to use MD5 verification, which is rather annoying to setup and requires downloading extra files. Instead, I would suggest ensuring that download_to always deletes the downloaded file if an exception is raised, e.g., based on https://stackoverflow.com/a/27045091/809705:

def download_to(url, path):
    try:
        ...  # current contents of `download_to`
    except Exception:
        # switch to path.unlink(missing_ok=True) for Python 3.8+
        with contextlib.suppress(FileNotFoundError):
            path.unlink()
        raise

This would probably work in about 90% of cases.

There may be a few others where a computer is shut down suddenly in the middle to writing a file. To handle this, it's probably a good idea to try to do an atomic write:
https://stackoverflow.com/a/2333979/809705


keewis (Collaborator, Author), Jun 29, 2020:

rasterio doesn't provide precomputed checksums so we'd have to go with the atomic write.

The idea from that answer (which is actually pretty common, rsync uses that, too) is to write to a temporary file in the same directory and to rename once the write was successful.

The disadvantage is that if the program was forcibly interrupted (e.g. using SIGKILL) this will leave the temporary file behind which will have to be cleaned up manually. Not sure if it's worth it, but we could warn if there are any stale files that don't have open file descriptors.

Edit: we could also make this deliberately not suitable for concurrency (and thus much less complex) by using a deterministic name. That way we would download to a different file (.<name>, maybe?) while making sure the download itself doesn't fail and afterwards rename to the final name. If we try again after a failed attempt (i.e. the rename did not complete) we will redownload, overwriting .<name>.


keewis (Collaborator, Author):

I implemented the "atomic" write with a deterministic temporary name, which (as long as we properly detect something like disconnected sockets / broken pipes) should guard against incomplete writes and also avoid stale files. The only disadvantage I know of is that concurrent calls will interfere with each other.


Member:

My main concern here would that if two invocations of tutorial.open_dataset start at the same time (e.g., in different processes). Then it's possible that both could partially write to the same temporary file, which could get moved into place with an inconsistent state.

I think it's probably a better idea to make the temporary file with a random name (like the atomicwrites package). In the worst case you end up with an extra file, but that's better than an bad state that requires manual intervention to resolve.

Another option is to vendor a copy of atomicwrites (e.g., as well as appdirs). It's a single file project so this would be relatively straightforward.


keewis (Collaborator, Author):

sure. I changed it to use NamedTemporaryFile to generate a unique file within the same directory and rename once it's done downloading. If the downloading succeeded, the rename will always work (not sure if that's the case on windows, though)
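The resulting pattern — write to a uniquely named temporary file in the destination directory, rename into place only on success, and clean up on failure — can be sketched independently of the PR. This sketch uses `urllib` instead of `requests` so it stays dependency-free; the names are illustrative, not the PR's actual code:

```python
import contextlib
import os
import pathlib
import shutil
import tempfile
import urllib.request


def download_to(url, path):
    """Fetch `url` into `path` via a uniquely named temporary file,
    renaming it into place only after the download completed."""
    path = pathlib.Path(path)
    tmp = None
    try:
        # NamedTemporaryFile picks a random name in the same directory,
        # so concurrent calls cannot partially write to the same file
        with tempfile.NamedTemporaryFile(
            dir=path.parent, prefix=f".{path.name}.", delete=False
        ) as f:
            tmp = pathlib.Path(f.name)
            with urllib.request.urlopen(url) as r:
                shutil.copyfileobj(r, f)
        # rename after a complete write; os.replace is atomic on POSIX,
        # so readers never observe a half-written file
        os.replace(tmp, path)
    except BaseException:
        # remove the partial download (also on Ctrl+C); switch to
        # tmp.unlink(missing_ok=True) on Python 3.8+
        if tmp is not None:
            with contextlib.suppress(FileNotFoundError):
                tmp.unlink()
        raise
```

Interrupted or failed downloads leave the destination untouched, at the cost of a possible stray dot-file only if the process is killed uncleanly (e.g. SIGKILL).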

return _open_rasterio(path, **kws)


# idea borrowed from Seaborn
@@ -61,44 +116,45 @@ def open_dataset(
xarray.open_dataset

"""
root, ext = _os.path.splitext(name)
if not ext:
ext = ".nc"
fullname = root + ext
longdir = _os.path.expanduser(cache_dir)
localfile = _os.sep.join((longdir, fullname))
md5name = fullname + ".md5"
md5file = _os.sep.join((longdir, md5name))

if not _os.path.exists(localfile):

# This will always leave this directory on disk.
# May want to add an option to remove it.
if not _os.path.isdir(longdir):
_os.mkdir(longdir)

url = "/".join((github_url, "raw", branch, fullname))
urlretrieve(url, localfile)
url = "/".join((github_url, "raw", branch, md5name))
urlretrieve(url, md5file)

localmd5 = file_md5_checksum(localfile)
with open(md5file, "r") as f:
remotemd5 = f.read()
if localmd5 != remotemd5:
_os.remove(localfile)

def construct_url(full_name):
return f"{github_url}/raw/{branch}/{full_name}"

if not cache_dir.is_dir():
cache_dir.mkdir()

default_extension = ".nc"

if cache:
path = cache_dir / name
# need to always do that, otherwise the context manager might fail
cache_dir = pathlib.Path(cache_dir)
else:
cache_dir = tempfile.TemporaryDirectory()
path = pathlib.Path(cache_dir.name) / name

if not path.suffix:
path = path.with_suffix(default_extension)

if cache and path.is_file():
return _open_dataset(path, **kws)

# make sure the directory is deleted afterwards if it was temporary
with cache_dir:
download_to(construct_url(path.name), path)

# verify the checksum (md5 guards only against transport corruption)
md5_path = path.with_name(path.name + ".md5")
download_to(construct_url(md5_path.name), md5_path)
if not check_md5sum(path.read_bytes(), md5_path.read_text()):
path.unlink()
md5_path.unlink()
msg = """
MD5 checksum does not match, try downloading dataset again.
"""
raise OSError(msg)

ds = _open_dataset(localfile, **kws)

if not cache:
ds = ds.load()
_os.remove(localfile)

return ds
return _open_dataset(path, **kws)


def load_dataset(*args, **kwargs):