5
5
* building tutorials in the documentation.
6
6
7
7
"""
8
- import hashlib
9
- import os as _os
10
- from urllib .request import urlretrieve
8
+ import os
9
+ import pathlib
11
10
12
11
import numpy as np
13
12
14
13
from .backends .api import open_dataset as _open_dataset
14
+ from .backends .rasterio_ import open_rasterio
15
15
from .core .dataarray import DataArray
16
16
from .core .dataset import Dataset
17
17
18
- _default_cache_dir = _os .sep .join (("~" , ".xarray_tutorial_data" ))
19
18
19
def _open_rasterio(path, engine=None, **kwargs):
    """Open a raster file with ``open_rasterio`` and wrap it in a Dataset.

    Parameters
    ----------
    path : str or path-like
        Location of the file, forwarded unchanged to ``open_rasterio``.
    engine : str, optional
        Accepted only so the signature matches the other opener callables
        (callers pass ``engine=...`` to every opener); it is not used here.
    **kwargs
        Forwarded to ``open_rasterio``.

    Returns
    -------
    Dataset
        Result of ``DataArray.to_dataset`` with a single variable named after
        the array, or ``"data"`` when the array has no name.
    """
    data = open_rasterio(path, **kwargs)
    # to_dataset needs a variable name; fall back to a fixed one for
    # unnamed arrays.
    name = data.name if data.name is not None else "data"
    return data.to_dataset(name=name)
20
23
21
def file_md5_checksum(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    Parameters
    ----------
    fname : str or path-like
        Path of the file to hash; it is opened in binary mode.

    Returns
    -------
    str
        The MD5 digest as a lowercase hexadecimal string.
    """
    # Imported locally so the helper does not depend on the module-level
    # import list.
    import hashlib

    hash_md5 = hashlib.md5()
    with open(fname, "rb") as f:
        # Feed the hash in fixed-size chunks so large data files are never
        # held in memory all at once; the digest is identical to hashing
        # the whole content in one call.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()
24
+
25
# Directory name handed to ``pooch.os_cache`` when no cache_dir is given.
_default_cache_dir_name = "xarray_tutorial_data"
# Repository holding the tutorial data files and the git ref to fetch from;
# download URLs are built as "<base_url>/raw/<version>/<file name>".
base_url = "https://github.com/pydata/xarray-data"
version = "master"


# Datasets hosted outside ``base_url``.
# Maps dataset name -> (default engine, full download URL).
external_urls = {
    "RGB.byte": (
        "rasterio",
        "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif",
    ),
}
36
# Engine name -> opener callable used instead of the default dataset opener.
# Each opener is called as ``opener(filepath, engine=engine, **kws)``.
overrides = {
    "rasterio": _open_rasterio,
}
26
39
27
40
28
41
# idea borrowed from Seaborn
29
42
def open_dataset(
    name,
    engine=None,
    cache=True,
    cache_dir=None,
    **kws,
):
    """
    Open a tutorial dataset, fetching the file with ``pooch`` first.

    The file is retrieved into ``cache_dir`` with ``pooch.retrieve`` and then
    opened with the opener registered for ``engine`` in ``overrides``, or
    with ``xarray.open_dataset`` when no override is registered.

    Parameters
    ----------
    name : str
        Name of the file containing the dataset.
        e.g. 'air_temperature'
        Names without a suffix get '.nc' appended, unless the name is listed
        in ``external_urls``.
    engine : str, optional
        The engine to use. For names listed in ``external_urls`` the engine
        registered there is used when this is None.
    cache_dir : path-like, optional
        The directory in which to search for and write cached data.
        Defaults to ``pooch.os_cache("xarray_tutorial_data")``.
    cache : bool, optional
        If True, then cache data locally for use on subsequent calls.
        If False, the dataset is loaded into memory and the downloaded file
        is deleted afterwards.
    kws : dict, optional
        Passed to xarray.open_dataset

    Returns
    -------
    ds
        The object returned by the selected opener.

    Raises
    ------
    ImportError
        If ``pooch`` is not installed.

    Notes
    -----
    Available datasets:

    * ``"air_temperature"``
    * ``"rasm"``
    * ``"ROMS_example"``
    * ``"tiny"``
    * ``"era5-2mt-2019-03-uk.grib"``
    * ``"RGB.byte"``: example rasterio file from https://github.com/mapbox/rasterio

    See Also
    --------
    xarray.open_dataset
    """
    try:
        import pooch
    except ImportError as err:
        # Chain the original error so the real import failure stays visible.
        raise ImportError("using the tutorial data requires pooch") from err

    # Accept any path-like object (pathlib.Path included), not only Path.
    if isinstance(cache_dir, os.PathLike):
        cache_dir = os.fspath(cache_dir)
    elif cache_dir is None:
        cache_dir = pooch.os_cache(_default_cache_dir_name)

    if name in external_urls:
        engine_, url = external_urls[name]
        # An explicitly requested engine wins over the registered default.
        if engine is None:
            engine = engine_
    else:
        # process the name
        default_extension = ".nc"
        path = pathlib.Path(name)
        if not path.suffix:
            path = path.with_suffix(default_extension)

        # Only the final path component is used to build the download URL.
        url = f"{base_url}/raw/{version}/{path.name}"

    _open = overrides.get(engine, _open_dataset)
    # retrieve the file
    filepath = pooch.retrieve(url=url, known_hash=None, path=cache_dir)
    ds = _open(filepath, engine=engine, **kws)
    if not cache:
        # Load everything into memory before deleting the file on disk.
        ds = ds.load()
        pathlib.Path(filepath).unlink()

    return ds
102
115
0 commit comments