Commit

Refactor the options to ignore errors. This is now good enough to close
alexamici committed Nov 3, 2018
1 parent 52945f4 commit d3d4896
Showing 8 changed files with 24 additions and 27 deletions.
CHANGELOG.rst (2 changes: 2 additions & 0 deletions)

@@ -8,6 +8,8 @@ Changelog for cfgrib
 - Saves one index file per set of ``index_keys`` in a much more robust way.
 - Refactor CF-encoding and add the new ``encode_cf`` option to ``backend_kwargs``.
   See: `#23 <https://github.com/ecmwf/cfgrib/issues/23>`_.
+- Refactor error handling and the option to ignore errors (not well documented yet).
+  See: `#13 <https://github.com/ecmwf/cfgrib/issues/13>`_.


 0.9.3.1 (2018-10-28)

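Both entries surface as ``backend_kwargs`` options of the xarray entry points. A minimal sketch, assuming a local GRIB file named ``example.grib`` (hypothetical name):

    from cfgrib import xarray_store

    # Hypothetical file name; any GRIB file readable by ecCodes would do.
    ds = xarray_store.open_dataset(
        'example.grib',
        backend_kwargs={
            'encode_cf': ('parameter', 'time'),  # see issue #23
            'errors': 'ignore',                  # see issue #13
        },
    )
    print(ds)
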
README.rst (5 changes: 3 additions & 2 deletions)

@@ -248,13 +248,14 @@ Attributes:

 *cfgrib* also provides an **experimental function** that automates the selection of
 appropriate ``filter_by_keys`` and returns a list of all valid ``xarray.Dataset``'s
-in the GRIB file. The ``open_datasets`` function is intended for interactive exploration of a file
+in the GRIB file (add ``backend_kwargs={'errors': 'ignore'}`` for extra robustness).
+The ``open_datasets`` function is intended for interactive exploration of a file
 and it is not part of the stable API. In the future it may change or be removed altogether.

 .. code-block:: python

     >>> from cfgrib import xarray_store
-    >>> xarray_store.open_datasets('nam.t00z.awp21100.tm00.grib2')
+    >>> xarray_store.open_datasets('nam.t00z.awp21100.tm00.grib2', backend_kwargs={'errors': 'ignore'})
     [<xarray.Dataset>
     Dimensions:    (isobaricInhPa: 19, x: 93, y: 65)
     Coordinates:

cfgrib/cfgrib_.py (7 changes: 1 addition & 6 deletions)

@@ -70,12 +70,7 @@ def __init__(self, filename, lock=None, **backend_kwargs):
         if lock is None:
             lock = ECCODES_LOCK
         self.lock = ensure_lock(lock)
-
-        # NOTE: filter_by_keys is a dict, but CachingFileManager only accepts hashable types
-        if 'filter_by_keys' in backend_kwargs:
-            backend_kwargs['filter_by_keys'] = tuple(backend_kwargs['filter_by_keys'].items())
-
-        self.ds = cfgrib.open_file(filename, mode='r', **backend_kwargs)
+        self.ds = cfgrib.open_file(filename, **backend_kwargs)

     def open_store_variable(self, name, var):
         if isinstance(var.data, np.ndarray):

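With the tuple round-trip removed, ``filter_by_keys`` reaches ``cfgrib.open_file`` as a plain dict. A minimal sketch of a direct call, assuming a local GRIB file named ``example.grib`` (hypothetical name):

    import cfgrib

    # Hypothetical file name; filter_by_keys is passed through as a dict.
    ds = cfgrib.open_file(
        'example.grib',
        filter_by_keys={'typeOfLevel': 'isobaricInhPa'},
    )
    print(ds.dimensions)
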
cfgrib/dataset.py (18 changes: 8 additions & 10 deletions)

@@ -368,7 +368,7 @@ def dict_merge(master, update):


 def build_dataset_components(
-    stream, indexpath='{path}.{short_hash}.idx', filter_by_keys={},
+    stream, indexpath='{path}.{short_hash}.idx', filter_by_keys={}, errors='ignore',
     encode_cf=('parameter', 'time', 'geography', 'vertical'), log=LOG,
 ):
     filter_by_keys = dict(filter_by_keys)
@@ -389,7 +389,10 @@
             dict_merge(dimensions, dims)
             dict_merge(variables, vars)
         except ValueError:
-            log.exception("skipping variable with paramId==%r shortName=%r", param_id, short_name)
+            if errors == 'ignore':
+                log.exception("skipping variable: paramId==%r shortName=%r", param_id, short_name)
+            else:
+                raise
     attributes = enforce_unique_attributes(index, GLOBAL_ATTRIBUTES_KEYS, filter_by_keys)
     cfgrib_ver = pkg_resources.get_distribution("cfgrib").version
     eccodes_ver = eccodes.codes_get_api_version()
@@ -414,13 +417,8 @@ class Dataset(object):
     attributes = attr.attrib(type=T.Dict[str, T.Any])
     encoding = attr.attrib(type=T.Dict[str, T.Any])

-    @classmethod
-    def from_path(cls, path, mode='r', errors='ignore', **kwargs):
-        """Open a GRIB file as a ``Dataset``."""
-        stream = messages.FileStream(path, message_class=cfmessage.CfMessage, errors=errors)
-        return cls(*build_dataset_components(stream, **kwargs))
-

-def open_file(path, **kwargs):
+def open_file(path, grib_errors='ignore', **kwargs):
     """Open a GRIB file as a ``cfgrib.Dataset``."""
-    return Dataset.from_path(path, **kwargs)
+    stream = messages.FileStream(path, message_class=cfmessage.CfMessage, errors=grib_errors)
+    return Dataset(*build_dataset_components(stream, **kwargs))

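The refactor separates two error-handling knobs: ``grib_errors`` controls how the low-level ``FileStream`` reacts to GRIB messages that ecCodes cannot decode, while ``errors`` controls whether ``build_dataset_components`` skips a variable that fails to merge (``'ignore'``) or re-raises the ``ValueError`` (anything else). A minimal sketch of the combinations, assuming a local ``example.grib`` (hypothetical name):

    import cfgrib

    # Library defaults: skip undecodable messages and log-and-skip
    # variables that do not fit the dataset (both options 'ignore').
    ds = cfgrib.open_file('example.grib')

    # Strict on both levels: decoding failures raise an ecCodes error,
    # merge failures raise a ValueError instead of being skipped.
    ds = cfgrib.open_file('example.grib', grib_errors='strict', errors='strict')
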
cfgrib/xarray_store.py (3 changes: 2 additions & 1 deletion)

@@ -40,7 +40,8 @@ def open_dataset(path, backend_kwargs={}, filter_by_keys={}, **kwargs):
         warnings.warn("passing filter_by_keys is deprecated, use backend_kwargs", FutureWarning)
     real_backend_kwargs = {
         'filter_by_keys': filter_by_keys,
-        'errors': 'ignore',
+        'errors': 'strict',
+        'grib_errors': 'ignore',
     }
     real_backend_kwargs.update(backend_kwargs)
     store = cfgrib_.CfGribDataStore(path, **real_backend_kwargs)

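Note the new defaults for the xarray entry point: undecodable GRIB messages are still skipped (``grib_errors='ignore'``), but variables that cannot be merged into a single hypercube now raise (``errors='strict'``). Both can be overridden per call; a sketch, again with a hypothetical ``example.grib``:

    from cfgrib import xarray_store

    # Defaults after this commit: grib_errors='ignore', errors='strict'.
    ds = xarray_store.open_dataset('example.grib')

    # Opt back into the old permissive behaviour for merge failures:
    ds = xarray_store.open_dataset(
        'example.grib', backend_kwargs={'errors': 'ignore'},
    )
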
tests/test_30_dataset.py (12 changes: 6 additions & 6 deletions)

@@ -52,7 +52,7 @@ def test_build_data_var_components_encode_cf_geography():


 def test_Dataset():
-    res = dataset.Dataset.from_path(TEST_DATA)
+    res = dataset.open_file(TEST_DATA)
     assert 'history' in res.attributes
     assert res.attributes['GRIB_edition'] == 1
     assert tuple(res.dimensions.keys()) == \
@@ -61,7 +61,7 @@ def test_Dataset():


 def test_Dataset_no_encode():
-    res = dataset.Dataset.from_path(
+    res = dataset.open_file(
         TEST_DATA, encode_cf=()
     )
     assert 'history' in res.attributes
@@ -71,7 +71,7 @@ def test_Dataset_no_encode():


 def test_Dataset_encode_cf_time():
-    res = dataset.Dataset.from_path(TEST_DATA, encode_cf=('time',))
+    res = dataset.open_file(TEST_DATA, encode_cf=('time',))
     assert 'history' in res.attributes
     assert res.attributes['GRIB_edition'] == 1
     assert tuple(res.dimensions.keys()) == ('number', 'time', 'level', 'i')
@@ -82,7 +82,7 @@ def test_Dataset_encode_cf_time():


 def test_Dataset_encode_cf_geography():
-    res = dataset.Dataset.from_path(TEST_DATA, encode_cf=('geography',))
+    res = dataset.open_file(TEST_DATA, encode_cf=('geography',))
     assert 'history' in res.attributes
     assert res.attributes['GRIB_edition'] == 1
     assert tuple(res.dimensions.keys()) == \
@@ -94,7 +94,7 @@ def test_Dataset_encode_cf_geography():


 def test_Dataset_encode_cf_vertical():
-    res = dataset.Dataset.from_path(TEST_DATA, encode_cf=('vertical',))
+    res = dataset.open_file(TEST_DATA, encode_cf=('vertical',))
     assert 'history' in res.attributes
     assert res.attributes['GRIB_edition'] == 1
     assert tuple(res.dimensions.keys()) == ('number', 'dataDate', 'dataTime', 'isobaricInhPa', 'i')
@@ -106,7 +106,7 @@ def test_Dataset_encode_cf_vertical():

 def test_Dataset_reguler_gg_surface():
     path = os.path.join(SAMPLE_DATA_FOLDER, 'regular_gg_sfc.grib')
-    res = dataset.Dataset.from_path(path)
+    res = dataset.open_file(path)

     assert res.dimensions == {'latitude': 96, 'longitude': 192}
     assert np.allclose(res.variables['latitude'].data[:2], [88.57216851, 86.72253095])

tests/test_40_xarray_store.py (2 changes: 1 addition & 1 deletion)

@@ -54,7 +54,7 @@ def test_open_dataset_corrupted():
     assert len(res.data_vars) == 1

     with pytest.raises(eccodes.EcCodesError):
-        xarray_store.open_dataset(TEST_CORRUPTED, backend_kwargs={'errors': 'strict'})
+        xarray_store.open_dataset(TEST_CORRUPTED, backend_kwargs={'grib_errors': 'strict'})


 def test_open_dataset_encode_cf_time():

tests/test_50_sample_data.py (2 changes: 1 addition & 1 deletion)

@@ -34,7 +34,7 @@ def test_open_dataset(grib_name):
     'hpa_and_pa',
     't_on_different_level_types',
     'tp_on_different_grid_resolutions',
-    pytest.param('uv_on_different_levels', marks=pytest.mark.xfail),
+    'uv_on_different_levels',
 ])
 def test_open_dataset_fail(grib_name):
     grib_path = os.path.join(SAMPLE_DATA_FOLDER, grib_name + '.grib')

