diff --git a/.github/workflows/cancel-duplicate-runs.yaml b/.github/workflows/cancel-duplicate-runs.yaml new file mode 100644 index 00000000000..7362e04209b --- /dev/null +++ b/.github/workflows/cancel-duplicate-runs.yaml @@ -0,0 +1,14 @@ +name: Cancel +on: + workflow_run: + workflows: ["CI", "CI Additional", "CI Upstream"] + types: + - requested +jobs: + cancel: + name: Cancel previous runs + runs-on: ubuntu-latest + steps: + - uses: styfle/cancel-workflow-action@0.8.0 + with: + workflow_id: ${{ github.event.workflow.id }} diff --git a/.github/workflows/ci-additional.yaml b/.github/workflows/ci-additional.yaml index 4bf85458211..3c6318632f3 100644 --- a/.github/workflows/ci-additional.yaml +++ b/.github/workflows/ci-additional.yaml @@ -19,7 +19,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: xarray-contrib/ci-trigger@v1 + - uses: xarray-contrib/ci-trigger@v1.1 id: detect-trigger with: keyword: "[skip-ci]" @@ -42,26 +42,16 @@ jobs: "py37-min-all-deps", "py37-min-nep18", "py38-all-but-dask", - "py38-backend-api-v2", "py38-flaky", ] steps: - - name: Cancel previous runs - uses: styfle/cancel-workflow-action@0.6.0 - with: - access_token: ${{ github.token }} - uses: actions/checkout@v2 with: fetch-depth: 0 # Fetch all history for all branches and tags. - name: Set environment variables run: | - if [[ ${{ matrix.env }} == "py38-backend-api-v2" ]] ; - then - echo "CONDA_ENV_FILE=ci/requirements/environment.yml" >> $GITHUB_ENV - echo "XARRAY_BACKEND_API=v2" >> $GITHUB_ENV - - elif [[ ${{ matrix.env }} == "py38-flaky" ]] ; + if [[ ${{ matrix.env }} == "py38-flaky" ]] ; then echo "CONDA_ENV_FILE=ci/requirements/environment.yml" >> $GITHUB_ENV echo "PYTEST_EXTRA_FLAGS=--run-flaky --run-network-tests" >> $GITHUB_ENV @@ -126,10 +116,6 @@ jobs: shell: bash -l {0} steps: - - name: Cancel previous runs - uses: styfle/cancel-workflow-action@0.6.0 - with: - access_token: ${{ github.token }} - uses: actions/checkout@v2 with: fetch-depth: 0 # Fetch all history for all branches and tags. @@ -167,10 +153,6 @@ jobs: shell: bash -l {0} steps: - - name: Cancel previous runs - uses: styfle/cancel-workflow-action@0.6.0 - with: - access_token: ${{ github.token }} - uses: actions/checkout@v2 with: fetch-depth: 0 # Fetch all history for all branches and tags. @@ -211,10 +193,6 @@ jobs: shell: bash -l {0} steps: - - name: Cancel previous runs - uses: styfle/cancel-workflow-action@0.6.0 - with: - access_token: ${{ github.token }} - uses: actions/checkout@v2 with: fetch-depth: 0 # Fetch all history for all branches and tags. diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e8fd881e707..f45e0b4d2e1 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -19,7 +19,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: xarray-contrib/ci-trigger@v1 + - uses: xarray-contrib/ci-trigger@v1.1 id: detect-trigger with: keyword: "[skip-ci]" @@ -35,12 +35,8 @@ jobs: fail-fast: false matrix: os: ["ubuntu-latest", "macos-latest", "windows-latest"] - python-version: ["3.7", "3.8"] + python-version: ["3.7", "3.8", "3.9"] steps: - - name: Cancel previous runs - uses: styfle/cancel-workflow-action@0.6.0 - with: - access_token: ${{ github.token }} - uses: actions/checkout@v2 with: fetch-depth: 0 # Fetch all history for all branches and tags. 
diff --git a/.github/workflows/upstream-dev-ci.yaml b/.github/workflows/upstream-dev-ci.yaml index e55be4da329..9f29a4ddc31 100644 --- a/.github/workflows/upstream-dev-ci.yaml +++ b/.github/workflows/upstream-dev-ci.yaml @@ -21,7 +21,7 @@ jobs: - uses: actions/checkout@v2 with: fetch-depth: 2 - - uses: xarray-contrib/ci-trigger@v1 + - uses: xarray-contrib/ci-trigger@v1.1 id: detect-trigger with: keyword: "[test-upstream]" @@ -43,15 +43,13 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.8"] + python-version: ["3.9"] outputs: artifacts_availability: ${{ steps.status.outputs.ARTIFACTS_AVAILABLE }} steps: - - name: Cancel previous runs - uses: styfle/cancel-workflow-action@0.6.0 - with: - access_token: ${{ github.token }} - uses: actions/checkout@v2 + with: + fetch-depth: 0 # Fetch all history for all branches and tags. - uses: conda-incubator/setup-miniconda@v2 with: channels: conda-forge @@ -64,6 +62,9 @@ jobs: run: | mamba env update -f ci/requirements/environment.yml bash ci/install-upstream-wheels.sh + - name: Install xarray + run: | + python -m pip install --no-deps -e . - name: Version info run: | conda info -a diff --git a/.gitignore b/.gitignore index 5f02700de37..90f4a10ed5f 100644 --- a/.gitignore +++ b/.gitignore @@ -65,10 +65,11 @@ dask-worker-space/ # xarray specific doc/_build -doc/generated +generated/ xarray/tests/data/*.grib.*.idx # Sync tools Icon* .ipynb_checkpoints +doc/rasm.zarr diff --git a/ci/install-upstream-wheels.sh b/ci/install-upstream-wheels.sh index 8458a8df352..c5a640c5a81 100755 --- a/ci/install-upstream-wheels.sh +++ b/ci/install-upstream-wheels.sh @@ -15,7 +15,8 @@ conda uninstall -y --force \ rasterio \ pint \ bottleneck \ - sparse + sparse \ + xarray # to limit the runtime of Upstream CI python -m pip install pytest-timeout python -m pip install \ diff --git a/ci/requirements/doc.yml b/ci/requirements/doc.yml index 47b3cb9eb70..fe6dead5434 100644 --- a/ci/requirements/doc.yml +++ b/ci/requirements/doc.yml @@ -20,16 +20,20 @@ dependencies: - numba - numpy>=1.17 - pandas>=1.0 + - pip + - pydata-sphinx-theme>=0.4.3 - rasterio>=1.1 - seaborn - setuptools - sparse - - sphinx>=3.3 - - sphinx_rtd_theme>=0.4 - sphinx-autosummary-accessors + - sphinx-book-theme >= 0.0.38 + - sphinx-copybutton + - sphinx-panels + - sphinx>=3.3 - zarr>=2.4 - - pip - pip: - - scanpydoc + - sphinxext-rediraffe + - sphinxext-opengraph # relative to this file. Needs to be editable to be accepted. - -e ../.. 
diff --git a/ci/requirements/environment.yml b/ci/requirements/environment.yml index 36147c64c03..57498fa5700 100644 --- a/ci/requirements/environment.yml +++ b/ci/requirements/environment.yml @@ -22,6 +22,7 @@ dependencies: - nc-time-axis - netcdf4 - numba + - numexpr - numpy - pandas - pint diff --git a/ci/requirements/py37-min-all-deps.yml b/ci/requirements/py37-min-all-deps.yml index c2fd2e18a8a..7d04f431935 100644 --- a/ci/requirements/py37-min-all-deps.yml +++ b/ci/requirements/py37-min-all-deps.yml @@ -25,7 +25,9 @@ dependencies: - lxml=4.5 # Optional dep of pydap - matplotlib-base=3.1 - nc-time-axis=1.2 - - netcdf4=1.5 +# netcdf follows a 1.major.minor[.patch] convention (see https://github.com/Unidata/netcdf4-python/issues/1090) +# bumping the netCDF4 version is currently blocked by #4491 + - netcdf4=1.5.3 - numba=0.48 - numpy=1.17 - pandas=1.0 diff --git a/design_notes/flexible_indexes_notes.md b/design_notes/flexible_indexes_notes.md new file mode 100644 index 00000000000..c7eb718720c --- /dev/null +++ b/design_notes/flexible_indexes_notes.md @@ -0,0 +1,398 @@ +# Proposal: Xarray flexible indexes refactoring + +Current status: https://github.com/pydata/xarray/projects/1 + +## 1. Data Model + +Indexes are used in Xarray to extract data from Xarray objects using coordinate labels instead of using integer array indices. Although the indexes used in an Xarray object can be accessed (or built on-the-fly) via public methods like `to_index()` or properties like `indexes`, those are mainly used internally. + +The goal of this project is to make those indexes 1st-class citizens of Xarray's data model. As such, indexes should clearly be separated from Xarray coordinates with the following relationships: + +- Index -> Coordinate: one-to-many +- Coordinate -> Index: one-to-zero-or-one + +An index may be built from one or more coordinates. However, each coordinate must relate to one index at most. Additionally, a coordinate may not be tied to any index. + +The order in which multiple coordinates relate to an index may matter. For example, Scikit-Learn's [`BallTree`](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.BallTree.html#sklearn.neighbors.BallTree) index with the Haversine metric requires providing latitude and longitude values in that specific order. As another example, the order in which levels are defined in a `pandas.MultiIndex` may affect its lexsort depth (see [MultiIndex sorting](https://pandas.pydata.org/pandas-docs/stable/user_guide/advanced.html#sorting-a-multiindex)). + +Xarray's current data model has the same index-coordinate relationships as stated above, although this assumes that multi-index "virtual" coordinates are counted as coordinates (we can consider them as such, with some constraints). More importantly, this refactoring would turn the current one-to-one relationship between a dimension and an index into a many-to-many relationship, which would overcome some current limitations. + +For example, we might want to select data along a dimension which has several coordinates: + +```python +>>> da + +array([...]) +Coordinates: + * drainage_area (river_profile) float64 ... + * chi (river_profile) float64 ... +``` + +In this example, `chi` is a transformation of the `drainage_area` variable that is often used in geomorphology. We'd like to select data along the river profile using either `da.sel(drainage_area=...)` or `da.sel(chi=...)` but that's not currently possible.
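+ +As an illustration only, here is a hedged sketch of what such a selection could look like once the refactoring described below is in place (neither call works in current Xarray; it assumes an index has been assigned to each of the two coordinates along `river_profile`): + +```python +# sketch only: assumes the flexible indexes refactoring proposed in this document +subset = da.sel(drainage_area=slice(1e6, 2e6))  # label-based selection by drainage area +subset = da.sel(chi=0.5)  # or, equivalently, by the chi coordinate +``` +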
Today, we could rename the `river_profile` dimension to one of the coordinates, then use `sel` with that coordinate, then call `swap_dims` if we want to use `sel` with the other coordinate, but that's not ideal. We could also build a `pandas.MultiIndex` from `drainage_area` and `chi`, but that's not optimal (there's no hierarchical relationship between these two coordinates). + +Let's take another example: + +```python +>>> da + +array([[...], [...]]) +Coordinates: + * lon (x, y) float64 ... + * lat (x, y) float64 ... + * x (x) float64 ... + * y (y) float64 ... +``` + +This refactoring would allow creating a geographic index for `lat` and `lon` and two simple indexes for `x` and `y` such that we could select data with either `da.sel(lon=..., lat=...)` or `da.sel(x=..., y=...)`. + +Refactoring the dimension -> index one-to-one relationship into many-to-many would also introduce some issues that we'll need to address, e.g., ambiguous cases like `da.sel(chi=..., drainage_area=...)` where multiple indexes may potentially return inconsistent positional indexers along a dimension. + +## 2. Proposed API changes + +### 2.1 Index wrapper classes + +Every index that is used to select data from Xarray objects should inherit from a base class, e.g., `XarrayIndex`, that provides some common API. `XarrayIndex` subclasses would generally consist of thin wrappers around existing index classes such as `pandas.Index`, `pandas.MultiIndex`, `scipy.spatial.KDTree`, etc. + +There is a variety of features that an xarray index wrapper may or may not support: + +- 1-dimensional vs. 2-dimensional vs. n-dimensional coordinate (e.g., `pandas.Index` only supports 1-dimensional coordinates while a geographic index could be built from n-dimensional coordinates) +- built from a single vs multiple coordinate(s) (e.g., `pandas.Index` is built from one coordinate, `pandas.MultiIndex` may be built from an arbitrary number of coordinates and a geographic index would typically require two latitude/longitude coordinates) +- in-memory vs. out-of-core (dask) index data/coordinates (vs. other array backends) +- range-based vs. point-wise selection +- exact vs. inexact lookups + +Whether or not a `XarrayIndex` subclass supports each of the features listed above should be either declared explicitly via a common API or left to the implementation. An `XarrayIndex` subclass may encapsulate more than one underlying object used to perform the actual indexing. Such a "meta" index would typically support a range of features among those mentioned above and would automatically select the optimal index object for a given indexing operation.
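+ +As a rough sketch only (the names and signatures below are illustrative, not a committed design), such a wrapper could start out like this; the expected API is detailed in the list that follows: + +```python +class XarrayIndex: +    """Hypothetical base class for Xarray-compatible index wrappers.""" + +    @classmethod +    def from_coords(cls, coords, **options): +        # build a new index wrapper from one or more coordinate variables +        raise NotImplementedError + +    def query(self, labels, **options): +        # map label-based indexers to the corresponding position-based indexers +        raise NotImplementedError +``` +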
+ +An `XarrayIndex` subclass must/should/may implement the following properties/methods: + +- a `from_coords` class method that creates a new index wrapper instance from one or more Dataset/DataArray coordinates (+ some options) +- a `query` method that takes label-based indexers as arguments (+ some options) and that returns the corresponding position-based indexers +- an `indexes` property to access the underlying index object(s) wrapped by the `XarrayIndex` subclass +- a `data` property to access the index's data and map it to coordinate data (see [Section 4](#4-indexvariable)) +- a `__getitem__()` implementation to propagate the index through DataArray/Dataset indexing operations +- `equals()`, `union()` and `intersection()` methods for data alignment (see [Section 2.6](#26-using-indexes-for-data-alignment)) +- Xarray coordinate getters (see [Section 2.2.4](#224-implicit-coordinates)) +- a method that may return a new index and that will be called when one of the corresponding coordinates is dropped from the Dataset/DataArray (multi-coordinate indexes) +- `encode()`/`decode()` methods that would allow storage-agnostic serialization and fast-path reconstruction of the underlying index object(s) (see [Section 2.8](#28-index-encoding)) +- one or more "non-standard" methods or properties that could be leveraged in Xarray 3rd-party extensions like Dataset/DataArray accessors (see [Section 2.7](#27-using-indexes-for-other-purposes)) + +The `XarrayIndex` API still has to be defined in detail. + +Xarray should provide a minimal set of built-in index wrappers (this could be reduced to the indexes currently supported in Xarray, i.e., `pandas.Index` and `pandas.MultiIndex`). Other index wrappers may be implemented in 3rd-party libraries (recommended). The `XarrayIndex` base class should be part of Xarray's public API. + +#### 2.1.1 Index discoverability + +For better discoverability of Xarray-compatible indexes, Xarray could provide some mechanism to register new index wrappers, e.g., something like [xoak's `IndexRegistry`](https://xoak.readthedocs.io/en/latest/_api_generated/xoak.IndexRegistry.html#xoak.IndexRegistry) or [numcodec's registry](https://numcodecs.readthedocs.io/en/stable/registry.html). + +Additionally (or alternatively), new index wrappers may be registered via entry points as is already the case for storage backends and maybe other backends (plotting) in the future. + +Registering new indexes either via a custom registry or via entry points should be optional. Xarray should also allow providing `XarrayIndex` subclasses in its API (Dataset/DataArray constructors, `set_index()`, etc.). + +### 2.2 Explicit vs. implicit index creation + +#### 2.2.1 Dataset/DataArray's `indexes` constructor argument + +The new `indexes` argument of Dataset/DataArray constructors may be used to specify which kind of index to bind to which coordinate(s). It would consist of a mapping where, for each item, the key is one coordinate name (or a sequence of coordinate names) that must be given in `coords` and the value is the type of the index to build from this (these) coordinate(s): + +```python +>>> da = xr.DataArray( +... data=[[275.2, 273.5], [270.8, 278.6]], +... dims=('x', 'y'), +... coords={ +... 'lat': (('x', 'y'), [[45.6, 46.5], [50.2, 51.6]]), +... 'lon': (('x', 'y'), [[5.7, 10.5], [6.2, 12.8]]), +... }, +... indexes={('lat', 'lon'): SpatialIndex}, +...
) +<xarray.DataArray (x: 2, y: 2)> +array([[275.2, 273.5], + [270.8, 278.6]]) +Coordinates: + * lat (x, y) float64 45.6 46.5 50.2 51.6 + * lon (x, y) float64 5.7 10.5 6.2 12.8 +``` + +More formally, `indexes` would accept `Mapping[CoordinateNames, IndexSpec]` where: + +- `CoordinateNames = Union[CoordinateName, Tuple[CoordinateName, ...]]` and `CoordinateName = Hashable` +- `IndexSpec = Union[Type[XarrayIndex], Tuple[Type[XarrayIndex], Dict[str, Any]], XarrayIndex]`, so that index instances or index classes + build options could also be passed + +Currently index objects like `pandas.MultiIndex` can be passed directly to `coords`, which in this specific case results in the implicit creation of virtual coordinates. With the new `indexes` argument this behavior may become even more confusing than it currently is. For the sake of clarity, it would be appropriate to eventually drop support for this specific behavior and treat any mapping value given in `coords` as an array that can be wrapped into an Xarray variable, i.e., in the case of a multi-index: + +```python +>>> xr.DataArray([1.0, 2.0], dims='x', coords={'x': midx}) +<xarray.DataArray (x: 2)> +array([1., 2.]) +Coordinates: + x (x) object ('a', 0) ('b', 1) +``` + +A possible, more explicit solution to reuse a `pandas.MultiIndex` in a DataArray/Dataset with levels exposed as coordinates is proposed in [Section 2.2.4](#224-implicit-coordinates). + +#### 2.2.2 Dataset/DataArray's `set_index` method + +New indexes may also be built from existing sets of coordinates or variables in a Dataset/DataArray using the `.set_index()` method. + +The [current signature](http://xarray.pydata.org/en/stable/generated/xarray.DataArray.set_index.html#xarray.DataArray.set_index) of `.set_index()` is tailored to `pandas.MultiIndex` and tied to the concept of a dimension-index. It is therefore hardly reusable as-is in the context of flexible indexes proposed here. + +The new signature may look like one of these: + +- A. `.set_index(coords: CoordinateNames, index: Union[XarrayIndex, Type[XarrayIndex]], **index_kwargs)`: one index is set at a time, index construction options may be passed as keyword arguments +- B. `.set_index(indexes: Mapping[CoordinateNames, Union[Type[XarrayIndex], Tuple[Type[XarrayIndex], Dict[str, Any]]]])`: multiple indexes may be set at a time from a mapping of coordinate or variable name(s) as keys and `XarrayIndex` subclasses (maybe with a dict of build options) as values. If variable names are given as keys, they will be promoted to coordinates + +Option A looks simple and elegant but significantly departs from the current signature. Option B is more consistent with the Dataset/DataArray constructor signature proposed in the previous section and would be easier to adopt in parallel with the current signature that we could still support through some deprecation cycle. + +The `append` parameter of the current `.set_index()` is specific to `pandas.MultiIndex`. With option B we could still support it, although we might want to either drop it or move it to the index construction options in the future. + +#### 2.2.3 Implicit default indexes + +In general explicit index creation should be preferred over implicit index creation. However, in the majority of cases basic `pandas.Index` objects could be built and used as indexes for 1-dimensional coordinates. For convenience, Xarray should automatically build such indexes for the coordinates where no index has been explicitly assigned in the Dataset/DataArray constructor or when indexes have been reset / dropped.
+ +For which coordinates? + +- A. only 1D coordinates with a name matching their dimension name +- B. all 1D coordinates + +When to create it? + +- A. each time a new Dataset/DataArray is created +- B. only when we need it (i.e., when calling `.sel()` or `indexes`) + +Option A for both questions is what Xarray currently does and may be the best choice considering that indexes could possibly be invalidated by coordinate mutation. + +Besides `pandas.Index`, other indexes currently supported in Xarray like `CFTimeIndex` could be built depending on the coordinate data type. + +#### 2.2.4 Implicit coordinates + +As with indexes, explicit coordinate creation should be preferred over implicit coordinate creation. However, there may be some situations where we would like to keep creating coordinates implicitly for backwards compatibility. + +For example, it is currently possible to pass a `pandas.MultiIndex` object as a coordinate to the Dataset/DataArray constructor: + +```python +>>> midx = pd.MultiIndex.from_arrays([['a', 'b'], [0, 1]], names=['lvl1', 'lvl2']) +>>> da = xr.DataArray([1.0, 2.0], dims='x', coords={'x': midx}) +>>> da +<xarray.DataArray (x: 2)> +array([1., 2.]) +Coordinates: + * x (x) MultiIndex + - lvl1 (x) object 'a' 'b' + - lvl2 (x) int64 0 1 +``` + +In that case, virtual coordinates are created for each level of the multi-index. After the index refactoring, these coordinates would become real coordinates bound to the multi-index. + +In the example above a coordinate is also created for the `x` dimension: + +```python +>>> da.x +<xarray.DataArray 'x' (x: 2)> +array([('a', 0), ('b', 1)], dtype=object) +Coordinates: + * x (x) MultiIndex + - lvl1 (x) object 'a' 'b' + - lvl2 (x) int64 0 1 +``` + +With the new proposed data model, this wouldn't be a requirement anymore: there is no concept of a dimension-index. However, some users might still rely on the `x` coordinate so we could still (temporarily) support it for backwards compatibility. + +Besides `pandas.MultiIndex`, there may be other situations where we would like to reuse an existing index in a new Dataset/DataArray (e.g., when the index is very expensive to build), which might require the implicit creation of one or more coordinates. + +The example given here is quite confusing, though: this is not an easily predictable behavior. We could entirely avoid the implicit creation of coordinates, e.g., using a helper function that generates coordinate + index dictionaries that we could then pass directly to the DataArray/Dataset constructor: + +```python +>>> coords_dict, index_dict = create_coords_from_index(midx, dims='x', include_dim_coord=True) +>>> coords_dict +{'x': <xarray.Variable (x: 2)> + array([('a', 0), ('b', 1)], dtype=object), + 'lvl1': <xarray.Variable (x: 2)> + array(['a', 'b'], dtype=object), + 'lvl2': <xarray.Variable (x: 2)> + array([0, 1])} +>>> index_dict +{('lvl1', 'lvl2'): midx} +>>> xr.DataArray([1.0, 2.0], dims='x', coords=coords_dict, indexes=index_dict) +<xarray.DataArray (x: 2)> +array([1., 2.]) +Coordinates: + x (x) object ('a', 0) ('b', 1) + * lvl1 (x) object 'a' 'b' + * lvl2 (x) int64 0 1 +``` + +#### 2.2.5 Immutable indexes + +Some underlying indexes might be mutable (e.g., a tree-based index structure that allows dynamic addition of data points) while other indexes like `pandas.Index` aren't. To keep things simple, it is probably better to continue considering all indexes in Xarray as immutable (as well as their corresponding coordinates, see [Section 2.4.1](#241-mutable-coordinates)).
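+ +To make the explicit creation pathways of section 2.2 concrete, here is a hedged sketch of what option B of `.set_index()` might look like in practice (`SpatialIndex` is a hypothetical wrapper and the signature is not final): + +```python +# sketch only: the option B signature from section 2.2.2, not a committed API +ds = ds.set_index({("lat", "lon"): SpatialIndex}) +# build options could be passed as a (class, options) tuple +ds = ds.set_index({("lat", "lon"): (SpatialIndex, {"metric": "haversine"})})  # hypothetical option +``` +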
+ +### 2.3 Index access + +#### 2.3.1 Dataset/DataArray's `indexes` property + +The `indexes` property would allow easy access to all the indexes used in a Dataset/DataArray. It would return a `Dict[CoordinateName, XarrayIndex]` for easy index lookup from coordinate name. + +#### 2.3.2 Additional Dataset/DataArray properties or methods + +In some cases the format returned by the `indexes` property would not be the best (e.g., it may return duplicate index instances as values). For convenience, we could add one more property / method to get the indexes in the desired format if needed. + +### 2.4 Propagate indexes through operations + +#### 2.4.1 Mutable coordinates + +Dataset/DataArray coordinates may be replaced (`__setitem__`) or dropped (`__delitem__`) in-place, which may invalidate some of the indexes. A drastic though probably reasonable solution in this case would be to simply drop all indexes bound to those replaced/dropped coordinates. For the case where a 1D basic coordinate that corresponds to a dimension is added/replaced, we could automatically generate a new index (see [Section 2.2.3](#223-implicit-default-indexes)). + +We must also ensure that coordinates having a bound index are immutable, e.g., still wrap them into `IndexVariable` objects (even though the `IndexVariable` class might change substantially after this refactoring). + +#### 2.4.2 New Dataset/DataArray with updated coordinates + +Xarray provides a variety of Dataset/DataArray operations affecting the coordinates and where simply dropping the index(es) is not desirable. For example: + +- multi-coordinate indexes could be reduced to single coordinate indexes + - like in `.reset_index()` or `.sel()` applied on a subset of the levels of a `pandas.MultiIndex` and that internally call `MultiIndex.droplevel` and `MultiIndex.get_loc_level`, respectively +- indexes may be indexed themselves + - like `pandas.Index` implements `__getitem__()` + - when indexing their corresponding coordinate(s), e.g., via `.sel()` or `.isel()`, those indexes should be indexed too + - this might not be supported by all Xarray indexes, though +- some indexes that can't be indexed could still be automatically (re)built in the new Dataset/DataArray + - like for example building a new `KDTree` index from the selection of a subset of an initial collection of data points + - this is not always desirable, though, as indexes may be expensive to build + - a more reasonable option would be to explicitly re-build the index, e.g., using `.set_index()` +- Dataset/DataArray operations involving alignment (see [Section 2.6](#26-using-indexes-for-data-alignment)) + +### 2.5 Using indexes for data selection + +One main use of indexes is label-based data selection using the DataArray/Dataset `.sel()` method. This refactoring would introduce a number of API changes that could go through some deprecation cycles: + +- the keys of the mapping given to `indexers` (or the names of `indexer_kwargs`) would not correspond only to dimension names but could be the name of any coordinate that has an index +- for a `pandas.MultiIndex`, if no dimension-coordinate is created by default (see [Section 2.2.4](#224-implicit-coordinates)), providing dict-like objects as indexers should be deprecated +- there should be the possibility to provide additional options to the indexes that support specific selection features (e.g., Scikit-learn's `BallTree`'s `dualtree` query option to boost performance).
+ - the best API is not trivial here, since `.sel()` may accept indexers passed to several indexes (which should still be supported for convenience and compatibility), and indexes may have similar options with different semantics + - we could introduce a new parameter like `index_options: Dict[XarrayIndex, Dict[str, Any]]` to pass options grouped by index +- the `method` and `tolerance` parameters are specific to `pandas.Index` and would not be supported by all indexes: probably best is to eventually pass those arguments as `index_options` +- the list of valid indexer types might be extended in order to support new ways of indexing data, e.g., unordered selection of all points within a given range + - alternatively, we could reuse existing indexer types with different semantics depending on the index, e.g., using `slice(min, max, None)` for unordered range selection + +With the new data model proposed here, an ambiguous situation may occur when indexers are given for several coordinates that share the same dimension but not the same index, e.g., from the example in [Section 1](#1-data-model): + +```python +da.sel(x=..., y=..., lat=..., lon=...) +``` + +The easiest solution for this situation would be to raise an error. Alternatively, we could introduce a new parameter to specify how to combine the resulting integer indexers (i.e., union vs intersection), although this could already be achieved by chaining `.sel()` calls or combining `.sel()` with `.merge()` (it may or may not be straightforward). + +### 2.6 Using indexes for data alignment + +Another main use of indexes is data alignment in various operations. Some considerations regarding alignment and flexible indexes: + +- support for alignment should probably be optional for an `XarrayIndex` subclass. + - like `pandas.Index`, the index wrapper classes that support it should implement `.equals()`, `.union()` and/or `.intersection()` + - support might be partial if that makes sense (outer, inner, left, right, exact...). + - index equality might involve more than just the labels: for example a spatial index might be used to check if the coordinate system (CRS) is identical for two sets of coordinates + - some indexes might implement inexact alignment, like in [#4489](https://github.com/pydata/xarray/pull/4489) or a `KDTree` index that selects nearest-neighbors within a given tolerance + - alignment may be "multi-dimensional", i.e., the `KDTree` example above vs. dimensions aligned independently of each other +- we need to decide what to do when one dimension has more than one index that supports alignment + - we should probably raise unless the user explicitly specifies which index to use for the alignment +- we need to decide what to do when one dimension has one or more index(es) but none support alignment + - either we raise or we fall back (silently) to alignment based on dimension size +- for inexact alignment, the tolerance threshold might be given when building the index and/or when performing the alignment +- are there cases where we want a specific index to perform alignment and another index to perform selection? + - it would be tricky to support that unless we allow multiple indexes per coordinate + - alternatively, underlying indexes could be picked internally in a "meta" index for one operation or another, although the risk is to eventually have to deal with an explosion of index wrapper classes with different meta indexes for each combination that we'd like to use.
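+ +For instance, an index wrapper supporting alignment might implement an equality check that involves more than the labels (a sketch only, reusing the hypothetical `XarrayIndex` base class from section 2.1): + +```python +class GeographicIndex(XarrayIndex): +    # hypothetical wrapper around a spatial index structure + +    def equals(self, other): +        # equality involves the coordinate reference system (CRS), +        # not just the coordinate labels themselves +        if self.crs != other.crs: +            return False +        return self.labels_equal(other)  # hypothetical label-comparison helper +``` +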
+ +### 2.7 Using indexes for other purposes + +Xarray also provides a number of Dataset/DataArray methods where indexes are used in various ways, e.g., + +- `resample` (`CFTimeIndex` and a `DatetimeIntervalIndex`) +- `DatetimeAccessor` & `TimedeltaAccessor` properties (`CFTimeIndex` and a `DatetimeIntervalIndex`) +- `interp` & `interpolate_na`, + - with `IntervalIndex`, these become regridding operations. Should we support hooks for these operations? +- `differentiate`, `integrate`, `polyfit` + - raise an error if not a "simple" 1D index? +- `pad` +- `coarsen` has to make choices about output index labels. +- `sortby` +- `stack`/`unstack` +- plotting + - `plot.pcolormesh` "infers" interval breaks along axes, which are really inferred `bounds` for the appropriate indexes. + - `plot.step` again uses `bounds`. In fact, we may even want `step` to be the default 1D plotting function if the axis has `bounds` attached. + +It would be reasonable to first restrict those methods to the indexes that are currently available in Xarray, and maybe extend the `XarrayIndex` API later upon request when the opportunity arises. + +Conversely, nothing should prevent implementing "non-standard" API in 3rd-party `XarrayIndex` subclasses that could be used in DataArray/Dataset extensions (accessors). For example, we might want to reuse a `KDTree` index to compute k-nearest neighbors (returning a DataArray/Dataset with a new dimension) and/or the distances to the nearest neighbors (returning a DataArray/Dataset with a new data variable). + +### 2.8 Index encoding + +Indexes don't need to be directly serializable since we could (re)build them from their corresponding coordinate(s). However, it would be useful if some indexes could be encoded/decoded to/from a set of arrays that would allow optimized reconstruction and/or storage, e.g., + +- `pandas.MultiIndex` -> `index.levels` and `index.codes` +- Scikit-learn's `KDTree` and `BallTree` that use an array-based representation of an immutable tree structure + +## 3. Index representation in DataArray/Dataset's `repr` + +Since indexes would become 1st-class citizens of Xarray's data model, they deserve their own section in Dataset/DataArray `repr` that could look like: + +``` +<xarray.DataArray (x: 2, y: 2)> +array([[5.4, 7.8], + [6.2, 4.7]]) +Coordinates: + * lon (x, y) float64 10.2 15.2 12.6 17.6 + * lat (x, y) float64 40.2 45.6 42.2 47.6 + * x (x) float64 200.0 400.0 + * y (y) float64 800.0 1e+03 +Indexes: + lat, lon + x + y +``` + +To keep the `repr` compact, we could: + +- consolidate entries that map to the same index object, and have a short inline repr for `XarrayIndex` objects +- collapse the index section by default in the HTML `repr` +- maybe omit all trivial indexes for 1D coordinates that match the dimension name + +## 4. `IndexVariable` + +`IndexVariable` is currently used to wrap a `pandas.Index` as a variable, which would not be relevant after this refactoring since it is aimed at decoupling indexes and variables. + +We'll probably need to move elsewhere some of the features implemented in `IndexVariable` to: + +- ensure that all coordinates with an index are immutable (see [Section 2.4.1](#241-mutable-coordinates)) + - do not set values directly, do not (re)chunk (even though it may be already chunked), do not load, do not convert to sparse/dense, etc.
+- directly reuse the index's data when that's possible + - in the case of a `pandas.Index`, it makes little sense to have duplicate data (e.g., as a NumPy array) for its corresponding coordinate +- convert a variable into a `pandas.Index` using `.to_index()` (for backwards compatibility). + +Other `IndexVariable` API like `level_names` and `get_level_variable()` would not be useful anymore: it is specific to how we currently deal with `pandas.MultiIndex` and virtual "level" coordinates in Xarray. + +## 5. Chunked coordinates and/or indexers + +We could take the opportunity of this refactoring to better leverage chunked coordinates (and/or chunked indexers for data selection). There are two ways to enable it: + +A. support for chunked coordinates is left to the index +B. support for chunked coordinates is index agnostic and is implemented in Xarray + +As an example for B, [xoak](https://github.com/ESM-VFC/xoak) supports building an index for each chunk, which is coupled with a two-step data selection process (cross-index queries + brute force "reduction" look-up). There is an example [here](https://xoak.readthedocs.io/en/latest/examples/dask_support.html). It may be tedious to generalize this to other kinds of operations, though. Xoak's Dask support is rather experimental, not super stable (it's quite hard to control index replication and data transfer between Dask workers with the default settings), and depends on whether indexes are thread-safe and/or serializable. + +Option A may be more reasonable for now. + +## 6. Coordinate duck arrays + +Another opportunity of this refactoring is support for duck arrays as index coordinates. Decoupling coordinates and indexes would *de-facto* enable it. + +However, support for duck arrays in index-based operations such as data selection or alignment would probably require some protocol extension, e.g., + +```python +class MyDuckArray: + ... + + def _sel_(self, indexer): + """Prepare the label-based indexer to conform to this coordinate array.""" + ... + return new_indexer + + ... +``` + +For example, a `pint` array would implement `_sel_` to perform indexer unit conversion or raise, warn, or just pass the indexer through if it has no units. diff --git a/doc/Makefile b/doc/Makefile index d88a8a59c39..8b08d3a2dbe 100644 --- a/doc/Makefile +++ b/doc/Makefile @@ -19,6 +19,7 @@ I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) . help: @echo "Please use \`make <target>' where <target> is one of" @echo " html to make standalone HTML files" + @echo " rtdhtml Build html using same settings used on ReadtheDocs" @echo " livehtml Make standalone HTML files and rebuild the documentation when a change is detected. Also includes a livereload enabled web server" @echo " dirhtml to make HTML files named index.html in directories" @echo " singlehtml to make a single large HTML file" @@ -58,6 +59,13 @@ html: @echo @echo "Build finished. The HTML pages are in $(BUILDDIR)/html." +.PHONY: rtdhtml +rtdhtml: + $(SPHINXBUILD) -T -j auto -E -W --keep-going -b html -d $(BUILDDIR)/doctrees -D language=en . $(BUILDDIR)/html + @echo + @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+ + .PHONY: livehtml livehtml: # @echo "$(SPHINXATUOBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html" diff --git a/doc/_static/style.css b/doc/_static/style.css index b7d30f429cf..833b11a83ab 100644 --- a/doc/_static/style.css +++ b/doc/_static/style.css @@ -1,27 +1,263 @@ -@import url("theme.css"); +table.colwidths-given { + table-layout: fixed; + width: 100%; +} +table.docutils td { + white-space: unset; + word-wrap: break-word; +} + +/* Reduce left and right margins */ + +.container, .container-lg, .container-md, .container-sm, .container-xl { + max-width: 1350px !important; +} + + +/* Copied from +https://github.com/bokeh/bokeh/blob/branch-2.4/sphinx/source/bokeh/static/custom.css +*/ + +:root { + /* Logo image height + all the paddings/margins make the navbar height. */ + --navbar-height: calc(30px + 0.3125rem * 2 + 0.5rem * 2); +} + +.bd-search { + position: relative; + padding-bottom: 20px; +} + +@media (min-width: 768px) { + .search-front-page { + width: 50%; + } +} + +/* minimal copy paste from bootstrap docs css to get sidebars working */ + +.bd-toc { + -ms-flex-order: 2; + order: 2; + padding-top: 1.5rem; + padding-bottom: 1.5rem; + /* font-size: 0.875rem; */ + /* add scrolling sidebar */ + height: calc(100vh - 2rem); + overflow-y: auto; +} + +@supports ((position: -webkit-sticky) or (position: sticky)) { + .bd-toc { + position: -webkit-sticky; + position: sticky; + top: 4rem; + height: calc(100vh - 4rem); + overflow-y: auto; + } +} + +.section-nav { + padding-left: 0; + border-left: 1px solid #eee; + border-bottom: none; +} + +.section-nav ul { + padding-left: 1rem; +} + +.toc-entry { + display: block; +} + +.toc-entry a { + display: block; + padding: 0.125rem 1.5rem; + color: #77757a; +} -.wy-side-nav-search>a img.logo, -.wy-side-nav-search .wy-dropdown>a img.logo { - width: 12rem +.toc-entry a:hover { + color: rgba(0, 0, 0, 0.85); + text-decoration: none; } -.wy-side-nav-search { - background-color: #eee; +.bd-sidebar { + -ms-flex-order: 0; + order: 0; + border-bottom: 1px solid rgba(0, 0, 0, 0.1); } -.wy-side-nav-search>div.version { +@media (min-width: 768px) { + .bd-sidebar { + border-right: 1px solid rgba(0, 0, 0, 0.1); + } + @supports ((position: -webkit-sticky) or (position: sticky)) { + .bd-sidebar { + position: -webkit-sticky; + position: sticky; + top: var(--navbar-height); + z-index: 1000; + height: calc(100vh - var(--navbar-height)); + } + } +} + +@media (min-width: 1200px) { + .bd-sidebar { + -ms-flex: 0 1 320px; + flex: 0 1 320px; + } +} + +.bd-links { + padding-top: 1rem; + padding-bottom: 1rem; + margin-right: -15px; + margin-left: -15px; +} + +@media (min-width: 768px) { + @supports ((position: -webkit-sticky) or (position: sticky)) { + .bd-links { + max-height: calc(100vh - 9rem); + overflow-y: auto; + } + } +} + +@media (min-width: 768px) { + .bd-links { + display: block !important; + } +} + +.bd-sidenav { display: none; } -.wy-nav-top { - background-color: #555; +.bd-toc-link { + display: block; + padding: 0.25rem 1.5rem; + font-weight: 400; + color: rgba(0, 0, 0, 0.65); } -table.colwidths-given { - table-layout: fixed; - width: 100%; +.bd-toc-link:hover { + color: rgba(0, 0, 0, 0.85); + text-decoration: none; } -table.docutils td { - white-space: unset; - word-wrap: break-word; + +.bd-toc-item.active { + margin-bottom: 1rem; +} + +.bd-toc-item.active:not(:first-child) { + margin-top: 1rem; +} + +.bd-toc-item.active > .bd-toc-link { + color: rgba(0, 0, 0, 0.85); +} + +.bd-toc-item.active > .bd-toc-link:hover { + background-color: transparent; +} + 
+.bd-toc-item.active > .bd-sidenav { + display: block; +} + +.bd-sidebar .nav > li > a { + display: block; + padding: 0.25rem 1.5rem; + font-size: 90%; + color: rgba(0, 0, 0, 0.65); +} + +.bd-sidebar .nav > li > a:hover { + color: rgba(0, 0, 0, 0.85); + text-decoration: none; + background-color: transparent; +} + +.bd-sidebar .nav > .active > a, +.bd-sidebar .nav > .active:hover > a { + font-weight: 400; + color: #130654; + /* adjusted from original + color: rgba(0, 0, 0, 0.85); + background-color: transparent; */ +} + +.bd-sidebar .nav > li > ul { + list-style: none; + padding: 0.25rem 1.5rem; +} + +.bd-sidebar .nav > li > ul > li > a { + display: block; + padding: 0.25rem 1.5rem; + font-size: 90%; + color: rgba(0, 0, 0, 0.65); +} + +.bd-sidebar .nav > li > ul > .active > a, +.bd-sidebar .nav > li > ul > .active:hover > a { + font-weight: 400; + color: #542437; +} + +dt:target { + background-color: initial; +} + +/* Offsetting anchored elements within the main content to adjust for fixed header + https://github.com/pandas-dev/pandas-sphinx-theme/issues/6 */ +main *:target::before { + display: block; + content: ''; + height: var(--navbar-height); + margin-top: calc(-1 * var(--navbar-height)); +} + +body { + /* Add padding to body to avoid overlap with navbar. */ + padding-top: var(--navbar-height); + width: 100%; +} + +/* adjust toc font sizes to improve overview */ +.toc-h2 { + font-size: 0.85rem; +} + +.toc-h3 { + font-size: 0.75rem; +} + +.toc-h4 { + font-size: 0.65rem; +} + +.toc-entry > .nav-link.active { + font-weight: 400; + color: #542437; + background-color: transparent; + border-left: 2px solid #563d7c; +} + +.nav-link:hover { + border-style: none; +} + +/* Collapsing of the TOC sidebar while scrolling */ + +/* Nav: hide second level (shown on .active) */ +.bd-toc .nav .nav { + display: none; +} + +.bd-toc .nav > .active > ul { + display: block; } diff --git a/doc/_static/thumbnails/ERA5-GRIB-example.png b/doc/_static/thumbnails/ERA5-GRIB-example.png new file mode 100644 index 00000000000..412dd28a6d9 Binary files /dev/null and b/doc/_static/thumbnails/ERA5-GRIB-example.png differ diff --git a/doc/_static/thumbnails/ROMS_ocean_model.png b/doc/_static/thumbnails/ROMS_ocean_model.png new file mode 100644 index 00000000000..9333335d1ef Binary files /dev/null and b/doc/_static/thumbnails/ROMS_ocean_model.png differ diff --git a/doc/_static/thumbnails/area_weighted_temperature.png b/doc/_static/thumbnails/area_weighted_temperature.png new file mode 100644 index 00000000000..7d3604d7c2b Binary files /dev/null and b/doc/_static/thumbnails/area_weighted_temperature.png differ diff --git a/doc/_static/thumbnails/monthly-means.png b/doc/_static/thumbnails/monthly-means.png new file mode 100644 index 00000000000..da5691848b0 Binary files /dev/null and b/doc/_static/thumbnails/monthly-means.png differ diff --git a/doc/_static/thumbnails/multidimensional-coords.png b/doc/_static/thumbnails/multidimensional-coords.png new file mode 100644 index 00000000000..b0d893d6894 Binary files /dev/null and b/doc/_static/thumbnails/multidimensional-coords.png differ diff --git a/doc/_static/thumbnails/toy-weather-data.png b/doc/_static/thumbnails/toy-weather-data.png new file mode 100644 index 00000000000..64ac0a4b021 Binary files /dev/null and b/doc/_static/thumbnails/toy-weather-data.png differ diff --git a/doc/_static/thumbnails/visualization_gallery.png b/doc/_static/thumbnails/visualization_gallery.png new file mode 100644 index 00000000000..9e6c2436be5 Binary files /dev/null and 
b/doc/_static/thumbnails/visualization_gallery.png differ diff --git a/doc/_templates/autosummary/base.rst b/doc/_templates/autosummary/base.rst deleted file mode 100644 index 53f2a29c193..00000000000 --- a/doc/_templates/autosummary/base.rst +++ /dev/null @@ -1,3 +0,0 @@ -:github_url: {{ fullname | github_url | escape_underscores }} - -{% extends "!autosummary/base.rst" %} diff --git a/doc/_templates/layout.html b/doc/_templates/layout.html deleted file mode 100644 index 4c57ba83056..00000000000 --- a/doc/_templates/layout.html +++ /dev/null @@ -1,2 +0,0 @@ -{% extends "!layout.html" %} -{% set css_files = css_files + ["_static/style.css"] %} diff --git a/doc/api-hidden.rst b/doc/api-hidden.rst index 14d79039a3a..f5e9348d4eb 100644 --- a/doc/api-hidden.rst +++ b/doc/api-hidden.rst @@ -287,6 +287,7 @@ core.accessor_dt.DatetimeAccessor.floor core.accessor_dt.DatetimeAccessor.round core.accessor_dt.DatetimeAccessor.strftime + core.accessor_dt.DatetimeAccessor.date core.accessor_dt.DatetimeAccessor.day core.accessor_dt.DatetimeAccessor.dayofweek core.accessor_dt.DatetimeAccessor.dayofyear @@ -324,14 +325,21 @@ core.accessor_dt.TimedeltaAccessor.seconds core.accessor_str.StringAccessor.capitalize + core.accessor_str.StringAccessor.casefold + core.accessor_str.StringAccessor.cat core.accessor_str.StringAccessor.center core.accessor_str.StringAccessor.contains core.accessor_str.StringAccessor.count core.accessor_str.StringAccessor.decode core.accessor_str.StringAccessor.encode core.accessor_str.StringAccessor.endswith + core.accessor_str.StringAccessor.extract + core.accessor_str.StringAccessor.extractall core.accessor_str.StringAccessor.find + core.accessor_str.StringAccessor.findall + core.accessor_str.StringAccessor.format core.accessor_str.StringAccessor.get + core.accessor_str.StringAccessor.get_dummies core.accessor_str.StringAccessor.index core.accessor_str.StringAccessor.isalnum core.accessor_str.StringAccessor.isalpha @@ -342,20 +350,26 @@ core.accessor_str.StringAccessor.isspace core.accessor_str.StringAccessor.istitle core.accessor_str.StringAccessor.isupper + core.accessor_str.StringAccessor.join core.accessor_str.StringAccessor.len core.accessor_str.StringAccessor.ljust core.accessor_str.StringAccessor.lower core.accessor_str.StringAccessor.lstrip core.accessor_str.StringAccessor.match + core.accessor_str.StringAccessor.normalize core.accessor_str.StringAccessor.pad + core.accessor_str.StringAccessor.partition core.accessor_str.StringAccessor.repeat core.accessor_str.StringAccessor.replace core.accessor_str.StringAccessor.rfind core.accessor_str.StringAccessor.rindex core.accessor_str.StringAccessor.rjust + core.accessor_str.StringAccessor.rpartition + core.accessor_str.StringAccessor.rsplit core.accessor_str.StringAccessor.rstrip core.accessor_str.StringAccessor.slice core.accessor_str.StringAccessor.slice_replace + core.accessor_str.StringAccessor.split core.accessor_str.StringAccessor.startswith core.accessor_str.StringAccessor.strip core.accessor_str.StringAccessor.swapcase @@ -811,3 +825,28 @@ backends.DummyFileManager.acquire backends.DummyFileManager.acquire_context backends.DummyFileManager.close + + backends.common.BackendArray + backends.common.BackendEntrypoint + backends.common.BackendEntrypoint.guess_can_open + backends.common.BackendEntrypoint.open_dataset + + core.indexing.IndexingSupport + core.indexing.explicit_indexing_adapter + core.indexing.BasicIndexer + core.indexing.OuterIndexer + core.indexing.VectorizedIndexer + core.indexing.LazilyIndexedArray + 
core.indexing.LazilyVectorizedIndexedArray + + conventions.decode_cf_variables + + coding.variables.UnsignedIntegerCoder + coding.variables.CFMaskCoder + coding.variables.CFScaleOffsetCoder + + coding.strings.CharacterArrayCoder + coding.strings.EncodedStringCoder + + coding.times.CFTimedeltaCoder + coding.times.CFDatetimeCoder diff --git a/doc/api.rst b/doc/api.rst index 9add7a96109..4af9bb01208 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -138,6 +138,7 @@ Indexing Dataset.set_index Dataset.reset_index Dataset.reorder_levels + Dataset.query Missing value handling ---------------------- @@ -321,6 +322,7 @@ Indexing DataArray.set_index DataArray.reset_index DataArray.reorder_levels + DataArray.query Missing value handling ---------------------- @@ -420,38 +422,58 @@ String manipulation :toctree: generated/ :template: autosummary/accessor_method.rst + DataArray.str._apply + DataArray.str._padder + DataArray.str._partitioner + DataArray.str._re_compile + DataArray.str._splitter + DataArray.str._stringify DataArray.str.capitalize + DataArray.str.casefold + DataArray.str.cat DataArray.str.center DataArray.str.contains DataArray.str.count DataArray.str.decode DataArray.str.encode DataArray.str.endswith + DataArray.str.extract + DataArray.str.extractall DataArray.str.find + DataArray.str.findall + DataArray.str.format DataArray.str.get + DataArray.str.get_dummies DataArray.str.index DataArray.str.isalnum DataArray.str.isalpha DataArray.str.isdecimal DataArray.str.isdigit + DataArray.str.islower DataArray.str.isnumeric DataArray.str.isspace DataArray.str.istitle DataArray.str.isupper + DataArray.str.join DataArray.str.len DataArray.str.ljust DataArray.str.lower DataArray.str.lstrip DataArray.str.match + DataArray.str.normalize DataArray.str.pad + DataArray.str.partition DataArray.str.repeat DataArray.str.replace DataArray.str.rfind DataArray.str.rindex DataArray.str.rjust + DataArray.str.rpartition + DataArray.str.rsplit DataArray.str.rstrip DataArray.str.slice DataArray.str.slice_replace + DataArray.str.split DataArray.str.startswith DataArray.str.strip DataArray.str.swapcase @@ -487,6 +509,7 @@ Datetimelike properties DataArray.dt.daysinmonth DataArray.dt.season DataArray.dt.time + DataArray.dt.date DataArray.dt.is_month_start DataArray.dt.is_month_end DataArray.dt.is_quarter_end diff --git a/doc/conf.py b/doc/conf.py index db7229cfa61..86ff3768411 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -13,14 +13,13 @@ import datetime +import inspect import os -import pathlib import subprocess import sys from contextlib import suppress import sphinx_autosummary_accessors -from jinja2.defaults import DEFAULT_FILTERS import xarray @@ -80,7 +79,11 @@ "IPython.sphinxext.ipython_console_highlighting", "nbsphinx", "sphinx_autosummary_accessors", - "scanpydoc.rtd_github_links", + "sphinx.ext.linkcode", + "sphinx_panels", + "sphinxext.opengraph", + "sphinx_copybutton", + "sphinxext.rediraffe", ] extlinks = { @@ -88,6 +91,12 @@ "pull": ("https://github.com/pydata/xarray/pull/%s", "PR"), } +# sphinx-copybutton configurations +copybutton_prompt_text = r">>> |\.\.\. 
|\$ |In \[\d*\]: | {2,5}\.\.\.: | {5,8}: " +copybutton_prompt_is_regexp = True + +# nbsphinx configurations + nbsphinx_timeout = 600 nbsphinx_execute = "always" nbsphinx_prolog = """ @@ -100,20 +109,12 @@ """ autosummary_generate = True - -# for scanpydoc's jinja filter -project_dir = pathlib.Path(__file__).parent.parent -html_context = { - "github_user": "pydata", - "github_repo": "xarray", - "github_version": "master", -} - autodoc_typehints = "none" +# Napoleon configurations + napoleon_google_docstring = False napoleon_numpy_docstring = True - napoleon_use_param = False napoleon_use_rtype = False napoleon_preprocess_types = True @@ -167,17 +168,13 @@ "pd.NaT": "~pandas.NaT", } -numpydoc_class_members_toctree = True -numpydoc_show_class_members = False # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates", sphinx_autosummary_accessors.templates_path] # The suffix of source filenames. -source_suffix = ".rst" +# source_suffix = ".rst" -# The encoding of source files. -# source_encoding = 'utf-8-sig' # The master toctree document. master_doc = "index" @@ -186,19 +183,11 @@ project = "xarray" copyright = "2014-%s, xarray Developers" % datetime.datetime.now().year -# The version info for the project you're documenting, acts as replacement for -# |version| and |release|, also used in various other places throughout the -# built documents. -# # The short X.Y version. version = xarray.__version__.split("+")[0] # The full version, including alpha/beta/rc tags. release = xarray.__version__ -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# language = None - # There are two options for replacing |today|: either, you set today to some # non-false value, then it is used: # today = '' @@ -209,51 +198,45 @@ # directories to ignore when looking for source files. exclude_patterns = ["_build", "**.ipynb_checkpoints"] -# The reST default role (used for this markup: `text`) to use for all -# documents. -# default_role = None - -# If true, '()' will be appended to :func: etc. cross-reference text. -# add_function_parentheses = True - -# If true, the current module name will be prepended to all description -# unit titles (such as .. function::). -# add_module_names = True - -# If true, sectionauthor and moduleauthor directives will be shown in the -# output. They are ignored by default. -# show_authors = False # The name of the Pygments (syntax highlighting) style to use. pygments_style = "sphinx" -# A list of ignored prefixes for module index sorting. -# modindex_common_prefix = [] - -# If true, keep warnings as "system message" paragraphs in the built documents. -# keep_warnings = False - # -- Options for HTML output ---------------------------------------------- - # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. -html_theme = "sphinx_rtd_theme" +html_theme = "sphinx_book_theme" +html_title = "" + +html_context = { + "github_user": "pydata", + "github_repo": "xarray", + "github_version": "master", + "doc_path": "doc", +} # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the # documentation. -html_theme_options = {"logo_only": True} - -# Add any paths that contain custom themes here, relative to this directory. -# html_theme_path = [] - -# The name for this set of Sphinx documents. If None, it defaults to -# " v documentation". 
-# html_title = None +html_theme_options = dict( + # analytics_id='' this is configured in rtfd.io + # canonical_url="", + repository_url="https://github.com/pydata/xarray", + repository_branch="master", + path_to_docs="doc", + use_edit_page_button=True, + use_repository_button=True, + use_issues_button=True, + home_page_in_toc=False, + extra_navbar="", + navbar_footer_text="", + extra_footer="""

Xarray is a fiscally sponsored project of NumFOCUS, + a nonprofit dedicated to supporting the open-source scientific computing community.
+ Theme by the Executable Book Project

""", + twitter_url="https://twitter.com/xarray_devs", +) -# A shorter title for the navigation bar. Default is the same as html_title. -# html_short_title = None # The name of an image file (relative to this directory) to place at the top # of the sidebar. @@ -268,6 +251,42 @@ # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". html_static_path = ["_static"] +html_css_files = ["style.css"] + + +# configuration for sphinxext.opengraph +ogp_site_url = "https://xarray.pydata.org/en/latest/" +ogp_image = "https://xarray.pydata.org/en/stable/_static/dataset-diagram-logo.png" +ogp_custom_meta_tags = [ + '', + '', +] + +# Redirects for pages that were moved to new locations + +rediraffe_redirects = { + "terminology.rst": "user-guide/terminology.rst", + "data-structures.rst": "user-guide/data-structures.rst", + "indexing.rst": "user-guide/indexing.rst", + "interpolation.rst": "user-guide/interpolation.rst", + "computation.rst": "user-guide/computation.rst", + "groupby.rst": "user-guide/groupby.rst", + "reshaping.rst": "user-guide/reshaping.rst", + "combining.rst": "user-guide/combining.rst", + "time-series.rst": "user-guide/time-series.rst", + "weather-climate.rst": "user-guide/weather-climate.rst", + "pandas.rst": "user-guide/pandas.rst", + "io.rst": "user-guide/io.rst", + "dask.rst": "user-guide/dask.rst", + "plotting.rst": "user-guide/plotting.rst", + "duckarrays.rst": "user-guide/duckarrays.rst", + "related-projects.rst": "ecosystem.rst", + "faq.rst": "getting-started-guide/faq.rst", + "why-xarray.rst": "getting-started-guide/why-xarray.rst", + "installing.rst": "getting-started-guide/installing.rst", + "quick-overview.rst": "getting-started-guide/quick-overview.rst", +} # Sometimes the savefig directory doesn't exist and needs to be created # https://github.com/ipython/ipython/issues/8733 @@ -278,135 +297,15 @@ if not os.path.exists(ipython_savefig_dir): os.makedirs(ipython_savefig_dir) -# Add any extra paths that contain custom files (such as robots.txt or -# .htaccess) here, relative to this directory. These files are copied -# directly to the root of the documentation. -# html_extra_path = [] # If not '', a 'Last updated on:' timestamp is inserted at every page bottom, # using the given strftime format. html_last_updated_fmt = today_fmt -# If true, SmartyPants will be used to convert quotes and dashes to -# typographically correct entities. -# html_use_smartypants = True - -# Custom sidebar templates, maps document names to template names. -# html_sidebars = {} - -# Additional templates that should be rendered to pages, maps page names to -# template names. -# html_additional_pages = {} - -# If false, no module index is generated. -# html_domain_indices = True - -# If false, no index is generated. -# html_use_index = True - -# If true, the index is split into individual pages for each letter. -# html_split_index = False - -# If true, links to the reST sources are added to the pages. -# html_show_sourcelink = True - -# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. -# html_show_sphinx = True - -# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. -# html_show_copyright = True - -# If true, an OpenSearch description file will be output, and all pages will -# contain a tag referring to it. The value of this option must be the -# base URL from which the finished HTML is served. 
-# html_use_opensearch = '' - -# This is the file name suffix for HTML files (e.g. ".xhtml"). -# html_file_suffix = None - # Output file base name for HTML help builder. htmlhelp_basename = "xarraydoc" -# -- Options for LaTeX output --------------------------------------------- - -# latex_elements = { -# # The paper size ('letterpaper' or 'a4paper'). -# # 'papersize': 'letterpaper', -# # The font size ('10pt', '11pt' or '12pt'). -# # 'pointsize': '10pt', -# # Additional stuff for the LaTeX preamble. -# # 'preamble': '', -# } - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -# latex_documents = [ -# ("index", "xarray.tex", "xarray Documentation", "xarray Developers", "manual") -# ] - -# The name of an image file (relative to this directory) to place at the top of -# the title page. -# latex_logo = None - -# For "manual" documents, if this is true, then toplevel headings are parts, -# not chapters. -# latex_use_parts = False - -# If true, show page references after internal links. -# latex_show_pagerefs = False - -# If true, show URL addresses after external links. -# latex_show_urls = False - -# Documents to append as an appendix to all manuals. -# latex_appendices = [] - -# If false, no module index is generated. -# latex_domain_indices = True - - -# -- Options for manual page output --------------------------------------- - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -# man_pages = [("index", "xarray", "xarray Documentation", ["xarray Developers"], 1)] - -# If true, show URL addresses after external links. -# man_show_urls = False - - -# -- Options for Texinfo output ------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -# texinfo_documents = [ -# ( -# "index", -# "xarray", -# "xarray Documentation", -# "xarray Developers", -# "xarray", -# "N-D labeled arrays and datasets in Python.", -# "Miscellaneous", -# ) -# ] - -# Documents to append as an appendix to all manuals. -# texinfo_appendices = [] - -# If false, no module index is generated. -# texinfo_domain_indices = True - -# How to display URL addresses: 'footnote', 'no', or 'inline'. -# texinfo_show_urls = 'footnote' - -# If true, do not generate a @detailmenu in the "Top" node's menu. -# texinfo_no_detailmenu = False - - # Example configuration for intersphinx: refer to the Python standard library. 
intersphinx_mapping = { "python": ("https://docs.python.org/3/", None), @@ -423,9 +322,61 @@ } -def escape_underscores(string): - return string.replace("_", r"\_") +# based on numpy doc/source/conf.py +def linkcode_resolve(domain, info): + """ + Determine the URL corresponding to Python object + """ + if domain != "py": + return None + + modname = info["module"] + fullname = info["fullname"] + + submod = sys.modules.get(modname) + if submod is None: + return None + + obj = submod + for part in fullname.split("."): + try: + obj = getattr(obj, part) + except AttributeError: + return None + + try: + fn = inspect.getsourcefile(inspect.unwrap(obj)) + except TypeError: + fn = None + if not fn: + return None + + try: + source, lineno = inspect.getsourcelines(obj) + except OSError: + lineno = None + + if lineno: + linespec = f"#L{lineno}-L{lineno + len(source) - 1}" + else: + linespec = "" + + fn = os.path.relpath(fn, start=os.path.dirname(xarray.__file__)) + + if "+" in xarray.__version__: + return f"https://github.com/pydata/xarray/blob/master/xarray/{fn}{linespec}" + else: + return ( + f"https://github.com/pydata/xarray/blob/" + f"v{xarray.__version__}/xarray/{fn}{linespec}" + ) + + +def html_page_context(app, pagename, templatename, context, doctree): + # Disable edit button for docstring generated pages + if "generated" in pagename: + context["theme_use_edit_page_button"] = False def setup(app): - DEFAULT_FILTERS["escape_underscores"] = escape_underscores + app.connect("html-page-context", html_page_context) diff --git a/doc/contributing.rst b/doc/contributing.rst index 439791cbbd6..345443001a0 100644 --- a/doc/contributing.rst +++ b/doc/contributing.rst @@ -4,8 +4,6 @@ Contributing to xarray ********************** -.. contents:: Table of contents: - :local: .. note:: diff --git a/doc/related-projects.rst b/doc/ecosystem.rst similarity index 99% rename from doc/related-projects.rst rename to doc/ecosystem.rst index 0a010195d6d..01f5c29b9f5 100644 --- a/doc/related-projects.rst +++ b/doc/ecosystem.rst @@ -1,4 +1,4 @@ -.. _related-projects: +.. _ecosystem: Xarray related projects ----------------------- diff --git a/doc/examples.rst b/doc/examples.rst deleted file mode 100644 index 102138b6e4e..00000000000 --- a/doc/examples.rst +++ /dev/null @@ -1,29 +0,0 @@ -Examples -======== - -.. toctree:: - :maxdepth: 1 - - examples/weather-data - examples/monthly-means - examples/area_weighted_temperature - examples/multidimensional-coords - examples/visualization_gallery - examples/ROMS_ocean_model - examples/ERA5-GRIB-example - -Using apply_ufunc ------------------- -.. toctree:: - :maxdepth: 1 - - examples/apply_ufunc_vectorize_1d - -External Examples ------------------ -.. toctree:: - :maxdepth: 2 - - Managing raster data with rioxarray - Xarray with dask - Xarray and dask on the cloud with Pangeo diff --git a/doc/gallery.rst b/doc/gallery.rst new file mode 100644 index 00000000000..9e5284cc2ee --- /dev/null +++ b/doc/gallery.rst @@ -0,0 +1,130 @@ +Gallery +======= + +Here's a list of examples on how to use xarray. We will be adding more examples soon. +Contributions are highly welcomed and appreciated. So, if you are interested in contributing, please consult the +:doc:`contributing` guide. + + + +Notebook Examples +----------------- + +.. panels:: + :column: text-center col-lg-6 col-md-6 col-sm-12 col-xs-12 p-2 + :card: +my-2 + :img-top-cls: w-75 m-auto p-2 + :body: d-none + + --- + :img-top: _static/thumbnails/toy-weather-data.png + ++++ + .. 
link-button:: examples/weather-data + :type: ref + :text: Toy weather data + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/thumbnails/monthly-means.png + ++++ + .. link-button:: examples/monthly-means + :type: ref + :text: Calculating Seasonal Averages from Timeseries of Monthly Means + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/thumbnails/area_weighted_temperature.png + ++++ + .. link-button:: examples/area_weighted_temperature + :type: ref + :text: Compare weighted and unweighted mean temperature + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/thumbnails/multidimensional-coords.png + ++++ + .. link-button:: examples/multidimensional-coords + :type: ref + :text: Working with Multidimensional Coordinates + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/thumbnails/visualization_gallery.png + ++++ + .. link-button:: examples/visualization_gallery + :type: ref + :text: Visualization Gallery + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/thumbnails/ROMS_ocean_model.png + ++++ + .. link-button:: examples/ROMS_ocean_model + :type: ref + :text: ROMS Ocean Model Example + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/thumbnails/ERA5-GRIB-example.png + ++++ + .. link-button:: examples/ERA5-GRIB-example + :type: ref + :text: GRIB Data Example + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/dataset-diagram-square-logo.png + ++++ + .. link-button:: examples/apply_ufunc_vectorize_1d + :type: ref + :text: Applying unvectorized functions with apply_ufunc + :classes: btn-outline-dark btn-block stretched-link + + +.. toctree:: + :maxdepth: 1 + :hidden: + + examples/weather-data + examples/monthly-means + examples/area_weighted_temperature + examples/multidimensional-coords + examples/visualization_gallery + examples/ROMS_ocean_model + examples/ERA5-GRIB-example + examples/apply_ufunc_vectorize_1d + + +External Examples +----------------- + + +.. panels:: + :column: text-center col-lg-6 col-md-6 col-sm-12 col-xs-12 p-2 + :card: +my-2 + :img-top-cls: w-75 m-auto p-2 + :body: d-none + + --- + :img-top: _static/dataset-diagram-square-logo.png + ++++ + .. link-button:: https://corteva.github.io/rioxarray/stable/examples/examples.html + :type: url + :text: Managing raster data with rioxarray + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: https://avatars.githubusercontent.com/u/60833341?s=200&v=4 + ++++ + .. link-button:: http://gallery.pangeo.io/ + :type: url + :text: Xarray and dask on the cloud with Pangeo + :classes: btn-outline-dark btn-block stretched-link + + --- + :img-top: _static/dataset-diagram-square-logo.png + ++++ + .. link-button:: https://examples.dask.org/xarray.html + :type: url + :text: Xarray with Dask Arrays + :classes: btn-outline-dark btn-block stretched-link diff --git a/doc/faq.rst b/doc/getting-started-guide/faq.rst similarity index 99% rename from doc/faq.rst rename to doc/getting-started-guide/faq.rst index a2151cc4b37..4cf3cc5b63d 100644 --- a/doc/faq.rst +++ b/doc/getting-started-guide/faq.rst @@ -185,7 +185,7 @@ for more details. What other projects leverage xarray? ------------------------------------ -See section :ref:`related-projects`. +See section :ref:`ecosystem`. How should I cite xarray? 
------------------------- diff --git a/doc/getting-started-guide/index.rst b/doc/getting-started-guide/index.rst new file mode 100644 index 00000000000..20fd49fb2c4 --- /dev/null +++ b/doc/getting-started-guide/index.rst @@ -0,0 +1,15 @@ +################ +Getting Started +################ + +The getting started guide aims to get you using xarray productively as quickly as possible. +It is designed as an entry point for new users, and it provides an introduction to xarray's main concepts. + +.. toctree:: + :maxdepth: 2 + :hidden: + + why-xarray + installing + quick-overview + faq diff --git a/doc/installing.rst b/doc/getting-started-guide/installing.rst similarity index 92% rename from doc/installing.rst rename to doc/getting-started-guide/installing.rst index 396f24b9151..fb88c223c98 100644 --- a/doc/installing.rst +++ b/doc/getting-started-guide/installing.rst @@ -144,15 +144,15 @@ being updated in the default channel. If you don't use conda, be sure you have the required dependencies (numpy and pandas) installed first. Then, install xarray with pip:: - $ pip install xarray + $ python -m pip install xarray We also maintain other dependency sets for different subsets of functionality:: - $ pip install "xarray[io]" # Install optional dependencies for handling I/O - $ pip install "xarray[accel]" # Install optional dependencies for accelerating xarray - $ pip install "xarray[parallel]" # Install optional dependencies for dask arrays - $ pip install "xarray[viz]" # Install optional dependencies for visualization - $ pip install "xarray[complete]" # Install all the above + $ python -m pip install "xarray[io]" # Install optional dependencies for handling I/O + $ python -m pip install "xarray[accel]" # Install optional dependencies for accelerating xarray + $ python -m pip install "xarray[parallel]" # Install optional dependencies for dask arrays + $ python -m pip install "xarray[viz]" # Install optional dependencies for visualization + $ python -m pip install "xarray[complete]" # Install all the above The above commands should install most of the `optional dependencies`_. However, some packages which are either not listed on PyPI or require extra installation steps are excluded. To know which dependencies would be installed, take a look at the ``[options.extras_require]`` section in ``setup.cfg``: -.. literalinclude:: ../setup.cfg +.. literalinclude:: ../../setup.cfg :language: ini :start-at: [options.extras_require] :end-before: [options.package_data] diff --git a/doc/quick-overview.rst b/doc/getting-started-guide/quick-overview.rst similarity index 100% rename from doc/quick-overview.rst rename to doc/getting-started-guide/quick-overview.rst diff --git a/doc/why-xarray.rst b/doc/getting-started-guide/why-xarray.rst similarity index 100% rename from doc/why-xarray.rst rename to doc/getting-started-guide/why-xarray.rst diff --git a/doc/index.rst b/doc/index.rst index ee44d0ad4d9..c4c9d89264b 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -22,122 +22,56 @@ computing. .. _dask: http://dask.org .. _netCDF: http://www.unidata.ucar.edu/software/netcdf -Documentation ------------- - -**Getting Started** - -* :doc:`why-xarray` -* :doc:`faq` -* :doc:`quick-overview` -* :doc:`examples` -* :doc:`installing` .. 
toctree:: - :maxdepth: 1 + :maxdepth: 2 :hidden: - :caption: Getting Started - - why-xarray - faq - quick-overview - examples - installing - -**User Guide** - -* :doc:`terminology` -* :doc:`data-structures` -* :doc:`indexing` -* :doc:`interpolation` -* :doc:`computation` -* :doc:`groupby` -* :doc:`reshaping` -* :doc:`combining` -* :doc:`time-series` -* :doc:`weather-climate` -* :doc:`pandas` -* :doc:`io` -* :doc:`dask` -* :doc:`plotting` -* :doc:`duckarrays` + :caption: For users + + Getting Started + User Guide + Gallery + Tutorials & Videos + API Reference + How do I ... + Ecosystem .. toctree:: - :maxdepth: 1 + :maxdepth: 2 :hidden: - :caption: User Guide - - terminology - data-structures - indexing - interpolation - computation - groupby - reshaping - combining - time-series - weather-climate - pandas - io - dask - plotting - duckarrays - -**Help & reference** - -* :doc:`whats-new` -* :doc:`howdoi` -* :doc:`api` -* :doc:`internals` -* :doc:`roadmap` -* :doc:`contributing` -* :doc:`related-projects` + :caption: For developers/contributors + + Contributing Guide + Xarray Internals + Development Roadmap + Team + What’s New + GitHub repository .. toctree:: :maxdepth: 1 :hidden: - :caption: Help & reference + :caption: Community + + GitHub discussions + StackOverflow - whats-new - howdoi - api - internals - roadmap - contributing - related-projects -See also --------- -- `Xarray's Tutorial`_ presented at the 2020 SciPy Conference (`video recording`_). -- Stephan Hoyer and Joe Hamman's `Journal of Open Research Software paper`_ describing the xarray project. -- The `UW eScience Institute's Geohackweek`_ tutorial on xarray for geospatial data scientists. -- Stephan Hoyer's `SciPy2015 talk`_ introducing xarray to a general audience. -- Stephan Hoyer's `2015 Unidata Users Workshop talk`_ and `tutorial`_ (`with answers`_) introducing - xarray to users familiar with netCDF. -- `Nicolas Fauchereau's tutorial`_ on xarray for netCDF users. - -.. _Xarray's Tutorial: https://xarray-contrib.github.io/xarray-tutorial/ -.. _video recording: https://youtu.be/mecN-Ph_-78 -.. _Journal of Open Research Software paper: http://doi.org/10.5334/jors.148 -.. _UW eScience Institute's Geohackweek : https://geohackweek.github.io/nDarrays/ -.. _SciPy2015 talk: https://www.youtube.com/watch?v=X0pAhJgySxk -.. _2015 Unidata Users Workshop talk: https://www.youtube.com/watch?v=J9ypQOnt5l8 -.. _tutorial: https://github.com/Unidata/unidata-users-workshop/blob/master/notebooks/xray-tutorial.ipynb -.. _with answers: https://github.com/Unidata/unidata-users-workshop/blob/master/notebooks/xray-tutorial-with-answers.ipynb -.. _Nicolas Fauchereau's tutorial: http://nbviewer.iPython.org/github/nicolasfauchereau/metocean/blob/master/notebooks/xray.ipynb Get in touch ------------ -- Ask usage questions ("How do I?") on `StackOverflow`_. +- If you have a question like "How do I concatenate a list of datasets?", ask on `GitHub discussions`_ or `StackOverflow`_. + Please include a self-contained reproducible example if possible. - Report bugs, suggest features or view the source code `on GitHub`_. - For less well defined questions or ideas, or to announce other projects of - interest to xarray users, use the `mailing list`_. + interest to xarray users, use `GitHub discussions`_ or the `mailing list`_. -.. _StackOverFlow: http://stackoverflow.com/questions/tagged/python-xarray +.. _StackOverFlow: https://stackoverflow.com/questions/tagged/python-xarray +.. _Github discussions: https://github.com/pydata/xarray/discussions .. 
_mailing list: https://groups.google.com/forum/#!forum/xarray -.. _on GitHub: http://github.com/pydata/xarray +.. _on GitHub: https://github.com/pydata/xarray NumFOCUS -------- diff --git a/doc/internals.rst b/doc/internals.rst deleted file mode 100644 index f3d67de9077..00000000000 --- a/doc/internals.rst +++ /dev/null @@ -1,250 +0,0 @@ -.. _internals: - -xarray Internals -================ - -.. currentmodule:: xarray - -xarray builds upon two of the foundational libraries of the scientific Python -stack, NumPy and pandas. It is written in pure Python (no C or Cython -extensions), which makes it easy to develop and extend. Instead, we push -compiled code to :ref:`optional dependencies`. - -.. ipython:: python - :suppress: - - import dask.array as da - import numpy as np - import pandas as pd - import sparse - import xarray as xr - - np.random.seed(123456) - -Variable objects ----------------- - -The core internal data structure in xarray is the :py:class:`~xarray.Variable`, -which is used as the basic building block behind xarray's -:py:class:`~xarray.Dataset` and :py:class:`~xarray.DataArray` types. A -``Variable`` consists of: - -- ``dims``: A tuple of dimension names. -- ``data``: The N-dimensional array (typically, a NumPy or Dask array) storing - the Variable's data. It must have the same number of dimensions as the length - of ``dims``. -- ``attrs``: An ordered dictionary of metadata associated with this array. By - convention, xarray's built-in operations never use this metadata. -- ``encoding``: Another ordered dictionary used to store information about how - these variable's data is represented on disk. See :ref:`io.encoding` for more - details. - -``Variable`` has an interface similar to NumPy arrays, but extended to make use -of named dimensions. For example, it uses ``dim`` in preference to an ``axis`` -argument for methods like ``mean``, and supports :ref:`compute.broadcasting`. - -However, unlike ``Dataset`` and ``DataArray``, the basic ``Variable`` does not -include coordinate labels along each axis. - -``Variable`` is public API, but because of its incomplete support for labeled -data, it is mostly intended for advanced uses, such as in xarray itself or for -writing new backends. You can access the variable objects that correspond to -xarray objects via the (readonly) :py:attr:`Dataset.variables -` and -:py:attr:`DataArray.variable ` attributes. - - -.. _internals.duck_arrays: - -Integrating with duck arrays ----------------------------- - -.. warning:: - - This is a experimental feature. - -xarray can wrap custom :term:`duck array` objects as long as they define numpy's -``shape``, ``dtype`` and ``ndim`` properties and the ``__array__``, -``__array_ufunc__`` and ``__array_function__`` methods. - -In certain situations (e.g. when printing the collapsed preview of -variables of a ``Dataset``), xarray will display the repr of a :term:`duck array` -in a single line, truncating it to a certain number of characters. If that -would drop too much information, the :term:`duck array` may define a -``_repr_inline_`` method that takes ``max_width`` (number of characters) as an -argument: - -.. code:: python - - class MyDuckArray: - ... - - def _repr_inline_(self, max_width): - """ format to a single line with at most max_width characters """ - ... - - ... - -To avoid duplicated information, this method must omit information about the shape and -:term:`dtype`. For example, the string representation of a ``dask`` array or a -``sparse`` matrix would be: - -.. 
ipython:: python - - a = da.linspace(0, 1, 20, chunks=2) - a - - b = np.eye(10) - b[[5, 7, 3, 0], [6, 8, 2, 9]] = 2 - b = sparse.COO.from_numpy(b) - b - - xr.Dataset({"a": ("x", a), "b": (("y", "z"), b)}) - -Extending xarray ----------------- - -xarray is designed as a general purpose library, and hence tries to avoid -including overly domain specific functionality. But inevitably, the need for more -domain specific logic arises. - -One standard solution to this problem is to subclass Dataset and/or DataArray to -add domain specific functionality. However, inheritance is not very robust. It's -easy to inadvertently use internal APIs when subclassing, which means that your -code may break when xarray upgrades. Furthermore, many builtin methods will -only return native xarray objects. - -The standard advice is to use `composition over inheritance`__, but -reimplementing an API as large as xarray's on your own objects can be an onerous -task, even if most methods are only forwarding to xarray implementations. - -__ https://github.com/pydata/xarray/issues/706 - -If you simply want the ability to call a function with the syntax of a -method call, then the builtin :py:meth:`~xarray.DataArray.pipe` method (copied -from pandas) may suffice. - -To resolve this issue for more complex cases, xarray has the -:py:func:`~xarray.register_dataset_accessor` and -:py:func:`~xarray.register_dataarray_accessor` decorators for adding custom -"accessors" on xarray objects. Here's how you might use these decorators to -write a custom "geo" accessor implementing a geography specific extension to -xarray: - -.. literalinclude:: examples/_code/accessor_example.py - -In general, the only restriction on the accessor class is that the ``__init__`` method -must have a single parameter: the ``Dataset`` or ``DataArray`` object it is supposed -to work on. - -This achieves the same result as if the ``Dataset`` class had a cached property -defined that returns an instance of your class: - -.. code-block:: python - - class Dataset: - ... - - @property - def geo(self): - return GeoAccessor(self) - -However, using the register accessor decorators is preferable to simply adding -your own ad-hoc property (i.e., ``Dataset.geo = property(...)``), for several -reasons: - -1. It ensures that the name of your property does not accidentally conflict with - any other attributes or methods (including other accessors). -2. Instances of accessor object will be cached on the xarray object that creates - them. This means you can save state on them (e.g., to cache computed - properties). -3. Using an accessor provides an implicit namespace for your custom - functionality that clearly identifies it as separate from built-in xarray - methods. - -.. note:: - - Accessors are created once per DataArray and Dataset instance. New - instances, like those created from arithmetic operations or when accessing - a DataArray from a Dataset (ex. ``ds[var_name]``), will have new - accessors created. - -Back in an interactive IPython session, we can use these properties: - -.. ipython:: python - :suppress: - - exec(open("examples/_code/accessor_example.py").read()) - -.. ipython:: python - - ds = xr.Dataset({"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}) - ds.geo.center - ds.geo.plot() - -The intent here is that libraries that extend xarray could add such an accessor -to implement subclass specific functionality rather than using actual subclasses -or patching in a large number of domain specific methods. 
For further reading -on ways to write new accessors and the philosophy behind the approach, see -:issue:`1080`. - -To help users keep things straight, please `let us know -`_ if you plan to write a new accessor -for an open source library. In the future, we will maintain a list of accessors -and the libraries that implement them on this page. - -To make documenting accessors with ``sphinx`` and ``sphinx.ext.autosummary`` -easier, you can use `sphinx-autosummary-accessors`_. - -.. _sphinx-autosummary-accessors: https://sphinx-autosummary-accessors.readthedocs.io/ - -.. _zarr_encoding: - -Zarr Encoding Specification ---------------------------- - -In implementing support for the `Zarr `_ storage -format, Xarray developers made some *ad hoc* choices about how to store -NetCDF data in Zarr. -Future versions of the Zarr spec will likely include a more formal convention -for the storage of the NetCDF data model in Zarr; see -`Zarr spec repo `_ for ongoing -discussion. - -First, Xarray can only read and write Zarr groups. There is currently no support -for reading / writting individual Zarr arrays. Zarr groups are mapped to -Xarray ``Dataset`` objects. - -Second, from Xarray's point of view, the key difference between -NetCDF and Zarr is that all NetCDF arrays have *dimension names* while Zarr -arrays do not. Therefore, in order to store NetCDF data in Zarr, Xarray must -somehow encode and decode the name of each array's dimensions. - -To accomplish this, Xarray developers decided to define a special Zarr array -attribute: ``_ARRAY_DIMENSIONS``. The value of this attribute is a list of -dimension names (strings), for example ``["time", "lon", "lat"]``. When writing -data to Zarr, Xarray sets this attribute on all variables based on the variable -dimensions. When reading a Zarr group, Xarray looks for this attribute on all -arrays, raising an error if it can't be found. The attribute is used to define -the variable dimension names and then removed from the attributes dictionary -returned to the user. - -Because of these choices, Xarray cannot read arbitrary array data, but only -Zarr data with valid ``_ARRAY_DIMENSIONS`` attributes on each array. - -After decoding the ``_ARRAY_DIMENSIONS`` attribute and assigning the variable -dimensions, Xarray proceeds to [optionally] decode each variable using its -standard CF decoding machinery used for NetCDF data (see :py:func:`decode_cf`). - -As a concrete example, here we write a tutorial dataset to Zarr and then -re-open it directly with Zarr: - -.. ipython:: python - - ds = xr.tutorial.load_dataset("rasm") - ds.to_zarr("rasm.zarr", mode="w") - import zarr - - zgroup = zarr.open("rasm.zarr") - print(zgroup.tree()) - dict(zgroup["Tair"].attrs) diff --git a/doc/internals/duck-arrays-integration.rst b/doc/internals/duck-arrays-integration.rst new file mode 100644 index 00000000000..eb5c4d8592f --- /dev/null +++ b/doc/internals/duck-arrays-integration.rst @@ -0,0 +1,51 @@ + +.. _internals.duck_arrays: + +Integrating with duck arrays +============================= + +.. warning:: + + This is an experimental feature. + +xarray can wrap custom :term:`duck array` objects as long as they define numpy's +``shape``, ``dtype`` and ``ndim`` properties and the ``__array__``, +``__array_ufunc__`` and ``__array_function__`` methods. + +In certain situations (e.g. when printing the collapsed preview of +variables of a ``Dataset``), xarray will display the repr of a :term:`duck array` +in a single line, truncating it to a certain number of characters. 
If that +would drop too much information, the :term:`duck array` may define a +``_repr_inline_`` method that takes ``max_width`` (number of characters) as an +argument: + +.. code:: python + + class MyDuckArray: + ... + + def _repr_inline_(self, max_width): + """ format to a single line with at most max_width characters """ + ... + + ... + +To avoid duplicated information, this method must omit information about the shape and +:term:`dtype`. For example, the string representation of a ``dask`` array or a +``sparse`` matrix would be: + +.. ipython:: python + + import dask.array as da + import numpy as np + import xarray as xr + import sparse + + a = da.linspace(0, 1, 20, chunks=2) + a + + b = np.eye(10) + b[[5, 7, 3, 0], [6, 8, 2, 9]] = 2 + b = sparse.COO.from_numpy(b) + b + + xr.Dataset({"a": ("x", a), "b": (("y", "z"), b)}) diff --git a/doc/internals/extending-xarray.rst b/doc/internals/extending-xarray.rst new file mode 100644 index 00000000000..ef26f30689e --- /dev/null +++ b/doc/internals/extending-xarray.rst @@ -0,0 +1,103 @@ + +Extending xarray +================ + +.. ipython:: python + :suppress: + + import xarray as xr + + +xarray is designed as a general purpose library, and hence tries to avoid +including overly domain specific functionality. But inevitably, the need for more +domain specific logic arises. + +One standard solution to this problem is to subclass Dataset and/or DataArray to +add domain specific functionality. However, inheritance is not very robust. It's +easy to inadvertently use internal APIs when subclassing, which means that your +code may break when xarray upgrades. Furthermore, many builtin methods will +only return native xarray objects. + +The standard advice is to use `composition over inheritance`__, but +reimplementing an API as large as xarray's on your own objects can be an onerous +task, even if most methods are only forwarding to xarray implementations. + +__ https://github.com/pydata/xarray/issues/706 + +If you simply want the ability to call a function with the syntax of a +method call, then the builtin :py:meth:`~xarray.DataArray.pipe` method (copied +from pandas) may suffice. + +To resolve this issue for more complex cases, xarray has the +:py:func:`~xarray.register_dataset_accessor` and +:py:func:`~xarray.register_dataarray_accessor` decorators for adding custom +"accessors" on xarray objects. Here's how you might use these decorators to +write a custom "geo" accessor implementing a geography specific extension to +xarray: + +.. literalinclude:: ../examples/_code/accessor_example.py
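+
+For reference, a condensed sketch of what such an accessor file can contain
+(illustrative only, and not necessarily the exact contents of
+``accessor_example.py``):
+
+.. code-block:: python
+
+    import xarray as xr
+
+
+    @xr.register_dataset_accessor("geo")
+    class GeoAccessor:
+        def __init__(self, xarray_obj):
+            self._obj = xarray_obj
+            self._center = None
+
+        @property
+        def center(self):
+            # compute and cache the geographic center of the dataset
+            if self._center is None:
+                lon = self._obj.longitude
+                lat = self._obj.latitude
+                self._center = (float(lon.mean()), float(lat.mean()))
+            return self._center
+
+        def plot(self):
+            # placeholder for a domain specific plotting method
+            return "plotting!"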
+In general, the only restriction on the accessor class is that the ``__init__`` method +must have a single parameter: the ``Dataset`` or ``DataArray`` object it is supposed +to work on. + +This achieves the same result as if the ``Dataset`` class had a cached property +defined that returns an instance of your class: + +.. code-block:: python + + class Dataset: + ... + + @property + def geo(self): + return GeoAccessor(self) + +However, using the register accessor decorators is preferable to simply adding +your own ad-hoc property (i.e., ``Dataset.geo = property(...)``), for several +reasons: + +1. It ensures that the name of your property does not accidentally conflict with + any other attributes or methods (including other accessors). +2. Instances of the accessor object will be cached on the xarray object that creates + them. This means you can save state on them (e.g., to cache computed + properties). +3. Using an accessor provides an implicit namespace for your custom + functionality that clearly identifies it as separate from built-in xarray + methods. + +.. note:: + + Accessors are created once per DataArray and Dataset instance. New + instances, like those created from arithmetic operations or when accessing + a DataArray from a Dataset (ex. ``ds[var_name]``), will have new + accessors created. + +Back in an interactive IPython session, we can use these properties: + +.. ipython:: python + :suppress: + + exec(open("examples/_code/accessor_example.py").read()) + +.. ipython:: python + + ds = xr.Dataset({"longitude": np.linspace(0, 10), "latitude": np.linspace(0, 20)}) + ds.geo.center + ds.geo.plot() + +The intent here is that libraries that extend xarray could add such an accessor +to implement subclass specific functionality rather than using actual subclasses +or patching in a large number of domain specific methods. For further reading +on ways to write new accessors and the philosophy behind the approach, see +:issue:`1080`. + +To help users keep things straight, please `let us know +`_ if you plan to write a new accessor +for an open source library. In the future, we will maintain a list of accessors +and the libraries that implement them on this page. + +To make documenting accessors with ``sphinx`` and ``sphinx.ext.autosummary`` +easier, you can use `sphinx-autosummary-accessors`_. + +.. _sphinx-autosummary-accessors: https://sphinx-autosummary-accessors.readthedocs.io/ diff --git a/doc/internals/how-to-add-new-backend.rst b/doc/internals/how-to-add-new-backend.rst new file mode 100644 index 00000000000..a1a838ef5de --- /dev/null +++ b/doc/internals/how-to-add-new-backend.rst @@ -0,0 +1,456 @@ +.. _add_a_backend: + +How to add a new backend +------------------------ + +Adding a new backend for read support to Xarray does not require you +to integrate any code into Xarray; all you need to do is: + +- Create a class that inherits from Xarray :py:class:`~xarray.backends.common.BackendEntrypoint` + and implements the method ``open_dataset``; see :ref:`RST backend_entrypoint` + +- Declare this class as an external plugin in your ``setup.py``, see :ref:`RST backend_registration` + +If you also want to support lazy loading and dask, see :ref:`RST lazy_loading`. + +Note that the new interface for backends is available from Xarray +version >= 0.18 onwards. + +.. _RST backend_entrypoint: + +BackendEntrypoint subclassing ++++++++++++++++++++++++++++++ + +Your ``BackendEntrypoint`` subclass is the primary interface with Xarray, and +it should implement the following attributes and methods: + +- the ``open_dataset`` method (mandatory) +- the ``open_dataset_parameters`` attribute (optional) +- the ``guess_can_open`` method (optional). + +This is what a ``BackendEntrypoint`` subclass should look like: + +.. code-block:: python + + class MyBackendEntrypoint(BackendEntrypoint): + def open_dataset( + self, + filename_or_obj, + *, + drop_variables=None, + # other backend specific keyword arguments + ): + ... + return ds + + open_dataset_parameters = ["filename_or_obj", "drop_variables"] + + def guess_can_open(self, filename_or_obj): + try: + _, ext = os.path.splitext(filename_or_obj) + except TypeError: + return False + return ext in {...} + +``BackendEntrypoint`` subclass methods and attributes are detailed in the following. +.. 
_RST open_dataset: + +open_dataset +^^^^^^^^^^^^ + +The backend ``open_dataset`` shall implement reading from file and decoding of the +variables, and it shall instantiate the output Xarray class :py:class:`~xarray.Dataset`. + +The following is an example of the high level processing steps: + +.. code-block:: python + + def open_dataset( + self, + filename_or_obj, + *, + drop_variables=None, + decode_times=True, + decode_timedelta=True, + decode_coords=True, + my_backend_param=None, + ): + vars, attrs, coords = my_reader( + filename_or_obj, + drop_variables=drop_variables, + my_backend_param=my_backend_param, + ) + vars, attrs, coords = my_decode_variables( + vars, attrs, decode_times, decode_timedelta, decode_coords + ) # see also conventions.decode_cf_variables + + ds = xr.Dataset(vars, attrs=attrs) + ds = ds.set_coords(coords) + ds.set_close(my_close_method) + + return ds + + +The output :py:class:`~xarray.Dataset` shall implement the additional custom method +``close``, used by Xarray to ensure the related files are eventually closed. This +method shall be set by using :py:meth:`~xarray.Dataset.set_close`. + + +The inputs of the ``open_dataset`` method are one argument +(``filename_or_obj``) and one keyword argument (``drop_variables``): + +- ``filename_or_obj``: can be a string containing a path or an instance of + :py:class:`pathlib.Path`. +- ``drop_variables``: can be `None` or an iterable containing the variable + names to be dropped when reading the data. + +If it makes sense for your backend, your ``open_dataset`` method +should implement in its interface the following boolean keyword arguments, called +**decoders**, which default to ``None``: + +- ``mask_and_scale`` +- ``decode_times`` +- ``decode_timedelta`` +- ``use_cftime`` +- ``concat_characters`` +- ``decode_coords`` + +Note: all the supported decoders shall be declared explicitly +in the backend ``open_dataset`` signature. + +These keyword arguments are explicitly defined in the Xarray +:py:func:`~xarray.open_dataset` signature. Xarray will pass them to the +backend only if the User explicitly sets a value different from ``None``. +For more details on decoders see :ref:`RST decoders`. + +Your backend can also take as input a set of backend-specific keyword +arguments. All these keyword arguments can be passed to +:py:func:`~xarray.open_dataset` grouped either via the ``backend_kwargs`` +parameter or explicitly using the syntax ``**kwargs``. + + +If you don't want to support lazy loading, then the +:py:class:`~xarray.Dataset` shall contain values as a :py:class:`numpy.ndarray` +and your work is almost done. + +.. _RST open_dataset_parameters: + +open_dataset_parameters +^^^^^^^^^^^^^^^^^^^^^^^ + +``open_dataset_parameters`` is the list of backend ``open_dataset`` parameters. +It is not a mandatory parameter, and if the backend does not provide it +explicitly, Xarray creates a list of them automatically by inspecting the +backend signature. + +If ``open_dataset_parameters`` is not defined, but ``**kwargs`` and ``*args`` +are in the backend ``open_dataset`` signature, Xarray raises an error. +On the other hand, if the backend provides the ``open_dataset_parameters``, +then ``**kwargs`` and ``*args`` can be used in the signature. +However, this practice is discouraged unless there is a good reason for using +``**kwargs`` or ``*args``. + +.. _RST guess_can_open: + +guess_can_open +^^^^^^^^^^^^^^ + +``guess_can_open`` is used to identify the proper engine to open your data +file automatically in case the engine is not specified explicitly. If you are +not interested in supporting this feature, you can skip this step since +:py:class:`~xarray.backends.common.BackendEntrypoint` already provides a +default :py:meth:`~xarray.backends.common.BackendEntrypoint.guess_can_open` +that always returns ``False``. + +Backend ``guess_can_open`` takes as input the ``filename_or_obj`` parameter of +Xarray :py:meth:`~xarray.open_dataset`, and returns a boolean.
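+
+With such a backend installed, both calls below would open the file (the
+engine and file names here are hypothetical); in the first one Xarray picks
+the engine by asking each installed backend's ``guess_can_open``:
+
+.. code-block:: python
+
+    import xarray as xr
+
+    # engine guessed automatically via guess_can_open
+    ds = xr.open_dataset("observations.my_format")
+
+    # engine forced explicitly, bypassing the guessing step
+    ds = xr.open_dataset("observations.my_format", engine="my_engine")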
+
+.. _RST decoders: + +Decoders +^^^^^^^^ +The decoders implement specific operations to transform data from on-disk +representation to Xarray representation. + +A classic example is the “time” variable decoding operation. In NetCDF, the +elements of the “time” variable are stored as integers, and the unit contains +an origin (for example: "seconds since 1970-1-1"). In this case, Xarray +transforms the integer-unit pair into a :py:class:`numpy.datetime64`. + +The standard coders implemented in Xarray are: + +- :py:class:`xarray.coding.strings.CharacterArrayCoder()` +- :py:class:`xarray.coding.strings.EncodedStringCoder()` +- :py:class:`xarray.coding.variables.UnsignedIntegerCoder()` +- :py:class:`xarray.coding.variables.CFMaskCoder()` +- :py:class:`xarray.coding.variables.CFScaleOffsetCoder()` +- :py:class:`xarray.coding.times.CFTimedeltaCoder()` +- :py:class:`xarray.coding.times.CFDatetimeCoder()` + +Xarray coders all have the same interface. They have two methods: ``decode`` +and ``encode``. The method ``decode`` takes a ``Variable`` in on-disk +format and returns a ``Variable`` in Xarray format. Variable +attributes that are no longer applicable after decoding are dropped and stored in +``Variable.encoding`` to make them available to the ``encode`` method, which +performs the inverse transformation. + +The following is an example of how to use a coder's ``decode`` method: + +.. ipython:: python + + var = xr.Variable( + dims=("x",), data=np.arange(10.0), attrs={"scale_factor": 10, "add_offset": 2} + ) + var + + coder = xr.coding.variables.CFScaleOffsetCoder() + decoded_var = coder.decode(var) + decoded_var + decoded_var.encoding + +Some of the transformations can be common to multiple backends, so before +implementing a new decoder, be sure Xarray does not already implement that one. + +The backends can reuse Xarray’s decoders, either instantiating the coders +and using the method ``decode`` directly or using the higher-level function +:py:func:`~xarray.conventions.decode_cf_variables` that groups Xarray decoders. + +In some cases, the transformation to apply strongly depends on the on-disk +data format. Therefore, you may need to implement your own decoder. + +An example of such a case is when you have to deal with the time format of a +grib file. The grib format is very different from the NetCDF one: in grib, the +time is stored as strings in two attributes, ``dataDate`` and ``dataTime``. Therefore, +it is not possible to reuse the Xarray time decoder, and implementing a new +one is mandatory. + +Decoders can be activated or deactivated using the boolean keywords of the +Xarray :py:meth:`~xarray.open_dataset` signature: ``mask_and_scale``, +``decode_times``, ``decode_timedelta``, ``use_cftime``, +``concat_characters``, ``decode_coords``. +Such keywords are passed to the backend only if the User sets a value +different from ``None``. Note that the backend does not necessarily have to +implement all the decoders, but it shall declare in its ``open_dataset`` +interface only the boolean keywords related to the supported decoders.
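+
+As an illustration only, a minimal custom decoder for this grib-style case
+might look like the sketch below; the class name, the attribute handling and
+the offsets-in-seconds assumption are hypothetical, not part of Xarray's API:
+
+.. code-block:: python
+
+    import numpy as np
+    import xarray as xr
+
+
+    class GribDateCoder:
+        def decode(self, var, name=None):
+            attrs = dict(var.attrs)
+            # e.g. "20210317"; assume the variable data are offsets in seconds
+            data_date = attrs.pop("dataDate")
+            origin = np.datetime64(
+                f"{data_date[:4]}-{data_date[4:6]}-{data_date[6:]}"
+            )
+            data = origin + var.data.astype("timedelta64[s]")
+            # keep the dropped attribute in encoding so encode() can invert this
+            encoding = dict(var.encoding, dataDate=data_date)
+            return xr.Variable(var.dims, data, attrs, encoding)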
+
+.. _RST backend_registration: + +How to register a backend ++++++++++++++++++++++++++++ + +Define a new entrypoint in your ``setup.py`` (or ``setup.cfg``) with: + +- group: ``xarray.backends`` +- name: the name to be passed to :py:meth:`~xarray.open_dataset` as ``engine`` +- object reference: the reference of the class that you have implemented. + +You can declare the entrypoint in ``setup.py`` using the following syntax: + +.. code-block:: + + setuptools.setup( + entry_points={ + "xarray.backends": [ + "engine_name=your_package.your_module:YourBackendEntryClass" + ], + }, + ) + +in ``setup.cfg``: + +.. code-block:: cfg + + [options.entry_points] + xarray.backends = + engine_name = your_package.your_module:YourBackendEntryClass + + +See https://packaging.python.org/specifications/entry-points/#data-model +for more information. + +If you are using `Poetry <https://python-poetry.org/>`__ for your build system, you can accomplish the same thing using "plugins". In this case you would need to add the following to your ``pyproject.toml`` file: + +.. code-block:: toml + + [tool.poetry.plugins."xarray.backends"] + "engine_name" = "your_package.your_module:YourBackendEntryClass" + +See https://python-poetry.org/docs/pyproject/#plugins for more information on Poetry plugins. + +.. _RST lazy_loading: + +How to support Lazy Loading ++++++++++++++++++++++++++++ +If you want to make your backend effective with big datasets, then you should +support lazy loading. +Basically, you shall replace the :py:class:`numpy.ndarray` inside the +variables with a custom class that supports lazy indexing. +See the example below: + +.. code-block:: python + + backend_array = MyBackendArray() + data = indexing.LazilyIndexedArray(backend_array) + var = xr.Variable(dims, data, attrs=attrs, encoding=encoding) + +Where: + +- :py:class:`~xarray.core.indexing.LazilyIndexedArray` is a class + provided by Xarray that manages the lazy loading. +- ``MyBackendArray`` shall be implemented by the backend and shall inherit + from :py:class:`~xarray.backends.common.BackendArray`. + +BackendArray subclassing +^^^^^^^^^^^^^^^^^^^^^^^^ + +The BackendArray subclass shall implement the following method and attributes: + +- the ``__getitem__`` method that takes an index as input and returns a + `NumPy `__ array +- the ``shape`` attribute +- the ``dtype`` attribute. + + +Xarray supports different types of +`indexing `__, which can be +grouped into three types of indexers: +:py:class:`~xarray.core.indexing.BasicIndexer`, +:py:class:`~xarray.core.indexing.OuterIndexer` and +:py:class:`~xarray.core.indexing.VectorizedIndexer`. +This implies that the implementation of the method ``__getitem__`` can be tricky. +In order to simplify this task, Xarray provides a helper function, +:py:func:`~xarray.core.indexing.explicit_indexing_adapter`, that transforms +all the input ``indexer`` types (`basic`, `outer`, `vectorized`) into a tuple +which is interpreted correctly by your backend. + +This is an example ``BackendArray`` subclass implementation: +.. 
code-block:: python + + class MyBackendArray(BackendArray): + def __init__( + self, + shape, + dtype, + lock, + # other backend specific keyword arguments + ): + self.shape = shape + self.dtype = dtype + self.lock = lock + + def __getitem__( + self, key: xarray.core.indexing.ExplicitIndexer + ) -> np.typing.ArrayLike: + return indexing.explicit_indexing_adapter( + key, + self.shape, + indexing.IndexingSupport.BASIC, + self._raw_indexing_method, + ) + + def _raw_indexing_method(self, key: tuple) -> np.typing.ArrayLike: + # thread safe method that accesses the data on disk + with self.lock: + ... + return item + +Note that ``BackendArray.__getitem__`` must be thread safe to support +multi-thread processing. + +The :py:func:`~xarray.core.indexing.explicit_indexing_adapter` method takes as +input the ``key``, the array ``shape``, and the following parameters: + +- ``indexing_support``: the type of index supported by ``raw_indexing_method`` +- ``raw_indexing_method``: a method that shall take as input a key in the form + of a tuple and return an indexed :py:class:`numpy.ndarray`. + +For more details see +:py:class:`~xarray.core.indexing.IndexingSupport` and :ref:`RST indexing`. + +In order to support `Dask `__ distributed and +:py:mod:`multiprocessing`, the ``BackendArray`` subclass should be serializable +either with :ref:`io.pickle` or +`cloudpickle `__. +That implies that all references to open files should be dropped. For +opening files, we therefore suggest using the helper class provided by Xarray, +:py:class:`~xarray.backends.CachingFileManager`. + +.. _RST indexing: + +Indexing Examples +^^^^^^^^^^^^^^^^^ +**BASIC** + +In the ``BASIC`` indexing support, numbers and slices are supported. + +Example: + +.. ipython:: + :verbatim: + + In [1]: # () shall return the full array + ...: backend_array._raw_indexing_method(()) + Out[1]: array([[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]) + + In [2]: # shall support integers, passed as a tuple key + ...: backend_array._raw_indexing_method((1, 1)) + Out[2]: 5 + + In [3]: # shall support slices + ...: backend_array._raw_indexing_method((slice(0, 3), slice(2, 4))) + Out[3]: array([[2, 3], [6, 7], [10, 11]]) + +**OUTER** + +The ``OUTER`` indexing shall support numbers and slices and, in addition, +lists of integers. Outer indexing is equivalent to +combining multiple input lists with ``itertools.product()``: + +.. ipython:: + :verbatim: + + In [1]: backend_array._raw_indexing_method(([0, 1], [0, 1, 2])) + Out[1]: array([[0, 1, 2], [4, 5, 6]]) + + # shall support integers, passed as a tuple key + In [2]: backend_array._raw_indexing_method((1, 1)) + Out[2]: 5 + + +**OUTER_1VECTOR** + +The ``OUTER_1VECTOR`` indexing shall support numbers, slices and at most one +list. The behaviour with the list shall be the same as in ``OUTER`` indexing. + +If you support more complex indexing, such as `explicit indexing` or +`numpy indexing`, you can have a look at the implementation of the Zarr backend and Scipy backend, +currently available in the :py:mod:`~xarray.backends` module. + +.. _RST preferred_chunks: + +Backend preferred chunks +^^^^^^^^^^^^^^^^^^^^^^^^ + +The backend is not directly involved in `Dask `__ +chunking, since it is internally managed by Xarray. However, the backend can +define the preferred chunk size inside the variable’s encoding +``var.encoding["preferred_chunks"]``. The ``preferred_chunks`` may be useful +to improve performance with lazy loading. ``preferred_chunks`` shall be a +dictionary specifying a chunk size per dimension, like +``{"dim1": 1000, "dim2": 2000}`` or +``{"dim1": [1000, 100], "dim2": [2000, 2000, 2000]}``.
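+
+For illustration (the dimension names and sizes are hypothetical), a backend
+can advertise the on-disk chunking when it builds its variables:
+
+.. code-block:: python
+
+    var = xr.Variable(dims, data, attrs=attrs, encoding=encoding)
+    # hint that this variable is stored on disk in blocks of
+    # 1000 time steps by 2000 spatial points
+    var.encoding["preferred_chunks"] = {"time": 1000, "space": 2000}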
+ +The ``preferred_chunks`` is used by Xarray to define the chunk size in some +special cases: + +- if ``chunks`` along a dimension is ``None`` or not defined +- if ``chunks`` is ``"auto"``. + +In the first case Xarray uses the chunk size specified in +``preferred_chunks``. +In the second case Xarray accommodates ideal chunk sizes, preserving +the "preferred_chunks" if possible. The ideal chunk size is computed using +:py:func:`dask.array.core.normalize_chunks`, setting +``previous_chunks = preferred_chunks``. diff --git a/doc/internals/index.rst b/doc/internals/index.rst new file mode 100644 index 00000000000..8cedc67327e --- /dev/null +++ b/doc/internals/index.rst @@ -0,0 +1,20 @@ +.. _internals: + +xarray Internals +================ + +xarray builds upon two of the foundational libraries of the scientific Python +stack, NumPy and pandas. It is written in pure Python (no C or Cython +extensions), which makes it easy to develop and extend. Instead, we push +compiled code to :ref:`optional dependencies`. + + +.. toctree:: + :maxdepth: 2 + :hidden: + + variable-objects + duck-arrays-integration + extending-xarray + zarr-encoding-spec + how-to-add-new-backend diff --git a/doc/internals/variable-objects.rst b/doc/internals/variable-objects.rst new file mode 100644 index 00000000000..6ae3c2f7e6d --- /dev/null +++ b/doc/internals/variable-objects.rst @@ -0,0 +1,31 @@ +Variable objects +================ + +The core internal data structure in xarray is the :py:class:`~xarray.Variable`, +which is used as the basic building block behind xarray's +:py:class:`~xarray.Dataset` and :py:class:`~xarray.DataArray` types. A +``Variable`` consists of: + +- ``dims``: A tuple of dimension names. +- ``data``: The N-dimensional array (typically, a NumPy or Dask array) storing + the Variable's data. It must have the same number of dimensions as the length + of ``dims``. +- ``attrs``: An ordered dictionary of metadata associated with this array. By + convention, xarray's built-in operations never use this metadata. +- ``encoding``: Another ordered dictionary used to store information about how + this variable's data is represented on disk. See :ref:`io.encoding` for more + details. + +``Variable`` has an interface similar to NumPy arrays, but extended to make use +of named dimensions. For example, it uses ``dim`` in preference to an ``axis`` +argument for methods like ``mean``, and supports :ref:`compute.broadcasting`. + +However, unlike ``Dataset`` and ``DataArray``, the basic ``Variable`` does not +include coordinate labels along each axis. + +``Variable`` is public API, but because of its incomplete support for labeled +data, it is mostly intended for advanced uses, such as in xarray itself or for +writing new backends. You can access the variable objects that correspond to +xarray objects via the (readonly) :py:attr:`Dataset.variables +` and +:py:attr:`DataArray.variable ` attributes. diff --git a/doc/internals/zarr-encoding-spec.rst b/doc/internals/zarr-encoding-spec.rst new file mode 100644 index 00000000000..aea75155e80 --- /dev/null +++ b/doc/internals/zarr-encoding-spec.rst @@ -0,0 +1,53 @@ + +.. _zarr_encoding: + +Zarr Encoding Specification +============================ + +In implementing support for the `Zarr `_ storage +format, Xarray developers made some *ad hoc* choices about how to store +NetCDF data in Zarr. 
+Future versions of the Zarr spec will likely include a more formal convention +for the storage of the NetCDF data model in Zarr; see +`Zarr spec repo `_ for ongoing +discussion. + +First, Xarray can only read and write Zarr groups. There is currently no support +for reading / writing individual Zarr arrays. Zarr groups are mapped to +Xarray ``Dataset`` objects. + +Second, from Xarray's point of view, the key difference between +NetCDF and Zarr is that all NetCDF arrays have *dimension names* while Zarr +arrays do not. Therefore, in order to store NetCDF data in Zarr, Xarray must +somehow encode and decode the name of each array's dimensions. + +To accomplish this, Xarray developers decided to define a special Zarr array +attribute: ``_ARRAY_DIMENSIONS``. The value of this attribute is a list of +dimension names (strings), for example ``["time", "lon", "lat"]``. When writing +data to Zarr, Xarray sets this attribute on all variables based on the variable +dimensions. When reading a Zarr group, Xarray looks for this attribute on all +arrays, raising an error if it can't be found. The attribute is used to define +the variable dimension names and then removed from the attributes dictionary +returned to the user. + +Because of these choices, Xarray cannot read arbitrary array data, but only +Zarr data with valid ``_ARRAY_DIMENSIONS`` attributes on each array. + +After decoding the ``_ARRAY_DIMENSIONS`` attribute and assigning the variable +dimensions, Xarray proceeds to [optionally] decode each variable using its +standard CF decoding machinery used for NetCDF data (see :py:func:`decode_cf`). + +As a concrete example, here we write a tutorial dataset to Zarr and then +re-open it directly with Zarr: + +.. ipython:: python + + import xarray as xr + import zarr + + ds = xr.tutorial.load_dataset("rasm") + ds.to_zarr("rasm.zarr", mode="w") + + zgroup = zarr.open("rasm.zarr") + print(zgroup.tree()) + dict(zgroup["Tair"].attrs) diff --git a/doc/roadmap.rst b/doc/roadmap.rst index 1cbbaf8ef42..dd5235bfb16 100644 --- a/doc/roadmap.rst +++ b/doc/roadmap.rst @@ -206,27 +206,10 @@ In order to lower this adoption barrier, we propose to: - Write a basic glossary that defines terms that might not be familiar to all (e.g. "lazy", "labeled", "serialization", "indexing", "backend"). + Administrative -------------- -Current core developers -~~~~~~~~~~~~~~~~~~~~~~~ - -- Stephan Hoyer -- Ryan Abernathey -- Joe Hamman -- Benoit Bovy -- Fabien Maussion -- Keisuke Fujii -- Maximilian Roos -- Deepak Cherian -- Spencer Clark -- Tom Nicholas -- Guido Imperiale -- Justus Magin -- Mathias Hauser -- Anderson Banihirwe - NumFOCUS ~~~~~~~~ diff --git a/doc/team.rst b/doc/team.rst new file mode 100644 index 00000000000..7b185dc3a52 --- /dev/null +++ b/doc/team.rst @@ -0,0 +1,87 @@ +Team +----- + +Current core developers +~~~~~~~~~~~~~~~~~~~~~~~ + +Xarray core developers are responsible for the ongoing organizational maintenance and technical direction of the xarray project. + +The current team of core developers comprises: + +.. panels:: + :column: col-lg-4 col-md-4 col-sm-6 col-xs-12 p-2 + :card: text-center + + --- + .. image:: https://avatars.githubusercontent.com/u/1217238?v=4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/shoyer,"Stephan Hoyer",cls=btn badge-light` + + --- + .. 
image:: https://avatars.githubusercontent.com/u/1197350?v=4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/rabernat,"Ryan Abernathey",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/2443309?v=4 + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/jhamman,"Joe Hamman",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/4160723?v=4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/benbovy,"Benoit Bovy",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/10050469?v=4 + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/fmaussion,"Fabien Maussion",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/6815844?v=4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/fujiisoup,"Keisuke Fujii",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/5635139?v=4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/max-sixty,"Maximilian Roos",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/2448579?v=4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/dcherian,"Deepak Cherian",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/6628425?v=4 + +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/spencerkclark,"Spencer Clark",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/35968931?v=4 + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/TomNicholas,"Tom Nicholas",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/6213168?v=4 + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/crusaderky,"Guido Imperiale",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/14808389?v=4 + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/keewis,"Justus Magin",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/10194086?v=4 + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/mathause,"Mathias Hauser",cls=btn badge-light` + + --- + .. image:: https://avatars.githubusercontent.com/u/13301940?v=4 + ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + :link-badge:`https://github.com/andersy005,"Anderson Banihirwe",cls=btn badge-light` + + + +The full list of contributors is on our `GitHub Contributors Page `__. diff --git a/doc/tutorials-and-videos.rst b/doc/tutorials-and-videos.rst new file mode 100644 index 00000000000..6189fb745db --- /dev/null +++ b/doc/tutorials-and-videos.rst @@ -0,0 +1,64 @@ + +Tutorials and Videos +==================== + + +Tutorials +---------- + +- `Xarray's Tutorials`_ repository +- The `UW eScience Institute's Geohackweek`_ tutorial on xarray for geospatial data scientists. +- `Nicolas Fauchereau's 2015 tutorial`_ on xarray for netCDF users. + + + +Videos +------- + +.. 
panels:: + :card: text-center + + --- + + Xarray's virtual tutorial | October 2020 | Anderson Banihirwe, Deepak Cherian, and Martin Durant + ^^^ + .. raw:: html + + + + --- + Xarray's Tutorial presented at the 2020 SciPy Conference | Joe Hamman, Ryan Abernathey, + Deepak Cherian, and Stephan Hoyer + ^^^ + .. raw:: html + + + + --- + Scipy 2015 talk introducing xarray to a general audience | Stephan Hoyer + ^^^ + .. raw:: html + + + + --- + 2015 Unidata Users Workshop talk and tutorial (`with answers`_) introducing + xarray to users familiar with netCDF | Stephan Hoyer + ^^^ + .. raw:: html + + + + +Books, Chapters and Articles +----------------------------- + +- Stephan Hoyer and Joe Hamman's `Journal of Open Research Software paper`_ describing the xarray project. + + +.. _Xarray's Tutorials: https://xarray-contrib.github.io/xarray-tutorial/ .. _Journal of Open Research Software paper: http://doi.org/10.5334/jors.148 .. _UW eScience Institute's Geohackweek : https://geohackweek.github.io/nDarrays/ .. _tutorial: https://github.com/Unidata/unidata-users-workshop/blob/master/notebooks/xray-tutorial.ipynb .. _with answers: https://github.com/Unidata/unidata-users-workshop/blob/master/notebooks/xray-tutorial-with-answers.ipynb .. _Nicolas Fauchereau's 2015 tutorial: http://nbviewer.iPython.org/github/nicolasfauchereau/metocean/blob/master/notebooks/xray.ipynb diff --git a/doc/combining.rst b/doc/user-guide/combining.rst similarity index 100% rename from doc/combining.rst rename to doc/user-guide/combining.rst diff --git a/doc/computation.rst b/doc/user-guide/computation.rst similarity index 99% rename from doc/computation.rst rename to doc/user-guide/computation.rst index dcfe270a942..28804bcba6f 100644 --- a/doc/computation.rst +++ b/doc/user-guide/computation.rst @@ -6,6 +6,7 @@ Computation ########### + The labels associated with :py:class:`~xarray.DataArray` and :py:class:`~xarray.Dataset` objects enables some powerful shortcuts for computation, notably including aggregation and broadcasting by dimension diff --git a/doc/dask.rst b/doc/user-guide/dask.rst similarity index 99% rename from doc/dask.rst rename to doc/user-guide/dask.rst index 4844967350b..e92aacbe511 100644 --- a/doc/dask.rst +++ b/doc/user-guide/dask.rst @@ -21,7 +21,7 @@ and at the `Dask examples website `_. What is a Dask array? --------------------- -.. image:: _static/dask_array.png +.. image:: ../_static/dask_array.png :width: 40 % :align: right :alt: A Dask array diff --git a/doc/data-structures.rst b/doc/user-guide/data-structures.rst similarity index 99% rename from doc/data-structures.rst rename to doc/user-guide/data-structures.rst index ac78e1769d5..14881e0c261 100644 --- a/doc/data-structures.rst +++ b/doc/user-guide/data-structures.rst @@ -251,7 +251,7 @@ quantities that belong in data. Here is an example of how we might structure a dataset for a weather forecast: -.. image:: _static/dataset-diagram.png +..
image:: ../_static/dataset-diagram.png In this example, it would be natural to call ``temperature`` and ``precipitation`` "data variables" and all the other arrays "coordinate diff --git a/doc/duckarrays.rst b/doc/user-guide/duckarrays.rst similarity index 100% rename from doc/duckarrays.rst rename to doc/user-guide/duckarrays.rst diff --git a/doc/groupby.rst b/doc/user-guide/groupby.rst similarity index 98% rename from doc/groupby.rst rename to doc/user-guide/groupby.rst index d0c0b1849f9..804799aebd1 100644 --- a/doc/groupby.rst +++ b/doc/user-guide/groupby.rst @@ -63,6 +63,12 @@ You can also iterate over groups in ``(label, group)`` pairs: list(ds.groupby("letters")) +You can index out a particular group: + +.. ipython:: python + + ds.groupby("letters")["b"] + Just like in pandas, creating a GroupBy object is cheap: it does not actually split the data until you access particular values. diff --git a/doc/user-guide/index.rst b/doc/user-guide/index.rst new file mode 100644 index 00000000000..edeb0aac632 --- /dev/null +++ b/doc/user-guide/index.rst @@ -0,0 +1,27 @@ +########### +User Guide +########### + +In this user guide, you will find detailed descriptions and +examples that describe many common tasks that you can accomplish with xarray. + + +.. toctree:: + :maxdepth: 2 + :hidden: + + terminology + data-structures + indexing + interpolation + computation + groupby + reshaping + combining + time-series + weather-climate + pandas + io + dask + plotting + duckarrays diff --git a/doc/indexing.rst b/doc/user-guide/indexing.rst similarity index 100% rename from doc/indexing.rst rename to doc/user-guide/indexing.rst diff --git a/doc/interpolation.rst b/doc/user-guide/interpolation.rst similarity index 99% rename from doc/interpolation.rst rename to doc/user-guide/interpolation.rst index 9a3b7a7ee2d..8a7f9ebe911 100644 --- a/doc/interpolation.rst +++ b/doc/user-guide/interpolation.rst @@ -179,7 +179,7 @@ For example, if you want to interpolate a two dimensional array along a particul you can pass two 1-dimensional :py:class:`~xarray.DataArray` s with a common dimension as new coordinate. -.. image:: _static/advanced_selection_interpolation.svg +.. image:: ../_static/advanced_selection_interpolation.svg :height: 200px :width: 400 px :alt: advanced indexing and interpolation diff --git a/doc/io.rst b/doc/user-guide/io.rst similarity index 99% rename from doc/io.rst rename to doc/user-guide/io.rst index c2022cc9325..41cb53e4d6b 100644 --- a/doc/io.rst +++ b/doc/user-guide/io.rst @@ -621,7 +621,7 @@ over the network until we look at particular values: # the data is downloaded automatically when we make the plot In [6]: tmax[0].plot() -.. image:: _static/opendap-prism-tmax.png +.. image:: ../_static/opendap-prism-tmax.png Some servers require authentication before we can access the data. 
For this purpose we can explicitly create a :py:class:`backends.PydapDataStore` diff --git a/doc/pandas.rst b/doc/user-guide/pandas.rst similarity index 100% rename from doc/pandas.rst rename to doc/user-guide/pandas.rst diff --git a/doc/plotting.rst b/doc/user-guide/plotting.rst similarity index 100% rename from doc/plotting.rst rename to doc/user-guide/plotting.rst diff --git a/doc/reshaping.rst b/doc/user-guide/reshaping.rst similarity index 100% rename from doc/reshaping.rst rename to doc/user-guide/reshaping.rst diff --git a/doc/terminology.rst b/doc/user-guide/terminology.rst similarity index 98% rename from doc/terminology.rst rename to doc/user-guide/terminology.rst index 3cfc211593f..1876058323e 100644 --- a/doc/terminology.rst +++ b/doc/user-guide/terminology.rst @@ -79,7 +79,7 @@ complete examples, please consult the relevant documentation.* example, multidimensional coordinates are often used in geoscience datasets when :doc:`the data's physical coordinates (such as latitude and longitude) differ from their logical coordinates - `. However, non-dimension coordinates + <../examples/multidimensional-coords>`. However, non-dimension coordinates are not indexed, and any operation on non-dimension coordinates that leverages indexing will fail. Printing ``arr.coords`` will print all of ``arr``'s coordinate names, with the corresponding dimension(s) in diff --git a/doc/time-series.rst b/doc/user-guide/time-series.rst similarity index 99% rename from doc/time-series.rst rename to doc/user-guide/time-series.rst index 96a2edc0ea5..f9d341ff25d 100644 --- a/doc/time-series.rst +++ b/doc/user-guide/time-series.rst @@ -224,4 +224,4 @@ Data that has indices outside of the given ``tolerance`` are set to ``NaN``. For more examples of using grouped operations on a time dimension, see -:doc:`examples/weather-data`. +:doc:`../examples/weather-data`. diff --git a/doc/weather-climate.rst b/doc/user-guide/weather-climate.rst similarity index 99% rename from doc/weather-climate.rst rename to doc/user-guide/weather-climate.rst index 068edba1e64..e14b77386b2 100644 --- a/doc/weather-climate.rst +++ b/doc/user-guide/weather-climate.rst @@ -8,7 +8,7 @@ Weather and climate data import xarray as xr -``xarray`` can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include automatic labelling of plots with descriptive names and units if proper metadata is present (see :ref:`plotting`) and support for non-standard calendars used in climate science through the ``cftime`` module (see :ref:`CFTimeIndex`). There are also a number of geosciences-focused projects that build on xarray (see :ref:`related-projects`). +``xarray`` can leverage metadata that follows the `Climate and Forecast (CF) conventions`_ if present. Examples include automatic labelling of plots with descriptive names and units if proper metadata is present (see :ref:`plotting`) and support for non-standard calendars used in climate science through the ``cftime`` module (see :ref:`CFTimeIndex`). There are also a number of geosciences-focused projects that build on xarray (see :ref:`ecosystem`). .. 
_Climate and Forecast (CF) conventions: http://cfconventions.org diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 9c16fb74a7b..77d6296acac 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -22,28 +22,76 @@ v0.17.1 (unreleased) New Features ~~~~~~~~~~~~ + +- Add :py:meth:`Dataset.query` and :py:meth:`DataArray.query` which enable indexing + of datasets and data arrays by evaluating query expressions against the values of the + data variables (:pull:`4984`). By `Alistair Miles `_. - Allow passing ``combine_attrs`` to :py:meth:`Dataset.merge` (:pull:`4895`). By `Justus Magin `_. - Support for `dask.graph_manipulation `_ (requires dask >=2021.3) By `Guido Imperiale `_ +- Many of the arguments for the :py:attr:`DataArray.str` methods now support + providing an array-like input. In this case, the array provided to the + arguments is broadcast against the original array and applied elementwise. +- :py:attr:`DataArray.str` now supports `+`, `*`, and `%` operators. These + behave the same as they do for :py:class:`str`, except that they follow + array broadcasting rules. +- A large number of new :py:attr:`DataArray.str` methods were implemented: + :py:meth:`DataArray.str.casefold`, :py:meth:`DataArray.str.cat`, + :py:meth:`DataArray.str.extract`, :py:meth:`DataArray.str.extractall`, + :py:meth:`DataArray.str.findall`, :py:meth:`DataArray.str.format`, + :py:meth:`DataArray.str.get_dummies`, :py:meth:`DataArray.str.islower`, + :py:meth:`DataArray.str.join`, :py:meth:`DataArray.str.normalize`, + :py:meth:`DataArray.str.partition`, :py:meth:`DataArray.str.rpartition`, + :py:meth:`DataArray.str.rsplit`, and :py:meth:`DataArray.str.split`. + A number of these methods allow for splitting or joining the strings in an + array. (:issue:`4622`) +- Thanks to the new pluggable backend infrastructure, external packages may now + use the ``xarray.backends`` entry point to register additional engines to be used in + :py:func:`open_dataset`, see the documentation in :ref:`add_a_backend` + (:issue:`4309`, :issue:`4803`, :pull:`4989`, :pull:`4810` and many others). + The backend refactor has been sponsored with the "Essential Open Source Software for Science" + grant from the `Chan Zuckerberg Initiative `_ and + developed by `B-Open `_. + By `Aureliana Barghini `_ and `Alessandro Amici `_. +- :py:attr:`~core.accessor_dt.DatetimeAccessor.date` added (:issue:`4983`, :pull:`4994`). + By `Hauke Schulz `_. +- Implement ``__getitem__`` for both :py:class:`~core.groupby.DatasetGroupBy` and + :py:class:`~core.groupby.DataArrayGroupBy`, inspired by pandas' + :py:meth:`~pandas.core.groupby.GroupBy.get_group`. + By `Deepak Cherian `_. Breaking changes ~~~~~~~~~~~~~~~~ - +- :py:func:`open_dataset` and :py:func:`open_dataarray` now accept only the first argument + as positional, all others need to be passed as keyword arguments. This is part of the + refactor to support external backends (:issue:`4309`, :pull:`4989`). + By `Alessandro Amici `_. Deprecations ~~~~~~~~~~~~ - Bug fixes ~~~~~~~~~ +- Ensure standard calendar times encoded with large values (i.e. greater than approximately 292 years) can be decoded correctly without silently overflowing (:pull:`5050`). This was a regression in xarray 0.17.0. By `Zeb Nicholls `_. +- Added support for `numpy.bool_` attributes in roundtrips using `h5netcdf` engine with `invalid_netcdf=True` [which casts `bool`s to `numpy.bool_`] (:issue:`4981`, :pull:`4986`). + By `Victor Negîrneac `_.
- Don't allow passing ``axis`` to :py:meth:`Dataset.reduce` methods (:issue:`3510`, :pull:`4940`). By `Justus Magin `_. +- Decode values as signed if attribute `_Unsigned = "false"` (:issue:`4954`). + By `Tobias Kölling `_. Documentation ~~~~~~~~~~~~~ +- New section on :ref:`add_a_backend` in the "Internals" chapter aimed at backend developers + (:issue:`4803`, :pull:`4810`). By `Aureliana Barghini `_. +- Add :py:meth:`Dataset.polyfit` and :py:meth:`DataArray.polyfit` under "See also" in + the docstring of :py:func:`xarray.polyval` + (:issue:`5016`, :pull:`5020`). By `Aaron Spring `_. +- New sphinx theme & rearrangement of the docs (:pull:`4835`). + By `Anderson Banihirwe `_. Internal Changes ~~~~~~~~~~~~~~~~ @@ -71,6 +119,7 @@ Breaking changes ~~~~~~~~~~~~~~~~ - xarray no longer supports python 3.6 + The minimum versions of some other dependencies were changed: The minimum version policy was changed to also apply to projects with irregular releases. As a result, the minimum versions of some dependencies have changed: @@ -473,7 +522,7 @@ New Features By `Aaron Spring `_. - Use a wrapped array's ``_repr_inline_`` method to construct the collapsed ``repr`` of :py:class:`DataArray` and :py:class:`Dataset` objects and - document the new method in :doc:`internals`. (:pull:`4248`). + document the new method in :doc:`internals/index`. (:pull:`4248`). By `Justus Magin `_. - Allow per-variable fill values in most functions. (:pull:`4237`). By `Justus Magin `_. @@ -2476,7 +2525,7 @@ non-standard calendars used in climate modeling. Documentation ~~~~~~~~~~~~~ -- New FAQ entry, :ref:`related-projects`. +- New FAQ entry, :ref:`ecosystem`. By `Deepak Cherian `_. - :ref:`assigning_values` now includes examples on how to select and assign values to a :py:class:`~xarray.DataArray` with ``.loc``. @@ -4521,11 +4570,11 @@ Highlights ~~~~~~~~~~ The headline feature in this release is experimental support for out-of-core -computing (data that doesn't fit into memory) with dask_. This includes a new +computing (data that doesn't fit into memory) with :doc:`user-guide/dask`. This includes a new top-level function ``xray.open_mfdataset`` that makes it easy to open a collection of netCDF (using dask) as a single ``xray.Dataset`` object. For more on dask, read the `blog post introducing xray + dask`_ and the new -documentation section :doc:`dask`. +documentation section :doc:`user-guide/dask`. ..
_blog post introducing xray + dask: https://www.anaconda.com/blog/developer-blog/xray-dask-out-core-labeled-arrays-python/ diff --git a/setup.cfg b/setup.cfg index 5037cb9c584..80785d6f108 100644 --- a/setup.cfg +++ b/setup.cfg @@ -66,6 +66,7 @@ classifiers = Programming Language :: Python :: 3 Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 + Programming Language :: Python :: 3.9 Topic :: Scientific/Engineering [options] @@ -74,8 +75,8 @@ zip_safe = False # https://mypy.readthedocs.io/en/latest/installed_packages.htm include_package_data = True python_requires = >=3.7 install_requires = - numpy >= 1.15 - pandas >= 0.25 + numpy >= 1.17 + pandas >= 1.0 setuptools >= 40.4 # For pkg_resources [options.extras_require] diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 4fa34b39925..b2bb928cb90 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -26,7 +26,8 @@ ) from ..core.dataarray import DataArray from ..core.dataset import Dataset, _get_chunk, _maybe_chunk -from ..core.utils import close_on_error, is_grib_path, is_remote_uri, read_magic_number +from ..core.utils import is_remote_uri +from . import plugins from .common import AbstractDataStore, ArrayWriter from .locks import _get_scheduler @@ -70,26 +71,6 @@ def _get_default_engine_remote_uri(): return engine -def _get_default_engine_grib(): - msgs = [] - try: - import Nio # noqa: F401 - - msgs += ["set engine='pynio' to access GRIB files with PyNIO"] - except ImportError: # pragma: no cover - pass - try: - import cfgrib # noqa: F401 - - msgs += ["set engine='cfgrib' to access GRIB files with cfgrib"] - except ImportError: # pragma: no cover - pass - if msgs: - raise ValueError(" or\n".join(msgs)) - else: - raise ValueError("PyNIO or cfgrib is required for accessing GRIB files") - - def _get_default_engine_gz(): try: import scipy # noqa: F401 @@ -118,27 +99,9 @@ def _get_default_engine_netcdf(): return engine -def _get_engine_from_magic_number(filename_or_obj): - magic_number = read_magic_number(filename_or_obj) - - if magic_number.startswith(b"CDF"): - engine = "scipy" - elif magic_number.startswith(b"\211HDF\r\n\032\n"): - engine = "h5netcdf" - else: - raise ValueError( - "cannot guess the engine, " - f"{magic_number} is not the signature of any supported file format " - "did you mean to pass a string for a path instead?" 
- ) - return engine - - def _get_default_engine(path: str, allow_remote: bool = False): if allow_remote and is_remote_uri(path): engine = _get_default_engine_remote_uri() - elif is_grib_path(path): - engine = _get_default_engine_grib() elif path.endswith(".gz"): engine = _get_default_engine_gz() else: @@ -146,27 +109,6 @@ def _get_default_engine(path: str, allow_remote: bool = False): return engine -def _autodetect_engine(filename_or_obj): - if isinstance(filename_or_obj, AbstractDataStore): - engine = "store" - elif isinstance(filename_or_obj, (str, Path)): - engine = _get_default_engine(str(filename_or_obj), allow_remote=True) - else: - engine = _get_engine_from_magic_number(filename_or_obj) - return engine - - -def _get_backend_cls(engine, engines=ENGINES): - """Select open_dataset method based on current engine""" - try: - return engines[engine] - except KeyError: - raise ValueError( - "unrecognized engine for open_dataset: {}\n" - "must be one of: {}".format(engine, list(ENGINES)) - ) - - def _normalize_path(path): if isinstance(path, Path): path = str(path) @@ -199,12 +141,21 @@ def check_name(name): check_name(k) -def _validate_attrs(dataset): +def _validate_attrs(dataset, invalid_netcdf=False): """`attrs` must have a string key and a value which is either: a number, - a string, an ndarray or a list/tuple of numbers/strings. + a string, an ndarray, a list/tuple of numbers/strings, or a numpy.bool_. + + Notes + ----- + A numpy.bool_ is only allowed when using the h5netcdf engine with + `invalid_netcdf=True`. """ - def check_attr(name, value): + valid_types = (str, Number, np.ndarray, np.number, list, tuple) + if invalid_netcdf: + valid_types += (np.bool_,) + + def check_attr(name, value, valid_types): if isinstance(name, str): if not name: raise ValueError( @@ -218,22 +169,46 @@ def check_attr(name, value): "serialization to netCDF files" ) - if not isinstance(value, (str, Number, np.ndarray, np.number, list, tuple)): + if not isinstance(value, valid_types): raise TypeError( - f"Invalid value for attr {name!r}: {value!r} must be a number, " - "a string, an ndarray or a list/tuple of " - "numbers/strings for serialization to netCDF " - "files" + f"Invalid value for attr {name!r}: {value!r}. 
For serialization to " + "netCDF files, its value must be of one of the following types: " + f"{', '.join([vtype.__name__ for vtype in valid_types])}" ) # Check attrs on the dataset itself for k, v in dataset.attrs.items(): - check_attr(k, v) + check_attr(k, v, valid_types) # Check attrs on each variable within the dataset for variable in dataset.variables.values(): for k, v in variable.attrs.items(): - check_attr(k, v) + check_attr(k, v, valid_types) + + +def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders): + for d in list(decoders): + if decode_cf is False and d in open_backend_dataset_parameters: + decoders[d] = False + if decoders[d] is None: + decoders.pop(d) + return decoders + + +def _get_mtime(filename_or_obj): + # if passed an actual file path, augment the token with + # the file modification time + mtime = None + + try: + path = os.fspath(filename_or_obj) + except TypeError: + path = None + + if path and not is_remote_uri(path): + mtime = os.path.getmtime(filename_or_obj) + + return mtime def _protect_dataset_variables_inplace(dataset, cache): @@ -304,22 +279,90 @@ def load_dataarray(filename_or_obj, **kwargs): return da.load() +def _chunk_ds( + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, +): + from dask.base import tokenize + + mtime = _get_mtime(filename_or_obj) + token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) + name_prefix = "open_dataset-%s" % token + + variables = {} + for name, var in backend_ds.variables.items(): + var_chunks = _get_chunk(var, chunks) + variables[name] = _maybe_chunk( + name, + var, + var_chunks, + overwrite_encoded_chunks=overwrite_encoded_chunks, + name_prefix=name_prefix, + token=token, + ) + ds = backend_ds._replace(variables) + return ds + + +def _dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + **extra_tokens, +): + if not (isinstance(chunks, (int, dict)) or chunks is None): + if chunks != "auto": + raise ValueError( + "chunks must be an int, dict, 'auto', or None. " + "Instead found %s. " % chunks + ) + + _protect_dataset_variables_inplace(backend_ds, cache) + if chunks is None: + ds = backend_ds + else: + ds = _chunk_ds( + backend_ds, + filename_or_obj, + engine, + chunks, + overwrite_encoded_chunks, + **extra_tokens, + ) + + ds.set_close(backend_ds._close) + + # Ensure source filename always stored in dataset object (GH issue #2550) + if "source" not in ds.encoding: + if isinstance(filename_or_obj, str): + ds.encoding["source"] = filename_or_obj + + return ds + + def open_dataset( filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, + *args, engine=None, chunks=None, - lock=None, cache=None, + decode_cf=None, + mask_and_scale=None, + decode_times=None, + decode_timedelta=None, + use_cftime=None, + concat_characters=None, + decode_coords=None, drop_variables=None, backend_kwargs=None, - use_cftime=None, - decode_timedelta=None, + **kwargs, ): """Open and decode a dataset from a file or file-like object. @@ -331,9 +374,26 @@ def open_dataset( ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). 
+ engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + "pseudonetcdf", "zarr"}, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". + chunks : int or dict, optional + If chunks is provided, it is used to load the new dataset into dask + arrays. ``chunks=-1`` loads the dataset with dask using a single + chunk for all arrays. `chunks={}`` loads the dataset with dask using + engine preferred chunks if exposed by the backend, otherwise with + a single chunk for all arrays. + ``chunks='auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. See dask chunking for more details. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -345,15 +405,33 @@ def open_dataset( `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. + pseudonetcdf backend. This keyword may not be supported by all the backends. decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. + This keyword may not be supported by all the backends. + decode_timedelta : bool, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + This keyword may not be supported by all the backends. + use_cftime: bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. This keyword may not be supported by all the backends. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. + This keyword may not be supported by all the backends. decode_coords : bool or {"coordinates", "all"}, optional Controls which variables are set as coordinate variables: @@ -362,54 +440,26 @@ def open_dataset( as coordinate variables. - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. 
- engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ - "pseudonetcdf", "zarr"}, optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". - chunks : int or dict, optional - If chunks is provided, it is used to load the new dataset into dask - arrays. ``chunks=-1`` loads the dataset with dask using a single - chunk for all arrays. `chunks={}`` loads the dataset with dask using - engine preferred chunks if exposed by the backend, otherwise with - a single chunk for all arrays. - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. See dask chunking for more details. - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. drop_variables: str or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - backend_kwargs: dict, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of decode_time. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the netCDF4 group in the given file to open given as + a str,supported by "netcdf4", "h5netcdf", "zarr". + - 'lock': resource lock to use when reading data from disk. Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". + + See engine open function for kwargs accepted by each specific engine. 
Returns ------- @@ -427,159 +477,72 @@ def open_dataset( -------- open_mfdataset """ - if os.environ.get("XARRAY_BACKEND_API", "v1") == "v2": - kwargs = {k: v for k, v in locals().items() if v is not None} - from . import apiv2 - - return apiv2.open_dataset(**kwargs) - - if mask_and_scale is None: - mask_and_scale = not engine == "pseudonetcdf" - - if not decode_cf: - mask_and_scale = False - decode_times = False - concat_characters = False - decode_coords = False - decode_timedelta = False + if len(args) > 0: + raise TypeError( + "open_dataset() takes only 1 positional argument starting from version 0.18.0, " + "all other options must be passed as keyword arguments" + ) if cache is None: cache = chunks is None - if backend_kwargs is None: - backend_kwargs = {} + if backend_kwargs is not None: + kwargs.update(backend_kwargs) - def maybe_decode_store(store, chunks): - ds = conventions.decode_cf( - store, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - concat_characters=concat_characters, - decode_coords=decode_coords, - drop_variables=drop_variables, - use_cftime=use_cftime, - decode_timedelta=decode_timedelta, - ) - - _protect_dataset_variables_inplace(ds, cache) - - if chunks is not None and engine != "zarr": - from dask.base import tokenize - - # if passed an actual file path, augment the token with - # the file modification time - if isinstance(filename_or_obj, str) and not is_remote_uri(filename_or_obj): - mtime = os.path.getmtime(filename_or_obj) - else: - mtime = None - token = tokenize( - filename_or_obj, - mtime, - group, - decode_cf, - mask_and_scale, - decode_times, - concat_characters, - decode_coords, - engine, - chunks, - drop_variables, - use_cftime, - decode_timedelta, - ) - name_prefix = "open_dataset-%s" % token - ds2 = ds.chunk(chunks, name_prefix=name_prefix, token=token) - - elif engine == "zarr": - # adapted from Dataset.Chunk() and taken from open_zarr - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. 
" % chunks - ) - - if chunks == "auto": - try: - import dask.array # noqa - except ImportError: - chunks = None - - # auto chunking needs to be here and not in ZarrStore because - # the variable chunks does not survive decode_cf - # return trivial case - if chunks is None: - return ds - - if isinstance(chunks, int): - chunks = dict.fromkeys(ds.dims, chunks) - - variables = { - k: _maybe_chunk( - k, - v, - _get_chunk(v, chunks), - overwrite_encoded_chunks=overwrite_encoded_chunks, - ) - for k, v in ds.variables.items() - } - ds2 = ds._replace(variables) - - else: - ds2 = ds - ds2.set_close(ds._close) - return ds2 - - filename_or_obj = _normalize_path(filename_or_obj) - - if isinstance(filename_or_obj, AbstractDataStore): - store = filename_or_obj - else: - if engine is None: - engine = _autodetect_engine(filename_or_obj) - - extra_kwargs = {} - if group is not None: - extra_kwargs["group"] = group - if lock is not None: - extra_kwargs["lock"] = lock - - if engine == "zarr": - backend_kwargs = backend_kwargs.copy() - overwrite_encoded_chunks = backend_kwargs.pop( - "overwrite_encoded_chunks", None - ) + if engine is None: + engine = plugins.guess_engine(filename_or_obj) - opener = _get_backend_cls(engine) - store = opener(filename_or_obj, **extra_kwargs, **backend_kwargs) + backend = plugins.get_backend(engine) - with close_on_error(store): - ds = maybe_decode_store(store, chunks) + decoders = _resolve_decoders_kwargs( + decode_cf, + open_backend_dataset_parameters=backend.open_dataset_parameters, + mask_and_scale=mask_and_scale, + decode_times=decode_times, + decode_timedelta=decode_timedelta, + concat_characters=concat_characters, + use_cftime=use_cftime, + decode_coords=decode_coords, + ) - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj + overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) + backend_ds = backend.open_dataset( + filename_or_obj, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) + ds = _dataset_from_backend_dataset( + backend_ds, + filename_or_obj, + engine, + chunks, + cache, + overwrite_encoded_chunks, + drop_variables=drop_variables, + **decoders, + **kwargs, + ) return ds def open_dataarray( filename_or_obj, - group=None, - decode_cf=True, - mask_and_scale=None, - decode_times=True, - concat_characters=True, - decode_coords=True, + *args, engine=None, chunks=None, - lock=None, cache=None, + decode_cf=None, + mask_and_scale=None, + decode_times=None, + decode_timedelta=None, + use_cftime=None, + concat_characters=None, + decode_coords=None, drop_variables=None, backend_kwargs=None, - use_cftime=None, - decode_timedelta=None, + **kwargs, ): """Open an DataArray from a file or file-like object containing a single data variable. @@ -590,14 +553,31 @@ def open_dataarray( Parameters ---------- filename_or_obj : str, Path, file-like or DataStore - Strings and Paths are interpreted as a path to a netCDF file or an - OpenDAP URL and opened with python-netCDF4, unless the filename ends - with .gz, in which case the file is gunzipped and opened with + Strings and Path objects are interpreted as a path to a netCDF file + or an OpenDAP URL and opened with python-netCDF4, unless the filename + ends with .gz, in which case the file is gunzipped and opened with scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). 
- group : str, optional - Path to the netCDF4 group in the given file to open (only works for - netCDF4 files). + engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib", \ + "pseudonetcdf", "zarr"}, optional + Engine to use when reading files. If not provided, the default engine + is chosen based on available dependencies, with a preference for + "netcdf4". + chunks : int or dict, optional + If chunks is provided, it is used to load the new dataset into dask + arrays. ``chunks=-1`` loads the dataset with dask using a single + chunk for all arrays. ``chunks={}`` loads the dataset with dask using + engine preferred chunks if exposed by the backend, otherwise with + a single chunk for all arrays. + ``chunks='auto'`` will use dask ``auto`` chunking taking into account the + engine preferred chunks. See dask chunking for more details. + cache : bool, optional + If True, cache data loaded from the underlying datastore in memory as + NumPy arrays when accessed to avoid reading from the underlying data- + store multiple times. Defaults to True unless you specify the `chunks` + argument to use dask, in which case it defaults to False. Does not + change the behavior of coordinates corresponding to dimensions, which + always load their data from disk into a ``pandas.Index``. decode_cf : bool, optional Whether to decode these variables, assuming they were saved according to CF conventions. @@ -609,15 +589,33 @@ `missing_value` attribute contains multiple values a warning will be issued and all array values matching one of the multiple values will be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. + pseudonetcdf backend. This keyword may not be supported by all the backends. decode_times : bool, optional If True, decode times encoded in the standard NetCDF datetime format into datetime objects. Otherwise, leave them encoded as numbers. + This keyword may not be supported by all the backends. + decode_timedelta : bool, optional + If True, decode variables and coordinates with time units in + {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} + into timedelta objects. If False, leave them encoded as numbers. + If None (default), assume the same value of decode_time. + This keyword may not be supported by all the backends. + use_cftime: bool, optional + Only relevant if encoded dates come from a standard calendar + (e.g. "gregorian", "proleptic_gregorian", "standard", or not + specified). If None (default), attempt to decode times to + ``np.datetime64[ns]`` objects; if this is not possible, decode times to + ``cftime.datetime`` objects. If True, always decode times to + ``cftime.datetime`` objects, regardless of whether or not they can be + represented using ``np.datetime64[ns]`` objects. If False, always + decode times to ``np.datetime64[ns]`` objects; if this is not possible + raise an error. This keyword may not be supported by all the backends. concat_characters : bool, optional If True, concatenate along the last dimension of character arrays to form string arrays. Dimensions will only be concatenated over (and removed) if they have no corresponding variable and if they are only used as the last dimension of character arrays. + This keyword may not be supported by all the backends. decode_coords : bool or {"coordinates", "all"}, optional Controls which variables are set as coordinate variables: @@ -626,51 +624,26 @@ as coordinate variables.
- "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and other attributes as coordinate variables. - engine : {"netcdf4", "scipy", "pydap", "h5netcdf", "pynio", "cfgrib"}, \ - optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". - chunks : int or dict, optional - If chunks is provided, it used to load the new dataset into dask - arrays. - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. - cache : bool, optional - If True, cache data loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. drop_variables: str or iterable, optional A variable or list of variables to exclude from being parsed from the dataset. This may be useful to drop variables with problems or inconsistent values. - backend_kwargs: dict, optional - A dictionary of keyword arguments to pass on to the backend. This - may be useful when backend options would improve performance or - allow user control of dataset processing. If using fsspec URLs, - include the key "storage_options" to pass arguments to the - storage layer. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, leave them encoded as numbers. - If None (default), assume the same value of decode_time. + backend_kwargs: dict + Additional keyword arguments passed on to the engine open function, + equivalent to `**kwargs`. + **kwargs: dict + Additional keyword arguments passed on to the engine open function. + For example: + + - 'group': path to the netCDF4 group in the given file to open given as + a str,supported by "netcdf4", "h5netcdf", "zarr". + - 'lock': resource lock to use when reading data from disk. Only + relevant when using dask or another form of parallelism. By default, + appropriate locks are chosen to safely read and write files with the + currently active dask scheduler. Supported by "netcdf4", "h5netcdf", + "pynio", "pseudonetcdf", "cfgrib". + + See engine open function for kwargs accepted by each specific engine. 
Notes ----- @@ -685,10 +658,14 @@ def open_dataarray( -------- open_dataset """ + if len(args) > 0: + raise TypeError( + "open_dataarray() takes only 1 positional argument starting from version 0.18.0, " + "all other options must be passed as keyword arguments" + ) dataset = open_dataset( filename_or_obj, - group=group, decode_cf=decode_cf, mask_and_scale=mask_and_scale, decode_times=decode_times, @@ -696,12 +673,12 @@ def open_dataarray( decode_coords=decode_coords, engine=engine, chunks=chunks, - lock=lock, cache=cache, drop_variables=drop_variables, backend_kwargs=backend_kwargs, use_cftime=use_cftime, decode_timedelta=decode_timedelta, + **kwargs, ) if len(dataset.data_vars) != 1: @@ -734,7 +711,6 @@ def open_mfdataset( compat="no_conflicts", preprocess=None, engine=None, - lock=None, data_vars="all", coords="different", combine="by_coords", @@ -804,11 +780,6 @@ def open_mfdataset( Engine to use when reading files. If not provided, the default engine is chosen based on available dependencies, with a preference for "netcdf4". - lock : False or lock-like, optional - Resource lock to use when reading data from disk. Only relevant when - using dask or another form of parallelism. By default, appropriate - locks are chosen to safely read and write files with the currently - active dask scheduler. data_vars : {"minimal", "different", "all"} or list of str, optional These data variables will be concatenated together: * "minimal": Only data variables in which the dimension already @@ -923,7 +894,7 @@ def open_mfdataset( combined_ids_paths = _infer_concat_order_from_positions(paths) ids, paths = (list(combined_ids_paths.keys()), list(combined_ids_paths.values())) - open_kwargs = dict(engine=engine, chunks=chunks or {}, lock=lock, **kwargs) + open_kwargs = dict(engine=engine, chunks=chunks or {}, **kwargs) if parallel: import dask @@ -1056,7 +1027,7 @@ def to_netcdf( # validate Dataset keys, DataArray names, and attr keys/values _validate_dataset_names(dataset) - _validate_attrs(dataset) + _validate_attrs(dataset, invalid_netcdf=invalid_netcdf and engine == "h5netcdf") try: store_open = WRITEABLE_STORES[engine] diff --git a/xarray/backends/apiv2.py b/xarray/backends/apiv2.py deleted file mode 100644 index de1b3e1bb29..00000000000 --- a/xarray/backends/apiv2.py +++ /dev/null @@ -1,286 +0,0 @@ -import os - -from ..core import indexing -from ..core.dataset import _get_chunk, _maybe_chunk -from ..core.utils import is_remote_uri -from . 
import plugins - - -def _protect_dataset_variables_inplace(dataset, cache): - for name, variable in dataset.variables.items(): - if name not in variable.dims: - # no need to protect IndexVariable objects - data = indexing.CopyOnWriteArray(variable._data) - if cache: - data = indexing.MemoryCachedArray(data) - variable.data = data - - -def _get_mtime(filename_or_obj): - # if passed an actual file path, augment the token with - # the file modification time - mtime = None - - try: - path = os.fspath(filename_or_obj) - except TypeError: - path = None - - if path and not is_remote_uri(path): - mtime = os.path.getmtime(filename_or_obj) - - return mtime - - -def _chunk_ds( - backend_ds, - filename_or_obj, - engine, - chunks, - overwrite_encoded_chunks, - **extra_tokens, -): - from dask.base import tokenize - - mtime = _get_mtime(filename_or_obj) - token = tokenize(filename_or_obj, mtime, engine, chunks, **extra_tokens) - name_prefix = "open_dataset-%s" % token - - variables = {} - for name, var in backend_ds.variables.items(): - var_chunks = _get_chunk(var, chunks) - variables[name] = _maybe_chunk( - name, - var, - var_chunks, - overwrite_encoded_chunks=overwrite_encoded_chunks, - name_prefix=name_prefix, - token=token, - ) - ds = backend_ds._replace(variables) - return ds - - -def _dataset_from_backend_dataset( - backend_ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - **extra_tokens, -): - if not (isinstance(chunks, (int, dict)) or chunks is None): - if chunks != "auto": - raise ValueError( - "chunks must be an int, dict, 'auto', or None. " - "Instead found %s. " % chunks - ) - - _protect_dataset_variables_inplace(backend_ds, cache) - if chunks is None: - ds = backend_ds - else: - ds = _chunk_ds( - backend_ds, - filename_or_obj, - engine, - chunks, - overwrite_encoded_chunks, - **extra_tokens, - ) - - ds.set_close(backend_ds._close) - - # Ensure source filename always stored in dataset object (GH issue #2550) - if "source" not in ds.encoding: - if isinstance(filename_or_obj, str): - ds.encoding["source"] = filename_or_obj - - return ds - - -def _resolve_decoders_kwargs(decode_cf, open_backend_dataset_parameters, **decoders): - for d in list(decoders): - if decode_cf is False and d in open_backend_dataset_parameters: - decoders[d] = False - if decoders[d] is None: - decoders.pop(d) - return decoders - - -def open_dataset( - filename_or_obj, - *, - engine=None, - chunks=None, - cache=None, - decode_cf=None, - mask_and_scale=None, - decode_times=None, - decode_timedelta=None, - use_cftime=None, - concat_characters=None, - decode_coords=None, - drop_variables=None, - backend_kwargs=None, - **kwargs, -): - """Open and decode a dataset from a file or file-like object. - - Parameters - ---------- - filename_or_obj : str, Path, file-like or DataStore - Strings and Path objects are interpreted as a path to a netCDF file - or an OpenDAP URL and opened with python-netCDF4, unless the filename - ends with .gz, in which case the file is unzipped and opened with - scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like - objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF). - engine : str, optional - Engine to use when reading files. If not provided, the default engine - is chosen based on available dependencies, with a preference for - "netcdf4". Options are: {"netcdf4", "scipy", "pydap", "h5netcdf",\ - "pynio", "cfgrib", "pseudonetcdf", "zarr"}. 
- chunks : int or dict, optional - If chunks is provided, it is used to load the new dataset into dask - arrays. ``chunks=-1`` loads the dataset with dask using a single - chunk for all arrays. `chunks={}`` loads the dataset with dask using - engine preferred chunks if exposed by the backend, otherwise with - a single chunk for all arrays. - ``chunks='auto'`` will use dask ``auto`` chunking taking into account the - engine preferred chunks. See dask chunking for more details. - cache : bool, optional - If True, cache data is loaded from the underlying datastore in memory as - NumPy arrays when accessed to avoid reading from the underlying data- - store multiple times. Defaults to True unless you specify the `chunks` - argument to use dask, in which case it defaults to False. Does not - change the behavior of coordinates corresponding to dimensions, which - always load their data from disk into a ``pandas.Index``. - decode_cf : bool, optional - Setting ``decode_cf=False`` will disable ``mask_and_scale``, - ``decode_times``, ``decode_timedelta``, ``concat_characters``, - ``decode_coords``. - mask_and_scale : bool, optional - If True, array values equal to `_FillValue` are replaced with NA and other - values are scaled according to the formula `original_values * scale_factor + - add_offset`, where `_FillValue`, `scale_factor` and `add_offset` are - taken from variable attributes (if they exist). If the `_FillValue` or - `missing_value` attribute contains multiple values, a warning will be - issued and all array values matching one of the multiple values will - be replaced by NA. mask_and_scale defaults to True except for the - pseudonetcdf backend. This keyword may not be supported by all the backends. - decode_times : bool, optional - If True, decode times encoded in the standard NetCDF datetime format - into datetime objects. Otherwise, leave them encoded as numbers. - This keyword may not be supported by all the backends. - decode_timedelta : bool, optional - If True, decode variables and coordinates with time units in - {"days", "hours", "minutes", "seconds", "milliseconds", "microseconds"} - into timedelta objects. If False, they remain encoded as numbers. - If None (default), assume the same value of decode_time. - This keyword may not be supported by all the backends. - use_cftime: bool, optional - Only relevant if encoded dates come from a standard calendar - (e.g. "gregorian", "proleptic_gregorian", "standard", or not - specified). If None (default), attempt to decode times to - ``np.datetime64[ns]`` objects; if this is not possible, decode times to - ``cftime.datetime`` objects. If True, always decode times to - ``cftime.datetime`` objects, regardless of whether or not they can be - represented using ``np.datetime64[ns]`` objects. If False, always - decode times to ``np.datetime64[ns]`` objects; if this is not possible - raise an error. This keyword may not be supported by all the backends. - concat_characters : bool, optional - If True, concatenate along the last dimension of character arrays to - form string arrays. Dimensions will only be concatenated over (and - removed) if they have no corresponding variable and if they are only - used as the last dimension of character arrays. - This keyword may not be supported by all the backends. 
- decode_coords : bool or {"coordinates", "all"}, optional - Controls which variables are set as coordinate variables: - - - "coordinates" or True: Set variables referred to in the - ``'coordinates'`` attribute of the datasets or individual variables - as coordinate variables. - - "all": Set variables referred to in ``'grid_mapping'``, ``'bounds'`` and - other attributes as coordinate variables. - drop_variables: str or iterable, optional - A variable or list of variables to exclude from the dataset parsing. - This may be useful to drop variables with problems or - inconsistent values. - backend_kwargs: - Additional keyword arguments passed on to the engine open function. - **kwargs: dict - Additional keyword arguments passed on to the engine open function. - For example: - - - 'group': path to the netCDF4 group in the given file to open given as - a str,supported by "netcdf4", "h5netcdf", "zarr". - - - 'lock': resource lock to use when reading data from disk. Only - relevant when using dask or another form of parallelism. By default, - appropriate locks are chosen to safely read and write files with the - currently active dask scheduler. Supported by "netcdf4", "h5netcdf", - "pynio", "pseudonetcdf", "cfgrib". - - See engine open function for kwargs accepted by each specific engine. - - - Returns - ------- - dataset : Dataset - The newly created dataset. - - Notes - ----- - ``open_dataset`` opens the file with read-only access. When you modify - values of a Dataset, even one linked to files on disk, only the in-memory - copy you are manipulating in xarray is modified: the original file on disk - is never touched. - - See Also - -------- - open_mfdataset - """ - - if cache is None: - cache = chunks is None - - if backend_kwargs is not None: - kwargs.update(backend_kwargs) - - if engine is None: - engine = plugins.guess_engine(filename_or_obj) - - backend = plugins.get_backend(engine) - - decoders = _resolve_decoders_kwargs( - decode_cf, - open_backend_dataset_parameters=backend.open_dataset_parameters, - mask_and_scale=mask_and_scale, - decode_times=decode_times, - decode_timedelta=decode_timedelta, - concat_characters=concat_characters, - use_cftime=use_cftime, - decode_coords=decode_coords, - ) - - overwrite_encoded_chunks = kwargs.pop("overwrite_encoded_chunks", None) - backend_ds = backend.open_dataset( - filename_or_obj, - drop_variables=drop_variables, - **decoders, - **kwargs, - ) - ds = _dataset_from_backend_dataset( - backend_ds, - filename_or_obj, - engine, - chunks, - cache, - overwrite_encoded_chunks, - drop_variables=drop_variables, - **decoders, - **kwargs, - ) - - return ds diff --git a/xarray/backends/cfgrib_.py b/xarray/backends/cfgrib_.py index d582af82c6e..c63e1543746 100644 --- a/xarray/backends/cfgrib_.py +++ b/xarray/backends/cfgrib_.py @@ -62,7 +62,7 @@ def open_store_variable(self, name, var): data = var.data else: wrapped_array = CfGribArrayWrapper(self, var.data) - data = indexing.LazilyOuterIndexedArray(wrapped_array) + data = indexing.LazilyIndexedArray(wrapped_array) encoding = self.ds.encoding.copy() encoding["original_shape"] = var.data.shape @@ -87,9 +87,9 @@ def get_encoding(self): class CfgribfBackendEntrypoint(BackendEntrypoint): - def guess_can_open(self, store_spec): + def guess_can_open(self, filename_or_obj): try: - _, ext = os.path.splitext(store_spec) + _, ext = os.path.splitext(filename_or_obj) except TypeError: return False return ext in {".grib", ".grib2", ".grb", ".grb2"} diff --git a/xarray/backends/common.py b/xarray/backends/common.py index 
e2905d0866b..aa902602278 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -1,7 +1,7 @@ import logging import time import traceback -from typing import Dict, Tuple, Type, Union +from typing import Any, Dict, Tuple, Type, Union import numpy as np @@ -344,12 +344,41 @@ def encode(self, variables, attributes): class BackendEntrypoint: + """ + ``BackendEntrypoint`` is a container class and the main interface + for backend plugins; see :ref:`RST backend_entrypoint`. + It shall implement: + + - ``open_dataset`` method: it shall implement reading from file and decoding + of variables, and it shall return an instance of :py:class:`~xarray.Dataset`. + It shall accept at least the ``filename_or_obj`` argument and the + ``drop_variables`` keyword argument. + For more details see :ref:`RST open_dataset`. + - ``guess_can_open`` method: it shall return ``True`` if the backend is able to open + ``filename_or_obj``, ``False`` otherwise. The implementation of this + method is not mandatory. + """ + open_dataset_parameters: Union[Tuple, None] = None + """list of ``open_dataset`` method parameters""" + + def open_dataset( + self, + filename_or_obj: str, + drop_variables: Tuple[str] = None, + **kwargs: Any, + ): + """ + Backend open_dataset method used by Xarray in :py:func:`~xarray.open_dataset`. + """ - def open_dataset(self): raise NotImplementedError - def guess_can_open(self, store_spec): + def guess_can_open(self, filename_or_obj): + """ + Backend guess_can_open method used by Xarray in :py:func:`~xarray.open_dataset` + to check whether this engine is able to open ``filename_or_obj``. + """ + return False diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index ca531af81f6..0909418dd8d 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -187,7 +187,7 @@ def open_store_variable(self, name, var): import h5py dimensions = var.dimensions - data = indexing.LazilyOuterIndexedArray(H5NetCDFArrayWrapper(name, self)) + data = indexing.LazilyIndexedArray(H5NetCDFArrayWrapper(name, self)) attrs = _read_attributes(var) # netCDF4 specific encoding @@ -334,14 +334,14 @@ def close(self, **kwargs): class H5netcdfBackendEntrypoint(BackendEntrypoint): - def guess_can_open(self, store_spec): + def guess_can_open(self, filename_or_obj): try: - return read_magic_number(store_spec).startswith(b"\211HDF\r\n\032\n") + return read_magic_number(filename_or_obj).startswith(b"\211HDF\r\n\032\n") except TypeError: pass try: - _, ext = os.path.splitext(store_spec) + _, ext = os.path.splitext(filename_or_obj)
diff --git a/xarray/backends/pseudonetcdf_.py b/xarray/backends/pseudonetcdf_.py
index a9d7f0bbed4..62966711b11 100644
--- a/xarray/backends/pseudonetcdf_.py
+++ b/xarray/backends/pseudonetcdf_.py
@@ -74,7 +74,7 @@ def ds(self):
return self._manager.acquire()
def open_store_variable(self, name, var):
- data = indexing.LazilyOuterIndexedArray(PncArrayWrapper(name, self))
+ data = indexing.LazilyIndexedArray(PncArrayWrapper(name, self))
attrs = {k: getattr(var, k) for k in var.ncattrs()}
return Variable(var.dimensions, data, attrs)
diff --git a/xarray/backends/pydap_.py b/xarray/backends/pydap_.py
index 09bff9acc1d..148f32cf982 100644
--- a/xarray/backends/pydap_.py
+++ b/xarray/backends/pydap_.py
@@ -92,7 +92,7 @@ def open(cls, url, session=None):
return cls(ds)
def open_store_variable(self, var):
- data = indexing.LazilyOuterIndexedArray(PydapArrayWrapper(var))
+ data = indexing.LazilyIndexedArray(PydapArrayWrapper(var))
return Variable(var.dimensions, data, _fix_attributes(var.attributes))
def get_variables(self):
@@ -108,8 +108,8 @@ def get_dimensions(self):
class PydapBackendEntrypoint(BackendEntrypoint):
- def guess_can_open(self, store_spec):
- return isinstance(store_spec, str) and is_remote_uri(store_spec)
+ def guess_can_open(self, filename_or_obj):
+ return isinstance(filename_or_obj, str) and is_remote_uri(filename_or_obj)
def open_dataset(
self,
diff --git a/xarray/backends/pynio_.py b/xarray/backends/pynio_.py
index 8ace5697d09..ea9841da21e 100644
--- a/xarray/backends/pynio_.py
+++ b/xarray/backends/pynio_.py
@@ -74,7 +74,7 @@ def ds(self):
return self._manager.acquire()
def open_store_variable(self, name, var):
- data = indexing.LazilyOuterIndexedArray(NioArrayWrapper(name, self))
+ data = indexing.LazilyIndexedArray(NioArrayWrapper(name, self))
return Variable(var.dimensions, data, var.attributes)
def get_variables(self):
@@ -99,6 +99,7 @@ def close(self):
class PynioBackendEntrypoint(BackendEntrypoint):
def open_dataset(
+ self,
filename_or_obj,
mask_and_scale=True,
decode_times=True,
diff --git a/xarray/backends/rasterio_.py b/xarray/backends/rasterio_.py
index d776b116ea8..06b964fdc46 100644
--- a/xarray/backends/rasterio_.py
+++ b/xarray/backends/rasterio_.py
@@ -174,11 +174,46 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc
You can generate 2D coordinates from the file's attributes with::
- from affine import Affine
- da = xr.open_rasterio('path_to_file.tif')
- transform = Affine.from_gdal(*da.attrs['transform'])
- nx, ny = da.sizes['x'], da.sizes['y']
- x, y = np.meshgrid(np.arange(nx)+0.5, np.arange(ny)+0.5) * transform
+ >>> from affine import Affine
+ >>> da = xr.open_rasterio(
+ ... "https://github.com/mapbox/rasterio/raw/master/tests/data/RGB.byte.tif"
+ ... )
+ >>> da
+ <xarray.DataArray (band: 3, y: 718, x: 791)>
+ [1703814 values with dtype=uint8]
+ Coordinates:
+ * band (band) int64 1 2 3
+ * y (y) float64 2.827e+06 2.826e+06 2.826e+06 ... 2.612e+06 2.612e+06
+ * x (x) float64 1.021e+05 1.024e+05 1.027e+05 ... 3.389e+05 3.392e+05
+ Attributes:
+ transform: (300.0379266750948, 0.0, 101985.0, 0.0, -300.041782729805...
+ crs: +init=epsg:32618 + res: (300.0379266750948, 300.041782729805) + is_tiled: 0 + nodatavals: (0.0, 0.0, 0.0) + scales: (1.0, 1.0, 1.0) + offsets: (0.0, 0.0, 0.0) + AREA_OR_POINT: Area + >>> transform = Affine(*da.attrs["transform"]) + >>> transform + Affine(300.0379266750948, 0.0, 101985.0, + 0.0, -300.041782729805, 2826915.0) + >>> nx, ny = da.sizes["x"], da.sizes["y"] + >>> x, y = transform * np.meshgrid(np.arange(nx) + 0.5, np.arange(ny) + 0.5) + >>> x + array([[102135.01896334, 102435.05689001, 102735.09481669, ..., + 338564.90518331, 338864.94310999, 339164.98103666], + [102135.01896334, 102435.05689001, 102735.09481669, ..., + 338564.90518331, 338864.94310999, 339164.98103666], + [102135.01896334, 102435.05689001, 102735.09481669, ..., + 338564.90518331, 338864.94310999, 339164.98103666], + ..., + [102135.01896334, 102435.05689001, 102735.09481669, ..., + 338564.90518331, 338864.94310999, 339164.98103666], + [102135.01896334, 102435.05689001, 102735.09481669, ..., + 338564.90518331, 338864.94310999, 339164.98103666], + [102135.01896334, 102435.05689001, 102735.09481669, ..., + 338564.90518331, 338864.94310999, 339164.98103666]]) Parameters ---------- @@ -335,9 +370,7 @@ def open_rasterio(filename, parse_coordinates=None, chunks=None, cache=None, loc else: attrs[k] = v - data = indexing.LazilyOuterIndexedArray( - RasterioArrayWrapper(manager, lock, vrt_params) - ) + data = indexing.LazilyIndexedArray(RasterioArrayWrapper(manager, lock, vrt_params)) # this lets you write arrays loaded with rasterio data = indexing.CopyOnWriteArray(data) diff --git a/xarray/backends/scipy_.py b/xarray/backends/scipy_.py index b98515c7b5b..02e84c4d7a5 100644 --- a/xarray/backends/scipy_.py +++ b/xarray/backends/scipy_.py @@ -233,14 +233,14 @@ def close(self): class ScipyBackendEntrypoint(BackendEntrypoint): - def guess_can_open(self, store_spec): + def guess_can_open(self, filename_or_obj): try: - return read_magic_number(store_spec).startswith(b"CDF") + return read_magic_number(filename_or_obj).startswith(b"CDF") except TypeError: pass try: - _, ext = os.path.splitext(store_spec) + _, ext = os.path.splitext(filename_or_obj) except TypeError: return False return ext in {".nc", ".nc4", ".cdf", ".gz"} diff --git a/xarray/backends/store.py b/xarray/backends/store.py index d57b3ab9df8..860a0254b64 100644 --- a/xarray/backends/store.py +++ b/xarray/backends/store.py @@ -4,8 +4,8 @@ class StoreBackendEntrypoint(BackendEntrypoint): - def guess_can_open(self, store_spec): - return isinstance(store_spec, AbstractDataStore) + def guess_can_open(self, filename_or_obj): + return isinstance(filename_or_obj, AbstractDataStore) def open_dataset( self, diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index d740b207e37..a4663b6708f 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -336,7 +336,7 @@ def __init__( self._write_region = write_region def open_store_variable(self, name, zarr_array): - data = indexing.LazilyOuterIndexedArray(ZarrArrayWrapper(name, self)) + data = indexing.LazilyIndexedArray(ZarrArrayWrapper(name, self)) dimensions, attributes = _get_zarr_dims_and_attrs(zarr_array, DIMENSION_KEY) attributes = dict(attributes) encoding = { diff --git a/xarray/coding/times.py b/xarray/coding/times.py index a1822393dc1..ff42e4e8973 100644 --- a/xarray/coding/times.py +++ b/xarray/coding/times.py @@ -181,6 +181,11 @@ def _decode_datetime_with_pandas(flat_num_dates, units, calendar): # strings, in which case we fall back to using cftime raise OutOfBoundsDatetime + with 
warnings.catch_warnings(): + warnings.filterwarnings("ignore", "invalid value encountered", RuntimeWarning) + pd.to_timedelta(flat_num_dates.min(), delta) + ref_date + pd.to_timedelta(flat_num_dates.max(), delta) + ref_date + # To avoid integer overflow when converting to nanosecond units for integer # dtypes smaller than np.int64 cast all integer-dtype arrays to np.int64 # (GH 2002). diff --git a/xarray/coding/variables.py b/xarray/coding/variables.py index b035ff82086..938752c4efc 100644 --- a/xarray/coding/variables.py +++ b/xarray/coding/variables.py @@ -316,6 +316,14 @@ def decode(self, variable, name=None): if "_FillValue" in attrs: new_fill = unsigned_dtype.type(attrs["_FillValue"]) attrs["_FillValue"] = new_fill + elif data.dtype.kind == "u": + if unsigned == "false": + signed_dtype = np.dtype("i%s" % data.dtype.itemsize) + transform = partial(np.asarray, dtype=signed_dtype) + data = lazy_elemwise_func(data, transform, signed_dtype) + if "_FillValue" in attrs: + new_fill = signed_dtype.type(attrs["_FillValue"]) + attrs["_FillValue"] = new_fill else: warnings.warn( "variable %r has _Unsigned attribute but is not " diff --git a/xarray/conventions.py b/xarray/conventions.py index bb0cc5cd338..f7be60c3f9e 100644 --- a/xarray/conventions.py +++ b/xarray/conventions.py @@ -374,7 +374,7 @@ def decode_cf_variable( data = BoolTypeArray(data) if not is_duck_dask_array(data): - data = indexing.LazilyOuterIndexedArray(data) + data = indexing.LazilyIndexedArray(data) return Variable(dimensions, data, attributes, encoding=encoding) diff --git a/xarray/core/accessor_dt.py b/xarray/core/accessor_dt.py index 561d5d30a79..1d4ef755fa0 100644 --- a/xarray/core/accessor_dt.py +++ b/xarray/core/accessor_dt.py @@ -31,6 +31,10 @@ def _access_through_cftimeindex(values, name): if name == "season": months = values_as_cftimeindex.month field_values = _season_from_months(months) + elif name == "date": + raise AttributeError( + "'CFTimeIndex' object has no attribute `date`. Consider using the floor method instead, for instance: `.time.dt.floor('D')`." 
+ ) else: field_values = getattr(values_as_cftimeindex, name) return field_values.reshape(values.shape) @@ -415,6 +419,10 @@ def weekofyear(self): "time", "Timestamps corresponding to datetimes", object ) + date = Properties._tslib_field_accessor( + "date", "Date corresponding to datetimes", object + ) + is_month_start = Properties._tslib_field_accessor( "is_month_start", "Indicates whether the date is the first day of the month.", diff --git a/xarray/core/accessor_str.py b/xarray/core/accessor_str.py index 02d8ca00bf9..f0e416b52e6 100644 --- a/xarray/core/accessor_str.py +++ b/xarray/core/accessor_str.py @@ -40,6 +40,20 @@ import codecs import re import textwrap +from functools import reduce +from operator import or_ as set_union +from typing import ( + Any, + Callable, + Hashable, + Mapping, + Optional, + Pattern, + Tuple, + Type, + Union, +) +from unicodedata import normalize import numpy as np @@ -57,12 +71,76 @@ _cpython_optimized_decoders = _cpython_optimized_encoders + ("utf-16", "utf-32") -def _is_str_like(x): - return isinstance(x, str) or isinstance(x, bytes) +def _contains_obj_type(*, pat: Any, checker: Any) -> bool: + """Determine if the object fits some rule or is array of objects that do so.""" + if isinstance(checker, type): + targtype = checker + checker = lambda x: isinstance(x, targtype) + + if checker(pat): + return True + + # If it is not an object array it can't contain compiled re + if not getattr(pat, "dtype", "no") == np.object_: + return False + + return _apply_str_ufunc(func=checker, obj=pat).all() + + +def _contains_str_like(pat: Any) -> bool: + """Determine if the object is a str-like or array of str-like.""" + if isinstance(pat, (str, bytes)): + return True + + if not hasattr(pat, "dtype"): + return False + + return pat.dtype.kind == "U" or pat.dtype.kind == "S" + + +def _contains_compiled_re(pat: Any) -> bool: + """Determine if the object is a compiled re or array of compiled re.""" + return _contains_obj_type(pat=pat, checker=re.Pattern) + + +def _contains_callable(pat: Any) -> bool: + """Determine if the object is a callable or array of callables.""" + return _contains_obj_type(pat=pat, checker=callable) + + +def _apply_str_ufunc( + *, + func: Callable, + obj: Any, + dtype: Union[str, np.dtype, Type] = None, + output_core_dims: Union[list, tuple] = ((),), + output_sizes: Mapping[Hashable, int] = None, + func_args: Tuple = (), + func_kwargs: Mapping = {}, +) -> Any: + # TODO handling of na values ? + if dtype is None: + dtype = obj.dtype + + dask_gufunc_kwargs = dict() + if output_sizes is not None: + dask_gufunc_kwargs["output_sizes"] = output_sizes + + return apply_ufunc( + func, + obj, + *func_args, + vectorize=True, + dask="parallelized", + output_dtypes=[dtype], + output_core_dims=output_core_dims, + dask_gufunc_kwargs=dask_gufunc_kwargs, + **func_kwargs, + ) class StringAccessor: - """Vectorized string functions for string-like arrays. + r"""Vectorized string functions for string-like arrays. Similar to pandas, fields can be accessed through the `.str` attribute for applicable DataArrays. @@ -73,6 +151,55 @@ class StringAccessor: array([4, 4, 2, 2, 5]) Dimensions without coordinates: dim_0 + It also implements ``+``, ``*``, and ``%``, which operate as elementwise + versions of the corresponding ``str`` methods. These will automatically + broadcast for array-like inputs. 
+
+ >>> da1 = xr.DataArray(["first", "second", "third"], dims=["X"])
+ >>> da2 = xr.DataArray([1, 2, 3], dims=["Y"])
+ >>> da1.str + da2
+ <xarray.DataArray (X: 3, Y: 3)>
+ array([['first1', 'first2', 'first3'],
+ ['second1', 'second2', 'second3'],
+ ['third1', 'third2', 'third3']], dtype='<U7')
+ Dimensions without coordinates: X, Y
+
+ >>> da1 = xr.DataArray(["a", "b", "c", "d"], dims=["X"])
+ >>> reps = xr.DataArray([3, 4], dims=["Y"])
+ >>> da1.str * reps
+ <xarray.DataArray (X: 4, Y: 2)>
+ array([['aaa', 'aaaa'],
+ ['bbb', 'bbbb'],
+ ['ccc', 'cccc'],
+ ['ddd', 'dddd']], dtype='<U4')
+ Dimensions without coordinates: X, Y
+
+ >>> da1 = xr.DataArray(["%s_%s", "%s-%s", "%s|%s"], dims=["X"])
+ >>> da2 = xr.DataArray([1, 2], dims=["Y"])
+ >>> da3 = xr.DataArray([0.1, 0.2], dims=["Z"])
+ >>> da1.str % (da2, da3)
+ <xarray.DataArray (X: 3, Y: 2, Z: 2)>
+ array([[['1_0.1', '1_0.2'],
+ ['2_0.1', '2_0.2']],
+
+ [['1-0.1', '1-0.2'],
+ ['2-0.1', '2-0.2']],
+
+ [['1|0.1', '1|0.2'],
+ ['2|0.1', '2|0.2']]], dtype='<U5')
+ Dimensions without coordinates: X, Y, Z
+
+ >>> da1 = xr.DataArray(["%(a)s"], dims=["X"])
+ >>> da2 = xr.DataArray([1, 2, 3], dims=["Y"])
+ >>> da1 % {"a": da2}
+ <xarray.DataArray (X: 1)>
+ array(['<xarray.DataArray (Y: 3)>\narray([1, 2, 3])\nDimensions without coordinates: Y'],
+ dtype=object)
+ Dimensions without coordinates: X
"""
__slots__ = ("_obj",)
@@ -80,15 +207,78 @@ class StringAccessor:
def __init__(self, obj):
self._obj = obj
- def _apply(self, f, dtype=None):
- # TODO handling of na values ?
- if dtype is None:
- dtype = self._obj.dtype
+ def _stringify(
+ self,
+ invar: Any,
+ ) -> Union[str, bytes, Any]:
+ """
+ Convert a string-like to the correct string/bytes type.
- g = np.vectorize(f, otypes=[dtype])
- return apply_ufunc(g, self._obj, dask="parallelized", output_dtypes=[dtype])
+ This is mostly here to tell mypy a pattern is a str/bytes not a re.Pattern.
+ """
+ if hasattr(invar, "astype"):
+ return invar.astype(self._obj.dtype.kind)
+ else:
+ return self._obj.dtype.type(invar)
+
+ def _apply(
+ self,
+ *,
+ func: Callable,
+ dtype: Union[str, np.dtype, Type] = None,
+ output_core_dims: Union[list, tuple] = ((),),
+ output_sizes: Mapping[Hashable, int] = None,
+ func_args: Tuple = (),
+ func_kwargs: Mapping = {},
+ ) -> Any:
+ return _apply_str_ufunc(
+ obj=self._obj,
+ func=func,
+ dtype=dtype,
+ output_core_dims=output_core_dims,
+ output_sizes=output_sizes,
+ func_args=func_args,
+ func_kwargs=func_kwargs,
+ )
+
+ def _re_compile(
+ self,
+ *,
+ pat: Union[str, bytes, Pattern, Any],
+ flags: int = 0,
+ case: bool = None,
+ ) -> Union[Pattern, Any]:
+ is_compiled_re = isinstance(pat, re.Pattern)
+
+ if is_compiled_re and flags != 0:
+ raise ValueError("Flags cannot be set when pat is a compiled regex.")
+
+ if is_compiled_re and case is not None:
+ raise ValueError("Case cannot be set when pat is a compiled regex.")
+
+ if is_compiled_re:
+ # no-op, needed to tell mypy this isn't a string
+ return re.compile(pat)
+
+ if case is None:
+ case = True
+
+ # The case is handled by the re flags internally.
+ # Add it to the flags if necessary.
+ if not case:
+ flags |= re.IGNORECASE
+
+ if getattr(pat, "dtype", None) != np.object_:
+ pat = self._stringify(pat)
+ func = lambda x: re.compile(x, flags=flags)
+ if isinstance(pat, np.ndarray):
+ # apply_ufunc doesn't work for numpy arrays with output object dtypes
+ func = np.vectorize(func)
+ return func(pat)
+ else:
+ return _apply_str_ufunc(func=func, obj=pat, dtype=np.object_)
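A quick sketch of what the compiled-regex path in ``_re_compile`` enables (not part of this diff; the data values are illustrative)::

    import re

    import xarray as xr

    da = xr.DataArray(["Foo", "bar", "BAZ"], dims=["X"])
    # a pre-compiled pattern is accepted directly; its flags are respected,
    # so the `case` and `flags` arguments must be left unset
    pat = re.compile(r"ba", flags=re.IGNORECASE)
    da.str.contains(pat)  # boolean DataArray: [False, True, True]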
- def len(self):
+ def len(self) -> Any:
"""
Compute the length of each string in the array.
@@ -96,22 +286,58 @@ def len(self):
-------
lengths array : array of int
"""
- return self._apply(len, dtype=int)
+ return self._apply(func=len, dtype=int)
- def __getitem__(self, key):
+ def __getitem__(
+ self,
+ key: Union[int, slice],
+ ) -> Any:
if isinstance(key, slice):
return self.slice(start=key.start, stop=key.stop, step=key.step)
else:
return self.get(key)
- def get(self, i, default=""):
+ def __add__(
+ self,
+ other: Any,
+ ) -> Any:
+ return self.cat(other, sep="")
+
+ def __mul__(
+ self,
+ num: Union[int, Any],
+ ) -> Any:
+ return self.repeat(num)
+
+ def __mod__(
+ self,
+ other: Any,
+ ) -> Any:
+ if isinstance(other, dict):
+ other = {key: self._stringify(val) for key, val in other.items()}
+ return self._apply(func=lambda x: x % other)
+ elif isinstance(other, tuple):
+ other = tuple(self._stringify(x) for x in other)
+ return self._apply(func=lambda x, *y: x % y, func_args=other)
+ else:
+ return self._apply(func=lambda x, y: x % y, func_args=(other,))
+
+ def get(
+ self,
+ i: Union[int, Any],
+ default: Union[str, bytes] = "",
+ ) -> Any:
"""
Extract character number `i` from each string in the array.
+ If `i` is array-like, it is broadcast against the array and
+ applied elementwise.
+
Parameters
----------
- i : int
+ i : int or array-like of int
Position of element to extract.
+ If array-like, it is broadcast.
default : optional
Value for out-of-range index. If not specified (None) defaults to an empty string.
@@ -120,76 +346,334 @@ def get(self, i, default=""):
-------
items : array of object
"""
- s = slice(-1, None) if i == -1 else slice(i, i + 1)
- def f(x):
- item = x[s]
+ def f(x, iind):
+ islice = slice(-1, None) if iind == -1 else slice(iind, iind + 1)
+ item = x[islice]
return item if item else default
- return self._apply(f)
+ return self._apply(func=f, func_args=(i,))
- def slice(self, start=None, stop=None, step=None):
+ def slice(
+ self,
+ start: Union[int, Any] = None,
+ stop: Union[int, Any] = None,
+ step: Union[int, Any] = None,
+ ) -> Any:
"""
Slice substrings from each string in the array.
+ If `start`, `stop`, or `step` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- start : int, optional
+ start : int or array-like of int, optional
Start position for slice operation.
- stop : int, optional
+ If array-like, it is broadcast.
+ stop : int or array-like of int, optional
Stop position for slice operation.
- step : int, optional
+ If array-like, it is broadcast.
+ step : int or array-like of int, optional
Step size for slice operation.
+ If array-like, it is broadcast.
Returns
-------
sliced strings : same type as values
"""
- s = slice(start, stop, step)
- f = lambda x: x[s]
- return self._apply(f)
+ f = lambda x, istart, istop, istep: x[slice(istart, istop, istep)]
+ return self._apply(func=f, func_args=(start, stop, step))
- def slice_replace(self, start=None, stop=None, repl=""):
+ def slice_replace(
+ self,
+ start: Union[int, Any] = None,
+ stop: Union[int, Any] = None,
+ repl: Union[str, bytes, Any] = "",
+ ) -> Any:
"""
Replace a positional slice of a string with another value.
+ If `start`, `stop`, or `repl` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- start : int, optional
+ start : int or array-like of int, optional
Left index position to use for the slice. If not specified (None),
the slice is unbounded on the left, i.e. slice from the start
- of the string.
+ of the string. If array-like, it is broadcast.
- stop : int, optional
+ stop : int or array-like of int, optional
Right index position to use for the slice. If not specified (None),
the slice is unbounded on the right, i.e. slice until the
- end of the string.
- repl : str, optional
+ end of the string. If array-like, it is broadcast.
+ repl : str or array-like of str, optional
String for replacement. If not specified, the sliced region
- is replaced with an empty string.
+ is replaced with an empty string. If array-like, it is broadcast.
Returns
-------
replaced : same type as values
"""
- repl = self._obj.dtype.type(repl)
+ repl = self._stringify(repl)
- def f(x):
- if len(x[start:stop]) == 0:
- local_stop = start
+ def func(x, istart, istop, irepl):
+ if len(x[istart:istop]) == 0:
+ local_stop = istart
else:
- local_stop = stop
- y = self._obj.dtype.type("")
- if start is not None:
- y += x[:start]
- y += repl
- if stop is not None:
+ local_stop = istop
+ y = self._stringify("")
+ if istart is not None:
+ y += x[:istart]
+ y += irepl
+ if istop is not None:
y += x[local_stop:]
return y
- return self._apply(f)
+ return self._apply(func=func, func_args=(start, stop, repl))
- def capitalize(self):
+ def cat(
+ self,
+ *others,
+ sep: Union[str, bytes, Any] = "",
+ ) -> Any:
+ """
+ Concatenate strings elementwise in the DataArray with other strings.
+
+ The other strings can either be string scalars or other array-like.
+ Dimensions are automatically broadcast together.
+
+ An optional separator `sep` can also be specified. If `sep` is
+ array-like, it is broadcast against the array and applied elementwise.
+
+ Parameters
+ ----------
+ *others : str or array-like of str
+ Strings or array-like of strings to concatenate elementwise with
+ the current DataArray.
+ sep : str or array-like of str, default: "".
+ Separator to use between strings.
+ It is broadcast in the same way as the other input strings.
+ If array-like, its dimensions will be placed at the end of the output array dimensions.
+
+ Returns
+ -------
+ concatenated : same type as values
+
+ Examples
+ --------
+ Create a string array
+
+ >>> myarray = xr.DataArray(
+ ... ["11111", "4"],
+ ... dims=["X"],
+ ... )
+
+ Create some arrays to concatenate with it
+
+ >>> values_1 = xr.DataArray(
+ ... ["a", "bb", "cccc"],
+ ... dims=["Y"],
+ ... )
+ >>> values_2 = np.array(3.4)
+ >>> values_3 = ""
+ >>> values_4 = np.array("test", dtype=np.unicode_)
+
+ Determine the separator to use
+
+ >>> seps = xr.DataArray(
+ ... [" ", ", "],
+ ... dims=["ZZ"],
+ ... )
+
+ Concatenate the arrays using the separator
+
+ >>> myarray.str.cat(values_1, values_2, values_3, values_4, sep=seps)
+ <xarray.DataArray (X: 2, Y: 3, ZZ: 2)>
+ array([[['11111 a 3.4 test', '11111, a, 3.4, , test'],
+ ['11111 bb 3.4 test', '11111, bb, 3.4, , test'],
+ ['11111 cccc 3.4 test', '11111, cccc, 3.4, , test']],
+
+ [['4 a 3.4 test', '4, a, 3.4, , test'],
+ ['4 bb 3.4 test', '4, bb, 3.4, , test'],
+ ['4 cccc 3.4 test', '4, cccc, 3.4, , test']]], dtype='<U24')
+ Dimensions without coordinates: X, Y, ZZ
+ """
+
+ def join(
+ self,
+ dim: Hashable = None,
+ sep: Union[str, bytes, Any] = "",
+ ) -> Any:
+ """
+ Concatenate strings in a DataArray along a particular dimension.
+
+ An optional separator `sep` can also be specified. If `sep` is
+ array-like, it is broadcast against the array and applied elementwise.
+
+ Parameters
+ ----------
+ dim : hashable, optional
+ Dimension along which the strings should be concatenated.
+ Only one dimension is allowed at a time.
+ Optional for 0D or 1D DataArrays, required for multidimensional DataArrays.
+ sep : str or array-like, default: "".
+ Separator to use between strings.
+ It is broadcast in the same way as the other input strings.
+ If array-like, its dimensions will be placed at the end of the output array dimensions.
+
+ Returns
+ -------
+ joined : same type as values
+
+ Examples
+ --------
+ Create an array
+
+ >>> values = xr.DataArray(
+ ... [["a", "bab", "abc"], ["abcd", "", "abcdef"]],
+ ... dims=["X", "Y"],
+ ... )
+
+ Determine the separator
+
+ >>> seps = xr.DataArray(
+ ... ["-", "_"],
+ ... dims=["ZZ"],
+ ... )
+
+ Join the strings along a given dimension
+
+ >>> values.str.join(dim="Y", sep=seps)
+ <xarray.DataArray (X: 2, ZZ: 2)>
+ array([['a-bab-abc', 'a_bab_abc'],
+ ['abcd--abcdef', 'abcd__abcdef']], dtype='<U12')
+ Dimensions without coordinates: X, ZZ
+ """
+ if self._obj.ndim > 1 and dim is None:
+ raise ValueError("Dimension must be specified for multidimensional arrays.")
+
+ if self._obj.ndim > 1:
+ # Move the target dimension to the start and split along it
+ dimshifted = list(self._obj.transpose(dim, ...))
+ elif self._obj.ndim == 1:
+ dimshifted = list(self._obj)
+ else:
+ dimshifted = [self._obj]
+
+ start, *others = dimshifted
+
+ # concatenate the resulting arrays
+ return start.str.cat(*others, sep=sep)
+
+ def format(
+ self,
+ *args: Any,
+ **kwargs: Any,
+ ) -> Any:
+ """
+ Perform Python string formatting on each element of the DataArray.
+
+ This is equivalent to calling `str.format` on every element of the
+ DataArray. The replacement values can either be a string-like
+ scalar or array-like of string-like values. If array-like,
+ the values will be broadcast and applied elementwise to the input
+ DataArray.
+
+ .. note::
+ Array-like values provided as `*args` will have their
+ dimensions added even if those arguments are not used in any
+ string formatting.
+
+ .. warning::
+ Array-like arguments are only applied elementwise for `*args`.
+ For `**kwargs`, values are used as-is.
+
+ Parameters
+ ----------
+ *args : str or bytes or array-like of str or bytes
+ Values for positional formatting.
+ If array-like, the values are broadcast and applied elementwise.
+ The dimensions will be placed at the end of the output array dimensions
+ in the order they are provided.
+ **kwargs : str or bytes or array-like of str or bytes
+ Values for keyword-based formatting.
+ These are **not** broadcast or applied elementwise.
+
+ Returns
+ -------
+ formatted : same type as values
+
+ Examples
+ --------
+ Create an array to format.
+
+ >>> values = xr.DataArray(
+ ... ["{} is {adj0}", "{} and {} are {adj1}"],
+ ... dims=["X"],
+ ... )
+
+ Set the values to fill.
+
+ >>> noun0 = xr.DataArray(
+ ... ["spam", "egg"],
+ ... dims=["Y"],
+ ... )
+ >>> noun1 = xr.DataArray(
+ ... ["lancelot", "arthur"],
+ ... dims=["ZZ"],
+ ... )
+ >>> adj0 = "unexpected"
+ >>> adj1 = "like a duck"
+
+ Insert the values into the array
+
+ >>> values.str.format(noun0, noun1, adj0=adj0, adj1=adj1)
+ <xarray.DataArray (X: 2, Y: 2, ZZ: 2)>
+ array([[['spam is unexpected', 'spam is unexpected'],
+ ['egg is unexpected', 'egg is unexpected']],
+
+ [['spam and lancelot are like a duck',
+ 'spam and arthur are like a duck'],
+ ['egg and lancelot are like a duck',
+ 'egg and arthur are like a duck']]], dtype='<U33')
+ Dimensions without coordinates: X, Y, ZZ
+ """
+
+ def capitalize(self) -> Any:
"""
Convert strings in the array to be capitalized.
@@ -197,9 +681,9 @@ def capitalize(self):
-------
capitalized : same type as values
"""
- return self._apply(lambda x: x.capitalize())
+ return self._apply(func=lambda x: x.capitalize())
- def lower(self):
+ def lower(self) -> Any:
"""
Convert strings in the array to lowercase.
@@ -207,9 +691,9 @@ def lower(self):
-------
lowered : same type as values
"""
- return self._apply(lambda x: x.lower())
+ return self._apply(func=lambda x: x.lower())
- def swapcase(self):
+ def swapcase(self) -> Any:
"""
Convert strings in the array to be swapcased.
@@ -217,9 +701,9 @@ def swapcase(self):
-------
swapcased : same type as values
"""
- return self._apply(lambda x: x.swapcase())
+ return self._apply(func=lambda x: x.swapcase())
- def title(self):
+ def title(self) -> Any:
"""
Convert strings in the array to titlecase.
@@ -227,9 +711,9 @@ def title(self):
-------
titled : same type as values
"""
- return self._apply(lambda x: x.title())
+ return self._apply(func=lambda x: x.title())
- def upper(self):
+ def upper(self) -> Any:
"""
Convert strings in the array to uppercase.
@@ -237,9 +721,46 @@ def upper(self):
-------
uppered : same type as values
"""
- return self._apply(lambda x: x.upper())
+ return self._apply(func=lambda x: x.upper())
+
+ def casefold(self) -> Any:
+ """
+ Convert strings in the array to be casefolded.
+
+ Casefolding is similar to converting to lowercase,
+ but removes all case distinctions.
+ This is important in some languages that have more complicated
+ cases and case conversions.
+
+ Returns
+ -------
+ casefolded : same type as values
+ """
+ return self._apply(func=lambda x: x.casefold())
+
+ def normalize(
+ self,
+ form: str,
+ ) -> Any:
+ """
+ Return the Unicode normal form for the strings in the DataArray.
+
+ For more information on the forms, see the documentation for
+ :func:`unicodedata.normalize`.
+
+ Parameters
+ ----------
+ form : {"NFC", "NFKC", "NFD", "NFKD"}
+ Unicode form.
+
+ Returns
+ -------
+ normalized : same type as values
+
+ """
+ return self._apply(func=lambda x: normalize(form, x))
- def isalnum(self):
+ def isalnum(self) -> Any:
"""
Check whether all characters in each string are alphanumeric.
@@ -248,9 +769,9 @@ def isalnum(self):
isalnum : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.isalnum(), dtype=bool)
+ return self._apply(func=lambda x: x.isalnum(), dtype=bool)
- def isalpha(self):
+ def isalpha(self) -> Any:
"""
Check whether all characters in each string are alphabetic.
@@ -259,9 +780,9 @@ def isalpha(self):
isalpha : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.isalpha(), dtype=bool)
+ return self._apply(func=lambda x: x.isalpha(), dtype=bool)
- def isdecimal(self):
+ def isdecimal(self) -> Any:
"""
Check whether all characters in each string are decimal.
@@ -270,9 +791,9 @@ def isdecimal(self):
isdecimal : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.isdecimal(), dtype=bool)
+ return self._apply(func=lambda x: x.isdecimal(), dtype=bool)
- def isdigit(self):
+ def isdigit(self) -> Any:
"""
Check whether all characters in each string are digits.
@@ -281,9 +802,9 @@ def isdigit(self):
isdigit : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.isdigit(), dtype=bool)
+ return self._apply(func=lambda x: x.isdigit(), dtype=bool)
- def islower(self):
+ def islower(self) -> Any:
"""
Check whether all characters in each string are lowercase.
@@ -292,9 +813,9 @@ def islower(self):
islower : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.islower(), dtype=bool)
+ return self._apply(func=lambda x: x.islower(), dtype=bool)
- def isnumeric(self):
+ def isnumeric(self) -> Any:
"""
Check whether all characters in each string are numeric.
@@ -303,9 +824,9 @@ def isnumeric(self):
isnumeric : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.isnumeric(), dtype=bool)
+ return self._apply(func=lambda x: x.isnumeric(), dtype=bool)
- def isspace(self):
+ def isspace(self) -> Any:
"""
Check whether all characters in each string are spaces.
@@ -314,9 +835,9 @@ def isspace(self):
isspace : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.isspace(), dtype=bool)
+ return self._apply(func=lambda x: x.isspace(), dtype=bool)
- def istitle(self):
+ def istitle(self) -> Any:
"""
Check whether all characters in each string are titlecase.
@@ -325,9 +846,9 @@ def istitle(self):
istitle : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.istitle(), dtype=bool)
+ return self._apply(func=lambda x: x.istitle(), dtype=bool)
- def isupper(self):
+ def isupper(self) -> Any:
"""
Check whether all characters in each string are uppercase.
@@ -336,9 +857,14 @@ def isupper(self):
isupper : array of bool
Array of boolean values with the same shape as the original array.
"""
- return self._apply(lambda x: x.isupper(), dtype=bool)
+ return self._apply(func=lambda x: x.isupper(), dtype=bool)
- def count(self, pat, flags=0):
+ def count(
+ self,
+ pat: Union[str, bytes, Pattern, Any],
+ flags: int = 0,
+ case: bool = None,
+ ) -> Any:
"""
Count occurrences of pattern in each string of the array.
@@ -346,31 +872,49 @@
pattern is repeated in each of the string elements of the
:class:`~xarray.DataArray`.
+ The pattern `pat` can either be a single ``str`` or `re.Pattern` or
+ array-like of ``str`` or `re.Pattern`. If array-like, it is broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- pat : str
- Valid regular expression.
+ pat : str or re.Pattern or array-like of str or re.Pattern
+ A string containing a regular expression or a compiled regular
+ expression object. If array-like, it is broadcast.
flags : int, default: 0
- Flags for the `re` module. Use 0 for no flags. For a complete list,
- `see here <https://docs.python.org/3/library/re.html#module-contents>`_.
+ Flags to pass through to the re module, e.g. `re.IGNORECASE`.
+ see `compilation-flags <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
+ ``0`` means no flags. Flags can be combined with the bitwise or operator ``|``.
+ Cannot be set if `pat` is a compiled regex.
+ case : bool, default: True
+ If True, case sensitive.
+ Cannot be set if `pat` is a compiled regex.
+ Equivalent to setting the `re.IGNORECASE` flag.
Returns
-------
counts : array of int
"""
- pat = self._obj.dtype.type(pat)
- regex = re.compile(pat, flags=flags)
- f = lambda x: len(regex.findall(x))
- return self._apply(f, dtype=int)
+ pat = self._re_compile(pat=pat, flags=flags, case=case)
+
+ func = lambda x, ipat: len(ipat.findall(x))
+ return self._apply(func=func, func_args=(pat,), dtype=int)
- def startswith(self, pat):
+ def startswith(
+ self,
+ pat: Union[str, bytes, Any],
+ ) -> Any:
"""
Test if the start of each string in the array matches a pattern.
+ The pattern `pat` can either be a ``str`` or array-like of ``str``.
+ If array-like, it will be broadcast and applied elementwise.
+
Parameters
----------
pat : str
Character sequence.
Regular expressions are not accepted.
+ If array-like, it is broadcast.
Returns
-------
@@ -378,18 +922,25 @@
An array of booleans indicating whether the given pattern matches
the start of each string element.
"""
- pat = self._obj.dtype.type(pat)
- f = lambda x: x.startswith(pat)
- return self._apply(f, dtype=bool)
+ pat = self._stringify(pat)
+ func = lambda x, y: x.startswith(y)
+ return self._apply(func=func, func_args=(pat,), dtype=bool)
- def endswith(self, pat):
+ def endswith(
+ self,
+ pat: Union[str, bytes, Any],
+ ) -> Any:
"""
Test if the end of each string in the array matches a pattern.
+ The pattern `pat` can either be a ``str`` or array-like of ``str``.
+ If array-like, it will be broadcast and applied elementwise.
+
Parameters
----------
pat : str
Character sequence. Regular expressions are not accepted.
+ If array-like, it is broadcast.
Returns
-------
@@ -397,100 +948,151 @@
An array of booleans indicating whether the given pattern matches
the end of each string element.
"""
- pat = self._obj.dtype.type(pat)
- f = lambda x: x.endswith(pat)
- return self._apply(f, dtype=bool)
+ pat = self._stringify(pat)
+ func = lambda x, y: x.endswith(y)
+ return self._apply(func=func, func_args=(pat,), dtype=bool)
- def pad(self, width, side="left", fillchar=" "):
+ def pad(
+ self,
+ width: Union[int, Any],
+ side: str = "left",
+ fillchar: Union[str, bytes, Any] = " ",
+ ) -> Any:
"""
Pad strings in the array up to width.
+ If `width` or `fillchar` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- width : int
+ width : int or array-like of int
Minimum width of resulting string; additional characters will be
- filled with character defined in `fillchar`.
+ filled with character defined in ``fillchar``.
+ If array-like, it is broadcast.
side : {"left", "right", "both"}, default: "left"
Side from which to fill resulting string.
- fillchar : str, default: " "
- Additional character for filling, default is whitespace.
+ fillchar : str or array-like of str, default: " "
+ Additional character for filling, default is a space.
+ If array-like, it is broadcast.
Returns
-------
filled : same type as values
Array with a minimum number of char in each element.
"""
- width = int(width)
- fillchar = self._obj.dtype.type(fillchar)
- if len(fillchar) != 1:
- raise TypeError("fillchar must be a character, not str")
-
if side == "left":
- f = lambda s: s.rjust(width, fillchar)
+ func = self.rjust
elif side == "right":
- f = lambda s: s.ljust(width, fillchar)
+ func = self.ljust
elif side == "both":
- f = lambda s: s.center(width, fillchar)
+ func = self.center
else: # pragma: no cover
raise ValueError("Invalid side")
- return self._apply(f)
+ return func(width=width, fillchar=fillchar)
+
+ def _padder(
+ self,
+ *,
+ func: Callable,
+ width: Union[int, Any],
+ fillchar: Union[str, bytes, Any] = " ",
+ ) -> Any:
+ """
+ Wrapper function to handle padding operations
+ """
+ fillchar = self._stringify(fillchar)
+
+ def overfunc(x, iwidth, ifillchar):
+ if len(ifillchar) != 1:
+ raise TypeError("fillchar must be a character, not str")
+ return func(x, int(iwidth), ifillchar)
+
+ return self._apply(func=overfunc, func_args=(width, fillchar))
- def center(self, width, fillchar=" "):
+ def center(
+ self,
+ width: Union[int, Any],
+ fillchar: Union[str, bytes, Any] = " ",
+ ) -> Any:
"""
Pad left and right side of each string in the array.
+ If `width` or `fillchar` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- width : int
+ width : int or array-like of int
Minimum width of resulting string; additional characters will be
- filled with ``fillchar``
- fillchar : str, default: " "
- Additional character for filling, default is whitespace
+ filled with ``fillchar``. If array-like, it is broadcast.
+ fillchar : str or array-like of str, default: " "
+ Additional character for filling, default is a space.
+ If array-like, it is broadcast.
Returns
-------
filled : same type as values
"""
- return self.pad(width, side="both", fillchar=fillchar)
+ func = self._obj.dtype.type.center
+ return self._padder(func=func, width=width, fillchar=fillchar)
- def ljust(self, width, fillchar=" "):
+ def ljust(
+ self,
+ width: Union[int, Any],
+ fillchar: Union[str, bytes, Any] = " ",
+ ) -> Any:
"""
Pad right side of each string in the array.
+ If `width` or `fillchar` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- width : int
+ width : int or array-like of int
Minimum width of resulting string; additional characters will be
- filled with ``fillchar``
- fillchar : str, default: " "
- Additional character for filling, default is whitespace
+ filled with ``fillchar``. If array-like, it is broadcast.
+ fillchar : str or array-like of str, default: " "
+ Additional character for filling, default is a space.
+ If array-like, it is broadcast.
Returns
-------
filled : same type as values
"""
- return self.pad(width, side="right", fillchar=fillchar)
+ func = self._obj.dtype.type.ljust
+ return self._padder(func=func, width=width, fillchar=fillchar)
- def rjust(self, width, fillchar=" "):
+ def rjust(
+ self,
+ width: Union[int, Any],
+ fillchar: Union[str, bytes, Any] = " ",
+ ) -> Any:
"""
Pad left side of each string in the array.
+ If `width` or `fillchar` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- width : int
+ width : int or array-like of int
Minimum width of resulting string; additional characters will be
- filled with ``fillchar``
- fillchar : str, default: " "
- Additional character for filling, default is whitespace
+ filled with ``fillchar``. If array-like, it is broadcast.
+ fillchar : str or array-like of str, default: " "
+ Additional character for filling, default is a space.
+ If array-like, it is broadcast.
Returns
-------
filled : same type as values
"""
- return self.pad(width, side="left", fillchar=fillchar)
+ func = self._obj.dtype.type.rjust
+ return self._padder(func=func, width=width, fillchar=fillchar)
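The array-like ``width``/``fillchar`` broadcasting documented for the padding methods can be sketched as follows (not part of the diff; values are illustrative)::

    import xarray as xr

    da = xr.DataArray(["a", "bb"], dims=["X"])
    widths = xr.DataArray([3, 5], dims=["Y"])
    # the width array is broadcast elementwise: the result has dims ("X", "Y")
    da.str.center(widths, fillchar="-")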
- def zfill(self, width):
+ def zfill(self, width: Union[int, Any]) -> Any:
"""
Pad each string in the array by prepending '0' characters.
@@ -498,37 +1100,56 @@
left of the string to reach a total string length `width`. Strings
in the array with length greater or equal to `width` are unchanged.
+ If `width` is array-like, it is broadcast against the array and applied
+ elementwise.
+
Parameters
----------
- width : int
+ width : int or array-like of int
Minimum length of resulting string; strings with length less
- than `width` be prepended with '0' characters.
+ than `width` will be prepended with '0' characters. If array-like, it is broadcast.
Returns
-------
filled : same type as values
"""
- return self.pad(width, side="left", fillchar="0")
+ return self.rjust(width, fillchar="0")
- def contains(self, pat, case=True, flags=0, regex=True):
+ def contains(
+ self,
+ pat: Union[str, bytes, Pattern, Any],
+ case: bool = None,
+ flags: int = 0,
+ regex: bool = True,
+ ) -> Any:
"""
Test if pattern or regex is contained within each string of the array.
Return boolean array based on whether a given pattern or regex is
contained within a string of the array.
+ The pattern `pat` can either be a single ``str`` or `re.Pattern` or
+ array-like of ``str`` or `re.Pattern`. If array-like, it is broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- pat : str
- Character sequence or regular expression.
+ pat : str or re.Pattern or array-like of str or re.Pattern
+ Character sequence, a string containing a regular expression,
+ or a compiled regular expression object. If array-like, it is broadcast.
case : bool, default: True
If True, case sensitive.
+ Cannot be set if `pat` is a compiled regex.
+ Equivalent to setting the `re.IGNORECASE` flag.
flags : int, default: 0
- Flags to pass through to the re module, e.g. re.IGNORECASE.
- ``0`` means no flags.
+ Flags to pass through to the re module, e.g. `re.IGNORECASE`.
+ see `compilation-flags <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
+ ``0`` means no flags. Flags can be combined with the bitwise or operator ``|``.
+ Cannot be set if `pat` is a compiled regex.
regex : bool, default: True
If True, assumes the pat is a regular expression.
If False, treats the pat as a literal string.
+ Cannot be set to `False` if `pat` is a compiled regex.
Returns
-------
@@ -537,65 +1158,94 @@
given pattern is contained within the string of each element
of the array.
"""
- pat = self._obj.dtype.type(pat)
- if regex:
- if not case:
- flags |= re.IGNORECASE
+ is_compiled_re = _contains_compiled_re(pat)
+ if is_compiled_re and not regex:
+ raise ValueError(
+ "Must use regular expression matching for regular expression object."
+ )
- regex = re.compile(pat, flags=flags)
+ if regex:
+ if not is_compiled_re:
+ pat = self._re_compile(pat=pat, flags=flags, case=case)
- if regex.groups > 0: # pragma: no cover
- raise ValueError("This pattern has match groups.")
+ def func(x, ipat):
+ if ipat.groups > 0: # pragma: no cover
+ raise ValueError("This pattern has match groups.")
+ return bool(ipat.search(x))
- f = lambda x: bool(regex.search(x))
else:
- if case:
- f = lambda x: pat in x
+ pat = self._stringify(pat)
+ if case or case is None:
+ func = lambda x, ipat: ipat in x
+ elif self._obj.dtype.char == "U":
+ uppered = self._obj.str.casefold()
+ uppat = StringAccessor(pat).casefold()
+ return uppered.str.contains(uppat, regex=False)
else:
uppered = self._obj.str.upper()
- return uppered.str.contains(pat.upper(), regex=False)
+ uppat = StringAccessor(pat).upper()
+ return uppered.str.contains(uppat, regex=False)
- return self._apply(f, dtype=bool)
+ return self._apply(func=func, func_args=(pat,), dtype=bool)
- def match(self, pat, case=True, flags=0):
+ def match(
+ self,
+ pat: Union[str, bytes, Pattern, Any],
+ case: bool = None,
+ flags: int = 0,
+ ) -> Any:
"""
Determine if each string in the array matches a regular expression.
+ The pattern `pat` can either be a single ``str`` or `re.Pattern` or
+ array-like of ``str`` or `re.Pattern`. If array-like, it is broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- pat : str
- Character sequence or regular expression
+ pat : str or re.Pattern or array-like of str or re.Pattern
+ A string containing a regular expression or
+ a compiled regular expression object. If array-like, it is broadcast.
case : bool, default: True
- If True, case sensitive
+ If True, case sensitive.
+ Cannot be set if `pat` is a compiled regex.
+ Equivalent to setting the `re.IGNORECASE` flag.
flags : int, default: 0
- re module flags, e.g. re.IGNORECASE. ``0`` means no flags
+ Flags to pass through to the re module, e.g. `re.IGNORECASE`.
+ see `compilation-flags <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
+ ``0`` means no flags. Flags can be combined with the bitwise or operator ``|``.
+ Cannot be set if `pat` is a compiled regex.
Returns
-------
matched : array of bool
"""
- if not case:
- flags |= re.IGNORECASE
+ pat = self._re_compile(pat=pat, flags=flags, case=case)
- pat = self._obj.dtype.type(pat)
- regex = re.compile(pat, flags=flags)
- f = lambda x: bool(regex.match(x))
- return self._apply(f, dtype=bool)
+ func = lambda x, ipat: bool(ipat.match(x))
+ return self._apply(func=func, func_args=(pat,), dtype=bool)
- def strip(self, to_strip=None, side="both"):
+ def strip(
+ self,
+ to_strip: Union[str, bytes, Any] = None,
+ side: str = "both",
+ ) -> Any:
"""
Remove leading and trailing characters.
Strip whitespaces (including newlines) or a set of specified
characters from each string in the array from left and/or right sides.
+ `to_strip` can either be a ``str`` or array-like of ``str``.
+ If array-like, it will be broadcast and applied elementwise.
+
Parameters
----------
- to_strip : str or None, default: None
+ to_strip : str or array-like of str or None, default: None
Specifying the set of characters to be removed.
All combinations of this set of characters will be stripped.
- If None then whitespaces are removed.
- side : {"left", "right", "both"}, default: "left"
+ If None then whitespaces are removed. If array-like, it is broadcast.
+ side : {"left", "right", "both"}, default: "both"
Side from which to strip.
Returns
@@ -603,32 +1253,38 @@
stripped : same type as values
"""
if to_strip is not None:
- to_strip = self._obj.dtype.type(to_strip)
+ to_strip = self._stringify(to_strip)
if side == "both":
- f = lambda x: x.strip(to_strip)
+ func = lambda x, y: x.strip(y)
elif side == "left":
- f = lambda x: x.lstrip(to_strip)
+ func = lambda x, y: x.lstrip(y)
elif side == "right":
- f = lambda x: x.rstrip(to_strip)
+ func = lambda x, y: x.rstrip(y)
else: # pragma: no cover
raise ValueError("Invalid side")
- return self._apply(f)
+ return self._apply(func=func, func_args=(to_strip,))
- def lstrip(self, to_strip=None):
+ def lstrip(
+ self,
+ to_strip: Union[str, bytes, Any] = None,
+ ) -> Any:
"""
Remove leading characters.
Strip whitespaces (including newlines) or a set of specified
characters from each string in the array from the left side.
+ `to_strip` can either be a ``str`` or array-like of ``str``.
+ If array-like, it will be broadcast and applied elementwise.
+
Parameters
----------
- to_strip : str or None, default: None
+ to_strip : str or array-like of str or None, default: None
Specifying the set of characters to be removed.
All combinations of this set of characters will be stripped.
- If None then whitespaces are removed.
+ If None then whitespaces are removed. If array-like, it is broadcast.
Returns
-------
@@ -636,19 +1292,25 @@
"""
return self.strip(to_strip, side="left")
- def rstrip(self, to_strip=None):
+ def rstrip(
+ self,
+ to_strip: Union[str, bytes, Any] = None,
+ ) -> Any:
"""
Remove trailing characters.
Strip whitespaces (including newlines) or a set of specified
characters from each string in the array from the right side.
+ `to_strip` can either be a ``str`` or array-like of ``str``.
+ If array-like, it will be broadcast and applied elementwise.
+
Parameters
----------
- to_strip : str or None, default: None
+ to_strip : str or array-like of str or None, default: None
Specifying the set of characters to be removed.
All combinations of this set of characters will be stripped.
- If None then whitespaces are removed.
+ If None then whitespaces are removed. If array-like, it is broadcast.
Returns
-------
@@ -656,17 +1318,25 @@
"""
return self.strip(to_strip, side="right")
- def wrap(self, width, **kwargs):
+ def wrap(
+ self,
+ width: Union[int, Any],
+ **kwargs,
+ ) -> Any:
"""
Wrap long strings in the array in paragraphs with length less than `width`.
This method has the same keyword parameters and defaults as
:class:`textwrap.TextWrapper`.
+ If `width` is array-like, it is broadcast against the array and applied
+ elementwise.
+
Parameters
----------
- width : int
- Maximum line-width
+ width : int or array-like of int
+ Maximum line-width.
+ If array-like, it is broadcast.
**kwargs
keyword arguments passed into :class:`textwrap.TextWrapper`.
@@ -674,11 +1344,15 @@
-------
wrapped : same type as values
"""
- tw = textwrap.TextWrapper(width=width, **kwargs)
- f = lambda x: "\n".join(tw.wrap(x))
- return self._apply(f)
+ ifunc = lambda x: textwrap.TextWrapper(width=x, **kwargs)
+ tw = StringAccessor(width)._apply(func=ifunc, dtype=np.object_)
+ func = lambda x, itw: "\n".join(itw.wrap(x))
+ return self._apply(func=func, func_args=(tw,))
- def translate(self, table):
+ def translate(
+ self,
+ table: Mapping[Union[str, bytes], Union[str, bytes]],
+ ) -> Any:
"""
Map characters of each string through the given mapping table.
@@ -694,40 +1368,59 @@
-------
translated : same type as values
"""
- f = lambda x: x.translate(table)
- return self._apply(f)
+ func = lambda x: x.translate(table)
+ return self._apply(func=func)
- def repeat(self, repeats):
+ def repeat(
+ self,
+ repeats: Union[int, Any],
+ ) -> Any:
"""
- Duplicate each string in the array.
+ Repeat each string in the array.
+
+ If `repeats` is array-like, it is broadcast against the array and applied
+ elementwise.
Parameters
----------
- repeats : int
+ repeats : int or array-like of int
Number of repetitions.
+ If array-like, it is broadcast.
Returns
-------
repeated : same type as values
Array of repeated string objects.
"""
- f = lambda x: repeats * x
- return self._apply(f)
+ func = lambda x, y: x * y
+ return self._apply(func=func, func_args=(repeats,))
- def find(self, sub, start=0, end=None, side="left"):
+ def find(
+ self,
+ sub: Union[str, bytes, Any],
+ start: Union[int, Any] = 0,
+ end: Union[int, Any] = None,
+ side: str = "left",
+ ) -> Any:
"""
Return lowest or highest indexes in each string in the array
where the substring is fully contained between [start:end].
Return -1 on failure.
+ If `start`, `end`, or `sub` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- sub : str
- Substring being searched
- start : int
- Left edge index
- end : int
- Right edge index
+ sub : str or array-like of str
+ Substring being searched.
+ If array-like, it is broadcast.
+ start : int or array-like of int
+ Left edge index.
+ If array-like, it is broadcast.
+ end : int or array-like of int
+ Right edge index.
+ If array-like, it is broadcast.
side : {"left", "right"}, default: "left"
Starting side for search.
@@ -735,7 +1428,7 @@
-------
found : array of int
"""
- sub = self._obj.dtype.type(sub)
+ sub = self._stringify(sub)
if side == "left":
method = "find"
@@ -744,27 +1437,34 @@
else: # pragma: no cover
raise ValueError("Invalid side")
- if end is None:
- f = lambda x: getattr(x, method)(sub, start)
- else:
- f = lambda x: getattr(x, method)(sub, start, end)
-
- return self._apply(f, dtype=int)
+ func = lambda x, isub, istart, iend: getattr(x, method)(isub, istart, iend)
+ return self._apply(func=func, func_args=(sub, start, end), dtype=int)
- def rfind(self, sub, start=0, end=None):
+ def rfind(
+ self,
+ sub: Union[str, bytes, Any],
+ start: Union[int, Any] = 0,
+ end: Union[int, Any] = None,
+ ) -> Any:
"""
Return highest indexes in each string in the array
where the substring is fully contained between [start:end].
Return -1 on failure.
+ If `start`, `end`, or `sub` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- sub : str
- Substring being searched
- start : int
- Left edge index
- end : int
- Right edge index
+ sub : str or array-like of str
+ Substring being searched.
+ If array-like, it is broadcast.
+ start : int or array-like of int
+ Left edge index.
+ If array-like, it is broadcast.
+ end : int or array-like of int
+ Right edge index.
+ If array-like, it is broadcast.
Returns
-------
@@ -772,29 +1472,46 @@
"""
return self.find(sub, start=start, end=end, side="right")
- def index(self, sub, start=0, end=None, side="left"):
+ def index(
+ self,
+ sub: Union[str, bytes, Any],
+ start: Union[int, Any] = 0,
+ end: Union[int, Any] = None,
+ side: str = "left",
+ ) -> Any:
"""
Return lowest or highest indexes in each string where the
substring is fully contained between [start:end]. This is the same
as ``str.find`` except instead of returning -1, it raises a ValueError
when the substring is not found.
+ If `start`, `end`, or `sub` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- sub : str
- Substring being searched
- start : int
- Left edge index
- end : int
- Right edge index
+ sub : str or array-like of str
+ Substring being searched.
+ If array-like, it is broadcast.
+ start : int or array-like of int
+ Left edge index.
+ If array-like, it is broadcast.
+ end : int or array-like of int
+ Right edge index.
+ If array-like, it is broadcast.
side : {"left", "right"}, default: "left"
Starting side for search.
Returns
-------
found : array of int
+
+ Raises
+ ------
+ ValueError
+ substring is not found
"""
- sub = self._obj.dtype.type(sub)
+ sub = self._stringify(sub)
if side == "left":
method = "index"
@@ -803,61 +1520,89 @@
else: # pragma: no cover
raise ValueError("Invalid side")
- if end is None:
- f = lambda x: getattr(x, method)(sub, start)
- else:
- f = lambda x: getattr(x, method)(sub, start, end)
-
- return self._apply(f, dtype=int)
+ func = lambda x, isub, istart, iend: getattr(x, method)(isub, istart, iend)
+ return self._apply(func=func, func_args=(sub, start, end), dtype=int)
- def rindex(self, sub, start=0, end=None):
+ def rindex(
+ self,
+ sub: Union[str, bytes, Any],
+ start: Union[int, Any] = 0,
+ end: Union[int, Any] = None,
+ ) -> Any:
"""
Return highest indexes in each string where the substring
is fully contained between [start:end]. This is the same
as ``str.rfind`` except instead of returning -1, it raises a ValueError
when the substring is not found.
+ If `start`, `end`, or `sub` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- sub : str
- Substring being searched
- start : int
- Left edge index
- end : int
- Right edge index
+ sub : str or array-like of str
+ Substring being searched.
+ If array-like, it is broadcast.
+ start : int or array-like of int
+ Left edge index.
+ If array-like, it is broadcast.
+ end : int or array-like of int
+ Right edge index.
+ If array-like, it is broadcast.
Returns
-------
found : array of int
+
+ Raises
+ ------
+ ValueError
+ substring is not found
"""
return self.index(sub, start=start, end=end, side="right")
- def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True):
+ def replace(
+ self,
+ pat: Union[str, bytes, Pattern, Any],
+ repl: Union[str, bytes, Callable, Any],
+ n: Union[int, Any] = -1,
+ case: bool = None,
+ flags: int = 0,
+ regex: bool = True,
+ ) -> Any:
"""
Replace occurrences of pattern/regex in the array with some string.
+ If `pat`, `repl`, or `n` is array-like, they are broadcast
+ against the array and applied elementwise.
+
Parameters
----------
- pat : str or re.Pattern
+ pat : str or re.Pattern or array-like of str or re.Pattern
String can be a character sequence or regular expression.
- repl : str or callable
+ If array-like, it is broadcast.
+ repl : str or callable or array-like of str or callable
Replacement string or a callable. The callable is passed the regex
match object and must return a replacement string to be used.
See :func:`re.sub`.
- n : int, default: -1
+ If array-like, it is broadcast.
+ n : int or array of int, default: -1
Number of replacements to make from start. Use ``-1`` to replace all.
- case : bool, default: None
- - If True, case sensitive (the default if `pat` is a string)
- - Set to False for case insensitive
- - Cannot be set if `pat` is a compiled regex
+ If array-like, it is broadcast.
+ case : bool, default: True
+ If True, case sensitive.
+ Cannot be set if `pat` is a compiled regex.
+ Equivalent to setting the `re.IGNORECASE` flag.
flags : int, default: 0
- - re module flags, e.g. re.IGNORECASE. Use ``0`` for no flags.
- - Cannot be set if `pat` is a compiled regex
+ Flags to pass through to the re module, e.g. `re.IGNORECASE`.
+ see `compilation-flags <https://docs.python.org/3/howto/regex.html#compilation-flags>`_.
+ ``0`` means no flags. Flags can be combined with the bitwise or operator ``|``.
+ Cannot be set if `pat` is a compiled regex.
regex : bool, default: True - - If True, assumes the passed-in pattern is a regular expression. - - If False, treats the pattern as a literal string - - Cannot be set to False if `pat` is a compiled regex or `repl` is - a callable. + If True, assumes the passed-in pattern is a regular expression. + If False, treats the pattern as a literal string. + Cannot be set to False if `pat` is a compiled regex or `repl` is + a callable. Returns ------- @@ -865,84 +1610,968 @@ def replace(self, pat, repl, n=-1, case=None, flags=0, regex=True): A copy of the object with all matching occurrences of `pat` replaced by `repl`. """ - if not (_is_str_like(repl) or callable(repl)): # pragma: no cover + if _contains_str_like(repl): + repl = self._stringify(repl) + elif not _contains_callable(repl): # pragma: no cover raise TypeError("repl must be a string or callable") - if _is_str_like(pat): - pat = self._obj.dtype.type(pat) + is_compiled_re = _contains_compiled_re(pat) + if not regex and is_compiled_re: + raise ValueError( + "Cannot use a compiled regex as replacement pattern with regex=False" + ) - if _is_str_like(repl): - repl = self._obj.dtype.type(repl) + if not regex and callable(repl): + raise ValueError("Cannot use a callable replacement when regex=False") - is_compiled_re = isinstance(pat, type(re.compile(""))) if regex: - if is_compiled_re: - if (case is not None) or (flags != 0): - raise ValueError( - "case and flags cannot be set when pat is a compiled regex" - ) - else: - # not a compiled regex - # set default case - if case is None: - case = True - - # add case flag, if provided - if case is False: - flags |= re.IGNORECASE - if is_compiled_re or len(pat) > 1 or flags or callable(repl): - n = n if n >= 0 else 0 - compiled = re.compile(pat, flags=flags) - f = lambda x: compiled.sub(repl=repl, string=x, count=n) - else: - f = lambda x: x.replace(pat, repl, n) + pat = self._re_compile(pat=pat, flags=flags, case=case) + func = lambda x, ipat, irepl, i_n: ipat.sub( + repl=irepl, string=x, count=i_n if i_n >= 0 else 0 + ) else: - if is_compiled_re: - raise ValueError( - "Cannot use a compiled regex as replacement " - "pattern with regex=False" - ) - if callable(repl): - raise ValueError("Cannot use a callable replacement when regex=False") - f = lambda x: x.replace(pat, repl, n) - return self._apply(f) - - def decode(self, encoding, errors="strict"): + pat = self._stringify(pat) + func = lambda x, ipat, irepl, i_n: x.replace(ipat, irepl, i_n) + return self._apply(func=func, func_args=(pat, repl, n)) + + def extract( + self, + pat: Union[str, bytes, Pattern, Any], + dim: Hashable, + case: bool = None, + flags: int = 0, + ) -> Any: + r""" + Extract the first match of capture groups in the regex pat as a new + dimension in a DataArray. + + For each string in the DataArray, extract groups from the first match + of regular expression pat. + + If `pat` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + pat : str or re.Pattern or array-like of str or re.Pattern + A string containing a regular expression or a compiled regular + expression object. If array-like, it is broadcast. + dim : hashable or None + Name of the new dimension to store the captured strings in. + If None, the pattern must have only one capture group and the + resulting DataArray will have the same size as the original. + case : bool, default: True + If True, case sensitive. + Cannot be set if `pat` is a compiled regex. + Equivalent to setting the `re.IGNORECASE` flag. 
+ flags : int, default: 0 + Flags to pass through to the re module, e.g. `re.IGNORECASE`. + see `compilation-flags `_. + ``0`` means no flags. Flags can be combined with the bitwise or operator ``|``. + Cannot be set if `pat` is a compiled regex. + + Returns + ------- + extracted : same type as values or object array + + Raises + ------ + ValueError + `pat` has no capture groups. + ValueError + `dim` is None and there is more than one capture group. + ValueError + `case` is set when `pat` is a compiled regular expression. + KeyError + The given dimension is already present in the DataArray. + + Examples + -------- + Create a string array + + >>> value = xr.DataArray( + ... [ + ... [ + ... "a_Xy_0", + ... "ab_xY_10-bab_Xy_110-baab_Xy_1100", + ... "abc_Xy_01-cbc_Xy_2210", + ... ], + ... [ + ... "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + ... "", + ... "abcdef_Xy_101-fef_Xy_5543210", + ... ], + ... ], + ... dims=["X", "Y"], + ... ) + + Extract matches + + >>> value.str.extract(r"(\w+)_Xy_(\d*)", dim="match") + + array([[['a', '0'], + ['bab', '110'], + ['abc', '01']], + + [['abcd', ''], + ['', ''], + ['abcdef', '101']]], dtype=' Any: + r""" + Extract all matches of capture groups in the regex pat as new + dimensions in a DataArray. + + For each string in the DataArray, extract groups from all matches + of regular expression pat. + Equivalent to applying re.findall() to all the elements in the DataArray + and splitting the results across dimensions. + + If `pat` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + pat : str or re.Pattern + A string containing a regular expression or a compiled regular + expression object. If array-like, it is broadcast. + group_dim : hashable + Name of the new dimensions corresponding to the capture groups. + This dimension is added to the new DataArray first. + match_dim : hashable + Name of the new dimensions corresponding to the matches for each group. + This dimension is added to the new DataArray second. + case : bool, default: True + If True, case sensitive. + Cannot be set if `pat` is a compiled regex. + Equivalent to setting the `re.IGNORECASE` flag. + flags : int, default: 0 + Flags to pass through to the re module, e.g. `re.IGNORECASE`. + see `compilation-flags `_. + ``0`` means no flags. Flags can be combined with the bitwise or operator ``|``. + Cannot be set if `pat` is a compiled regex. + + Returns + ------- + extracted : same type as values or object array + + Raises + ------ + ValueError + `pat` has no capture groups. + ValueError + `case` is set when `pat` is a compiled regular expression. + KeyError + Either of the given dimensions is already present in the DataArray. + KeyError + The given dimensions names are the same. + + Examples + -------- + Create a string array + + >>> value = xr.DataArray( + ... [ + ... [ + ... "a_Xy_0", + ... "ab_xY_10-bab_Xy_110-baab_Xy_1100", + ... "abc_Xy_01-cbc_Xy_2210", + ... ], + ... [ + ... "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + ... "", + ... "abcdef_Xy_101-fef_Xy_5543210", + ... ], + ... ], + ... dims=["X", "Y"], + ... ) + + Extract matches + + >>> value.str.extractall( + ... r"(\w+)_Xy_(\d*)", group_dim="group", match_dim="match" + ... 
) + + array([[[['a', '0'], + ['', ''], + ['', '']], + + [['bab', '110'], + ['baab', '1100'], + ['', '']], + + [['abc', '01'], + ['cbc', '2210'], + ['', '']]], + + + [[['abcd', ''], + ['dcd', '33210'], + ['dccd', '332210']], + + [['', ''], + ['', ''], + ['', '']], + + [['abcdef', '101'], + ['fef', '5543210'], + ['', '']]]], dtype=' Any: + r""" + Find all occurrences of pattern or regular expression in the DataArray. + + Equivalent to applying re.findall() to all the elements in the DataArray. + Results in an object array of lists. + If there is only one capture group, the lists will be a sequence of matches. + If there are multiple capture groups, the lists will be a sequence of lists, + each of which contains a sequence of matches. + + If `pat` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + pat : str or re.Pattern + A string containing a regular expression or a compiled regular + expression object. If array-like, it is broadcast. + case : bool, default: True + If True, case sensitive. + Cannot be set if `pat` is a compiled regex. + Equivalent to setting the `re.IGNORECASE` flag. + flags : int, default: 0 + Flags to pass through to the re module, e.g. `re.IGNORECASE`. + see `compilation-flags `_. + ``0`` means no flags. Flags can be combined with the bitwise or operator ``|``. + Cannot be set if `pat` is a compiled regex. + + Returns + ------- + extracted : object array + + Raises + ------ + ValueError + `pat` has no capture groups. + ValueError + `case` is set when `pat` is a compiled regular expression. + + Examples + -------- + Create a string array + + >>> value = xr.DataArray( + ... [ + ... [ + ... "a_Xy_0", + ... "ab_xY_10-bab_Xy_110-baab_Xy_1100", + ... "abc_Xy_01-cbc_Xy_2210", + ... ], + ... [ + ... "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + ... "", + ... "abcdef_Xy_101-fef_Xy_5543210", + ... ], + ... ], + ... dims=["X", "Y"], + ... ) + + Extract matches + + >>> value.str.findall(r"(\w+)_Xy_(\d*)") + + array([[list([('a', '0')]), list([('bab', '110'), ('baab', '1100')]), + list([('abc', '01'), ('cbc', '2210')])], + [list([('abcd', ''), ('dcd', '33210'), ('dccd', '332210')]), + list([]), list([('abcdef', '101'), ('fef', '5543210')])]], + dtype=object) + Dimensions without coordinates: X, Y + + See Also + -------- + DataArray.str.extract + DataArray.str.extractall + re.compile + re.findall + pandas.Series.str.findall + """ + pat = self._re_compile(pat=pat, flags=flags, case=case) + + def func(x, ipat): + if ipat.groups == 0: + raise ValueError("No capture groups found in pattern.") + + return ipat.findall(x) + + return self._apply(func=func, func_args=(pat,), dtype=np.object_) + + def _partitioner( + self, + *, + func: Callable, + dim: Hashable, + sep: Optional[Union[str, bytes, Any]], + ) -> Any: + """ + Implements logic for `partition` and `rpartition`. 
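The three regex extraction methods keep different amounts of match structure. A compact comparison on made-up data (a sketch, not part of this patch):

```python
import xarray as xr

value = xr.DataArray(["a_Xy_0-b_Xy_10", "c_Xy_2"], dims=["X"])

# First match only; the capture groups land on one new dimension.
value.str.extract(r"(\w+)_Xy_(\d*)", dim="match")

# All matches, padded with "" so every element has the same shape.
value.str.extractall(r"(\w+)_Xy_(\d*)", group_dim="group", match_dim="match")

# All matches as plain Python lists in an object array, no padding.
value.str.findall(r"(\w+)_Xy_(\d*)")
```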
+ """ + sep = self._stringify(sep) + + if dim is None: + listfunc = lambda x, isep: list(func(x, isep)) + return self._apply(func=listfunc, func_args=(sep,), dtype=np.object_) + + # _apply breaks on an empty array in this case + if not self._obj.size: + return self._obj.copy().expand_dims({dim: 0}, axis=-1) + + arrfunc = lambda x, isep: np.array(func(x, isep), dtype=self._obj.dtype) + + # dtype MUST be object or strings can be truncated + # See: https://github.com/numpy/numpy/issues/8352 + return self._apply( + func=arrfunc, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: 3}, + ).astype(self._obj.dtype.kind) + + def partition( + self, + dim: Optional[Hashable], + sep: Union[str, bytes, Any] = " ", + ) -> Any: + """ + Split the strings in the DataArray at the first occurrence of separator `sep`. + + This method splits the string at the first occurrence of `sep`, + and returns 3 elements containing the part before the separator, + the separator itself, and the part after the separator. + If the separator is not found, return 3 elements containing the string itself, + followed by two empty strings. + + If `sep` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + dim : hashable or None + Name for the dimension to place the 3 elements in. + If `None`, place the results as list elements in an object DataArray. + sep : str, default: " " + String to split on. + If array-like, it is broadcast. + + Returns + ------- + partitioned : same type as values or object array + + See Also + -------- + DataArray.str.rpartition + str.partition + pandas.Series.str.partition + """ + return self._partitioner(func=self._obj.dtype.type.partition, dim=dim, sep=sep) + + def rpartition( + self, + dim: Optional[Hashable], + sep: Union[str, bytes, Any] = " ", + ) -> Any: + """ + Split the strings in the DataArray at the last occurrence of separator `sep`. + + This method splits the string at the last occurrence of `sep`, + and returns 3 elements containing the part before the separator, + the separator itself, and the part after the separator. + If the separator is not found, return 3 elements containing two empty strings, + followed by the string itself. + + If `sep` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + dim : hashable or None + Name for the dimension to place the 3 elements in. + If `None`, place the results as list elements in an object DataArray. + sep : str, default: " " + String to split on. + If array-like, it is broadcast. + + Returns + ------- + rpartitioned : same type as values or object array + + See Also + -------- + DataArray.str.partition + str.rpartition + pandas.Series.str.rpartition + """ + return self._partitioner(func=self._obj.dtype.type.rpartition, dim=dim, sep=sep) + + def _splitter( + self, + *, + func: Callable, + pre: bool, + dim: Hashable, + sep: Optional[Union[str, bytes, Any]], + maxsplit: int, + ) -> Any: + """ + Implements logic for `split` and `rsplit`. 
+ """ + if sep is not None: + sep = self._stringify(sep) + + if dim is None: + f_none = lambda x, isep: func(x, isep, maxsplit) + return self._apply(func=f_none, func_args=(sep,), dtype=np.object_) + + # _apply breaks on an empty array in this case + if not self._obj.size: + return self._obj.copy().expand_dims({dim: 0}, axis=-1) + + f_count = lambda x, isep: max(len(func(x, isep, maxsplit)), 1) + maxsplit = ( + self._apply(func=f_count, func_args=(sep,), dtype=np.int_).max().data.item() + - 1 + ) + + def _dosplit(mystr, sep, maxsplit=maxsplit, dtype=self._obj.dtype): + res = func(mystr, sep, maxsplit) + if len(res) < maxsplit + 1: + pad = [""] * (maxsplit + 1 - len(res)) + if pre: + res += pad + else: + res = pad + res + return np.array(res, dtype=dtype) + + # dtype MUST be object or strings can be truncated + # See: https://github.com/numpy/numpy/issues/8352 + return self._apply( + func=_dosplit, + func_args=(sep,), + dtype=np.object_, + output_core_dims=[[dim]], + output_sizes={dim: maxsplit}, + ).astype(self._obj.dtype.kind) + + def split( + self, + dim: Optional[Hashable], + sep: Union[str, bytes, Any] = None, + maxsplit: int = -1, + ) -> Any: + r""" + Split strings in a DataArray around the given separator/delimiter `sep`. + + Splits the string in the DataArray from the beginning, + at the specified delimiter string. + + If `sep` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + dim : hashable or None + Name for the dimension to place the results in. + If `None`, place the results as list elements in an object DataArray. + sep : str, default: None + String to split on. If ``None`` (the default), split on any whitespace. + If array-like, it is broadcast. + maxsplit : int, default: -1 + Limit number of splits in output, starting from the beginning. + If -1 (the default), return all splits. + + Returns + ------- + splitted : same type as values or object array + + Examples + -------- + Create a string DataArray + + >>> values = xr.DataArray( + ... [ + ... ["abc def", "spam\t\teggs\tswallow", "red_blue"], + ... ["test0\ntest1\ntest2\n\ntest3", "", "abra ka\nda\tbra"], + ... ], + ... dims=["X", "Y"], + ... 
) + + Split once and put the results in a new dimension + + >>> values.str.split(dim="splitted", maxsplit=1) + + array([[['abc', 'def'], + ['spam', 'eggs\tswallow'], + ['red_blue', '']], + + [['test0', 'test1\ntest2\n\ntest3'], + ['', ''], + ['abra', 'ka\nda\tbra']]], dtype='>> values.str.split(dim="splitted") + + array([[['abc', 'def', '', ''], + ['spam', 'eggs', 'swallow', ''], + ['red_blue', '', '', '']], + + [['test0', 'test1', 'test2', 'test3'], + ['', '', '', ''], + ['abra', 'ka', 'da', 'bra']]], dtype='>> values.str.split(dim=None, maxsplit=1) + + array([[list(['abc', 'def']), list(['spam', 'eggs\tswallow']), + list(['red_blue'])], + [list(['test0', 'test1\ntest2\n\ntest3']), list([]), + list(['abra', 'ka\nda\tbra'])]], dtype=object) + Dimensions without coordinates: X, Y + + Split as many times as needed and put the results in a list + + >>> values.str.split(dim=None) + + array([[list(['abc', 'def']), list(['spam', 'eggs', 'swallow']), + list(['red_blue'])], + [list(['test0', 'test1', 'test2', 'test3']), list([]), + list(['abra', 'ka', 'da', 'bra'])]], dtype=object) + Dimensions without coordinates: X, Y + + Split only on spaces + + >>> values.str.split(dim="splitted", sep=" ") + + array([[['abc', 'def', ''], + ['spam\t\teggs\tswallow', '', ''], + ['red_blue', '', '']], + + [['test0\ntest1\ntest2\n\ntest3', '', ''], + ['', '', ''], + ['abra', '', 'ka\nda\tbra']]], dtype=' Any: + r""" + Split strings in a DataArray around the given separator/delimiter `sep`. + + Splits the string in the DataArray from the end, + at the specified delimiter string. + + If `sep` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + dim : hashable or None + Name for the dimension to place the results in. + If `None`, place the results as list elements in an object DataArray + sep : str, default: None + String to split on. If ``None`` (the default), split on any whitespace. + If array-like, it is broadcast. + maxsplit : int, default: -1 + Limit number of splits in output, starting from the end. + If -1 (the default), return all splits. + The final number of split values may be less than this if there are no + DataArray elements with that many values. + + Returns + ------- + rsplitted : same type as values or object array + + Examples + -------- + Create a string DataArray + + >>> values = xr.DataArray( + ... [ + ... ["abc def", "spam\t\teggs\tswallow", "red_blue"], + ... ["test0\ntest1\ntest2\n\ntest3", "", "abra ka\nda\tbra"], + ... ], + ... dims=["X", "Y"], + ... 
) + + Split once and put the results in a new dimension + + >>> values.str.rsplit(dim="splitted", maxsplit=1) + + array([[['abc', 'def'], + ['spam\t\teggs', 'swallow'], + ['', 'red_blue']], + + [['test0\ntest1\ntest2', 'test3'], + ['', ''], + ['abra ka\nda', 'bra']]], dtype='>> values.str.rsplit(dim="splitted") + + array([[['', '', 'abc', 'def'], + ['', 'spam', 'eggs', 'swallow'], + ['', '', '', 'red_blue']], + + [['test0', 'test1', 'test2', 'test3'], + ['', '', '', ''], + ['abra', 'ka', 'da', 'bra']]], dtype='>> values.str.rsplit(dim=None, maxsplit=1) + + array([[list(['abc', 'def']), list(['spam\t\teggs', 'swallow']), + list(['red_blue'])], + [list(['test0\ntest1\ntest2', 'test3']), list([]), + list(['abra ka\nda', 'bra'])]], dtype=object) + Dimensions without coordinates: X, Y + + Split as many times as needed and put the results in a list + + >>> values.str.rsplit(dim=None) + + array([[list(['abc', 'def']), list(['spam', 'eggs', 'swallow']), + list(['red_blue'])], + [list(['test0', 'test1', 'test2', 'test3']), list([]), + list(['abra', 'ka', 'da', 'bra'])]], dtype=object) + Dimensions without coordinates: X, Y + + Split only on spaces + + >>> values.str.rsplit(dim="splitted", sep=" ") + + array([[['', 'abc', 'def'], + ['', '', 'spam\t\teggs\tswallow'], + ['', '', 'red_blue']], + + [['', '', 'test0\ntest1\ntest2\n\ntest3'], + ['', '', ''], + ['abra', '', 'ka\nda\tbra']]], dtype=' Any: + """ + Return DataArray of dummy/indicator variables. + + Each string in the DataArray is split at `sep`. + A new dimension is created with coordinates for each unique result, + and the corresponding element of that dimension is `True` if + that result is present and `False` if not. + + If `sep` is array-like, it is broadcast against the array and applied + elementwise. + + Parameters + ---------- + dim : hashable + Name for the dimension to place the results in. + sep : str, default: "|". + String to split on. + If array-like, it is broadcast. + + Returns + ------- + dummies : array of bool + + Examples + -------- + Create a string array + + >>> values = xr.DataArray( + ... [ + ... ["a|ab~abc|abc", "ab", "a||abc|abcd"], + ... ["abcd|ab|a", "abc|ab~abc", "|a"], + ... ], + ... dims=["X", "Y"], + ... ) + + Extract dummy values + + >>> values.str.get_dummies(dim="dummies") + + array([[[ True, False, True, False, True], + [False, True, False, False, False], + [ True, False, True, True, False]], + + [[ True, True, False, True, False], + [False, False, True, False, True], + [ True, False, False, False, False]]]) + Coordinates: + * dummies (dummies) Any: """ Decode character string in the array using indicated encoding. Parameters ---------- encoding : str + The encoding to use. + Please see the Python documentation `codecs standard encoders `_ + section for a list of encodings handlers. errors : str, optional + The handler for encoding errors. + Please see the Python documentation `codecs error handlers `_ + for a list of error handlers. Returns ------- decoded : same type as values """ if encoding in _cpython_optimized_decoders: - f = lambda x: x.decode(encoding, errors) + func = lambda x: x.decode(encoding, errors) else: decoder = codecs.getdecoder(encoding) - f = lambda x: decoder(x, errors)[0] - return self._apply(f, dtype=np.str_) + func = lambda x: decoder(x, errors)[0] + return self._apply(func=func, dtype=np.str_) - def encode(self, encoding, errors="strict"): + def encode( + self, + encoding: str, + errors: str = "strict", + ) -> Any: """ Encode character string in the array using indicated encoding. 
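A round trip through the two codec methods, as a sketch (strings invented for illustration):

```python
import xarray as xr

texts = xr.DataArray(["café", "naïve"])

encoded = texts.str.encode("utf-8")    # elements become bytes (np.bytes_)
decoded = encoded.str.decode("utf-8")  # and round-trip back to str (np.str_)
```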
        Parameters
        ----------
        encoding : str
+            The encoding to use.
+            Please see the Python documentation `codecs standard encoders
+            <https://docs.python.org/3/library/codecs.html#standard-encodings>`_
+            section for a list of encodings.
        errors : str, optional
+            The handler for encoding errors.
+            Please see the Python documentation `codecs error handlers
+            <https://docs.python.org/3/library/codecs.html#error-handlers>`_
+            for a list of error handlers.

        Returns
        -------
        encoded : same type as values
        """
        if encoding in _cpython_optimized_encoders:
-            f = lambda x: x.encode(encoding, errors)
+            func = lambda x: x.encode(encoding, errors)
        else:
            encoder = codecs.getencoder(encoding)
-            f = lambda x: encoder(x, errors)[0]
-        return self._apply(f, dtype=np.bytes_)
+            func = lambda x: encoder(x, errors)[0]
+        return self._apply(func=func, dtype=np.bytes_)
diff --git a/xarray/core/computation.py b/xarray/core/computation.py
index e68c6b2629d..027bf29000a 100644
--- a/xarray/core/computation.py
+++ b/xarray/core/computation.py
@@ -923,6 +923,14 @@ def apply_ufunc(
     Single value or tuple of Dataset, DataArray, Variable, dask.array.Array or
     numpy.ndarray, the first type on that list to appear on an input.

+    Notes
+    -----
+    This function is designed for the more common case where ``func`` can work on numpy
+    arrays. If ``func`` needs to manipulate a whole xarray object subset to each block,
+    it is possible to use :py:func:`xarray.map_blocks`.
+
+    Note that due to this overhead, ``map_blocks`` is considerably slower than ``apply_ufunc``.
+
     Examples
     --------
     Calculate the vector magnitude of two arguments:
@@ -1015,6 +1023,8 @@ def earth_mover_distance(first_samples,
     numpy.broadcast_arrays
     numba.vectorize
     numba.guvectorize
+    dask.array.apply_gufunc
+    xarray.map_blocks

     References
     ----------
diff --git a/xarray/core/dataarray.py b/xarray/core/dataarray.py
index de0310e468f..8c5df5575ac 100644
--- a/xarray/core/dataarray.py
+++ b/xarray/core/dataarray.py
@@ -3781,6 +3781,8 @@ def polyfit(
         See Also
         --------
         numpy.polyfit
+        numpy.polyval
+        xarray.polyval
         """
         return self._to_temp_dataset().polyfit(
             dim, deg, skipna=skipna, rcond=rcond, w=w, full=full, cov=cov
@@ -4352,6 +4354,70 @@ def argmax(
         else:
             return self._replace_maybe_drop_dims(result)

+    def query(
+        self,
+        queries: Mapping[Hashable, Any] = None,
+        parser: str = "pandas",
+        engine: str = None,
+        missing_dims: str = "raise",
+        **queries_kwargs: Any,
+    ) -> "DataArray":
+        """Return a new data array indexed along the specified
+        dimension(s), where the indexers are given as strings containing
+        Python expressions to be evaluated against the values in the array.
+
+        Parameters
+        ----------
+        queries : dict, optional
+            A dict with keys matching dimensions and values given by strings
+            containing Python expressions to be evaluated against the data variables
+            in the dataset. The expressions will be evaluated using the pandas
+            eval() function, and can contain any valid Python expressions but cannot
+            contain any Python statements.
+        parser : {"pandas", "python"}, default: "pandas"
+            The parser to use to construct the syntax tree from the expression.
+            The default of 'pandas' parses code slightly differently than standard
+            Python. Alternatively, you can parse an expression using the 'python'
+            parser to retain strict Python semantics.
+        engine : {"python", "numexpr", None}, default: None
+            The engine used to evaluate the expression.
+            Supported engines are:
+            - None: tries to use numexpr, falls back to python
+            - "numexpr": evaluates expressions using numexpr
+            - "python": performs operations as if you had eval'd in top-level Python
+        missing_dims : {"raise", "warn", "ignore"}, default: "raise"
+            What to do if dimensions that should be selected from are not present in the
+            Dataset:
+            - "raise": raise an exception
+            - "warn": raise a warning, and ignore the missing dimensions
+            - "ignore": ignore the missing dimensions
+        **queries_kwargs : {dim: query, ...}, optional
+            The keyword arguments form of ``queries``.
+            One of queries or queries_kwargs must be provided.
+
+        Returns
+        -------
+        obj : DataArray
+            A new DataArray with the same contents as this array, indexed by
+            the results of the appropriate queries.
+
+        See Also
+        --------
+        DataArray.isel
+        Dataset.query
+        pandas.eval
+
+        """
+
+        ds = self._to_dataset_whole(shallow_copy=True)
+        ds = ds.query(
+            queries=queries,
+            parser=parser,
+            engine=engine,
+            missing_dims=missing_dims,
+            **queries_kwargs,
+        )
+        return ds[self.name]
+
     # this needs to be at the end, or mypy will confuse with `str`
     # https://mypy.readthedocs.io/en/latest/common_issues.html#dealing-with-conflicting-names
     str = utils.UncachedAccessor(StringAccessor)
diff --git a/xarray/core/dataset.py b/xarray/core/dataset.py
index db45157e7c1..19b3eec5f07 100644
--- a/xarray/core/dataset.py
+++ b/xarray/core/dataset.py
@@ -6399,6 +6399,8 @@ def polyfit(
         See Also
         --------
         numpy.polyfit
+        numpy.polyval
+        xarray.polyval
         """
         variables = {}
         skipna_da = skipna
@@ -6999,5 +7001,78 @@ def argmax(self, dim=None, **kwargs):
                 "Dataset.argmin() with a sequence or ... for dim"
             )

+    def query(
+        self,
+        queries: Mapping[Hashable, Any] = None,
+        parser: str = "pandas",
+        engine: str = None,
+        missing_dims: str = "raise",
+        **queries_kwargs: Any,
+    ) -> "Dataset":
+        """Return a new dataset with each array indexed along the specified
+        dimension(s), where the indexers are given as strings containing
+        Python expressions to be evaluated against the data variables in the
+        dataset.
+
+        Parameters
+        ----------
+        queries : dict, optional
+            A dict with keys matching dimensions and values given by strings
+            containing Python expressions to be evaluated against the data variables
+            in the dataset. The expressions will be evaluated using the pandas
+            eval() function, and can contain any valid Python expressions but cannot
+            contain any Python statements.
+        parser : {"pandas", "python"}, default: "pandas"
+            The parser to use to construct the syntax tree from the expression.
+            The default of 'pandas' parses code slightly differently than standard
+            Python. Alternatively, you can parse an expression using the 'python'
+            parser to retain strict Python semantics.
+        engine : {"python", "numexpr", None}, default: None
+            The engine used to evaluate the expression. Supported engines are:
+            - None: tries to use numexpr, falls back to python
+            - "numexpr": evaluates expressions using numexpr
+            - "python": performs operations as if you had eval'd in top-level Python
+        missing_dims : {"raise", "warn", "ignore"}, default: "raise"
+            What to do if dimensions that should be selected from are not present in the
+            Dataset:
+            - "raise": raise an exception
+            - "warn": raise a warning, and ignore the missing dimensions
+            - "ignore": ignore the missing dimensions
+        **queries_kwargs : {dim: query, ...}, optional
+            The keyword arguments form of ``queries``.
+            One of queries or queries_kwargs must be provided.
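A minimal usage sketch for the new ``query`` methods (names and data invented; note the DataArray must be named, since the expression refers to it by name after the internal conversion to a Dataset):

```python
import numpy as np
import xarray as xr

da = xr.DataArray(np.arange(5), dims="x", name="a")
da.query(x="a > 2")  # keeps the positions along "x" where a > 2

ds = xr.Dataset({"a": ("x", np.arange(5)), "b": ("x", np.linspace(0, 1, 5))})
ds.query(x="(a > 2) & (b < 0.9)")
```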
+ + Returns + ------- + obj : Dataset + A new Dataset with the same contents as this dataset, except each + array and dimension is indexed by the results of the appropriate + queries. + + See Also + -------- + Dataset.isel + pandas.eval + + """ + + # allow queries to be given either as a dict or as kwargs + queries = either_dict_or_kwargs(queries, queries_kwargs, "query") + + # check queries + for dim, expr in queries.items(): + if not isinstance(expr, str): + msg = f"expr for dim {dim} must be a string to be evaluated, {type(expr)} given" + raise ValueError(msg) + + # evaluate the queries to create the indexers + indexers = { + dim: pd.eval(expr, resolvers=[self], parser=parser, engine=engine) + for dim, expr in queries.items() + } + + # apply the selection + return self.isel(indexers, missing_dims=missing_dims) + ops.inject_all_ops_and_reduce_methods(Dataset, array_only=False) diff --git a/xarray/core/groupby.py b/xarray/core/groupby.py index 824f2767153..8a0972519e1 100644 --- a/xarray/core/groupby.py +++ b/xarray/core/groupby.py @@ -415,11 +415,20 @@ def dims(self): @property def groups(self): + """ + Mapping from group labels to indices. The indices can be used to index the underlying object. + """ # provided to mimic pandas.groupby if self._groups is None: self._groups = dict(zip(self._unique_coord.values, self._group_indices)) return self._groups + def __getitem__(self, key): + """ + Get DataArray or Dataset corresponding to a particular group label. + """ + return self._obj.isel({self._group_dim: self.groups[key]}) + def __len__(self): return self._unique_coord.size diff --git a/xarray/core/indexing.py b/xarray/core/indexing.py index 1cac5e89906..82810908bea 100644 --- a/xarray/core/indexing.py +++ b/xarray/core/indexing.py @@ -513,7 +513,7 @@ def __getitem__(self, key): return result -class LazilyOuterIndexedArray(ExplicitlyIndexedNDArrayMixin): +class LazilyIndexedArray(ExplicitlyIndexedNDArrayMixin): """Wrap an array to make basic and outer indexing lazy.""" __slots__ = ("array", "key") @@ -619,10 +619,10 @@ def _updated_key(self, new_key): return _combine_indexers(self.key, self.shape, new_key) def __getitem__(self, indexer): - # If the indexed array becomes a scalar, return LazilyOuterIndexedArray + # If the indexed array becomes a scalar, return LazilyIndexedArray if all(isinstance(ind, integer_types) for ind in indexer.tuple): key = BasicIndexer(tuple(k[indexer.tuple] for k in self.key.tuple)) - return LazilyOuterIndexedArray(self.array, key) + return LazilyIndexedArray(self.array, key) return type(self)(self.array, self._updated_key(indexer)) def transpose(self, order): diff --git a/xarray/core/utils.py b/xarray/core/utils.py index 9648458ec6d..f9c8523306c 100644 --- a/xarray/core/utils.py +++ b/xarray/core/utils.py @@ -4,7 +4,6 @@ import functools import io import itertools -import os.path import re import warnings from enum import Enum @@ -671,11 +670,6 @@ def read_magic_number(filename_or_obj, count=8): return magic_number -def is_grib_path(path: str) -> bool: - _, ext = os.path.splitext(path) - return ext in [".grib", ".grb", ".grib2", ".grb2"] - - def is_uniform_spaced(arr, **kwargs) -> bool: """Return True if values of an array are uniformly spaced and sorted. diff --git a/xarray/core/variable.py b/xarray/core/variable.py index c59cbf1f3e4..debc53e0bf3 100644 --- a/xarray/core/variable.py +++ b/xarray/core/variable.py @@ -179,7 +179,7 @@ def _maybe_wrap_data(data): Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure they can be indexed properly. 
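The new ``GroupBy.__getitem__`` pairs with ``groups`` to give direct access to a single group. A sketch on illustrative data, not from this patch:

```python
import numpy as np
import xarray as xr

da = xr.DataArray(
    np.arange(6),
    dims="x",
    coords={"letters": ("x", ["a", "b", "a", "b", "a", "b"])},
)
grouped = da.groupby("letters")

grouped.groups  # {'a': [0, 2, 4], 'b': [1, 3, 5]}
grouped["a"]    # the sub-array at positions 0, 2 and 4
```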
- NumpyArrayAdapter, PandasIndexAdapter and LazilyOuterIndexedArray should + NumpyArrayAdapter, PandasIndexAdapter and LazilyIndexedArray should all pass through unmodified. """ if isinstance(data, pd.Index): diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py index 4b47e1d2c7e..aebcb0f2b8d 100644 --- a/xarray/tests/__init__.py +++ b/xarray/tests/__init__.py @@ -83,6 +83,7 @@ def LooseVersion(vstring): has_cartopy, requires_cartopy = _importorskip("cartopy") # Need Pint 0.15 for __dask_tokenize__ tests for Quantity wrapped Dask Arrays has_pint_0_15, requires_pint_0_15 = _importorskip("pint", minversion="0.15") +has_numexpr, requires_numexpr = _importorskip("numexpr") # some special cases has_scipy_or_netCDF4 = has_scipy or has_netCDF4 diff --git a/xarray/tests/test_accessor_dt.py b/xarray/tests/test_accessor_dt.py index 984bfc763bc..adfa2bed33b 100644 --- a/xarray/tests/test_accessor_dt.py +++ b/xarray/tests/test_accessor_dt.py @@ -59,6 +59,8 @@ def setup(self): "weekday", "dayofyear", "quarter", + "date", + "time", "is_month_start", "is_month_end", "is_quarter_start", @@ -144,6 +146,8 @@ def test_not_datetime_type(self): "weekday", "dayofyear", "quarter", + "date", + "time", "is_month_start", "is_month_end", "is_quarter_start", @@ -430,6 +434,16 @@ def test_isocalendar_cftime(data): data.time.dt.isocalendar() +@requires_cftime +def test_date_cftime(data): + + with raises_regex( + AttributeError, + r"'CFTimeIndex' object has no attribute `date`. Consider using the floor method instead, for instance: `.time.dt.floor\('D'\)`.", + ): + data.time.dt.date() + + @requires_cftime @pytest.mark.filterwarnings("ignore::RuntimeWarning") def test_cftime_strftime_access(data): diff --git a/xarray/tests/test_accessor_str.py b/xarray/tests/test_accessor_str.py index e0cbdb7377a..519ca762c41 100644 --- a/xarray/tests/test_accessor_str.py +++ b/xarray/tests/test_accessor_str.py @@ -44,7 +44,7 @@ import xarray as xr -from . import assert_equal, requires_dask +from . 
import assert_equal, assert_identical, requires_dask @pytest.fixture(params=[np.str_, np.bytes_]) @@ -61,97 +61,363 @@ def test_dask(): result = xarr.str.len().compute() expected = xr.DataArray([1, 1, 1]) + assert result.dtype == expected.dtype assert_equal(result, expected) def test_count(dtype): values = xr.DataArray(["foo", "foofoo", "foooofooofommmfoo"]).astype(dtype) - result = values.str.count("f[o]+") + pat_str = dtype(r"f[o]+") + pat_re = re.compile(pat_str) + + result_str = values.str.count(pat_str) + result_re = values.str.count(pat_re) + expected = xr.DataArray([1, 2, 4]) - assert_equal(result, expected) + + assert result_str.dtype == expected.dtype + assert result_re.dtype == expected.dtype + assert_equal(result_str, expected) + assert_equal(result_re, expected) + + +def test_count_broadcast(dtype): + values = xr.DataArray(["foo", "foofoo", "foooofooofommmfoo"]).astype(dtype) + pat_str = np.array([r"f[o]+", r"o", r"m"]).astype(dtype) + pat_re = np.array([re.compile(x) for x in pat_str]) + + result_str = values.str.count(pat_str) + result_re = values.str.count(pat_re) + + expected = xr.DataArray([1, 4, 3]) + + assert result_str.dtype == expected.dtype + assert result_re.dtype == expected.dtype + assert_equal(result_str, expected) + assert_equal(result_re, expected) def test_contains(dtype): values = xr.DataArray(["Foo", "xYz", "fOOomMm__fOo", "MMM_"]).astype(dtype) + # case insensitive using regex - result = values.str.contains("FOO|mmm", case=False) + pat = values.dtype.type("FOO|mmm") + result = values.str.contains(pat, case=False) expected = xr.DataArray([True, False, True, True]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + result = values.str.contains(re.compile(pat, flags=re.IGNORECASE)) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # case sensitive using regex + pat = values.dtype.type("Foo|mMm") + result = values.str.contains(pat) + expected = xr.DataArray([True, False, True, False]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + result = values.str.contains(re.compile(pat)) + assert result.dtype == expected.dtype assert_equal(result, expected) + # case insensitive without regex result = values.str.contains("foo", regex=False, case=False) expected = xr.DataArray([True, False, True, False]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # case sensitive without regex + result = values.str.contains("fO", regex=False, case=True) + expected = xr.DataArray([False, False, True, False]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # regex regex=False + pat_re = re.compile("(/w+)") + with pytest.raises( + ValueError, + match="Must use regular expression matching for regular expression object.", + ): + values.str.contains(pat_re, regex=False) + + +def test_contains_broadcast(dtype): + values = xr.DataArray(["Foo", "xYz", "fOOomMm__fOo", "MMM_"], dims="X").astype( + dtype + ) + pat_str = xr.DataArray(["FOO|mmm", "Foo", "MMM"], dims="Y").astype(dtype) + pat_re = xr.DataArray([re.compile(x) for x in pat_str.data], dims="Y") + + # case insensitive using regex + result = values.str.contains(pat_str, case=False) + expected = xr.DataArray( + [ + [True, True, False], + [False, False, False], + [True, True, True], + [True, False, True], + ], + dims=["X", "Y"], + ) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # case sensitive using regex + result = values.str.contains(pat_str) + expected = xr.DataArray( + [ + 
[False, True, False], + [False, False, False], + [False, False, False], + [False, False, True], + ], + dims=["X", "Y"], + ) + assert result.dtype == expected.dtype + assert_equal(result, expected) + result = values.str.contains(pat_re) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # case insensitive without regex + result = values.str.contains(pat_str, regex=False, case=False) + expected = xr.DataArray( + [ + [False, True, False], + [False, False, False], + [False, True, True], + [False, False, True], + ], + dims=["X", "Y"], + ) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # case insensitive with regex + result = values.str.contains(pat_str, regex=False, case=True) + expected = xr.DataArray( + [ + [False, True, False], + [False, False, False], + [False, False, False], + [False, False, True], + ], + dims=["X", "Y"], + ) + assert result.dtype == expected.dtype assert_equal(result, expected) def test_starts_ends_with(dtype): values = xr.DataArray(["om", "foo_nom", "nom", "bar_foo", "foo"]).astype(dtype) + result = values.str.startswith("foo") expected = xr.DataArray([False, True, False, False, True]) + assert result.dtype == expected.dtype assert_equal(result, expected) + result = values.str.endswith("foo") expected = xr.DataArray([False, False, False, True, True]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_starts_ends_with_broadcast(dtype): + values = xr.DataArray( + ["om", "foo_nom", "nom", "bar_foo", "foo_bar"], dims="X" + ).astype(dtype) + pat = xr.DataArray(["foo", "bar"], dims="Y").astype(dtype) + + result = values.str.startswith(pat) + expected = xr.DataArray( + [[False, False], [True, False], [False, False], [False, True], [True, False]], + dims=["X", "Y"], + ) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.endswith(pat) + expected = xr.DataArray( + [[False, False], [False, False], [False, False], [True, False], [False, True]], + dims=["X", "Y"], + ) + assert result.dtype == expected.dtype assert_equal(result, expected) -def test_case(dtype): - da = xr.DataArray(["SOme word"]).astype(dtype) - capitalized = xr.DataArray(["Some word"]).astype(dtype) - lowered = xr.DataArray(["some word"]).astype(dtype) - swapped = xr.DataArray(["soME WORD"]).astype(dtype) - titled = xr.DataArray(["Some Word"]).astype(dtype) - uppered = xr.DataArray(["SOME WORD"]).astype(dtype) - assert_equal(da.str.capitalize(), capitalized) - assert_equal(da.str.lower(), lowered) - assert_equal(da.str.swapcase(), swapped) - assert_equal(da.str.title(), titled) - assert_equal(da.str.upper(), uppered) +def test_case_bytes(): + value = xr.DataArray(["SOme wOrd"]).astype(np.bytes_) + + exp_capitalized = xr.DataArray(["Some word"]).astype(np.bytes_) + exp_lowered = xr.DataArray(["some word"]).astype(np.bytes_) + exp_swapped = xr.DataArray(["soME WoRD"]).astype(np.bytes_) + exp_titled = xr.DataArray(["Some Word"]).astype(np.bytes_) + exp_uppered = xr.DataArray(["SOME WORD"]).astype(np.bytes_) + + res_capitalized = value.str.capitalize() + res_lowered = value.str.lower() + res_swapped = value.str.swapcase() + res_titled = value.str.title() + res_uppered = value.str.upper() + + assert res_capitalized.dtype == exp_capitalized.dtype + assert res_lowered.dtype == exp_lowered.dtype + assert res_swapped.dtype == exp_swapped.dtype + assert res_titled.dtype == exp_titled.dtype + assert res_uppered.dtype == exp_uppered.dtype + + assert_equal(res_capitalized, exp_capitalized) + 
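The pattern behind these updated tests, in isolation: results are checked for dtype as well as values, for both str and bytes inputs. A self-contained sketch (the test name is invented):

```python
import numpy as np
import pytest
import xarray as xr
from xarray.testing import assert_equal


@pytest.fixture(params=[np.str_, np.bytes_])
def dtype(request):
    return request.param


def test_len_keeps_result_dtype(dtype):
    values = xr.DataArray(["foo", "ab"]).astype(dtype)
    result = values.str.len()
    expected = xr.DataArray([3, 2])
    # The dtype is asserted in addition to the values, as in the tests above.
    assert result.dtype == expected.dtype
    assert_equal(result, expected)
```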
assert_equal(res_lowered, exp_lowered) + assert_equal(res_swapped, exp_swapped) + assert_equal(res_titled, exp_titled) + assert_equal(res_uppered, exp_uppered) + + +def test_case_str(): + # This string includes some unicode characters + # that are common case management corner cases + value = xr.DataArray(["SOme wOrd DŽ ß ᾛ ΣΣ ffi⁵Å Ç Ⅰ"]).astype(np.unicode_) + + exp_capitalized = xr.DataArray(["Some word dž ß ᾓ σς ffi⁵å ç ⅰ"]).astype(np.unicode_) + exp_lowered = xr.DataArray(["some word dž ß ᾓ σς ffi⁵å ç ⅰ"]).astype(np.unicode_) + exp_swapped = xr.DataArray(["soME WoRD dž SS ᾛ σς FFI⁵å ç ⅰ"]).astype(np.unicode_) + exp_titled = xr.DataArray(["Some Word Dž Ss ᾛ Σς Ffi⁵Å Ç Ⅰ"]).astype(np.unicode_) + exp_uppered = xr.DataArray(["SOME WORD DŽ SS ἫΙ ΣΣ FFI⁵Å Ç Ⅰ"]).astype(np.unicode_) + exp_casefolded = xr.DataArray(["some word dž ss ἣι σσ ffi⁵å ç ⅰ"]).astype( + np.unicode_ + ) + + exp_norm_nfc = xr.DataArray(["SOme wOrd DŽ ß ᾛ ΣΣ ffi⁵Å Ç Ⅰ"]).astype(np.unicode_) + exp_norm_nfkc = xr.DataArray(["SOme wOrd DŽ ß ᾛ ΣΣ ffi5Å Ç I"]).astype(np.unicode_) + exp_norm_nfd = xr.DataArray(["SOme wOrd DŽ ß ᾛ ΣΣ ffi⁵Å Ç Ⅰ"]).astype(np.unicode_) + exp_norm_nfkd = xr.DataArray(["SOme wOrd DŽ ß ᾛ ΣΣ ffi5Å Ç I"]).astype( + np.unicode_ + ) + + res_capitalized = value.str.capitalize() + res_casefolded = value.str.casefold() + res_lowered = value.str.lower() + res_swapped = value.str.swapcase() + res_titled = value.str.title() + res_uppered = value.str.upper() + + res_norm_nfc = value.str.normalize("NFC") + res_norm_nfd = value.str.normalize("NFD") + res_norm_nfkc = value.str.normalize("NFKC") + res_norm_nfkd = value.str.normalize("NFKD") + + assert res_capitalized.dtype == exp_capitalized.dtype + assert res_casefolded.dtype == exp_casefolded.dtype + assert res_lowered.dtype == exp_lowered.dtype + assert res_swapped.dtype == exp_swapped.dtype + assert res_titled.dtype == exp_titled.dtype + assert res_uppered.dtype == exp_uppered.dtype + + assert res_norm_nfc.dtype == exp_norm_nfc.dtype + assert res_norm_nfd.dtype == exp_norm_nfd.dtype + assert res_norm_nfkc.dtype == exp_norm_nfkc.dtype + assert res_norm_nfkd.dtype == exp_norm_nfkd.dtype + + assert_equal(res_capitalized, exp_capitalized) + assert_equal(res_casefolded, exp_casefolded) + assert_equal(res_lowered, exp_lowered) + assert_equal(res_swapped, exp_swapped) + assert_equal(res_titled, exp_titled) + assert_equal(res_uppered, exp_uppered) + + assert_equal(res_norm_nfc, exp_norm_nfc) + assert_equal(res_norm_nfd, exp_norm_nfd) + assert_equal(res_norm_nfkc, exp_norm_nfkc) + assert_equal(res_norm_nfkd, exp_norm_nfkd) def test_replace(dtype): - values = xr.DataArray(["fooBAD__barBAD"]).astype(dtype) + values = xr.DataArray(["fooBAD__barBAD"], dims=["x"]).astype(dtype) result = values.str.replace("BAD[_]*", "") - expected = xr.DataArray(["foobar"]).astype(dtype) + expected = xr.DataArray(["foobar"], dims=["x"]).astype(dtype) + assert result.dtype == expected.dtype assert_equal(result, expected) result = values.str.replace("BAD[_]*", "", n=1) - expected = xr.DataArray(["foobarBAD"]).astype(dtype) + expected = xr.DataArray(["foobarBAD"], dims=["x"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + pat = xr.DataArray(["BAD[_]*", "AD[_]*"], dims=["y"]).astype(dtype) + result = values.str.replace(pat, "") + expected = xr.DataArray([["foobar", "fooBbarB"]], dims=["x", "y"]).astype(dtype) + assert result.dtype == expected.dtype assert_equal(result, expected) - s = xr.DataArray(["A", "B", "C", "Aaba", "Baca", "", "CABA", "dog", 
"cat"]).astype( + repl = xr.DataArray(["", "spam"], dims=["y"]).astype(dtype) + result = values.str.replace(pat, repl, n=1) + expected = xr.DataArray([["foobarBAD", "fooBspambarBAD"]], dims=["x", "y"]).astype( dtype ) - result = s.str.replace("A", "YYY") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + values = xr.DataArray( + ["A", "B", "C", "Aaba", "Baca", "", "CABA", "dog", "cat"] + ).astype(dtype) expected = xr.DataArray( ["YYY", "B", "C", "YYYaba", "Baca", "", "CYYYBYYY", "dog", "cat"] ).astype(dtype) + result = values.str.replace("A", "YYY") + assert result.dtype == expected.dtype + assert_equal(result, expected) + result = values.str.replace("A", "YYY", regex=False) + assert result.dtype == expected.dtype assert_equal(result, expected) - result = s.str.replace("A", "YYY", case=False) + result = values.str.replace("A", "YYY", case=False) expected = xr.DataArray( ["YYY", "B", "C", "YYYYYYbYYY", "BYYYcYYY", "", "CYYYBYYY", "dog", "cYYYt"] ).astype(dtype) + assert result.dtype == expected.dtype assert_equal(result, expected) - result = s.str.replace("^.a|dog", "XX-XX ", case=False) + result = values.str.replace("^.a|dog", "XX-XX ", case=False) expected = xr.DataArray( ["A", "B", "C", "XX-XX ba", "XX-XX ca", "", "XX-XX BA", "XX-XX ", "XX-XX t"] ).astype(dtype) + assert result.dtype == expected.dtype assert_equal(result, expected) def test_replace_callable(): values = xr.DataArray(["fooBAD__barBAD"]) + # test with callable repl = lambda m: m.group(0).swapcase() result = values.str.replace("[a-z][A-Z]{2}", repl, n=2) exp = xr.DataArray(["foObaD__baRbaD"]) + assert result.dtype == exp.dtype assert_equal(result, exp) + # test regex named groups values = xr.DataArray(["Foo Bar Baz"]) pat = r"(?P\w+) (?P\w+) (?P\w+)" repl = lambda m: m.group("middle").swapcase() result = values.str.replace(pat, repl) exp = xr.DataArray(["bAR"]) + assert result.dtype == exp.dtype + assert_equal(result, exp) + + # test broadcast + values = xr.DataArray(["Foo Bar Baz"], dims=["x"]) + pat = r"(?P\w+) (?P\w+) (?P\w+)" + repl = xr.DataArray( + [ + lambda m: m.group("first").swapcase(), + lambda m: m.group("middle").swapcase(), + lambda m: m.group("last").swapcase(), + ], + dims=["Y"], + ) + result = values.str.replace(pat, repl) + exp = xr.DataArray([["fOO", "bAR", "bAZ"]], dims=["x", "Y"]) + assert result.dtype == exp.dtype assert_equal(result, exp) @@ -161,19 +427,54 @@ def test_replace_unicode(): expected = xr.DataArray([b"abcd, \xc3\xa0".decode("utf-8")]) pat = re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE) result = values.str.replace(pat, ", ") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # broadcast version + values = xr.DataArray([b"abcd,\xc3\xa0".decode("utf-8")], dims=["X"]) + expected = xr.DataArray( + [[b"abcd, \xc3\xa0".decode("utf-8"), b"BAcd,\xc3\xa0".decode("utf-8")]], + dims=["X", "Y"], + ) + pat = xr.DataArray( + [re.compile(r"(?<=\w),(?=\w)", flags=re.UNICODE), r"ab"], dims=["Y"] + ) + repl = xr.DataArray([", ", "BA"], dims=["Y"]) + result = values.str.replace(pat, repl) + assert result.dtype == expected.dtype assert_equal(result, expected) def test_replace_compiled_regex(dtype): - values = xr.DataArray(["fooBAD__barBAD"]).astype(dtype) + values = xr.DataArray(["fooBAD__barBAD"], dims=["x"]).astype(dtype) + # test with compiled regex pat = re.compile(dtype("BAD[_]*")) result = values.str.replace(pat, "") - expected = xr.DataArray(["foobar"]).astype(dtype) + expected = xr.DataArray(["foobar"], dims=["x"]).astype(dtype) + assert 
result.dtype == expected.dtype assert_equal(result, expected) result = values.str.replace(pat, "", n=1) - expected = xr.DataArray(["foobarBAD"]).astype(dtype) + expected = xr.DataArray(["foobarBAD"], dims=["x"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # broadcast + pat = xr.DataArray( + [re.compile(dtype("BAD[_]*")), re.compile(dtype("AD[_]*"))], dims=["y"] + ) + result = values.str.replace(pat, "") + expected = xr.DataArray([["foobar", "fooBbarB"]], dims=["x", "y"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + repl = xr.DataArray(["", "spam"], dims=["y"]).astype(dtype) + result = values.str.replace(pat, repl, n=1) + expected = xr.DataArray([["foobarBAD", "fooBspambarBAD"]], dims=["x", "y"]).astype( + dtype + ) + assert result.dtype == expected.dtype assert_equal(result, expected) # case and flags provided to str.replace will have no effect @@ -181,13 +482,19 @@ def test_replace_compiled_regex(dtype): values = xr.DataArray(["fooBAD__barBAD__bad"]).astype(dtype) pat = re.compile(dtype("BAD[_]*")) - with pytest.raises(ValueError, match="case and flags cannot be"): + with pytest.raises( + ValueError, match="Flags cannot be set when pat is a compiled regex." + ): result = values.str.replace(pat, "", flags=re.IGNORECASE) - with pytest.raises(ValueError, match="case and flags cannot be"): + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." + ): result = values.str.replace(pat, "", case=False) - with pytest.raises(ValueError, match="case and flags cannot be"): + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." + ): result = values.str.replace(pat, "", case=True) # test with callable @@ -196,18 +503,37 @@ def test_replace_compiled_regex(dtype): pat = re.compile(dtype("[a-z][A-Z]{2}")) result = values.str.replace(pat, repl, n=2) expected = xr.DataArray(["foObaD__baRbaD"]).astype(dtype) + assert result.dtype == expected.dtype assert_equal(result, expected) def test_replace_literal(dtype): # GH16808 literal replace (regex=False vs regex=True) - values = xr.DataArray(["f.o", "foo"]).astype(dtype) - expected = xr.DataArray(["bao", "bao"]).astype(dtype) + values = xr.DataArray(["f.o", "foo"], dims=["X"]).astype(dtype) + expected = xr.DataArray(["bao", "bao"], dims=["X"]).astype(dtype) result = values.str.replace("f.", "ba") + assert result.dtype == expected.dtype assert_equal(result, expected) - expected = xr.DataArray(["bao", "foo"]).astype(dtype) + expected = xr.DataArray(["bao", "foo"], dims=["X"]).astype(dtype) result = values.str.replace("f.", "ba", regex=False) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # Broadcast + pat = xr.DataArray(["f.", ".o"], dims=["yy"]).astype(dtype) + expected = xr.DataArray([["bao", "fba"], ["bao", "bao"]], dims=["X", "yy"]).astype( + dtype + ) + result = values.str.replace(pat, "ba") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + expected = xr.DataArray([["bao", "fba"], ["foo", "foo"]], dims=["X", "yy"]).astype( + dtype + ) + result = values.str.replace(pat, "ba", regex=False) + assert result.dtype == expected.dtype assert_equal(result, expected) # Cannot do a literal replace if given a callable repl or compiled @@ -224,479 +550,3062 @@ def test_replace_literal(dtype): values.str.replace(compiled_pat, "", regex=False) -def test_repeat(dtype): - values = xr.DataArray(["a", "b", "c", "d"]).astype(dtype) - result = 
values.str.repeat(3) - expected = xr.DataArray(["aaa", "bbb", "ccc", "ddd"]).astype(dtype) - assert_equal(result, expected) +def test_extract_extractall_findall_empty_raises(dtype): + pat_str = dtype(r".*") + pat_re = re.compile(pat_str) + value = xr.DataArray([["a"]], dims=["X", "Y"]).astype(dtype) -def test_match(dtype): - # New match behavior introduced in 0.13 - values = xr.DataArray(["fooBAD__barBAD", "foo"]).astype(dtype) - result = values.str.match(".*(BAD[_]+).*(BAD)") - expected = xr.DataArray([True, False]) - assert_equal(result, expected) + with pytest.raises(ValueError, match="No capture groups found in pattern."): + value.str.extract(pat=pat_str, dim="ZZ") - values = xr.DataArray(["fooBAD__barBAD", "foo"]).astype(dtype) - result = values.str.match(".*BAD[_]+.*BAD") - expected = xr.DataArray([True, False]) - assert_equal(result, expected) + with pytest.raises(ValueError, match="No capture groups found in pattern."): + value.str.extract(pat=pat_re, dim="ZZ") + with pytest.raises(ValueError, match="No capture groups found in pattern."): + value.str.extractall(pat=pat_str, group_dim="XX", match_dim="YY") -def test_empty_str_methods(): - empty = xr.DataArray(np.empty(shape=(0,), dtype="U")) - empty_str = empty - empty_int = xr.DataArray(np.empty(shape=(0,), dtype=int)) - empty_bool = xr.DataArray(np.empty(shape=(0,), dtype=bool)) - empty_bytes = xr.DataArray(np.empty(shape=(0,), dtype="S")) + with pytest.raises(ValueError, match="No capture groups found in pattern."): + value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") - assert_equal(empty_str, empty.str.title()) - assert_equal(empty_int, empty.str.count("a")) - assert_equal(empty_bool, empty.str.contains("a")) - assert_equal(empty_bool, empty.str.startswith("a")) - assert_equal(empty_bool, empty.str.endswith("a")) - assert_equal(empty_str, empty.str.lower()) - assert_equal(empty_str, empty.str.upper()) - assert_equal(empty_str, empty.str.replace("a", "b")) - assert_equal(empty_str, empty.str.repeat(3)) - assert_equal(empty_bool, empty.str.match("^a")) - assert_equal(empty_int, empty.str.len()) - assert_equal(empty_int, empty.str.find("a")) - assert_equal(empty_int, empty.str.rfind("a")) - assert_equal(empty_str, empty.str.pad(42)) - assert_equal(empty_str, empty.str.center(42)) - assert_equal(empty_str, empty.str.slice(stop=1)) - assert_equal(empty_str, empty.str.slice(step=1)) - assert_equal(empty_str, empty.str.strip()) - assert_equal(empty_str, empty.str.lstrip()) - assert_equal(empty_str, empty.str.rstrip()) - assert_equal(empty_str, empty.str.wrap(42)) - assert_equal(empty_str, empty.str.get(0)) - assert_equal(empty_str, empty_bytes.str.decode("ascii")) - assert_equal(empty_bytes, empty.str.encode("ascii")) - assert_equal(empty_str, empty.str.isalnum()) - assert_equal(empty_str, empty.str.isalpha()) - assert_equal(empty_str, empty.str.isdigit()) - assert_equal(empty_str, empty.str.isspace()) - assert_equal(empty_str, empty.str.islower()) - assert_equal(empty_str, empty.str.isupper()) - assert_equal(empty_str, empty.str.istitle()) - assert_equal(empty_str, empty.str.isnumeric()) - assert_equal(empty_str, empty.str.isdecimal()) - assert_equal(empty_str, empty.str.capitalize()) - assert_equal(empty_str, empty.str.swapcase()) - table = str.maketrans("a", "b") - assert_equal(empty_str, empty.str.translate(table)) + with pytest.raises(ValueError, match="No capture groups found in pattern."): + value.str.findall(pat=pat_str) + with pytest.raises(ValueError, match="No capture groups found in pattern."): + 
value.str.findall(pat=pat_re) -def test_ismethods(dtype): - values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] - str_s = xr.DataArray(values).astype(dtype) - alnum_e = [True, True, True, True, True, False, True, True, False, False] - alpha_e = [True, True, True, False, False, False, True, False, False, False] - digit_e = [False, False, False, True, False, False, False, True, False, False] - space_e = [False, False, False, False, False, False, False, False, False, True] - lower_e = [False, True, False, False, False, False, False, False, False, False] - upper_e = [True, False, False, False, True, False, True, False, False, False] - title_e = [True, False, True, False, True, False, False, False, False, False] - - assert_equal(str_s.str.isalnum(), xr.DataArray(alnum_e)) - assert_equal(str_s.str.isalpha(), xr.DataArray(alpha_e)) - assert_equal(str_s.str.isdigit(), xr.DataArray(digit_e)) - assert_equal(str_s.str.isspace(), xr.DataArray(space_e)) - assert_equal(str_s.str.islower(), xr.DataArray(lower_e)) - assert_equal(str_s.str.isupper(), xr.DataArray(upper_e)) - assert_equal(str_s.str.istitle(), xr.DataArray(title_e)) +def test_extract_multi_None_raises(dtype): + pat_str = r"(\w+)_(\d+)" + pat_re = re.compile(pat_str) -def test_isnumeric(): - # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER - # 0x2605: ★ not number - # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY - # 0xFF13: 3 Em 3 - values = ["A", "3", "¼", "★", "፸", "3", "four"] - s = xr.DataArray(values) - numeric_e = [False, True, True, False, True, True, False] - decimal_e = [False, True, False, False, False, True, False] - assert_equal(s.str.isnumeric(), xr.DataArray(numeric_e)) - assert_equal(s.str.isdecimal(), xr.DataArray(decimal_e)) + value = xr.DataArray([["a_b"]], dims=["X", "Y"]).astype(dtype) + with pytest.raises( + ValueError, + match="Dimension must be specified if more than one capture group is given.", + ): + value.str.extract(pat=pat_str, dim=None) -def test_len(dtype): - values = ["foo", "fooo", "fooooo", "fooooooo"] - result = xr.DataArray(values).astype(dtype).str.len() - expected = xr.DataArray([len(x) for x in values]) - assert_equal(result, expected) + with pytest.raises( + ValueError, + match="Dimension must be specified if more than one capture group is given.", + ): + value.str.extract(pat=pat_re, dim=None) -def test_find(dtype): - values = xr.DataArray(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXX"]) - values = values.astype(dtype) - result = values.str.find("EF") - assert_equal(result, xr.DataArray([4, 3, 1, 0, -1])) - expected = xr.DataArray([v.find(dtype("EF")) for v in values.values]) - assert_equal(result, expected) +def test_extract_extractall_findall_case_re_raises(dtype): + pat_str = r".*" + pat_re = re.compile(pat_str) - result = values.str.rfind("EF") - assert_equal(result, xr.DataArray([4, 5, 7, 4, -1])) - expected = xr.DataArray([v.rfind(dtype("EF")) for v in values.values]) - assert_equal(result, expected) + value = xr.DataArray([["a"]], dims=["X", "Y"]).astype(dtype) - result = values.str.find("EF", 3) - assert_equal(result, xr.DataArray([4, 3, 7, 4, -1])) - expected = xr.DataArray([v.find(dtype("EF"), 3) for v in values.values]) - assert_equal(result, expected) + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." 
+ ): + value.str.extract(pat=pat_re, case=True, dim="ZZ") - result = values.str.rfind("EF", 3) - assert_equal(result, xr.DataArray([4, 5, 7, 4, -1])) - expected = xr.DataArray([v.rfind(dtype("EF"), 3) for v in values.values]) - assert_equal(result, expected) + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." + ): + value.str.extract(pat=pat_re, case=False, dim="ZZ") - result = values.str.find("EF", 3, 6) - assert_equal(result, xr.DataArray([4, 3, -1, 4, -1])) - expected = xr.DataArray([v.find(dtype("EF"), 3, 6) for v in values.values]) - assert_equal(result, expected) + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." + ): + value.str.extractall(pat=pat_re, case=True, group_dim="XX", match_dim="YY") - result = values.str.rfind("EF", 3, 6) - assert_equal(result, xr.DataArray([4, 3, -1, 4, -1])) - xp = xr.DataArray([v.rfind(dtype("EF"), 3, 6) for v in values.values]) - assert_equal(result, xp) + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." + ): + value.str.extractall(pat=pat_re, case=False, group_dim="XX", match_dim="YY") + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." + ): + value.str.findall(pat=pat_re, case=True) -def test_index(dtype): - s = xr.DataArray(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]).astype(dtype) + with pytest.raises( + ValueError, match="Case cannot be set when pat is a compiled regex." + ): + value.str.findall(pat=pat_re, case=False) - result = s.str.index("EF") - assert_equal(result, xr.DataArray([4, 3, 1, 0])) - result = s.str.rindex("EF") - assert_equal(result, xr.DataArray([4, 5, 7, 4])) +def test_extract_extractall_name_collision_raises(dtype): + pat_str = r"(\w+)" + pat_re = re.compile(pat_str) - result = s.str.index("EF", 3) - assert_equal(result, xr.DataArray([4, 3, 7, 4])) + value = xr.DataArray([["a"]], dims=["X", "Y"]).astype(dtype) - result = s.str.rindex("EF", 3) - assert_equal(result, xr.DataArray([4, 5, 7, 4])) + with pytest.raises(KeyError, match="Dimension 'X' already present in DataArray."): + value.str.extract(pat=pat_str, dim="X") - result = s.str.index("E", 4, 8) - assert_equal(result, xr.DataArray([4, 5, 7, 4])) + with pytest.raises(KeyError, match="Dimension 'X' already present in DataArray."): + value.str.extract(pat=pat_re, dim="X") - result = s.str.rindex("E", 0, 5) - assert_equal(result, xr.DataArray([4, 3, 1, 4])) + with pytest.raises( + KeyError, match="Group dimension 'X' already present in DataArray." + ): + value.str.extractall(pat=pat_str, group_dim="X", match_dim="ZZ") - with pytest.raises(ValueError): - result = s.str.index("DE") + with pytest.raises( + KeyError, match="Group dimension 'X' already present in DataArray." + ): + value.str.extractall(pat=pat_re, group_dim="X", match_dim="YY") + with pytest.raises( + KeyError, match="Match dimension 'Y' already present in DataArray." + ): + value.str.extractall(pat=pat_str, group_dim="XX", match_dim="Y") -def test_pad(dtype): - values = xr.DataArray(["a", "b", "c", "eeeee"]).astype(dtype) + with pytest.raises( + KeyError, match="Match dimension 'Y' already present in DataArray." + ): + value.str.extractall(pat=pat_re, group_dim="XX", match_dim="Y") - result = values.str.pad(5, side="left") - expected = xr.DataArray([" a", " b", " c", "eeeee"]).astype(dtype) - assert_equal(result, expected) + with pytest.raises( + KeyError, match="Group dimension 'ZZ' is the same as match dimension 'ZZ'." 
+ ): + value.str.extractall(pat=pat_str, group_dim="ZZ", match_dim="ZZ") - result = values.str.pad(5, side="right") - expected = xr.DataArray(["a ", "b ", "c ", "eeeee"]).astype(dtype) - assert_equal(result, expected) + with pytest.raises( + KeyError, match="Group dimension 'ZZ' is the same as match dimension 'ZZ'." + ): + value.str.extractall(pat=pat_re, group_dim="ZZ", match_dim="ZZ") - result = values.str.pad(5, side="both") - expected = xr.DataArray([" a ", " b ", " c ", "eeeee"]).astype(dtype) - assert_equal(result, expected) +def test_extract_single_case(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re) -def test_pad_fillchar(dtype): - values = xr.DataArray(["a", "b", "c", "eeeee"]).astype(dtype) + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) - result = values.str.pad(5, side="left", fillchar="X") - expected = xr.DataArray(["XXXXa", "XXXXb", "XXXXc", "eeeee"]).astype(dtype) - assert_equal(result, expected) + targ_none = xr.DataArray( + [["a", "bab", "abc"], ["abcd", "", "abcdef"]], dims=["X", "Y"] + ).astype(dtype) + targ_dim = xr.DataArray( + [[["a"], ["bab"], ["abc"]], [["abcd"], [""], ["abcdef"]]], dims=["X", "Y", "XX"] + ).astype(dtype) - result = values.str.pad(5, side="right", fillchar="X") - expected = xr.DataArray(["aXXXX", "bXXXX", "cXXXX", "eeeee"]).astype(dtype) - assert_equal(result, expected) + res_str_none = value.str.extract(pat=pat_str, dim=None) + res_str_dim = value.str.extract(pat=pat_str, dim="XX") + res_str_none_case = value.str.extract(pat=pat_str, dim=None, case=True) + res_str_dim_case = value.str.extract(pat=pat_str, dim="XX", case=True) + res_re_none = value.str.extract(pat=pat_re, dim=None) + res_re_dim = value.str.extract(pat=pat_re, dim="XX") + + assert res_str_none.dtype == targ_none.dtype + assert res_str_dim.dtype == targ_dim.dtype + assert res_str_none_case.dtype == targ_none.dtype + assert res_str_dim_case.dtype == targ_dim.dtype + assert res_re_none.dtype == targ_none.dtype + assert res_re_dim.dtype == targ_dim.dtype + + assert_equal(res_str_none, targ_none) + assert_equal(res_str_dim, targ_dim) + assert_equal(res_str_none_case, targ_none) + assert_equal(res_str_dim_case, targ_dim) + assert_equal(res_re_none, targ_none) + assert_equal(res_re_dim, targ_dim) + + +def test_extract_single_nocase(dtype): + pat_str = r"(\w+)?_Xy_\d*" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re, flags=re.IGNORECASE) + + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "_Xy_1", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) - result = values.str.pad(5, side="both", fillchar="X") - expected = xr.DataArray(["XXaXX", "XXbXX", "XXcXX", "eeeee"]).astype(dtype) - assert_equal(result, expected) + targ_none = xr.DataArray( + [["a", "ab", "abc"], ["abcd", "", "abcdef"]], dims=["X", "Y"] + ).astype(dtype) + targ_dim = xr.DataArray( + [[["a"], ["ab"], ["abc"]], [["abcd"], [""], ["abcdef"]]], dims=["X", "Y", "XX"] + ).astype(dtype) - msg = "fillchar must be a character, not str" - with pytest.raises(TypeError, match=msg): - result = values.str.pad(5, fillchar="XY") + res_str_none = 
value.str.extract(pat=pat_str, dim=None, case=False)
+ res_str_dim = value.str.extract(pat=pat_str, dim="XX", case=False)
+ res_re_none = value.str.extract(pat=pat_re, dim=None)
+ res_re_dim = value.str.extract(pat=pat_re, dim="XX")
+ assert res_str_none.dtype == targ_none.dtype
+ assert res_str_dim.dtype == targ_dim.dtype
+ assert res_re_none.dtype == targ_none.dtype
+ assert res_re_dim.dtype == targ_dim.dtype
-def test_translate():
- values = xr.DataArray(["abcdefg", "abcc", "cdddfg", "cdefggg"])
- table = str.maketrans("abc", "cde")
- result = values.str.translate(table)
- expected = xr.DataArray(["cdedefg", "cdee", "edddfg", "edefggg"])
- assert_equal(result, expected)
+ assert_equal(res_str_none, targ_none)
+ assert_equal(res_str_dim, targ_dim)
+ assert_equal(res_re_none, targ_none)
+ assert_equal(res_re_dim, targ_dim)
-def test_center_ljust_rjust(dtype):
- values = xr.DataArray(["a", "b", "c", "eeeee"]).astype(dtype)
+def test_extract_multi_case(dtype):
+ pat_str = r"(\w+)_Xy_(\d*)"
+ pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8")
+ pat_re = re.compile(pat_re)
- result = values.str.center(5)
- expected = xr.DataArray(["  a  ", "  b  ", "  c  ", "eeeee"]).astype(dtype)
- assert_equal(result, expected)
+ value = xr.DataArray(
+ [
+ ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"],
+ [
+ "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210",
+ "",
+ "abcdef_Xy_101-fef_Xy_5543210",
+ ],
+ ],
+ dims=["X", "Y"],
+ ).astype(dtype)
- result = values.str.ljust(5)
- expected = xr.DataArray(["a    ", "b    ", "c    ", "eeeee"]).astype(dtype)
- assert_equal(result, expected)
+ expected = xr.DataArray(
+ [
+ [["a", "0"], ["bab", "110"], ["abc", "01"]],
+ [["abcd", ""], ["", ""], ["abcdef", "101"]],
+ ],
+ dims=["X", "Y", "XX"],
+ ).astype(dtype)
- result = values.str.rjust(5)
- expected = xr.DataArray(["    a", "    b", "    c", "eeeee"]).astype(dtype)
- assert_equal(result, expected)
+ res_str = value.str.extract(pat=pat_str, dim="XX")
+ res_re = value.str.extract(pat=pat_re, dim="XX")
+ res_str_case = value.str.extract(pat=pat_str, dim="XX", case=True)
+ assert res_str.dtype == expected.dtype
+ assert res_re.dtype == expected.dtype
+ assert res_str_case.dtype == expected.dtype
-def test_center_ljust_rjust_fillchar(dtype):
- values = xr.DataArray(["a", "bb", "cccc", "ddddd", "eeeeee"]).astype(dtype)
- result = values.str.center(5, fillchar="X")
- expected = xr.DataArray(["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"])
- assert_equal(result, expected.astype(dtype))
+ assert_equal(res_str, expected)
+ assert_equal(res_re, expected)
+ assert_equal(res_str_case, expected)
- result = values.str.ljust(5, fillchar="X")
- expected = xr.DataArray(["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"])
- assert_equal(result, expected.astype(dtype))
- result = values.str.rjust(5, fillchar="X")
- expected = xr.DataArray(["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"])
- assert_equal(result, expected.astype(dtype))
+def test_extract_multi_nocase(dtype):
+ pat_str = r"(\w+)_Xy_(\d*)"
+ pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8")
+ pat_re = re.compile(pat_re, flags=re.IGNORECASE)
- # If fillchar is not a charatter, normal str raises TypeError
- # 'aaa'.ljust(5, 'XY')
- # TypeError: must be char, not str
- template = "fillchar must be a character, not {dtype}"
+ value = xr.DataArray(
+ [
+ ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"],
+ [
+ "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210",
+ "",
+ "abcdef_Xy_101-fef_Xy_5543210",
+ ],
+ ],
+ dims=["X", 
"Y"], + ).astype(dtype) - with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.center(5, fillchar="XY") + expected = xr.DataArray( + [ + [["a", "0"], ["ab", "10"], ["abc", "01"]], + [["abcd", ""], ["", ""], ["abcdef", "101"]], + ], + dims=["X", "Y", "XX"], + ).astype(dtype) - with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.ljust(5, fillchar="XY") + res_str = value.str.extract(pat=pat_str, dim="XX", case=False) + res_re = value.str.extract(pat=pat_re, dim="XX") - with pytest.raises(TypeError, match=template.format(dtype="str")): - values.str.rjust(5, fillchar="XY") + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert_equal(res_str, expected) + assert_equal(res_re, expected) -def test_zfill(dtype): - values = xr.DataArray(["1", "22", "aaa", "333", "45678"]).astype(dtype) - result = values.str.zfill(5) - expected = xr.DataArray(["00001", "00022", "00aaa", "00333", "45678"]) - assert_equal(result, expected.astype(dtype)) +def test_extract_broadcast(dtype): + value = xr.DataArray( + ["a_Xy_0", "ab_xY_10", "abc_Xy_01"], + dims=["X"], + ).astype(dtype) - result = values.str.zfill(3) - expected = xr.DataArray(["001", "022", "aaa", "333", "45678"]) - assert_equal(result, expected.astype(dtype)) + pat_str = xr.DataArray( + [r"(\w+)_Xy_(\d*)", r"(\w+)_xY_(\d*)"], + dims=["Y"], + ).astype(dtype) + pat_re = value.str._re_compile(pat=pat_str) + expected = [ + [["a", "0"], ["", ""]], + [["", ""], ["ab", "10"]], + [["abc", "01"], ["", ""]], + ] + expected = xr.DataArray(expected, dims=["X", "Y", "Zz"]).astype(dtype) -def test_slice(dtype): - arr = xr.DataArray(["aafootwo", "aabartwo", "aabazqux"]).astype(dtype) + res_str = value.str.extract(pat=pat_str, dim="Zz") + res_re = value.str.extract(pat=pat_re, dim="Zz") - result = arr.str.slice(2, 5) - exp = xr.DataArray(["foo", "bar", "baz"]).astype(dtype) - assert_equal(result, exp) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype - for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), (3, 0, -1)]: - try: - result = arr.str[start:stop:step] - expected = xr.DataArray([s[start:stop:step] for s in arr.values]) - assert_equal(result, expected.astype(dtype)) - except IndexError: - print(f"failed on {start}:{stop}:{step}") - raise + assert_equal(res_str, expected) + assert_equal(res_re, expected) -def test_slice_replace(dtype): - da = lambda x: xr.DataArray(x).astype(dtype) - values = da(["short", "a bit longer", "evenlongerthanthat", ""]) +def test_extractall_single_single_case(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re) - expected = da(["shrt", "a it longer", "evnlongerthanthat", ""]) - result = values.str.slice_replace(2, 3) - assert_equal(result, expected) + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) - expected = da(["shzrt", "a zit longer", "evznlongerthanthat", "z"]) - result = values.str.slice_replace(2, 3, "z") - assert_equal(result, expected) + expected = xr.DataArray( + [[[["a"]], [[""]], [["abc"]]], [[["abcd"]], [[""]], [["abcdef"]]]], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) - expected = da(["shzort", "a zbit longer", "evzenlongerthanthat", "z"]) - result = values.str.slice_replace(2, 2, "z") - assert_equal(result, expected) + res_str = value.str.extractall(pat=pat_str, group_dim="XX", match_dim="YY") + res_re = 
value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") + res_str_case = value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=True + ) - expected = da(["shzort", "a zbit longer", "evzenlongerthanthat", "z"]) - result = values.str.slice_replace(2, 1, "z") - assert_equal(result, expected) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert res_str_case.dtype == expected.dtype - expected = da(["shorz", "a bit longez", "evenlongerthanthaz", "z"]) - result = values.str.slice_replace(-1, None, "z") - assert_equal(result, expected) + assert_equal(res_str, expected) + assert_equal(res_re, expected) + assert_equal(res_str_case, expected) - expected = da(["zrt", "zer", "zat", "z"]) - result = values.str.slice_replace(None, -2, "z") - assert_equal(result, expected) - expected = da(["shortz", "a bit znger", "evenlozerthanthat", "z"]) - result = values.str.slice_replace(6, 8, "z") - assert_equal(result, expected) +def test_extractall_single_single_nocase(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re, flags=re.I) - expected = da(["zrt", "a zit longer", "evenlongzerthanthat", "z"]) - result = values.str.slice_replace(-10, 3, "z") - assert_equal(result, expected) + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) + expected = xr.DataArray( + [[[["a"]], [["ab"]], [["abc"]]], [[["abcd"]], [[""]], [["abcdef"]]]], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) -def test_strip_lstrip_rstrip(dtype): - values = xr.DataArray([" aa ", " bb \n", "cc "]).astype(dtype) + res_str = value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=False + ) + res_re = value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") - result = values.str.strip() - expected = xr.DataArray(["aa", "bb", "cc"]).astype(dtype) - assert_equal(result, expected) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype - result = values.str.lstrip() - expected = xr.DataArray(["aa ", "bb \n", "cc "]).astype(dtype) - assert_equal(result, expected) + assert_equal(res_str, expected) + assert_equal(res_re, expected) - result = values.str.rstrip() - expected = xr.DataArray([" aa", " bb", "cc"]).astype(dtype) - assert_equal(result, expected) +def test_extractall_single_multi_case(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re) -def test_strip_lstrip_rstrip_args(dtype): - values = xr.DataArray(["xxABCxx", "xx BNSD", "LDFJH xx"]).astype(dtype) + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) - rs = values.str.strip("x") - xp = xr.DataArray(["ABC", " BNSD", "LDFJH "]).astype(dtype) - assert_equal(rs, xp) + expected = xr.DataArray( + [ + [[["a"], [""], [""]], [["bab"], ["baab"], [""]], [["abc"], ["cbc"], [""]]], + [ + [["abcd"], ["dcd"], ["dccd"]], + [[""], [""], [""]], + [["abcdef"], ["fef"], [""]], + ], + ], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) - rs = values.str.lstrip("x") - xp = xr.DataArray(["ABCxx", " BNSD", "LDFJH xx"]).astype(dtype) - assert_equal(rs, xp) + res_str = value.str.extractall(pat=pat_str, group_dim="XX", match_dim="YY") + res_re = 
value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") + res_str_case = value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=True + ) - rs = values.str.rstrip("x") - xp = xr.DataArray(["xxABC", "xx BNSD", "LDFJH "]).astype(dtype) - assert_equal(rs, xp) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert res_str_case.dtype == expected.dtype + assert_equal(res_str, expected) + assert_equal(res_re, expected) + assert_equal(res_str_case, expected) -def test_wrap(): - # test values are: two words less than width, two words equal to width, - # two words greater than width, one word less than width, one word - # equal to width, one word greater than width, multiple tokens with - # trailing whitespace equal to width - values = xr.DataArray( + +def test_extractall_single_multi_nocase(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re, flags=re.I) + + value = xr.DataArray( [ - "hello world", - "hello world!", - "hello world!!", - "abcdefabcde", - "abcdefabcdef", - "abcdefabcdefa", - "ab ab ab ab ", - "ab ab ab ab a", - "\t", - ] - ) + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) - # expected values expected = xr.DataArray( [ - "hello world", - "hello world!", - "hello\nworld!!", - "abcdefabcde", - "abcdefabcdef", - "abcdefabcdef\na", - "ab ab ab ab", - "ab ab ab ab\na", - "", - ] + [ + [["a"], [""], [""]], + [["ab"], ["bab"], ["baab"]], + [["abc"], ["cbc"], [""]], + ], + [ + [["abcd"], ["dcd"], ["dccd"]], + [[""], [""], [""]], + [["abcdef"], ["fef"], [""]], + ], + ], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) + + res_str = value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=False ) + res_re = value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") - result = values.str.wrap(12, break_long_words=True) - assert_equal(result, expected) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype - # test with pre and post whitespace (non-unicode), NaN, and non-ascii - # Unicode - values = xr.DataArray([" pre ", "\xac\u20ac\U00008000 abadcafe"]) - expected = xr.DataArray([" pre", "\xac\u20ac\U00008000 ab\nadcafe"]) - result = values.str.wrap(6) - assert_equal(result, expected) + assert_equal(res_str, expected) + assert_equal(res_re, expected) -def test_wrap_kwargs_passed(): - # GH4334 +def test_extractall_multi_single_case(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re) - values = xr.DataArray(" hello world ") + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) - result = values.str.wrap(7) - expected = xr.DataArray(" hello\nworld") - assert_equal(result, expected) + expected = xr.DataArray( + [ + [[["a", "0"]], [["", ""]], [["abc", "01"]]], + [[["abcd", ""]], [["", ""]], [["abcdef", "101"]]], + ], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) - result = values.str.wrap(7, drop_whitespace=False) - expected = xr.DataArray(" hello\n world\n ") - assert_equal(result, expected) + res_str = value.str.extractall(pat=pat_str, group_dim="XX", match_dim="YY") + res_re = value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") + res_str_case = 
value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=True + ) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert res_str_case.dtype == expected.dtype -def test_get(dtype): - values = xr.DataArray(["a_b_c", "c_d_e", "f_g_h"]).astype(dtype) + assert_equal(res_str, expected) + assert_equal(res_re, expected) + assert_equal(res_str_case, expected) - result = values.str[2] - expected = xr.DataArray(["b", "d", "g"]).astype(dtype) - assert_equal(result, expected) - # bounds testing - values = xr.DataArray(["1_2_3_4_5", "6_7_8_9_10", "11_12"]).astype(dtype) +def test_extractall_multi_single_nocase(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re, flags=re.I) - # positive index - result = values.str[5] - expected = xr.DataArray(["_", "_", ""]).astype(dtype) - assert_equal(result, expected) + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) - # negative index - result = values.str[-6] - expected = xr.DataArray(["_", "8", ""]).astype(dtype) - assert_equal(result, expected) + expected = xr.DataArray( + [ + [[["a", "0"]], [["ab", "10"]], [["abc", "01"]]], + [[["abcd", ""]], [["", ""]], [["abcdef", "101"]]], + ], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) + res_str = value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=False + ) + res_re = value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") -def test_get_default(dtype): - # GH4334 - values = xr.DataArray(["a_b", "c", ""]).astype(dtype) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype - result = values.str.get(2, "default") - expected = xr.DataArray(["b", "default", "default"]).astype(dtype) - assert_equal(result, expected) + assert_equal(res_str, expected) + assert_equal(res_re, expected) -def test_encode_decode(): - data = xr.DataArray(["a", "b", "a\xe4"]) - encoded = data.str.encode("utf-8") - decoded = encoded.str.decode("utf-8") - assert_equal(data, decoded) +def test_extractall_multi_multi_case(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re) + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) -def test_encode_decode_errors(): - encodeBase = xr.DataArray(["a", "b", "a\x9d"]) + expected = xr.DataArray( + [ + [ + [["a", "0"], ["", ""], ["", ""]], + [["bab", "110"], ["baab", "1100"], ["", ""]], + [["abc", "01"], ["cbc", "2210"], ["", ""]], + ], + [ + [["abcd", ""], ["dcd", "33210"], ["dccd", "332210"]], + [["", ""], ["", ""], ["", ""]], + [["abcdef", "101"], ["fef", "5543210"], ["", ""]], + ], + ], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) - msg = ( - r"'charmap' codec can't encode character '\\x9d' in position 1:" - " character maps to " + res_str = value.str.extractall(pat=pat_str, group_dim="XX", match_dim="YY") + res_re = value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") + res_str_case = value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=True ) - with pytest.raises(UnicodeEncodeError, match=msg): - encodeBase.str.encode("cp1252") - f = lambda x: x.encode("cp1252", "ignore") - result = 
encodeBase.str.encode("cp1252", "ignore") - expected = xr.DataArray([f(x) for x in encodeBase.values.tolist()]) - assert_equal(result, expected) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert res_str_case.dtype == expected.dtype - decodeBase = xr.DataArray([b"a", b"b", b"a\x9d"]) + assert_equal(res_str, expected) + assert_equal(res_re, expected) + assert_equal(res_str_case, expected) - msg = ( - "'charmap' codec can't decode byte 0x9d in position 1:" - " character maps to " + +def test_extractall_multi_multi_nocase(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = pat_str if dtype == np.unicode_ else bytes(pat_str, encoding="UTF-8") + pat_re = re.compile(pat_re, flags=re.I) + + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) + + expected = xr.DataArray( + [ + [ + [["a", "0"], ["", ""], ["", ""]], + [["ab", "10"], ["bab", "110"], ["baab", "1100"]], + [["abc", "01"], ["cbc", "2210"], ["", ""]], + ], + [ + [["abcd", ""], ["dcd", "33210"], ["dccd", "332210"]], + [["", ""], ["", ""], ["", ""]], + [["abcdef", "101"], ["fef", "5543210"], ["", ""]], + ], + ], + dims=["X", "Y", "XX", "YY"], + ).astype(dtype) + + res_str = value.str.extractall( + pat=pat_str, group_dim="XX", match_dim="YY", case=False ) - with pytest.raises(UnicodeDecodeError, match=msg): - decodeBase.str.decode("cp1252") + res_re = value.str.extractall(pat=pat_re, group_dim="XX", match_dim="YY") - f = lambda x: x.decode("cp1252", "ignore") - result = decodeBase.str.decode("cp1252", "ignore") - expected = xr.DataArray([f(x) for x in decodeBase.values.tolist()]) - assert_equal(result, expected) + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + + +def test_extractall_broadcast(dtype): + value = xr.DataArray( + ["a_Xy_0", "ab_xY_10", "abc_Xy_01"], + dims=["X"], + ).astype(dtype) + + pat_str = xr.DataArray( + [r"(\w+)_Xy_(\d*)", r"(\w+)_xY_(\d*)"], + dims=["Y"], + ).astype(dtype) + pat_re = value.str._re_compile(pat=pat_str) + + expected = [ + [[["a", "0"]], [["", ""]]], + [[["", ""]], [["ab", "10"]]], + [[["abc", "01"]], [["", ""]]], + ] + expected = xr.DataArray(expected, dims=["X", "Y", "ZX", "ZY"]).astype(dtype) + + res_str = value.str.extractall(pat=pat_str, group_dim="ZX", match_dim="ZY") + res_re = value.str.extractall(pat=pat_re, group_dim="ZX", match_dim="ZY") + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + + +def test_findall_single_single_case(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = re.compile(dtype(pat_str)) + + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) + + expected = [[["a"], [], ["abc"]], [["abcd"], [], ["abcdef"]]] + expected = [[[dtype(x) for x in y] for y in z] for z in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str) + res_re = value.str.findall(pat=pat_re) + res_str_case = value.str.findall(pat=pat_str, case=True) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert res_str_case.dtype == expected.dtype + + assert_equal(res_str, expected) + 
assert_equal(res_re, expected) + assert_equal(res_str_case, expected) + + +def test_findall_single_single_nocase(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = re.compile(dtype(pat_str), flags=re.I) + + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) + + expected = [[["a"], ["ab"], ["abc"]], [["abcd"], [], ["abcdef"]]] + expected = [[[dtype(x) for x in y] for y in z] for z in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str, case=False) + res_re = value.str.findall(pat=pat_re) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + + +def test_findall_single_multi_case(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = re.compile(dtype(pat_str)) + + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) + + expected = [ + [["a"], ["bab", "baab"], ["abc", "cbc"]], + [ + ["abcd", "dcd", "dccd"], + [], + ["abcdef", "fef"], + ], + ] + expected = [[[dtype(x) for x in y] for y in z] for z in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str) + res_re = value.str.findall(pat=pat_re) + res_str_case = value.str.findall(pat=pat_str, case=True) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert res_str_case.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + assert_equal(res_str_case, expected) + + +def test_findall_single_multi_nocase(dtype): + pat_str = r"(\w+)_Xy_\d*" + pat_re = re.compile(dtype(pat_str), flags=re.I) + + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) + + expected = [ + [ + ["a"], + ["ab", "bab", "baab"], + ["abc", "cbc"], + ], + [ + ["abcd", "dcd", "dccd"], + [], + ["abcdef", "fef"], + ], + ] + expected = [[[dtype(x) for x in y] for y in z] for z in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str, case=False) + res_re = value.str.findall(pat=pat_re) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + + +def test_findall_multi_single_case(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = re.compile(dtype(pat_str)) + + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) + + expected = [ + [[["a", "0"]], [], [["abc", "01"]]], + [[["abcd", ""]], [], [["abcdef", "101"]]], + ] + expected = [[[tuple(dtype(x) for x in y) for y in z] for z in w] for w in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str) + res_re = value.str.findall(pat=pat_re) + res_str_case = value.str.findall(pat=pat_str, case=True) + + assert res_str.dtype == expected.dtype + assert res_re.dtype 
== expected.dtype + assert res_str_case.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + assert_equal(res_str_case, expected) + + +def test_findall_multi_single_nocase(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = re.compile(dtype(pat_str), flags=re.I) + + value = xr.DataArray( + [["a_Xy_0", "ab_xY_10", "abc_Xy_01"], ["abcd_Xy_", "", "abcdef_Xy_101"]], + dims=["X", "Y"], + ).astype(dtype) + + expected = [ + [[["a", "0"]], [["ab", "10"]], [["abc", "01"]]], + [[["abcd", ""]], [], [["abcdef", "101"]]], + ] + expected = [[[tuple(dtype(x) for x in y) for y in z] for z in w] for w in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str, case=False) + res_re = value.str.findall(pat=pat_re) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + + +def test_findall_multi_multi_case(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = re.compile(dtype(pat_str)) + + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) + + expected = [ + [ + [["a", "0"]], + [["bab", "110"], ["baab", "1100"]], + [["abc", "01"], ["cbc", "2210"]], + ], + [ + [["abcd", ""], ["dcd", "33210"], ["dccd", "332210"]], + [], + [["abcdef", "101"], ["fef", "5543210"]], + ], + ] + expected = [[[tuple(dtype(x) for x in y) for y in z] for z in w] for w in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str) + res_re = value.str.findall(pat=pat_re) + res_str_case = value.str.findall(pat=pat_str, case=True) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + assert res_str_case.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + assert_equal(res_str_case, expected) + + +def test_findall_multi_multi_nocase(dtype): + pat_str = r"(\w+)_Xy_(\d*)" + pat_re = re.compile(dtype(pat_str), flags=re.I) + + value = xr.DataArray( + [ + ["a_Xy_0", "ab_xY_10-bab_Xy_110-baab_Xy_1100", "abc_Xy_01-cbc_Xy_2210"], + [ + "abcd_Xy_-dcd_Xy_33210-dccd_Xy_332210", + "", + "abcdef_Xy_101-fef_Xy_5543210", + ], + ], + dims=["X", "Y"], + ).astype(dtype) + + expected = [ + [ + [["a", "0"]], + [["ab", "10"], ["bab", "110"], ["baab", "1100"]], + [["abc", "01"], ["cbc", "2210"]], + ], + [ + [["abcd", ""], ["dcd", "33210"], ["dccd", "332210"]], + [], + [["abcdef", "101"], ["fef", "5543210"]], + ], + ] + expected = [[[tuple(dtype(x) for x in y) for y in z] for z in w] for w in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str, case=False) + res_re = value.str.findall(pat=pat_re) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + + +def test_findall_broadcast(dtype): + value = xr.DataArray( + ["a_Xy_0", "ab_xY_10", "abc_Xy_01"], + dims=["X"], + ).astype(dtype) + + pat_str = xr.DataArray( + [r"(\w+)_Xy_\d*", r"\w+_Xy_(\d*)"], + dims=["Y"], + ).astype(dtype) + pat_re = value.str._re_compile(pat=pat_str) + + expected = [[["a"], ["0"]], [[], []], [["abc"], ["01"]]] + 
expected = [[[dtype(x) for x in y] for y in z] for z in expected] + expected = np.array(expected, dtype=np.object_) + expected = xr.DataArray(expected, dims=["X", "Y"]) + + res_str = value.str.findall(pat=pat_str) + res_re = value.str.findall(pat=pat_re) + + assert res_str.dtype == expected.dtype + assert res_re.dtype == expected.dtype + + assert_equal(res_str, expected) + assert_equal(res_re, expected) + + +def test_repeat(dtype): + values = xr.DataArray(["a", "b", "c", "d"]).astype(dtype) + + result = values.str.repeat(3) + result_mul = values.str * 3 + + expected = xr.DataArray(["aaa", "bbb", "ccc", "ddd"]).astype(dtype) + + assert result.dtype == expected.dtype + assert result_mul.dtype == expected.dtype + + assert_equal(result_mul, expected) + assert_equal(result, expected) + + +def test_repeat_broadcast(dtype): + values = xr.DataArray(["a", "b", "c", "d"], dims=["X"]).astype(dtype) + reps = xr.DataArray([3, 4], dims=["Y"]) + + result = values.str.repeat(reps) + result_mul = values.str * reps + + expected = xr.DataArray( + [["aaa", "aaaa"], ["bbb", "bbbb"], ["ccc", "cccc"], ["ddd", "dddd"]], + dims=["X", "Y"], + ).astype(dtype) + + assert result.dtype == expected.dtype + assert result_mul.dtype == expected.dtype + + assert_equal(result_mul, expected) + assert_equal(result, expected) + + +def test_match(dtype): + values = xr.DataArray(["fooBAD__barBAD", "foo"]).astype(dtype) + + # New match behavior introduced in 0.13 + pat = values.dtype.type(".*(BAD[_]+).*(BAD)") + result = values.str.match(pat) + expected = xr.DataArray([True, False]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + result = values.str.match(re.compile(pat)) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # Case-sensitive + pat = values.dtype.type(".*BAD[_]+.*BAD") + result = values.str.match(pat) + expected = xr.DataArray([True, False]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + result = values.str.match(re.compile(pat)) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # Case-insensitive + pat = values.dtype.type(".*bAd[_]+.*bad") + result = values.str.match(pat, case=False) + expected = xr.DataArray([True, False]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + result = values.str.match(re.compile(pat, flags=re.IGNORECASE)) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_empty_str_methods(): + empty = xr.DataArray(np.empty(shape=(0,), dtype="U")) + empty_str = empty + empty_int = xr.DataArray(np.empty(shape=(0,), dtype=int)) + empty_bool = xr.DataArray(np.empty(shape=(0,), dtype=bool)) + empty_bytes = xr.DataArray(np.empty(shape=(0,), dtype="S")) + + # TODO: Determine why U and S dtype sizes don't match and figure + # out a reliable way to predict what they should be + + assert empty_bool.dtype == empty.str.contains("a").dtype + assert empty_bool.dtype == empty.str.endswith("a").dtype + assert empty_bool.dtype == empty.str.match("^a").dtype + assert empty_bool.dtype == empty.str.startswith("a").dtype + assert empty_bool.dtype == empty.str.isalnum().dtype + assert empty_bool.dtype == empty.str.isalpha().dtype + assert empty_bool.dtype == empty.str.isdecimal().dtype + assert empty_bool.dtype == empty.str.isdigit().dtype + assert empty_bool.dtype == empty.str.islower().dtype + assert empty_bool.dtype == empty.str.isnumeric().dtype + assert empty_bool.dtype == empty.str.isspace().dtype + assert empty_bool.dtype == empty.str.istitle().dtype 
+ assert empty_bool.dtype == empty.str.isupper().dtype + assert empty_bytes.dtype.kind == empty.str.encode("ascii").dtype.kind + assert empty_int.dtype.kind == empty.str.count("a").dtype.kind + assert empty_int.dtype.kind == empty.str.find("a").dtype.kind + assert empty_int.dtype.kind == empty.str.len().dtype.kind + assert empty_int.dtype.kind == empty.str.rfind("a").dtype.kind + assert empty_str.dtype.kind == empty.str.capitalize().dtype.kind + assert empty_str.dtype.kind == empty.str.center(42).dtype.kind + assert empty_str.dtype.kind == empty.str.get(0).dtype.kind + assert empty_str.dtype.kind == empty.str.lower().dtype.kind + assert empty_str.dtype.kind == empty.str.lstrip().dtype.kind + assert empty_str.dtype.kind == empty.str.pad(42).dtype.kind + assert empty_str.dtype.kind == empty.str.repeat(3).dtype.kind + assert empty_str.dtype.kind == empty.str.rstrip().dtype.kind + assert empty_str.dtype.kind == empty.str.slice(step=1).dtype.kind + assert empty_str.dtype.kind == empty.str.slice(stop=1).dtype.kind + assert empty_str.dtype.kind == empty.str.strip().dtype.kind + assert empty_str.dtype.kind == empty.str.swapcase().dtype.kind + assert empty_str.dtype.kind == empty.str.title().dtype.kind + assert empty_str.dtype.kind == empty.str.upper().dtype.kind + assert empty_str.dtype.kind == empty.str.wrap(42).dtype.kind + assert empty_str.dtype.kind == empty_bytes.str.decode("ascii").dtype.kind + + assert_equal(empty_bool, empty.str.contains("a")) + assert_equal(empty_bool, empty.str.endswith("a")) + assert_equal(empty_bool, empty.str.match("^a")) + assert_equal(empty_bool, empty.str.startswith("a")) + assert_equal(empty_bool, empty.str.isalnum()) + assert_equal(empty_bool, empty.str.isalpha()) + assert_equal(empty_bool, empty.str.isdecimal()) + assert_equal(empty_bool, empty.str.isdigit()) + assert_equal(empty_bool, empty.str.islower()) + assert_equal(empty_bool, empty.str.isnumeric()) + assert_equal(empty_bool, empty.str.isspace()) + assert_equal(empty_bool, empty.str.istitle()) + assert_equal(empty_bool, empty.str.isupper()) + assert_equal(empty_bytes, empty.str.encode("ascii")) + assert_equal(empty_int, empty.str.count("a")) + assert_equal(empty_int, empty.str.find("a")) + assert_equal(empty_int, empty.str.len()) + assert_equal(empty_int, empty.str.rfind("a")) + assert_equal(empty_str, empty.str.capitalize()) + assert_equal(empty_str, empty.str.center(42)) + assert_equal(empty_str, empty.str.get(0)) + assert_equal(empty_str, empty.str.lower()) + assert_equal(empty_str, empty.str.lstrip()) + assert_equal(empty_str, empty.str.pad(42)) + assert_equal(empty_str, empty.str.repeat(3)) + assert_equal(empty_str, empty.str.replace("a", "b")) + assert_equal(empty_str, empty.str.rstrip()) + assert_equal(empty_str, empty.str.slice(step=1)) + assert_equal(empty_str, empty.str.slice(stop=1)) + assert_equal(empty_str, empty.str.strip()) + assert_equal(empty_str, empty.str.swapcase()) + assert_equal(empty_str, empty.str.title()) + assert_equal(empty_str, empty.str.upper()) + assert_equal(empty_str, empty.str.wrap(42)) + assert_equal(empty_str, empty_bytes.str.decode("ascii")) + + table = str.maketrans("a", "b") + assert empty_str.dtype.kind == empty.str.translate(table).dtype.kind + assert_equal(empty_str, empty.str.translate(table)) + + +def test_ismethods(dtype): + values = ["A", "b", "Xy", "4", "3A", "", "TT", "55", "-", " "] + + exp_alnum = [True, True, True, True, True, False, True, True, False, False] + exp_alpha = [True, True, True, False, False, False, True, False, False, False] + exp_digit = 
[False, False, False, True, False, False, False, True, False, False]
+ exp_space = [False, False, False, False, False, False, False, False, False, True]
+ exp_lower = [False, True, False, False, False, False, False, False, False, False]
+ exp_upper = [True, False, False, False, True, False, True, False, False, False]
+ exp_title = [True, False, True, False, True, False, False, False, False, False]
+
+ values = xr.DataArray(values).astype(dtype)
+
+ exp_alnum = xr.DataArray(exp_alnum)
+ exp_alpha = xr.DataArray(exp_alpha)
+ exp_digit = xr.DataArray(exp_digit)
+ exp_space = xr.DataArray(exp_space)
+ exp_lower = xr.DataArray(exp_lower)
+ exp_upper = xr.DataArray(exp_upper)
+ exp_title = xr.DataArray(exp_title)
+
+ res_alnum = values.str.isalnum()
+ res_alpha = values.str.isalpha()
+ res_digit = values.str.isdigit()
+ res_lower = values.str.islower()
+ res_space = values.str.isspace()
+ res_title = values.str.istitle()
+ res_upper = values.str.isupper()
+
+ assert res_alnum.dtype == exp_alnum.dtype
+ assert res_alpha.dtype == exp_alpha.dtype
+ assert res_digit.dtype == exp_digit.dtype
+ assert res_lower.dtype == exp_lower.dtype
+ assert res_space.dtype == exp_space.dtype
+ assert res_title.dtype == exp_title.dtype
+ assert res_upper.dtype == exp_upper.dtype
+
+ assert_equal(res_alnum, exp_alnum)
+ assert_equal(res_alpha, exp_alpha)
+ assert_equal(res_digit, exp_digit)
+ assert_equal(res_lower, exp_lower)
+ assert_equal(res_space, exp_space)
+ assert_equal(res_title, exp_title)
+ assert_equal(res_upper, exp_upper)
+
+
+def test_isnumeric():
+ # 0x00bc: ¼ VULGAR FRACTION ONE QUARTER
+ # 0x2605: ★ not number
+ # 0x1378: ፸ ETHIOPIC NUMBER SEVENTY
+ # 0xFF13: ３ FULLWIDTH DIGIT THREE
+ values = ["A", "3", "¼", "★", "፸", "３", "four"]
+ exp_numeric = [False, True, True, False, True, True, False]
+ exp_decimal = [False, True, False, False, False, True, False]
+
+ values = xr.DataArray(values)
+ exp_numeric = xr.DataArray(exp_numeric)
+ exp_decimal = xr.DataArray(exp_decimal)
+
+ res_numeric = values.str.isnumeric()
+ res_decimal = values.str.isdecimal()
+
+ assert res_numeric.dtype == exp_numeric.dtype
+ assert res_decimal.dtype == exp_decimal.dtype
+
+ assert_equal(res_numeric, exp_numeric)
+ assert_equal(res_decimal, exp_decimal)
+
+
+def test_len(dtype):
+ values = ["foo", "fooo", "fooooo", "fooooooo"]
+ result = xr.DataArray(values).astype(dtype).str.len()
+ expected = xr.DataArray([len(x) for x in values])
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+
+def test_find(dtype):
+ values = xr.DataArray(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXX"])
+ values = values.astype(dtype)
+
+ result_0 = values.str.find("EF")
+ result_1 = values.str.find("EF", side="left")
+ expected_0 = xr.DataArray([4, 3, 1, 0, -1])
+ expected_1 = xr.DataArray([v.find(dtype("EF")) for v in values.values])
+ assert result_0.dtype == expected_0.dtype
+ assert result_0.dtype == expected_1.dtype
+ assert result_1.dtype == expected_0.dtype
+ assert result_1.dtype == expected_1.dtype
+ assert_equal(result_0, expected_0)
+ assert_equal(result_0, expected_1)
+ assert_equal(result_1, expected_0)
+ assert_equal(result_1, expected_1)
+
+ result_0 = values.str.rfind("EF")
+ result_1 = values.str.find("EF", side="right")
+ expected_0 = xr.DataArray([4, 5, 7, 4, -1])
+ expected_1 = xr.DataArray([v.rfind(dtype("EF")) for v in values.values])
+ assert result_0.dtype == expected_0.dtype
+ assert result_0.dtype == expected_1.dtype
+ assert result_1.dtype == expected_0.dtype
+ assert result_1.dtype == expected_1.dtype
+ 
assert_equal(result_0, expected_0) + assert_equal(result_0, expected_1) + assert_equal(result_1, expected_0) + assert_equal(result_1, expected_1) + + result_0 = values.str.find("EF", 3) + result_1 = values.str.find("EF", 3, side="left") + expected_0 = xr.DataArray([4, 3, 7, 4, -1]) + expected_1 = xr.DataArray([v.find(dtype("EF"), 3) for v in values.values]) + assert result_0.dtype == expected_0.dtype + assert result_0.dtype == expected_1.dtype + assert result_1.dtype == expected_0.dtype + assert result_1.dtype == expected_1.dtype + assert_equal(result_0, expected_0) + assert_equal(result_0, expected_1) + assert_equal(result_1, expected_0) + assert_equal(result_1, expected_1) + + result_0 = values.str.rfind("EF", 3) + result_1 = values.str.find("EF", 3, side="right") + expected_0 = xr.DataArray([4, 5, 7, 4, -1]) + expected_1 = xr.DataArray([v.rfind(dtype("EF"), 3) for v in values.values]) + assert result_0.dtype == expected_0.dtype + assert result_0.dtype == expected_1.dtype + assert result_1.dtype == expected_0.dtype + assert result_1.dtype == expected_1.dtype + assert_equal(result_0, expected_0) + assert_equal(result_0, expected_1) + assert_equal(result_1, expected_0) + assert_equal(result_1, expected_1) + + result_0 = values.str.find("EF", 3, 6) + result_1 = values.str.find("EF", 3, 6, side="left") + expected_0 = xr.DataArray([4, 3, -1, 4, -1]) + expected_1 = xr.DataArray([v.find(dtype("EF"), 3, 6) for v in values.values]) + assert result_0.dtype == expected_0.dtype + assert result_0.dtype == expected_1.dtype + assert result_1.dtype == expected_0.dtype + assert result_1.dtype == expected_1.dtype + assert_equal(result_0, expected_0) + assert_equal(result_0, expected_1) + assert_equal(result_1, expected_0) + assert_equal(result_1, expected_1) + + result_0 = values.str.rfind("EF", 3, 6) + result_1 = values.str.find("EF", 3, 6, side="right") + expected_0 = xr.DataArray([4, 3, -1, 4, -1]) + expected_1 = xr.DataArray([v.rfind(dtype("EF"), 3, 6) for v in values.values]) + assert result_0.dtype == expected_0.dtype + assert result_0.dtype == expected_1.dtype + assert result_1.dtype == expected_0.dtype + assert result_1.dtype == expected_1.dtype + assert_equal(result_0, expected_0) + assert_equal(result_0, expected_1) + assert_equal(result_1, expected_0) + assert_equal(result_1, expected_1) + + +def test_find_broadcast(dtype): + values = xr.DataArray( + ["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF", "XXX"], dims=["X"] + ) + values = values.astype(dtype) + sub = xr.DataArray(["EF", "BC", "XX"], dims=["Y"]).astype(dtype) + start = xr.DataArray([0, 7], dims=["Z"]) + end = xr.DataArray([6, 9], dims=["Z"]) + + result_0 = values.str.find(sub, start, end) + result_1 = values.str.find(sub, start, end, side="left") + expected = xr.DataArray( + [ + [[4, -1], [1, -1], [-1, -1]], + [[3, -1], [0, -1], [-1, -1]], + [[1, 7], [-1, -1], [-1, -1]], + [[0, -1], [-1, -1], [-1, -1]], + [[-1, -1], [-1, -1], [0, -1]], + ], + dims=["X", "Y", "Z"], + ) + + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + result_0 = values.str.rfind(sub, start, end) + result_1 = values.str.find(sub, start, end, side="right") + expected = xr.DataArray( + [ + [[4, -1], [1, -1], [-1, -1]], + [[3, -1], [0, -1], [-1, -1]], + [[1, 7], [-1, -1], [-1, -1]], + [[4, -1], [-1, -1], [-1, -1]], + [[-1, -1], [-1, -1], [1, -1]], + ], + dims=["X", "Y", "Z"], + ) + + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + 
assert_equal(result_0, expected) + assert_equal(result_1, expected) + + +def test_index(dtype): + s = xr.DataArray(["ABCDEFG", "BCDEFEF", "DEFGHIJEF", "EFGHEF"]).astype(dtype) + + result_0 = s.str.index("EF") + result_1 = s.str.index("EF", side="left") + expected = xr.DataArray([4, 3, 1, 0]) + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + result_0 = s.str.rindex("EF") + result_1 = s.str.index("EF", side="right") + expected = xr.DataArray([4, 5, 7, 4]) + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + result_0 = s.str.index("EF", 3) + result_1 = s.str.index("EF", 3, side="left") + expected = xr.DataArray([4, 3, 7, 4]) + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + result_0 = s.str.rindex("EF", 3) + result_1 = s.str.index("EF", 3, side="right") + expected = xr.DataArray([4, 5, 7, 4]) + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + result_0 = s.str.index("E", 4, 8) + result_1 = s.str.index("E", 4, 8, side="left") + expected = xr.DataArray([4, 5, 7, 4]) + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + result_0 = s.str.rindex("E", 0, 5) + result_1 = s.str.index("E", 0, 5, side="right") + expected = xr.DataArray([4, 3, 1, 4]) + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + matchtype = "subsection" if dtype == np.bytes_ else "substring" + with pytest.raises(ValueError, match=f"{matchtype} not found"): + s.str.index("DE") + + +def test_index_broadcast(dtype): + values = xr.DataArray( + ["ABCDEFGEFDBCA", "BCDEFEFEFDBC", "DEFBCGHIEFBC", "EFGHBCEFBCBCBCEF"], + dims=["X"], + ) + values = values.astype(dtype) + sub = xr.DataArray(["EF", "BC"], dims=["Y"]).astype(dtype) + start = xr.DataArray([0, 6], dims=["Z"]) + end = xr.DataArray([6, 12], dims=["Z"]) + + result_0 = values.str.index(sub, start, end) + result_1 = values.str.index(sub, start, end, side="left") + expected = xr.DataArray( + [[[4, 7], [1, 10]], [[3, 7], [0, 10]], [[1, 8], [3, 10]], [[0, 6], [4, 8]]], + dims=["X", "Y", "Z"], + ) + + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + result_0 = values.str.rindex(sub, start, end) + result_1 = values.str.index(sub, start, end, side="right") + expected = xr.DataArray( + [[[4, 7], [1, 10]], [[3, 7], [0, 10]], [[1, 8], [3, 10]], [[0, 6], [4, 10]]], + dims=["X", "Y", "Z"], + ) + + assert result_0.dtype == expected.dtype + assert result_1.dtype == expected.dtype + assert_equal(result_0, expected) + assert_equal(result_1, expected) + + +def test_translate(): + values = xr.DataArray(["abcdefg", "abcc", "cdddfg", "cdefggg"]) + table = str.maketrans("abc", "cde") + result = values.str.translate(table) + expected = xr.DataArray(["cdedefg", "cdee", "edddfg", "edefggg"]) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_pad_center_ljust_rjust(dtype): + values = xr.DataArray(["a", "b", "c", "eeeee"]).astype(dtype) + + 
result = values.str.center(5)
+ expected = xr.DataArray(["  a  ", "  b  ", "  c  ", "eeeee"]).astype(dtype)
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+ result = values.str.pad(5, side="both")
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+ result = values.str.ljust(5)
+ expected = xr.DataArray(["a    ", "b    ", "c    ", "eeeee"]).astype(dtype)
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+ result = values.str.pad(5, side="right")
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+ result = values.str.rjust(5)
+ expected = xr.DataArray(["    a", "    b", "    c", "eeeee"]).astype(dtype)
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+ result = values.str.pad(5, side="left")
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+
+def test_pad_center_ljust_rjust_fillchar(dtype):
+ values = xr.DataArray(["a", "bb", "cccc", "ddddd", "eeeeee"]).astype(dtype)
+
+ result = values.str.center(5, fillchar="X")
+ expected = xr.DataArray(["XXaXX", "XXbbX", "Xcccc", "ddddd", "eeeeee"]).astype(
+ dtype
+ )
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+ result = values.str.pad(5, side="both", fillchar="X")
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+ result = values.str.ljust(5, fillchar="X")
+ expected = xr.DataArray(["aXXXX", "bbXXX", "ccccX", "ddddd", "eeeeee"]).astype(
+ dtype
+ )
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+ result = values.str.pad(5, side="right", fillchar="X")
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+ result = values.str.rjust(5, fillchar="X")
+ expected = xr.DataArray(["XXXXa", "XXXbb", "Xcccc", "ddddd", "eeeeee"]).astype(
+ dtype
+ )
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+ result = values.str.pad(5, side="left", fillchar="X")
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+ # If fillchar is not a character, normal str raises TypeError
+ # 'aaa'.ljust(5, 'XY')
+ # TypeError: must be char, not str
+ template = "fillchar must be a character, not {dtype}"
+
+ with pytest.raises(TypeError, match=template.format(dtype="str")):
+ values.str.center(5, fillchar="XY")
+
+ with pytest.raises(TypeError, match=template.format(dtype="str")):
+ values.str.ljust(5, fillchar="XY")
+
+ with pytest.raises(TypeError, match=template.format(dtype="str")):
+ values.str.rjust(5, fillchar="XY")
+
+ with pytest.raises(TypeError, match=template.format(dtype="str")):
+ values.str.pad(5, fillchar="XY")
+
+
+def test_pad_center_ljust_rjust_broadcast(dtype):
+ values = xr.DataArray(["a", "bb", "cccc", "ddddd", "eeeeee"], dims="X").astype(
+ dtype
+ )
+ width = xr.DataArray([5, 4], dims="Y")
+ fillchar = xr.DataArray(["X", "#"], dims="Y").astype(dtype)
+
+ result = values.str.center(width, fillchar=fillchar)
+ expected = xr.DataArray(
+ [
+ ["XXaXX", "#a##"],
+ ["XXbbX", "#bb#"],
+ ["Xcccc", "cccc"],
+ ["ddddd", "ddddd"],
+ ["eeeeee", "eeeeee"],
+ ],
+ dims=["X", "Y"],
+ ).astype(dtype)
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+ result = values.str.pad(width, side="both", fillchar=fillchar)
+ assert result.dtype == expected.dtype
+ assert_equal(result, expected)
+
+ result = values.str.ljust(width, fillchar=fillchar)
+ expected = xr.DataArray(
+ [
+ ["aXXXX", "a###"],
+ ["bbXXX", "bb##"],
+ ["ccccX", "cccc"],
+ ["ddddd", "ddddd"],
+ 
["eeeeee", "eeeeee"], + ], + dims=["X", "Y"], + ).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected.astype(dtype)) + result = values.str.pad(width, side="right", fillchar=fillchar) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.rjust(width, fillchar=fillchar) + expected = xr.DataArray( + [ + ["XXXXa", "###a"], + ["XXXbb", "##bb"], + ["Xcccc", "cccc"], + ["ddddd", "ddddd"], + ["eeeeee", "eeeeee"], + ], + dims=["X", "Y"], + ).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected.astype(dtype)) + result = values.str.pad(width, side="left", fillchar=fillchar) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_zfill(dtype): + values = xr.DataArray(["1", "22", "aaa", "333", "45678"]).astype(dtype) + + result = values.str.zfill(5) + expected = xr.DataArray(["00001", "00022", "00aaa", "00333", "45678"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.zfill(3) + expected = xr.DataArray(["001", "022", "aaa", "333", "45678"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_zfill_broadcast(dtype): + values = xr.DataArray(["1", "22", "aaa", "333", "45678"]).astype(dtype) + width = np.array([4, 5, 0, 3, 8]) + + result = values.str.zfill(width) + expected = xr.DataArray(["0001", "00022", "aaa", "333", "00045678"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_slice(dtype): + arr = xr.DataArray(["aafootwo", "aabartwo", "aabazqux"]).astype(dtype) + + result = arr.str.slice(2, 5) + exp = xr.DataArray(["foo", "bar", "baz"]).astype(dtype) + assert result.dtype == exp.dtype + assert_equal(result, exp) + + for start, stop, step in [(0, 3, -1), (None, None, -1), (3, 10, 2), (3, 0, -1)]: + try: + result = arr.str[start:stop:step] + expected = xr.DataArray([s[start:stop:step] for s in arr.values]) + assert_equal(result, expected.astype(dtype)) + except IndexError: + print(f"failed on {start}:{stop}:{step}") + raise + + +def test_slice_broadcast(dtype): + arr = xr.DataArray(["aafootwo", "aabartwo", "aabazqux"]).astype(dtype) + start = xr.DataArray([1, 2, 3]) + stop = 5 + + result = arr.str.slice(start=start, stop=stop) + exp = xr.DataArray(["afoo", "bar", "az"]).astype(dtype) + assert result.dtype == exp.dtype + assert_equal(result, exp) + + +def test_slice_replace(dtype): + da = lambda x: xr.DataArray(x).astype(dtype) + values = da(["short", "a bit longer", "evenlongerthanthat", ""]) + + expected = da(["shrt", "a it longer", "evnlongerthanthat", ""]) + result = values.str.slice_replace(2, 3) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + expected = da(["shzrt", "a zit longer", "evznlongerthanthat", "z"]) + result = values.str.slice_replace(2, 3, "z") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + expected = da(["shzort", "a zbit longer", "evzenlongerthanthat", "z"]) + result = values.str.slice_replace(2, 2, "z") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + expected = da(["shzort", "a zbit longer", "evzenlongerthanthat", "z"]) + result = values.str.slice_replace(2, 1, "z") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + expected = da(["shorz", "a bit longez", "evenlongerthanthaz", "z"]) + result = values.str.slice_replace(-1, None, "z") + assert result.dtype == expected.dtype + 
assert_equal(result, expected) + + expected = da(["zrt", "zer", "zat", "z"]) + result = values.str.slice_replace(None, -2, "z") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + expected = da(["shortz", "a bit znger", "evenlozerthanthat", "z"]) + result = values.str.slice_replace(6, 8, "z") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + expected = da(["zrt", "a zit longer", "evenlongzerthanthat", "z"]) + result = values.str.slice_replace(-10, 3, "z") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_slice_replace_broadcast(dtype): + values = xr.DataArray(["short", "a bit longer", "evenlongerthanthat", ""]).astype( + dtype + ) + start = 2 + stop = np.array([4, 5, None, 7]) + repl = "test" + + expected = xr.DataArray(["shtestt", "a test longer", "evtest", "test"]).astype( + dtype + ) + result = values.str.slice_replace(start, stop, repl) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_strip_lstrip_rstrip(dtype): + values = xr.DataArray([" aa ", " bb \n", "cc "]).astype(dtype) + + result = values.str.strip() + expected = xr.DataArray(["aa", "bb", "cc"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.lstrip() + expected = xr.DataArray(["aa ", "bb \n", "cc "]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.rstrip() + expected = xr.DataArray([" aa", " bb", "cc"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_strip_lstrip_rstrip_args(dtype): + values = xr.DataArray(["xxABCxx", "xx BNSD", "LDFJH xx"]).astype(dtype) + + result = values.str.strip("x") + expected = xr.DataArray(["ABC", " BNSD", "LDFJH "]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.lstrip("x") + expected = xr.DataArray(["ABCxx", " BNSD", "LDFJH xx"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.rstrip("x") + expected = xr.DataArray(["xxABC", "xx BNSD", "LDFJH "]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_strip_lstrip_rstrip_broadcast(dtype): + values = xr.DataArray(["xxABCxx", "yy BNSD", "LDFJH zz"]).astype(dtype) + to_strip = xr.DataArray(["x", "y", "z"]).astype(dtype) + + result = values.str.strip(to_strip) + expected = xr.DataArray(["ABC", " BNSD", "LDFJH "]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.lstrip(to_strip) + expected = xr.DataArray(["ABCxx", " BNSD", "LDFJH zz"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.rstrip(to_strip) + expected = xr.DataArray(["xxABC", "yy BNSD", "LDFJH "]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_wrap(): + # test values are: two words less than width, two words equal to width, + # two words greater than width, one word less than width, one word + # equal to width, one word greater than width, multiple tokens with + # trailing whitespace equal to width + values = xr.DataArray( + [ + "hello world", + "hello world!", + "hello world!!", + "abcdefabcde", + "abcdefabcdef", + "abcdefabcdefa", + "ab ab ab ab ", + "ab ab ab ab a", + "\t", + ] + ) + + # expected values + expected = xr.DataArray( + [ + "hello world", 
+ "hello world!", + "hello\nworld!!", + "abcdefabcde", + "abcdefabcdef", + "abcdefabcdef\na", + "ab ab ab ab", + "ab ab ab ab\na", + "", + ] + ) + + result = values.str.wrap(12, break_long_words=True) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # test with pre and post whitespace (non-unicode), NaN, and non-ascii + # Unicode + values = xr.DataArray([" pre ", "\xac\u20ac\U00008000 abadcafe"]) + expected = xr.DataArray([" pre", "\xac\u20ac\U00008000 ab\nadcafe"]) + result = values.str.wrap(6) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_wrap_kwargs_passed(): + # GH4334 + + values = xr.DataArray(" hello world ") + + result = values.str.wrap(7) + expected = xr.DataArray(" hello\nworld") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + result = values.str.wrap(7, drop_whitespace=False) + expected = xr.DataArray(" hello\n world\n ") + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_get(dtype): + values = xr.DataArray(["a_b_c", "c_d_e", "f_g_h"]).astype(dtype) + + result = values.str[2] + expected = xr.DataArray(["b", "d", "g"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # bounds testing + values = xr.DataArray(["1_2_3_4_5", "6_7_8_9_10", "11_12"]).astype(dtype) + + # positive index + result = values.str[5] + expected = xr.DataArray(["_", "_", ""]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + # negative index + result = values.str[-6] + expected = xr.DataArray(["_", "8", ""]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_get_default(dtype): + # GH4334 + values = xr.DataArray(["a_b", "c", ""]).astype(dtype) + + result = values.str.get(2, "default") + expected = xr.DataArray(["b", "default", "default"]).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_get_broadcast(dtype): + values = xr.DataArray(["a_b_c", "c_d_e", "f_g_h"], dims=["X"]).astype(dtype) + inds = xr.DataArray([0, 2], dims=["Y"]) + + result = values.str.get(inds) + expected = xr.DataArray( + [["a", "b"], ["c", "d"], ["f", "g"]], dims=["X", "Y"] + ).astype(dtype) + assert result.dtype == expected.dtype + assert_equal(result, expected) + + +def test_encode_decode(): + data = xr.DataArray(["a", "b", "a\xe4"]) + encoded = data.str.encode("utf-8") + decoded = encoded.str.decode("utf-8") + assert data.dtype == decoded.dtype + assert_equal(data, decoded) + + +def test_encode_decode_errors(): + encodeBase = xr.DataArray(["a", "b", "a\x9d"]) + + msg = ( + r"'charmap' codec can't encode character '\\x9d' in position 1:" + " character maps to " + ) + with pytest.raises(UnicodeEncodeError, match=msg): + encodeBase.str.encode("cp1252") + + f = lambda x: x.encode("cp1252", "ignore") + result = encodeBase.str.encode("cp1252", "ignore") + expected = xr.DataArray([f(x) for x in encodeBase.values.tolist()]) + + assert result.dtype == expected.dtype + assert_equal(result, expected) + + decodeBase = xr.DataArray([b"a", b"b", b"a\x9d"]) + + msg = ( + "'charmap' codec can't decode byte 0x9d in position 1:" + " character maps to " + ) + with pytest.raises(UnicodeDecodeError, match=msg): + decodeBase.str.decode("cp1252") + + f = lambda x: x.decode("cp1252", "ignore") + result = decodeBase.str.decode("cp1252", "ignore") + expected = xr.DataArray([f(x) for x in decodeBase.values.tolist()]) + + assert result.dtype == 
expected.dtype + assert_equal(result, expected) + + +def test_partition_whitespace(dtype): + values = xr.DataArray( + [ + ["abc def", "spam eggs swallow", "red_blue"], + ["test0 test1 test2 test3", "", "abra ka da bra"], + ], + dims=["X", "Y"], + ).astype(dtype) + + exp_part_dim = [ + [ + ["abc", " ", "def"], + ["spam", " ", "eggs swallow"], + ["red_blue", "", ""], + ], + [ + ["test0", " ", "test1 test2 test3"], + ["", "", ""], + ["abra", " ", "ka da bra"], + ], + ] + + exp_rpart_dim = [ + [ + ["abc", " ", "def"], + ["spam eggs", " ", "swallow"], + ["", "", "red_blue"], + ], + [ + ["test0 test1 test2", " ", "test3"], + ["", "", ""], + ["abra ka da", " ", "bra"], + ], + ] + + exp_part_dim = xr.DataArray(exp_part_dim, dims=["X", "Y", "ZZ"]).astype(dtype) + exp_rpart_dim = xr.DataArray(exp_rpart_dim, dims=["X", "Y", "ZZ"]).astype(dtype) + + res_part_dim = values.str.partition(dim="ZZ") + res_rpart_dim = values.str.rpartition(dim="ZZ") + + assert res_part_dim.dtype == exp_part_dim.dtype + assert res_rpart_dim.dtype == exp_rpart_dim.dtype + + assert_equal(res_part_dim, exp_part_dim) + assert_equal(res_rpart_dim, exp_rpart_dim) + + +def test_partition_comma(dtype): + values = xr.DataArray( + [ + ["abc, def", "spam, eggs, swallow", "red_blue"], + ["test0, test1, test2, test3", "", "abra, ka, da, bra"], + ], + dims=["X", "Y"], + ).astype(dtype) + + exp_part_dim = [ + [ + ["abc", ", ", "def"], + ["spam", ", ", "eggs, swallow"], + ["red_blue", "", ""], + ], + [ + ["test0", ", ", "test1, test2, test3"], + ["", "", ""], + ["abra", ", ", "ka, da, bra"], + ], + ] + + exp_rpart_dim = [ + [ + ["abc", ", ", "def"], + ["spam, eggs", ", ", "swallow"], + ["", "", "red_blue"], + ], + [ + ["test0, test1, test2", ", ", "test3"], + ["", "", ""], + ["abra, ka, da", ", ", "bra"], + ], + ] + + exp_part_dim = xr.DataArray(exp_part_dim, dims=["X", "Y", "ZZ"]).astype(dtype) + exp_rpart_dim = xr.DataArray(exp_rpart_dim, dims=["X", "Y", "ZZ"]).astype(dtype) + + res_part_dim = values.str.partition(sep=", ", dim="ZZ") + res_rpart_dim = values.str.rpartition(sep=", ", dim="ZZ") + + assert res_part_dim.dtype == exp_part_dim.dtype + assert res_rpart_dim.dtype == exp_rpart_dim.dtype + + assert_equal(res_part_dim, exp_part_dim) + assert_equal(res_rpart_dim, exp_rpart_dim) + + +def test_partition_empty(dtype): + values = xr.DataArray([], dims=["X"]).astype(dtype) + expected = xr.DataArray(np.zeros((0, 0)), dims=["X", "ZZ"]).astype(dtype) + + res = values.str.partition(sep=", ", dim="ZZ") + + assert res.dtype == expected.dtype + assert_equal(res, expected) + + +def test_split_whitespace(dtype): + values = xr.DataArray( + [ + ["abc def", "spam\t\teggs\tswallow", "red_blue"], + ["test0\ntest1\ntest2\n\ntest3", "", "abra ka\nda\tbra"], + ], + dims=["X", "Y"], + ).astype(dtype) + + exp_split_dim_full = [ + [ + ["abc", "def", "", ""], + ["spam", "eggs", "swallow", ""], + ["red_blue", "", "", ""], + ], + [ + ["test0", "test1", "test2", "test3"], + ["", "", "", ""], + ["abra", "ka", "da", "bra"], + ], + ] + + exp_rsplit_dim_full = [ + [ + ["", "", "abc", "def"], + ["", "spam", "eggs", "swallow"], + ["", "", "", "red_blue"], + ], + [ + ["test0", "test1", "test2", "test3"], + ["", "", "", ""], + ["abra", "ka", "da", "bra"], + ], + ] + + exp_split_dim_1 = [ + [["abc", "def"], ["spam", "eggs\tswallow"], ["red_blue", ""]], + [["test0", "test1\ntest2\n\ntest3"], ["", ""], ["abra", "ka\nda\tbra"]], + ] + + exp_rsplit_dim_1 = [ + [["abc", "def"], ["spam\t\teggs", "swallow"], ["", "red_blue"]], + [["test0\ntest1\ntest2", "test3"], ["", ""], 
["abra ka\nda", "bra"]], + ] + + exp_split_none_full = [ + [["abc", "def"], ["spam", "eggs", "swallow"], ["red_blue"]], + [["test0", "test1", "test2", "test3"], [], ["abra", "ka", "da", "bra"]], + ] + + exp_rsplit_none_full = [ + [["abc", "def"], ["spam", "eggs", "swallow"], ["red_blue"]], + [["test0", "test1", "test2", "test3"], [], ["abra", "ka", "da", "bra"]], + ] + + exp_split_none_1 = [ + [["abc", "def"], ["spam", "eggs\tswallow"], ["red_blue"]], + [["test0", "test1\ntest2\n\ntest3"], [], ["abra", "ka\nda\tbra"]], + ] + + exp_rsplit_none_1 = [ + [["abc", "def"], ["spam\t\teggs", "swallow"], ["red_blue"]], + [["test0\ntest1\ntest2", "test3"], [], ["abra ka\nda", "bra"]], + ] + + exp_split_none_full = [ + [[dtype(x) for x in y] for y in z] for z in exp_split_none_full + ] + exp_rsplit_none_full = [ + [[dtype(x) for x in y] for y in z] for z in exp_rsplit_none_full + ] + exp_split_none_1 = [[[dtype(x) for x in y] for y in z] for z in exp_split_none_1] + exp_rsplit_none_1 = [[[dtype(x) for x in y] for y in z] for z in exp_rsplit_none_1] + + exp_split_none_full = np.array(exp_split_none_full, dtype=np.object_) + exp_rsplit_none_full = np.array(exp_rsplit_none_full, dtype=np.object_) + exp_split_none_1 = np.array(exp_split_none_1, dtype=np.object_) + exp_rsplit_none_1 = np.array(exp_rsplit_none_1, dtype=np.object_) + + exp_split_dim_full = xr.DataArray(exp_split_dim_full, dims=["X", "Y", "ZZ"]).astype( + dtype + ) + exp_rsplit_dim_full = xr.DataArray( + exp_rsplit_dim_full, dims=["X", "Y", "ZZ"] + ).astype(dtype) + exp_split_dim_1 = xr.DataArray(exp_split_dim_1, dims=["X", "Y", "ZZ"]).astype(dtype) + exp_rsplit_dim_1 = xr.DataArray(exp_rsplit_dim_1, dims=["X", "Y", "ZZ"]).astype( + dtype + ) + + exp_split_none_full = xr.DataArray(exp_split_none_full, dims=["X", "Y"]) + exp_rsplit_none_full = xr.DataArray(exp_rsplit_none_full, dims=["X", "Y"]) + exp_split_none_1 = xr.DataArray(exp_split_none_1, dims=["X", "Y"]) + exp_rsplit_none_1 = xr.DataArray(exp_rsplit_none_1, dims=["X", "Y"]) + + res_split_dim_full = values.str.split(dim="ZZ") + res_rsplit_dim_full = values.str.rsplit(dim="ZZ") + res_split_dim_1 = values.str.split(dim="ZZ", maxsplit=1) + res_rsplit_dim_1 = values.str.rsplit(dim="ZZ", maxsplit=1) + res_split_dim_10 = values.str.split(dim="ZZ", maxsplit=10) + res_rsplit_dim_10 = values.str.rsplit(dim="ZZ", maxsplit=10) + + res_split_none_full = values.str.split(dim=None) + res_rsplit_none_full = values.str.rsplit(dim=None) + res_split_none_1 = values.str.split(dim=None, maxsplit=1) + res_rsplit_none_1 = values.str.rsplit(dim=None, maxsplit=1) + res_split_none_10 = values.str.split(dim=None, maxsplit=10) + res_rsplit_none_10 = values.str.rsplit(dim=None, maxsplit=10) + + assert res_split_dim_full.dtype == exp_split_dim_full.dtype + assert res_rsplit_dim_full.dtype == exp_rsplit_dim_full.dtype + assert res_split_dim_1.dtype == exp_split_dim_1.dtype + assert res_rsplit_dim_1.dtype == exp_rsplit_dim_1.dtype + assert res_split_dim_10.dtype == exp_split_dim_full.dtype + assert res_rsplit_dim_10.dtype == exp_rsplit_dim_full.dtype + + assert res_split_none_full.dtype == exp_split_none_full.dtype + assert res_rsplit_none_full.dtype == exp_rsplit_none_full.dtype + assert res_split_none_1.dtype == exp_split_none_1.dtype + assert res_rsplit_none_1.dtype == exp_rsplit_none_1.dtype + assert res_split_none_10.dtype == exp_split_none_full.dtype + assert res_rsplit_none_10.dtype == exp_rsplit_none_full.dtype + + assert_equal(res_split_dim_full, exp_split_dim_full) + assert_equal(res_rsplit_dim_full, 
exp_rsplit_dim_full) + assert_equal(res_split_dim_1, exp_split_dim_1) + assert_equal(res_rsplit_dim_1, exp_rsplit_dim_1) + assert_equal(res_split_dim_10, exp_split_dim_full) + assert_equal(res_rsplit_dim_10, exp_rsplit_dim_full) + + assert_equal(res_split_none_full, exp_split_none_full) + assert_equal(res_rsplit_none_full, exp_rsplit_none_full) + assert_equal(res_split_none_1, exp_split_none_1) + assert_equal(res_rsplit_none_1, exp_rsplit_none_1) + assert_equal(res_split_none_10, exp_split_none_full) + assert_equal(res_rsplit_none_10, exp_rsplit_none_full) + + +def test_split_comma(dtype): + values = xr.DataArray( + [ + ["abc,def", "spam,,eggs,swallow", "red_blue"], + ["test0,test1,test2,test3", "", "abra,ka,da,bra"], + ], + dims=["X", "Y"], + ).astype(dtype) + + exp_split_dim_full = [ + [ + ["abc", "def", "", ""], + ["spam", "", "eggs", "swallow"], + ["red_blue", "", "", ""], + ], + [ + ["test0", "test1", "test2", "test3"], + ["", "", "", ""], + ["abra", "ka", "da", "bra"], + ], + ] + + exp_rsplit_dim_full = [ + [ + ["", "", "abc", "def"], + ["spam", "", "eggs", "swallow"], + ["", "", "", "red_blue"], + ], + [ + ["test0", "test1", "test2", "test3"], + ["", "", "", ""], + ["abra", "ka", "da", "bra"], + ], + ] + + exp_split_dim_1 = [ + [["abc", "def"], ["spam", ",eggs,swallow"], ["red_blue", ""]], + [["test0", "test1,test2,test3"], ["", ""], ["abra", "ka,da,bra"]], + ] + + exp_rsplit_dim_1 = [ + [["abc", "def"], ["spam,,eggs", "swallow"], ["", "red_blue"]], + [["test0,test1,test2", "test3"], ["", ""], ["abra,ka,da", "bra"]], + ] + + exp_split_none_full = [ + [["abc", "def"], ["spam", "", "eggs", "swallow"], ["red_blue"]], + [["test0", "test1", "test2", "test3"], [""], ["abra", "ka", "da", "bra"]], + ] + + exp_rsplit_none_full = [ + [["abc", "def"], ["spam", "", "eggs", "swallow"], ["red_blue"]], + [["test0", "test1", "test2", "test3"], [""], ["abra", "ka", "da", "bra"]], + ] + + exp_split_none_1 = [ + [["abc", "def"], ["spam", ",eggs,swallow"], ["red_blue"]], + [["test0", "test1,test2,test3"], [""], ["abra", "ka,da,bra"]], + ] + + exp_rsplit_none_1 = [ + [["abc", "def"], ["spam,,eggs", "swallow"], ["red_blue"]], + [["test0,test1,test2", "test3"], [""], ["abra,ka,da", "bra"]], + ] + + exp_split_none_full = [ + [[dtype(x) for x in y] for y in z] for z in exp_split_none_full + ] + exp_rsplit_none_full = [ + [[dtype(x) for x in y] for y in z] for z in exp_rsplit_none_full + ] + exp_split_none_1 = [[[dtype(x) for x in y] for y in z] for z in exp_split_none_1] + exp_rsplit_none_1 = [[[dtype(x) for x in y] for y in z] for z in exp_rsplit_none_1] + + exp_split_none_full = np.array(exp_split_none_full, dtype=np.object_) + exp_rsplit_none_full = np.array(exp_rsplit_none_full, dtype=np.object_) + exp_split_none_1 = np.array(exp_split_none_1, dtype=np.object_) + exp_rsplit_none_1 = np.array(exp_rsplit_none_1, dtype=np.object_) + + exp_split_dim_full = xr.DataArray(exp_split_dim_full, dims=["X", "Y", "ZZ"]).astype( + dtype + ) + exp_rsplit_dim_full = xr.DataArray( + exp_rsplit_dim_full, dims=["X", "Y", "ZZ"] + ).astype(dtype) + exp_split_dim_1 = xr.DataArray(exp_split_dim_1, dims=["X", "Y", "ZZ"]).astype(dtype) + exp_rsplit_dim_1 = xr.DataArray(exp_rsplit_dim_1, dims=["X", "Y", "ZZ"]).astype( + dtype + ) + + exp_split_none_full = xr.DataArray(exp_split_none_full, dims=["X", "Y"]) + exp_rsplit_none_full = xr.DataArray(exp_rsplit_none_full, dims=["X", "Y"]) + exp_split_none_1 = xr.DataArray(exp_split_none_1, dims=["X", "Y"]) + exp_rsplit_none_1 = xr.DataArray(exp_rsplit_none_1, dims=["X", "Y"]) + + 
res_split_dim_full = values.str.split(sep=",", dim="ZZ")
+    res_rsplit_dim_full = values.str.rsplit(sep=",", dim="ZZ")
+    res_split_dim_1 = values.str.split(sep=",", dim="ZZ", maxsplit=1)
+    res_rsplit_dim_1 = values.str.rsplit(sep=",", dim="ZZ", maxsplit=1)
+    res_split_dim_10 = values.str.split(sep=",", dim="ZZ", maxsplit=10)
+    res_rsplit_dim_10 = values.str.rsplit(sep=",", dim="ZZ", maxsplit=10)
+
+    res_split_none_full = values.str.split(sep=",", dim=None)
+    res_rsplit_none_full = values.str.rsplit(sep=",", dim=None)
+    res_split_none_1 = values.str.split(sep=",", dim=None, maxsplit=1)
+    res_rsplit_none_1 = values.str.rsplit(sep=",", dim=None, maxsplit=1)
+    res_split_none_10 = values.str.split(sep=",", dim=None, maxsplit=10)
+    res_rsplit_none_10 = values.str.rsplit(sep=",", dim=None, maxsplit=10)
+
+    assert res_split_dim_full.dtype == exp_split_dim_full.dtype
+    assert res_rsplit_dim_full.dtype == exp_rsplit_dim_full.dtype
+    assert res_split_dim_1.dtype == exp_split_dim_1.dtype
+    assert res_rsplit_dim_1.dtype == exp_rsplit_dim_1.dtype
+    assert res_split_dim_10.dtype == exp_split_dim_full.dtype
+    assert res_rsplit_dim_10.dtype == exp_rsplit_dim_full.dtype
+
+    assert res_split_none_full.dtype == exp_split_none_full.dtype
+    assert res_rsplit_none_full.dtype == exp_rsplit_none_full.dtype
+    assert res_split_none_1.dtype == exp_split_none_1.dtype
+    assert res_rsplit_none_1.dtype == exp_rsplit_none_1.dtype
+    assert res_split_none_10.dtype == exp_split_none_full.dtype
+    assert res_rsplit_none_10.dtype == exp_rsplit_none_full.dtype
+
+    assert_equal(res_split_dim_full, exp_split_dim_full)
+    assert_equal(res_rsplit_dim_full, exp_rsplit_dim_full)
+    assert_equal(res_split_dim_1, exp_split_dim_1)
+    assert_equal(res_rsplit_dim_1, exp_rsplit_dim_1)
+    assert_equal(res_split_dim_10, exp_split_dim_full)
+    assert_equal(res_rsplit_dim_10, exp_rsplit_dim_full)
+
+    assert_equal(res_split_none_full, exp_split_none_full)
+    assert_equal(res_rsplit_none_full, exp_rsplit_none_full)
+    assert_equal(res_split_none_1, exp_split_none_1)
+    assert_equal(res_rsplit_none_1, exp_rsplit_none_1)
+    assert_equal(res_split_none_10, exp_split_none_full)
+    assert_equal(res_rsplit_none_10, exp_rsplit_none_full)
+
+
+def test_splitters_broadcast(dtype):
+    values = xr.DataArray(
+        ["ab cd,de fg", "spam, ,eggs swallow", "red_blue"],
+        dims=["X"],
+    ).astype(dtype)
+
+    sep = xr.DataArray(
+        [" ", ","],
+        dims=["Y"],
+    ).astype(dtype)
+
+    expected_left = xr.DataArray(
+        [
+            [["ab", "cd,de fg"], ["ab cd", "de fg"]],
+            [["spam,", ",eggs swallow"], ["spam", " ,eggs swallow"]],
+            [["red_blue", ""], ["red_blue", ""]],
+        ],
+        dims=["X", "Y", "ZZ"],
+    ).astype(dtype)
+    expected_right = xr.DataArray(
+        [
+            [["ab cd,de", "fg"], ["ab cd", "de fg"]],
+            [["spam, ,eggs", "swallow"], ["spam, ", "eggs swallow"]],
+            [["", "red_blue"], ["", "red_blue"]],
+        ],
+        dims=["X", "Y", "ZZ"],
+    ).astype(dtype)
+
+    res_left = values.str.split(dim="ZZ", sep=sep, maxsplit=1)
+    res_right = values.str.rsplit(dim="ZZ", sep=sep, maxsplit=1)
+
+    # assert res_left.dtype == expected_left.dtype
+    # assert res_right.dtype == expected_right.dtype
+
+    assert_equal(res_left, expected_left)
+    assert_equal(res_right, expected_right)
+
+    expected_left = xr.DataArray(
+        [
+            [["ab", " ", "cd,de fg"], ["ab cd", ",", "de fg"]],
+            [["spam,", " ", ",eggs swallow"], ["spam", ",", " ,eggs swallow"]],
+            [["red_blue", "", ""], ["red_blue", "", ""]],
+        ],
+        dims=["X", "Y", "ZZ"],
+    ).astype(dtype)
+    expected_right = xr.DataArray(
+        [
+            [["ab cd,de", " ", "fg"], ["ab cd", ",", "de fg"]],
+            [["spam, ,eggs", " ", "swallow"], ["spam, ", ",", "eggs swallow"]],
+            [["", "", "red_blue"], ["", "", "red_blue"]],
+        ],
+        dims=["X", "Y", "ZZ"],
+    ).astype(dtype)
+
+    res_left = values.str.partition(dim="ZZ", sep=sep)
+    res_right = values.str.rpartition(dim="ZZ", sep=sep)
+
+    # assert res_left.dtype == expected_left.dtype
+    # assert res_right.dtype == expected_right.dtype
+
+    assert_equal(res_left, expected_left)
+    assert_equal(res_right, expected_right)
+
+
+def test_split_empty(dtype):
+    values = xr.DataArray([], dims=["X"]).astype(dtype)
+    expected = xr.DataArray(np.zeros((0, 0)), dims=["X", "ZZ"]).astype(dtype)
+
+    res = values.str.split(sep=", ", dim="ZZ")
+
+    assert res.dtype == expected.dtype
+    assert_equal(res, expected)
+
+
+def test_get_dummies(dtype):
+    values_line = xr.DataArray(
+        [["a|ab~abc|abc", "ab", "a||abc|abcd"], ["abcd|ab|a", "abc|ab~abc", "|a"]],
+        dims=["X", "Y"],
+    ).astype(dtype)
+    values_comma = xr.DataArray(
+        [["a~ab|abc~~abc", "ab", "a~abc~abcd"], ["abcd~ab~a", "abc~ab|abc", "~a"]],
+        dims=["X", "Y"],
+    ).astype(dtype)
+
+    vals_line = np.array(["a", "ab", "abc", "abcd", "ab~abc"]).astype(dtype)
+    vals_comma = np.array(["a", "ab", "abc", "abcd", "ab|abc"]).astype(dtype)
+    expected = [
+        [
+            [True, False, True, False, True],
+            [False, True, False, False, False],
+            [True, False, True, True, False],
+        ],
+        [
+            [True, True, False, True, False],
+            [False, False, True, False, True],
+            [True, False, False, False, False],
+        ],
+    ]
+    expected = np.array(expected)
+    expected = xr.DataArray(expected, dims=["X", "Y", "ZZ"])
+    targ_line = expected.copy()
+    targ_comma = expected.copy()
+    targ_line.coords["ZZ"] = vals_line
+    targ_comma.coords["ZZ"] = vals_comma
+
+    res_default = values_line.str.get_dummies(dim="ZZ")
+    res_line = values_line.str.get_dummies(dim="ZZ", sep="|")
+    res_comma = values_comma.str.get_dummies(dim="ZZ", sep="~")
+
+    assert res_default.dtype == targ_line.dtype
+    assert res_line.dtype == targ_line.dtype
+    assert res_comma.dtype == targ_comma.dtype
+
+    assert_equal(res_default, targ_line)
+    assert_equal(res_line, targ_line)
+    assert_equal(res_comma, targ_comma)
+
+
+def test_get_dummies_broadcast(dtype):
+    values = xr.DataArray(
+        ["x~x|x~x", "x", "x|x~x", "x~x"],
+        dims=["X"],
+    ).astype(dtype)
+
+    sep = xr.DataArray(
+        ["|", "~"],
+        dims=["Y"],
+    ).astype(dtype)
+
+    expected = [
+        [[False, False, True], [True, True, False]],
+        [[True, False, False], [True, False, False]],
+        [[True, False, True], [True, True, False]],
+        [[False, False, True], [True, False, False]],
+    ]
+    expected = np.array(expected)
+    expected = xr.DataArray(expected, dims=["X", "Y", "ZZ"])
+    expected.coords["ZZ"] = np.array(["x", "x|x", "x~x"]).astype(dtype)
+
+    res = values.str.get_dummies(dim="ZZ", sep=sep)
+
+    assert res.dtype == expected.dtype
+
+    assert_equal(res, expected)
+
+
+def test_get_dummies_empty(dtype):
+    values = xr.DataArray([], dims=["X"]).astype(dtype)
+    expected = xr.DataArray(np.zeros((0, 0)), dims=["X", "ZZ"]).astype(dtype)
+
+    res = values.str.get_dummies(dim="ZZ")
+
+    assert res.dtype == expected.dtype
+    assert_equal(res, expected)
+
+
+def test_splitters_empty_str(dtype):
+    values = xr.DataArray(
+        [["", "", ""], ["", "", ""]],
+        dims=["X", "Y"],
+    ).astype(dtype)
+
+    targ_partition_dim = xr.DataArray(
+        [
+            [["", "", ""], ["", "", ""], ["", "", ""]],
+            [["", "", ""], ["", "", ""], ["", "", ""]],
+        ],
+        dims=["X", "Y", "ZZ"],
+    ).astype(dtype)
+
+    targ_partition_none = [
+        [["", "", ""], ["", "", ""], ["", "", ""]],
+        [["", "", ""], ["", "", ""], ["", "", "",
""]], + ] + targ_partition_none = [ + [[dtype(x) for x in y] for y in z] for z in targ_partition_none + ] + targ_partition_none = np.array(targ_partition_none, dtype=np.object_) + del targ_partition_none[-1, -1][-1] + targ_partition_none = xr.DataArray( + targ_partition_none, + dims=["X", "Y"], + ) + + targ_split_dim = xr.DataArray( + [[[""], [""], [""]], [[""], [""], [""]]], + dims=["X", "Y", "ZZ"], + ).astype(dtype) + targ_split_none = xr.DataArray( + np.array([[[], [], []], [[], [], [""]]], dtype=np.object_), + dims=["X", "Y"], + ) + del targ_split_none.data[-1, -1][-1] + + res_partition_dim = values.str.partition(dim="ZZ") + res_rpartition_dim = values.str.rpartition(dim="ZZ") + res_partition_none = values.str.partition(dim=None) + res_rpartition_none = values.str.rpartition(dim=None) + + res_split_dim = values.str.split(dim="ZZ") + res_rsplit_dim = values.str.rsplit(dim="ZZ") + res_split_none = values.str.split(dim=None) + res_rsplit_none = values.str.rsplit(dim=None) + + res_dummies = values.str.rsplit(dim="ZZ") + + assert res_partition_dim.dtype == targ_partition_dim.dtype + assert res_rpartition_dim.dtype == targ_partition_dim.dtype + assert res_partition_none.dtype == targ_partition_none.dtype + assert res_rpartition_none.dtype == targ_partition_none.dtype + + assert res_split_dim.dtype == targ_split_dim.dtype + assert res_rsplit_dim.dtype == targ_split_dim.dtype + assert res_split_none.dtype == targ_split_none.dtype + assert res_rsplit_none.dtype == targ_split_none.dtype + + assert res_dummies.dtype == targ_split_dim.dtype + + assert_equal(res_partition_dim, targ_partition_dim) + assert_equal(res_rpartition_dim, targ_partition_dim) + assert_equal(res_partition_none, targ_partition_none) + assert_equal(res_rpartition_none, targ_partition_none) + + assert_equal(res_split_dim, targ_split_dim) + assert_equal(res_rsplit_dim, targ_split_dim) + assert_equal(res_split_none, targ_split_none) + assert_equal(res_rsplit_none, targ_split_none) + + assert_equal(res_dummies, targ_split_dim) + + +def test_cat_str(dtype): + values_1 = xr.DataArray( + [["a", "bb", "cccc"], ["ddddd", "eeee", "fff"]], + dims=["X", "Y"], + ).astype(dtype) + values_2 = "111" + + targ_blank = xr.DataArray( + [["a111", "bb111", "cccc111"], ["ddddd111", "eeee111", "fff111"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_space = xr.DataArray( + [["a 111", "bb 111", "cccc 111"], ["ddddd 111", "eeee 111", "fff 111"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_bars = xr.DataArray( + [["a||111", "bb||111", "cccc||111"], ["ddddd||111", "eeee||111", "fff||111"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_comma = xr.DataArray( + [["a, 111", "bb, 111", "cccc, 111"], ["ddddd, 111", "eeee, 111", "fff, 111"]], + dims=["X", "Y"], + ).astype(dtype) + + res_blank = values_1.str.cat(values_2) + res_add = values_1.str + values_2 + res_space = values_1.str.cat(values_2, sep=" ") + res_bars = values_1.str.cat(values_2, sep="||") + res_comma = values_1.str.cat(values_2, sep=", ") + + assert res_blank.dtype == targ_blank.dtype + assert res_add.dtype == targ_blank.dtype + assert res_space.dtype == targ_space.dtype + assert res_bars.dtype == targ_bars.dtype + assert res_comma.dtype == targ_comma.dtype + + assert_equal(res_blank, targ_blank) + assert_equal(res_add, targ_blank) + assert_equal(res_space, targ_space) + assert_equal(res_bars, targ_bars) + assert_equal(res_comma, targ_comma) + + +def test_cat_uniform(dtype): + values_1 = xr.DataArray( + [["a", "bb", "cccc"], ["ddddd", "eeee", "fff"]], + dims=["X", "Y"], + ).astype(dtype) + 
values_2 = xr.DataArray( + [["11111", "222", "33"], ["4", "5555", "66"]], + dims=["X", "Y"], + ) + + targ_blank = xr.DataArray( + [["a11111", "bb222", "cccc33"], ["ddddd4", "eeee5555", "fff66"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_space = xr.DataArray( + [["a 11111", "bb 222", "cccc 33"], ["ddddd 4", "eeee 5555", "fff 66"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_bars = xr.DataArray( + [["a||11111", "bb||222", "cccc||33"], ["ddddd||4", "eeee||5555", "fff||66"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_comma = xr.DataArray( + [["a, 11111", "bb, 222", "cccc, 33"], ["ddddd, 4", "eeee, 5555", "fff, 66"]], + dims=["X", "Y"], + ).astype(dtype) + + res_blank = values_1.str.cat(values_2) + res_add = values_1.str + values_2 + res_space = values_1.str.cat(values_2, sep=" ") + res_bars = values_1.str.cat(values_2, sep="||") + res_comma = values_1.str.cat(values_2, sep=", ") + + assert res_blank.dtype == targ_blank.dtype + assert res_add.dtype == targ_blank.dtype + assert res_space.dtype == targ_space.dtype + assert res_bars.dtype == targ_bars.dtype + assert res_comma.dtype == targ_comma.dtype + + assert_equal(res_blank, targ_blank) + assert_equal(res_add, targ_blank) + assert_equal(res_space, targ_space) + assert_equal(res_bars, targ_bars) + assert_equal(res_comma, targ_comma) + + +def test_cat_broadcast_right(dtype): + values_1 = xr.DataArray( + [["a", "bb", "cccc"], ["ddddd", "eeee", "fff"]], + dims=["X", "Y"], + ).astype(dtype) + values_2 = xr.DataArray( + ["11111", "222", "33"], + dims=["Y"], + ) + + targ_blank = xr.DataArray( + [["a11111", "bb222", "cccc33"], ["ddddd11111", "eeee222", "fff33"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_space = xr.DataArray( + [["a 11111", "bb 222", "cccc 33"], ["ddddd 11111", "eeee 222", "fff 33"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_bars = xr.DataArray( + [["a||11111", "bb||222", "cccc||33"], ["ddddd||11111", "eeee||222", "fff||33"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_comma = xr.DataArray( + [["a, 11111", "bb, 222", "cccc, 33"], ["ddddd, 11111", "eeee, 222", "fff, 33"]], + dims=["X", "Y"], + ).astype(dtype) + + res_blank = values_1.str.cat(values_2) + res_add = values_1.str + values_2 + res_space = values_1.str.cat(values_2, sep=" ") + res_bars = values_1.str.cat(values_2, sep="||") + res_comma = values_1.str.cat(values_2, sep=", ") + + assert res_blank.dtype == targ_blank.dtype + assert res_add.dtype == targ_blank.dtype + assert res_space.dtype == targ_space.dtype + assert res_bars.dtype == targ_bars.dtype + assert res_comma.dtype == targ_comma.dtype + + assert_equal(res_blank, targ_blank) + assert_equal(res_add, targ_blank) + assert_equal(res_space, targ_space) + assert_equal(res_bars, targ_bars) + assert_equal(res_comma, targ_comma) + + +def test_cat_broadcast_left(dtype): + values_1 = xr.DataArray( + ["a", "bb", "cccc"], + dims=["Y"], + ).astype(dtype) + values_2 = xr.DataArray( + [["11111", "222", "33"], ["4", "5555", "66"]], + dims=["X", "Y"], + ) + + targ_blank = ( + xr.DataArray( + [["a11111", "bb222", "cccc33"], ["a4", "bb5555", "cccc66"]], + dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + targ_space = ( + xr.DataArray( + [["a 11111", "bb 222", "cccc 33"], ["a 4", "bb 5555", "cccc 66"]], + dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + targ_bars = ( + xr.DataArray( + [["a||11111", "bb||222", "cccc||33"], ["a||4", "bb||5555", "cccc||66"]], + dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + targ_comma = ( + xr.DataArray( + [["a, 11111", "bb, 222", "cccc, 33"], ["a, 4", "bb, 5555", "cccc, 66"]], 
+ dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + res_blank = values_1.str.cat(values_2) + res_add = values_1.str + values_2 + res_space = values_1.str.cat(values_2, sep=" ") + res_bars = values_1.str.cat(values_2, sep="||") + res_comma = values_1.str.cat(values_2, sep=", ") + + assert res_blank.dtype == targ_blank.dtype + assert res_add.dtype == targ_blank.dtype + assert res_space.dtype == targ_space.dtype + assert res_bars.dtype == targ_bars.dtype + assert res_comma.dtype == targ_comma.dtype + + assert_equal(res_blank, targ_blank) + assert_equal(res_add, targ_blank) + assert_equal(res_space, targ_space) + assert_equal(res_bars, targ_bars) + assert_equal(res_comma, targ_comma) + + +def test_cat_broadcast_both(dtype): + values_1 = xr.DataArray( + ["a", "bb", "cccc"], + dims=["Y"], + ).astype(dtype) + values_2 = xr.DataArray( + ["11111", "4"], + dims=["X"], + ) + + targ_blank = ( + xr.DataArray( + [["a11111", "bb11111", "cccc11111"], ["a4", "bb4", "cccc4"]], + dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + targ_space = ( + xr.DataArray( + [["a 11111", "bb 11111", "cccc 11111"], ["a 4", "bb 4", "cccc 4"]], + dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + targ_bars = ( + xr.DataArray( + [["a||11111", "bb||11111", "cccc||11111"], ["a||4", "bb||4", "cccc||4"]], + dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + targ_comma = ( + xr.DataArray( + [["a, 11111", "bb, 11111", "cccc, 11111"], ["a, 4", "bb, 4", "cccc, 4"]], + dims=["X", "Y"], + ) + .astype(dtype) + .T + ) + + res_blank = values_1.str.cat(values_2) + res_add = values_1.str + values_2 + res_space = values_1.str.cat(values_2, sep=" ") + res_bars = values_1.str.cat(values_2, sep="||") + res_comma = values_1.str.cat(values_2, sep=", ") + + assert res_blank.dtype == targ_blank.dtype + assert res_add.dtype == targ_blank.dtype + assert res_space.dtype == targ_space.dtype + assert res_bars.dtype == targ_bars.dtype + assert res_comma.dtype == targ_comma.dtype + + assert_equal(res_blank, targ_blank) + assert_equal(res_add, targ_blank) + assert_equal(res_space, targ_space) + assert_equal(res_bars, targ_bars) + assert_equal(res_comma, targ_comma) + + +def test_cat_multi(): + values_1 = xr.DataArray( + ["11111", "4"], + dims=["X"], + ) + + values_2 = xr.DataArray( + ["a", "bb", "cccc"], + dims=["Y"], + ).astype(np.bytes_) + + values_3 = np.array(3.4) + + values_4 = "" + + values_5 = np.array("", dtype=np.unicode_) + + sep = xr.DataArray( + [" ", ", "], + dims=["ZZ"], + ).astype(np.unicode_) + + expected = xr.DataArray( + [ + [ + ["11111 a 3.4 ", "11111, a, 3.4, , "], + ["11111 bb 3.4 ", "11111, bb, 3.4, , "], + ["11111 cccc 3.4 ", "11111, cccc, 3.4, , "], + ], + [ + ["4 a 3.4 ", "4, a, 3.4, , "], + ["4 bb 3.4 ", "4, bb, 3.4, , "], + ["4 cccc 3.4 ", "4, cccc, 3.4, , "], + ], + ], + dims=["X", "Y", "ZZ"], + ).astype(np.unicode_) + + res = values_1.str.cat(values_2, values_3, values_4, values_5, sep=sep) + + assert res.dtype == expected.dtype + assert_equal(res, expected) + + +def test_join_scalar(dtype): + values = xr.DataArray("aaa").astype(dtype) + + targ = xr.DataArray("aaa").astype(dtype) + + res_blank = values.str.join() + res_space = values.str.join(sep=" ") + + assert res_blank.dtype == targ.dtype + assert res_space.dtype == targ.dtype + + assert_identical(res_blank, targ) + assert_identical(res_space, targ) + + +def test_join_vector(dtype): + values = xr.DataArray( + ["a", "bb", "cccc"], + dims=["Y"], + ).astype(dtype) + + targ_blank = xr.DataArray("abbcccc").astype(dtype) + targ_space = xr.DataArray("a bb cccc").astype(dtype) + + 
res_blank_none = values.str.join() + res_blank_y = values.str.join(dim="Y") + + res_space_none = values.str.join(sep=" ") + res_space_y = values.str.join(dim="Y", sep=" ") + + assert res_blank_none.dtype == targ_blank.dtype + assert res_blank_y.dtype == targ_blank.dtype + assert res_space_none.dtype == targ_space.dtype + assert res_space_y.dtype == targ_space.dtype + + assert_identical(res_blank_none, targ_blank) + assert_identical(res_blank_y, targ_blank) + assert_identical(res_space_none, targ_space) + assert_identical(res_space_y, targ_space) + + +def test_join_2d(dtype): + values = xr.DataArray( + [["a", "bb", "cccc"], ["ddddd", "eeee", "fff"]], + dims=["X", "Y"], + ).astype(dtype) + + targ_blank_x = xr.DataArray( + ["addddd", "bbeeee", "ccccfff"], + dims=["Y"], + ).astype(dtype) + targ_space_x = xr.DataArray( + ["a ddddd", "bb eeee", "cccc fff"], + dims=["Y"], + ).astype(dtype) + + targ_blank_y = xr.DataArray( + ["abbcccc", "dddddeeeefff"], + dims=["X"], + ).astype(dtype) + targ_space_y = xr.DataArray( + ["a bb cccc", "ddddd eeee fff"], + dims=["X"], + ).astype(dtype) + + res_blank_x = values.str.join(dim="X") + res_blank_y = values.str.join(dim="Y") + + res_space_x = values.str.join(dim="X", sep=" ") + res_space_y = values.str.join(dim="Y", sep=" ") + + assert res_blank_x.dtype == targ_blank_x.dtype + assert res_blank_y.dtype == targ_blank_y.dtype + assert res_space_x.dtype == targ_space_x.dtype + assert res_space_y.dtype == targ_space_y.dtype + + assert_identical(res_blank_x, targ_blank_x) + assert_identical(res_blank_y, targ_blank_y) + assert_identical(res_space_x, targ_space_x) + assert_identical(res_space_y, targ_space_y) + + with pytest.raises( + ValueError, match="Dimension must be specified for multidimensional arrays." + ): + values.str.join() + + +def test_join_broadcast(dtype): + values = xr.DataArray( + ["a", "bb", "cccc"], + dims=["X"], + ).astype(dtype) + + sep = xr.DataArray( + [" ", ", "], + dims=["ZZ"], + ).astype(dtype) + + expected = xr.DataArray( + ["a bb cccc", "a, bb, cccc"], + dims=["ZZ"], + ).astype(dtype) + + res = values.str.join(sep=sep) + + assert res.dtype == expected.dtype + assert_identical(res, expected) + + +def test_format_scalar(): + values = xr.DataArray( + ["{}.{Y}.{ZZ}", "{},{},{X},{X}", "{X}-{Y}-{ZZ}"], + dims=["X"], + ).astype(np.unicode_) + + pos0 = 1 + pos1 = 1.2 + pos2 = "2.3" + X = "'test'" + Y = "X" + ZZ = None + W = "NO!" + + expected = xr.DataArray( + ["1.X.None", "1,1.2,'test','test'", "'test'-X-None"], + dims=["X"], + ).astype(np.unicode_) + + res = values.str.format(pos0, pos1, pos2, X=X, Y=Y, ZZ=ZZ, W=W) + + assert res.dtype == expected.dtype + assert_equal(res, expected) + + +def test_format_broadcast(): + values = xr.DataArray( + ["{}.{Y}.{ZZ}", "{},{},{X},{X}", "{X}-{Y}-{ZZ}"], + dims=["X"], + ).astype(np.unicode_) + + pos0 = 1 + pos1 = 1.2 + + pos2 = xr.DataArray( + ["2.3", "3.44444"], + dims=["YY"], + ) + + X = "'test'" + Y = "X" + ZZ = None + W = "NO!" 
+ + expected = xr.DataArray( + [ + ["1.X.None", "1.X.None"], + ["1,1.2,'test','test'", "1,1.2,'test','test'"], + ["'test'-X-None", "'test'-X-None"], + ], + dims=["X", "YY"], + ).astype(np.unicode_) + + res = values.str.format(pos0, pos1, pos2, X=X, Y=Y, ZZ=ZZ, W=W) + + assert res.dtype == expected.dtype + assert_equal(res, expected) + + +def test_mod_scalar(): + values = xr.DataArray( + ["%s.%s.%s", "%s,%s,%s", "%s-%s-%s"], + dims=["X"], + ).astype(np.unicode_) + + pos0 = 1 + pos1 = 1.2 + pos2 = "2.3" + + expected = xr.DataArray( + ["1.1.2.2.3", "1,1.2,2.3", "1-1.2-2.3"], + dims=["X"], + ).astype(np.unicode_) + + res = values.str % (pos0, pos1, pos2) + + assert res.dtype == expected.dtype + assert_equal(res, expected) + + +def test_mod_dict(): + values = xr.DataArray( + ["%(a)s.%(a)s.%(b)s", "%(b)s,%(c)s,%(b)s", "%(c)s-%(b)s-%(a)s"], + dims=["X"], + ).astype(np.unicode_) + + a = 1 + b = 1.2 + c = "2.3" + + expected = xr.DataArray( + ["1.1.1.2", "1.2,2.3,1.2", "2.3-1.2-1"], + dims=["X"], + ).astype(np.unicode_) + + res = values.str % {"a": a, "b": b, "c": c} + + assert res.dtype == expected.dtype + assert_equal(res, expected) + + +def test_mod_broadcast_single(): + values = xr.DataArray( + ["%s_1", "%s_2", "%s_3"], + dims=["X"], + ).astype(np.unicode_) + + pos = xr.DataArray( + ["2.3", "3.44444"], + dims=["YY"], + ) + + expected = xr.DataArray( + [["2.3_1", "3.44444_1"], ["2.3_2", "3.44444_2"], ["2.3_3", "3.44444_3"]], + dims=["X", "YY"], + ).astype(np.unicode_) + + res = values.str % pos + + assert res.dtype == expected.dtype + assert_equal(res, expected) + + +def test_mod_broadcast_multi(): + values = xr.DataArray( + ["%s.%s.%s", "%s,%s,%s", "%s-%s-%s"], + dims=["X"], + ).astype(np.unicode_) + + pos0 = 1 + pos1 = 1.2 + + pos2 = xr.DataArray( + ["2.3", "3.44444"], + dims=["YY"], + ) + + expected = xr.DataArray( + [ + ["1.1.2.2.3", "1.1.2.3.44444"], + ["1,1.2,2.3", "1,1.2,3.44444"], + ["1-1.2-2.3", "1-1.2-3.44444"], + ], + dims=["X", "YY"], + ).astype(np.unicode_) + + res = values.str % (pos0, pos1, pos2) + + assert res.dtype == expected.dtype + assert_equal(res, expected) diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index d15736e608d..f6c00a2a9a9 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -2541,6 +2541,14 @@ def test_complex(self, invalid_netcdf, warntype, num_warns): assert recorded_num_warns == num_warns + def test_numpy_bool_(self): + # h5netcdf loads booleans as numpy.bool_, this type needs to be supported + # when writing invalid_netcdf datasets in order to support a roundtrip + expected = Dataset({"x": ("y", np.ones(5), {"numpy_bool": np.bool_(True)})}) + save_kwargs = {"invalid_netcdf": True} + with self.roundtrip(expected, save_kwargs=save_kwargs) as actual: + assert_identical(expected, actual) + def test_cross_engine_read_write_netcdf4(self): # Drop dim3, because its labels include strings. These appear to be # not properly read with python-netCDF4, which converts them into diff --git a/xarray/tests/test_backends_api.py b/xarray/tests/test_backends_api.py index d19f5aab585..340495d4564 100644 --- a/xarray/tests/test_backends_api.py +++ b/xarray/tests/test_backends_api.py @@ -1,5 +1,3 @@ -import pytest - from xarray.backends.api import _get_default_engine from . 
import requires_netCDF4, requires_scipy @@ -14,8 +12,5 @@ def test__get_default_engine(): engine_gz = _get_default_engine("/example.gz") assert engine_gz == "scipy" - with pytest.raises(ValueError): - _get_default_engine("/example.grib") - engine_default = _get_default_engine("/example") assert engine_default == "netcdf4" diff --git a/xarray/tests/test_coding.py b/xarray/tests/test_coding.py index e0df7782aa7..839f2fd1f2e 100644 --- a/xarray/tests/test_coding.py +++ b/xarray/tests/test_coding.py @@ -117,3 +117,31 @@ def test_scaling_offset_as_list(scale_factor, add_offset): encoded = coder.encode(original) roundtripped = coder.decode(encoded) assert_allclose(original, roundtripped) + + +@pytest.mark.parametrize("bits", [1, 2, 4, 8]) +def test_decode_unsigned_from_signed(bits): + unsigned_dtype = np.dtype(f"u{bits}") + signed_dtype = np.dtype(f"i{bits}") + original_values = np.array([np.iinfo(unsigned_dtype).max], dtype=unsigned_dtype) + encoded = xr.Variable( + ("x",), original_values.astype(signed_dtype), attrs={"_Unsigned": "true"} + ) + coder = variables.UnsignedIntegerCoder() + decoded = coder.decode(encoded) + assert decoded.dtype == unsigned_dtype + assert decoded.values == original_values + + +@pytest.mark.parametrize("bits", [1, 2, 4, 8]) +def test_decode_signed_from_unsigned(bits): + unsigned_dtype = np.dtype(f"u{bits}") + signed_dtype = np.dtype(f"i{bits}") + original_values = np.array([-1], dtype=signed_dtype) + encoded = xr.Variable( + ("x",), original_values.astype(unsigned_dtype), attrs={"_Unsigned": "false"} + ) + coder = variables.UnsignedIntegerCoder() + decoded = coder.decode(encoded) + assert decoded.dtype == signed_dtype + assert decoded.values == original_values diff --git a/xarray/tests/test_coding_times.py b/xarray/tests/test_coding_times.py index eda32d31148..ad0ff9249fc 100644 --- a/xarray/tests/test_coding_times.py +++ b/xarray/tests/test_coding_times.py @@ -79,6 +79,9 @@ (0, "microseconds since 2000-01-01T00:00:00"), (np.int32(788961600), "seconds since 1981-01-01"), # GH2002 (12300 + np.arange(5), "hour since 1680-01-01 00:00:00.500000"), + (164375, "days since 1850-01-01 00:00:00"), + (164374.5, "days since 1850-01-01 00:00:00"), + ([164374.5, 168360.5], "days since 1850-01-01 00:00:00"), ] _CF_DATETIME_TESTS = [ num_dates_units + (calendar,) diff --git a/xarray/tests/test_dataarray.py b/xarray/tests/test_dataarray.py index 259e91083a7..c38c3656eaf 100644 --- a/xarray/tests/test_dataarray.py +++ b/xarray/tests/test_dataarray.py @@ -7,6 +7,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.computation.ops import UndefinedVariableError from pandas.tseries.frequencies import to_offset import xarray as xr @@ -39,6 +40,7 @@ requires_dask, requires_iris, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, source_ndarray, @@ -4620,6 +4622,74 @@ def test_pad_reflect(self, mode, reflect_type): assert actual.shape == (7, 4, 9) assert_identical(actual, expected) + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.random.choice(["foo", "bar", "baz"], size=30, replace=True).astype( + object 
+ ) + if backend == "numpy": + aa = DataArray(data=a, dims=["x"], name="a") + bb = DataArray(data=b, dims=["x"], name="b") + cc = DataArray(data=c, dims=["y"], name="c") + dd = DataArray(data=d, dims=["z"], name="d") + + elif backend == "dask": + import dask.array as da + + aa = DataArray(data=da.from_array(a, chunks=3), dims=["x"], name="a") + bb = DataArray(data=da.from_array(b, chunks=3), dims=["x"], name="b") + cc = DataArray(data=da.from_array(c, chunks=7), dims=["y"], name="c") + dd = DataArray(data=da.from_array(d, chunks=12), dims=["z"], name="d") + + # query single dim, single variable + actual = aa.query(x="a > 5", engine=engine, parser=parser) + expect = aa.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = aa.query(dict(x="a > 5"), engine=engine, parser=parser) + expect = aa.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = bb.query(x="b > 50", engine=engine, parser=parser) + expect = bb.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = cc.query(y="c < .5", engine=engine, parser=parser) + expect = cc.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, single string variable + if parser == "pandas": + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = dd.query(z='d == "bar"', engine=engine, parser=parser) + expect = dd.isel(z=(d == "bar")) + assert_identical(expect, actual) + + # test error handling + with pytest.raises(ValueError): + aa.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + aa.query(x=(a > 5)) # must be query string + with pytest.raises(UndefinedVariableError): + aa.query(x="spam > 50") # name not present + class TestReduce: @pytest.fixture(autouse=True) diff --git a/xarray/tests/test_dataset.py b/xarray/tests/test_dataset.py index 13cd03acf96..52df7603034 100644 --- a/xarray/tests/test_dataset.py +++ b/xarray/tests/test_dataset.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pytest +from pandas.core.computation.ops import UndefinedVariableError from pandas.core.indexes.datetimes import DatetimeIndex from pandas.tseries.frequencies import to_offset @@ -45,6 +46,7 @@ requires_cftime, requires_dask, requires_numbagg, + requires_numexpr, requires_scipy, requires_sparse, source_ndarray, @@ -188,7 +190,7 @@ def get_variables(self): def lazy_inaccessible(k, v): if k in self._indexvars: return v - data = indexing.LazilyOuterIndexedArray(InaccessibleArray(v.values)) + data = indexing.LazilyIndexedArray(InaccessibleArray(v.values)) return Variable(v.dims, data, v.attrs) return {k: lazy_inaccessible(k, v) for k, v in self._variables.items()} @@ -5807,6 +5809,135 @@ def test_astype_attrs(self): assert not data.astype(float, keep_attrs=False).attrs assert not data.astype(float, keep_attrs=False).var1.attrs + @pytest.mark.parametrize("parser", ["pandas", "python"]) + @pytest.mark.parametrize( + "engine", ["python", None, pytest.param("numexpr", marks=[requires_numexpr])] + ) + @pytest.mark.parametrize( + "backend", ["numpy", pytest.param("dask", marks=[requires_dask])] + ) + def test_query(self, backend, engine, parser): + """Test querying a dataset.""" + + # setup test data + np.random.seed(42) + a = np.arange(0, 10, 1) + b = np.random.randint(0, 100, size=10) + c = np.linspace(0, 1, 20) + d = np.random.choice(["foo", "bar", "baz"], size=30, 
replace=True).astype( + object + ) + e = np.arange(0, 10 * 20).reshape(10, 20) + f = np.random.normal(0, 1, size=(10, 20, 30)) + if backend == "numpy": + ds = Dataset( + { + "a": ("x", a), + "b": ("x", b), + "c": ("y", c), + "d": ("z", d), + "e": (("x", "y"), e), + "f": (("x", "y", "z"), f), + } + ) + elif backend == "dask": + ds = Dataset( + { + "a": ("x", da.from_array(a, chunks=3)), + "b": ("x", da.from_array(b, chunks=3)), + "c": ("y", da.from_array(c, chunks=7)), + "d": ("z", da.from_array(d, chunks=12)), + "e": (("x", "y"), da.from_array(e, chunks=(3, 7))), + "f": (("x", "y", "z"), da.from_array(f, chunks=(3, 7, 12))), + } + ) + + # query single dim, single variable + actual = ds.query(x="a > 5", engine=engine, parser=parser) + expect = ds.isel(x=(a > 5)) + assert_identical(expect, actual) + + # query single dim, single variable, via dict + actual = ds.query(dict(x="a > 5"), engine=engine, parser=parser) + expect = ds.isel(dict(x=(a > 5))) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(x="b > 50", engine=engine, parser=parser) + expect = ds.isel(x=(b > 50)) + assert_identical(expect, actual) + + # query single dim, single variable + actual = ds.query(y="c < .5", engine=engine, parser=parser) + expect = ds.isel(y=(c < 0.5)) + assert_identical(expect, actual) + + # query single dim, single string variable + if parser == "pandas": + # N.B., this query currently only works with the pandas parser + # xref https://github.com/pandas-dev/pandas/issues/40436 + actual = ds.query(z='d == "bar"', engine=engine, parser=parser) + expect = ds.isel(z=(d == "bar")) + assert_identical(expect, actual) + + # query single dim, multiple variables + actual = ds.query(x="(a > 5) & (b > 50)", engine=engine, parser=parser) + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # query single dim, multiple variables with computation + actual = ds.query(x="(a * b) > 250", engine=engine, parser=parser) + expect = ds.isel(x=(a * b) > 250) + assert_identical(expect, actual) + + # check pandas query syntax is supported + if parser == "pandas": + actual = ds.query(x="(a > 5) and (b > 50)", engine=engine, parser=parser) + expect = ds.isel(x=((a > 5) & (b > 50))) + assert_identical(expect, actual) + + # query multiple dims via kwargs + actual = ds.query(x="a > 5", y="c < .5", engine=engine, parser=parser) + expect = ds.isel(x=(a > 5), y=(c < 0.5)) + assert_identical(expect, actual) + + # query multiple dims via kwargs + if parser == "pandas": + actual = ds.query( + x="a > 5", y="c < .5", z="d == 'bar'", engine=engine, parser=parser + ) + expect = ds.isel(x=(a > 5), y=(c < 0.5), z=(d == "bar")) + assert_identical(expect, actual) + + # query multiple dims via dict + actual = ds.query(dict(x="a > 5", y="c < .5"), engine=engine, parser=parser) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5))) + assert_identical(expect, actual) + + # query multiple dims via dict + if parser == "pandas": + actual = ds.query( + dict(x="a > 5", y="c < .5", z="d == 'bar'"), + engine=engine, + parser=parser, + ) + expect = ds.isel(dict(x=(a > 5), y=(c < 0.5), z=(d == "bar"))) + assert_identical(expect, actual) + + # test error handling + with pytest.raises(ValueError): + ds.query("a > 5") # must be dict or kwargs + with pytest.raises(ValueError): + ds.query(x=(a > 5)) # must be query string + with pytest.raises(IndexError): + ds.query(y="a > 5") # wrong length dimension + with pytest.raises(IndexError): + ds.query(x="c < .5") # wrong length dimension + with 
pytest.raises(IndexError): + ds.query(x="e > 100") # wrong number of dimensions + with pytest.raises(UndefinedVariableError): + ds.query(x="spam > 50") # name not present + # Py.test tests diff --git a/xarray/tests/test_groupby.py b/xarray/tests/test_groupby.py index 85f729a9f7a..5ef7677c499 100644 --- a/xarray/tests/test_groupby.py +++ b/xarray/tests/test_groupby.py @@ -549,4 +549,17 @@ def test_groupby_none_group_name(): assert "group" in mean.dims +def test_groupby_getitem(dataset): + + assert_identical(dataset.sel(x="a"), dataset.groupby("x")["a"]) + assert_identical(dataset.sel(z=1), dataset.groupby("z")[1]) + + assert_identical(dataset.foo.sel(x="a"), dataset.foo.groupby("x")["a"]) + assert_identical(dataset.foo.sel(z=1), dataset.foo.groupby("z")[1]) + + actual = dataset.groupby("boo")["f"].unstack().transpose("x", "y", "z") + expected = dataset.sel(y=[1], z=[1, 2]).transpose("x", "y", "z") + assert_identical(expected, actual) + + # TODO: move other groupby tests from test_dataset and test_dataarray over here diff --git a/xarray/tests/test_indexing.py b/xarray/tests/test_indexing.py index 4ef7536e1f2..10641ff54e9 100644 --- a/xarray/tests/test_indexing.py +++ b/xarray/tests/test_indexing.py @@ -224,7 +224,7 @@ def test_lazily_indexed_array(self): original = np.random.rand(10, 20, 30) x = indexing.NumpyIndexingAdapter(original) v = Variable(["i", "j", "k"], original) - lazy = indexing.LazilyOuterIndexedArray(x) + lazy = indexing.LazilyIndexedArray(x) v_lazy = Variable(["i", "j", "k"], lazy) arr = ReturnItem() # test orthogonally applied indexers @@ -244,9 +244,7 @@ def test_lazily_indexed_array(self): ]: assert expected.shape == actual.shape assert_array_equal(expected, actual) - assert isinstance( - actual._data, indexing.LazilyOuterIndexedArray - ) + assert isinstance(actual._data, indexing.LazilyIndexedArray) # make sure actual.key is appropriate type if all( @@ -282,18 +280,18 @@ def test_lazily_indexed_array(self): actual._data, ( indexing.LazilyVectorizedIndexedArray, - indexing.LazilyOuterIndexedArray, + indexing.LazilyIndexedArray, ), ) - assert isinstance(actual._data, indexing.LazilyOuterIndexedArray) + assert isinstance(actual._data, indexing.LazilyIndexedArray) assert isinstance(actual._data.array, indexing.NumpyIndexingAdapter) def test_vectorized_lazily_indexed_array(self): original = np.random.rand(10, 20, 30) x = indexing.NumpyIndexingAdapter(original) v_eager = Variable(["i", "j", "k"], x) - lazy = indexing.LazilyOuterIndexedArray(x) + lazy = indexing.LazilyIndexedArray(x) v_lazy = Variable(["i", "j", "k"], lazy) arr = ReturnItem() @@ -306,7 +304,7 @@ def check_indexing(v_eager, v_lazy, indexers): actual._data, ( indexing.LazilyVectorizedIndexedArray, - indexing.LazilyOuterIndexedArray, + indexing.LazilyIndexedArray, ), ) assert_array_equal(expected, actual) @@ -364,19 +362,19 @@ def test_index_scalar(self): class TestMemoryCachedArray: def test_wrapper(self): - original = indexing.LazilyOuterIndexedArray(np.arange(10)) + original = indexing.LazilyIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) assert_array_equal(wrapped, np.arange(10)) assert isinstance(wrapped.array, indexing.NumpyIndexingAdapter) def test_sub_array(self): - original = indexing.LazilyOuterIndexedArray(np.arange(10)) + original = indexing.LazilyIndexedArray(np.arange(10)) wrapped = indexing.MemoryCachedArray(original) child = wrapped[B[:5]] assert isinstance(child, indexing.MemoryCachedArray) assert_array_equal(child, np.arange(5)) assert isinstance(child.array, 
indexing.NumpyIndexingAdapter) - assert isinstance(wrapped.array, indexing.LazilyOuterIndexedArray) + assert isinstance(wrapped.array, indexing.LazilyIndexedArray) def test_setitem(self): original = np.arange(10) diff --git a/xarray/tests/test_utils.py b/xarray/tests/test_utils.py index 193c45f01cd..9d278a6cfb6 100644 --- a/xarray/tests/test_utils.py +++ b/xarray/tests/test_utils.py @@ -233,15 +233,6 @@ def test_is_remote_uri(): assert not utils.is_remote_uri("example.nc") -def test_is_grib_path(): - assert not utils.is_grib_path("example.nc") - assert not utils.is_grib_path("example.grib ") - assert utils.is_grib_path("example.grib") - assert utils.is_grib_path("example.grib2") - assert utils.is_grib_path("example.grb") - assert utils.is_grib_path("example.grb2") - - class Test_is_uniform_and_sorted: def test_sorted_uniform(self): assert utils.is_uniform_spaced(np.arange(5)) diff --git a/xarray/tests/test_variable.py b/xarray/tests/test_variable.py index 7cb62b4d85f..9259862c86d 100644 --- a/xarray/tests/test_variable.py +++ b/xarray/tests/test_variable.py @@ -15,7 +15,7 @@ BasicIndexer, CopyOnWriteArray, DaskIndexingAdapter, - LazilyOuterIndexedArray, + LazilyIndexedArray, MemoryCachedArray, NumpyIndexingAdapter, OuterIndexer, @@ -1098,9 +1098,9 @@ def test_repr(self): assert expected == repr(v) def test_repr_lazy_data(self): - v = Variable("x", LazilyOuterIndexedArray(np.arange(2e5))) + v = Variable("x", LazilyIndexedArray(np.arange(2e5))) assert "200000 values with dtype" in repr(v) - assert isinstance(v._data, LazilyOuterIndexedArray) + assert isinstance(v._data, LazilyIndexedArray) def test_detect_indexer_type(self): """ Tests indexer type was correctly detected. """ @@ -2172,7 +2172,7 @@ def test_coarsen_2d(self): class TestAsCompatibleData: def test_unchanged_types(self): - types = (np.asarray, PandasIndexAdapter, LazilyOuterIndexedArray) + types = (np.asarray, PandasIndexAdapter, LazilyIndexedArray) for t in types: for data in [ np.arange(3), @@ -2355,19 +2355,19 @@ def test_NumpyIndexingAdapter(self): dims=("x", "y"), data=NumpyIndexingAdapter(NumpyIndexingAdapter(self.d)) ) - def test_LazilyOuterIndexedArray(self): - v = Variable(dims=("x", "y"), data=LazilyOuterIndexedArray(self.d)) + def test_LazilyIndexedArray(self): + v = Variable(dims=("x", "y"), data=LazilyIndexedArray(self.d)) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping v = Variable( dims=("x", "y"), - data=LazilyOuterIndexedArray(LazilyOuterIndexedArray(self.d)), + data=LazilyIndexedArray(LazilyIndexedArray(self.d)), ) self.check_orthogonal_indexing(v) # hierarchical wrapping v = Variable( - dims=("x", "y"), data=LazilyOuterIndexedArray(NumpyIndexingAdapter(self.d)) + dims=("x", "y"), data=LazilyIndexedArray(NumpyIndexingAdapter(self.d)) ) self.check_orthogonal_indexing(v) @@ -2376,9 +2376,7 @@ def test_CopyOnWriteArray(self): self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v) # doubly wrapping - v = Variable( - dims=("x", "y"), data=CopyOnWriteArray(LazilyOuterIndexedArray(self.d)) - ) + v = Variable(dims=("x", "y"), data=CopyOnWriteArray(LazilyIndexedArray(self.d))) self.check_orthogonal_indexing(v) self.check_vectorized_indexing(v)
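
For reference, the `Dataset.query` / `DataArray.query` tests above all exercise the same usage pattern for the API added in this changeset. A minimal sketch, not taken from the patch itself; it assumes numpy-backed variables, with `engine` and `parser` left at the defaults that are forwarded to `pandas.eval`:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"a": ("x", np.arange(10)), "c": ("y", np.linspace(0, 1, 20))})

    # Each keyword names a dimension; the expression is evaluated against the
    # dataset's variables and the resulting boolean mask is passed to isel().
    subset = ds.query(x="a > 5")
    subset_2d = ds.query(x="a > 5", y="c < .5")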