variable.py
import functools
import itertools
from collections import OrderedDict, defaultdict
from datetime import timedelta
from distutils.version import LooseVersion
from typing import Any, Hashable, Mapping, Union
import numpy as np
import pandas as pd
import xarray as xr # only for Dataset and DataArray
from . import arithmetic, common, dtypes, duck_array_ops, indexing, nputils, ops, utils
from .indexing import (
BasicIndexer,
OuterIndexer,
PandasIndexAdapter,
VectorizedIndexer,
as_indexable,
)
from .npcompat import IS_NEP18_ACTIVE
from .options import _get_keep_attrs
from .pycompat import dask_array_type, integer_types
from .utils import (
OrderedSet,
decode_numpy_dict_values,
either_dict_or_kwargs,
ensure_us_time_resolution,
)
try:
import dask.array as da
except ImportError:
pass
NON_NUMPY_SUPPORTED_ARRAY_TYPES = (
indexing.ExplicitlyIndexed,
pd.Index,
) + dask_array_type
# https://github.com/python/mypy/issues/224
BASIC_INDEXING_TYPES = integer_types + (slice,) # type: ignore
class MissingDimensionsError(ValueError):
"""Error class used when we can't safely guess a dimension name.
"""
# inherits from ValueError for backward compatibility
# TODO: move this to an xarray.exceptions module?
def as_variable(obj, name=None) -> "Union[Variable, IndexVariable]":
"""Convert an object into a Variable.
Parameters
----------
obj : object
Object to convert into a Variable.
- If the object is already a Variable, return a shallow copy.
- Otherwise, if the object has 'dims' and 'data' attributes, convert
it into a new Variable.
- If all else fails, attempt to convert the object into a Variable by
unpacking it into the arguments for creating a new Variable.
name : str, optional
If provided:
- `obj` can be a 1D array, which is assumed to label coordinate values
along a dimension of this given name.
- Variables with name matching one of their dimensions are converted
into `IndexVariable` objects.
Returns
-------
var : Variable
The newly created variable.
"""
from .dataarray import DataArray
# TODO: consider extending this method to automatically handle Iris and
# pandas objects
if isinstance(obj, DataArray):
# extract the primary Variable from DataArrays
obj = obj.variable
if isinstance(obj, Variable):
obj = obj.copy(deep=False)
elif isinstance(obj, tuple):
try:
obj = Variable(*obj)
except (TypeError, ValueError) as error:
# use .format() instead of % because it handles tuples consistently
raise error.__class__(
"Could not convert tuple of form "
"(dims, data[, attrs, encoding]): "
"{} to Variable.".format(obj)
)
elif utils.is_scalar(obj):
obj = Variable([], obj)
elif isinstance(obj, (pd.Index, IndexVariable)) and obj.name is not None:
obj = Variable(obj.name, obj)
elif isinstance(obj, (set, dict)):
raise TypeError("variable %r has invalid type %r" % (name, type(obj)))
elif name is not None:
data = as_compatible_data(obj)
if data.ndim != 1:
raise MissingDimensionsError(
"cannot set variable %r with %r-dimensional data "
"without explicit dimension names. Pass a tuple of "
"(dims, data) instead." % (name, data.ndim)
)
obj = Variable(name, data, fastpath=True)
else:
raise TypeError(
"unable to convert object into a variable without an "
"explicit list of dimensions: %r" % obj
)
if name is not None and name in obj.dims:
# convert the Variable into an Index
if obj.ndim != 1:
raise MissingDimensionsError(
"%r has more than 1-dimension and the same name as one of its "
"dimensions %r. xarray disallows such variables because they "
"conflict with the coordinates used to label "
"dimensions." % (name, obj.dims)
)
obj = obj.to_index_variable()
return obj
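# A hedged usage sketch for ``as_variable`` (illustration only, not library
# code; reprs follow the format shown in the ``copy`` doctests below):
#
#   >>> as_variable(("x", [1, 2, 3]))  # (dims, data) tuple
#   <xarray.Variable (x: 3)>
#   array([1, 2, 3])
#   >>> as_variable([1, 2, 3], name="x")  # 1D data + name -> IndexVariable
#   <xarray.IndexVariable 'x' (x: 3)>
#   array([1, 2, 3])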
def _maybe_wrap_data(data):
"""
Put pandas.Index and numpy.ndarray arguments in adapter objects to ensure
they can be indexed properly.
NumpyIndexingAdapter, PandasIndexAdapter and LazilyOuterIndexedArray should
all pass through unmodified.
"""
if isinstance(data, pd.Index):
return PandasIndexAdapter(data)
return data
def _possibly_convert_objects(values):
"""Convert arrays of datetime.datetime and datetime.timedelta objects into
datetime64 and timedelta64, according to the pandas convention.
"""
return np.asarray(pd.Series(values.ravel())).reshape(values.shape)
def as_compatible_data(data, fastpath=False):
"""Prepare and wrap data to put in a Variable.
- If data does not have the necessary attributes, convert it to ndarray.
- If data has dtype=datetime64, ensure that it has ns precision. If it's a
pandas.Timestamp, convert it to datetime64.
- If data is already a pandas or xarray object (other than an Index), just
use the values.
Finally, wrap it up with an adapter if necessary.
"""
if fastpath and getattr(data, "ndim", 0) > 0:
# can't use fastpath (yet) for scalars
return _maybe_wrap_data(data)
if isinstance(data, Variable):
return data.data
if isinstance(data, NON_NUMPY_SUPPORTED_ARRAY_TYPES):
return _maybe_wrap_data(data)
if isinstance(data, tuple):
data = utils.to_0d_object_array(data)
if isinstance(data, pd.Timestamp):
# TODO: convert, handle datetime objects, too
data = np.datetime64(data.value, "ns")
if isinstance(data, timedelta):
data = np.timedelta64(getattr(data, "value", data), "ns")
# we don't want nested self-described arrays
data = getattr(data, "values", data)
if isinstance(data, np.ma.MaskedArray):
mask = np.ma.getmaskarray(data)
if mask.any():
dtype, fill_value = dtypes.maybe_promote(data.dtype)
data = np.asarray(data, dtype=dtype)
data[mask] = fill_value
else:
data = np.asarray(data)
if not isinstance(data, np.ndarray):
if hasattr(data, "__array_function__"):
if IS_NEP18_ACTIVE:
return data
else:
raise TypeError(
"Got an NumPy-like array type providing the "
"__array_function__ protocol but NEP18 is not enabled. "
"Check that numpy >= v1.16 and that the environment "
'variable "NUMPY_EXPERIMENTAL_ARRAY_FUNCTION" is set to '
'"1"'
)
# coerce the data to a numpy array, which validates that it has a
# supported dtype
data = np.asarray(data)
if isinstance(data, np.ndarray):
if data.dtype.kind == "O":
data = _possibly_convert_objects(data)
elif data.dtype.kind == "M":
data = np.asarray(data, "datetime64[ns]")
elif data.dtype.kind == "m":
data = np.asarray(data, "timedelta64[ns]")
return _maybe_wrap_data(data)
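# Hedged examples of the coercions performed by ``as_compatible_data``
# (illustration only; exact reprs may vary with the numpy version):
#
#   >>> as_compatible_data(pd.Timestamp("2000-01-01")).dtype
#   dtype('<M8[ns]')
#   >>> as_compatible_data(np.ma.MaskedArray([1, 2], mask=[False, True]))
#   array([ 1., nan])  # masked entries are promoted and filled (here NaN)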
def _as_array_or_item(data):
"""Return the given values as a numpy array, or as an individual item if
it's a 0d datetime64 or timedelta64 array.
Importantly, this function does not copy data if it is already an ndarray -
otherwise, it will not be possible to update Variable values in place.
This function mostly exists because 0-dimensional ndarrays with
dtype=datetime64 are broken :(
https://github.com/numpy/numpy/issues/4337
https://github.com/numpy/numpy/issues/7619
TODO: remove this (replace with np.asarray) once these issues are fixed
"""
data = np.asarray(data)
if data.ndim == 0:
if data.dtype.kind == "M":
data = np.datetime64(data, "ns")
elif data.dtype.kind == "m":
data = np.timedelta64(data, "ns")
return data
class Variable(
common.AbstractArray, arithmetic.SupportsArithmetic, utils.NdimSizeLenMixin
):
"""A netcdf-like variable consisting of dimensions, data and attributes
which describe a single Array. A single Variable object is not fully
described outside the context of its parent Dataset (if you want such a
fully described object, use a DataArray instead).
The main functional difference between Variables and numpy arrays is that
numerical operations on Variables implement array broadcasting by dimension
name. For example, adding a Variable with dimensions `('time',)` to
another Variable with dimensions `('space',)` results in a new Variable
with dimensions `('time', 'space')`. Furthermore, numpy reduce operations
like ``mean`` or ``sum`` are overwritten to take a "dimension" argument
instead of an "axis".
Variables are light-weight objects used as the building block for datasets.
They are more primitive objects, so operations with them provide marginally
higher performance than using DataArrays. However, manipulating data in the
form of a Dataset or DataArray should almost always be preferred, because
they can use more complete metadata in context of coordinate labels.
"""
__slots__ = ("_dims", "_data", "_attrs", "_encoding")
def __init__(self, dims, data, attrs=None, encoding=None, fastpath=False):
"""
Parameters
----------
dims : str or sequence of str
Name(s) of the data dimension(s). Must be either a string (only
for 1D data) or a sequence of strings with length equal to the
number of dimensions.
data : array_like
Data array which supports numpy-like data access.
attrs : dict_like or None, optional
Attributes to assign to the new variable. If None (default), an
empty attribute dictionary is initialized.
encoding : dict_like or None, optional
Dictionary specifying how to encode this array's data into a
serialized format like netCDF4. Currently used keys (for netCDF)
include '_FillValue', 'scale_factor', 'add_offset' and 'dtype'.
Well-behaved code to serialize a Variable should ignore
unrecognized encoding items.
"""
self._data = as_compatible_data(data, fastpath=fastpath)
self._dims = self._parse_dimensions(dims)
self._attrs = None
self._encoding = None
if attrs is not None:
self.attrs = attrs
if encoding is not None:
self.encoding = encoding
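# A hedged construction example (illustration only, mirroring the doctest
# style used in ``copy`` below):
#
#   >>> v = Variable(("x", "y"), np.arange(6).reshape(2, 3),
#   ...              attrs={"units": "m"})
#   >>> v.dims, v.shape, v.attrs["units"]
#   (('x', 'y'), (2, 3), 'm')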
@property
def dtype(self):
return self._data.dtype
@property
def shape(self):
return self._data.shape
@property
def nbytes(self):
return self.size * self.dtype.itemsize
@property
def _in_memory(self):
return isinstance(self._data, (np.ndarray, np.number, PandasIndexAdapter)) or (
isinstance(self._data, indexing.MemoryCachedArray)
and isinstance(self._data.array, indexing.NumpyIndexingAdapter)
)
@property
def data(self):
if hasattr(self._data, "__array_function__") or isinstance(
self._data, dask_array_type
):
return self._data
else:
return self.values
@data.setter
def data(self, data):
data = as_compatible_data(data)
if data.shape != self.shape:
raise ValueError("replacement data must match the Variable's shape")
self._data = data
def load(self, **kwargs):
"""Manually trigger loading of this variable's data from disk or a
remote source into memory and return this variable.
Normally, it should not be necessary to call this method in user code,
because all xarray functions should either work on deferred data or
load data automatically.
Parameters
----------
**kwargs : dict
Additional keyword arguments passed on to ``dask.array.compute``.
See Also
--------
dask.array.compute
"""
if isinstance(self._data, dask_array_type):
self._data = as_compatible_data(self._data.compute(**kwargs))
elif not hasattr(self._data, "__array_function__"):
self._data = np.asarray(self._data)
return self
def compute(self, **kwargs):
"""Manually trigger loading of this variable's data from disk or a
remote source into memory and return a new variable. The original is
left unaltered.
Normally, it should not be necessary to call this method in user code,
because all xarray functions should either work on deferred data or
load data automatically.
Parameters
----------
**kwargs : dict
Additional keyword arguments passed on to ``dask.array.compute``.
See Also
--------
dask.array.compute
"""
new = self.copy(deep=False)
return new.load(**kwargs)
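# Hedged sketch of the load/compute distinction (illustration only; assumes
# ``v`` is a dask-backed Variable):
#
#   >>> v2 = v.compute()  # v is left lazy; v2 holds the computed data
#   >>> v.load()          # loads in place and returns v itself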
def __dask_graph__(self):
if isinstance(self._data, dask_array_type):
return self._data.__dask_graph__()
else:
return None
def __dask_keys__(self):
return self._data.__dask_keys__()
def __dask_layers__(self):
return self._data.__dask_layers__()
@property
def __dask_optimize__(self):
return self._data.__dask_optimize__
@property
def __dask_scheduler__(self):
return self._data.__dask_scheduler__
def __dask_postcompute__(self):
array_func, array_args = self._data.__dask_postcompute__()
return (
self._dask_finalize,
(array_func, array_args, self._dims, self._attrs, self._encoding),
)
def __dask_postpersist__(self):
array_func, array_args = self._data.__dask_postpersist__()
return (
self._dask_finalize,
(array_func, array_args, self._dims, self._attrs, self._encoding),
)
@staticmethod
def _dask_finalize(results, array_func, array_args, dims, attrs, encoding):
if isinstance(results, dict): # persist case
name = array_args[0]
results = {k: v for k, v in results.items() if k[0] == name}
data = array_func(results, *array_args)
return Variable(dims, data, attrs=attrs, encoding=encoding)
@property
def values(self):
"""The variable's data as a numpy.ndarray"""
return _as_array_or_item(self._data)
@values.setter
def values(self, values):
self.data = values
def to_base_variable(self):
"""Return this variable as a base xarray.Variable"""
return Variable(
self.dims, self._data, self._attrs, encoding=self._encoding, fastpath=True
)
to_variable = utils.alias(to_base_variable, "to_variable")
def to_index_variable(self):
"""Return this variable as an xarray.IndexVariable"""
return IndexVariable(
self.dims, self._data, self._attrs, encoding=self._encoding, fastpath=True
)
to_coord = utils.alias(to_index_variable, "to_coord")
def to_index(self):
"""Convert this variable to a pandas.Index"""
return self.to_index_variable().to_index()
def to_dict(self, data=True):
"""Dictionary representation of variable."""
item = {"dims": self.dims, "attrs": decode_numpy_dict_values(self.attrs)}
if data:
item["data"] = ensure_us_time_resolution(self.values).tolist()
else:
item.update({"dtype": str(self.dtype), "shape": self.shape})
return item
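# Hedged examples of ``to_dict`` output (illustration only; the integer
# dtype name is platform dependent):
#
#   >>> Variable("x", [1, 2]).to_dict()
#   {'dims': ('x',), 'attrs': {}, 'data': [1, 2]}
#   >>> Variable("x", [1, 2]).to_dict(data=False)
#   {'dims': ('x',), 'attrs': {}, 'dtype': 'int64', 'shape': (2,)}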
@property
def dims(self):
"""Tuple of dimension names with which this variable is associated.
"""
return self._dims
@dims.setter
def dims(self, value):
self._dims = self._parse_dimensions(value)
def _parse_dimensions(self, dims):
if isinstance(dims, str):
dims = (dims,)
dims = tuple(dims)
if len(dims) != self.ndim:
raise ValueError(
"dimensions %s must have the same length as the "
"number of data dimensions, ndim=%s" % (dims, self.ndim)
)
return dims
def _item_key_to_tuple(self, key):
if utils.is_dict_like(key):
return tuple(key.get(dim, slice(None)) for dim in self.dims)
else:
return key
def _broadcast_indexes(self, key):
"""Prepare an indexing key for an indexing operation.
Parameters
-----------
key : int, slice, array, dict or tuple of integers, slices and arrays
Any valid input for indexing.
Returns
-------
dims: tuple
Dimensions of the resultant variable.
indexers: IndexingTuple subclass
Tuple of integer, array-like, or slices to use when indexing
self._data. The type of this argument indicates the type of
indexing to perform, either basic, outer or vectorized.
new_order : Optional[Sequence[int]]
Optional reordering to do on the result of indexing. If not None,
the first ``len(new_order)`` axes of the indexing result should be
moved to these positions.
"""
key = self._item_key_to_tuple(key) # key is a tuple
# key is a tuple of full size
key = indexing.expanded_indexer(key, self.ndim)
# Convert a scalar Variable to an integer
key = tuple(
k.data.item() if isinstance(k, Variable) and k.ndim == 0 else k for k in key
)
# Convert a 0d-array to an integer
key = tuple(
k.item() if isinstance(k, np.ndarray) and k.ndim == 0 else k for k in key
)
if all(isinstance(k, BASIC_INDEXING_TYPES) for k in key):
return self._broadcast_indexes_basic(key)
self._validate_indexers(key)
# Detect whether the key can be mapped as an outer indexer.
# If no element of the key is a labeled Variable, it can be mapped
# as an OuterIndexer.
if all(not isinstance(k, Variable) for k in key):
return self._broadcast_indexes_outer(key)
# If every element of the key is at most 1-dimensional and there are
# no duplicate dimensions, the key can be mapped as an OuterIndexer.
dims = []
for k, d in zip(key, self.dims):
if isinstance(k, Variable):
if len(k.dims) > 1:
return self._broadcast_indexes_vectorized(key)
dims.append(k.dims[0])
elif not isinstance(k, integer_types):
dims.append(d)
if len(set(dims)) == len(dims):
return self._broadcast_indexes_outer(key)
return self._broadcast_indexes_vectorized(key)
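# Illustration of the three dispatch paths above (a hedged sketch for
# ``v = Variable(("x", "y"), data)``):
#
#   v[0, 1:]                            # basic: ints and slices only
#   v[[0, 1], [2, 0]]                   # outer: unlabeled arrays, applied
#                                       # orthogonally per dimension
#   v[Variable("z", [0, 1]), Variable("z", [2, 0])]
#                                       # vectorized: indexers share dim "z",
#                                       # giving numpy-style pointwise lookup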
def _broadcast_indexes_basic(self, key):
dims = tuple(
dim for k, dim in zip(key, self.dims) if not isinstance(k, integer_types)
)
return dims, BasicIndexer(key), None
def _validate_indexers(self, key):
""" Make sanity checks """
for dim, k in zip(self.dims, key):
if isinstance(k, BASIC_INDEXING_TYPES):
pass
else:
if not isinstance(k, Variable):
k = np.asarray(k)
if k.ndim > 1:
raise IndexError(
"Unlabeled multi-dimensional array cannot be "
"used for indexing: {}".format(k)
)
if k.dtype.kind == "b":
if self.shape[self.get_axis_num(dim)] != len(k):
raise IndexError(
"Boolean array size {:d} is used to index array "
"with shape {:s}.".format(len(k), str(self.shape))
)
if k.ndim > 1:
raise IndexError(
"{}-dimensional boolean indexing is "
"not supported. ".format(k.ndim)
)
if getattr(k, "dims", (dim,)) != (dim,):
raise IndexError(
"Boolean indexer should be unlabeled or on the "
"same dimension to the indexed array. Indexer is "
"on {:s} but the target dimension is {:s}.".format(
str(k.dims), dim
)
)
def _broadcast_indexes_outer(self, key):
dims = tuple(
k.dims[0] if isinstance(k, Variable) else dim
for k, dim in zip(key, self.dims)
if not isinstance(k, integer_types)
)
new_key = []
for k in key:
if isinstance(k, Variable):
k = k.data
if not isinstance(k, BASIC_INDEXING_TYPES):
k = np.asarray(k)
if k.dtype.kind == "b":
(k,) = np.nonzero(k)
new_key.append(k)
return dims, OuterIndexer(tuple(new_key)), None
def _nonzero(self):
""" Equivalent numpy's nonzero but returns a tuple of Varibles. """
# TODO we should replace dask's native nonzero
# after https://github.com/dask/dask/issues/1076 is implemented.
nonzeros = np.nonzero(self.data)
return tuple(Variable((dim,), nz) for nz, dim in zip(nonzeros, self.dims))
def _broadcast_indexes_vectorized(self, key):
variables = []
out_dims_set = OrderedSet()
for dim, value in zip(self.dims, key):
if isinstance(value, slice):
out_dims_set.add(dim)
else:
variable = (
value
if isinstance(value, Variable)
else as_variable(value, name=dim)
)
if variable.dtype.kind == "b": # boolean indexing case
(variable,) = variable._nonzero()
variables.append(variable)
out_dims_set.update(variable.dims)
variable_dims = set()
for variable in variables:
variable_dims.update(variable.dims)
slices = []
for i, (dim, value) in enumerate(zip(self.dims, key)):
if isinstance(value, slice):
if dim in variable_dims:
# We only convert slice objects to variables if they share
# a dimension with at least one other variable. Otherwise,
# we can equivalently leave them as slices and transpose
# the result. This is significantly faster/more efficient
# for most array backends.
values = np.arange(*value.indices(self.sizes[dim]))
variables.insert(i - len(slices), Variable((dim,), values))
else:
slices.append((i, value))
try:
variables = _broadcast_compat_variables(*variables)
except ValueError:
raise IndexError("Dimensions of indexers mismatch: {}".format(key))
out_key = [variable.data for variable in variables]
out_dims = tuple(out_dims_set)
slice_positions = set()
for i, value in slices:
out_key.insert(i, value)
new_position = out_dims.index(self.dims[i])
slice_positions.add(new_position)
if slice_positions:
new_order = [i for i in range(len(out_dims)) if i not in slice_positions]
else:
new_order = None
return out_dims, VectorizedIndexer(tuple(out_key)), new_order
def __getitem__(self, key):
"""Return a new Array object whose contents are consistent with
getting the provided key from the underlying data.
NB. __getitem__ and __setitem__ implement xarray-style indexing,
where if keys are unlabeled arrays, we index the array orthogonally
with them. If keys are labeled arrays (such as Variables), they are
broadcasted with our usual scheme and then the array is indexed with
the broadcasted key, like numpy's fancy indexing.
If you really want to do indexing like `x[x > 0]`, manipulate the numpy
array `x.values` directly.
"""
dims, indexer, new_order = self._broadcast_indexes(key)
data = as_indexable(self._data)[indexer]
if new_order:
data = duck_array_ops.moveaxis(data, range(len(new_order)), new_order)
return self._finalize_indexing_result(dims, data)
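# A hedged example of the orthogonal indexing described above (illustration
# only):
#
#   >>> v = Variable(("x", "y"), np.arange(6).reshape(2, 3))
#   >>> v[[0, 1], [0, 2]].shape  # 2 rows x 2 columns, not pointwise
#   (2, 2)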
def _finalize_indexing_result(self, dims, data):
"""Used by IndexVariable to return IndexVariable objects when possible.
"""
return type(self)(dims, data, self._attrs, self._encoding, fastpath=True)
def _getitem_with_mask(self, key, fill_value=dtypes.NA):
"""Index this Variable with -1 remapped to fill_value."""
# TODO(shoyer): expose this method in public API somewhere (isel?) and
# use it for reindex.
# TODO(shoyer): add a sanity check that all other integers are
# non-negative
# TODO(shoyer): add an optimization, remapping -1 to an adjacent value
# that is actually indexed rather than mapping it to the last value
# along each axis.
if fill_value is dtypes.NA:
fill_value = dtypes.get_fill_value(self.dtype)
dims, indexer, new_order = self._broadcast_indexes(key)
if self.size:
if isinstance(self._data, dask_array_type):
# dask's indexing is faster this way; also vindex does not
# support negative indices yet:
# https://github.com/dask/dask/pull/2967
actual_indexer = indexing.posify_mask_indexer(indexer)
else:
actual_indexer = indexer
data = as_indexable(self._data)[actual_indexer]
mask = indexing.create_mask(indexer, self.shape, data)
data = duck_array_ops.where(mask, fill_value, data)
else:
# array cannot be indexed along dimensions of size 0, so just
# build the mask directly instead.
mask = indexing.create_mask(indexer, self.shape)
data = np.broadcast_to(fill_value, getattr(mask, "shape", ()))
if new_order:
data = duck_array_ops.moveaxis(data, range(len(new_order)), new_order)
return self._finalize_indexing_result(dims, data)
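# Hedged sketch of ``_getitem_with_mask``: -1 maps to the fill value rather
# than wrapping around as in numpy (illustration only; this is a private
# helper, so the exact behaviour may change):
#
#   >>> v = Variable("x", [10, 20, 30])
#   >>> v._getitem_with_mask([0, -1, 2])
#   <xarray.Variable (x: 3)>
#   array([10., nan, 30.])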
def __setitem__(self, key, value):
"""__setitem__ is overloaded to access the underlying numpy values with
orthogonal indexing.
See __getitem__ for more details.
"""
dims, index_tuple, new_order = self._broadcast_indexes(key)
if not isinstance(value, Variable):
value = as_compatible_data(value)
if value.ndim > len(dims):
raise ValueError(
"shape mismatch: value array of shape %s could not be "
"broadcast to indexing result with %s dimensions"
% (value.shape, len(dims))
)
if value.ndim == 0:
value = Variable((), value)
else:
value = Variable(dims[-value.ndim :], value)
# broadcast to become assignable
value = value.set_dims(dims).data
if new_order:
value = duck_array_ops.asarray(value)
value = value[(len(dims) - value.ndim) * (np.newaxis,) + (Ellipsis,)]
value = duck_array_ops.moveaxis(value, new_order, range(len(new_order)))
indexable = as_indexable(self._data)
indexable[index_tuple] = value
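# Hedged example of orthogonal assignment (illustration only):
#
#   >>> v = Variable(("x", "y"), np.zeros((2, 3)))
#   >>> v[[0, 1], [0, 2]] = 1  # fills the 2x2 cross product of positions
#   >>> int(v.values.sum())
#   4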
@property
def attrs(self) -> "OrderedDict[Any, Any]":
"""Dictionary of local attributes on this variable.
"""
if self._attrs is None:
self._attrs = OrderedDict()
return self._attrs
@attrs.setter
def attrs(self, value: Mapping[Hashable, Any]) -> None:
self._attrs = OrderedDict(value)
@property
def encoding(self):
"""Dictionary of encodings on this variable.
"""
if self._encoding is None:
self._encoding = {}
return self._encoding
@encoding.setter
def encoding(self, value):
try:
self._encoding = dict(value)
except ValueError:
raise ValueError("encoding must be castable to a dictionary")
def copy(self, deep=True, data=None):
"""Returns a copy of this object.
If `deep=True`, the data array is loaded into memory and copied onto
the new object. Dimensions, attributes and encodings are always copied.
Use `data` to create a new object with the same structure as
original but entirely new data.
Parameters
----------
deep : bool, optional
Whether the data array is loaded into memory and copied onto
the new object. Default is True.
data : array_like, optional
Data to use in the new object. Must have same shape as original.
When `data` is used, `deep` is ignored.
Returns
-------
object : Variable
New object with dimensions, attributes, encodings, and optionally
data copied from original.
Examples
--------
Shallow copy versus deep copy
>>> var = xr.Variable(data=[1, 2, 3], dims='x')
>>> var.copy()
<xarray.Variable (x: 3)>
array([1, 2, 3])
>>> var_0 = var.copy(deep=False)
>>> var_0[0] = 7
>>> var_0
<xarray.Variable (x: 3)>
array([7, 2, 3])
>>> var
<xarray.Variable (x: 3)>
array([7, 2, 3])
Changing the data using the ``data`` argument maintains the
structure of the original object, but with the new data. Original
object is unaffected.
>>> var.copy(data=[0.1, 0.2, 0.3])
<xarray.Variable (x: 3)>
array([ 0.1, 0.2, 0.3])
>>> var
<xarray.Variable (x: 3)>
array([7, 2, 3])
See Also
--------
pandas.DataFrame.copy
"""
if data is None:
data = self._data
if isinstance(data, indexing.MemoryCachedArray):
# don't share caching between copies
data = indexing.MemoryCachedArray(data.array)
if deep:
if hasattr(data, "__array_function__") or isinstance(
data, dask_array_type
):
data = data.copy()
elif not isinstance(data, PandasIndexAdapter):
# pandas.Index is immutable
data = np.array(data)
else:
data = as_compatible_data(data)
if self.shape != data.shape:
raise ValueError(
"Data shape {} must match shape of object {}".format(
data.shape, self.shape
)
)
# note:
# dims is already an immutable tuple
# attributes and encoding will be copied when the new Array is created
return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True)
def __copy__(self):
return self.copy(deep=False)
def __deepcopy__(self, memo=None):
# memo does nothing but is required for compatibility with
# copy.deepcopy
return self.copy(deep=True)
# mutable objects should not be hashable
# https://github.com/python/mypy/issues/4266
__hash__ = None # type: ignore
@property
def chunks(self):
"""Block dimensions for this array's data or None if it's not a dask
array.
"""
return getattr(self._data, "chunks", None)
_array_counter = itertools.count()
def chunk(self, chunks=None, name=None, lock=False):
"""Coerce this array's data into a dask arrays with the given chunks.
If this variable is a non-dask array, it will be converted to dask
array. If it's a dask array, it will be rechunked to the given chunk
sizes.
If chunks is not provided for one or more dimensions, chunk
sizes along those dimensions will not be updated; non-dask arrays will be
converted into dask arrays with a single block.
Parameters
----------
chunks : int, tuple or dict, optional
Chunk sizes along each dimension, e.g., ``5``, ``(5, 5)`` or
``{'x': 5, 'y': 5}``.
name : str, optional
Used to generate the name for this array in the internal dask
graph. Does not need to be unique.
lock : optional
Passed on to :py:func:`dask.array.from_array`, if the array is not
already a dask array.
Returns
-------
chunked : xarray.Variable
"""
import dask
import dask.array as da
if utils.is_dict_like(chunks):
chunks = {self.get_axis_num(dim): chunk for dim, chunk in chunks.items()}
if chunks is None:
chunks = self.chunks or self.shape
data = self._data
if isinstance(data, da.Array):
data = data.rechunk(chunks)
else:
if isinstance(data, indexing.ExplicitlyIndexed):
# Unambiguously handle array storage backends (like NetCDF4 and h5py)
# that can't handle general array indexing. For example, in netCDF4 you
# can do "outer" indexing along two dimensions independent, which works
# differently from how NumPy handles it.
# da.from_array works by using lazy indexing with a tuple of slices.
# Using OuterIndexer is a pragmatic choice: dask does not yet handle
# different indexing types in an explicit way:
# https://github.com/dask/dask/issues/2883
data = indexing.ImplicitToExplicitIndexingAdapter(
data, indexing.OuterIndexer
)
if LooseVersion(dask.__version__) < "2.0.0":
kwargs = {}
else:
# All of our lazily loaded backend array classes should use NumPy
# array operations.
kwargs = {"meta": np.ndarray}
else:
kwargs = {}
if utils.is_dict_like(chunks):
chunks = tuple(chunks.get(n, s) for n, s in enumerate(self.shape))
data = da.from_array(data, chunks, name=name, lock=lock, **kwargs)
return type(self)(self.dims, data, self._attrs, self._encoding, fastpath=True)
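# Hedged usage sketch for ``chunk`` (illustration only; requires dask):
#
#   >>> v = Variable(("x", "y"), np.zeros((4, 6)))
#   >>> v.chunk({"x": 2}).chunks  # "y" stays in a single block
#   ((2, 2), (6,))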
def isel(self, indexers=None, drop=False, **indexers_kwargs):
"""Return a new array indexed along the specified dimension(s).
Parameters
----------
**indexers : {dim: indexer, ...}
Keyword arguments with names matching dimensions and values given
by integers, slice objects or arrays.
Returns
-------
obj : Array object
A new Array with the selected data and dimensions. In general,
the new variable's data will be a view of this variable's data,
unless numpy fancy indexing was triggered by using an array
indexer, in which case the data will be a copy.
"""
indexers = either_dict_or_kwargs(indexers, indexers_kwargs, "isel")
invalid = [k for k in indexers if k not in self.dims]
if invalid:
raise ValueError("dimensions %r do not exist" % invalid)
key = [slice(None)] * self.ndim
for i, dim in enumerate(self.dims):
if dim in indexers:
key[i] = indexers[dim]
return self[tuple(key)]
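# Hedged example of positional selection with ``isel`` (illustration only):
#
#   >>> v = Variable(("x", "y"), np.arange(6).reshape(2, 3))
#   >>> v.isel(y=slice(0, 2)).shape
#   (2, 2)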
def squeeze(self, dim=None):
"""Return a new object with squeezed data.
Parameters
----------
dim : None or str or tuple of str, optional
Selects a subset of the length one dimensions. If a dimension is
selected with length greater than one, an error is raised. If
None, all length one dimensions are squeezed.
Returns
-------