|
| 1 | +# ruff: noqa: SLF001 |
| 2 | + |
| 3 | +from __future__ import annotations |
| 4 | + |
| 5 | +from typing import TYPE_CHECKING, Any |
| 6 | + |
| 7 | +import numpy as np |
| 8 | +from arro3.core import Array, ChunkedArray, DataType |
| 9 | + |
| 10 | +from lonboard._h3._str_to_h3 import str_to_h3 |
| 11 | +from lonboard._serialization import ACCESSOR_SERIALIZATION |
| 12 | +from lonboard.traits._base import FixedErrorTraitType |
| 13 | + |
| 14 | +if TYPE_CHECKING: |
| 15 | + import pandas as pd |
| 16 | + from numpy.typing import NDArray |
| 17 | + from traitlets.traitlets import TraitType |
| 18 | + |
| 19 | + from lonboard.layer import BaseArrowLayer |
| 20 | + |
| 21 | + |
class A5Accessor(FixedErrorTraitType):
    """A trait to validate A5 cell input.

    Various input is allowed:

    - A numpy `ndarray` with an object, S16, or uint64 data type.
    - A pandas `Series` with an object or integer data type (integers are cast
      to uint64).
    - A pyarrow string, large string, string view array, or uint64 array, or a
      chunked array of those types.
    - Any Arrow string, large string, string view array, or uint64 array, or a
      chunked array of those types from a library that implements the
      [Arrow PyCapsule
      Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).

    Whatever the input, `validate` returns a uint64 `ChunkedArray` rechunked
    to the owning layer's `_rows_per_chunk`.
    """

    default_value = None
    # Describes the accepted input for this trait; the previous text described
    # float input and was misleading for A5 cells.
    info_text = (
        "a numpy ndarray, pandas Series, or Arrow array of A5 cells "
        "as uint64 values or 16-character strings"
    )

    def __init__(
        self: TraitType,
        *args: Any,
        **kwargs: Any,
    ) -> None:
        """Initialize the trait and tag it as synced with accessor serialization."""
        super().__init__(*args, **kwargs)
        self.tag(sync=True, **ACCESSOR_SERIALIZATION)

    def _pandas_to_numpy(
        self,
        obj: BaseArrowLayer,
        value: pd.Series,
    ) -> NDArray[np.bytes_] | NDArray[np.uint64]:
        """Cast pandas Series to numpy ndarray.

        Integer-typed Series are cast to a uint64 array. Object-typed Series
        must hold values that are all 16 characters long and are cast to an
        ``S16`` (fixed-width bytes) array. Anything else is reported through
        `self.error`.
        """
        # `isinstance(..., np.dtype)` filters out pandas extension dtypes, which
        # `np.issubdtype` cannot interpret.
        if isinstance(value.dtype, np.dtype) and np.issubdtype(value.dtype, np.integer):
            return np.asarray(value, dtype=np.uint64)

        if not isinstance(value.dtype, np.dtype) or not np.issubdtype(
            value.dtype,
            np.object_,
        ):
            # NOTE(review): control flow below relies on `self.error` raising.
            self.error(
                obj,
                value,
                info="A5 Pandas series not object or uint64 dtype.",
            )

        if not (value.str.len() == 16).all():
            self.error(
                obj,
                value,
                info="A5 Pandas series not all 16 characters long.",
            )

        return np.asarray(value, dtype="S16")

    def _numpy_to_arrow(self, obj: BaseArrowLayer, value: np.ndarray) -> ChunkedArray:
        """Convert a numpy array of A5 cells to a single-chunk `ChunkedArray`.

        uint64 arrays are wrapped as-is. Object arrays are checked to contain
        only length-16 items and cast to ``S16``; bytes arrays are then passed
        through `str_to_h3` to obtain the numeric cell array.
        """
        if np.issubdtype(value.dtype, np.uint64):
            return ChunkedArray([value])

        if np.issubdtype(value.dtype, np.object_):
            # An empty object array yields an empty set here and is therefore
            # rejected as well.
            # NOTE(review): the pandas path accepts an empty Series; confirm
            # whether the two paths should agree.
            if {len(v) for v in value} != {16}:
                self.error(
                    obj,
                    value,
                    info="numpy object array not all 16 characters long",
                )

            value = np.asarray(value, dtype="S16")

        # NOTE(review): `np.issubdtype` compares scalar type hierarchies, so this
        # likely accepts bytes arrays of any width, not only S16 - confirm.
        if not np.issubdtype(value.dtype, np.dtype("S16")):
            self.error(obj, value, info="numpy array not object, str, or uint64 dtype")

        # The H3 string parser is reused for A5 strings; `validate` later
        # requires the resulting Arrow type to be uint64.
        a5_cells = str_to_h3(value)
        return ChunkedArray([a5_cells])

    def validate(self, obj: BaseArrowLayer, value: Any) -> ChunkedArray:
        """Validate A5 cell input and normalize it to a uint64 `ChunkedArray`.

        Accepts a pandas Series, a numpy array, or any object exposing
        `__arrow_c_array__` or `__arrow_c_stream__`. String-typed Arrow input
        is converted through numpy. The result is rechunked to
        `obj._rows_per_chunk`. Unsupported input is reported through
        `self.error`.
        """
        # pandas Series (detected by class name to avoid importing pandas)
        if (
            value.__class__.__module__.startswith("pandas")
            and value.__class__.__name__ == "Series"
        ):
            value = self._pandas_to_numpy(obj, value)

        if isinstance(value, np.ndarray):
            value = self._numpy_to_arrow(obj, value)
        elif hasattr(value, "__arrow_c_array__"):
            value = ChunkedArray([Array.from_arrow(value)])
        elif hasattr(value, "__arrow_c_stream__"):
            value = ChunkedArray.from_arrow(value)
        else:
            self.error(obj, value)

        # Type narrowing only; every branch above produces a ChunkedArray.
        assert isinstance(value, ChunkedArray)

        if (
            DataType.is_string(value.type)
            or DataType.is_large_string(value.type)
            or DataType.is_string_view(value.type)
        ):
            # Arrow string input takes the same conversion path as numpy
            # string input.
            value = self._numpy_to_arrow(obj, value.to_numpy())

        if not DataType.is_uint64(value.type):
            self.error(
                obj,
                value,
                info="A5 Arrow array must be uint64 type.",
            )

        return value.rechunk(max_chunksize=obj._rows_per_chunk)
0 commit comments