66import pandas as pd
77
88from deepnote_toolkit .ocelots .constants import DEEPNOTE_INDEX_COLUMN
9+ from deepnote_toolkit .ocelots .pandas .utils import (
10+ is_numeric_or_temporal ,
11+ is_type_datetime_or_timedelta ,
12+ safe_convert_to_string ,
13+ )
914from deepnote_toolkit .ocelots .types import ColumnsStatsRecord , ColumnStats
1015
1116
@@ -24,7 +29,10 @@ def _get_categories(np_array):
2429 # special treatment for empty values
2530 num_nans = pandas_series .isna ().sum ().item ()
2631
27- counter = Counter (pandas_series .dropna ().astype (str ))
32+ try :
33+ counter = Counter (pandas_series .dropna ().astype (str ))
34+ except (TypeError , UnicodeDecodeError , AttributeError ):
35+ counter = Counter (pandas_series .dropna ().apply (safe_convert_to_string ))
2836
2937 max_items = 3
3038 if num_nans > 0 :
@@ -46,33 +54,9 @@ def _get_categories(np_array):
4654 return [{"name" : name , "count" : count } for name , count in categories ]
4755
4856
49- def _is_type_numeric (dtype ):
50- """
51- Returns True if dtype is numeric, False otherwise
52-
53- Numeric means either a number (int, float, complex) or a datetime or timedelta.
54- It means e.g. that a range of these values can be plotted on a histogram.
55- """
56-
57- # datetime doesn't play nice with np.issubdtype, so we need to check explicitly
58- if pd .api .types .is_datetime64_any_dtype (dtype ) or pd .api .types .is_timedelta64_dtype (
59- dtype
60- ):
61- return True
62-
63- try :
64- return np .issubdtype (dtype , np .number )
65- except TypeError :
66- # np.issubdtype crashes on categorical column dtype, and also on others, e.g. geopandas types
67- return False
68-
69-
7057def _get_histogram (pd_series ):
7158 try :
72- if pd .api .types .is_datetime64_any_dtype (
73- pd_series
74- ) or pd .api .types .is_timedelta64_dtype (pd_series ):
75- # convert datetime or timedelta to an integer so that a histogram can be created
59+ if is_type_datetime_or_timedelta (pd_series ):
7660 np_array = np .array (pd_series .dropna ().astype (int ))
7761 else :
7862 # let's drop infinite values because they break histograms
def _calculate_min_max(column):
    """
    Calculate min and max values for a given column.

    Returns a ``(min_value, max_value)`` tuple. Each item is the ``str()`` of
    the smallest / largest non-missing value in ``column``. Both items are
    ``None`` when ``is_numeric_or_temporal`` rejects the column dtype, when no
    values remain after dropping missing entries, or when computing the
    extremes raises ``TypeError`` / ``ValueError``.
    """
    if not is_numeric_or_temporal(column.dtype):
        return None, None

    try:
        # Drop missing values once and reuse the result for the emptiness
        # check and for both extremes, instead of re-running dropna() each time.
        present_values = column.dropna()
        if len(present_values) == 0:
            return None, None
        return str(min(present_values)), str(max(present_values))
    except (TypeError, ValueError):
        # Best effort: a failure to compute the extremes must not break the
        # caller's column analysis, so report "no min/max" instead.
        return None, None
112100
113101
114102def analyze_columns (
@@ -167,7 +155,7 @@ def analyze_columns(
167155 unique_count = _count_unique (column ), nan_count = column .isnull ().sum ().item ()
168156 )
169157
170- if _is_type_numeric (column .dtype ):
158+ if is_numeric_or_temporal (column .dtype ):
171159 min_value , max_value = _calculate_min_max (column )
172160 columns [i ].stats .min = min_value
173161 columns [i ].stats .max = max_value
@@ -187,7 +175,7 @@ def analyze_columns(
187175 for i in range (max_columns_to_analyze , len (df .columns )):
188176 # Ignore columns that are not numeric
189177 column = df .iloc [:, i ]
190- if not _is_type_numeric (column .dtype ):
178+ if not is_numeric_or_temporal (column .dtype ):
191179 continue
192180
193181 column_name = columns [i ].name
0 commit comments