Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
from . import json_decoder
from .base_column_profilers import BaseColumnProfiler
from .categorical_column_profile import CategoricalColumn
from .column_profile_compilers import BaseCompiler, ColumnPrimitiveTypeProfileCompiler
from .column_profile_compilers import (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allow for deserialization

BaseCompiler,
ColumnPrimitiveTypeProfileCompiler,
ColumnStatsProfileCompiler,
)
from .data_labeler_column_profile import DataLabelerColumn
from .datetime_column_profile import DateTimeColumn
from .float_column_profile import FloatColumn
Expand All @@ -27,4 +31,5 @@

json_decoder._compilers = {
ColumnPrimitiveTypeProfileCompiler.__name__: ColumnPrimitiveTypeProfileCompiler,
ColumnStatsProfileCompiler.__name__: ColumnStatsProfileCompiler,
}
104 changes: 54 additions & 50 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,55 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
)
return merged_profile

@property
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just movement to have a similar structure across all profilers

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

props not at bottom of class.

def gini_impurity(self) -> float | None:
"""
Return Gini Impurity.

Gini Impurity is a way to calculate
likelihood of an incorrect classification of a new instance of
a random variable.

G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
We are traversing through categories and calculating with the column

:return: None or Gini Impurity probability
"""
if self.sample_size == 0:
return None
gini_sum: float = 0
for i in self._categories:
gini_sum += (self._categories[i] / self.sample_size) * (
1 - (self._categories[i] / self.sample_size)
)
return gini_sum

@property
def unalikeability(self) -> float | None:
"""
Return Unlikeability.

Unikeability checks for "how often observations differ from one another"
Reference: Perry, M. and Kader, G. Variation as Unalikeability.
Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
Cij = 1 if i!=j, 0 if i=j

:return: None or unlikeability probability
"""
if self.sample_size == 0:
return None
elif self.sample_size == 1:
return 0
unalike_sum: int = 0
for category in self._categories:
unalike_sum += (
self.sample_size - self._categories[category]
) * self._categories[category]
unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
return unalike

def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
"""
Find the differences for CategoricalColumns.
Expand Down Expand Up @@ -228,6 +277,10 @@ def is_match(self) -> bool:
is_match = True
return is_match

def _get_categories(self, df_series):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

slight refactor which moves the get to its own function. important to allow usage of __calculations and to accurately use the timeit functionality.

category_count = df_series.value_counts(dropna=False).to_dict()
return category_count

@BaseColumnProfiler._timeit(name="categories")
def _update_categories(
self,
Expand All @@ -250,7 +303,7 @@ def _update_categories(
:type df_series: pandas.DataFrame
:return: None
"""
category_count = df_series.value_counts(dropna=False).to_dict()
category_count = self._get_categories(df_series)
self._categories = utils.add_nested_dictionaries(
self._categories, category_count
)
Expand Down Expand Up @@ -292,52 +345,3 @@ def update(self, df_series: Series) -> CategoricalColumn:
self._update_helper(df_series, profile)

return self

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved above

@property
def gini_impurity(self) -> float | None:
    """
    Return Gini Impurity.

    Gini Impurity is a way to calculate
    likelihood of an incorrect classification of a new instance of
    a random variable.

    G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
    We are traversing through categories and calculating with the column

    :return: None or Gini Impurity probability
    """
    # No samples seen yet: impurity is undefined, not zero.
    if self.sample_size == 0:
        return None
    gini_sum: float = 0
    # Accumulate p * (1 - p) for the empirical probability p of each category.
    for i in self._categories:
        gini_sum += (self._categories[i] / self.sample_size) * (
            1 - (self._categories[i] / self.sample_size)
        )
    return gini_sum

@property
def unalikeability(self) -> float | None:
    """
    Return Unalikeability.

    Unalikeability checks for "how often observations differ from one another"
    Reference: Perry, M. and Kader, G. Variation as Unalikeability.
    Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

    U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
    Cij = 1 if i!=j, 0 if i=j

    :return: None or unalikeability probability
    """
    # No samples seen yet: unalikeability is undefined.
    if self.sample_size == 0:
        return None
    # A single observation cannot differ from itself.
    elif self.sample_size == 1:
        return 0
    unalike_sum: int = 0
    # Each category with `count` members forms count * (n - count)
    # ordered pairs of differing observations.
    for category in self._categories:
        unalike_sum += (
            self.sample_size - self._categories[category]
        ) * self._categories[category]
    unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
    return unalike
82 changes: 55 additions & 27 deletions dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
"""Index profile analysis for individual col within structured profiling."""
from __future__ import annotations

from typing import cast
from abc import abstractmethod
from typing import Protocol, TypeVar

import numpy as np
from pandas import DataFrame, Series

from . import utils
from .base_column_profilers import BaseColumnProfiler
from .profiler_options import OrderOptions


class Comparable(Protocol):
    """Protocol for ensuring comparable types, in this case both floats or strings."""

    # Structural typing: any class defining __lt__ against its own type
    # satisfies this protocol; no explicit inheritance is required.
    @abstractmethod
    def __lt__(self: CT, other: CT) -> bool:
        """Protocol for ensuring comparable values."""
        pass


# Type variable bound to Comparable so both operands of a comparison are
# required to share the same comparable type (e.g. float with float,
# str with str — never mixed).
CT = TypeVar("CT", bound=Comparable)


Comment on lines +15 to +26
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Allows for comparable types. Specifying float | str would error, since a float cannot be compared against a str.

class OrderColumn(BaseColumnProfiler):
"""
Index column profile subclass of BaseColumnProfiler.
Expand All @@ -33,28 +47,31 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None:
"OrderColumn parameter 'options' must be of type" " OrderOptions."
)
self.order: str | None = None
self._last_value: int | None = None
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int was incorrect here, it is either a float, or a str

self._first_value: int | None = None
self._last_value: float | str | None = None
self._first_value: float | str | None = None
self._piecewise: bool | None = False
self.__calculations: dict = {}
self._filter_properties_w_options(self.__calculations, options)
super().__init__(name)

@staticmethod
def _is_intersecting(
first_value1: int, last_value1: int, first_value2: int, last_value2: int
first_value1: CT,
last_value1: CT,
first_value2: CT,
last_value2: CT,
) -> bool:
"""
Check to see if the range of the datasets intersect.

:param first_value1: beginning value of dataset 1
:type first_value1: Integer
:type first_value1: Float | String
:param last_value1: last value of dataset 1
:type last_value1: Integer
:type last_value1: Float | String
:param first_value2: beginning value of dataset 2
:type first_value2: Integer
:type first_value2: Float | String
:param last_value2: last value of dataset 2
:type last_value2: Integer
:type last_value2: Float | String
:return: Whether or not there is an intersection
:rtype: Bool
"""
Expand All @@ -78,19 +95,22 @@ def _is_intersecting(

@staticmethod
def _is_enveloping(
first_value1: int, last_value1: int, first_value2: int, last_value2: int
first_value1: CT,
last_value1: CT,
first_value2: CT,
last_value2: CT,
) -> bool:
"""
Check to see if the range of the dataset 1 envelopes dataset 2.

:param first_value1: beginning value of dataset 1
:type first_value1: Integer
:type first_value1: Float | String
:param last_value1: last value of dataset 1
:type last_value1: Integer
:type last_value1: Float | String
:param first_value2: beginning value of dataset 2
:type first_value2: Integer
:type first_value2: Float | String
:param last_value2: last value of dataset 2
:type last_value2: Integer
:type last_value2: Float | String
:return: Whether or not there is an intersection
:rtype: Bool
"""
Expand All @@ -109,14 +129,14 @@ def _is_enveloping(
def _merge_order(
self,
order1: str,
first_value1: int,
last_value1: int,
first_value1: CT,
last_value1: CT,
piecewise1: bool,
order2: str,
first_value2: int,
last_value2: int,
first_value2: CT,
last_value2: CT,
piecewise2: bool,
) -> tuple[str, int, int, bool]:
) -> tuple[str, CT | None, CT | None, bool]:
"""
Add the order of two datasets together.

Expand All @@ -129,15 +149,15 @@ def _merge_order(
:param last_value2: last value of new dataset
:param piecewise2: new dataset is piecewise or not
:type order1: String
:type first_value1: Integer
:type last_value1: Integer
:type first_value1: Float | String
:type last_value1: Float | String
:type piecewise1: Boolean
:type order2: String
:type first_value2: Integer
:type last_value2: Integer
:type first_value2: Float | String
:type last_value2: Float | String
:type piecewise2: Boolean
:return: order, first_value, last_value, piecewise
:rtype: String, Int, Int, Boolean
:rtype: String, Float | String, Float | String, Boolean
"""
# Return either order if one is None
if not order1:
Expand All @@ -157,8 +177,8 @@ def _merge_order(

# Default initialization
order = "random"
first_value: int | None = None
last_value: int | None = None
first_value: CT | None = None
last_value: CT | None = None

if order1 == "random" or order2 == "random":
order = "random"
Expand Down Expand Up @@ -219,7 +239,7 @@ def _merge_order(
) or order == "random":
piecewise = False

return order, cast(int, first_value), cast(int, last_value), piecewise
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Casting to an int was not appropriate; this is fixed with the CT type variable.

return order, first_value, last_value, piecewise

def __add__(self, other: OrderColumn) -> OrderColumn:
"""
Expand Down Expand Up @@ -283,7 +303,15 @@ def load_from_dict(cls, data):
:rtype: CategoricalColumn
"""
# This is an ambiguous call to super classes.
return super().load_from_dict(data)
profile = super().load_from_dict(data)
try:
if profile.sample_size:
profile._first_value = np.float64(profile._first_value)
profile._last_value = np.float64(profile._last_value)
except ValueError:
profile._first_value = data["_first_value"]
profile._last_value = data["_last_value"]
return profile

@property
def profile(self) -> dict:
Expand Down
Loading