Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
171e1be
initial changes to categoricalColumn decoder (#818)
micdavis May 16, 2023
bf7077b
Implemented decoding for numerical stats mixin and integer profiles (…
ksneab7 May 31, 2023
fe0e6f1
hot fixes for encode and decode of numeric stats mixin and intcol pro…
ksneab7 Jun 2, 2023
168acb2
Float column profiler encode decode (#854)
ksneab7 Jun 6, 2023
188dea0
Json decode date time column (#861)
tyfarnan Jun 8, 2023
65f74b7
Added decoding for encoding of ordered column profiles (#864)
ksneab7 Jun 12, 2023
0e75b08
Added ordered col test to ensure correct response to update when diff…
ksneab7 Jun 13, 2023
587fc78
added decode text_column_profiler functionality and tests (#870)
micdavis Jun 14, 2023
91b0e9b
Created encoder for the datalabelercolumn (#869)
ksneab7 Jun 14, 2023
0c8e02a
feat: add test and compiler serialization (#884)
JGSweets Jun 16, 2023
e79c4c0
[WIP] Adds tests validating serialization with Primitive type for com…
JGSweets Jun 16, 2023
a37a54a
Adds deserialization for compilers and validates tests for Primitive;…
JGSweets Jun 16, 2023
232e2ab
Add Serialization and Deserialization Tests for Stats Compiler, plus …
JGSweets Jun 20, 2023
2ad1de4
ready datalabeler for deserialization and improvement on serializatio…
ksneab7 Jun 20, 2023
c13c318
Deserialization of datalabeler (#891)
ksneab7 Jun 21, 2023
438feb5
Encode Options (#875)
micdavis Jun 21, 2023
1ad3d83
[WIP] ColumnDataLabelerCompiler: serialize / deserialize (#888)
taylorfturner Jun 21, 2023
16c8d64
Quick Test update (#893)
taylorfturner Jun 22, 2023
2ff5f18
Decode options (#894)
micdavis Jun 22, 2023
2173f8e
refactor: allow options to go through all (#902)
JGSweets Jun 23, 2023
f36e45d
StructuredColProfiler Encode / Decode (#901)
taylorfturner Jun 23, 2023
65af593
fix: bug and add tests for structuredcolprofiler (#904)
JGSweets Jun 26, 2023
cf7b237
Stuctured profiler encode decode (#903)
ksneab7 Jun 27, 2023
2159b78
[WIP] Added NoImplementationError for UnstructuredProfiler (#907)
micdavis Jun 27, 2023
34c2a2b
Added testing for values for test_json_decode_after_update (#915)
ksneab7 Jun 27, 2023
e8ba88c
Reuse passed labeler (#924)
JGSweets Jun 28, 2023
046fa7f
BaseProfiler save() for json (#923)
micdavis Jun 28, 2023
9856720
refactor: use seed for sample for consistency (#927)
JGSweets Jun 28, 2023
d81a9e8
WIP top level load (#925)
tyfarnan Jun 28, 2023
e654afe
quick hot fix for input validation on save() save_metho (#931)
micdavis Jun 28, 2023
f07faf8
BaseProfiler: `load_method` hotfix (#932)
micdavis Jun 29, 2023
454d429
fix: null_rep mat should calculate even if datetime (#933)
JGSweets Jun 29, 2023
1f573f8
Notebook Example save/load Profile (#930)
taylorfturner Jun 29, 2023
146c547
fix: order bug (#939)
JGSweets Jun 29, 2023
e8a3361
fix: typo on rebase
JGSweets Jun 29, 2023
54efd86
fix: typing and bugs from rebase
JGSweets Jun 29, 2023
6ccc203
fix: options tests due to merge and loading new options
JGSweets Jun 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion dataprofiler/labelers/base_data_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,9 @@ def load_from_library(cls, name: str) -> BaseDataLabeler:
:return: DataLabeler class
:rtype: BaseDataLabeler
"""
return cls(os.path.join(default_labeler_dir, name))
labeler = cls(os.path.join(default_labeler_dir, name))
labeler._default_model_loc = name
return labeler

@classmethod
def load_from_disk(cls, dirpath: str, load_options: dict = None) -> BaseDataLabeler:
Expand Down
5 changes: 4 additions & 1 deletion dataprofiler/labelers/data_labelers.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def __new__( # type: ignore
trainable: bool = False,
) -> BaseDataLabeler:
"""
Create structured and unstructred data labeler objects.
Create structured and unstructured data labeler objects.

:param dirpath: Path to load data labeler
:type dirpath: str
Expand Down Expand Up @@ -143,6 +143,9 @@ def load_from_library(cls, name: str, trainable: bool = False) -> BaseDataLabele
"""
if trainable:
return TrainableDataLabeler.load_from_library(name)
for _, labeler_class_obj in cls.labeler_classes.items():
if name in labeler_class_obj._default_model_loc:
return labeler_class_obj()
return BaseDataLabeler.load_from_library(name)

@classmethod
Expand Down
88 changes: 87 additions & 1 deletion dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,98 @@
"""Package for providing statistics and predictions for a given dataset."""
from . import json_decoder
from .base_column_profilers import BaseColumnProfiler
from .categorical_column_profile import CategoricalColumn
from .column_profile_compilers import (
BaseCompiler,
ColumnDataLabelerCompiler,
ColumnPrimitiveTypeProfileCompiler,
ColumnStatsProfileCompiler,
)
from .data_labeler_column_profile import DataLabelerColumn
from .datetime_column_profile import DateTimeColumn
from .float_column_profile import FloatColumn
from .int_column_profile import IntColumn
from .numerical_column_stats import NumericStatsMixin
from .order_column_profile import OrderColumn
from .profile_builder import Profiler, StructuredProfiler, UnstructuredProfiler
from .profile_builder import (
Profiler,
StructuredColProfiler,
StructuredProfiler,
UnstructuredProfiler,
)
from .profiler_options import (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated for dev options that weren't in profiler serial

BaseInspectorOptions,
BooleanOption,
CategoricalOptions,
CorrelationOptions,
DataLabelerOptions,
DateTimeOptions,
FloatOptions,
HistogramOption,
HyperLogLogOptions,
IntOptions,
ModeOption,
NumericalOptions,
OrderOptions,
PrecisionOptions,
ProfilerOptions,
RowStatisticsOptions,
StructuredOptions,
TextOptions,
TextProfilerOptions,
UniqueCountOptions,
UnstructuredOptions,
)
from .text_column_profile import TextColumn
from .unstructured_labeler_profile import UnstructuredLabelerProfile

# set here to avoid circular imports
# Registries consumed by ``json_decoder``: each maps a class name (the tag
# written during JSON encoding) back to the class used for decoding.
json_decoder._profiles = {
    profile_class.__name__: profile_class
    for profile_class in (
        CategoricalColumn,
        FloatColumn,
        IntColumn,
        DateTimeColumn,
        OrderColumn,
        DataLabelerColumn,
        TextColumn,
    )
}


json_decoder._compilers = {
    compiler_class.__name__: compiler_class
    for compiler_class in (
        ColumnDataLabelerCompiler,
        ColumnPrimitiveTypeProfileCompiler,
        ColumnStatsProfileCompiler,
    )
}

json_decoder._options = {
    option_class.__name__: option_class
    for option_class in (
        BooleanOption,
        HistogramOption,
        ModeOption,
        BaseInspectorOptions,
        NumericalOptions,
        IntOptions,
        PrecisionOptions,
        FloatOptions,
        TextOptions,
        DateTimeOptions,
        OrderOptions,
        CategoricalOptions,
        CorrelationOptions,
        UniqueCountOptions,
        HyperLogLogOptions,
        RowStatisticsOptions,
        DataLabelerOptions,
        TextProfilerOptions,
        StructuredOptions,
        UnstructuredOptions,
        ProfilerOptions,
    )
}


json_decoder._profilers = {StructuredProfiler.__name__: StructuredProfiler}

json_decoder._structured_col_profiler = {
    StructuredColProfiler.__name__: StructuredColProfiler,
}
45 changes: 41 additions & 4 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
import numpy as np
import pandas as pd

from dataprofiler.profilers.profiler_options import BaseInspectorOptions

from . import utils
from .profiler_options import BaseInspectorOptions, BaseOption

BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")

Expand All @@ -30,7 +29,7 @@ class BaseColumnProfiler(Generic[BaseColumnProfilerT], metaclass=abc.ABCMeta):
_SAMPLING_RATIO = 0.20
_MIN_SAMPLING_COUNT = 500

def __init__(self, name: str | None) -> None:
def __init__(self, name: str | None, options: BaseOption | None = None):
"""
Initialize base class properties for the subclass.

Expand Down Expand Up @@ -249,6 +248,44 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
raise NotImplementedError()

@classmethod
def load_from_dict(
    cls: type[BaseColumnProfilerT],
    data: dict[str, Any],
    options: dict | None = None,
) -> BaseColumnProfilerT:
    """
    Parse attributes from a JSON-decoded dictionary into a new profiler.

    The input ``data`` is treated as read-only: the original implementation
    popped ``"times"`` from the caller's dict and rebound function names
    inside the caller's nested ``__calculations`` dicts; both mutations are
    now performed on copies instead.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]
    :param options: options for loading column profiler params from dictionary
    :type options: Dict | None

    :return: Profiler with attributes populated.
    :rtype: BaseColumnProfiler
    """
    if options is None:
        options = {}

    class_options = options.get(cls.__name__)
    profile: BaseColumnProfilerT = cls(data["name"], class_options)

    # Shallow-copy so popping "times" does not mutate the caller's dict.
    data = dict(data)
    time_vals = data.pop("times")
    setattr(profile, "times", defaultdict(float, time_vals))

    for attr, value in data.items():
        if "__calculations" in attr:
            # Calculation maps were serialized as method *names*; rebind
            # each name to the underlying function on this class. Work on
            # a copy so the caller's nested dict is left untouched.
            value = dict(value)
            for metric, function in value.items():
                if not hasattr(profile, function):
                    raise AttributeError(
                        f"Object {type(profile)} has no attribute {function}."
                    )
                value[metric] = getattr(profile, function).__func__
        setattr(profile, attr, value)

    return profile


BaseColumnPrimitiveTypeProfilerT = TypeVar(
"BaseColumnPrimitiveTypeProfilerT", bound="BaseColumnPrimitiveTypeProfiler"
Expand Down Expand Up @@ -282,7 +319,7 @@ def _update_column_base_properties(self, profile: dict) -> None:
:type profile: base data profile dict
:return: None
"""
self.match_count += profile.pop("match_count")
self.match_count += int(profile.pop("match_count"))
BaseColumnProfiler._update_column_base_properties(self, profile)

def _add_helper(
Expand Down
130 changes: 79 additions & 51 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import datasketches
from pandas import DataFrame, Series

from . import BaseColumnProfiler, utils
from . import utils
from .base_column_profilers import BaseColumnProfiler
from .profiler_options import CategoricalOptions


Expand Down Expand Up @@ -188,6 +189,55 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:

return merged_profile

@property
def gini_impurity(self) -> float | None:
"""
Return Gini Impurity.

Gini Impurity is a way to calculate
likelihood of an incorrect classification of a new instance of
a random variable.

G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
We are traversing through categories and calculating with the column

:return: None or Gini Impurity probability
"""
if self.sample_size == 0:
return None
gini_sum: float = 0
for i in self._categories:
gini_sum += (self._categories[i] / self.sample_size) * (
1 - (self._categories[i] / self.sample_size)
)
return gini_sum

@property
def unalikeability(self) -> float | None:
"""
Return Unlikeability.

Unikeability checks for "how often observations differ from one another"
Reference: Perry, M. and Kader, G. Variation as Unalikeability.
Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
Cij = 1 if i!=j, 0 if i=j

:return: None or unlikeability probability
"""
if self.sample_size == 0:
return None
elif self.sample_size == 1:
return 0
unalike_sum: int = 0
for category in self._categories:
unalike_sum += (
self.sample_size - self._categories[category]
) * self._categories[category]
unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
return unalike

def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
"""
Find the differences for CategoricalColumns.
Expand Down Expand Up @@ -267,6 +317,22 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
return self.profile

@classmethod
def load_from_dict(cls, data: dict, options: dict | None = None) -> CategoricalColumn:
    """
    Parse attribute from json dictionary into self.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]
    :param options: options for loading column profiler params from dictionary
    :type options: dict | None

    :return: Profiler with attributes populated.
    :rtype: CategoricalColumn
    """
    value = data.pop("_categories")
    # BUG FIX: forward ``options`` to the base loader instead of silently
    # dropping it, so class-specific options reach the constructor.
    profile = super().load_from_dict(data, options)
    # Restore categories as a defaultdict so unseen keys count as 0.
    setattr(profile, "_categories", defaultdict(int, value))
    return profile

@property
def profile(self) -> dict:
"""
Expand Down Expand Up @@ -479,6 +545,17 @@ def _merge_categories_cms(
categories.pop(cat)
return cms3, categories, max_num_heavy_hitters

def _get_categories_full(self, df_series) -> dict:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added descript and renamed method

"""Get the unique counts (categories) of a series.

:param df_series: df series with nulls removed
:type df_series: pandas.core.series.Series
:return: dict of counts for each unique value
:rtype: dict
"""
category_count: dict = df_series.value_counts(dropna=False).to_dict()
return category_count

@BaseColumnProfiler._timeit(name="categories")
def _update_categories(
self,
Expand Down Expand Up @@ -524,7 +601,7 @@ def _update_categories(
self._cms_max_num_heavy_hitters,
)
else:
category_count = df_series.value_counts(dropna=False).to_dict()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed to use method

category_count = self._get_categories_full(df_series)
self._categories = utils.add_nested_dictionaries(
self._categories, category_count
)
Expand Down Expand Up @@ -570,52 +647,3 @@ def update(self, df_series: Series) -> CategoricalColumn:
self._update_helper(df_series, profile)

return self

@property
def gini_impurity(self) -> float | None:
"""
Return Gini Impurity.

Gini Impurity is a way to calculate
likelihood of an incorrect classification of a new instance of
a random variable.

G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
We are traversing through categories and calculating with the column

:return: None or Gini Impurity probability
"""
if self.sample_size == 0:
return None
gini_sum: float = 0
for i in self._categories:
gini_sum += (self._categories[i] / self.sample_size) * (
1 - (self._categories[i] / self.sample_size)
)
return gini_sum

@property
def unalikeability(self) -> float | None:
"""
Return Unlikeability.

Unikeability checks for "how often observations differ from one another"
Reference: Perry, M. and Kader, G. Variation as Unalikeability.
Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
Cij = 1 if i!=j, 0 if i=j

:return: None or unlikeability probability
"""
if self.sample_size == 0:
return None
elif self.sample_size == 1:
return 0
unalike_sum: int = 0
for category in self._categories:
unalike_sum += (
self.sample_size - self._categories[category]
) * self._categories[category]
unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
return unalike
Loading