Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,34 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
raise NotImplementedError()

@classmethod
def load_from_dict(cls, data) -> BaseColumnProfiler:
    """
    Parse attributes from a JSON dictionary into a new profiler instance.

    Does NOT mutate ``data``: the "times" entry is removed from a shallow
    copy, and resolved ``__calculations`` mappings are built as new dicts
    rather than rewritten in place.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]

    :return: Profiler with attributes populated.
    :rtype: BaseColumnProfiler

    :raises AttributeError: if a serialized calculation references a method
        that does not exist on the constructed profiler.
    """
    profile = cls(data["name"])

    # Shallow-copy so popping "times" does not mutate the caller's dict.
    data = dict(data)
    time_vals = data.pop("times")
    profile.times = defaultdict(float, time_vals)

    for attr, value in data.items():
        if "__calculations" in attr:
            # Serialized calculations map metric name -> method name.
            # Rebind each name to the bound method on this instance,
            # building a new dict so the input's nested dict is untouched.
            resolved = {}
            for metric, function in value.items():
                if not hasattr(profile, function):
                    raise AttributeError(
                        f"Object {type(profile)} has no attribute {function}."
                    )
                resolved[metric] = getattr(profile, function)
            value = resolved
        setattr(profile, attr, value)

    return profile


class BaseColumnPrimitiveTypeProfiler(
BaseColumnProfiler,
Expand Down
16 changes: 16 additions & 0 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,22 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
return self.profile

@classmethod
def load_from_dict(cls, data):
    """
    Parse attributes from a JSON dictionary into a new profiler instance.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]

    :return: Profiler with attributes populated.
    :rtype: CategoricalColumn
    """
    # Pull the category counts out before delegating, so the base loader
    # does not set them as a plain dict; restore them as a defaultdict.
    category_counts = data.pop("_categories")
    profile = super().load_from_dict(data)
    profile._categories = defaultdict(int, category_counts)
    return profile

@property
def profile(self) -> dict:
"""
Expand Down
35 changes: 11 additions & 24 deletions dataprofiler/profilers/json_decoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Contains methods to decode components of a Profiler."""

import json
from typing import Dict, Optional, Type

from .base_column_profilers import BaseColumnProfiler
from .categorical_column_profile import CategoricalColumn
from .float_column_profile import FloatColumn


def get_column_profiler_class(class_name: str) -> Type[BaseColumnProfiler]:
    """
    Use name of class to return default-constructed version of that class.

    :param class_name: name of the profiler class to look up
    :type class_name: str representing name of class
    :return: subclass of BaseColumnProfiler object
    :raises ValueError: if ``class_name`` does not name a known profiler
    """
    # Registry of deserializable column-profiler classes, keyed by name.
    registry: Dict[str, Type[BaseColumnProfiler]] = {
        CategoricalColumn.__name__: CategoricalColumn,
        FloatColumn.__name__: FloatColumn,
    }

    matched_class: Optional[Type[BaseColumnProfiler]] = registry.get(class_name)
    if matched_class is None:
        raise ValueError(f"Invalid profiler class {class_name} failed to load.")
    return matched_class


def load_column_profile(serialized_json: dict) -> BaseColumnProfiler:
    """
    Construct subclass of BaseColumnProfiler given a serialized JSON.

    :param serialized_json: JSON representation of a column profiler that was
        serialized using the custom encoder in profilers.json_encoder
    :type serialized_json: a dict that was created by deserializing
        a JSON representation using the custom encoder
    :return: subclass of BaseColumnProfiler that has been deserialized from
        JSON
    """
    # Resolve the concrete class from the "class" tag, then let that class
    # rebuild itself from the "data" payload.
    profiler_class: Type[BaseColumnProfiler] = get_column_profiler_class(
        serialized_json["class"]
    )
    return profiler_class.load_from_dict(serialized_json["data"])
26 changes: 19 additions & 7 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd

from dataprofiler.profilers import CategoricalColumn
from dataprofiler.profilers.json_decoder import decode_column_profiler
from dataprofiler.profilers.json_decoder import load_column_profile
from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profile_builder import StructuredColProfiler
from dataprofiler.profilers.profiler_options import CategoricalOptions
Expand Down Expand Up @@ -233,7 +233,6 @@ def test_mixed_categorical_col_integer_string(self):
self.assertCountEqual(categories, profile.categories)

def test_categorical_mapping(self):

df1 = pd.Series(
[
"abcd",
Expand Down Expand Up @@ -747,7 +746,7 @@ def test_json_encode_after_update(self):
)
profile = CategoricalColumn(df_categorical.name)

with patch("time.time", side_effect=lambda: 0.0):
with test_utils.mock_timeit():
profile.update(df_categorical)

serialized = json.dumps(profile, cls=ProfileEncoder)
Expand All @@ -759,7 +758,7 @@ def test_json_encode_after_update(self):
"col_index": np.nan,
"sample_size": 12,
"metadata": {},
"times": {"categories": 0.0},
"times": {"categories": 1.0},
"thread_safe": True,
"_categories": {"c": 5, "b": 4, "a": 3},
"_CategoricalColumn__calculations": {},
Expand All @@ -775,7 +774,7 @@ def test_json_decode(self):
expected_profile = CategoricalColumn(fake_profile_name)

serialized = json.dumps(expected_profile, cls=ProfileEncoder)
deserialized = decode_column_profiler(serialized)
deserialized = load_column_profile(json.loads(serialized))

test_utils.assert_profiles_equal(deserialized, expected_profile)

Expand All @@ -802,14 +801,27 @@ def test_json_decode_after_update(self):
)
expected_profile = CategoricalColumn(fake_profile_name)

with patch("time.time", side_effect=lambda: 0.0):
with test_utils.mock_timeit():
expected_profile.update(df_categorical)

serialized = json.dumps(expected_profile, cls=ProfileEncoder)
deserialized = decode_column_profiler(serialized)
deserialized = load_column_profile(json.loads(serialized))

test_utils.assert_profiles_equal(deserialized, expected_profile)

df_categorical = pd.Series(
[
"a", # add existing
"d", # add new
Comment on lines +814 to +815
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

love this comment for superb clarity and readability

]
)

# validating update after deserialization
deserialized.update(df_categorical)

assert deserialized.sample_size == 14
assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1}


class TestCategoricalSentence(unittest.TestCase):
def setUp(self):
Expand Down
51 changes: 23 additions & 28 deletions dataprofiler/tests/profilers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import dataprofiler as dp
from dataprofiler.profilers.base_column_profilers import BaseColumnProfiler
from dataprofiler.profilers.profile_builder import BaseProfiler
from dataprofiler.profilers.utils import find_diff_of_dicts


def set_seed(seed=None):
Expand Down Expand Up @@ -166,37 +167,31 @@ def increment_counter():
return mock.patch("time.time", side_effect=lambda: next(counter))


def assert_profiles_equal(actual, expected):
    """
    Check if two profile objects are equal.

    Profiles are instances of BaseProfiler or BaseColumnProfiler. Raises
    AssertionError if they are not equal.

    :param actual: profile to compare to expected
    :type actual: instance of BaseProfiler or BaseColumnProfiler
    :param expected: profile to compare to actual
    :type expected: instance of BaseProfiler or BaseColumnProfiler
    """
    actual_dict = actual.__dict__
    expected_dict = expected.__dict__

    assert len(actual_dict) == len(expected_dict), (
        f"number of attributes on actual ({len(actual_dict)}) "
        f"!= expected ({len(expected_dict)})"
    )

    # Compare attributes by NAME, not by dict insertion order: zipping
    # .values() would silently compare unrelated attributes whenever the
    # two profiles set their attributes in a different order.
    for attr, actual_value in actual_dict.items():
        assert attr in expected_dict, f"Profile attributes unmatched: {attr}"
        expected_value = expected_dict[attr]

        assert type(actual_value) == type(
            expected_value
        ), f"Profile value types unmatched: {actual_value} != {expected_value}"

        if isinstance(actual_value, (BaseProfiler, BaseColumnProfiler)):
            # Nested profiles are compared recursively.
            assert_profiles_equal(actual_value, expected_value)
        elif isinstance(actual_value, numbers.Number):
            # np.testing handles NaN == NaN and numpy scalar comparisons.
            np.testing.assert_equal(actual_value, expected_value)
        else:
            assert (
                actual_value == expected_value
            ), f"Profile values unmatched: {actual_value} != {expected_value}"