Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 37 additions & 5 deletions dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,38 @@ def report(self, remove_disabled_flag: bool = False) -> dict:

return profile

@classmethod
def load_from_dict(cls, data):
    """
    Parse attribute from json dictionary into self.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]

    :return: Profiler with attributes populated.
    :rtype: FloatColumn
    """
    # This is an ambiguous call to super classes.
    # If load_from_dict is part of both super classes there may be issues
    profile = super().load_from_dict(data)
    profile._reformat_numeric_stats_types_on_serialized_profiles()

    # Fix float specific typing: JSON deserialization produces plain Python
    # numbers, so restore np.float64 on every populated precision statistic.
    for stat in ("min", "max", "sum", "mean", "biased_var"):
        if profile._precision[stat] is not None:
            profile._precision[stat] = np.float64(profile._precision[stat])

    return profile

@property
def profile(self) -> dict:
"""
Expand Down Expand Up @@ -273,11 +305,11 @@ def _get_float_precision(
# Determine statistics precision
precision_sum = len_per_float.sum()
subset_precision = {
"min": len_per_float.min(),
"max": len_per_float.max(),
"biased_var": float(np.var(len_per_float)),
"sum": precision_sum,
"mean": precision_sum / sample_size,
"min": np.float64(len_per_float.min()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these needed?

"max": np.float64(len_per_float.max()),
"biased_var": np.var(len_per_float),
"sum": np.float64(precision_sum),
"mean": np.float64(precision_sum / sample_size),
"sample_size": sample_size,
}

Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def load_from_dict(cls, data):
# This is an ambiguous call to super classes.
# If load_from_dict is part of both super classes there may be issues
profile = super().load_from_dict(data)
profile._load_stats_helper()
profile._reformat_numeric_stats_types_on_serialized_profiles()
return profile

@property
Expand Down
21 changes: 7 additions & 14 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ def report(self, remove_disabled_flag: bool = False) -> dict:

return profile

def _load_stats_helper(self):
def _reformat_numeric_stats_types_on_serialized_profiles(self):
"""Assistance function in the deserialization of profiler objects.

This function is to be used to enforce correct typing for attributes
Expand Down Expand Up @@ -394,19 +394,12 @@ def convert_histogram_key_types_to_np(histogram_info: dict):
self.histogram_methods[key]
)

# Convert values to correct types
if self.min is not None and type(self.min) not in [np.float64, np.int64]:
self.min = (
np.float64(self.min) if type(self.min) is float else np.int64(self.min)
)
if self.max is not None and type(self.max) not in [np.float64, np.int64]:
self.max = (
np.float64(self.max) if type(self.max) is float else np.int64(self.max)
)
if type(self.sum) not in [np.float64, np.int64]:
self.sum = (
np.float64(self.sum) if type(self.sum) is float else np.int64(self.sum)
)
if self.min is not None:
self.min = np.float64(self.min)
if self.max is not None:
self.max = np.float64(self.max)
if self.sum is not None:
self.sum = np.float64(self.sum)
if self.num_zeros is not None:
self.num_zeros = np.int64(self.num_zeros)
if self.num_negatives is not None:
Expand Down
267 changes: 267 additions & 0 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@
import pandas as pd

from dataprofiler.profilers import FloatColumn
from dataprofiler.profilers.json_decoder import load_column_profile
from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profiler_options import FloatOptions

from . import utils as test_utils

test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))


Expand Down Expand Up @@ -1734,3 +1738,266 @@ def test_diff(self):
str(exc.exception),
"Unsupported operand type(s) for diff: 'FloatColumn' and" " 'str'",
)

def test_json_encode(self):
    """Serialize a fresh FloatColumn and compare against the expected JSON."""
    profiler = FloatColumn("0.0")

    # Copy of NumericalStatsMixin code to test serialization of dicts
    expected_histogram_bin_method_names = [
        "auto",
        "fd",
        "doane",
        "scott",
        "rice",
        "sturges",
        "sqrt",
    ]
    expected_min_histogram_bin = 1000
    expected_histogram_methods = {}
    for method in expected_histogram_bin_method_names:
        expected_histogram_methods[method] = {
            "total_loss": 0.0,
            "current_loss": 0.0,
            "suggested_bin_count": expected_min_histogram_bin,
            "histogram": {"bin_counts": None, "bin_edges": None},
        }

    serialized = json.dumps(profiler, cls=ProfileEncoder)
    expected = json.dumps(
        {
            "class": "FloatColumn",
            "data": {
                "min": None,
                "max": None,
                "_top_k_modes": 5,
                "sum": 0.0,
                "_biased_variance": np.nan,
                "_biased_skewness": np.nan,
                "_biased_kurtosis": np.nan,
                "_median_is_enabled": True,
                "_median_abs_dev_is_enabled": True,
                "max_histogram_bin": 100000,
                "min_histogram_bin": expected_min_histogram_bin,
                "histogram_bin_method_names": expected_histogram_bin_method_names,
                "histogram_selection": None,
                "user_set_histogram_bin": None,
                "bias_correction": True,
                "_mode_is_enabled": True,
                "num_zeros": 0,
                "num_negatives": 0,
                "_num_quantiles": 1000,
                "histogram_methods": expected_histogram_methods,
                "_stored_histogram": {
                    "total_loss": 0.0,
                    "current_loss": 0.0,
                    "suggested_bin_count": 1000,
                    "histogram": {"bin_counts": None, "bin_edges": None},
                },
                "_batch_history": [],
                "quantiles": None,
                "_NumericStatsMixin__calculations": {
                    "min": "_get_min",
                    "max": "_get_max",
                    "sum": "_get_sum",
                    "variance": "_get_variance",
                    "skewness": "_get_skewness",
                    "kurtosis": "_get_kurtosis",
                    "histogram_and_quantiles": "_get_histogram_and_quantiles",
                    "num_zeros": "_get_num_zeros",
                    "num_negatives": "_get_num_negatives",
                },
                "name": "0.0",
                "col_index": np.nan,
                "sample_size": 0,
                "metadata": dict(),
                "times": defaultdict(),
                "thread_safe": True,
                "match_count": 0,
                "_precision": {
                    "min": None,
                    "max": None,
                    "sum": None,
                    "mean": None,
                    "biased_var": None,
                    "sample_size": None,
                    "confidence_level": 0.999,
                },
                "_FloatColumn__z_value_precision": 3.291,
                "_FloatColumn__precision_sample_ratio": None,
                "_FloatColumn__calculations": {"precision": "_update_precision"},
            },
        }
    )
    self.assertEqual(serialized, expected)

@mock.patch("time.time", return_value=0.0)
def test_json_encode_after_update(self, time):
    """Serialize a FloatColumn after an update and compare expected JSON.

    ``time.time`` is patched so recorded timings are deterministic (0.0).
    """
    data = np.array([0.0, 5.0, 10.0])
    df = pd.Series(data).apply(str)

    float_options = FloatOptions()
    float_options.histogram_and_quantiles.bin_count_or_method = 5
    profiler = FloatColumn("0.0", float_options)

    mocked_quantiles = [0.25, 0.50, 0.75]
    with mock.patch.object(
        profiler, "_get_percentile", return_value=mocked_quantiles
    ):
        # Mock out complex _get_percentile function.
        # Only need to test valid serialization of np.ndarray.
        profiler.update(df)

    # Copy of NumericalStatsMixin code to test serialization of dicts
    expected_histogram_bin_method_names = ["custom"]
    expected_min_histogram_bin = 5
    expected_histogram_methods = {}
    for method in expected_histogram_bin_method_names:
        expected_histogram_methods[method] = {
            "total_loss": 0.0,
            "current_loss": 0.0,
            "suggested_bin_count": expected_min_histogram_bin,
            "histogram": {"bin_counts": None, "bin_edges": None},
        }
    serialized = json.dumps(profiler, cls=ProfileEncoder)

    expected = json.dumps(
        {
            "class": "FloatColumn",
            "data": {
                "min": 0.0,
                "max": 10.0,
                "_top_k_modes": 5,
                "sum": 15.0,
                "_biased_variance": 16.666666666666668,
                "_biased_skewness": 0.0,
                "_biased_kurtosis": -1.5,
                "_median_is_enabled": True,
                "_median_abs_dev_is_enabled": True,
                "max_histogram_bin": 100000,
                "min_histogram_bin": 1000,
                "histogram_bin_method_names": expected_histogram_bin_method_names,
                "histogram_selection": None,
                "user_set_histogram_bin": 5,
                "bias_correction": True,
                "_mode_is_enabled": True,
                "num_zeros": 1,
                "num_negatives": 0,
                "_num_quantiles": 1000,
                "histogram_methods": expected_histogram_methods,
                "_stored_histogram": {
                    "total_loss": 2.0,
                    "current_loss": 2.0,
                    "suggested_bin_count": 1000,
                    "histogram": {
                        "bin_counts": [1, 0, 1, 0, 1],
                        "bin_edges": [0.0, 2.0, 4.0, 6.0, 8.0, 10.0],
                    },
                },
                "_batch_history": [
                    {
                        "match_count": 3,
                        "sample_size": 3,
                        "min": 0.0,
                        "max": 10.0,
                        "sum": 15.0,
                        "biased_variance": 16.666666666666668,
                        "mean": 5.0,
                        "biased_skewness": 0.0,
                        "biased_kurtosis": -1.5,
                        "num_zeros": 1,
                        "num_negatives": 0,
                    }
                ],
                "quantiles": [0.25, 0.5, 0.75],
                "_NumericStatsMixin__calculations": {
                    "min": "_get_min",
                    "max": "_get_max",
                    "sum": "_get_sum",
                    "variance": "_get_variance",
                    "skewness": "_get_skewness",
                    "kurtosis": "_get_kurtosis",
                    "histogram_and_quantiles": "_get_histogram_and_quantiles",
                    "num_zeros": "_get_num_zeros",
                    "num_negatives": "_get_num_negatives",
                },
                "name": "0.0",
                "col_index": np.nan,
                "sample_size": 3,
                "metadata": dict(),
                "times": {
                    "precision": 0.0,
                    "min": 0.0,
                    "max": 0.0,
                    "sum": 0.0,
                    "variance": 0.0,
                    "skewness": 0.0,
                    "kurtosis": 0.0,
                    "histogram_and_quantiles": 0.0,
                    "num_zeros": 0.0,
                    "num_negatives": 0.0,
                },
                "thread_safe": True,
                "match_count": 3,
                "_precision": {
                    "min": 0.0,
                    "max": 2.0,
                    "sum": 3.0,
                    "mean": 1.0,
                    "biased_var": 0.6666666666666666,
                    "sample_size": 3,
                    "confidence_level": 0.999,
                },
                "_FloatColumn__z_value_precision": 3.291,
                "_FloatColumn__precision_sample_ratio": None,
                "_FloatColumn__calculations": {"precision": "_update_precision"},
            },
        }
    )

    self.assertEqual(serialized, expected)

def test_json_decode(self):
    """Round-trip an un-updated profile through JSON encode/decode."""
    expected_profile = FloatColumn(None)

    encoded = json.dumps(expected_profile, cls=ProfileEncoder)
    decoded_profile = load_column_profile(json.loads(encoded))

    test_utils.assert_profiles_equal(decoded_profile, expected_profile)

def test_json_decode_after_update(self):
    """Round-trip an updated profile through JSON and verify it still updates.

    Checks (1) the deserialized profile equals the original and (2) a
    subsequent ``update`` on the deserialized profile accumulates correctly.
    """
    fake_profile_name = "Fake profile name"

    # Build expected FloatColumn
    df_float = pd.Series([1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 7.0, 8.0, 9.0]).apply(
        str
    )
    expected_profile = FloatColumn(fake_profile_name)

    with test_utils.mock_timeit():
        expected_profile.update(df_float)

    # Actual deserialization
    serialized = json.dumps(expected_profile, cls=ProfileEncoder)
    deserialized = load_column_profile(json.loads(serialized))

    test_utils.assert_profiles_equal(deserialized, expected_profile)

    df_float = pd.Series(
        [
            4.0,  # add existing
            15.0,  # add new
        ]
    ).apply(str)

    # validating update after deserialization
    deserialized.update(df_float)

    # unittest-style assertions for consistency with the rest of this file;
    # the mean is a computed float, so compare approximately.
    self.assertEqual(12, deserialized.sample_size)
    self.assertAlmostEqual(
        sum([1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 7.0, 8.0, 9.0, 4, 15]) / 12,
        deserialized.mean,
    )
    self.assertEqual(15, deserialized.max)
2 changes: 1 addition & 1 deletion dataprofiler/tests/profilers/test_int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,7 +1308,7 @@ def test_json_decode_after_update(self):
fake_profile_name = "Fake profile name"
# Actual deserialization

# Build expected CategoricalColumn
# Build expected IntColumn
df_int = pd.Series([1, 2, 5, 7, 4, 3, 2, 7, 8, 9])
expected_profile = IntColumn(fake_profile_name)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def test_from_dict_helper(self):
"bin_counts": None,
"bin_edges": None,
}
actual_profile._load_stats_helper()
actual_profile._reformat_numeric_stats_types_on_serialized_profiles()

test_utils.assert_profiles_equal(expected_profile, actual_profile)

Expand Down
Loading