Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 37 additions & 5 deletions dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,38 @@ def report(self, remove_disabled_flag: bool = False) -> dict:

return profile

@classmethod
def load_from_dict(cls, data):
    """
    Parse attribute from json dictionary into self.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]

    :return: Profiler with attributes populated.
    :rtype: FloatColumn
    """
    # This is an ambiguous call to super classes.
    # If load_from_dict is part of both super classes there may be issues
    profile = super().load_from_dict(data)
    profile._reformat_numeric_stats_types_on_serialized_profiles()

    # Fix float specific typing: JSON deserialization produces plain Python
    # numbers, so restore np.float64 on every populated precision statistic.
    for stat in ("min", "max", "sum", "mean", "biased_var"):
        if profile._precision[stat] is not None:
            profile._precision[stat] = np.float64(profile._precision[stat])

    return profile

@property
def profile(self) -> dict:
"""
Expand Down Expand Up @@ -273,11 +305,11 @@ def _get_float_precision(
# Determine statistics precision
precision_sum = len_per_float.sum()
subset_precision = {
"min": len_per_float.min(),
"max": len_per_float.max(),
"biased_var": float(np.var(len_per_float)),
"sum": precision_sum,
"mean": precision_sum / sample_size,
"min": np.float64(len_per_float.min()),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

are these needed?

"max": np.float64(len_per_float.max()),
"biased_var": np.var(len_per_float),
"sum": np.float64(precision_sum),
"mean": np.float64(precision_sum / sample_size),
"sample_size": sample_size,
}

Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def load_from_dict(cls, data):
# This is an ambiguous call to super classes.
# If load_from_dict is part of both super classes there may be issues
profile = super().load_from_dict(data)
profile._load_stats_helper()
profile._reformat_numeric_stats_types_on_serialized_profiles()
return profile

@property
Expand Down
21 changes: 7 additions & 14 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,7 @@ def report(self, remove_disabled_flag: bool = False) -> dict:

return profile

def _load_stats_helper(self):
def _reformat_numeric_stats_types_on_serialized_profiles(self):
"""Assistance function in the deserialization of profiler objects.

This function is to be used to enforce correct typing for attributes
Expand Down Expand Up @@ -394,19 +394,12 @@ def convert_histogram_key_types_to_np(histogram_info: dict):
self.histogram_methods[key]
)

# Convert values to correct types
if self.min is not None and type(self.min) not in [np.float64, np.int64]:
self.min = (
np.float64(self.min) if type(self.min) is float else np.int64(self.min)
)
if self.max is not None and type(self.max) not in [np.float64, np.int64]:
self.max = (
np.float64(self.max) if type(self.max) is float else np.int64(self.max)
)
if type(self.sum) not in [np.float64, np.int64]:
self.sum = (
np.float64(self.sum) if type(self.sum) is float else np.int64(self.sum)
)
if self.min is not None:
self.min = np.float64(self.min)
if self.max is not None:
self.max = np.float64(self.max)
if self.sum is not None:
self.sum = np.float64(self.sum)
if self.num_zeros is not None:
self.num_zeros = np.int64(self.num_zeros)
if self.num_negatives is not None:
Expand Down
267 changes: 267 additions & 0 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@
import pandas as pd

from dataprofiler.profilers import FloatColumn
from dataprofiler.profilers.json_decoder import load_column_profile
from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profiler_options import FloatOptions

from . import utils as test_utils

test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))


Expand Down Expand Up @@ -1734,3 +1738,266 @@ def test_diff(self):
str(exc.exception),
"Unsupported operand type(s) for diff: 'FloatColumn' and" " 'str'",
)

def test_json_encode(self):
    """Serialize a fresh FloatColumn and compare against the expected JSON."""
    profiler = FloatColumn("0.0")

    # Copy of NumericalStatsMixin code to test serialization of dicts
    expected_histogram_bin_method_names = [
        "auto",
        "fd",
        "doane",
        "scott",
        "rice",
        "sturges",
        "sqrt",
    ]
    expected_min_histogram_bin = 1000
    expected_histogram_methods = {}
    for method in expected_histogram_bin_method_names:
        expected_histogram_methods[method] = {
            "total_loss": 0.0,
            "current_loss": 0.0,
            "suggested_bin_count": expected_min_histogram_bin,
            "histogram": {"bin_counts": None, "bin_edges": None},
        }

    serialized = json.dumps(profiler, cls=ProfileEncoder)
    expected = json.dumps(
        {
            "class": "FloatColumn",
            "data": {
                "min": None,
                "max": None,
                "_top_k_modes": 5,
                "sum": 0.0,
                "_biased_variance": np.nan,
                "_biased_skewness": np.nan,
                "_biased_kurtosis": np.nan,
                "_median_is_enabled": True,
                "_median_abs_dev_is_enabled": True,
                "max_histogram_bin": 100000,
                "min_histogram_bin": expected_min_histogram_bin,
                "histogram_bin_method_names": expected_histogram_bin_method_names,
                "histogram_selection": None,
                "user_set_histogram_bin": None,
                "bias_correction": True,
                "_mode_is_enabled": True,
                "num_zeros": 0,
                "num_negatives": 0,
                "_num_quantiles": 1000,
                "histogram_methods": expected_histogram_methods,
                "_stored_histogram": {
                    "total_loss": 0.0,
                    "current_loss": 0.0,
                    "suggested_bin_count": 1000,
                    "histogram": {"bin_counts": None, "bin_edges": None},
                },
                "_batch_history": [],
                "quantiles": None,
                "_NumericStatsMixin__calculations": {
                    "min": "_get_min",
                    "max": "_get_max",
                    "sum": "_get_sum",
                    "variance": "_get_variance",
                    "skewness": "_get_skewness",
                    "kurtosis": "_get_kurtosis",
                    "histogram_and_quantiles": "_get_histogram_and_quantiles",
                    "num_zeros": "_get_num_zeros",
                    "num_negatives": "_get_num_negatives",
                },
                "name": "0.0",
                "col_index": np.nan,
                "sample_size": 0,
                "metadata": dict(),
                "times": defaultdict(),
                "thread_safe": True,
                "match_count": 0,
                "_precision": {
                    "min": None,
                    "max": None,
                    "sum": None,
                    "mean": None,
                    "biased_var": None,
                    "sample_size": None,
                    "confidence_level": 0.999,
                },
                "_FloatColumn__z_value_precision": 3.291,
                "_FloatColumn__precision_sample_ratio": None,
                "_FloatColumn__calculations": {"precision": "_update_precision"},
            },
        }
    )
    self.assertEqual(serialized, expected)

@mock.patch("time.time", return_value=0.0)
def test_json_encode_after_update(self, time):
    """Serialize a FloatColumn after an update and compare expected JSON.

    ``time.time`` is patched so recorded timings are deterministic (0.0).
    """
    data = np.array([0.0, 5.0, 10.0])
    df = pd.Series(data).apply(str)

    float_options = FloatOptions()
    float_options.histogram_and_quantiles.bin_count_or_method = 5
    profiler = FloatColumn("0.0", float_options)

    mocked_quantiles = [0.25, 0.50, 0.75]
    with mock.patch.object(
        profiler, "_get_percentile", return_value=mocked_quantiles
    ):
        # Mock out complex _get_percentile function.
        # Only need to test valid serialization of np.ndarray.
        profiler.update(df)

    # Copy of NumericalStatsMixin code to test serialization of dicts
    expected_histogram_bin_method_names = ["custom"]
    expected_min_histogram_bin = 5
    expected_histogram_methods = {}
    for method in expected_histogram_bin_method_names:
        expected_histogram_methods[method] = {
            "total_loss": 0.0,
            "current_loss": 0.0,
            "suggested_bin_count": expected_min_histogram_bin,
            "histogram": {"bin_counts": None, "bin_edges": None},
        }
    serialized = json.dumps(profiler, cls=ProfileEncoder)

    expected = json.dumps(
        {
            "class": "FloatColumn",
            "data": {
                "min": 0.0,
                "max": 10.0,
                "_top_k_modes": 5,
                "sum": 15.0,
                "_biased_variance": 16.666666666666668,
                "_biased_skewness": 0.0,
                "_biased_kurtosis": -1.5,
                "_median_is_enabled": True,
                "_median_abs_dev_is_enabled": True,
                "max_histogram_bin": 100000,
                "min_histogram_bin": 1000,
                "histogram_bin_method_names": expected_histogram_bin_method_names,
                "histogram_selection": None,
                "user_set_histogram_bin": 5,
                "bias_correction": True,
                "_mode_is_enabled": True,
                "num_zeros": 1,
                "num_negatives": 0,
                "_num_quantiles": 1000,
                "histogram_methods": expected_histogram_methods,
                "_stored_histogram": {
                    "total_loss": 2.0,
                    "current_loss": 2.0,
                    "suggested_bin_count": 1000,
                    "histogram": {
                        "bin_counts": [1, 0, 1, 0, 1],
                        "bin_edges": [0.0, 2.0, 4.0, 6.0, 8.0, 10.0],
                    },
                },
                "_batch_history": [
                    {
                        "match_count": 3,
                        "sample_size": 3,
                        "min": 0.0,
                        "max": 10.0,
                        "sum": 15.0,
                        "biased_variance": 16.666666666666668,
                        "mean": 5.0,
                        "biased_skewness": 0.0,
                        "biased_kurtosis": -1.5,
                        "num_zeros": 1,
                        "num_negatives": 0,
                    }
                ],
                "quantiles": [0.25, 0.5, 0.75],
                "_NumericStatsMixin__calculations": {
                    "min": "_get_min",
                    "max": "_get_max",
                    "sum": "_get_sum",
                    "variance": "_get_variance",
                    "skewness": "_get_skewness",
                    "kurtosis": "_get_kurtosis",
                    "histogram_and_quantiles": "_get_histogram_and_quantiles",
                    "num_zeros": "_get_num_zeros",
                    "num_negatives": "_get_num_negatives",
                },
                "name": "0.0",
                "col_index": np.nan,
                "sample_size": 3,
                "metadata": dict(),
                "times": {
                    "precision": 0.0,
                    "min": 0.0,
                    "max": 0.0,
                    "sum": 0.0,
                    "variance": 0.0,
                    "skewness": 0.0,
                    "kurtosis": 0.0,
                    "histogram_and_quantiles": 0.0,
                    "num_zeros": 0.0,
                    "num_negatives": 0.0,
                },
                "thread_safe": True,
                "match_count": 3,
                "_precision": {
                    "min": 0.0,
                    "max": 2.0,
                    "sum": 3.0,
                    "mean": 1.0,
                    "biased_var": 0.6666666666666666,
                    "sample_size": 3,
                    "confidence_level": 0.999,
                },
                "_FloatColumn__z_value_precision": 3.291,
                "_FloatColumn__precision_sample_ratio": None,
                "_FloatColumn__calculations": {"precision": "_update_precision"},
            },
        }
    )

    self.assertEqual(serialized, expected)

def test_json_decode(self):
    """Round-trip an un-updated profile through JSON encode/decode."""
    expected_profile = FloatColumn(None)

    encoded = json.dumps(expected_profile, cls=ProfileEncoder)
    decoded_profile = load_column_profile(json.loads(encoded))

    test_utils.assert_profiles_equal(decoded_profile, expected_profile)

def test_json_decode_after_update(self):
    """Round-trip an updated profile through JSON and verify it still updates.

    Checks (1) the deserialized profile equals the original and (2) a
    subsequent ``update`` on the deserialized profile accumulates correctly.
    """
    fake_profile_name = "Fake profile name"

    # Build expected FloatColumn
    df_float = pd.Series([1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 7.0, 8.0, 9.0]).apply(
        str
    )
    expected_profile = FloatColumn(fake_profile_name)

    with test_utils.mock_timeit():
        expected_profile.update(df_float)

    # Actual deserialization
    serialized = json.dumps(expected_profile, cls=ProfileEncoder)
    deserialized = load_column_profile(json.loads(serialized))

    test_utils.assert_profiles_equal(deserialized, expected_profile)

    df_float = pd.Series(
        [
            4.0,  # add existing
            15.0,  # add new
        ]
    ).apply(str)

    # validating update after deserialization
    deserialized.update(df_float)

    # unittest-style assertions for consistency with the rest of this file;
    # the mean is a computed float, so compare approximately.
    self.assertEqual(12, deserialized.sample_size)
    self.assertAlmostEqual(
        sum([1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 7.0, 8.0, 9.0, 4, 15]) / 12,
        deserialized.mean,
    )
    self.assertEqual(15, deserialized.max)
2 changes: 1 addition & 1 deletion dataprofiler/tests/profilers/test_int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1308,7 +1308,7 @@ def test_json_decode_after_update(self):
fake_profile_name = "Fake profile name"
# Actual deserialization

# Build expected CategoricalColumn
# Build expected IntColumn
df_int = pd.Series([1, 2, 5, 7, 4, 3, 2, 7, 8, 9])
expected_profile = IntColumn(fake_profile_name)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ def test_from_dict_helper(self):
"bin_counts": None,
"bin_edges": None,
}
actual_profile._load_stats_helper()
actual_profile._reformat_numeric_stats_types_on_serialized_profiles()

test_utils.assert_profiles_equal(expected_profile, actual_profile)

Expand Down
Loading