Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,34 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
raise NotImplementedError()

@classmethod
def load_from_dict(cls, data) -> BaseColumnProfiler:
    """
    Parse attributes from a JSON dictionary into a new profiler instance.

    Does NOT mutate ``data``: the "times" entry is removed from a shallow
    copy, and resolved ``__calculations`` mappings are built as new dicts
    rather than rewritten in place.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]

    :return: Profiler with attributes populated.
    :rtype: BaseColumnProfiler

    :raises AttributeError: if a serialized calculation references a method
        that does not exist on the constructed profiler.
    """
    profile = cls(data["name"])

    # Shallow-copy so popping "times" does not mutate the caller's dict.
    data = dict(data)
    time_vals = data.pop("times")
    profile.times = defaultdict(float, time_vals)

    for attr, value in data.items():
        if "__calculations" in attr:
            # Serialized calculations map metric name -> method name.
            # Rebind each name to the bound method on this instance,
            # building a new dict so the input's nested dict is untouched.
            resolved = {}
            for metric, function in value.items():
                if not hasattr(profile, function):
                    raise AttributeError(
                        f"Object {type(profile)} has no attribute {function}."
                    )
                resolved[metric] = getattr(profile, function)
            value = resolved
        setattr(profile, attr, value)

    return profile


class BaseColumnPrimitiveTypeProfiler(
BaseColumnProfiler,
Expand Down
16 changes: 16 additions & 0 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,22 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
return self.profile

@classmethod
def load_from_dict(cls, data):
    """
    Parse attributes from a JSON dictionary into a new profiler instance.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]

    :return: Profiler with attributes populated.
    :rtype: CategoricalColumn
    """
    # Pull the category counts out before delegating, so the base loader
    # does not set them as a plain dict; restore them as a defaultdict.
    category_counts = data.pop("_categories")
    profile = super().load_from_dict(data)
    profile._categories = defaultdict(int, category_counts)
    return profile

@property
def profile(self) -> dict:
"""
Expand Down
35 changes: 11 additions & 24 deletions dataprofiler/profilers/json_decoder.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
"""Contains methods to decode components of a Profiler."""

import json
from typing import Dict, Optional, Type

from .base_column_profilers import BaseColumnProfiler
from .categorical_column_profile import CategoricalColumn
from .float_column_profile import FloatColumn


def get_column_profiler_class(class_name: str) -> Type[BaseColumnProfiler]:
    """
    Use name of class to return default-constructed version of that class.

    :param class_name: name of the profiler class to look up
    :type class_name: str representing name of class
    :return: subclass of BaseColumnProfiler object
    :raises ValueError: if ``class_name`` does not name a known profiler
    """
    # Registry of deserializable column-profiler classes, keyed by name.
    registry: Dict[str, Type[BaseColumnProfiler]] = {
        CategoricalColumn.__name__: CategoricalColumn,
        FloatColumn.__name__: FloatColumn,
    }

    matched_class: Optional[Type[BaseColumnProfiler]] = registry.get(class_name)
    if matched_class is None:
        raise ValueError(f"Invalid profiler class {class_name} failed to load.")
    return matched_class


def load_column_profile(serialized_json: dict) -> BaseColumnProfiler:
    """
    Construct subclass of BaseColumnProfiler given a serialized JSON.

    :param serialized_json: JSON representation of a column profiler that was
        serialized using the custom encoder in profilers.json_encoder
    :type serialized_json: a dict that was created by deserializing
        a JSON representation using the custom encoder
    :return: subclass of BaseColumnProfiler that has been deserialized from
        JSON
    """
    # Resolve the concrete class from the "class" tag, then let that class
    # rebuild itself from the "data" payload.
    profiler_class: Type[BaseColumnProfiler] = get_column_profiler_class(
        serialized_json["class"]
    )
    return profiler_class.load_from_dict(serialized_json["data"])
26 changes: 19 additions & 7 deletions dataprofiler/tests/profilers/test_categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd

from dataprofiler.profilers import CategoricalColumn
from dataprofiler.profilers.json_decoder import decode_column_profiler
from dataprofiler.profilers.json_decoder import load_column_profile
from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profile_builder import StructuredColProfiler
from dataprofiler.profilers.profiler_options import CategoricalOptions
Expand Down Expand Up @@ -233,7 +233,6 @@ def test_mixed_categorical_col_integer_string(self):
self.assertCountEqual(categories, profile.categories)

def test_categorical_mapping(self):

df1 = pd.Series(
[
"abcd",
Expand Down Expand Up @@ -747,7 +746,7 @@ def test_json_encode_after_update(self):
)
profile = CategoricalColumn(df_categorical.name)

with patch("time.time", side_effect=lambda: 0.0):
with test_utils.mock_timeit():
profile.update(df_categorical)

serialized = json.dumps(profile, cls=ProfileEncoder)
Expand All @@ -759,7 +758,7 @@ def test_json_encode_after_update(self):
"col_index": np.nan,
"sample_size": 12,
"metadata": {},
"times": {"categories": 0.0},
"times": {"categories": 1.0},
"thread_safe": True,
"_categories": {"c": 5, "b": 4, "a": 3},
"_CategoricalColumn__calculations": {},
Expand All @@ -775,7 +774,7 @@ def test_json_decode(self):
expected_profile = CategoricalColumn(fake_profile_name)

serialized = json.dumps(expected_profile, cls=ProfileEncoder)
deserialized = decode_column_profiler(serialized)
deserialized = load_column_profile(json.loads(serialized))

test_utils.assert_profiles_equal(deserialized, expected_profile)

Expand All @@ -802,14 +801,27 @@ def test_json_decode_after_update(self):
)
expected_profile = CategoricalColumn(fake_profile_name)

with patch("time.time", side_effect=lambda: 0.0):
with test_utils.mock_timeit():
expected_profile.update(df_categorical)

serialized = json.dumps(expected_profile, cls=ProfileEncoder)
deserialized = decode_column_profiler(serialized)
deserialized = load_column_profile(json.loads(serialized))

test_utils.assert_profiles_equal(deserialized, expected_profile)

df_categorical = pd.Series(
[
"a", # add existing
"d", # add new
Comment on lines +814 to +815
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

love this comment for superb clarity and readability

]
)

# validating update after deserialization
deserialized.update(df_categorical)

assert deserialized.sample_size == 14
assert deserialized.categorical_counts == {"c": 5, "b": 4, "a": 4, "d": 1}


class TestCategoricalSentence(unittest.TestCase):
def setUp(self):
Expand Down
51 changes: 23 additions & 28 deletions dataprofiler/tests/profilers/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import dataprofiler as dp
from dataprofiler.profilers.base_column_profilers import BaseColumnProfiler
from dataprofiler.profilers.profile_builder import BaseProfiler
from dataprofiler.profilers.utils import find_diff_of_dicts


def set_seed(seed=None):
Expand Down Expand Up @@ -166,37 +167,31 @@ def increment_counter():
return mock.patch("time.time", side_effect=lambda: next(counter))


def assert_profiles_equal(actual, expected):
    """
    Check if two profile objects are equal.

    Profiles are instances of BaseProfiler or BaseColumnProfiler. Raises
    AssertionError if they are not equal.

    :param actual: profile to compare to expected
    :type actual: instance of BaseProfiler or BaseColumnProfiler
    :param expected: profile to compare to actual
    :type expected: instance of BaseProfiler or BaseColumnProfiler
    """
    actual_dict = actual.__dict__
    expected_dict = expected.__dict__

    assert len(actual_dict) == len(expected_dict), (
        f"number of attributes on actual ({len(actual_dict)}) "
        f"!= expected ({len(expected_dict)})"
    )

    # Compare attributes by NAME, not by dict insertion order: zipping
    # .values() would silently compare unrelated attributes whenever the
    # two profiles set their attributes in a different order.
    for attr, actual_value in actual_dict.items():
        assert attr in expected_dict, f"Profile attributes unmatched: {attr}"
        expected_value = expected_dict[attr]

        assert type(actual_value) == type(
            expected_value
        ), f"Profile value types unmatched: {actual_value} != {expected_value}"

        if isinstance(actual_value, (BaseProfiler, BaseColumnProfiler)):
            # Nested profiles are compared recursively.
            assert_profiles_equal(actual_value, expected_value)
        elif isinstance(actual_value, numbers.Number):
            # np.testing handles NaN == NaN and numpy scalar comparisons.
            np.testing.assert_equal(actual_value, expected_value)
        else:
            assert (
                actual_value == expected_value
            ), f"Profile values unmatched: {actual_value} != {expected_value}"