Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,11 @@ def load_from_dict(cls, data, options: dict | None = None) -> StructuredColProfi
if attr == "options" and value is not None:
value = load_option(value, options)
if attr == "_null_values":
value = {k: profile._null_values[k] for k, v in value.items()}
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was a bug: it would cause errors if the previously saved null values didn't exist in the profile's default `_null_values`.

value = {
k: (re.RegexFlag(v) if v != 0 else 0) for k, v in value.items()
}
if attr == "null_types_index":
value = {k: set(v) for k, v in value.items()}
Comment on lines +408 to +409
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added tests which validate this functionality.

setattr(profile, attr, value)
return profile

Expand Down
170 changes: 161 additions & 9 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

import dataprofiler as dp
from dataprofiler import StructuredDataLabeler, UnstructuredDataLabeler
from dataprofiler.labelers.base_data_labeler import BaseDataLabeler
from dataprofiler.profilers.column_profile_compilers import (
ColumnDataLabelerCompiler,
ColumnPrimitiveTypeProfileCompiler,
Expand Down Expand Up @@ -2618,7 +2619,124 @@ def test_diff(self, *mocks):

self.assertDictEqual(expected_diff, dict(profile1.diff(profile2)))

def test_json_decode(self, *mocks):
@mock.patch(
"dataprofiler.profilers.data_labeler_column_profile.DataLabeler",
spec=BaseDataLabeler,
)
def test_json_encode(self, mocked_datalabeler, *mocks):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was previously missing.

col_profiler = StructuredColProfiler()

serialized = json.dumps(col_profiler, cls=ProfileEncoder)
expected = json.dumps(
{
"class": "StructuredColProfiler",
"data": {
"name": None,
"options": None,
"_min_sample_size": 5000,
"_sampling_ratio": 0.2,
"_min_true_samples": 0,
"sample_size": 0,
"sample": [],
"null_count": 0,
"null_types": [],
"null_types_index": {},
"_min_id": None,
"_max_id": None,
"_index_shift": None,
"_last_batch_size": None,
"profiles": {},
"_null_values": {
"": 0,
"nan": 2,
"none": 2,
"null": 2,
" *": 0,
"--*": 0,
"__*": 0,
},
},
}
)
self.assertEqual(expected, serialized)

@mock.patch(
"dataprofiler.profilers.data_labeler_column_profile.DataLabeler",
spec=BaseDataLabeler,
)
def test_json_encode_after_update(self, mock_DataLabeler, *mocks):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This test was previously missing.

mock_labeler = mock_DataLabeler.return_value
mock_labeler._default_model_loc = "test"
mock_labeler.model.num_labels = 2
mock_labeler.reverse_label_mapping = {1: "a", 2: "b"}
mock_DataLabeler.load_from_library.return_value = mock_labeler

data = pd.Series(["-2", "Nan", "1", "2"], name="test")
# update mock for 4 values
mock_labeler.predict.return_value = {"pred": [], "conf": np.zeros((4, 2))}
with test_utils.mock_timeit():
col_profiler = StructuredColProfiler(data)

serialized = json.dumps(col_profiler, cls=ProfileEncoder)

expected = {
"class": "StructuredColProfiler",
"data": {
"name": "test",
"options": mock.ANY,
"_min_sample_size": 5000,
"_sampling_ratio": 0.2,
"_min_true_samples": 0,
"sample_size": 4,
"sample": ["1", "2", "-2"],
"null_count": 1,
"null_types": ["Nan"],
"null_types_index": {
"Nan": [
1,
]
},
"_min_id": 0,
"_max_id": 3,
"_index_shift": None,
"_last_batch_size": 4,
"_null_values": {
"": 0,
"nan": 2,
"none": 2,
"null": 2,
" *": 0,
"--*": 0,
"__*": 0,
},
"profiles": {
"data_type_profile": {
"class": "ColumnPrimitiveTypeProfileCompiler",
"data": mock.ANY,
},
"data_stats_profile": {
"class": "ColumnStatsProfileCompiler",
"data": mock.ANY,
},
"data_label_profile": {
"class": "ColumnDataLabelerCompiler",
"data": mock.ANY,
},
},
},
}

self.assertDictEqual(expected, json.loads(serialized))

@mock.patch(
"dataprofiler.profilers.data_labeler_column_profile.DataLabeler",
spec=BaseDataLabeler,
)
def test_json_decode(self, mock_DataLabeler, *mocks):
mock_labeler = mock.Mock(spec=BaseDataLabeler)
mock_labeler._default_model_loc = "test"
mock_DataLabeler.load_from_library = mock_labeler

fake_profile_name = None
expected_profile = StructuredColProfiler(fake_profile_name)

Expand All @@ -2627,31 +2745,65 @@ def test_json_decode(self, *mocks):

test_utils.assert_profiles_equal(deserialized, expected_profile)

def test_json_decode_after_update(self):
# Actual deserialization
@mock.patch(
"dataprofiler.profilers.data_labeler_column_profile.DataLabeler",
spec=BaseDataLabeler,
)
def test_json_decode_after_update(self, mock_DataLabeler, *mocks):
mock_labeler = mock_DataLabeler.return_value
mock_labeler._default_model_loc = "test"
mock_labeler.model.num_labels = 2
mock_labeler.reverse_label_mapping = {1: "a", 2: "b"}
mock_DataLabeler.load_from_library.return_value = mock_labeler

# Build expected StructuredColProfiler
df_float = pd.Series([-1.5, None, 5.0, 7.0, 4.0, 3.0, "NaN", 0, 0, 9.0]).apply(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the test data to include null values (`None` and `"NaN"`).

str
)
# update mock for 10 values
mock_labeler.predict.return_value = {"pred": [], "conf": np.zeros((10, 2))}

# Build expected FloatColumn
df_float = pd.Series([-1.5, 2.2, 5.0, 7.0, 4.0, 3.0, 2.0, 0, 0, 9.0]).apply(str)
expected_profile = StructuredColProfiler(df_float)

serialized = json.dumps(expected_profile, cls=ProfileEncoder)
deserialized = load_structured_col_profiler(json.loads(serialized))

test_utils.assert_profiles_equal(deserialized, expected_profile)
assert deserialized.null_count == 2
assert deserialized.null_types_index == {
"None": {
1,
},
"NaN": {
6,
},
}

df_float = pd.Series(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated the test data to include null values (`"NaN"` and `"null"`).

[
"4.0", # add existing
"NaN", # add existing
"15.0", # add new
"null", # add new
]
)
# update mock for 2 Values
mock_labeler.predict.return_value = {"pred": [], "conf": [[1, 1], [0, 0]]}

# validating update after deserialization
deserialized.update_profile(df_float)

assert deserialized.sample_size == 12
assert deserialized.null_count == 0
assert deserialized.profile["data_label"] == "ORDINAL"
assert deserialized.sample_size == 13
assert deserialized.null_count == 4
assert deserialized.null_types_index == {
"None": {
1,
},
"NaN": {6, 10},
"null": {
12,
},
}
assert deserialized.profile["data_label"] == "a"
assert deserialized.profile["statistics"]["max"] == 15
assert deserialized.profile["statistics"]["min"] == -1.5

Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ fastavro>=1.0.0.post1
python-snappy>=0.5.4
charset-normalizer>=1.3.6
psutil>=4.0.0
scipy>=1.4.1
scipy>=1.4.1,<1.11.0
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scipy 1.11.0 causes the graph profiler tests to start failing.

We will need to investigate what is failing and why the changes described in https://scipy.github.io/devdocs/release/1.11.0-notes.html#statistical-distributions
cause these failures.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Issue created #905

requests>=2.28.1
networkx>=2.5.1
typing-extensions>=3.10.0.2