Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
339 changes: 212 additions & 127 deletions dataprofiler/tests/profilers/test_column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
UnstructuredOptions,
)

from . import utils as test_utils


class TestBaseProfileCompilerClass(unittest.TestCase):
def test_cannot_instantiate(self):
Expand Down Expand Up @@ -84,6 +86,179 @@ def test_add_profilers(self):
self.assertEqual(3, merged_compiler._profiles["test"])
self.assertEqual("compiler1", merged_compiler.name)

def test_compiler_stats_diff(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not new; I moved the primitive tests below into the primitive compiler test class.

data1 = pd.Series(["1", "9", "9"])
data2 = pd.Series(["10", "9", "9", "9"])
options = StructuredOptions()

# Test normal diff
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
expected_diff = {
"order": ["ascending", "descending"],
"categorical": "unchanged",
"statistics": {
"unique_count": "unchanged",
"unique_ratio": 0.16666666666666663,
"categories": [["1"], ["9"], ["10"]],
"gini_impurity": 0.06944444444444448,
"unalikeability": 0.16666666666666663,
"categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]},
"chi2-test": {
"chi2-statistic": 2.1,
"df": 2,
"p-value": 0.3499377491111554,
},
},
}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

# Test disabled categorical column in one compiler
options.category.is_enabled = False
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
expected_diff = {"order": ["ascending", "descending"]}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

# Test disabling categorical profile in both compilers
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options)
expected_diff = {"order": ["ascending", "descending"]}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

# Test disabling everything
options.order.is_enabled = False
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options)
expected_diff = {}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

@mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler")
@mock.patch(
    "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update"
)
def test_compiler_data_labeler_diff(self, *mocked_datalabeler):
    """Check ColumnDataLabelerCompiler.diff with the labeler enabled and disabled.

    The decorators patch ``DataLabeler`` and ``DataLabelerColumn.update``;
    the injected mocks are gathered in ``mocked_datalabeler`` and are not
    referenced in the body.
    """
    labeler_column = (
        "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn"
    )

    # Initialize dummy data
    data = pd.Series([])

    # Test normal diff
    compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data)
    compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data)

    # Patch data_label, avg_predictions and label_representation on the
    # class so that fixed values can be assigned on each profile below.
    with mock.patch(labeler_column + ".data_label"), mock.patch(
        labeler_column + ".avg_predictions"
    ), mock.patch(labeler_column + ".label_representation"):
        first_labeler = compiler1._profiles["data_labeler"]
        first_labeler.sample_size = 20
        first_labeler.data_label = "a"
        first_labeler.avg_predictions = {"a": 0.25, "b": 0.0, "c": 0.75}
        first_labeler.label_representation = {"a": 0.15, "b": 0.01, "c": 0.84}

        second_labeler = compiler2._profiles["data_labeler"]
        second_labeler.sample_size = 20
        second_labeler.data_label = "b"
        second_labeler.avg_predictions = {"a": 0.25, "b": 0.70, "c": 0.05}
        second_labeler.label_representation = {"a": 0.99, "b": 0.01, "c": 0.0}

        normal_diff = {
            "statistics": {
                "avg_predictions": {"a": "unchanged", "b": -0.7, "c": 0.7},
                "label_representation": {
                    "a": -0.84,
                    "b": "unchanged",
                    "c": 0.84,
                },
            },
            "data_label": [["a"], [], ["b"]],
        }
        self.assertDictEqual(normal_diff, compiler1.diff(compiler2))

    # Test disabling one datalabeler profile for compiler diff
    options = StructuredOptions()
    options.data_labeler.is_enabled = False
    compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
    self.assertDictEqual({}, compiler1.diff(compiler2))

    # Test disabling both datalabeler profiles for compiler diff
    compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
    self.assertDictEqual({}, compiler1.diff(compiler2))

@mock.patch.multiple(col_pro_compilers.BaseCompiler, __abstractmethods__=set())
def test_no_profilers_error(self):
    """Instantiating BaseCompiler must raise NotImplementedError.

    ``__abstractmethods__`` is patched to an empty set so the class can be
    instantiated at all; the raised message is then compared exactly.
    """
    with self.assertRaises(NotImplementedError) as raised:
        col_pro_compilers.BaseCompiler()

    # Checked after the context manager exits, once the exception is captured.
    self.assertEqual("Must add profilers.", str(raised.exception))

@mock.patch.multiple(
    col_pro_compilers.BaseCompiler, __abstractmethods__=set(), _profilers="mock"
)
def test_no_options_error(self):
    """Instantiating BaseCompiler with ``_profilers`` set must still raise.

    ``_profilers`` is patched to a placeholder value and
    ``__abstractmethods__`` to an empty set; the test expects a
    NotImplementedError whose message matches the option-class text.
    """
    expected_message = "Must set the expected OptionClass."
    with self.assertRaisesRegex(NotImplementedError, expected_message):
        col_pro_compilers.BaseCompiler()

def test_update_match_are_abstract(self):
    """Assert that ``report`` is the only abstract method on BaseCompiler."""
    abstract_names = col_pro_compilers.BaseCompiler.__abstractmethods__
    self.assertCountEqual({"report"}, abstract_names)

@mock.patch.multiple(BaseColumnProfiler, __abstractmethods__=set())
def test_json_encode(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will go away once #884 is merged.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

...but it doesn't yet, because of git's odd diffing choices. This is not new.

with mock.patch.multiple(
col_pro_compilers.BaseCompiler,
__abstractmethods__=set(),
_profilers=[BaseColumnProfiler],
_option_class=BaseOption,
):
profile = col_pro_compilers.BaseCompiler()

base_column_profiler = BaseColumnProfiler(name="test")
with mock.patch.object(
profile, "_profiles", {"BaseColumn": base_column_profiler}
):
serialized = json.dumps(profile, cls=ProfileEncoder)

dict_of_base_column_profiler = json.loads(
json.dumps(base_column_profiler, cls=ProfileEncoder)
)
expected = json.dumps(
{
"class": "BaseCompiler",
"data": {
"name": None,
"_profiles": {
"BaseColumn": dict_of_base_column_profiler,
},
},
}
)

self.assertEqual(expected, serialized)


class TestColumnPrimitiveTypeProfileCompiler(unittest.TestCase):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new class for this compiler only

def test_primitive_compiler_report(self):
structured_options = StructuredOptions()
data1 = pd.Series(["2.6", "-1.8"])
Expand Down Expand Up @@ -310,144 +485,54 @@ def test_disabling_columns_during_primitive_diff(self):
expected_diff = {}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

def test_compiler_stats_diff(self):
Copy link
Contributor Author

@JGSweets JGSweets Jun 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is not new; I moved the primitive tests below into the primitive compiler test class.

data1 = pd.Series(["1", "9", "9"])
data2 = pd.Series(["10", "9", "9", "9"])
options = StructuredOptions()

# Test normal diff
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
expected_diff = {
"order": ["ascending", "descending"],
"categorical": "unchanged",
"statistics": {
"unique_count": "unchanged",
"unique_ratio": 0.16666666666666663,
"categories": [["1"], ["9"], ["10"]],
"gini_impurity": 0.06944444444444448,
"unalikeability": 0.16666666666666663,
"categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]},
"chi2-test": {
"chi2-statistic": 2.1,
"df": 2,
"p-value": 0.3499377491111554,
},
},
}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

# Test disabled categorical column in one compiler
options.category.is_enabled = False
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2)
expected_diff = {"order": ["ascending", "descending"]}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

# Test disabling categorical profile in both compilers
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options)
expected_diff = {"order": ["ascending", "descending"]}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

# Test disabling everything
options.order.is_enabled = False
compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options)
compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options)
expected_diff = {}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

@mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler")
@mock.patch(
"dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update"
)
def test_compiler_data_labeler_diff(self, *mocked_datalabeler):
# Initialize dummy data
data = pd.Series([])

# Test normal diff
compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data)
compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data)

# Mock out the data_label, avg_predictions, and label_representation
# properties
with mock.patch(
"dataprofiler.profilers.data_labeler_column_profile"
".DataLabelerColumn.data_label"
), mock.patch(
"dataprofiler.profilers.data_labeler_column_profile."
"DataLabelerColumn.avg_predictions"
), mock.patch(
"dataprofiler.profilers.data_labeler_column_profile."
"DataLabelerColumn.label_representation"
):
compiler1._profiles["data_labeler"].sample_size = 20
compiler1._profiles["data_labeler"].data_label = "a"
compiler1._profiles["data_labeler"].avg_predictions = {
"a": 0.25,
"b": 0.0,
"c": 0.75,
}
compiler1._profiles["data_labeler"].label_representation = {
"a": 0.15,
"b": 0.01,
"c": 0.84,
}
def test_json_encode(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This encode test is new; it is awkward how git rendered this diff...


compiler2._profiles["data_labeler"].sample_size = 20
compiler2._profiles["data_labeler"].data_label = "b"
compiler2._profiles["data_labeler"].avg_predictions = {
"a": 0.25,
"b": 0.70,
"c": 0.05,
}
compiler2._profiles["data_labeler"].label_representation = {
"a": 0.99,
"b": 0.01,
"c": 0.0,
}
compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler()

expected_diff = {
"statistics": {
"avg_predictions": {"a": "unchanged", "b": -0.7, "c": 0.7},
"label_representation": {"a": -0.84, "b": "unchanged", "c": 0.84},
serialized = json.dumps(compiler, cls=ProfileEncoder)
expected = json.dumps(
{
"class": "ColumnPrimitiveTypeProfileCompiler",
"data": {
"name": None,
"_profiles": {},
},
"data_label": [["a"], [], ["b"]],
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is all just moved, not new

}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
)
self.assertEqual(expected, serialized)

# Test disabling one datalabeler profile for compiler diff
options = StructuredOptions()
options.data_labeler.is_enabled = False
compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
expected_diff = {}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
def test_json_encode_after_update(self):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this is new for the test for this compiler


# Test disabling both datalabeler profiles for compiler diff
compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data, options)
expected_diff = {}
self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
data = pd.Series(["-2", "-1", "1", "2"], name="test")
with test_utils.mock_timeit():
compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data)

@mock.patch.multiple(col_pro_compilers.BaseCompiler, __abstractmethods__=set())
def test_no_profilers_error(self):
with self.assertRaises(NotImplementedError) as e:
col_pro_compilers.BaseCompiler()
self.assertEqual("Must add profilers.", str(e.exception))
with mock.patch.object(compiler._profiles["datetime"], "__dict__", {}):
with mock.patch.object(compiler._profiles["int"], "__dict__", {}):
with mock.patch.object(compiler._profiles["float"], "__dict__", {}):
with mock.patch.object(compiler._profiles["text"], "__dict__", {}):
serialized = json.dumps(compiler, cls=ProfileEncoder)

@mock.patch.multiple(
col_pro_compilers.BaseCompiler, __abstractmethods__=set(), _profilers="mock"
)
def test_no_options_error(self):
with self.assertRaisesRegex(
NotImplementedError, "Must set the expected OptionClass."
):
col_pro_compilers.BaseCompiler()
# pop the data inside primitive column profiler as we just want to make
# sure generally it is serializing, decode will validate true replication

def test_update_match_are_abstract(self):
self.assertCountEqual(
{"report"}, col_pro_compilers.BaseCompiler.__abstractmethods__
expected = json.dumps(
{
"class": "ColumnPrimitiveTypeProfileCompiler",
"data": {
"name": "test",
"_profiles": {
"datetime": {"class": "DateTimeColumn", "data": {}},
"int": {"class": "IntColumn", "data": {}},
"float": {"class": "FloatColumn", "data": {}},
"text": {"class": "TextColumn", "data": {}},
},
},
}
)

self.assertEqual(expected, serialized)

@mock.patch.multiple(BaseColumnProfiler, __abstractmethods__=set())
def test_json_encode(self):
with mock.patch.multiple(
Expand Down
4 changes: 2 additions & 2 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1830,7 +1830,7 @@ def test_json_encode(self):
},
}
)
self.assertEqual(serialized, expected)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

standardized

self.assertEqual(expected, serialized)

@mock.patch("time.time", return_value=0.0)
def test_json_encode_after_update(self, time):
Expand Down Expand Up @@ -1956,7 +1956,7 @@ def test_json_encode_after_update(self, time):
}
)

self.assertEqual(serialized, expected)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

standardized

self.assertEqual(expected, serialized)

def test_json_decode(self):
fake_profile_name = None
Expand Down