-
Notifications
You must be signed in to change notification settings - Fork 185
Adds tests validating serialization with Primitive type for compiler #885
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
f51e7a6
b2b0e2b
a7b698f
4b37e4e
20a7a31
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -14,6 +14,8 @@ | |
| UnstructuredOptions, | ||
| ) | ||
|
|
||
| from . import utils as test_utils | ||
|
|
||
|
|
||
| class TestBaseProfileCompilerClass(unittest.TestCase): | ||
| def test_cannot_instantiate(self): | ||
|
|
@@ -84,6 +86,179 @@ def test_add_profilers(self): | |
| self.assertEqual(3, merged_compiler._profiles["test"]) | ||
| self.assertEqual("compiler1", merged_compiler.name) | ||
|
|
||
| def test_compiler_stats_diff(self): | ||
| data1 = pd.Series(["1", "9", "9"]) | ||
| data2 = pd.Series(["10", "9", "9", "9"]) | ||
| options = StructuredOptions() | ||
|
|
||
| # Test normal diff | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2) | ||
| expected_diff = { | ||
| "order": ["ascending", "descending"], | ||
| "categorical": "unchanged", | ||
| "statistics": { | ||
| "unique_count": "unchanged", | ||
| "unique_ratio": 0.16666666666666663, | ||
| "categories": [["1"], ["9"], ["10"]], | ||
| "gini_impurity": 0.06944444444444448, | ||
| "unalikeability": 0.16666666666666663, | ||
| "categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]}, | ||
| "chi2-test": { | ||
| "chi2-statistic": 2.1, | ||
| "df": 2, | ||
| "p-value": 0.3499377491111554, | ||
| }, | ||
| }, | ||
| } | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabled categorical column in one compiler | ||
| options.category.is_enabled = False | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2) | ||
| expected_diff = {"order": ["ascending", "descending"]} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabling categorical profile in both compilers | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options) | ||
| expected_diff = {"order": ["ascending", "descending"]} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabling everything | ||
| options.order.is_enabled = False | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options) | ||
| expected_diff = {} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") | ||
| @mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update" | ||
| ) | ||
| def test_compiler_data_labeler_diff(self, *mocked_datalabeler): | ||
| # Initialize dummy data | ||
| data = pd.Series([]) | ||
|
|
||
| # Test normal diff | ||
| compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data) | ||
| compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data) | ||
|
|
||
| # Mock out the data_label, avg_predictions, and label_representation | ||
| # properties | ||
| with mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile" | ||
| ".DataLabelerColumn.data_label" | ||
| ), mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile." | ||
| "DataLabelerColumn.avg_predictions" | ||
| ), mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile." | ||
| "DataLabelerColumn.label_representation" | ||
| ): | ||
| compiler1._profiles["data_labeler"].sample_size = 20 | ||
| compiler1._profiles["data_labeler"].data_label = "a" | ||
| compiler1._profiles["data_labeler"].avg_predictions = { | ||
| "a": 0.25, | ||
| "b": 0.0, | ||
| "c": 0.75, | ||
| } | ||
| compiler1._profiles["data_labeler"].label_representation = { | ||
| "a": 0.15, | ||
| "b": 0.01, | ||
| "c": 0.84, | ||
| } | ||
|
|
||
| compiler2._profiles["data_labeler"].sample_size = 20 | ||
| compiler2._profiles["data_labeler"].data_label = "b" | ||
| compiler2._profiles["data_labeler"].avg_predictions = { | ||
| "a": 0.25, | ||
| "b": 0.70, | ||
| "c": 0.05, | ||
| } | ||
| compiler2._profiles["data_labeler"].label_representation = { | ||
| "a": 0.99, | ||
| "b": 0.01, | ||
| "c": 0.0, | ||
| } | ||
|
|
||
| expected_diff = { | ||
| "statistics": { | ||
| "avg_predictions": {"a": "unchanged", "b": -0.7, "c": 0.7}, | ||
| "label_representation": {"a": -0.84, "b": "unchanged", "c": 0.84}, | ||
| }, | ||
| "data_label": [["a"], [], ["b"]], | ||
| } | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabling one datalabeler profile for compiler diff | ||
| options = StructuredOptions() | ||
| options.data_labeler.is_enabled = False | ||
| compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data, options) | ||
| expected_diff = {} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabling both datalabeler profiles for compiler diff | ||
| compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data, options) | ||
| expected_diff = {} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| @mock.patch.multiple(col_pro_compilers.BaseCompiler, __abstractmethods__=set()) | ||
| def test_no_profilers_error(self): | ||
| with self.assertRaises(NotImplementedError) as e: | ||
| col_pro_compilers.BaseCompiler() | ||
| self.assertEqual("Must add profilers.", str(e.exception)) | ||
|
|
||
| @mock.patch.multiple( | ||
| col_pro_compilers.BaseCompiler, __abstractmethods__=set(), _profilers="mock" | ||
| ) | ||
| def test_no_options_error(self): | ||
| with self.assertRaisesRegex( | ||
| NotImplementedError, "Must set the expected OptionClass." | ||
| ): | ||
| col_pro_compilers.BaseCompiler() | ||
|
|
||
| def test_update_match_are_abstract(self): | ||
| self.assertCountEqual( | ||
| {"report"}, col_pro_compilers.BaseCompiler.__abstractmethods__ | ||
| ) | ||
|
|
||
| @mock.patch.multiple(BaseColumnProfiler, __abstractmethods__=set()) | ||
| def test_json_encode(self): | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this will go away with #884 merge
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. But it doesn't go away yet, because of the odd way git rendered this diff. This test is not new. |
||
| with mock.patch.multiple( | ||
| col_pro_compilers.BaseCompiler, | ||
| __abstractmethods__=set(), | ||
| _profilers=[BaseColumnProfiler], | ||
| _option_class=BaseOption, | ||
| ): | ||
| profile = col_pro_compilers.BaseCompiler() | ||
|
|
||
| base_column_profiler = BaseColumnProfiler(name="test") | ||
| with mock.patch.object( | ||
| profile, "_profiles", {"BaseColumn": base_column_profiler} | ||
| ): | ||
| serialized = json.dumps(profile, cls=ProfileEncoder) | ||
|
|
||
| dict_of_base_column_profiler = json.loads( | ||
| json.dumps(base_column_profiler, cls=ProfileEncoder) | ||
| ) | ||
| expected = json.dumps( | ||
| { | ||
| "class": "BaseCompiler", | ||
| "data": { | ||
| "name": None, | ||
| "_profiles": { | ||
| "BaseColumn": dict_of_base_column_profiler, | ||
| }, | ||
| }, | ||
| } | ||
| ) | ||
|
|
||
| self.assertEqual(expected, serialized) | ||
|
|
||
|
|
||
| class TestColumnPrimitiveTypeProfileCompiler(unittest.TestCase): | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. new class for this compiler only |
||
| def test_primitive_compiler_report(self): | ||
| structured_options = StructuredOptions() | ||
| data1 = pd.Series(["2.6", "-1.8"]) | ||
|
|
@@ -310,144 +485,54 @@ def test_disabling_columns_during_primitive_diff(self): | |
| expected_diff = {} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| def test_compiler_stats_diff(self): | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is not new; I moved the primitive tests below so they are in the primitive compiler test class. |
||
| data1 = pd.Series(["1", "9", "9"]) | ||
| data2 = pd.Series(["10", "9", "9", "9"]) | ||
| options = StructuredOptions() | ||
|
|
||
| # Test normal diff | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2) | ||
| expected_diff = { | ||
| "order": ["ascending", "descending"], | ||
| "categorical": "unchanged", | ||
| "statistics": { | ||
| "unique_count": "unchanged", | ||
| "unique_ratio": 0.16666666666666663, | ||
| "categories": [["1"], ["9"], ["10"]], | ||
| "gini_impurity": 0.06944444444444448, | ||
| "unalikeability": 0.16666666666666663, | ||
| "categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]}, | ||
| "chi2-test": { | ||
| "chi2-statistic": 2.1, | ||
| "df": 2, | ||
| "p-value": 0.3499377491111554, | ||
| }, | ||
| }, | ||
| } | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabled categorical column in one compiler | ||
| options.category.is_enabled = False | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2) | ||
| expected_diff = {"order": ["ascending", "descending"]} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabling categorical profile in both compilers | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options) | ||
| expected_diff = {"order": ["ascending", "descending"]} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| # Test disabling everything | ||
| options.order.is_enabled = False | ||
| compiler1 = col_pro_compilers.ColumnStatsProfileCompiler(data1, options) | ||
| compiler2 = col_pro_compilers.ColumnStatsProfileCompiler(data2, options) | ||
| expected_diff = {} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
|
|
||
| @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") | ||
| @mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update" | ||
| ) | ||
| def test_compiler_data_labeler_diff(self, *mocked_datalabeler): | ||
| # Initialize dummy data | ||
| data = pd.Series([]) | ||
|
|
||
| # Test normal diff | ||
| compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data) | ||
| compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data) | ||
|
|
||
| # Mock out the data_label, avg_predictions, and label_representation | ||
| # properties | ||
| with mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile" | ||
| ".DataLabelerColumn.data_label" | ||
| ), mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile." | ||
| "DataLabelerColumn.avg_predictions" | ||
| ), mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile." | ||
| "DataLabelerColumn.label_representation" | ||
| ): | ||
| compiler1._profiles["data_labeler"].sample_size = 20 | ||
| compiler1._profiles["data_labeler"].data_label = "a" | ||
| compiler1._profiles["data_labeler"].avg_predictions = { | ||
| "a": 0.25, | ||
| "b": 0.0, | ||
| "c": 0.75, | ||
| } | ||
| compiler1._profiles["data_labeler"].label_representation = { | ||
| "a": 0.15, | ||
| "b": 0.01, | ||
| "c": 0.84, | ||
| } | ||
| def test_json_encode(self): | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This encode test is new; it is awkward how git rendered this diff... |
||
|
|
||
| compiler2._profiles["data_labeler"].sample_size = 20 | ||
| compiler2._profiles["data_labeler"].data_label = "b" | ||
| compiler2._profiles["data_labeler"].avg_predictions = { | ||
| "a": 0.25, | ||
| "b": 0.70, | ||
| "c": 0.05, | ||
| } | ||
| compiler2._profiles["data_labeler"].label_representation = { | ||
| "a": 0.99, | ||
| "b": 0.01, | ||
| "c": 0.0, | ||
| } | ||
| compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler() | ||
|
|
||
| expected_diff = { | ||
| "statistics": { | ||
| "avg_predictions": {"a": "unchanged", "b": -0.7, "c": 0.7}, | ||
| "label_representation": {"a": -0.84, "b": "unchanged", "c": 0.84}, | ||
| serialized = json.dumps(compiler, cls=ProfileEncoder) | ||
| expected = json.dumps( | ||
| { | ||
| "class": "ColumnPrimitiveTypeProfileCompiler", | ||
| "data": { | ||
| "name": None, | ||
| "_profiles": {}, | ||
| }, | ||
| "data_label": [["a"], [], ["b"]], | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is all just moved, not new |
||
| } | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
| ) | ||
| self.assertEqual(expected, serialized) | ||
|
|
||
| # Test disabling one datalabeler profile for compiler diff | ||
| options = StructuredOptions() | ||
| options.data_labeler.is_enabled = False | ||
| compiler1 = col_pro_compilers.ColumnDataLabelerCompiler(data, options) | ||
| expected_diff = {} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
| def test_json_encode_after_update(self): | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is new for the test for this compiler |
||
|
|
||
| # Test disabling both datalabeler profiles for compiler diff | ||
| compiler2 = col_pro_compilers.ColumnDataLabelerCompiler(data, options) | ||
| expected_diff = {} | ||
| self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) | ||
| data = pd.Series(["-2", "-1", "1", "2"], name="test") | ||
| with test_utils.mock_timeit(): | ||
| compiler = col_pro_compilers.ColumnPrimitiveTypeProfileCompiler(data) | ||
|
|
||
| @mock.patch.multiple(col_pro_compilers.BaseCompiler, __abstractmethods__=set()) | ||
| def test_no_profilers_error(self): | ||
| with self.assertRaises(NotImplementedError) as e: | ||
| col_pro_compilers.BaseCompiler() | ||
| self.assertEqual("Must add profilers.", str(e.exception)) | ||
| with mock.patch.object(compiler._profiles["datetime"], "__dict__", {}): | ||
| with mock.patch.object(compiler._profiles["int"], "__dict__", {}): | ||
| with mock.patch.object(compiler._profiles["float"], "__dict__", {}): | ||
| with mock.patch.object(compiler._profiles["text"], "__dict__", {}): | ||
| serialized = json.dumps(compiler, cls=ProfileEncoder) | ||
|
|
||
| @mock.patch.multiple( | ||
| col_pro_compilers.BaseCompiler, __abstractmethods__=set(), _profilers="mock" | ||
| ) | ||
| def test_no_options_error(self): | ||
| with self.assertRaisesRegex( | ||
| NotImplementedError, "Must set the expected OptionClass." | ||
| ): | ||
| col_pro_compilers.BaseCompiler() | ||
| # pop the data inside primitive column profiler as we just want to make | ||
| # sure generally it is serializing, decode will validate true replication | ||
|
|
||
| def test_update_match_are_abstract(self): | ||
| self.assertCountEqual( | ||
| {"report"}, col_pro_compilers.BaseCompiler.__abstractmethods__ | ||
| expected = json.dumps( | ||
| { | ||
| "class": "ColumnPrimitiveTypeProfileCompiler", | ||
| "data": { | ||
| "name": "test", | ||
| "_profiles": { | ||
| "datetime": {"class": "DateTimeColumn", "data": {}}, | ||
| "int": {"class": "IntColumn", "data": {}}, | ||
| "float": {"class": "FloatColumn", "data": {}}, | ||
| "text": {"class": "TextColumn", "data": {}}, | ||
| }, | ||
| }, | ||
| } | ||
| ) | ||
|
|
||
| self.assertEqual(expected, serialized) | ||
|
|
||
| @mock.patch.multiple(BaseColumnProfiler, __abstractmethods__=set()) | ||
| def test_json_encode(self): | ||
| with mock.patch.multiple( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -1830,7 +1830,7 @@ def test_json_encode(self): | |
| }, | ||
| } | ||
| ) | ||
| self.assertEqual(serialized, expected) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. standardized |
||
| self.assertEqual(expected, serialized) | ||
|
|
||
| @mock.patch("time.time", return_value=0.0) | ||
| def test_json_encode_after_update(self, time): | ||
|
|
@@ -1956,7 +1956,7 @@ def test_json_encode_after_update(self, time): | |
| } | ||
| ) | ||
|
|
||
| self.assertEqual(serialized, expected) | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. standardized |
||
| self.assertEqual(expected, serialized) | ||
|
|
||
| def test_json_decode(self): | ||
| fake_profile_name = None | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is not new; I moved the primitive tests below so they are in the primitive compiler test class.