-
Notifications
You must be signed in to change notification settings - Fork 185
refactor: use seed for sample for consistency #927
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -79,10 +79,10 @@ def setUpClass(cls): | |
| ) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" | ||
| "dataprofiler.profilers.profile_builder.ColumnPrimitiveTypeProfileCompiler" | ||
| ) | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder.ColumnStatsProfileCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder.ColumnDataLabelerCompiler") | ||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder.DataLabeler", spec=StructuredDataLabeler | ||
| ) | ||
|
|
@@ -103,16 +103,15 @@ def test_bad_input_data(self, *mocks): | |
| StructuredProfiler(data) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" | ||
| "dataprofiler.profilers.profile_builder.ColumnPrimitiveTypeProfileCompiler" | ||
| ) | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder.ColumnStatsProfileCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder.ColumnDataLabelerCompiler") | ||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder.DataLabeler", spec=StructuredDataLabeler | ||
| ) | ||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." | ||
| "StructuredProfiler._update_correlation" | ||
| "dataprofiler.profilers.profile_builder.StructuredProfiler._update_correlation" | ||
| ) | ||
| def test_list_data(self, *mocks): | ||
| data = [[1, 1], [None, None], [3, 3], [4, 4], [5, 5], [None, None], [1, 1]] | ||
|
|
@@ -132,7 +131,7 @@ def test_list_data(self, *mocks): | |
|
|
||
| # validates the sample out maintains the same visual data format as the | ||
| # input. | ||
| self.assertListEqual(["5", "1", "1", "3", "4"], profiler.profile[0].sample) | ||
|
||
| self.assertListEqual(["1", "4", "5", "1", "3"], profiler.profile[0].sample) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" | ||
|
|
@@ -1492,7 +1491,7 @@ def test_save_and_load_pkl_file(self): | |
| save_profile.save() | ||
| mock_file.seek(0) | ||
| with mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "DataLabeler", | ||
| "dataprofiler.profilers.profile_builder.DataLabeler", | ||
| return_value=data_labeler, | ||
| ): | ||
| load_profile = dp.StructuredProfiler.load("mock.pkl") | ||
|
|
@@ -1533,7 +1532,7 @@ def test_save_and_load_no_labeler(self): | |
| save_profile.save() | ||
|
|
||
| mock_file.seek(0) | ||
| with mock.patch("dataprofiler.profilers.profile_builder." "DataLabeler"): | ||
| with mock.patch("dataprofiler.profilers.profile_builder.DataLabeler"): | ||
| load_profile = dp.StructuredProfiler.load("mock.pkl") | ||
|
|
||
| # Check that reports are equivalent | ||
|
|
@@ -1546,7 +1545,7 @@ def test_save_and_load_no_labeler(self): | |
| load_profile.update_profile(pd.DataFrame({"a": [4, 5]})) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile." "DataLabelerColumn.update" | ||
| "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" | ||
| ) | ||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder.DataLabeler", | ||
|
|
@@ -1579,9 +1578,9 @@ def test_save_json_file(self, *mocks): | |
|
|
||
| # Save and Load profile with Mock IO | ||
| with mock.patch("builtins.open") as mock_open, mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "datetime" | ||
| ) as time_out: | ||
| time_out.now().strftime.return_value = "now" | ||
| "dataprofiler.profilers.profile_builder.datetime" | ||
| ) as mock_pb_datetime: | ||
| mock_pb_datetime.now().strftime.return_value = "now" | ||
| mock_file = setup_save_mock_string_open(mock_open) | ||
| save_profile.save("output/mock.json", "json") | ||
| mock_file.seek(0) | ||
|
|
@@ -1624,9 +1623,9 @@ def test_save_json_file(self, *mocks): | |
|
|
||
| # do a second call without a specified file path | ||
| with mock.patch("builtins.open") as mock_open, mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "datetime" | ||
| ) as time_out: | ||
| time_out.now().strftime.return_value = "now" | ||
| "dataprofiler.profilers.profile_builder.datetime" | ||
| ) as mock_pb_datetime: | ||
| mock_pb_datetime.now().strftime.return_value = "now" | ||
| setup_save_mock_string_open(mock_open) | ||
| save_profile.save(save_method="json") | ||
|
|
||
|
|
@@ -2404,7 +2403,17 @@ def test_json_decode(self, mock_utils_DataLabeler, mock_DataLabeler, *mocks): | |
| mock_DataLabeler.return_value = mock_labeler | ||
|
|
||
| fake_profile_name = None | ||
| expected_profile = StructuredProfiler(fake_profile_name) | ||
| profile_options = dp.ProfilerOptions() | ||
|
||
| profile_options.set( | ||
| { | ||
| "correlation.is_enabled": True, | ||
| "null_replication_metrics.is_enabled": True, | ||
| "multiprocess.is_enabled": False, | ||
| } | ||
| ) | ||
| expected_profile = StructuredProfiler( | ||
| fake_profile_name, options=profile_options | ||
| ) | ||
|
|
||
| serialized = json.dumps(expected_profile, cls=ProfileEncoder) | ||
| deserialized = load_profiler(json.loads(serialized)) | ||
|
|
@@ -2440,7 +2449,17 @@ def test_json_decode_after_update( | |
| "pred": [], | ||
| "conf": [[1, 1], [0, 0]], | ||
| } | ||
| expected_profile = StructuredProfiler(fake_profile_name) | ||
|
||
| profile_options = dp.ProfilerOptions() | ||
| profile_options.set( | ||
| { | ||
| "correlation.is_enabled": True, | ||
| "null_replication_metrics.is_enabled": True, | ||
| "multiprocess.is_enabled": False, | ||
| } | ||
| ) | ||
| expected_profile = StructuredProfiler( | ||
| fake_profile_name, options=profile_options | ||
| ) | ||
|
|
||
| with test_utils.mock_timeit(): | ||
| expected_profile.update_profile(df_structured) | ||
|
|
@@ -2471,15 +2490,11 @@ def test_json_decode_after_update( | |
| ] | ||
| ) | ||
|
|
||
| deserialized.update_profile(df_structured) | ||
| with test_utils.mock_timeit(): | ||
|
||
| deserialized.update_profile(df_structured) | ||
| expected_profile.update_profile(df_structured) | ||
|
|
||
| assert deserialized.total_samples == 5 | ||
| assert deserialized._max_col_samples_used == 5 | ||
| assert deserialized._min_col_samples_used == 5 | ||
| assert deserialized.row_has_null_count == 2 | ||
| assert deserialized.row_is_null_count == 1 | ||
| assert deserialized._get_unique_row_ratio() == 0.80 | ||
| assert deserialized.file_type == "<class 'pandas.core.frame.DataFrame'>" | ||
| test_utils.assert_profiles_equal(deserialized, expected_profile) | ||
|
||
|
|
||
|
|
||
| class TestStructuredColProfilerClass(unittest.TestCase): | ||
|
|
@@ -2537,19 +2552,9 @@ def test_base_props(self): | |
| self.assertEqual(2999 * 3, src_profile.sample_size) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.column_profile_compilers." | ||
| "ColumnPrimitiveTypeProfileCompiler" | ||
| ) | ||
| @mock.patch( | ||
| "dataprofiler.profilers.column_profile_compilers." "ColumnStatsProfileCompiler" | ||
| ) | ||
| @mock.patch( | ||
| "dataprofiler.profilers.column_profile_compilers." "ColumnDataLabelerCompiler" | ||
| ) | ||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." | ||
| "StructuredProfiler._update_correlation" | ||
| "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" | ||
| ) | ||
| @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") | ||
| def test_add_profilers(self, *mocks): | ||
| data = pd.Series([1, None, 3, 4, 5, None]) | ||
| profile1 = StructuredColProfiler(data[:2]) | ||
|
|
@@ -2654,7 +2659,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): | |
| self.assertTrue(np.issubdtype(np.object_, df_series.dtype)) | ||
| self.assertDictEqual( | ||
| { | ||
| "sample": ["4.0", "6.0", "3.0"], | ||
|
||
| "sample": ["6.0", "3.0", "4.0"], | ||
| "sample_size": 5, | ||
| "null_count": 2, | ||
| "null_types": dict(nan=["e", "b"]), | ||
|
|
@@ -2671,7 +2676,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): | |
| ) | ||
| self.assertDictEqual( | ||
| { | ||
| "sample": ["nan", "6.0", "4.0", "nan"], | ||
|
||
| "sample": ["6.0", "nan", "nan", "4.0"], | ||
| "sample_size": 6, | ||
| "null_count": 2, | ||
| "null_types": {"1.0": ["a"], "3.0": ["c"]}, | ||
|
|
@@ -2688,7 +2693,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): | |
| ) | ||
| self.assertDictEqual( | ||
| { | ||
| "sample": ["3.0", "4.0", "6.0", "nan", "1.0"], | ||
|
||
| "sample": ["3.0", "4.0", "nan", "6.0", "nan"], | ||
| "sample_size": 6, | ||
| "null_count": 0, | ||
| "null_types": {}, | ||
|
|
@@ -2808,11 +2813,9 @@ def test_sample_size_passed_to_profile(self, *mocks): | |
| self.assertEqual(10000, update_mock.call_args[0][1]) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" | ||
|
||
| "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" | ||
| ) | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") | ||
| @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") | ||
| def test_index_overlap_for_update_profile(self, *mocks): | ||
| data = pd.Series([0, None, 1, 2, None]) | ||
| profile = StructuredColProfiler(data) | ||
|
|
@@ -2827,11 +2830,9 @@ def test_index_overlap_for_update_profile(self, *mocks): | |
| self.assertDictEqual(profile.null_types_index, {"nan": {1, 4, 6, 9}}) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" | ||
| "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" | ||
|
||
| ) | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") | ||
| @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") | ||
| def test_index_overlap_for_merge(self, *mocks): | ||
| data = pd.Series([0, None, 1, 2, None]) | ||
| profile1 = StructuredColProfiler(data) | ||
|
|
@@ -2852,11 +2853,9 @@ def test_index_overlap_for_merge(self, *mocks): | |
| self.assertDictEqual(profile2.null_types_index, {"nan": {1, 4}}) | ||
|
|
||
| @mock.patch( | ||
| "dataprofiler.profilers.profile_builder." "ColumnPrimitiveTypeProfileCompiler" | ||
|
||
| "dataprofiler.profilers.column_profile_compilers.BaseCompiler.update_profile" | ||
| ) | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnStatsProfileCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder." "ColumnDataLabelerCompiler") | ||
| @mock.patch("dataprofiler.profilers.profile_builder.DataLabeler") | ||
| @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") | ||
| def test_min_max_id_properly_update(self, *mocks): | ||
| data = pd.Series([1, None, 3, 4, 5, None, 1]) | ||
| profile1 = StructuredColProfiler(data[:2]) | ||
|
|
@@ -2879,10 +2878,13 @@ def test_min_max_id_properly_update(self, *mocks): | |
| self.assertEqual(0, profile._min_id) | ||
| self.assertEqual(6, profile._max_id) | ||
|
|
||
| @mock.patch("dataprofiler.profilers.data_labeler_column_profile.DataLabeler") | ||
| @mock.patch( | ||
| "dataprofiler.profilers.data_labeler_column_profile.DataLabelerColumn.update" | ||
| ) | ||
| @mock.patch( | ||
|
||
| "dataprofiler.profilers.data_labeler_column_profile.DataLabeler", | ||
| spec=BaseDataLabeler, | ||
| ) | ||
| @mock.patch( | ||
| "dataprofiler.profilers.column_profile_compilers." | ||
| "ColumnPrimitiveTypeProfileCompiler.diff" | ||
|
|
@@ -3015,7 +3017,7 @@ def test_json_encode_after_update(self, mock_DataLabeler, *mocks): | |
| "_sampling_ratio": 0.2, | ||
| "_min_true_samples": 0, | ||
| "sample_size": 4, | ||
| "sample": ["1", "2", "-2"], | ||
|
||
| "sample": ["2", "-2", "1"], | ||
| "null_count": 1, | ||
| "null_types": ["Nan"], | ||
| "null_types_index": { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The big fix is here: this is the change that makes sampling consistent.