capitalone · taylorfturner · Jun 28, 2023 · Jun 28, 2023 · Jun 28, 2023 · Jun 28, 2023
@@ -33,7 +33,12 @@
 )
 from .graph_profiler import GraphProfiler
 from .helpers.report_helpers import _prepare_report, calculate_quantiles
-from .json_decoder import load_compiler, load_option, load_structured_col_profiler
+from .json_decoder import (
+    load_compiler,
+    load_option,
+    load_profiler,
+    load_structured_col_profiler,
+)
 from .json_encoder import ProfileEncoder
 from .profiler_options import (
     BaseOption,
@@ -1157,7 +1162,7 @@ def save(self, filepath: str = None, save_method: str = "pickle") -> None:
         raise NotImplementedError()
 
     @classmethod
-    def load(cls, filepath: str) -> BaseProfiler:
+    def load(cls, filepath: str, load_method: str | None = "pickle") -> BaseProfiler:
         """
         Load profiler from disk.
 
@@ -1168,8 +1173,24 @@ def load(cls, filepath: str) -> BaseProfiler:
         :rtype: BaseProfiler
         """
         # Load profile from disk
-        with open(filepath, "rb") as infile:
-            data: dict = pickle.load(infile)
+
+        if load_method not in [None, "pickle", "json"]:
+            raise ValueError(
+                "Please specify a valid load_method ('pickle','json' or None)"
+            )
+
+        data: dict | None = None
+        try:
+            if load_method is None or load_method == "pickle":
+                with open(filepath, "rb") as infile:
+                    data = pickle.load(infile)
+        except pickle.UnpicklingError:
+            if load_method == "pickle":
+                raise ValueError("File is unable to be loaded as pickle.")
+        finally:
+            if data is None or load_method == "json":
+                with open(filepath) as infile:
+                    return load_profiler(json.load(infile), {})
 
         # remove profiler class if it exists
         profiler_class: str | None = data.pop("profiler_class", None)
@@ -2015,9 +2036,13 @@ def load_from_dict(
             data["chi2_matrix"] = np.array(data["chi2_matrix"])
         if data["correlation_matrix"] is not None:
             data["correlation_matrix"] = np.array(data["correlation_matrix"])
-        data["_col_name_to_idx"] = defaultdict(
-            list, {int(k): v for k, v in data["_col_name_to_idx"].items()}
-        )
+        try:
+            data["_col_name_to_idx"] = defaultdict(
+                list, {int(k): v for k, v in data["_col_name_to_idx"].items()}
+            )
+        except Exception:
+            data["_col_name_to_idx"] = defaultdict(list, data["_col_name_to_idx"])
+
         data["hashed_row_dict"] = {
             int(k): v for k, v in data["hashed_row_dict"].items()
         }

@@ -1494,7 +1494,7 @@ def test_save_and_load_pkl_file(self):
                     "dataprofiler.profilers.profile_builder.DataLabeler",
                     return_value=data_labeler,
                 ):
-                    load_profile = dp.StructuredProfiler.load("mock.pkl")
+                    load_profile = dp.StructuredProfiler.load("mock.pkl", "pickle")
 
                 # validate loaded profile has same data labeler class
                 self.assertIsInstance(
@@ -1517,6 +1517,55 @@ def test_save_and_load_pkl_file(self):
             load_report = test_utils.clean_report(load_profile.report())
             np.testing.assert_equal(save_report, load_report)
 
+    def test_save_and_load_json_file(self):
+        """Round-trip a StructuredProfiler through JSON and compare the result."""
+        data_dir = "dataprofiler/tests/data/"
+        labeler_loader = "dataprofiler.profilers.utils.DataLabeler.load_from_library"
+        profiler_settings = {
+            "correlation.is_enabled": True,
+            "null_replication_metrics.is_enabled": True,
+            "multiprocess.is_enabled": False,
+        }
+
+        for relative_path in ("csv/iris.csv",):
+            # Build the profile that will be written out
+            dataset = dp.Data(os.path.join(data_dir, relative_path))
+            options = ProfilerOptions()
+            options.set(profiler_settings)
+            save_profile = dp.StructuredProfiler(dataset, options=options)
+            expected_labeler = save_profile.options.data_labeler.data_labeler_object
+
+            # All file I/O below goes through the mocked ``open``
+            with mock.patch("builtins.open") as open_mock:
+                fake_file = setup_save_mock_string_open(open_mock)
+                save_profile.save(save_method="json")
+                fake_file.seek(0)
+                with mock.patch(labeler_loader, return_value=expected_labeler):
+                    load_profile = dp.StructuredProfiler.load("mock.json", "json")
+
+                # The restored options must carry a labeler of the same class
+                restored_labeler = load_profile.options.data_labeler.data_labeler_object
+                labeler_cls = expected_labeler.__class__
+                self.assertIsInstance(restored_labeler, labeler_cls)
+
+                # Spot-check column 0 only: its data-label profile holds the labeler too
+                first_column = load_profile.profile[0]
+                label_profile = first_column.profiles["data_label_profile"]
+                column_labeler = label_profile._profiles["data_labeler"].data_labeler
+                self.assertIsInstance(column_labeler, labeler_cls)
+
+            test_utils.assert_profiles_equal(save_profile, load_profile)
+
     def test_save_and_load_no_labeler(self):
         # Create Data and UnstructuredProfiler objects
         data = pd.DataFrame([1, 2, 3], columns=["a"])
@@ -1532,8 +1581,8 @@ def test_save_and_load_no_labeler(self):
             save_profile.save()
 
             mock_file.seek(0)
-            with mock.patch("dataprofiler.profilers.profile_builder.DataLabeler"):
-                load_profile = dp.StructuredProfiler.load("mock.pkl")
+            with mock.patch("dataprofiler.profilers.profile_builder." "DataLabeler"):
-            with mock.patch("dataprofiler.profilers.profile_builder." "DataLabeler"):
+            with mock.patch("dataprofiler.profilers.profile_builder.DataLabeler"):
-            with mock.patch("dataprofiler.profilers.profile_builder." "DataLabeler"):
+            with mock.patch("dataprofiler.profilers.profile_builder.DataLabeler"):
+                load_profile = dp.StructuredProfiler.load("mock.pkl", "pickle")
 
         # Check that reports are equivalent
         save_report = test_utils.clean_report(save_profile.report())
@@ -3468,6 +3517,7 @@ def test_load_from_dict(self, *mocks):
         ):
             UnstructuredProfiler.load_from_dict({}, None)
 
+    @mock.patch("builtins.open")
     def test_save_json_file(self, *mocks):
         data = pd.Series(["this", "is my", "\n\r", "test"])
         save_profile = UnstructuredProfiler(data)
@@ -3941,7 +3991,7 @@ def test_report_remove_disabled_flag(self):
         self.assertIn("vocab", report["data_stats"]["statistics"])
         self.assertIn("words", report["data_stats"]["statistics"])
 
-    def test_save_and_load_pkl(self):
+    def test_save_and_load_pkl_file(self):
         data_folder = "dataprofiler/tests/data/"
         test_files = ["txt/code.txt", "txt/sentence-10x.txt"]