Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 32 additions & 7 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,12 @@
)
from .graph_profiler import GraphProfiler
from .helpers.report_helpers import _prepare_report, calculate_quantiles
from .json_decoder import load_compiler, load_option, load_structured_col_profiler
from .json_decoder import (
load_compiler,
load_option,
load_profiler,
load_structured_col_profiler,
)
from .json_encoder import ProfileEncoder
from .profiler_options import (
BaseOption,
Expand Down Expand Up @@ -1157,7 +1162,7 @@ def save(self, filepath: str = None, save_method: str = "pickle") -> None:
raise NotImplementedError()

@classmethod
def load(cls, filepath: str) -> BaseProfiler:
def load(cls, filepath: str, load_method: str | None = "pickle") -> BaseProfiler:
"""
Load profiler from disk.

Expand All @@ -1168,8 +1173,24 @@ def load(cls, filepath: str) -> BaseProfiler:
:rtype: BaseProfiler
"""
# Load profile from disk
with open(filepath, "rb") as infile:
data: dict = pickle.load(infile)

if load_method not in [None, "pickle", "json"]:
raise ValueError(
"Please specify a valid load_method ('pickle','json' or None)"
)

data: dict | None = None
try:
if load_method is None or load_method == "pickle":
with open(filepath, "rb") as infile:
data = pickle.load(infile)
except pickle.UnpicklingError:
if load_method == "pickle":
raise ValueError("File is unable to be loaded as pickle.")
finally:
if data is None or load_method == "json":
with open(filepath) as infile:
return load_profiler(json.load(infile), {})

# remove profiler class if it exists
profiler_class: str | None = data.pop("profiler_class", None)
Expand Down Expand Up @@ -2015,9 +2036,13 @@ def load_from_dict(
data["chi2_matrix"] = np.array(data["chi2_matrix"])
if data["correlation_matrix"] is not None:
data["correlation_matrix"] = np.array(data["correlation_matrix"])
data["_col_name_to_idx"] = defaultdict(
list, {int(k): v for k, v in data["_col_name_to_idx"].items()}
)
try:
data["_col_name_to_idx"] = defaultdict(
list, {int(k): v for k, v in data["_col_name_to_idx"].items()}
)
except Exception:
data["_col_name_to_idx"] = defaultdict(list, data["_col_name_to_idx"])

data["hashed_row_dict"] = {
int(k): v for k, v in data["hashed_row_dict"].items()
}
Expand Down
58 changes: 54 additions & 4 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -1494,7 +1494,7 @@ def test_save_and_load_pkl_file(self):
"dataprofiler.profilers.profile_builder.DataLabeler",
return_value=data_labeler,
):
load_profile = dp.StructuredProfiler.load("mock.pkl")
load_profile = dp.StructuredProfiler.load("mock.pkl", "pickle")

# validate loaded profile has same data labeler class
self.assertIsInstance(
Expand All @@ -1517,6 +1517,55 @@ def test_save_and_load_pkl_file(self):
load_report = test_utils.clean_report(load_profile.report())
np.testing.assert_equal(save_report, load_report)

def test_save_and_load_json_file(self):
    """Save a StructuredProfiler as JSON and load it back through mocked IO.

    The profiler is saved with ``save_method="json"`` into a mocked file
    object, reloaded with ``load_method`` ``"json"``, and the reloaded
    profile is compared against the one that was saved.
    """
    data_dir = "dataprofiler/tests/data/"

    for relative_path in ["csv/iris.csv"]:
        # Build the profiler that will be serialized
        dataset = dp.Data(os.path.join(data_dir, relative_path))
        profiler_options = ProfilerOptions()
        option_overrides = {
            "correlation.is_enabled": True,
            "null_replication_metrics.is_enabled": True,
            "multiprocess.is_enabled": False,
        }
        profiler_options.set(option_overrides)
        save_profile = dp.StructuredProfiler(dataset, options=profiler_options)

        # Keep a handle on the labeler the reloaded profile is expected to use
        data_labeler = save_profile.options.data_labeler.data_labeler_object

        # Round-trip through a mocked ``open`` so nothing touches the disk
        with mock.patch("builtins.open") as open_mock:
            fake_file = setup_save_mock_string_open(open_mock)
            save_profile.save(save_method="json")
            # Rewind so the load below reads what save() just wrote
            fake_file.seek(0)
            labeler_patch = mock.patch(
                "dataprofiler.profilers.utils.DataLabeler.load_from_library",
                return_value=data_labeler,
            )
            with labeler_patch:
                load_profile = dp.StructuredProfiler.load("mock.json", "json")

        # The profile-level options must carry the same labeler class
        reloaded_labeler = load_profile.options.data_labeler.data_labeler_object
        self.assertIsInstance(reloaded_labeler, data_labeler.__class__)

        # Only the first column's labeler is checked
        first_column_profile = load_profile.profile[0]
        label_profile = first_column_profile.profiles["data_label_profile"]
        column_labeler = label_profile._profiles["data_labeler"].data_labeler
        self.assertIsInstance(column_labeler, data_labeler.__class__)

        test_utils.assert_profiles_equal(save_profile, load_profile)

def test_save_and_load_no_labeler(self):
# Create Data and UnstructuredProfiler objects
data = pd.DataFrame([1, 2, 3], columns=["a"])
Expand All @@ -1532,8 +1581,8 @@ def test_save_and_load_no_labeler(self):
save_profile.save()

mock_file.seek(0)
with mock.patch("dataprofiler.profilers.profile_builder.DataLabeler"):
load_profile = dp.StructuredProfiler.load("mock.pkl")
with mock.patch("dataprofiler.profilers.profile_builder." "DataLabeler"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
with mock.patch("dataprofiler.profilers.profile_builder." "DataLabeler"):
with mock.patch("dataprofiler.profilers.profile_builder.DataLabeler"):

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just raising this as a potential change that may be needed.

load_profile = dp.StructuredProfiler.load("mock.pkl", "pickle")

# Check that reports are equivalent
save_report = test_utils.clean_report(save_profile.report())
Expand Down Expand Up @@ -3468,6 +3517,7 @@ def test_load_from_dict(self, *mocks):
):
UnstructuredProfiler.load_from_dict({}, None)

@mock.patch("builtins.open")
def test_save_json_file(self, *mocks):
data = pd.Series(["this", "is my", "\n\r", "test"])
save_profile = UnstructuredProfiler(data)
Expand Down Expand Up @@ -3941,7 +3991,7 @@ def test_report_remove_disabled_flag(self):
self.assertIn("vocab", report["data_stats"]["statistics"])
self.assertIn("words", report["data_stats"]["statistics"])

def test_save_and_load_pkl(self):
def test_save_and_load_pkl_file(self):
data_folder = "dataprofiler/tests/data/"
test_files = ["txt/code.txt", "txt/sentence-10x.txt"]

Expand Down