Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
105 changes: 68 additions & 37 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from __future__ import annotations

import copy
import json
import logging
import pickle
import random
Expand Down Expand Up @@ -32,6 +33,7 @@
from .graph_profiler import GraphProfiler
from .helpers.report_helpers import _prepare_report, calculate_quantiles
from .json_decoder import load_compiler, load_option, load_structured_col_profiler
from .json_encoder import ProfileEncoder
from .profiler_options import (
BaseOption,
ProfilerOptions,
Expand Down Expand Up @@ -1085,7 +1087,7 @@ def _restore_data_labelers(self, data_labeler: BaseDataLabeler = None) -> None:
data_labeler_profile = profiler._profiles["data_labeler"]
data_labeler_profile.data_labeler = data_labeler

def _save_helper(self, filepath: str | None, data_dict: dict) -> None:
def _pkl_save_helper(self, filepath: str | None, data_dict: dict) -> None:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice

"""
Save profiler to disk.

Expand Down Expand Up @@ -1114,14 +1116,32 @@ def _save_helper(self, filepath: str | None, data_dict: dict) -> None:
# Restore all data labelers
self._restore_data_labelers(data_labelers)

def save(self, filepath: str = None) -> None:
def _json_save_helper(self, filepath: str | None) -> None:
    """
    Save profiler to disk as JSON.

    :param filepath: Path of file to save to
    :type filepath: String
    :return: None
    """
    if filepath is None:
        # Default name mirrors the pickle helper's timestamped pattern.
        # NOTE(review): ":" in the timestamp is not a valid filename
        # character on Windows — confirm this is POSIX-only usage.
        timestamp = datetime.now().strftime("%d-%b-%Y-%H:%M:%S.%f")
        filepath = "profile-{}.json".format(timestamp)

    # ProfileEncoder defines how profiler objects serialize to JSON.
    with open(filepath, "w") as save_file:
        json.dump(self, save_file, cls=ProfileEncoder)

def save(self, filepath: str = None, save_method: str = "pickle") -> None:
"""
Save profiler to disk.

:param filepath: Path of file to save to
:type filepath: String
:param save_method: The desired saving method (must be "pickle" or "json")
:type save_method: String
:return: None
"""
raise NotImplementedError()

@classmethod
Expand Down Expand Up @@ -1525,29 +1545,35 @@ def _update_profile_from_chunk(
else:
self._profile.update_profile(data, pool=pool)

def save(self, filepath: str = None) -> None:
def save(self, filepath: str = None, save_method: str = "pickle") -> None:
"""
Save profiler to disk.

:param filepath: Path of file to save to
:type filepath: String
:param save_method: The desired saving method ("pickle" | "json")
:type save_method: String
:return: None
"""
# Create dictionary for all metadata, options, and profile
Copy link
Contributor

@JGSweets JGSweets Jun 28, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

above in save we need to add the type: str | None with default None where we can choose "pickle "| "json"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

data_dict = {
"total_samples": self.total_samples,
"sample": self.sample,
"encoding": self.encoding,
"file_type": self.file_type,
"_samples_per_update": self._samples_per_update,
"_min_true_samples": self._min_true_samples,
"_empty_line_count": self._empty_line_count,
"memory_size": self.memory_size,
"options": self.options,
"_profile": self.profile,
"times": self.times,
}
self._save_helper(filepath, data_dict)
if save_method == "pickle":
data_dict = {
"total_samples": self.total_samples,
"sample": self.sample,
"encoding": self.encoding,
"file_type": self.file_type,
"_samples_per_update": self._samples_per_update,
"_min_true_samples": self._min_true_samples,
"_empty_line_count": self._empty_line_count,
"memory_size": self.memory_size,
"options": self.options,
"_profile": self.profile,
"times": self.times,
}
self._pkl_save_helper(filepath, data_dict)
elif save_method == "json":
self._json_save_helper(filepath)
else:
raise ValueError('save_method must be "json" or "pickle".')


class StructuredProfiler(BaseProfiler):
Expand Down Expand Up @@ -2820,32 +2846,37 @@ def tqdm(level: set[int]) -> Generator[int, None, None]:
if self.options.null_replication_metrics.is_enabled:
self._update_null_replication_metrics(clean_sampled_dict)

def save(self, filepath: str = None) -> None:
def save(self, filepath: str = None, save_method: str = "pickle") -> None:
"""
Save profiler to disk.

:param filepath: Path of file to save to
:type filepath: String
:param save_method: The desired saving method (must be "pickle" or "json")
:type save_method: String
:return: None
"""
# Create dictionary for all metadata, options, and profile
data_dict = {
"total_samples": self.total_samples,
"encoding": self.encoding,
"file_type": self.file_type,
"row_has_null_count": self.row_has_null_count,
"row_is_null_count": self.row_is_null_count,
"hashed_row_dict": self.hashed_row_dict,
"_samples_per_update": self._samples_per_update,
"_min_true_samples": self._min_true_samples,
"options": self.options,
"chi2_matrix": self.chi2_matrix,
"_profile": self.profile,
"_col_name_to_idx": self._col_name_to_idx,
"times": self.times,
}

self._save_helper(filepath, data_dict)
if save_method == "pickle":
data_dict = {
"total_samples": self.total_samples,
"encoding": self.encoding,
"file_type": self.file_type,
"row_has_null_count": self.row_has_null_count,
"row_is_null_count": self.row_is_null_count,
"hashed_row_dict": self.hashed_row_dict,
"_samples_per_update": self._samples_per_update,
"_min_true_samples": self._min_true_samples,
"options": self.options,
"chi2_matrix": self.chi2_matrix,
"_profile": self.profile,
"_col_name_to_idx": self._col_name_to_idx,
"times": self.times,
}
self._pkl_save_helper(filepath, data_dict)
elif save_method == "json":
self._json_save_helper(filepath)
else:
raise ValueError('save_method must be "json" or "pickle".')


class Profiler:
Expand Down
Loading