Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
7cd3ae4
initial changes to categoricalColumn decoder (#818)
micdavis May 16, 2023
b4ad93d
Implemented decoding for numerical stats mixin and integer profiles (…
ksneab7 May 31, 2023
620f0d9
hot fixes for encode and decode of numeric stats mixin and intcol pro…
ksneab7 Jun 2, 2023
cd77962
Float column profiler encode decode (#854)
ksneab7 Jun 6, 2023
8b4d6e2
Json decode date time column (#861)
tyfarnan Jun 8, 2023
42ad4a4
Added decoding for encoding of ordered column profiles (#864)
ksneab7 Jun 12, 2023
f4f7e47
Added ordered col test to ensure correct response to update when diff…
ksneab7 Jun 13, 2023
2390004
added decode text_column_profiler functionality and tests (#870)
micdavis Jun 14, 2023
6eb6852
Created encoder for the datalabelercolumn (#869)
ksneab7 Jun 14, 2023
6c51368
feat: add test and compiler serialization (#884)
JGSweets Jun 16, 2023
8285293
[WIP] Adds tests validating serialization with Primitive type for com…
JGSweets Jun 16, 2023
12b7dee
Adds deserialization for compilers and validates tests for Primitive;…
JGSweets Jun 16, 2023
d9b5f49
Add Serialization and Deserialization Tests for Stats Compiler, plus …
JGSweets Jun 20, 2023
f46b8a9
ready datalabeler for deserialization and improvement on serializatio…
ksneab7 Jun 20, 2023
3bb1127
Deserialization of datalabeler (#891)
ksneab7 Jun 21, 2023
3eb7f75
Encode Options (#875)
micdavis Jun 21, 2023
3ebadb4
[WIP] ColumnDataLabelerCompiler: serialize / deserialize (#888)
taylorfturner Jun 21, 2023
31f2a7a
Quick Test update (#893)
taylorfturner Jun 22, 2023
96ca39f
Decode options (#894)
micdavis Jun 22, 2023
ee1f602
refactor: allow options to go through all (#902)
JGSweets Jun 23, 2023
ecceaed
StructuredColProfiler Encode / Decode (#901)
taylorfturner Jun 23, 2023
1e03f90
fix: bug and add tests for structuredcolprofiler (#904)
JGSweets Jun 26, 2023
8032bb0
Structured profiler encode decode (#903)
ksneab7 Jun 27, 2023
4f49819
[WIP] Added NoImplementationError for UnstructuredProfiler (#907)
micdavis Jun 27, 2023
7f6dffa
Added testing for values for test_json_decode_after_update (#915)
ksneab7 Jun 27, 2023
2320718
Reuse passed labeler (#924)
JGSweets Jun 28, 2023
1a59a33
BaseProfiler save() for json (#923)
micdavis Jun 28, 2023
5ec8907
refactor: use seed for sample for consistency (#927)
JGSweets Jun 28, 2023
52f54f6
WIP top level load (#925)
tyfarnan Jun 28, 2023
55202ac
quick hot fix for input validation on save() save_method (#931)
micdavis Jun 28, 2023
bd04cd5
BaseProfiler: `load_method` hotfix (#932)
micdavis Jun 29, 2023
bcf9eeb
fix: null_rep mat should calculate even if datetime (#933)
JGSweets Jun 29, 2023
423bc0a
Notebook Example save/load Profile (#930)
taylorfturner Jun 29, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion dataprofiler/labelers/base_data_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,9 @@ def load_from_library(cls, name: str) -> BaseDataLabeler:
:return: DataLabeler class
:rtype: BaseDataLabeler
"""
return cls(os.path.join(default_labeler_dir, name))
labeler = cls(os.path.join(default_labeler_dir, name))
labeler._default_model_loc = name
return labeler

@classmethod
def load_from_disk(cls, dirpath: str, load_options: dict = None) -> BaseDataLabeler:
Expand Down
5 changes: 4 additions & 1 deletion dataprofiler/labelers/data_labelers.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def __new__( # type: ignore
trainable: bool = False,
) -> BaseDataLabeler:
"""
Create structured and unstructred data labeler objects.
Create structured and unstructured data labeler objects.

:param dirpath: Path to load data labeler
:type dirpath: str
Expand Down Expand Up @@ -143,6 +143,9 @@ def load_from_library(cls, name: str, trainable: bool = False) -> BaseDataLabele
"""
if trainable:
return TrainableDataLabeler.load_from_library(name)
for _, labeler_class_obj in cls.labeler_classes.items():
if name in labeler_class_obj._default_model_loc:
return labeler_class_obj()
return BaseDataLabeler.load_from_library(name)

@classmethod
Expand Down
82 changes: 81 additions & 1 deletion dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,92 @@
"""Package for providing statistics and predictions for a given dataset."""
from . import json_decoder
from .base_column_profilers import BaseColumnProfiler
from .categorical_column_profile import CategoricalColumn
from .column_profile_compilers import (
BaseCompiler,
ColumnDataLabelerCompiler,
ColumnPrimitiveTypeProfileCompiler,
ColumnStatsProfileCompiler,
)
from .data_labeler_column_profile import DataLabelerColumn
from .datetime_column_profile import DateTimeColumn
from .float_column_profile import FloatColumn
from .int_column_profile import IntColumn
from .numerical_column_stats import NumericStatsMixin
from .order_column_profile import OrderColumn
from .profile_builder import Profiler, StructuredProfiler, UnstructuredProfiler
from .profile_builder import (
Profiler,
StructuredColProfiler,
StructuredProfiler,
UnstructuredProfiler,
)
from .profiler_options import (
BaseInspectorOptions,
BooleanOption,
CategoricalOptions,
CorrelationOptions,
DataLabelerOptions,
DateTimeOptions,
FloatOptions,
HistogramOption,
IntOptions,
ModeOption,
NumericalOptions,
OrderOptions,
PrecisionOptions,
ProfilerOptions,
StructuredOptions,
TextOptions,
TextProfilerOptions,
UnstructuredOptions,
)
from .text_column_profile import TextColumn
from .unstructured_labeler_profile import UnstructuredLabelerProfile

# The decoder lookup tables are filled in here, once every class has been
# imported, to avoid circular imports. Each table maps a class name to the
# class object itself.
json_decoder._profiles = {
    profile_cls.__name__: profile_cls
    for profile_cls in (
        CategoricalColumn,
        FloatColumn,
        IntColumn,
        DateTimeColumn,
        OrderColumn,
        DataLabelerColumn,
        TextColumn,
    )
}


json_decoder._compilers = {
    compiler_cls.__name__: compiler_cls
    for compiler_cls in (
        ColumnDataLabelerCompiler,
        ColumnPrimitiveTypeProfileCompiler,
        ColumnStatsProfileCompiler,
    )
}

json_decoder._options = {
    option_cls.__name__: option_cls
    for option_cls in (
        BooleanOption,
        HistogramOption,
        ModeOption,
        BaseInspectorOptions,
        NumericalOptions,
        IntOptions,
        PrecisionOptions,
        FloatOptions,
        TextOptions,
        DateTimeOptions,
        OrderOptions,
        CategoricalOptions,
        CorrelationOptions,
        DataLabelerOptions,
        TextProfilerOptions,
        StructuredOptions,
        UnstructuredOptions,
        ProfilerOptions,
    )
}


json_decoder._profilers = {
    profiler_cls.__name__: profiler_cls for profiler_cls in (StructuredProfiler,)
}

json_decoder._structured_col_profiler = {
    col_profiler_cls.__name__: col_profiler_cls
    for col_profiler_cls in (StructuredColProfiler,)
}
72 changes: 58 additions & 14 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@
import abc
import warnings
from collections import defaultdict
from typing import Any, Callable
from typing import Any, Callable, Generic, TypeVar

import numpy as np
import pandas as pd

from dataprofiler.profilers.profiler_options import BaseInspectorOptions

from . import utils
from .profiler_options import BaseInspectorOptions, BaseOption

BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")
Copy link
Contributor Author

@taylorfturner taylorfturner Jun 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

actually concerned there may be an issue with the rebase here.... I thought there was or should be another small class here... but don't see it in main, dev, or feature/profile-serialization. Just take your time reviewing

Copy link
Contributor Author

@taylorfturner taylorfturner Jun 27, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

disregard -- the thing I thought would be here is actually properly in order_column_profile.py



class BaseColumnProfiler(metaclass=abc.ABCMeta): # type: ignore
class BaseColumnProfiler(Generic[BaseColumnProfilerT], metaclass=abc.ABCMeta):
"""Abstract class for profiling a column of data."""

col_type = None
Expand All @@ -28,7 +29,7 @@ class BaseColumnProfiler(metaclass=abc.ABCMeta): # type: ignore
_SAMPLING_RATIO = 0.20
_MIN_SAMPLING_COUNT = 500

def __init__(self, name: str | None) -> None:
def __init__(self, name: str | None, options: BaseOption | None = None):
"""
Initialize base class properties for the subclass.

Expand Down Expand Up @@ -147,7 +148,7 @@ def _merge_calculations(
)

def _add_helper(
self, other1: BaseColumnProfiler, other2: BaseColumnProfiler
self, other1: BaseColumnProfilerT, other2: BaseColumnProfilerT
) -> None:
"""
Merge the properties of two BaseColumnProfile objects.
Expand Down Expand Up @@ -176,7 +177,7 @@ def _add_helper(

self.sample_size = other1.sample_size + other2.sample_size

def diff(self, other_profile: BaseColumnProfiler, options: dict = None) -> dict:
def diff(self, other_profile: BaseColumnProfilerT, options: dict = None) -> dict:
"""
Find the differences for columns.

Expand Down Expand Up @@ -247,10 +248,53 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
raise NotImplementedError()

@classmethod
def load_from_dict(
    cls: type[BaseColumnProfilerT],
    data: dict[str, Any],
    options: dict | None = None,
) -> BaseColumnProfilerT:
    """
    Build a profiler of this class from a decoded json dictionary.

    The passed dictionary is only read, never modified, so the same
    dictionary can safely be decoded more than once.

    :param data: dictionary with attributes and values. Must contain the
        keys "name" and "times"; every other key is set as an attribute on
        the new profiler.
    :type data: dict[string, Any]
    :param options: options for loading column profiler params from
        dictionary, keyed by class name; the entry for this class (if any)
        is passed to the constructor.
    :type options: Dict | None

    :raises KeyError: if "name" or "times" is missing from ``data``.
    :raises AttributeError: if a ``__calculations`` entry names a method
        the profiler does not have.

    :return: Profiler with attributes populated.
    :rtype: BaseColumnProfiler
    """
    if options is None:
        options = {}

    class_options = options.get(cls.__name__)
    profile: BaseColumnProfilerT = cls(data["name"], class_options)

    # Read "times" instead of popping it so the caller's dict stays intact.
    setattr(profile, "times", defaultdict(float, data["times"]))

    for attr, value in data.items():
        if attr == "times":
            # Already restored above as a defaultdict.
            continue
        if "__calculations" in attr:
            # Calculations are stored as {metric: method name}; map each
            # name back to the underlying function. A new dict is built so
            # the input mapping keeps its string method names.
            resolved = {}
            for metric, function in value.items():
                if not hasattr(profile, function):
                    raise AttributeError(
                        f"Object {type(profile)} has no attribute {function}."
                    )
                resolved[metric] = getattr(profile, function).__func__
            value = resolved
        setattr(profile, attr, value)

    return profile


BaseColumnPrimitiveTypeProfilerT = TypeVar(
"BaseColumnPrimitiveTypeProfilerT", bound="BaseColumnPrimitiveTypeProfiler"
)


class BaseColumnPrimitiveTypeProfiler(
BaseColumnProfiler,
metaclass=abc.ABCMeta, # type: ignore
BaseColumnProfiler[BaseColumnPrimitiveTypeProfilerT],
metaclass=abc.ABCMeta,
):
"""Abstract class for profiling primative data type for col of data."""

Expand All @@ -275,13 +319,13 @@ def _update_column_base_properties(self, profile: dict) -> None:
:type profile: base data profile dict
:return: None
"""
self.match_count += profile.pop("match_count")
self.match_count += int(profile.pop("match_count"))
BaseColumnProfiler._update_column_base_properties(self, profile)

def _add_helper( # type: ignore[override]
def _add_helper(
self,
other1: BaseColumnPrimitiveTypeProfiler,
other2: BaseColumnPrimitiveTypeProfiler,
other1: BaseColumnPrimitiveTypeProfilerT,
other2: BaseColumnPrimitiveTypeProfilerT,
) -> None:
"""
Merge the properties of two objects inputted.
Expand All @@ -291,5 +335,5 @@ def _add_helper( # type: ignore[override]
:type other1: BaseColumnPrimitiveTypeProfiler
:type other2: BaseColumnPrimitiveTypeProfiler
"""
BaseColumnProfiler._add_helper(self, other1, other2)
super()._add_helper(other1, other2)
self.match_count = other1.match_count + other2.match_count
Loading