Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
171e1be
initial changes to categoricalColumn decoder (#818)
micdavis May 16, 2023
bf7077b
Implemented decoding for numerical stats mixin and integer profiles (…
ksneab7 May 31, 2023
fe0e6f1
hot fixes for encode and decode of numeric stats mixin and intcol pro…
ksneab7 Jun 2, 2023
168acb2
Float column profiler encode decode (#854)
ksneab7 Jun 6, 2023
188dea0
Json decode date time column (#861)
tyfarnan Jun 8, 2023
65f74b7
Added decoding for encoding of ordered column profiles (#864)
ksneab7 Jun 12, 2023
0e75b08
Added ordered col test to ensure correct response to update when diff…
ksneab7 Jun 13, 2023
587fc78
added decode text_column_profiler functionality and tests (#870)
micdavis Jun 14, 2023
91b0e9b
Created encoder for the datalabelercolumn (#869)
ksneab7 Jun 14, 2023
0c8e02a
feat: add test and compiler serialization (#884)
JGSweets Jun 16, 2023
e79c4c0
[WIP] Adds tests validating serialization with Primitive type for com…
JGSweets Jun 16, 2023
a37a54a
Adds deserialization for compilers and validates tests for Primitive;…
JGSweets Jun 16, 2023
232e2ab
Add Serialization and Deserialization Tests for Stats Compiler, plus …
JGSweets Jun 20, 2023
2ad1de4
ready datalabeler for deserialization and improvement on serializatio…
ksneab7 Jun 20, 2023
c13c318
Deserialization of datalabeler (#891)
ksneab7 Jun 21, 2023
438feb5
Encode Options (#875)
micdavis Jun 21, 2023
1ad3d83
[WIP] ColumnDataLabelerCompiler: serialize / deserialize (#888)
taylorfturner Jun 21, 2023
16c8d64
Quick Test update (#893)
taylorfturner Jun 22, 2023
2ff5f18
Decode options (#894)
micdavis Jun 22, 2023
2173f8e
refactor: allow options to go through all (#902)
JGSweets Jun 23, 2023
f36e45d
StructuredColProfiler Encode / Decode (#901)
taylorfturner Jun 23, 2023
65af593
fix: bug and add tests for structuredcolprofiler (#904)
JGSweets Jun 26, 2023
cf7b237
Stuctured profiler encode decode (#903)
ksneab7 Jun 27, 2023
2159b78
[WIP] Added NoImplementationError for UnstructuredProfiler (#907)
micdavis Jun 27, 2023
34c2a2b
Added testing for values for test_json_decode_after_update (#915)
ksneab7 Jun 27, 2023
e8ba88c
Reuse passed labeler (#924)
JGSweets Jun 28, 2023
046fa7f
BaseProfiler save() for json (#923)
micdavis Jun 28, 2023
9856720
refactor: use seed for sample for consistency (#927)
JGSweets Jun 28, 2023
d81a9e8
WIP top level load (#925)
tyfarnan Jun 28, 2023
e654afe
quick hot fix for input validation on save() save_metho (#931)
micdavis Jun 28, 2023
f07faf8
BaseProfiler: `load_method` hotfix (#932)
micdavis Jun 29, 2023
454d429
fix: null_rep mat should calculate even if datetime (#933)
JGSweets Jun 29, 2023
1f573f8
Notebook Example save/load Profile (#930)
taylorfturner Jun 29, 2023
146c547
fix: order bug (#939)
JGSweets Jun 29, 2023
e8a3361
fix: typo on rebase
JGSweets Jun 29, 2023
54efd86
fix: typing and bugs from rebase
JGSweets Jun 29, 2023
6ccc203
fix: options tests due to merge and loading new options
JGSweets Jun 29, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion dataprofiler/labelers/base_data_labeler.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,9 @@ def load_from_library(cls, name: str) -> BaseDataLabeler:
:return: DataLabeler class
:rtype: BaseDataLabeler
"""
return cls(os.path.join(default_labeler_dir, name))
labeler = cls(os.path.join(default_labeler_dir, name))
labeler._default_model_loc = name
return labeler

@classmethod
def load_from_disk(cls, dirpath: str, load_options: dict = None) -> BaseDataLabeler:
Expand Down
5 changes: 4 additions & 1 deletion dataprofiler/labelers/data_labelers.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def __new__( # type: ignore
trainable: bool = False,
) -> BaseDataLabeler:
"""
Create structured and unstructred data labeler objects.
Create structured and unstructured data labeler objects.

:param dirpath: Path to load data labeler
:type dirpath: str
Expand Down Expand Up @@ -143,6 +143,9 @@ def load_from_library(cls, name: str, trainable: bool = False) -> BaseDataLabele
"""
if trainable:
return TrainableDataLabeler.load_from_library(name)
for _, labeler_class_obj in cls.labeler_classes.items():
if name in labeler_class_obj._default_model_loc:
return labeler_class_obj()
return BaseDataLabeler.load_from_library(name)

@classmethod
Expand Down
88 changes: 87 additions & 1 deletion dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,98 @@
"""Package for providing statistics and predictions for a given dataset."""
from . import json_decoder
from .base_column_profilers import BaseColumnProfiler
from .categorical_column_profile import CategoricalColumn
from .column_profile_compilers import (
BaseCompiler,
ColumnDataLabelerCompiler,
ColumnPrimitiveTypeProfileCompiler,
ColumnStatsProfileCompiler,
)
from .data_labeler_column_profile import DataLabelerColumn
from .datetime_column_profile import DateTimeColumn
from .float_column_profile import FloatColumn
from .int_column_profile import IntColumn
from .numerical_column_stats import NumericStatsMixin
from .order_column_profile import OrderColumn
from .profile_builder import Profiler, StructuredProfiler, UnstructuredProfiler
from .profile_builder import (
Profiler,
StructuredColProfiler,
StructuredProfiler,
UnstructuredProfiler,
)
from .profiler_options import (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

updated for dev options that weren't in profiler serial

BaseInspectorOptions,
BooleanOption,
CategoricalOptions,
CorrelationOptions,
DataLabelerOptions,
DateTimeOptions,
FloatOptions,
HistogramOption,
HyperLogLogOptions,
IntOptions,
ModeOption,
NumericalOptions,
OrderOptions,
PrecisionOptions,
ProfilerOptions,
RowStatisticsOptions,
StructuredOptions,
TextOptions,
TextProfilerOptions,
UniqueCountOptions,
UnstructuredOptions,
)
from .text_column_profile import TextColumn
from .unstructured_labeler_profile import UnstructuredLabelerProfile

# set here to avoid circular imports
# Registries consumed by ``json_decoder``: each maps a class name (the tag
# written during JSON encoding) back to the class used for decoding.
json_decoder._profiles = {
    profile_class.__name__: profile_class
    for profile_class in (
        CategoricalColumn,
        FloatColumn,
        IntColumn,
        DateTimeColumn,
        OrderColumn,
        DataLabelerColumn,
        TextColumn,
    )
}


json_decoder._compilers = {
    compiler_class.__name__: compiler_class
    for compiler_class in (
        ColumnDataLabelerCompiler,
        ColumnPrimitiveTypeProfileCompiler,
        ColumnStatsProfileCompiler,
    )
}

json_decoder._options = {
    option_class.__name__: option_class
    for option_class in (
        BooleanOption,
        HistogramOption,
        ModeOption,
        BaseInspectorOptions,
        NumericalOptions,
        IntOptions,
        PrecisionOptions,
        FloatOptions,
        TextOptions,
        DateTimeOptions,
        OrderOptions,
        CategoricalOptions,
        CorrelationOptions,
        UniqueCountOptions,
        HyperLogLogOptions,
        RowStatisticsOptions,
        DataLabelerOptions,
        TextProfilerOptions,
        StructuredOptions,
        UnstructuredOptions,
        ProfilerOptions,
    )
}


json_decoder._profilers = {StructuredProfiler.__name__: StructuredProfiler}

json_decoder._structured_col_profiler = {
    StructuredColProfiler.__name__: StructuredColProfiler,
}
45 changes: 41 additions & 4 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
import numpy as np
import pandas as pd

from dataprofiler.profilers.profiler_options import BaseInspectorOptions

from . import utils
from .profiler_options import BaseInspectorOptions, BaseOption

BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")

Expand All @@ -30,7 +29,7 @@ class BaseColumnProfiler(Generic[BaseColumnProfilerT], metaclass=abc.ABCMeta):
_SAMPLING_RATIO = 0.20
_MIN_SAMPLING_COUNT = 500

def __init__(self, name: str | None) -> None:
def __init__(self, name: str | None, options: BaseOption | None = None):
"""
Initialize base class properties for the subclass.

Expand Down Expand Up @@ -249,6 +248,44 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
raise NotImplementedError()

@classmethod
def load_from_dict(
    cls: type[BaseColumnProfilerT],
    data: dict[str, Any],
    options: dict | None = None,
) -> BaseColumnProfilerT:
    """
    Parse attributes from a JSON-decoded dictionary into a new profiler.

    The input ``data`` is treated as read-only: the original implementation
    popped ``"times"`` from the caller's dict and rebound function names
    inside the caller's nested ``__calculations`` dicts; both mutations are
    now performed on copies instead.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]
    :param options: options for loading column profiler params from dictionary
    :type options: Dict | None

    :return: Profiler with attributes populated.
    :rtype: BaseColumnProfiler
    """
    if options is None:
        options = {}

    class_options = options.get(cls.__name__)
    profile: BaseColumnProfilerT = cls(data["name"], class_options)

    # Shallow-copy so popping "times" does not mutate the caller's dict.
    data = dict(data)
    time_vals = data.pop("times")
    setattr(profile, "times", defaultdict(float, time_vals))

    for attr, value in data.items():
        if "__calculations" in attr:
            # Calculation maps were serialized as method *names*; rebind
            # each name to the underlying function on this class. Work on
            # a copy so the caller's nested dict is left untouched.
            value = dict(value)
            for metric, function in value.items():
                if not hasattr(profile, function):
                    raise AttributeError(
                        f"Object {type(profile)} has no attribute {function}."
                    )
                value[metric] = getattr(profile, function).__func__
        setattr(profile, attr, value)

    return profile


BaseColumnPrimitiveTypeProfilerT = TypeVar(
"BaseColumnPrimitiveTypeProfilerT", bound="BaseColumnPrimitiveTypeProfiler"
Expand Down Expand Up @@ -282,7 +319,7 @@ def _update_column_base_properties(self, profile: dict) -> None:
:type profile: base data profile dict
:return: None
"""
self.match_count += profile.pop("match_count")
self.match_count += int(profile.pop("match_count"))
BaseColumnProfiler._update_column_base_properties(self, profile)

def _add_helper(
Expand Down
130 changes: 79 additions & 51 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
import datasketches
from pandas import DataFrame, Series

from . import BaseColumnProfiler, utils
from . import utils
from .base_column_profilers import BaseColumnProfiler
from .profiler_options import CategoricalOptions


Expand Down Expand Up @@ -188,6 +189,55 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:

return merged_profile

@property
def gini_impurity(self) -> float | None:
"""
Return Gini Impurity.

Gini Impurity is a way to calculate
likelihood of an incorrect classification of a new instance of
a random variable.

G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
We are traversing through categories and calculating with the column

:return: None or Gini Impurity probability
"""
if self.sample_size == 0:
return None
gini_sum: float = 0
for i in self._categories:
gini_sum += (self._categories[i] / self.sample_size) * (
1 - (self._categories[i] / self.sample_size)
)
return gini_sum

@property
def unalikeability(self) -> float | None:
"""
Return Unlikeability.

Unikeability checks for "how often observations differ from one another"
Reference: Perry, M. and Kader, G. Variation as Unalikeability.
Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
Cij = 1 if i!=j, 0 if i=j

:return: None or unlikeability probability
"""
if self.sample_size == 0:
return None
elif self.sample_size == 1:
return 0
unalike_sum: int = 0
for category in self._categories:
unalike_sum += (
self.sample_size - self._categories[category]
) * self._categories[category]
unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
return unalike

def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
"""
Find the differences for CategoricalColumns.
Expand Down Expand Up @@ -267,6 +317,22 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
"""
return self.profile

@classmethod
def load_from_dict(cls, data: dict, options: dict | None = None) -> CategoricalColumn:
    """
    Parse attribute from json dictionary into self.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]
    :param options: options for loading column profiler params from dictionary
    :type options: dict | None

    :return: Profiler with attributes populated.
    :rtype: CategoricalColumn
    """
    value = data.pop("_categories")
    # BUG FIX: forward ``options`` to the base loader instead of silently
    # dropping it, so class-specific options reach the constructor.
    profile = super().load_from_dict(data, options)
    # Restore categories as a defaultdict so unseen keys count as 0.
    setattr(profile, "_categories", defaultdict(int, value))
    return profile

@property
def profile(self) -> dict:
"""
Expand Down Expand Up @@ -479,6 +545,17 @@ def _merge_categories_cms(
categories.pop(cat)
return cms3, categories, max_num_heavy_hitters

def _get_categories_full(self, df_series) -> dict:
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added descript and renamed method

"""Get the unique counts (categories) of a series.

:param df_series: df series with nulls removed
:type df_series: pandas.core.series.Series
:return: dict of counts for each unique value
:rtype: dict
"""
category_count: dict = df_series.value_counts(dropna=False).to_dict()
return category_count

@BaseColumnProfiler._timeit(name="categories")
def _update_categories(
self,
Expand Down Expand Up @@ -524,7 +601,7 @@ def _update_categories(
self._cms_max_num_heavy_hitters,
)
else:
category_count = df_series.value_counts(dropna=False).to_dict()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed to use method

category_count = self._get_categories_full(df_series)
self._categories = utils.add_nested_dictionaries(
self._categories, category_count
)
Expand Down Expand Up @@ -570,52 +647,3 @@ def update(self, df_series: Series) -> CategoricalColumn:
self._update_helper(df_series, profile)

return self

@property
def gini_impurity(self) -> float | None:
"""
Return Gini Impurity.

Gini Impurity is a way to calculate
likelihood of an incorrect classification of a new instance of
a random variable.

G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
We are traversing through categories and calculating with the column

:return: None or Gini Impurity probability
"""
if self.sample_size == 0:
return None
gini_sum: float = 0
for i in self._categories:
gini_sum += (self._categories[i] / self.sample_size) * (
1 - (self._categories[i] / self.sample_size)
)
return gini_sum

@property
def unalikeability(self) -> float | None:
"""
Return Unlikeability.

Unikeability checks for "how often observations differ from one another"
Reference: Perry, M. and Kader, G. Variation as Unalikeability.
Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
Cij = 1 if i!=j, 0 if i=j

:return: None or unlikeability probability
"""
if self.sample_size == 0:
return None
elif self.sample_size == 1:
return 0
unalike_sum: int = 0
for category in self._categories:
unalike_sum += (
self.sample_size - self._categories[category]
) * self._categories[category]
unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
return unalike
Loading