Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion dataprofiler/profilers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,11 @@
from . import json_decoder
from .base_column_profilers import BaseColumnProfiler
from .categorical_column_profile import CategoricalColumn
from .column_profile_compilers import BaseCompiler, ColumnPrimitiveTypeProfileCompiler
from .column_profile_compilers import (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

allow for deserialization

BaseCompiler,
ColumnPrimitiveTypeProfileCompiler,
ColumnStatsProfileCompiler,
)
from .data_labeler_column_profile import DataLabelerColumn
from .datetime_column_profile import DateTimeColumn
from .float_column_profile import FloatColumn
Expand All @@ -27,4 +31,5 @@

json_decoder._compilers = {
ColumnPrimitiveTypeProfileCompiler.__name__: ColumnPrimitiveTypeProfileCompiler,
ColumnStatsProfileCompiler.__name__: ColumnStatsProfileCompiler,
}
104 changes: 54 additions & 50 deletions dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,55 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn:
)
return merged_profile

@property
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

just movement to have a similar structure across all profilers

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

props not at bottom of class.

def gini_impurity(self) -> float | None:
"""
Return Gini Impurity.

Gini Impurity is a way to calculate
likelihood of an incorrect classification of a new instance of
a random variable.

G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
We are traversing through categories and calculating with the column

:return: None or Gini Impurity probability
"""
if self.sample_size == 0:
return None
gini_sum: float = 0
for i in self._categories:
gini_sum += (self._categories[i] / self.sample_size) * (
1 - (self._categories[i] / self.sample_size)
)
return gini_sum

@property
def unalikeability(self) -> float | None:
"""
Return Unlikeability.

Unikeability checks for "how often observations differ from one another"
Reference: Perry, M. and Kader, G. Variation as Unalikeability.
Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
Cij = 1 if i!=j, 0 if i=j

:return: None or unlikeability probability
"""
if self.sample_size == 0:
return None
elif self.sample_size == 1:
return 0
unalike_sum: int = 0
for category in self._categories:
unalike_sum += (
self.sample_size - self._categories[category]
) * self._categories[category]
unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
return unalike

def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
"""
Find the differences for CategoricalColumns.
Expand Down Expand Up @@ -228,6 +277,10 @@ def is_match(self) -> bool:
is_match = True
return is_match

def _get_categories(self, df_series):
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

slight refactor which moves the get to its own function. important to allow usage of __calculations and to accurately use the timeit functionality.

category_count = df_series.value_counts(dropna=False).to_dict()
return category_count

@BaseColumnProfiler._timeit(name="categories")
def _update_categories(
self,
Expand All @@ -250,7 +303,7 @@ def _update_categories(
:type df_series: pandas.DataFrame
:return: None
"""
category_count = df_series.value_counts(dropna=False).to_dict()
category_count = self._get_categories(df_series)
self._categories = utils.add_nested_dictionaries(
self._categories, category_count
)
Expand Down Expand Up @@ -292,52 +345,3 @@ def update(self, df_series: Series) -> CategoricalColumn:
self._update_helper(df_series, profile)

return self

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

moved above

@property
def gini_impurity(self) -> float | None:
    """
    Return Gini Impurity.

    Gini Impurity is a way to calculate
    likelihood of an incorrect classification of a new instance of
    a random variable.

    G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes.
    We are traversing through categories and calculating with the column

    :return: None or Gini Impurity probability
    """
    # No samples seen yet: impurity is undefined, not zero.
    if self.sample_size == 0:
        return None
    gini_sum: float = 0
    # Accumulate p * (1 - p) for the empirical probability p of each category.
    for i in self._categories:
        gini_sum += (self._categories[i] / self.sample_size) * (
            1 - (self._categories[i] / self.sample_size)
        )
    return gini_sum

@property
def unalikeability(self) -> float | None:
    """
    Return Unalikeability.

    Unalikeability checks for "how often observations differ from one another"
    Reference: Perry, M. and Kader, G. Variation as Unalikeability.
    Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60.

    U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n)
    Cij = 1 if i!=j, 0 if i=j

    :return: None or unalikeability probability
    """
    # No samples seen yet: unalikeability is undefined.
    if self.sample_size == 0:
        return None
    # A single observation cannot differ from itself.
    elif self.sample_size == 1:
        return 0
    unalike_sum: int = 0
    # Each category with `count` members forms count * (n - count)
    # ordered pairs of differing observations.
    for category in self._categories:
        unalike_sum += (
            self.sample_size - self._categories[category]
        ) * self._categories[category]
    unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size)
    return unalike
82 changes: 55 additions & 27 deletions dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,29 @@
"""Index profile analysis for individual col within structured profiling."""
from __future__ import annotations

from typing import cast
from abc import abstractmethod
from typing import Protocol, TypeVar

import numpy as np
from pandas import DataFrame, Series

from . import utils
from .base_column_profilers import BaseColumnProfiler
from .profiler_options import OrderOptions


class Comparable(Protocol):
    """Protocol for ensuring comparable types, in this case both floats or strings."""

    # Structural typing: any class defining __lt__ against its own type
    # satisfies this protocol; no explicit inheritance is required.
    @abstractmethod
    def __lt__(self: CT, other: CT) -> bool:
        """Protocol for ensuring comparable values."""
        pass


# Type variable bound to Comparable so both operands of a comparison are
# required to share the same comparable type (e.g. float with float,
# str with str — never mixed).
CT = TypeVar("CT", bound=Comparable)


Comment on lines +15 to +26
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Allows for comparable types. Specifying float | str would error, since a float cannot be compared against a str.

class OrderColumn(BaseColumnProfiler):
"""
Index column profile subclass of BaseColumnProfiler.
Expand All @@ -33,28 +47,31 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None:
"OrderColumn parameter 'options' must be of type" " OrderOptions."
)
self.order: str | None = None
self._last_value: int | None = None
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

int was incorrect here, it is either a float, or a str

self._first_value: int | None = None
self._last_value: float | str | None = None
self._first_value: float | str | None = None
self._piecewise: bool | None = False
self.__calculations: dict = {}
self._filter_properties_w_options(self.__calculations, options)
super().__init__(name)

@staticmethod
def _is_intersecting(
first_value1: int, last_value1: int, first_value2: int, last_value2: int
first_value1: CT,
last_value1: CT,
first_value2: CT,
last_value2: CT,
) -> bool:
"""
Check to see if the range of the datasets intersect.

:param first_value1: beginning value of dataset 1
:type first_value1: Integer
:type first_value1: Float | String
:param last_value1: last value of dataset 1
:type last_value1: Integer
:type last_value1: Float | String
:param first_value2: beginning value of dataset 2
:type first_value2: Integer
:type first_value2: Float | String
:param last_value2: last value of dataset 2
:type last_value2: Integer
:type last_value2: Float | String
:return: Whether or not there is an intersection
:rtype: Bool
"""
Expand All @@ -78,19 +95,22 @@ def _is_intersecting(

@staticmethod
def _is_enveloping(
first_value1: int, last_value1: int, first_value2: int, last_value2: int
first_value1: CT,
last_value1: CT,
first_value2: CT,
last_value2: CT,
) -> bool:
"""
Check to see if the range of the dataset 1 envelopes dataset 2.

:param first_value1: beginning value of dataset 1
:type first_value1: Integer
:type first_value1: Float | String
:param last_value1: last value of dataset 1
:type last_value1: Integer
:type last_value1: Float | String
:param first_value2: beginning value of dataset 2
:type first_value2: Integer
:type first_value2: Float | String
:param last_value2: last value of dataset 2
:type last_value2: Integer
:type last_value2: Float | String
:return: Whether or not there is an intersection
:rtype: Bool
"""
Expand All @@ -109,14 +129,14 @@ def _is_enveloping(
def _merge_order(
self,
order1: str,
first_value1: int,
last_value1: int,
first_value1: CT,
last_value1: CT,
piecewise1: bool,
order2: str,
first_value2: int,
last_value2: int,
first_value2: CT,
last_value2: CT,
piecewise2: bool,
) -> tuple[str, int, int, bool]:
) -> tuple[str, CT | None, CT | None, bool]:
"""
Add the order of two datasets together.

Expand All @@ -129,15 +149,15 @@ def _merge_order(
:param last_value2: last value of new dataset
:param piecewise2: new dataset is piecewise or not
:type order1: String
:type first_value1: Integer
:type last_value1: Integer
:type first_value1: Float | String
:type last_value1: Float | String
:type piecewise1: Boolean
:type order2: String
:type first_value2: Integer
:type last_value2: Integer
:type first_value2: Float | String
:type last_value2: Float | String
:type piecewise2: Boolean
:return: order, first_value, last_value, piecewise
:rtype: String, Int, Int, Boolean
:rtype: String, Float | String, Float | String, Boolean
"""
# Return either order if one is None
if not order1:
Expand All @@ -157,8 +177,8 @@ def _merge_order(

# Default initialization
order = "random"
first_value: int | None = None
last_value: int | None = None
first_value: CT | None = None
last_value: CT | None = None

if order1 == "random" or order2 == "random":
order = "random"
Expand Down Expand Up @@ -219,7 +239,7 @@ def _merge_order(
) or order == "random":
piecewise = False

return order, cast(int, first_value), cast(int, last_value), piecewise
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Casting to an int was not appropriate; this is fixed with the CT type variable.

return order, first_value, last_value, piecewise

def __add__(self, other: OrderColumn) -> OrderColumn:
"""
Expand Down Expand Up @@ -283,7 +303,15 @@ def load_from_dict(cls, data):
:rtype: CategoricalColumn
"""
# This is an ambiguous call to super classes.
return super().load_from_dict(data)
profile = super().load_from_dict(data)
try:
if profile.sample_size:
profile._first_value = np.float64(profile._first_value)
profile._last_value = np.float64(profile._last_value)
except ValueError:
profile._first_value = data["_first_value"]
profile._last_value = data["_last_value"]
return profile

@property
def profile(self) -> dict:
Expand Down
Loading