-
Notifications
You must be signed in to change notification settings - Fork 185
Add Serialization and Deserialization Tests for Stats Compiler, plus refactors for order Typing #887
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add Serialization and Deserialization Tests for Stats Compiler, plus refactors for order Typing #887
Changes from all commits
e98d749
f08ee37
d71eae4
ea731c6
17d6776
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -73,6 +73,55 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn: | |
| ) | ||
| return merged_profile | ||
|
|
||
| @property | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. just movement to have a similar structure across all profilers
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. props not at bottom of class. |
||
| def gini_impurity(self) -> float | None: | ||
| """ | ||
| Return Gini Impurity. | ||
|
|
||
| Gini Impurity is a way to calculate | ||
| likelihood of an incorrect classification of a new instance of | ||
| a random variable. | ||
|
|
||
| G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes. | ||
| We are traversing through categories and calculating with the column | ||
|
|
||
| :return: None or Gini Impurity probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| gini_sum: float = 0 | ||
| for i in self._categories: | ||
| gini_sum += (self._categories[i] / self.sample_size) * ( | ||
| 1 - (self._categories[i] / self.sample_size) | ||
| ) | ||
| return gini_sum | ||
|
|
||
| @property | ||
| def unalikeability(self) -> float | None: | ||
| """ | ||
| Return Unalikeability. | ||
|
|
||
| Unalikeability checks for "how often observations differ from one another" | ||
| Reference: Perry, M. and Kader, G. Variation as Unalikeability. | ||
| Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60. | ||
|
|
||
| U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n) | ||
| Cij = 1 if i!=j, 0 if i=j | ||
|
|
||
| :return: None or unalikeability probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| elif self.sample_size == 1: | ||
| return 0 | ||
| unalike_sum: int = 0 | ||
| for category in self._categories: | ||
| unalike_sum += ( | ||
| self.sample_size - self._categories[category] | ||
| ) * self._categories[category] | ||
| unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size) | ||
| return unalike | ||
|
|
||
| def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: | ||
| """ | ||
| Find the differences for CategoricalColumns. | ||
|
|
@@ -228,6 +277,10 @@ def is_match(self) -> bool: | |
| is_match = True | ||
| return is_match | ||
|
|
||
| def _get_categories(self, df_series): | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. slight refactor which moves the get to its own function. important to allow usage of |
||
| category_count = df_series.value_counts(dropna=False).to_dict() | ||
| return category_count | ||
|
|
||
| @BaseColumnProfiler._timeit(name="categories") | ||
| def _update_categories( | ||
| self, | ||
|
|
@@ -250,7 +303,7 @@ def _update_categories( | |
| :type df_series: pandas.DataFrame | ||
| :return: None | ||
| """ | ||
| category_count = df_series.value_counts(dropna=False).to_dict() | ||
| category_count = self._get_categories(df_series) | ||
| self._categories = utils.add_nested_dictionaries( | ||
| self._categories, category_count | ||
| ) | ||
|
|
@@ -292,52 +345,3 @@ def update(self, df_series: Series) -> CategoricalColumn: | |
| self._update_helper(df_series, profile) | ||
|
|
||
| return self | ||
|
|
||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. moved above |
||
| @property | ||
| def gini_impurity(self) -> float | None: | ||
| """ | ||
| Return Gini Impurity. | ||
|
|
||
| Gini Impurity is a way to calculate | ||
| likelihood of an incorrect classification of a new instance of | ||
| a random variable. | ||
|
|
||
| G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes. | ||
| We are traversing through categories and calculating with the column | ||
|
|
||
| :return: None or Gini Impurity probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| gini_sum: float = 0 | ||
| for i in self._categories: | ||
| gini_sum += (self._categories[i] / self.sample_size) * ( | ||
| 1 - (self._categories[i] / self.sample_size) | ||
| ) | ||
| return gini_sum | ||
|
|
||
| @property | ||
| def unalikeability(self) -> float | None: | ||
| """ | ||
| Return Unalikeability. | ||
|
|
||
| Unalikeability checks for "how often observations differ from one another" | ||
| Reference: Perry, M. and Kader, G. Variation as Unalikeability. | ||
| Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60. | ||
|
|
||
| U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n) | ||
| Cij = 1 if i!=j, 0 if i=j | ||
|
|
||
| :return: None or unalikeability probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| elif self.sample_size == 1: | ||
| return 0 | ||
| unalike_sum: int = 0 | ||
| for category in self._categories: | ||
| unalike_sum += ( | ||
| self.sample_size - self._categories[category] | ||
| ) * self._categories[category] | ||
| unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size) | ||
| return unalike | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,15 +1,29 @@ | ||
| """Index profile analysis for individual col within structured profiling.""" | ||
| from __future__ import annotations | ||
|
|
||
| from typing import cast | ||
| from abc import abstractmethod | ||
| from typing import Protocol, TypeVar | ||
|
|
||
| import numpy as np | ||
| from pandas import DataFrame, Series | ||
|
|
||
| from . import utils | ||
| from .base_column_profilers import BaseColumnProfiler | ||
| from .profiler_options import OrderOptions | ||
|
|
||
|
|
||
| class Comparable(Protocol): | ||
| """Protocol for ensuring comparable types, in this case both floats or strings.""" | ||
|
|
||
| @abstractmethod | ||
| def __lt__(self: CT, other: CT) -> bool: | ||
| """Protocol for ensuring comparable values.""" | ||
| pass | ||
|
|
||
|
|
||
| CT = TypeVar("CT", bound=Comparable) | ||
|
|
||
|
|
||
|
Comment on lines
+15
to
+26
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Allows for comparable types. Specifying float | str would error as you can't do a float + str. |
||
| class OrderColumn(BaseColumnProfiler): | ||
| """ | ||
| Index column profile subclass of BaseColumnProfiler. | ||
|
|
@@ -33,28 +47,31 @@ def __init__(self, name: str | None, options: OrderOptions = None) -> None: | |
| "OrderColumn parameter 'options' must be of type" " OrderOptions." | ||
| ) | ||
| self.order: str | None = None | ||
| self._last_value: int | None = None | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| self._first_value: int | None = None | ||
| self._last_value: float | str | None = None | ||
| self._first_value: float | str | None = None | ||
| self._piecewise: bool | None = False | ||
| self.__calculations: dict = {} | ||
| self._filter_properties_w_options(self.__calculations, options) | ||
| super().__init__(name) | ||
|
|
||
| @staticmethod | ||
| def _is_intersecting( | ||
| first_value1: int, last_value1: int, first_value2: int, last_value2: int | ||
| first_value1: CT, | ||
| last_value1: CT, | ||
| first_value2: CT, | ||
| last_value2: CT, | ||
| ) -> bool: | ||
| """ | ||
| Check to see if the range of the datasets intersect. | ||
|
|
||
| :param first_value1: beginning value of dataset 1 | ||
| :type first_value1: Integer | ||
| :type first_value1: Float | String | ||
| :param last_value1: last value of dataset 1 | ||
| :type last_value1: Integer | ||
| :type last_value1: Float | String | ||
| :param first_value2: beginning value of dataset 2 | ||
| :type first_value2: Integer | ||
| :type first_value2: Float | String | ||
| :param last_value2: last value of dataset 2 | ||
| :type last_value2: Integer | ||
| :type last_value2: Float | String | ||
| :return: Whether or not there is an intersection | ||
| :rtype: Bool | ||
| """ | ||
|
|
@@ -78,19 +95,22 @@ def _is_intersecting( | |
|
|
||
| @staticmethod | ||
| def _is_enveloping( | ||
| first_value1: int, last_value1: int, first_value2: int, last_value2: int | ||
| first_value1: CT, | ||
| last_value1: CT, | ||
| first_value2: CT, | ||
| last_value2: CT, | ||
| ) -> bool: | ||
| """ | ||
| Check to see if the range of the dataset 1 envelopes dataset 2. | ||
|
|
||
| :param first_value1: beginning value of dataset 1 | ||
| :type first_value1: Integer | ||
| :type first_value1: Float | String | ||
| :param last_value1: last value of dataset 1 | ||
| :type last_value1: Integer | ||
| :type last_value1: Float | String | ||
| :param first_value2: beginning value of dataset 2 | ||
| :type first_value2: Integer | ||
| :type first_value2: Float | String | ||
| :param last_value2: last value of dataset 2 | ||
| :type last_value2: Integer | ||
| :type last_value2: Float | String | ||
| :return: Whether or not there is an intersection | ||
| :rtype: Bool | ||
| """ | ||
|
|
@@ -109,14 +129,14 @@ def _is_enveloping( | |
| def _merge_order( | ||
| self, | ||
| order1: str, | ||
| first_value1: int, | ||
| last_value1: int, | ||
| first_value1: CT, | ||
| last_value1: CT, | ||
| piecewise1: bool, | ||
| order2: str, | ||
| first_value2: int, | ||
| last_value2: int, | ||
| first_value2: CT, | ||
| last_value2: CT, | ||
| piecewise2: bool, | ||
| ) -> tuple[str, int, int, bool]: | ||
| ) -> tuple[str, CT | None, CT | None, bool]: | ||
| """ | ||
| Add the order of two datasets together. | ||
|
|
||
|
|
@@ -129,15 +149,15 @@ def _merge_order( | |
| :param last_value2: last value of new dataset | ||
| :param piecewise2: new dataset is piecewise or not | ||
| :type order1: String | ||
| :type first_value1: Integer | ||
| :type last_value1: Integer | ||
| :type first_value1: Float | String | ||
| :type last_value1: Float | String | ||
| :type piecewise1: Boolean | ||
| :type order2: String | ||
| :type first_value2: Integer | ||
| :type last_value2: Integer | ||
| :type first_value2: Float | String | ||
| :type last_value2: Float | String | ||
| :type piecewise2: Boolean | ||
| :return: order, first_value, last_value, piecewise | ||
| :rtype: String, Int, Int, Boolean | ||
| :rtype: String, Float | String, Float | String, Boolean | ||
| """ | ||
| # Return either order if one is None | ||
| if not order1: | ||
|
|
@@ -157,8 +177,8 @@ def _merge_order( | |
|
|
||
| # Default initialization | ||
| order = "random" | ||
| first_value: int | None = None | ||
| last_value: int | None = None | ||
| first_value: CT | None = None | ||
| last_value: CT | None = None | ||
|
|
||
| if order1 == "random" or order2 == "random": | ||
| order = "random" | ||
|
|
@@ -219,7 +239,7 @@ def _merge_order( | |
| ) or order == "random": | ||
| piecewise = False | ||
|
|
||
| return order, cast(int, first_value), cast(int, last_value), piecewise | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. casting to an |
||
| return order, first_value, last_value, piecewise | ||
|
|
||
| def __add__(self, other: OrderColumn) -> OrderColumn: | ||
| """ | ||
|
|
@@ -283,7 +303,15 @@ def load_from_dict(cls, data): | |
| :rtype: CategoricalColumn | ||
| """ | ||
| # This is an ambiguous call to super classes. | ||
| return super().load_from_dict(data) | ||
| profile = super().load_from_dict(data) | ||
| try: | ||
| if profile.sample_size: | ||
| profile._first_value = np.float64(profile._first_value) | ||
| profile._last_value = np.float64(profile._last_value) | ||
| except ValueError: | ||
| profile._first_value = data["_first_value"] | ||
| profile._last_value = data["_last_value"] | ||
| return profile | ||
|
|
||
| @property | ||
| def profile(self) -> dict: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
allow for deserialization