-
Notifications
You must be signed in to change notification settings - Fork 185
Staging/dev/profile serialization #940
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
171e1be
bf7077b
fe0e6f1
168acb2
188dea0
65f74b7
0e75b08
587fc78
91b0e9b
0c8e02a
e79c4c0
a37a54a
232e2ab
2ad1de4
c13c318
438feb5
1ad3d83
16c8d64
2ff5f18
2173f8e
f36e45d
65af593
cf7b237
2159b78
34c2a2b
e8ba88c
046fa7f
9856720
d81a9e8
e654afe
f07faf8
454d429
1f573f8
146c547
e8a3361
54efd86
6ccc203
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,12 +1,98 @@ | ||
| """Package for providing statistics and predictions for a given dataset.""" | ||
| from . import json_decoder | ||
| from .base_column_profilers import BaseColumnProfiler | ||
| from .categorical_column_profile import CategoricalColumn | ||
| from .column_profile_compilers import ( | ||
| BaseCompiler, | ||
| ColumnDataLabelerCompiler, | ||
| ColumnPrimitiveTypeProfileCompiler, | ||
| ColumnStatsProfileCompiler, | ||
| ) | ||
| from .data_labeler_column_profile import DataLabelerColumn | ||
| from .datetime_column_profile import DateTimeColumn | ||
| from .float_column_profile import FloatColumn | ||
| from .int_column_profile import IntColumn | ||
| from .numerical_column_stats import NumericStatsMixin | ||
| from .order_column_profile import OrderColumn | ||
| from .profile_builder import Profiler, StructuredProfiler, UnstructuredProfiler | ||
| from .profile_builder import ( | ||
| Profiler, | ||
| StructuredColProfiler, | ||
| StructuredProfiler, | ||
| UnstructuredProfiler, | ||
| ) | ||
| from .profiler_options import ( | ||
| BaseInspectorOptions, | ||
| BooleanOption, | ||
| CategoricalOptions, | ||
| CorrelationOptions, | ||
| DataLabelerOptions, | ||
| DateTimeOptions, | ||
| FloatOptions, | ||
| HistogramOption, | ||
| HyperLogLogOptions, | ||
| IntOptions, | ||
| ModeOption, | ||
| NumericalOptions, | ||
| OrderOptions, | ||
| PrecisionOptions, | ||
| ProfilerOptions, | ||
| RowStatisticsOptions, | ||
| StructuredOptions, | ||
| TextOptions, | ||
| TextProfilerOptions, | ||
| UniqueCountOptions, | ||
| UnstructuredOptions, | ||
| ) | ||
| from .text_column_profile import TextColumn | ||
| from .unstructured_labeler_profile import UnstructuredLabelerProfile | ||
|
|
||
| # set here to avoid circular imports | ||
| json_decoder._profiles = { | ||
| CategoricalColumn.__name__: CategoricalColumn, | ||
| FloatColumn.__name__: FloatColumn, | ||
| IntColumn.__name__: IntColumn, | ||
| DateTimeColumn.__name__: DateTimeColumn, | ||
| OrderColumn.__name__: OrderColumn, | ||
| DataLabelerColumn.__name__: DataLabelerColumn, | ||
| TextColumn.__name__: TextColumn, | ||
| } | ||
|
|
||
|
|
||
| json_decoder._compilers = { | ||
| ColumnDataLabelerCompiler.__name__: ColumnDataLabelerCompiler, | ||
| ColumnPrimitiveTypeProfileCompiler.__name__: ColumnPrimitiveTypeProfileCompiler, | ||
| ColumnStatsProfileCompiler.__name__: ColumnStatsProfileCompiler, | ||
| } | ||
|
|
||
| json_decoder._options = { | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. updated these from dev |
||
| BooleanOption.__name__: BooleanOption, | ||
| HistogramOption.__name__: HistogramOption, | ||
| ModeOption.__name__: ModeOption, | ||
| BaseInspectorOptions.__name__: BaseInspectorOptions, | ||
| NumericalOptions.__name__: NumericalOptions, | ||
| IntOptions.__name__: IntOptions, | ||
| PrecisionOptions.__name__: PrecisionOptions, | ||
| FloatOptions.__name__: FloatOptions, | ||
| TextOptions.__name__: TextOptions, | ||
| DateTimeOptions.__name__: DateTimeOptions, | ||
| OrderOptions.__name__: OrderOptions, | ||
| CategoricalOptions.__name__: CategoricalOptions, | ||
| CorrelationOptions.__name__: CorrelationOptions, | ||
| UniqueCountOptions.__name__: UniqueCountOptions, | ||
| HyperLogLogOptions.__name__: HyperLogLogOptions, | ||
| RowStatisticsOptions.__name__: RowStatisticsOptions, | ||
| DataLabelerOptions.__name__: DataLabelerOptions, | ||
| TextProfilerOptions.__name__: TextProfilerOptions, | ||
| StructuredOptions.__name__: StructuredOptions, | ||
| UnstructuredOptions.__name__: UnstructuredOptions, | ||
| ProfilerOptions.__name__: ProfilerOptions, | ||
| } | ||
|
|
||
|
|
||
| json_decoder._profilers = { | ||
| StructuredProfiler.__name__: StructuredProfiler, | ||
| } | ||
|
|
||
| json_decoder._structured_col_profiler = { | ||
| StructuredColProfiler.__name__: StructuredColProfiler, | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,8 @@ | |
| import datasketches | ||
| from pandas import DataFrame, Series | ||
|
|
||
| from . import BaseColumnProfiler, utils | ||
| from . import utils | ||
| from .base_column_profilers import BaseColumnProfiler | ||
| from .profiler_options import CategoricalOptions | ||
|
|
||
|
|
||
|
|
@@ -188,6 +189,55 @@ def __add__(self, other: CategoricalColumn) -> CategoricalColumn: | |
|
|
||
| return merged_profile | ||
|
|
||
| @property | ||
| def gini_impurity(self) -> float | None: | ||
| """ | ||
| Return Gini Impurity. | ||
|
|
||
| Gini Impurity is a way to calculate | ||
| likelihood of an incorrect classification of a new instance of | ||
| a random variable. | ||
|
|
||
| G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes. | ||
| We traverse the categories and compute each term from the category's count in the column. | ||
|
|
||
| :return: None or Gini Impurity probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| gini_sum: float = 0 | ||
| for i in self._categories: | ||
| gini_sum += (self._categories[i] / self.sample_size) * ( | ||
| 1 - (self._categories[i] / self.sample_size) | ||
| ) | ||
| return gini_sum | ||
|
|
||
| @property | ||
| def unalikeability(self) -> float | None: | ||
| """ | ||
| Return Unalikeability. | ||
|
|
||
| Unalikeability checks for "how often observations differ from one another" | ||
| Reference: Perry, M. and Kader, G. Variation as Unalikeability. | ||
| Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60. | ||
|
|
||
| U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n) | ||
| Cij = 1 if i!=j, 0 if i=j | ||
|
|
||
| :return: None or unalikeability probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| elif self.sample_size == 1: | ||
| return 0 | ||
| unalike_sum: int = 0 | ||
| for category in self._categories: | ||
| unalike_sum += ( | ||
| self.sample_size - self._categories[category] | ||
| ) * self._categories[category] | ||
| unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size) | ||
| return unalike | ||
|
|
||
| def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: | ||
| """ | ||
| Find the differences for CategoricalColumns. | ||
|
|
@@ -267,6 +317,22 @@ def report(self, remove_disabled_flag: bool = False) -> dict: | |
| """ | ||
| return self.profile | ||
|
|
||
| @classmethod | ||
| def load_from_dict(cls, data: dict, options: dict | None = None): | ||
| """ | ||
| Parse attributes from a JSON dictionary into a new profile. | ||
|
|
||
| :param data: dictionary with attributes and values. | ||
| :type data: dict[string, Any] | ||
|
|
||
| :return: Profiler with attributes populated. | ||
| :rtype: CategoricalColumn | ||
| """ | ||
| value = data.pop("_categories") | ||
| profile = super().load_from_dict(data) | ||
| setattr(profile, "_categories", defaultdict(int, value)) | ||
| return profile | ||
|
|
||
| @property | ||
| def profile(self) -> dict: | ||
| """ | ||
|
|
@@ -479,6 +545,17 @@ def _merge_categories_cms( | |
| categories.pop(cat) | ||
| return cms3, categories, max_num_heavy_hitters | ||
|
|
||
| def _get_categories_full(self, df_series) -> dict: | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. added descript and renamed method |
||
| """Get the unique counts (categories) of a series. | ||
|
|
||
| :param df_series: df series with nulls removed | ||
| :type df_series: pandas.core.series.Series | ||
| :return: dict of counts for each unique value | ||
| :rtype: dict | ||
| """ | ||
| category_count: dict = df_series.value_counts(dropna=False).to_dict() | ||
| return category_count | ||
|
|
||
| @BaseColumnProfiler._timeit(name="categories") | ||
| def _update_categories( | ||
| self, | ||
|
|
@@ -524,7 +601,7 @@ def _update_categories( | |
| self._cms_max_num_heavy_hitters, | ||
| ) | ||
| else: | ||
| category_count = df_series.value_counts(dropna=False).to_dict() | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. fixed to use method |
||
| category_count = self._get_categories_full(df_series) | ||
| self._categories = utils.add_nested_dictionaries( | ||
| self._categories, category_count | ||
| ) | ||
|
|
@@ -570,52 +647,3 @@ def update(self, df_series: Series) -> CategoricalColumn: | |
| self._update_helper(df_series, profile) | ||
|
|
||
| return self | ||
|
|
||
| @property | ||
| def gini_impurity(self) -> float | None: | ||
| """ | ||
| Return Gini Impurity. | ||
|
|
||
| Gini Impurity is a way to calculate | ||
| likelihood of an incorrect classification of a new instance of | ||
| a random variable. | ||
|
|
||
| G = Σ(i=1; J): P(i) * (1 - P(i)), where i is the category classes. | ||
| We traverse the categories and compute each term from the category's count in the column. | ||
|
|
||
| :return: None or Gini Impurity probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| gini_sum: float = 0 | ||
| for i in self._categories: | ||
| gini_sum += (self._categories[i] / self.sample_size) * ( | ||
| 1 - (self._categories[i] / self.sample_size) | ||
| ) | ||
| return gini_sum | ||
|
|
||
| @property | ||
| def unalikeability(self) -> float | None: | ||
| """ | ||
| Return Unalikeability. | ||
|
|
||
| Unalikeability checks for "how often observations differ from one another" | ||
| Reference: Perry, M. and Kader, G. Variation as Unalikeability. | ||
| Teaching Statistics, Vol. 27, No. 2 (2005), pp. 58-60. | ||
|
|
||
| U = Σ(i=1,n)Σ(j=1,n): (Cij)/(n**2-n) | ||
| Cij = 1 if i!=j, 0 if i=j | ||
|
|
||
| :return: None or unalikeability probability | ||
| """ | ||
| if self.sample_size == 0: | ||
| return None | ||
| elif self.sample_size == 1: | ||
| return 0 | ||
| unalike_sum: int = 0 | ||
| for category in self._categories: | ||
| unalike_sum += ( | ||
| self.sample_size - self._categories[category] | ||
| ) * self._categories[category] | ||
| unalike: float = unalike_sum / (self.sample_size**2 - self.sample_size) | ||
| return unalike | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
updated for dev options that weren't in profiler serial