
Commit 1e17a07

Louquinze authored and eddiebergman committed
Text Processing (#1300)
* commit meta learning data bases
* commit changed files
* commit new files
* fixed experimental settings
* implemented last comments on old PR
* adapted metalearning to last commit
* add a text preprocessing example
* integrated feedback
* new changes on *.csv files
* reset changes
* add changes for merging
* add changes for merging
* add changes for merging
* try to merge
* fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed at a larger scale)
* fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed at a larger scale)
* fixed string representation for metalearning (some sort of hot fix, maybe this needs to be fixed at a larger scale)
* init
* init
* commit changes for text preprocessing
* text preprocessing commit
* fix metalearning
* fix metalearning
* adapted test to new text feature
* fix style guide issues
* integrate PR comments
* integrate PR comments
* implemented the comments on the last PR
* fitted operation is not in place, therefore we have to assign the fitted self.preprocessor again to itself
* add first text processing tests
* add first text processing tests
* including comments from 01.25.
* including comments from 01.28.
* including comments from 01.28.
* including comments from 01.28.
* including comments from 01.31.
1 parent 43299a9 commit 1e17a07

133 files changed (+22,962 additions, -18,279 deletions)

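In short, this commit teaches auto-sklearn to recognise a third feature type, `string`, alongside `categorical` and `numerical`, and routes such columns through a new text preprocessing pipeline (bag-of-words encoding followed by TruncatedSVD feature reduction). A minimal usage sketch, assuming the estimator API of this branch; the toy dataset, column names and time budget are illustrative only:

import pandas as pd
from autosklearn.classification import AutoSklearnClassifier

# Hypothetical toy data: one free-text column and one numerical column
X = pd.DataFrame({
    "review": ["great product", "broke after a day", "does what it says", "would buy again"],
    "price": [9.99, 4.50, 12.00, 7.25],
})
y = [1, 0, 1, 1]

# Columns with the pandas "string" dtype are mapped to the new 'string'
# feature type by the feature validator (see feature_validator.py below)
X["review"] = X["review"].astype("string")

automl = AutoSklearnClassifier(time_left_for_this_task=120)
automl.fit(X, y)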

autosklearn/data/feature_validator.py

Lines changed: 6 additions & 11 deletions
@@ -124,9 +124,9 @@ def fit(
             ))
 
         for ft in self.feat_type.values():
-            if ft.lower() not in ['categorical', 'numerical']:
-                raise ValueError('Only `Categorical` and `Numerical` are '
-                                 'valid feature types, you passed `%s`' % ft)
+            if ft.lower() not in ['categorical', 'numerical', 'string']:
+                raise ValueError('Only `Categorical`, `Numerical` and `String` are '
+                                 'valid feature types')
 
         if X_test is not None:
             self._check_data(X_test)
@@ -262,7 +262,7 @@ def get_feat_type_from_columns(
     ) -> Dict[Union[str, int], str]:
         """
         Returns a dictionary that maps pandas dataframe columns to a feature type.
-        This feature type can be categorical or numerical
+        This feature type can be categorical, numerical or string
 
         Parameters
         ----------
@@ -284,8 +284,9 @@ def get_feat_type_from_columns(
                 raise ValueError("Auto-sklearn does not yet support sparse pandas Series."
                                  f" Please convert {column} to a dense format.")
             elif X[column].dtype.name in ['category', 'bool']:
-
                 feat_type[column] = 'categorical'
+            elif X[column].dtype.name == "string":
+                feat_type[column] = 'string'
             # Move away from np.issubdtype as it causes
             # TypeError: data type not understood in certain pandas types
             elif not is_numeric_dtype(X[column]):
@@ -357,12 +358,6 @@ def list_to_dataframe(
 
         # Store the dtypes and use in case of re-fit
         if len(self.dtypes) == 0:
-            # Categorical data is inferred as string. Convert to categorical.
-            # Warn the user about dtypes or request him to use a dataframe
-            for col in X_train.columns:
-                if X_train[col].dtype.name == 'string':
-                    X_train[col] = X_train[col].astype('category')
-
             self.dtypes = {col: X_train[col].dtype.name.lower() for col in X_train.columns}
         else:
             for col in X_train.columns:

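For illustration, a standalone sketch of the dtype dispatch that get_feat_type_from_columns now performs (simplified: the real method also handles sparse pandas Series and non-numeric object columns):

import pandas as pd

X = pd.DataFrame({
    "comment": pd.Series(["good", "bad"], dtype="string"),
    "colour": pd.Series(["red", "blue"], dtype="category"),
    "price": [1.0, 2.0],
})

feat_type = {}
for column in X.columns:
    if X[column].dtype.name in ['category', 'bool']:
        feat_type[column] = 'categorical'
    elif X[column].dtype.name == "string":   # new in this commit
        feat_type[column] = 'string'
    else:                                    # numeric dtypes fall through here
        feat_type[column] = 'numerical'

print(feat_type)  # {'comment': 'string', 'colour': 'categorical', 'price': 'numerical'}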
autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_None_10CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_None_3CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_None_5CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_None_holdout_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_SH-eta4-i_10CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_SH-eta4-i_3CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_SH-eta4-i_5CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/balanced_accuracy/askl2_portfolios/RF_SH-eta4-i_holdout_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_None_10CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_None_3CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_None_5CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_None_holdout_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_SH-eta4-i_10CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_SH-eta4-i_3CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_SH-eta4-i_5CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/log_loss/askl2_portfolios/RF_SH-eta4-i_holdout_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_None_10CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_None_3CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_None_5CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_None_holdout_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_SH-eta4-i_10CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_SH-eta4-i_3CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_SH-eta4-i_5CV_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/experimental/roc_auc/askl2_portfolios/RF_SH-eta4-i_holdout_iterative_es_if.json

Lines changed: 192 additions & 32 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/accuracy_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/accuracy_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/accuracy_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/accuracy_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/average_precision_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/average_precision_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/average_precision_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/average_precision_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/balanced_accuracy_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/balanced_accuracy_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/balanced_accuracy_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/balanced_accuracy_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_macro_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_macro_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_macro_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_macro_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_micro_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_micro_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_micro_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_micro_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_samples_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_samples_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_samples_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_samples_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_weighted_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_weighted_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_weighted_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/f1_weighted_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/log_loss_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/log_loss_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/log_loss_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/log_loss_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/mean_absolute_error_regression_dense/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/mean_absolute_error_regression_sparse/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/mean_squared_error_regression_dense/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/mean_squared_error_regression_sparse/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/mean_squared_log_error_regression_dense/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/mean_squared_log_error_regression_sparse/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/median_absolute_error_regression_dense/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/median_absolute_error_regression_sparse/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_macro_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_macro_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_macro_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_macro_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_micro_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_micro_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_micro_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_micro_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_samples_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_samples_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_samples_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_samples_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_weighted_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_weighted_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_weighted_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/precision_weighted_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/r2_regression_dense/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/r2_regression_sparse/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_macro_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_macro_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_macro_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_macro_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_micro_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_micro_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_micro_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_micro_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_samples_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_samples_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_samples_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_samples_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_weighted_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_weighted_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_weighted_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/recall_weighted_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/roc_auc_binary.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/roc_auc_binary.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/roc_auc_multiclass.classification_dense/configurations.csv

100644 → 100755
Lines changed: 206 additions & 206 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/roc_auc_multiclass.classification_sparse/configurations.csv

100644 → 100755
Lines changed: 198 additions & 198 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/root_mean_squared_error_regression_dense/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/files/root_mean_squared_error_regression_sparse/configurations.csv

100644 → 100755
Lines changed: 98 additions & 98 deletions
Large diffs are not rendered by default.

autosklearn/metalearning/metafeatures/metafeatures.py

Lines changed: 10 additions & 2 deletions
@@ -1082,11 +1082,19 @@ def calculate_all_metafeatures(X, y, categorical, dataset_name, logger,
     # TODO make sure this is done as efficient as possible (no copy for
     # sparse matrices because of wrong sparse format)
     sparse = scipy.sparse.issparse(X)
+
+    feat_type = {key: 'categorical' if value else 'numerical'
+                 for key, value in categorical.items()}
+
+    # TODO make this more cohesive to the overall structure (quick bug fix)
+    if isinstance(X, pd.DataFrame):
+        for key in X.select_dtypes(include="string").columns:
+            feat_type[key] = "string"
+
     DPP = FeatTypeSplit(
         # The difference between feat_type and categorical, is that
         # categorical has True/False instead of categorical/numerical
-        feat_type={key: 'categorical' if value else 'numerical'
-                   for key, value in categorical.items()},
+        feat_type=feat_type,
         force_sparse_output=True)
     X_transformed = DPP.fit_transform(X)
     categorical_transformed = {i: False for i in range(X_transformed.shape[1])}

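A small standalone sketch of how the metafeature code now assembles feat_type from the boolean `categorical` mapping and then re-labels string-dtyped DataFrame columns (toy data, outside the metafeature machinery):

import pandas as pd

X = pd.DataFrame({
    "colour": pd.Series(["red", "blue"], dtype="category"),
    "length": [1.2, 3.4],
    "comment": pd.Series(["ok", "too short"], dtype="string"),
})
categorical = {"colour": True, "length": False, "comment": False}

feat_type = {key: 'categorical' if value else 'numerical'
             for key, value in categorical.items()}

# String columns are re-labelled afterwards, exactly as in the hunk above
if isinstance(X, pd.DataFrame):
    for key in X.select_dtypes(include="string").columns:
        feat_type[key] = "string"

print(feat_type)  # {'colour': 'categorical', 'length': 'numerical', 'comment': 'string'}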
autosklearn/pipeline/components/data_preprocessing/feature_reduction/__init__.py

Whitespace-only changes.
autosklearn/pipeline/components/data_preprocessing/feature_reduction/truncated_svd.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
from typing import Dict, Optional, Tuple, Union

from ConfigSpace.configuration_space import ConfigurationSpace
import ConfigSpace.hyperparameters as CSH

import numpy as np

from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT

from sklearn.decomposition import TruncatedSVD


class FeatureReduction(AutoSklearnPreprocessingAlgorithm):
    """
    Reduces the features created by a bag of words encoding
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None
    ) -> None:
        self.n_components = n_components
        self.random_state = random_state

    def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
            ) -> 'FeatureReduction':
        if X.shape[1] > self.n_components:
            self.preprocessor = TruncatedSVD(n_components=self.n_components,
                                             random_state=self.random_state)
        elif X.shape[1] <= self.n_components and X.shape[1] != 1:
            self.preprocessor = TruncatedSVD(n_components=X.shape[1] - 1,
                                             random_state=self.random_state)
        else:
            raise ValueError("The text embedding consists only of a single dimension.\n"
                             "Are you sure that your text data is necessary?")
        self.preprocessor.fit(X)
        return self

    def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        return {'shortname': 'TextFeatureReduction',
                'name': 'TextFeatureReduction',
                'handles_missing_values': True,
                'handles_nominal_values': True,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'handles_multioutput': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                                        ) -> ConfigurationSpace:
        cs = ConfigurationSpace()
        cs.add_hyperparameter(
            CSH.UniformIntegerHyperparameter("n_components", lower=1, upper=10000,
                                             default_value=100, log=True))
        return cs

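The new component is essentially a guarded TruncatedSVD over a bag-of-words matrix: it caps n_components at one less than the number of input features and refuses single-dimension embeddings. A plain-sklearn sketch of the same guard (the CountVectorizer and toy corpus are illustrative, not part of this commit):

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

corpus = ["the cat sat on the mat", "the dog ate my homework",
          "cats and dogs", "homework is due tomorrow"]
bow = CountVectorizer().fit_transform(corpus)   # sparse bag-of-words matrix

n_components = 2
if bow.shape[1] <= 1:
    raise ValueError("The text embedding consists only of a single dimension.")
# Same guard as FeatureReduction.fit: never ask for more components
# than there are input features minus one
svd = TruncatedSVD(n_components=min(n_components, bow.shape[1] - 1), random_state=0)
reduced = svd.fit_transform(bow)
print(reduced.shape)    # (4, 2)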
autosklearn/pipeline/components/data_preprocessing/feature_type.py

Lines changed: 34 additions & 23 deletions
@@ -19,6 +19,8 @@
     import CategoricalPreprocessingPipeline
 from autosklearn.pipeline.components.data_preprocessing.feature_type_numerical \
     import NumericalPreprocessingPipeline
+from autosklearn.pipeline.components.data_preprocessing.feature_type_text \
+    import TextPreprocessingPipeline
 from autosklearn.pipeline.components.base import AutoSklearnComponent, AutoSklearnChoice, \
     AutoSklearnPreprocessingAlgorithm
 from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT
@@ -29,8 +31,8 @@
 
 
 class FeatTypeSplit(AutoSklearnPreprocessingAlgorithm):
-    """ This component is used to apply distinct transformations to categorical and
-    numerical features of a dataset. It is built on top of sklearn's ColumnTransformer.
+    """ This component is used to apply distinct transformations to categorical,
+    numerical and text features of a dataset. It is built on top of sklearn's ColumnTransformer.
     """
 
     def __init__(
@@ -82,9 +84,23 @@ def __init__(
             config=None, steps=pipeline, dataset_properties=dataset_properties,
             include=include, exclude=exclude, random_state=random_state,
             init_params=init_params)
+
+        # The pipeline that will be applied to the text features (i.e. columns)
+        # of the dataset
+        # Configuration of the data-preprocessor is different from the configuration of
+        # the numerical or categorical pipeline. Hence, force to None
+        # It is actually the call to set_hyperparameter who properly sets this argument
+        # TODO: Extract the child configuration space from the FeatTypeSplit to the
+        # pipeline if needed
+        self.txt_ppl = TextPreprocessingPipeline(
+            config=None, steps=pipeline, dataset_properties=dataset_properties,
+            include=include, exclude=exclude, random_state=random_state,
+            init_params=init_params)
+
         self._transformers: List[Tuple[str, AutoSklearnComponent]] = [
             ("categorical_transformer", self.categ_ppl),
             ("numerical_transformer", self.numer_ppl),
+            ("text_transformer", self.txt_ppl),
         ]
         if self.config:
             self.set_hyperparameters(self.config, init_params=init_params)
@@ -96,6 +112,7 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
         n_feats = X.shape[1]
         categorical_features = []
         numerical_features = []
+        text_features = []
         if self.feat_type is not None:
             # Make sure that we are not missing any column!
             expected = set(self.feat_type.keys())
@@ -104,42 +121,36 @@ def fit(self, X: SUPPORTED_FEAT_TYPES, y: Optional[SUPPORTED_TARGET_TYPES] = Non
             else:
                 columns = set(range(n_feats))
             if expected != columns:
-                raise ValueError("Train data has columns={} yet the feat_types are feat={}".format(
-                    expected,
-                    columns
-                ))
+                raise ValueError(f"Train data has columns={expected} yet the"
+                                 f" feat_types are feat={columns}")
             categorical_features = [key for key, value in self.feat_type.items()
                                     if value.lower() == 'categorical']
             numerical_features = [key for key, value in self.feat_type.items()
                                   if value.lower() == 'numerical']
+            text_features = [key for key, value in self.feat_type.items()
+                             if value.lower() == "string"]
 
-            # If no categorical features, assume we have a numerical only pipeline
-            if len(categorical_features) == 0:
-                sklearn_transf_spec: List[Tuple[str, BaseEstimator, List[Union[str, bool, int]]]] = [
-                    ("numerical_transformer", self.numer_ppl, [True] * n_feats)
-                ]
-            # If all features are categorical, then just the categorical transformer is used
-            elif len(numerical_features) == 0:
             sklearn_transf_spec = [
-                    ("categorical_transformer", self.categ_ppl, [True] * n_feats)
+                (name, transformer, feature_columns)
+                for name, transformer, feature_columns
+                in [
+                    ("text_transformer", self.txt_ppl, text_features),
+                    ("categorical_transformer", self.categ_ppl, categorical_features),
+                    ("numerical_transformer", self.numer_ppl, numerical_features)
+                ]
+                if len(feature_columns) > 0
             ]
-            # For the other cases, both transformers are used
         else:
-            sklearn_transf_spec = [
-                ("categorical_transformer", self.categ_ppl, categorical_features),
-                ("numerical_transformer", self.numer_ppl, numerical_features)
-            ]
+            # self.feature_type == None assumes numerical case
+            sklearn_transf_spec = [("numerical_transformer", self.numer_ppl, [True]*n_feats)]
 
         # And one last check in case feat type is None
         # And to make sure the final specification has all the columns
         # considered in the column transformer
         total_columns = sum([len(features) for name, ppl, features in sklearn_transf_spec])
         if total_columns != n_feats:
             raise ValueError("Missing columns in the specification of the data validator"
-                             " for train data={} and spec={}".format(
-                                 np.shape(X),
-                                 sklearn_transf_spec,
-                             ))
+                             f" for train data={np.shape(X)} and spec={sklearn_transf_spec}")
 
         self.sparse_ = sparse.issparse(X) or self.force_sparse_output
         self.column_transformer = sklearn.compose.ColumnTransformer(
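The rewritten fit builds the ColumnTransformer specification as a simple filter over (name, pipeline, columns) triples, so transformers with no columns are dropped instead of being special-cased. A minimal standalone sketch of that construction with plain sklearn stand-ins for the three auto-sklearn preprocessing pipelines (the stand-in estimators and column names are illustrative):

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

# Stand-ins for auto-sklearn's categorical / numerical / text preprocessing pipelines
categ_ppl = OneHotEncoder(handle_unknown="ignore")
numer_ppl = StandardScaler()
txt_ppl = FunctionTransformer()   # placeholder for TextPreprocessingPipeline

feat_type = {"city": "categorical", "age": "numerical", "bio": "string"}
categorical_features = [k for k, v in feat_type.items() if v.lower() == "categorical"]
numerical_features = [k for k, v in feat_type.items() if v.lower() == "numerical"]
text_features = [k for k, v in feat_type.items() if v.lower() == "string"]

# Same construction as the new FeatTypeSplit.fit: keep only non-empty column groups
sklearn_transf_spec = [
    (name, transformer, feature_columns)
    for name, transformer, feature_columns in [
        ("text_transformer", txt_ppl, text_features),
        ("categorical_transformer", categ_ppl, categorical_features),
        ("numerical_transformer", numer_ppl, numerical_features),
    ]
    if len(feature_columns) > 0
]
column_transformer = ColumnTransformer(transformers=sklearn_transf_spec)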
autosklearn/pipeline/components/data_preprocessing/feature_type_text.py

Lines changed: 119 additions & 0 deletions
@@ -0,0 +1,119 @@
from typing import Any, List, Dict, Optional, Tuple, Union

from ConfigSpace.configuration_space import Configuration, ConfigurationSpace

import numpy as np

from sklearn.base import BaseEstimator

from autosklearn.pipeline.components.data_preprocessing.text_encoding \
    import BagOfWordChoice
from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \
    FeatureReduction
from autosklearn.pipeline.base import (
    BasePipeline,
    DATASET_PROPERTIES_TYPE,
)
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT


class TextPreprocessingPipeline(BasePipeline):
    """This class implements a pipeline for data preprocessing of text features.
    It assumes that the data to be transformed is made only of text features.
    The steps of this pipeline are:
        1 - Vectorize: Fits a *Vecotrizer object and apply this
        2 - text feature reduction: TruncatedSVD

    Parameters
    ----------
    config : ConfigSpace.configuration_space.Configuration
        The configuration to evaluate.

    random_state : Optional[int | RandomState]
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by `np.random`."""

    def __init__(
        self,
        config: Optional[Configuration] = None,
        steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        init_params: Optional[Dict[str, Any]] = None
    ) -> None:
        self._output_dtype = np.int32
        super().__init__(
            config, steps, dataset_properties, include, exclude,
            random_state, init_params
        )

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        return {'shortname': 'txt_datapreproc',
                'name': 'text data preprocessing',
                'handles_missing_values': True,
                'handles_nominal_values': False,
                'handles_numerical_features': False,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    def _get_hyperparameter_search_space(
        self,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
    ) -> ConfigurationSpace:
        """Create the hyperparameter configuration space.

        Parameters
        ----------
        # TODO add parameter description

        Returns
        -------
        cs : ConfigSpace.configuration_space.Configuration
            The configuration space describing the SimpleRegressionClassifier.
        """
        cs = ConfigurationSpace()
        if dataset_properties is None or not isinstance(dataset_properties, dict):
            dataset_properties = dict()

        cs = self._get_base_search_space(
            cs=cs, dataset_properties=dataset_properties,
            exclude=exclude, include=include, pipeline=self.steps)

        return cs

    def _get_pipeline_steps(self,
                            dataset_properties: Optional[Dict[str, str]] = None,
                            ) -> List[Tuple[str, BaseEstimator]]:
        steps = []

        default_dataset_properties = {}
        if dataset_properties is not None and isinstance(dataset_properties, dict):
            default_dataset_properties.update(dataset_properties)

        steps.extend([
            ("text_encoding", BagOfWordChoice(default_dataset_properties,
                                              random_state=self.random_state)),
            ("feature_reduction", FeatureReduction(random_state=self.random_state))
        ])
        return steps

    def _get_estimator_hyperparameter_name(self) -> str:
        return "text data preprocessing"

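The two pipeline steps above, a bag-of-words text encoding chosen by BagOfWordChoice followed by the FeatureReduction TruncatedSVD step, correspond roughly to the plain-sklearn sketch below (CountVectorizer merely stands in for whichever encoder the choice component selects; the corpus is illustrative):

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

text_preprocessing = Pipeline(steps=[
    ("text_encoding", CountVectorizer()),                                 # stand-in for BagOfWordChoice
    ("feature_reduction", TruncatedSVD(n_components=2, random_state=0)),  # FeatureReduction analogue
])

docs = ["free text column entry one", "another short document",
        "text features need encoding", "and then dimensionality reduction"]
embedded = text_preprocessing.fit_transform(docs)
print(embedded.shape)   # (4, 2)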