-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Text Processing #1300
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Text Processing #1300
Changes from all commits
4450d86
e821eaf
ae4f59f
d0a10ab
65271a9
55e87e2
590387d
2809c46
ffe8ccf
8094eb5
1a27144
63e6fdb
1a2f66d
107e854
88aa101
11f092f
d5a03d6
220807e
38ffd06
fa7c8e7
2e1947a
b56f05f
0d95435
3a00674
cabdb66
fdd7007
8fe74a4
20caf09
42a7bdb
b7bc8fb
e85eb2e
cafb1d4
b9da42d
d2d5a24
ac40ff9
38be7c3
5f6d6a7
94b9c27
bc6e883
ce1c0d1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from typing import Dict, Optional, Tuple, Union | ||
|
||
from ConfigSpace.configuration_space import ConfigurationSpace | ||
import ConfigSpace.hyperparameters as CSH | ||
|
||
import numpy as np | ||
|
||
from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE | ||
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
from sklearn.decomposition import TruncatedSVD | ||
|
||
|
||
class FeatureReduction(AutoSklearnPreprocessingAlgorithm):
    """
    Reduces the dimensionality of the feature matrix produced by a
    bag-of-words text encoding, using sklearn's TruncatedSVD.
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None
    ) -> None:
        # Target dimensionality; normally supplied from the configuration
        # space when this component runs inside the pipeline.
        self.n_components = n_components
        self.random_state = random_state
        # Initialize explicitly so transform() can detect an unfitted
        # component instead of raising AttributeError.
        self.preprocessor: Optional[TruncatedSVD] = None

    def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
            ) -> 'FeatureReduction':
        """Fit a TruncatedSVD on X.

        The number of output components is capped at ``X.shape[1] - 1``
        (TruncatedSVD requires n_components < n_features).

        Raises
        ------
        ValueError
            If ``n_components`` was never set, or if X has only a single
            feature (SVD reduction would be meaningless).
        """
        if self.n_components is None:
            # The original comparison below would raise an opaque TypeError
            # when n_components is None; fail with a clear message instead.
            raise ValueError("FeatureReduction requires 'n_components' to be "
                             "set before fit() is called.")
        n_components = int(self.n_components)
        if X.shape[1] > n_components:
            self.preprocessor = TruncatedSVD(n_components=n_components,
                                             random_state=self.random_state)
        elif X.shape[1] <= n_components and X.shape[1] != 1:
            # Fewer input features than requested components: reduce to the
            # maximum TruncatedSVD supports.
            self.preprocessor = TruncatedSVD(n_components=X.shape[1] - 1,
                                             random_state=self.random_state)
        else:
            raise ValueError("The text embedding consists only of a single dimension.\n"
                             "Are you sure that your text data is necessary?")
        self.preprocessor.fit(X)
        return self

    def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
        """Apply the fitted TruncatedSVD to X; requires fit() first."""
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        return {'shortname': 'TextFeatureReduction',
                'name': 'TextFeatureReduction',
                'handles_missing_values': True,
                'handles_nominal_values': True,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'handles_multioutput': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                                        ) -> ConfigurationSpace:
        cs = ConfigurationSpace()
        # Log-scale search over the SVD target dimensionality.
        cs.add_hyperparameter(
            CSH.UniformIntegerHyperparameter("n_components", lower=1, upper=10000,
                                             default_value=100, log=True))
        return cs
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from typing import Any, List, Dict, Optional, Tuple, Union | ||
|
||
from ConfigSpace.configuration_space import Configuration, ConfigurationSpace | ||
|
||
import numpy as np | ||
|
||
from sklearn.base import BaseEstimator | ||
|
||
from autosklearn.pipeline.components.data_preprocessing.text_encoding \ | ||
import BagOfWordChoice | ||
from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \ | ||
FeatureReduction | ||
from autosklearn.pipeline.base import ( | ||
BasePipeline, | ||
DATASET_PROPERTIES_TYPE, | ||
) | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
|
||
class TextPreprocessingPipeline(BasePipeline):
    """Data-preprocessing pipeline for text features.

    Assumes every column of the incoming data is a text feature and applies,
    in order:

    1. ``text_encoding`` -- a bag-of-words vectorizer (``BagOfWordChoice``)
    2. ``feature_reduction`` -- dimensionality reduction via TruncatedSVD

    Parameters
    ----------
    config : ConfigSpace.configuration_space.Configuration
        The configuration to evaluate.

    random_state : Optional[int | RandomState]
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by `np.random`.
    """

    def __init__(
        self,
        config: Optional[Configuration] = None,
        steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        init_params: Optional[Dict[str, Any]] = None
    ) -> None:
        self._output_dtype = np.int32
        super().__init__(
            config, steps, dataset_properties, include, exclude,
            random_state, init_params
        )

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        return {'shortname': 'txt_datapreproc',
                'name': 'text data preprocessing',
                'handles_missing_values': True,
                'handles_nominal_values': False,
                'handles_numerical_features': False,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    def _get_hyperparameter_search_space(
        self,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
    ) -> ConfigurationSpace:
        """Create the hyperparameter configuration space.

        Returns
        -------
        cs : ConfigSpace.configuration_space.Configuration
            The configuration space describing the SimpleRegressionClassifier.
        """
        # Guard: a missing or malformed dataset_properties is treated as empty.
        if not isinstance(dataset_properties, dict):
            dataset_properties = {}

        return self._get_base_search_space(
            cs=ConfigurationSpace(), dataset_properties=dataset_properties,
            exclude=exclude, include=include, pipeline=self.steps)

    def _get_pipeline_steps(self,
                            dataset_properties: Optional[Dict[str, str]] = None,
                            ) -> List[Tuple[str, BaseEstimator]]:
        # Propagate dataset properties to the encoder choice when available.
        props: Dict[str, str] = {}
        if isinstance(dataset_properties, dict):
            props.update(dataset_properties)

        return [
            ("text_encoding", BagOfWordChoice(props,
                                              random_state=self.random_state)),
            ("feature_reduction",
             FeatureReduction(random_state=self.random_state)),
        ]

    def _get_estimator_hyperparameter_name(self) -> str:
        return "text data preprocessing"
Uh oh!
There was an error while loading. Please reload this page.