-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Text Processing #1300
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Text Processing #1300
Changes from all commits
4450d86
e821eaf
ae4f59f
d0a10ab
65271a9
55e87e2
590387d
2809c46
ffe8ccf
8094eb5
1a27144
63e6fdb
1a2f66d
107e854
88aa101
11f092f
d5a03d6
220807e
38ffd06
fa7c8e7
2e1947a
b56f05f
0d95435
3a00674
cabdb66
fdd7007
8fe74a4
20caf09
42a7bdb
b7bc8fb
e85eb2e
cafb1d4
b9da42d
d2d5a24
ac40ff9
38be7c3
5f6d6a7
94b9c27
bc6e883
ce1c0d1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
from typing import Dict, Optional, Tuple, Union | ||
|
||
from ConfigSpace.configuration_space import ConfigurationSpace | ||
import ConfigSpace.hyperparameters as CSH | ||
|
||
import numpy as np | ||
|
||
from autosklearn.pipeline.base import DATASET_PROPERTIES_TYPE, PIPELINE_DATA_DTYPE | ||
from autosklearn.pipeline.components.base import AutoSklearnPreprocessingAlgorithm | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
from sklearn.decomposition import TruncatedSVD | ||
|
||
|
||
class FeatureReduction(AutoSklearnPreprocessingAlgorithm):
    """
    Reduces the dimensionality of the feature matrix produced by a
    bag-of-words text encoding, using sklearn's TruncatedSVD.
    """

    def __init__(
        self,
        n_components: Optional[int] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None
    ) -> None:
        # Target dimensionality; normally supplied from the configuration
        # space when this component runs inside the pipeline.
        self.n_components = n_components
        self.random_state = random_state
        # Initialize explicitly so transform() can detect an unfitted
        # component instead of raising AttributeError.
        self.preprocessor: Optional[TruncatedSVD] = None

    def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
            ) -> 'FeatureReduction':
        """Fit a TruncatedSVD on X.

        The number of output components is capped at ``X.shape[1] - 1``
        (TruncatedSVD requires n_components < n_features).

        Raises
        ------
        ValueError
            If ``n_components`` was never set, or if X has only a single
            feature (SVD reduction would be meaningless).
        """
        if self.n_components is None:
            # The original comparison below would raise an opaque TypeError
            # when n_components is None; fail with a clear message instead.
            raise ValueError("FeatureReduction requires 'n_components' to be "
                             "set before fit() is called.")
        n_components = int(self.n_components)
        if X.shape[1] > n_components:
            self.preprocessor = TruncatedSVD(n_components=n_components,
                                             random_state=self.random_state)
        elif X.shape[1] <= n_components and X.shape[1] != 1:
            # Fewer input features than requested components: reduce to the
            # maximum TruncatedSVD supports.
            self.preprocessor = TruncatedSVD(n_components=X.shape[1] - 1,
                                             random_state=self.random_state)
        else:
            raise ValueError("The text embedding consists only of a single dimension.\n"
                             "Are you sure that your text data is necessary?")
        self.preprocessor.fit(X)
        return self

    def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
        """Apply the fitted TruncatedSVD to X; requires fit() first."""
        if self.preprocessor is None:
            raise NotImplementedError()
        return self.preprocessor.transform(X)

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        return {'shortname': 'TextFeatureReduction',
                'name': 'TextFeatureReduction',
                'handles_missing_values': True,
                'handles_nominal_values': True,
                'handles_numerical_features': True,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'handles_multioutput': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    @staticmethod
    def get_hyperparameter_search_space(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                                        ) -> ConfigurationSpace:
        cs = ConfigurationSpace()
        # Log-scale search over the SVD target dimensionality.
        cs.add_hyperparameter(
            CSH.UniformIntegerHyperparameter("n_components", lower=1, upper=10000,
                                             default_value=100, log=True))
        return cs
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
from typing import Any, List, Dict, Optional, Tuple, Union | ||
|
||
from ConfigSpace.configuration_space import Configuration, ConfigurationSpace | ||
|
||
import numpy as np | ||
|
||
from sklearn.base import BaseEstimator | ||
|
||
from autosklearn.pipeline.components.data_preprocessing.text_encoding \ | ||
import BagOfWordChoice | ||
from autosklearn.pipeline.components.data_preprocessing.feature_reduction.truncated_svd import \ | ||
FeatureReduction | ||
from autosklearn.pipeline.base import ( | ||
BasePipeline, | ||
DATASET_PROPERTIES_TYPE, | ||
) | ||
from autosklearn.pipeline.constants import DENSE, SPARSE, UNSIGNED_DATA, INPUT | ||
|
||
|
||
class TextPreprocessingPipeline(BasePipeline):
    """Data-preprocessing pipeline for text features.

    Assumes every column of the incoming data is a text feature and applies,
    in order:

    1. ``text_encoding`` -- a bag-of-words vectorizer (``BagOfWordChoice``)
    2. ``feature_reduction`` -- dimensionality reduction via TruncatedSVD

    Parameters
    ----------
    config : ConfigSpace.configuration_space.Configuration
        The configuration to evaluate.

    random_state : Optional[int | RandomState]
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance
        used by `np.random`.
    """

    def __init__(
        self,
        config: Optional[Configuration] = None,
        steps: Optional[List[Tuple[str, BaseEstimator]]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        random_state: Optional[Union[int, np.random.RandomState]] = None,
        init_params: Optional[Dict[str, Any]] = None
    ) -> None:
        self._output_dtype = np.int32
        super().__init__(
            config, steps, dataset_properties, include, exclude,
            random_state, init_params
        )

    @staticmethod
    def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
                       ) -> Dict[str, Optional[Union[str, int, bool, Tuple]]]:
        return {'shortname': 'txt_datapreproc',
                'name': 'text data preprocessing',
                'handles_missing_values': True,
                'handles_nominal_values': False,
                'handles_numerical_features': False,
                'prefers_data_scaled': False,
                'prefers_data_normalized': False,
                'handles_regression': True,
                'handles_classification': True,
                'handles_multiclass': True,
                'handles_multilabel': True,
                'is_deterministic': True,
                'handles_sparse': True,
                'handles_dense': True,
                'input': (DENSE, SPARSE, UNSIGNED_DATA),
                'output': (INPUT,),
                'preferred_dtype': None}

    def _get_hyperparameter_search_space(
        self,
        include: Optional[Dict[str, str]] = None,
        exclude: Optional[Dict[str, str]] = None,
        dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None,
    ) -> ConfigurationSpace:
        """Create the hyperparameter configuration space.

        Returns
        -------
        cs : ConfigSpace.configuration_space.Configuration
            The configuration space describing the SimpleRegressionClassifier.
        """
        # Guard: a missing or malformed dataset_properties is treated as empty.
        if not isinstance(dataset_properties, dict):
            dataset_properties = {}

        return self._get_base_search_space(
            cs=ConfigurationSpace(), dataset_properties=dataset_properties,
            exclude=exclude, include=include, pipeline=self.steps)

    def _get_pipeline_steps(self,
                            dataset_properties: Optional[Dict[str, str]] = None,
                            ) -> List[Tuple[str, BaseEstimator]]:
        # Propagate dataset properties to the encoder choice when available.
        props: Dict[str, str] = {}
        if isinstance(dataset_properties, dict):
            props.update(dataset_properties)

        return [
            ("text_encoding", BagOfWordChoice(props,
                                              random_state=self.random_state)),
            ("feature_reduction",
             FeatureReduction(random_state=self.random_state)),
        ]

    def _get_estimator_hyperparameter_name(self) -> str:
        return "text data preprocessing"
Uh oh!
There was an error while loading. Please reload this page.