Skip to content

Commit 94b9c27

Browse files
committed
including comments from 01.28.
1 parent 5f6d6a7 commit 94b9c27

File tree

5 files changed

+10
-69
lines changed

5 files changed

+10
-69
lines changed

autosklearn/pipeline/components/data_preprocessing/feature_type_text.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,8 +109,9 @@ def _get_pipeline_steps(self,
109109
default_dataset_properties.update(dataset_properties)
110110

111111
steps.extend([
112-
("text_encoding", BagOfWordChoice(default_dataset_properties)),
113-
("feature_reduction", FeatureReduction())
112+
("text_encoding", BagOfWordChoice(default_dataset_properties,
113+
random_state=self.random_state)),
114+
("feature_reduction", FeatureReduction(random_state=self.random_state))
114115
])
115116
return steps
116117

autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
5757
return self
5858

5959
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
60+
X.fillna("", inplace=True)
6061
if self.preprocessor is None:
6162
raise NotImplementedError()
6263
return sum(self.preprocessor.transform(X[feature] for feature in X.columns))

autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,8 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
4040

4141
for feature in X.columns:
4242
vectorizer = CountVectorizer(min_df=self.min_df_absolute,
43-
ngram_range=(1, self.ngram_range)).fit(X[feature])
43+
ngram_range=(1, self.ngram_range)).fit(
44+
X[feature].dropna())
4445
self.preprocessor[feature] = vectorizer
4546

4647
elif self.min_df_choice == "min_df_relative":
@@ -49,7 +50,8 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
4950

5051
for feature in X.columns:
5152
vectorizer = CountVectorizer(min_df=self.min_df_relative,
52-
ngram_range=(1, self.ngram_range)).fit(X[feature])
53+
ngram_range=(1, self.ngram_range)).fit(
54+
X[feature].dropna())
5355
self.preprocessor[feature] = vectorizer
5456
else:
5557
raise KeyError()
@@ -60,6 +62,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
6062
return self
6163

6264
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
65+
X.fillna("", inplace=True)
6366
X_new = None
6467
if self.preprocessor is None:
6568
raise NotImplementedError()

autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
6060
return self
6161

6262
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
63+
X.fillna("", inplace=True)
6364
if self.preprocessor is None:
6465
raise NotImplementedError()
6566
return sum(self.preprocessor.transform(X[feature]) for feature in X.columns)

test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py

Lines changed: 0 additions & 65 deletions
This file was deleted.

0 commit comments

Comments
 (0)