Skip to content

Commit bc6e883

Browse files
committed
including comments from 01.28.
1 parent 94b9c27 commit bc6e883

File tree

3 files changed: 7 additions (+7) and 6 deletions (-6).

autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
3535
) -> 'BagOfWordEncoder':
3636

3737
if isinstance(X, pd.DataFrame):
38+
X.fillna("", inplace=True)
3839
# define a CountVectorizer for every feature (implicitly defined by order of columns,
3940
# maybe change the list
4041
# to a dictionary with features as keys)
@@ -47,7 +48,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
4748
else:
4849
raise KeyError()
4950

50-
all_text = itertools.chain.from_iterable(X[col].dropna() for col in X.columns)
51+
all_text = itertools.chain.from_iterable(X[col] for col in X.columns)
5152
self.preprocessor = self.preprocessor.fit(all_text)
5253

5354
else:

autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -34,14 +34,14 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
3434
) -> 'BagOfWordEncoder':
3535

3636
if isinstance(X, pd.DataFrame):
37+
X.fillna("", inplace=True)
3738
if self.min_df_choice == "min_df_absolute":
3839

3940
self.preprocessor = {}
4041

4142
for feature in X.columns:
4243
vectorizer = CountVectorizer(min_df=self.min_df_absolute,
43-
ngram_range=(1, self.ngram_range)).fit(
44-
X[feature].dropna())
44+
ngram_range=(1, self.ngram_range)).fit(X[feature])
4545
self.preprocessor[feature] = vectorizer
4646

4747
elif self.min_df_choice == "min_df_relative":
@@ -50,8 +50,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
5050

5151
for feature in X.columns:
5252
vectorizer = CountVectorizer(min_df=self.min_df_relative,
53-
ngram_range=(1, self.ngram_range)).fit(
54-
X[feature].dropna())
53+
ngram_range=(1, self.ngram_range)).fit(X[feature])
5554
self.preprocessor[feature] = vectorizer
5655
else:
5756
raise KeyError()

autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
3636
) -> 'TfidfEncoder':
3737

3838
if isinstance(X, pd.DataFrame):
39+
X.fillna("", inplace=True)
3940
# define a CountVectorizer for every feature (implicitly defined by order of columns,
4041
# maybe change the list
4142
# to a dictionary with features as keys)
@@ -50,7 +51,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
5051
else:
5152
raise KeyError()
5253

53-
all_text = itertools.chain.from_iterable(X[col].dropna() for col in X.columns)
54+
all_text = itertools.chain.from_iterable(X[col] for col in X.columns)
5455
self.preprocessor = self.preprocessor.fit(all_text)
5556

5657
else:

0 commit comments

Comments
 (0)