Incorporates the comments from 01.28.

Louquinze · Louquinze · commit 94b9c27dd44c · 2022-01-28T17:58:42.000+01:00
diff --git a/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py b/autosklearn/pipeline/components/data_preprocessing/feature_type_text.py
@@ -109,8 +109,9 @@ def _get_pipeline_steps(self,
             default_dataset_properties.update(dataset_properties)
 
         steps.extend([
-            ("text_encoding", BagOfWordChoice(default_dataset_properties)),
-            ("feature_reduction", FeatureReduction())
+            ("text_encoding", BagOfWordChoice(default_dataset_properties,
+                                              random_state=self.random_state)),
+            ("feature_reduction", FeatureReduction(random_state=self.random_state))
         ])
         return steps
 
diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py
@@ -57,6 +57,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
         return self
 
     def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
+        X.fillna("", inplace=True)
         if self.preprocessor is None:
             raise NotImplementedError()
         return sum(self.preprocessor.transform(X[feature] for feature in X.columns))
diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding_distinct.py
@@ -40,7 +40,8 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
 
                 for feature in X.columns:
                     vectorizer = CountVectorizer(min_df=self.min_df_absolute,
-                                                 ngram_range=(1, self.ngram_range)).fit(X[feature])
+                                                 ngram_range=(1, self.ngram_range)).fit(
+                        X[feature].dropna())
                     self.preprocessor[feature] = vectorizer
 
             elif self.min_df_choice == "min_df_relative":
@@ -49,7 +50,8 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
 
                 for feature in X.columns:
                     vectorizer = CountVectorizer(min_df=self.min_df_relative,
-                                                 ngram_range=(1, self.ngram_range)).fit(X[feature])
+                                                 ngram_range=(1, self.ngram_range)).fit(
+                        X[feature].dropna())
                     self.preprocessor[feature] = vectorizer
             else:
                 raise KeyError()
@@ -60,6 +62,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
         return self
 
     def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
+        X.fillna("", inplace=True)
         X_new = None
         if self.preprocessor is None:
             raise NotImplementedError()
diff --git a/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py b/autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py
@@ -60,6 +60,7 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
         return self
 
     def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
+        X.fillna("", inplace=True)
         if self.preprocessor is None:
             raise NotImplementedError()
         return sum(self.preprocessor.transform(X[feature]) for feature in X.columns)
diff --git a/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py b/test/test_pipeline/components/data_preprocessing/test_data_preprocessing_text.py