Skip to content

Commit ce1c0d1

Browse files
committed
including comments from 01.31.
1 parent bc6e883 commit ce1c0d1

File tree

3 files changed

+109
-2
lines changed

3 files changed

+109
-2
lines changed

autosklearn/pipeline/components/data_preprocessing/text_encoding/bag_of_word_encoding.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,15 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
5959

6060
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
    """Encode each column of ``X`` with ``self.preprocessor`` and sum the results.

    Every column is passed separately to ``self.preprocessor.transform`` and
    the per-column outputs are added element-wise, so the result has the
    shape of a single column's encoding.

    Parameters
    ----------
    X:
        Frame of text columns. Missing values are treated as empty strings.
        The caller's object is not modified.

    Returns
    -------
    The sum of the per-column encodings, or ``None`` if ``X`` has no columns.

    Raises
    ------
    NotImplementedError
        If ``self.preprocessor`` is ``None`` (presumably: ``fit`` has not
        been called yet -- confirm against ``fit``).
    """
    # Check the fitted state first so a failing call has no side effects.
    if self.preprocessor is None:
        raise NotImplementedError()

    # Non-inplace fill: work on a filled copy instead of mutating the
    # DataFrame that the caller handed in.
    X = X.fillna("")

    X_transformed = None
    for feature in X.columns:
        encoded = self.preprocessor.transform(X[feature])
        if X_transformed is None:
            X_transformed = encoded
        else:
            X_transformed += encoded
    return X_transformed
6571

6672
@staticmethod
6773
def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None

autosklearn/pipeline/components/data_preprocessing/text_encoding/tfidf_encoding.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,9 +62,15 @@ def fit(self, X: PIPELINE_DATA_DTYPE, y: Optional[PIPELINE_DATA_DTYPE] = None
6262

6363
def transform(self, X: PIPELINE_DATA_DTYPE) -> PIPELINE_DATA_DTYPE:
    """Encode each column of ``X`` with ``self.preprocessor`` and sum the results.

    Every column is passed separately to ``self.preprocessor.transform`` and
    the per-column outputs are added element-wise, so the result has the
    shape of a single column's encoding.

    Parameters
    ----------
    X:
        Frame of text columns. Missing values are treated as empty strings.
        The caller's object is not modified.

    Returns
    -------
    The sum of the per-column encodings, or ``None`` if ``X`` has no columns.

    Raises
    ------
    NotImplementedError
        If ``self.preprocessor`` is ``None`` (presumably: the tf-idf
        vectorizer has not been fitted yet -- confirm against ``fit``).
    """
    # Check the fitted state first so a failing call has no side effects.
    if self.preprocessor is None:
        raise NotImplementedError()

    # Non-inplace fill: work on a filled copy instead of mutating the
    # DataFrame that the caller handed in.
    X = X.fillna("")

    X_transformed = None
    for feature in X.columns:
        encoded = self.preprocessor.transform(X[feature])
        if X_transformed is None:
            X_transformed = encoded
        else:
            X_transformed += encoded
    return X_transformed
6874

6975
@staticmethod
7076
def get_properties(dataset_properties: Optional[DATASET_PROPERTIES_TYPE] = None
Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
import unittest
2+
import numpy as np
3+
import pandas as pd
4+
5+
from autosklearn.pipeline.components.data_preprocessing.text_encoding.bag_of_word_encoding import \
6+
BagOfWordEncoder as BOW
7+
from autosklearn.pipeline.components.data_preprocessing.\
8+
text_encoding.bag_of_word_encoding_distinct import BagOfWordEncoder as BOW_distinct
9+
10+
11+
class TextPreprocessingPipelineTest(unittest.TestCase):
    """Tests for the two bag-of-words text encoders.

    ``BOW`` is fitted on all text columns together and exposes a single
    ``preprocessor`` with one ``vocabulary_``; ``BOW_distinct`` exposes a
    mapping ``preprocessor[column]`` with one ``vocabulary_`` per column.
    """

    # Hyperparameters shared by every encoder instantiated in these tests.
    _ENCODER_KWARGS = {
        "ngram_range": 1,
        "min_df_choice": "min_df_absolute",
        "min_df_absolute": 0,
        "min_df_relative": 0,
        "random_state": 1,
    }

    @staticmethod
    def _text_frame(col1, col2):
        """Build a two-column DataFrame whose columns use the pandas "string" dtype."""
        frame = pd.DataFrame({"col1": col1, "col2": col2})
        return frame.astype({"col1": "string", "col2": "string"})

    def test_fit_transform(self):
        """Fitting yields the expected vocabulary, jointly and per column."""
        X = self._text_frame(
            ["hello world", "This is a test"],
            ["hello mars", "This is the second column"],
        )

        # Encoders always receive a copy so X stays pristine for the
        # expectations below, whatever the encoder does to its input.
        BOW_fitted = BOW(**self._ENCODER_KWARGS).fit(X.copy())

        # "a" is not listed: single-character tokens (len == 1) are ignored
        # by CountVectorizer.
        words = sorted(["hello", "world", "this", "is", "test",
                        "mars", "the", "second", "column"])
        expected_vocabulary = {word: idx for idx, word in enumerate(words)}

        # Plain dict comparison gives a readable diff on failure, unlike
        # coercing both dicts to 0-d object arrays.
        self.assertEqual(BOW_fitted.preprocessor.vocabulary_, expected_vocabulary)

        BOW_fitted = BOW_distinct(**self._ENCODER_KWARGS).fit(X.copy())

        for key in BOW_fitted.preprocessor:
            # A vocabulary holds each token once, so collect the expected
            # tokens in a set; repeated words must not break the comparison.
            expected_words = set()
            for text in X[key]:
                expected_words.update(
                    word for word in text.lower().split(" ") if len(word) > 1
                )
            fitted_words = sorted(BOW_fitted.preprocessor[key].vocabulary_.keys())
            self.assertListEqual(fitted_words, sorted(expected_words))

    def test_transform(self):
        """``fit_transform`` produces the expected count matrices."""
        X = self._text_frame(
            ["hello world", "this is a test"],
            ["hello mars", "this is the second column"],
        )

        X_t = BOW(**self._ENCODER_KWARGS).fit_transform(X.copy())

        # Shared vocabulary, counts summed over both columns:
        # ['column', 'hello', 'is', 'mars', 'second', 'test', 'the', 'this', 'world']
        y = np.array([[0, 2, 0, 1, 0, 0, 0, 0, 1],
                      [1, 0, 2, 0, 1, 1, 1, 2, 0]])
        np.testing.assert_array_equal(X_t.toarray(), y)

        X_t = BOW_distinct(**self._ENCODER_KWARGS).fit_transform(X.copy())

        # One vocabulary per column, laid out side by side:
        # 'hello', 'is', 'test', 'this', 'world',
        # 'column', 'hello', 'is', 'mars', 'second', 'the', 'this'
        y = np.array([[1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
                      [0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1]])
        np.testing.assert_array_equal(X_t.toarray(), y)

    def test_check_shape(self):
        """Output width equals the vocabulary size (shared vs. per column)."""
        X = self._text_frame(
            ["hello world", "this is test"],
            ["test test", "test test"],
        )

        X_t = BOW(**self._ENCODER_KWARGS).fit_transform(X.copy())
        # Five distinct tokens overall: hello, world, this, is, test.
        self.assertEqual(X_t.shape, (2, 5))

        X_t = BOW_distinct(**self._ENCODER_KWARGS).fit_transform(X.copy())
        # Five tokens in col1 plus one ("test") in col2.
        self.assertEqual(X_t.shape, (2, 6))

    def test_check_nan(self):
        """A missing value in a text column does not change the output shape."""
        X = self._text_frame(
            ["hello world", "this is test", None],
            ["test test", "test test", "test"],
        )

        X_t = BOW(**self._ENCODER_KWARGS).fit_transform(X.copy())
        self.assertEqual(X_t.shape, (3, 5))

        X_t = BOW_distinct(**self._ENCODER_KWARGS).fit_transform(X.copy())
        self.assertEqual(X_t.shape, (3, 6))

0 commit comments

Comments
 (0)