55import numpy as np
66
77
8-
9-
108def test_imputer ():
119 data = pd .DataFrame (
1210 {"col1" : [None , 4 , 4 , 7 ], "col2" : [2 , None , None , 2 ], "col3" : [3 , None , 6 , 6 ]}
@@ -64,6 +62,9 @@ def test_imputer():
6462 with raises (Exception ):
6563 eda_utils_py .imputer (data , strategy = "median" , fill_value = 3 )
6664
65+ with raises (Exception ):
66+ eda_utils_py .imputer (data , strategy = "others" )
67+
6768 assert pd .DataFrame .equals (
6869 eda_utils_py .imputer (data ), imp_mean
6970 ), "The returned dataframe using mean inputer is not correct"
@@ -101,7 +102,7 @@ def test_cor_map():
101102
102103 # Tests whether or not there are NaNs produced in the correlation values
103104 assert (
104- plot .data ["cor" ].isnull ().sum () == 0
105+ plot .data ["cor" ].isnull ().sum () == 0
105106 ), "There are NaN produced as correlation values"
106107
107108 # Tests whether plot output scheme is one of the three given color schemes
@@ -117,20 +118,20 @@ def test_cor_map():
117118
118119 # Tests whether heatmap and correlation values have the same referenced var column
119120 assert (
120- plot_dict ["layer" ][0 ]["encoding" ]["x" ]["field" ]
121- == plot_dict ["layer" ][1 ]["encoding" ]["x" ]["field" ]
121+ plot_dict ["layer" ][0 ]["encoding" ]["x" ]["field" ]
122+ == plot_dict ["layer" ][1 ]["encoding" ]["x" ]["field" ]
122123 ), "The heatmap and the correlation values are not referring to the same corresponding underlying variable x"
123124 assert (
124- plot_dict ["layer" ][0 ]["encoding" ]["y" ]["field" ]
125- == plot_dict ["layer" ][1 ]["encoding" ]["y" ]["field" ]
125+ plot_dict ["layer" ][0 ]["encoding" ]["y" ]["field" ]
126+ == plot_dict ["layer" ][1 ]["encoding" ]["y" ]["field" ]
126127 ), "The heatmap and the correlation values are not referring to the same corresponding underlying variable y"
127128
128129 # Tests whether axes is using correct calculated var column as reference
129130 assert (
130- plot_dict ["layer" ][0 ]["encoding" ]["x" ]["field" ] == "var1"
131+ plot_dict ["layer" ][0 ]["encoding" ]["x" ]["field" ] == "var1"
131132 ), "x should be referring to var1"
132133 assert (
133- plot_dict ["layer" ][0 ]["encoding" ]["y" ]["field" ] == "var2"
134+ plot_dict ["layer" ][0 ]["encoding" ]["y" ]["field" ] == "var2"
134135 ), "y should be referring to var2"
135136
136137 # Testing the Exception Errors
@@ -173,56 +174,68 @@ def test_cor_map():
173174
174175def test_scaler ():
175176 mock_df_1 = pd .DataFrame (
176- {"col1" : [1 , 0 , 0 , 3 , 4 ],
177- "col2" : [4 , 1 , 1 , 0 , 1 ],
178- "col3" : [2 , 0 , 0 , 2 , 1 ]}
177+ {"col1" : [1 , 0 , 0 , 3 , 4 ], "col2" : [4 , 1 , 1 , 0 , 1 ], "col3" : [2 , 0 , 0 , 2 , 1 ]}
179178 )
180179
181- mock_df_2 = pd .DataFrame (
182- {"col1" : [1 , 2 , 1 ],
183- "col2" : [0 , 1 , 2 ]}
184- )
180+ mock_df_2 = pd .DataFrame ({"col1" : [1 , 2 , 1 ], "col2" : [0 , 1 , 2 ]})
185181
186182 mock_df_1_standard = pd .DataFrame (
187- {"col1" : [- 0.3302891295379082 , - 0.8807710121010884 , - 0.8807710121010884 , 0.7706746355884523 ,
188- 1.3211565181516325 ],
189- "col2" : [1.714389230829046 , - 0.26375218935831474 , - 0.26375218935831474 , - 0.9231326627541017 ,
190- - 0.26375218935831474 ],
191- "col3" : [1.0 , - 1.0 , - 1.0 , 1.0 , 0.0 ]}
183+ {
184+ "col1" : [
185+ - 0.3302891295379082 ,
186+ - 0.8807710121010884 ,
187+ - 0.8807710121010884 ,
188+ 0.7706746355884523 ,
189+ 1.3211565181516325 ,
190+ ],
191+ "col2" : [
192+ 1.714389230829046 ,
193+ - 0.26375218935831474 ,
194+ - 0.26375218935831474 ,
195+ - 0.9231326627541017 ,
196+ - 0.26375218935831474 ,
197+ ],
198+ "col3" : [1.0 , - 1.0 , - 1.0 , 1.0 , 0.0 ],
199+ }
192200 )
193201
194202 mock_df_1_minmax = pd .DataFrame (
195- {"col1" : [0.25 , 0.00 , 0.00 , 0.75 , 1.00 ],
196- "col2" : [1.00 , 0.25 , 0.25 , 0.00 , 0.25 ],
197- "col3" : [1.0 , 0.0 , 0.0 , 1.0 , 0.5 ]}
203+ {
204+ "col1" : [0.25 , 0.00 , 0.00 , 0.75 , 1.00 ],
205+ "col2" : [1.00 , 0.25 , 0.25 , 0.00 , 0.25 ],
206+ "col3" : [1.0 , 0.0 , 0.0 , 1.0 , 0.5 ],
207+ }
198208 )
199209
200210 mock_df_2_standard = pd .DataFrame (
201- {"col1" : [- 0.5773502691896256 , 1.1547005383792517 , - 0.5773502691896256 ],
202- "col2" : [- 1.0 , 0.0 , 1.0 ]}
211+ {
212+ "col1" : [- 0.5773502691896256 , 1.1547005383792517 , - 0.5773502691896256 ],
213+ "col2" : [- 1.0 , 0.0 , 1.0 ],
214+ }
203215 )
204216
205- mock_df_2_minmax = pd .DataFrame (
206- {"col1" : [0.0 , 1.0 , 0.0 ],
207- "col2" : [0.0 , 0.5 , 1.0 ]}
208- )
217+ mock_df_2_minmax = pd .DataFrame ({"col1" : [0.0 , 1.0 , 0.0 ], "col2" : [0.0 , 0.5 , 1.0 ]})
209218
210- standard_scaled_mock_df_1 = eda_utils_py .scale (mock_df_1 , ['col1' , 'col2' , 'col3' ])
211- standard_scaled_mock_df_2 = eda_utils_py .scale (mock_df_2 , ['col1' , 'col2' ])
212- minmax_scaled_mock_df_1 = eda_utils_py .scale (mock_df_1 , ['col1' , 'col2' , 'col3' ], scaler = "minmax" )
213- minmax_scaled_mock_df_2 = eda_utils_py .scale (mock_df_2 , ['col1' , 'col2' ], scaler = "minmax" )
219+ standard_scaled_mock_df_1 = eda_utils_py .scale (mock_df_1 , ["col1" , "col2" , "col3" ])
220+ standard_scaled_mock_df_2 = eda_utils_py .scale (mock_df_2 , ["col1" , "col2" ])
221+ minmax_scaled_mock_df_1 = eda_utils_py .scale (
222+ mock_df_1 , ["col1" , "col2" , "col3" ], scaler = "minmax"
223+ )
224+ minmax_scaled_mock_df_2 = eda_utils_py .scale (
225+ mock_df_2 , ["col1" , "col2" ], scaler = "minmax"
226+ )
214227
215228 # Tests whether data is not of type pd.Dataframe raises TypeError
216229 with raises (TypeError ):
217230 eda_utils_py .scale ([14 , None , 3 , 27 ])
218231
219232 # Tests whether scaler of incorrect method raises TypeError
220233 with raises (TypeError ):
221- eda_utils_py .scale (mock_df_1 , [' col1' , ' col2' ], scaler = 1 )
234+ eda_utils_py .scale (mock_df_1 , [" col1" , " col2" ], scaler = 1 )
222235
223236 # Tests whether columns of incorrect type raises TypeError
224237 with raises (TypeError ):
225- eda_utils_py .scale (mock_df_1 , {' col1' : 1 , ' col2' : 3 })
238+ eda_utils_py .scale (mock_df_1 , {" col1" : 1 , " col2" : 3 })
226239
227240 assert pd .DataFrame .equals (
228241 standard_scaled_mock_df_1 , mock_df_1_standard
@@ -240,47 +253,109 @@ def test_scaler():
240253
241254
242255def test_outlier_identifier ():
243- test_df = pd .DataFrame ({
244- 'SepalLengthCm' : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 50 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
245- 'SepalWidthCm' : [1.4 , 1.4 , 20 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
246- 'PetalWidthCm' : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 5 ],
247- 'Species' : ['Iris-setosa' , 'Iris-virginica' , 'Iris-germanica' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ,
248- 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ]
249- })
250-
251- test_column = ['SepalLengthCm' , 'SepalWidthCm' , 'PetalWidthCm' ]
252-
253- median_output = pd .DataFrame ({
254- 'SepalLengthCm' : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 5.1 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
255- 'SepalWidthCm' : [1.4 , 1.4 , 1.5 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
256- 'PetalWidthCm' : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 0.4 ],
257- 'Species' : ['Iris-setosa' , 'Iris-virginica' , 'Iris-germanica' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ,
258- 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ]
259- })
260-
261- trim_output = pd .DataFrame ({
262- 'SepalLengthCm' : [5.1 , 4.9 , 5.5 , 5.1 , 5.4 , 5.0 , 5.2 , 5.3 ],
263- 'SepalWidthCm' : [1.4 , 1.4 , 2.0 , 0.7 , 1.2 , 1.4 , 1.8 , 1.5 ],
264- 'PetalWidthCm' : [0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.4 , 0.2 ],
265- 'Species' : ['Iris-setosa' , 'Iris-virginica' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ,
266- 'Iris-setosa' , 'Iris-setosa' ]
267- })
268-
269- mean_output = pd .DataFrame ({
270- 'SepalLengthCm' : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 9.21 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
271- 'SepalWidthCm' : [1.4 , 1.4 , 3.19 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
272- 'PetalWidthCm' : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 0.77 ],
273- 'Species' : ['Iris-setosa' , 'Iris-virginica' , 'Iris-germanica' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ,
274- 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ]
275- })
276-
277- column_output = pd .DataFrame ({
278- 'SepalLengthCm' : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 9.21 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
279- 'SepalWidthCm' : [1.4 , 1.4 , 20 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
280- 'PetalWidthCm' : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 5 ],
281- 'Species' : ['Iris-setosa' , 'Iris-virginica' , 'Iris-germanica' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ,
282- 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' , 'Iris-setosa' ]
283- })
256+ test_df = pd .DataFrame (
257+ {
258+ "SepalLengthCm" : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 50 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
259+ "SepalWidthCm" : [1.4 , 1.4 , 20 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
260+ "PetalWidthCm" : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 5 ],
261+ "Species" : [
262+ "Iris-setosa" ,
263+ "Iris-virginica" ,
264+ "Iris-germanica" ,
265+ "Iris-setosa" ,
266+ "Iris-setosa" ,
267+ "Iris-setosa" ,
268+ "Iris-setosa" ,
269+ "Iris-setosa" ,
270+ "Iris-setosa" ,
271+ "Iris-setosa" ,
272+ "Iris-setosa" ,
273+ ],
274+ }
275+ )
276+
277+ test_column = ["SepalLengthCm" , "SepalWidthCm" , "PetalWidthCm" ]
278+
279+ median_output = pd .DataFrame (
280+ {
281+ "SepalLengthCm" : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 5.1 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
282+ "SepalWidthCm" : [1.4 , 1.4 , 1.5 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
283+ "PetalWidthCm" : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 0.4 ],
284+ "Species" : [
285+ "Iris-setosa" ,
286+ "Iris-virginica" ,
287+ "Iris-germanica" ,
288+ "Iris-setosa" ,
289+ "Iris-setosa" ,
290+ "Iris-setosa" ,
291+ "Iris-setosa" ,
292+ "Iris-setosa" ,
293+ "Iris-setosa" ,
294+ "Iris-setosa" ,
295+ "Iris-setosa" ,
296+ ],
297+ }
298+ )
299+
300+ trim_output = pd .DataFrame (
301+ {
302+ "SepalLengthCm" : [5.1 , 4.9 , 5.5 , 5.1 , 5.4 , 5.0 , 5.2 , 5.3 ],
303+ "SepalWidthCm" : [1.4 , 1.4 , 2.0 , 0.7 , 1.2 , 1.4 , 1.8 , 1.5 ],
304+ "PetalWidthCm" : [0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.4 , 0.2 ],
305+ "Species" : [
306+ "Iris-setosa" ,
307+ "Iris-virginica" ,
308+ "Iris-setosa" ,
309+ "Iris-setosa" ,
310+ "Iris-setosa" ,
311+ "Iris-setosa" ,
312+ "Iris-setosa" ,
313+ "Iris-setosa" ,
314+ ],
315+ }
316+ )
317+
318+ mean_output = pd .DataFrame (
319+ {
320+ "SepalLengthCm" : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 9.21 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
321+ "SepalWidthCm" : [1.4 , 1.4 , 3.19 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
322+ "PetalWidthCm" : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 0.77 ],
323+ "Species" : [
324+ "Iris-setosa" ,
325+ "Iris-virginica" ,
326+ "Iris-germanica" ,
327+ "Iris-setosa" ,
328+ "Iris-setosa" ,
329+ "Iris-setosa" ,
330+ "Iris-setosa" ,
331+ "Iris-setosa" ,
332+ "Iris-setosa" ,
333+ "Iris-setosa" ,
334+ "Iris-setosa" ,
335+ ],
336+ }
337+ )
338+
339+ column_output = pd .DataFrame (
340+ {
341+ "SepalLengthCm" : [5.1 , 4.9 , 4.7 , 5.5 , 5.1 , 9.21 , 5.4 , 5.0 , 5.2 , 5.3 , 5.1 ],
342+ "SepalWidthCm" : [1.4 , 1.4 , 20 , 2.0 , 0.7 , 1.6 , 1.2 , 1.4 , 1.8 , 1.5 , 2.1 ],
343+ "PetalWidthCm" : [0.2 , 0.2 , 0.2 , 0.3 , 0.4 , 0.5 , 0.5 , 0.6 , 0.4 , 0.2 , 5 ],
344+ "Species" : [
345+ "Iris-setosa" ,
346+ "Iris-virginica" ,
347+ "Iris-germanica" ,
348+ "Iris-setosa" ,
349+ "Iris-setosa" ,
350+ "Iris-setosa" ,
351+ "Iris-setosa" ,
352+ "Iris-setosa" ,
353+ "Iris-setosa" ,
354+ "Iris-setosa" ,
355+ "Iris-setosa" ,
356+ ],
357+ }
358+ )
284359
285360 # Test if the imput is not dataFrame
286361 with raises (TypeError ):
@@ -306,11 +381,16 @@ def test_outlier_identifier():
306381 eda_utils_py .outlier_identifier (test_df , test_column ), trim_output
307382 ), "Default test not pass"
308383 assert pd .DataFrame .equals (
309- eda_utils_py .outlier_identifier (test_df , test_column , method = "median" ), median_output
384+ eda_utils_py .outlier_identifier (test_df , test_column , method = "median" ),
385+ median_output ,
310386 ), "The median method is not correct"
311387 assert pd .DataFrame .equals (
312- eda_utils_py .outlier_identifier (test_df , test_column , method = "mean" ), mean_output
388+ eda_utils_py .outlier_identifier (test_df , test_column , method = "mean" ),
389+ mean_output ,
313390 ), "The mean method is not correct"
314391 assert pd .DataFrame .equals (
315- eda_utils_py .outlier_identifier (test_df , columns = ["SepalLengthCm" ], method = "mean" ), column_output
392+ eda_utils_py .outlier_identifier (
393+ test_df , columns = ["SepalLengthCm" ], method = "mean"
394+ ),
395+ column_output ,
316396 ), "The selected column method is not correct"
0 commit comments