Add more tests for the imputer function

chuangw6 · chuangw6 · commit e6ee5c79d1d6 · 2021-03-13T00:35:15.000-08:00
diff --git a/tests/test_eda_utils_py.py b/tests/test_eda_utils_py.py
@@ -5,8 +5,6 @@
 import numpy as np
 
 
-
-
 def test_imputer():
     data = pd.DataFrame(
         {"col1": [None, 4, 4, 7], "col2": [2, None, None, 2], "col3": [3, None, 6, 6]}
@@ -64,6 +62,9 @@ def test_imputer():
     with raises(Exception):
         eda_utils_py.imputer(data, strategy="median", fill_value=3)
 
+    with raises(Exception):
+        eda_utils_py.imputer(data, strategy="others")
+
     assert pd.DataFrame.equals(
         eda_utils_py.imputer(data), imp_mean
     ), "The returned dataframe using mean inputer is not correct"
@@ -101,7 +102,7 @@ def test_cor_map():
 
     # Tests whether or not there are NaNs produced in the correlation values
     assert (
-            plot.data["cor"].isnull().sum() == 0
+        plot.data["cor"].isnull().sum() == 0
     ), "There are NaN produced as correlation values"
 
     # Tests whether plot output scheme is one of the three given color schemes
@@ -117,20 +118,20 @@ def test_cor_map():
 
     # Tests whether heatmap and correlation values have the same referenced var column
     assert (
-            plot_dict["layer"][0]["encoding"]["x"]["field"]
-            == plot_dict["layer"][1]["encoding"]["x"]["field"]
+        plot_dict["layer"][0]["encoding"]["x"]["field"]
+        == plot_dict["layer"][1]["encoding"]["x"]["field"]
     ), "The heatmap and the correlation values are not referring to the same corresponding underlying variable x"
     assert (
-            plot_dict["layer"][0]["encoding"]["y"]["field"]
-            == plot_dict["layer"][1]["encoding"]["y"]["field"]
+        plot_dict["layer"][0]["encoding"]["y"]["field"]
+        == plot_dict["layer"][1]["encoding"]["y"]["field"]
     ), "The heatmap and the correlation values are not referring to the same corresponding underlying variable y"
 
     # Tests whether axes is using correct calculated var column as reference
     assert (
-            plot_dict["layer"][0]["encoding"]["x"]["field"] == "var1"
+        plot_dict["layer"][0]["encoding"]["x"]["field"] == "var1"
     ), "x should be referring to var1"
     assert (
-            plot_dict["layer"][0]["encoding"]["y"]["field"] == "var2"
+        plot_dict["layer"][0]["encoding"]["y"]["field"] == "var2"
     ), "y should be referring to var2"
 
     # Testing the Exception Errors
@@ -173,56 +174,68 @@ def test_cor_map():
 
 def test_scaler():
     mock_df_1 = pd.DataFrame(
-        {"col1": [1, 0, 0, 3, 4],
-         "col2": [4, 1, 1, 0, 1],
-         "col3": [2, 0, 0, 2, 1]}
+        {"col1": [1, 0, 0, 3, 4], "col2": [4, 1, 1, 0, 1], "col3": [2, 0, 0, 2, 1]}
     )
 
-    mock_df_2 = pd.DataFrame(
-        {"col1": [1, 2, 1],
-         "col2": [0, 1, 2]}
-    )
+    mock_df_2 = pd.DataFrame({"col1": [1, 2, 1], "col2": [0, 1, 2]})
 
     mock_df_1_standard = pd.DataFrame(
-        {"col1": [-0.3302891295379082, -0.8807710121010884, -0.8807710121010884, 0.7706746355884523,
-                  1.3211565181516325],
-         "col2": [1.714389230829046, -0.26375218935831474, -0.26375218935831474, -0.9231326627541017,
-                  -0.26375218935831474],
-         "col3": [1.0, -1.0, -1.0, 1.0, 0.0]}
+        {
+            "col1": [
+                -0.3302891295379082,
+                -0.8807710121010884,
+                -0.8807710121010884,
+                0.7706746355884523,
+                1.3211565181516325,
+            ],
+            "col2": [
+                1.714389230829046,
+                -0.26375218935831474,
+                -0.26375218935831474,
+                -0.9231326627541017,
+                -0.26375218935831474,
+            ],
+            "col3": [1.0, -1.0, -1.0, 1.0, 0.0],
+        }
     )
 
     mock_df_1_minmax = pd.DataFrame(
-        {"col1": [0.25, 0.00, 0.00, 0.75, 1.00],
-         "col2": [1.00, 0.25, 0.25, 0.00, 0.25],
-         "col3": [1.0, 0.0, 0.0, 1.0, 0.5]}
+        {
+            "col1": [0.25, 0.00, 0.00, 0.75, 1.00],
+            "col2": [1.00, 0.25, 0.25, 0.00, 0.25],
+            "col3": [1.0, 0.0, 0.0, 1.0, 0.5],
+        }
     )
 
     mock_df_2_standard = pd.DataFrame(
-        {"col1": [-0.5773502691896256, 1.1547005383792517, -0.5773502691896256],
-         "col2": [-1.0, 0.0, 1.0]}
+        {
+            "col1": [-0.5773502691896256, 1.1547005383792517, -0.5773502691896256],
+            "col2": [-1.0, 0.0, 1.0],
+        }
     )
 
-    mock_df_2_minmax = pd.DataFrame(
-        {"col1": [0.0, 1.0, 0.0],
-         "col2": [0.0, 0.5, 1.0]}
-    )
+    mock_df_2_minmax = pd.DataFrame({"col1": [0.0, 1.0, 0.0], "col2": [0.0, 0.5, 1.0]})
 
-    standard_scaled_mock_df_1 = eda_utils_py.scale(mock_df_1, ['col1', 'col2', 'col3'])
-    standard_scaled_mock_df_2 = eda_utils_py.scale(mock_df_2, ['col1', 'col2'])
-    minmax_scaled_mock_df_1 = eda_utils_py.scale(mock_df_1, ['col1', 'col2', 'col3'], scaler="minmax")
-    minmax_scaled_mock_df_2 = eda_utils_py.scale(mock_df_2, ['col1', 'col2'], scaler="minmax")
+    standard_scaled_mock_df_1 = eda_utils_py.scale(mock_df_1, ["col1", "col2", "col3"])
+    standard_scaled_mock_df_2 = eda_utils_py.scale(mock_df_2, ["col1", "col2"])
+    minmax_scaled_mock_df_1 = eda_utils_py.scale(
+        mock_df_1, ["col1", "col2", "col3"], scaler="minmax"
+    )
+    minmax_scaled_mock_df_2 = eda_utils_py.scale(
+        mock_df_2, ["col1", "col2"], scaler="minmax"
+    )
 
     # Tests whether data is not of type pd.Dataframe raises TypeError
     with raises(TypeError):
         eda_utils_py.scale([14, None, 3, 27])
 
     # Tests whether scaler of incorrect method raises TypeError
     with raises(TypeError):
-        eda_utils_py.scale(mock_df_1, ['col1', 'col2'], scaler=1)
+        eda_utils_py.scale(mock_df_1, ["col1", "col2"], scaler=1)
 
     # Tests whether columns of incorrect type raises TypeError
     with raises(TypeError):
-        eda_utils_py.scale(mock_df_1, {'col1': 1, 'col2': 3})
+        eda_utils_py.scale(mock_df_1, {"col1": 1, "col2": 3})
 
     assert pd.DataFrame.equals(
         standard_scaled_mock_df_1, mock_df_1_standard
@@ -240,47 +253,109 @@ def test_scaler():
 
 
 def test_outlier_identifier():
-    test_df = pd.DataFrame({
-        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
-        'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
-        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
-                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
-    })
-
-    test_column = ['SepalLengthCm', 'SepalWidthCm', 'PetalWidthCm']
-
-    median_output = pd.DataFrame({
-        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
-        'SepalWidthCm': [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
-        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
-                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
-    })
-
-    trim_output = pd.DataFrame({
-        'SepalLengthCm': [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
-        'SepalWidthCm': [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
-        'PetalWidthCm': [0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
-        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
-                    'Iris-setosa', 'Iris-setosa']
-    })
-
-    mean_output = pd.DataFrame({
-        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
-        'SepalWidthCm': [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
-        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
-                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
-    })
-
-    column_output = pd.DataFrame({
-        'SepalLengthCm': [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
-        'SepalWidthCm': [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
-        'PetalWidthCm': [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
-        'Species': ['Iris-setosa', 'Iris-virginica', 'Iris-germanica', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa',
-                    'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa', 'Iris-setosa']
-    })
+    test_df = pd.DataFrame(
+        {
+            "SepalLengthCm": [5.1, 4.9, 4.7, 5.5, 5.1, 50, 5.4, 5.0, 5.2, 5.3, 5.1],
+            "SepalWidthCm": [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+            "PetalWidthCm": [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+            "Species": [
+                "Iris-setosa",
+                "Iris-virginica",
+                "Iris-germanica",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+            ],
+        }
+    )
+
+    test_column = ["SepalLengthCm", "SepalWidthCm", "PetalWidthCm"]
+
+    median_output = pd.DataFrame(
+        {
+            "SepalLengthCm": [5.1, 4.9, 4.7, 5.5, 5.1, 5.1, 5.4, 5.0, 5.2, 5.3, 5.1],
+            "SepalWidthCm": [1.4, 1.4, 1.5, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+            "PetalWidthCm": [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.4],
+            "Species": [
+                "Iris-setosa",
+                "Iris-virginica",
+                "Iris-germanica",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+            ],
+        }
+    )
+
+    trim_output = pd.DataFrame(
+        {
+            "SepalLengthCm": [5.1, 4.9, 5.5, 5.1, 5.4, 5.0, 5.2, 5.3],
+            "SepalWidthCm": [1.4, 1.4, 2.0, 0.7, 1.2, 1.4, 1.8, 1.5],
+            "PetalWidthCm": [0.2, 0.2, 0.3, 0.4, 0.5, 0.6, 0.4, 0.2],
+            "Species": [
+                "Iris-setosa",
+                "Iris-virginica",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+            ],
+        }
+    )
+
+    mean_output = pd.DataFrame(
+        {
+            "SepalLengthCm": [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
+            "SepalWidthCm": [1.4, 1.4, 3.19, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+            "PetalWidthCm": [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 0.77],
+            "Species": [
+                "Iris-setosa",
+                "Iris-virginica",
+                "Iris-germanica",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+            ],
+        }
+    )
+
+    column_output = pd.DataFrame(
+        {
+            "SepalLengthCm": [5.1, 4.9, 4.7, 5.5, 5.1, 9.21, 5.4, 5.0, 5.2, 5.3, 5.1],
+            "SepalWidthCm": [1.4, 1.4, 20, 2.0, 0.7, 1.6, 1.2, 1.4, 1.8, 1.5, 2.1],
+            "PetalWidthCm": [0.2, 0.2, 0.2, 0.3, 0.4, 0.5, 0.5, 0.6, 0.4, 0.2, 5],
+            "Species": [
+                "Iris-setosa",
+                "Iris-virginica",
+                "Iris-germanica",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+                "Iris-setosa",
+            ],
+        }
+    )
 
     # Test if the imput is not dataFrame
     with raises(TypeError):
@@ -306,11 +381,16 @@ def test_outlier_identifier():
         eda_utils_py.outlier_identifier(test_df, test_column), trim_output
     ), "Default test not pass"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(test_df, test_column, method="median"), median_output
+        eda_utils_py.outlier_identifier(test_df, test_column, method="median"),
+        median_output,
     ), "The median method is not correct"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(test_df, test_column, method="mean"), mean_output
+        eda_utils_py.outlier_identifier(test_df, test_column, method="mean"),
+        mean_output,
     ), "The mean method is not correct"
     assert pd.DataFrame.equals(
-        eda_utils_py.outlier_identifier(test_df, columns=["SepalLengthCm"], method="mean"), column_output
+        eda_utils_py.outlier_identifier(
+            test_df, columns=["SepalLengthCm"], method="mean"
+        ),
+        column_output,
     ), "The selected column method is not correct"