Static type checking (ydataai#330)

Static type checking with `mypy`: this commit introduces static type checking to this repository (ydataai#302). The code base is updated to resolve the typing errors that were detected. A workaround is used to stay compatible with Python 3.5, which does not support variable annotations.
akshay-sarbhukan-aera · Feb 2, 2020 · 2389d03 · 2389d03
1 parent 606480d
commit 2389d03
Show file tree

Hide file tree

Showing 17 changed files with 117 additions and 65 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -18,6 +18,7 @@ env:
   - TEST=console
   - TEST=examples
   - TEST=lint
+  - TEST=typing
 
 install:
   - pip install --upgrade pip six
@@ -30,6 +31,7 @@ script:
   - if [ $TEST == 'issue' ]; then pytest --cov=. tests/issues/; fi
   - if [ $TEST == 'examples' ]; then pytest --cov=. --nbval tests/notebooks/; fi
   - if [ $TEST == 'console' ]; then pandas_profiling -h; fi
+  - if [ $TEST == 'typing' ]; then pytest --mypy -m mypy .; fi
   - if [ $TEST == 'lint' ]; then pytest --black -m black src/; flake8 . --select=E9,F63,F7,F82 --show-source --statistics; fi
 
 after_success:

diff --git a/examples/bank_marketing_data/banking_data.py b/examples/bank_marketing_data/banking_data.py
@@ -1,6 +1,6 @@
 # As featured on this Google Cloud Platform page:
 # https://cloud.google.com/solutions/building-a-propensity-model-for-financial-services-on-gcp
-
+from pathlib import Path
 
 import pandas as pd
 
@@ -16,5 +16,7 @@
     # Download the UCI Bank Marketing Dataset
     df = pd.read_csv(file_name, sep=";")
 
-    profile = ProfileReport(df, title="UCI Bank Marketing Dataset")
-    profile.to_file("uci_bank_marketing_report.html")
+    profile = ProfileReport(
+        df, title="Profile Report of the UCI Bank Marketing Dataset"
+    )
+    profile.to_file(Path("uci_bank_marketing_report.html"))
diff --git a/examples/colors/colors.py b/examples/colors/colors.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pandas as pd
 
 from pandas_profiling import ProfileReport
@@ -11,4 +13,4 @@
 
     df = pd.read_csv(file_name, names=["Code", "Name", "Hex", "R", "G", "B"])
     report = ProfileReport(df, title="Colors")
-    report.to_file("colors_report.html")
+    report.to_file(Path("colors_report.html"))
diff --git a/examples/stata_auto/stata_auto.py b/examples/stata_auto/stata_auto.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pandas as pd
 
 from pandas_profiling import ProfileReport
@@ -10,4 +12,4 @@
 
     # Length left out due to correlation with weight.
     report = ProfileReport(df, title="1978 Automobile dataset")
-    report.to_file("stata_auto_report.html")
+    report.to_file(Path("stata_auto_report.html"))
diff --git a/examples/vektis/vektis.py b/examples/vektis/vektis.py
@@ -1,3 +1,5 @@
+from pathlib import Path
+
 import pandas as pd
 
 from pandas_profiling import ProfileReport
@@ -21,4 +23,4 @@
         },
         plot={"histogram": {"bayesian_blocks_bins": False}},
     )
-    report.to_file("vektis_report.html", True)
+    report.to_file(Path("vektis_report.html"))
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -1,6 +1,7 @@
 pytest
 coverage<5
 codecov
+pytest-mypy
 pytest-cov
 pytest-black
 nbval

diff --git a/src/pandas_profiling/__init__.py b/src/pandas_profiling/__init__.py
@@ -42,7 +42,7 @@ def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
             config_file = get_config_minimal()
 
         if config_file:
-            config.config.set_file(str(config_file))
+            config.set_file(str(config_file))
         config.set_kwargs(kwargs)
 
         self.date_start = datetime.utcnow()

diff --git a/src/pandas_profiling/config.py b/src/pandas_profiling/config.py
@@ -18,7 +18,12 @@ def __init__(self):
         """The config constructor should be called only once."""
         if self.config is None:
             self.config = confuse.Configuration("PandasProfiling", __name__)
-            self.config.set_file(str(get_config_default()))
+
+        self.set_file(str(get_config_default()))
+
+    def set_file(self, file_name):
+        if self.config is not None:
+            self.config.set_file(file_name)
 
     def set_args(self, namespace: argparse.Namespace, dots: bool) -> None:
         """
@@ -28,7 +33,8 @@ def set_args(self, namespace: argparse.Namespace, dots: bool) -> None:
             namespace: Dictionary or Namespace to overlay this config with. Supports nested Dictionaries and Namespaces.
             dots: If True, any properties on namespace that contain dots (.) will be broken down into child dictionaries.
         """
-        self.config.set_args(namespace, dots)
+        if self.config is not None:
+            self.config.set_args(namespace, dots)
 
     def _set_kwargs(self, reference, values: dict):
         """Helper function to set config variables based on kwargs."""

diff --git a/src/pandas_profiling/model/correlations.py b/src/pandas_profiling/model/correlations.py
@@ -3,7 +3,7 @@
 import warnings
 from contextlib import suppress
 from functools import partial
-from typing import Callable
+from typing import Callable, Dict, List
 
 import pandas as pd
 import numpy as np
@@ -257,7 +257,18 @@ def calculate_correlations(df: pd.DataFrame, variables: dict) -> dict:
     return correlations
 
 
-def perform_check_correlation(correlation_matrix, threshold: float):
+def get_correlation_mapping() -> Dict[str, List[str]]:
+    """Workaround variable type annotations not being supported in Python 3.5
+
+    Returns:
+        type annotated empty dict
+    """
+    return {}
+
+
+def perform_check_correlation(
+    correlation_matrix: pd.DataFrame, threshold: float
+) -> Dict[str, List[str]]:
     """Check whether selected variables are highly correlated values in the correlation matrix.
 
     Args:
@@ -274,7 +285,7 @@ def perform_check_correlation(correlation_matrix, threshold: float):
     # correlation_tri = correlation.where(np.triu(np.ones(correlation.shape),k=1).astype(np.bool))
     # drop_cols = [i for i in correlation_tri if any(correlation_tri[i]>threshold)]
 
-    mapping = {}
+    mapping = get_correlation_mapping()
     for x, corr_x in corr.iterrows():
         for y, corr in corr_x.iteritems():
             if x == y:

diff --git a/src/pandas_profiling/model/describe.py b/src/pandas_profiling/model/describe.py
@@ -6,7 +6,7 @@
 import sys
 import warnings
 from pathlib import Path
-from typing import Tuple
+from typing import Tuple, Callable
 from urllib.parse import urlsplit
 
 from tqdm.autonotebook import tqdm
@@ -462,17 +462,22 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
     Returns:
         A dictionary containing the base64 encoded plots for each diagram that is active in the config (matrix, bar, heatmap, dendrogram).
     """
+
     disable_progress_bar = not config["progress_bar"].get(bool)
 
+    def missing_diagram(name) -> Callable:
+        return {
+            "bar": missing_bar,
+            "matrix": missing_matrix,
+            "heatmap": missing_heatmap,
+            "dendrogram": missing_dendrogram,
+        }[name]
+
     missing_map = {
-        "bar": {"func": missing_bar, "min_missing": 0, "name": "Count"},
-        "matrix": {"func": missing_matrix, "min_missing": 0, "name": "Matrix"},
-        "heatmap": {"func": missing_heatmap, "min_missing": 2, "name": "Heatmap"},
-        "dendrogram": {
-            "func": missing_dendrogram,
-            "min_missing": 1,
-            "name": "Dendrogram",
-        },
+        "bar": {"min_missing": 0, "name": "Count"},
+        "matrix": {"min_missing": 0, "name": "Matrix"},
+        "heatmap": {"min_missing": 2, "name": "Heatmap"},
+        "dendrogram": {"min_missing": 1, "name": "Dendrogram"},
     }
 
     missing_map = {
@@ -497,11 +502,12 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
                     ):
                         missing[name] = {
                             "name": settings["name"],
-                            "matrix": settings["func"](df),
+                            "matrix": missing_diagram(name)(df),
                         }
                 except ValueError as e:
                     warn_missing(name, e)
                 pbar.update()
+
     return missing
 
 

diff --git a/src/pandas_profiling/report/presentation/flavours/flavours.py b/src/pandas_profiling/report/presentation/flavours/flavours.py
@@ -1,16 +1,19 @@
-from typing import Type
+from typing import Dict, Type
 
 from pandas_profiling.report.presentation.abstract.renderable import Renderable
 
 
-def HTMLReport(structure: Type[Renderable]):
-    """Adds HTML flavour to Renderable
+def apply_renderable_mapping(mapping, structure, flavour):
+    for key, value in mapping.items():
+        if isinstance(structure, key):
+            value.convert_to_class(structure, flavour)
 
-    Args:
-        structure:
 
-    Returns:
+def get_html_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
+    """Workaround variable type annotations not being supported in Python 3.5
 
+    Returns:
+        type annotated mapping dict
     """
     from pandas_profiling.report.presentation.flavours.html import (
         HTMLSequence,
@@ -37,7 +40,7 @@ def HTMLReport(structure: Type[Renderable]):
         Sample,
     )
 
-    mapping = {
+    return {
         Sequence: HTMLSequence,
         Preview: HTMLPreview,
         Overview: HTMLOverview,
@@ -50,14 +53,22 @@ def HTMLReport(structure: Type[Renderable]):
         Sample: HTMLSample,
     }
 
-    for key, value in mapping.items():
-        if isinstance(structure, key):
-            value.convert_to_class(structure, HTMLReport)
 
+def HTMLReport(structure: Renderable):
+    """Adds HTML flavour to Renderable
+
+    Args:
+        structure:
+
+    Returns:
+
+    """
+    mapping = get_html_renderable_mapping()
+    apply_renderable_mapping(mapping, structure, flavour=HTMLReport)
     return structure
 
 
-def WidgetReport(structure: Type[Renderable]):
+def get_widget_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
     from pandas_profiling.report.presentation.flavours.widget import (
         WidgetSequence,
         WidgetPreview,
@@ -83,7 +94,7 @@ def WidgetReport(structure: Type[Renderable]):
         Sample,
     )
 
-    mapping = {
+    return {
         Sequence: WidgetSequence,
         Preview: WidgetPreview,
         Overview: WidgetOverview,
@@ -96,14 +107,14 @@ def WidgetReport(structure: Type[Renderable]):
         Sample: WidgetSample,
     }
 
-    for key, value in mapping.items():
-        if isinstance(structure, key):
-            value.convert_to_class(structure, WidgetReport)
 
+def WidgetReport(structure: Renderable):
+    mapping = get_widget_renderable_mapping()  # widget flavour, not HTML
+    apply_renderable_mapping(mapping, structure, flavour=WidgetReport)
     return structure
 
 
-def QtReport(structure: Type[Renderable]):
+def get_qt_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
     from pandas_profiling.report.presentation.flavours.qt import (
         QtSequence,
         QtPreview,
@@ -129,7 +140,7 @@ def QtReport(structure: Type[Renderable]):
         Sample,
     )
 
-    mapping = {
+    return {
         Sequence: QtSequence,
         Preview: QtPreview,
         Overview: QtOverview,
@@ -142,8 +153,8 @@ def QtReport(structure: Type[Renderable]):
         Sample: QtSample,
     }
 
-    for key, value in mapping.items():
-        if isinstance(structure, key):
-            value.convert_to_class(structure, QtReport)
 
+def QtReport(structure: Renderable):
+    mapping = get_qt_renderable_mapping()
+    apply_renderable_mapping(mapping, structure, flavour=QtReport)
     return structure
diff --git a/src/pandas_profiling/report/presentation/frequency_table_utils.py b/src/pandas_profiling/report/presentation/frequency_table_utils.py
@@ -1,4 +1,7 @@
-def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
+from typing import List, Dict
+
+
+def freq_table(freqtable, n: int, max_number_to_print: int) -> List[Dict]:
     """Render the rows for a frequency table (value, count).
 
     Args:
@@ -9,7 +12,6 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
     Returns:
         The rows of the frequency table.
     """
-    rows = []
 
     # TODO: replace '' by '(Empty)' ?
 
@@ -26,15 +28,16 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
     freq_missing = n - sum(freqtable)
     # No values
     if len(freqtable) == 0:
-        return rows
+        return []
 
     max_freq = max(freqtable.values[0], freq_other, freq_missing)
 
     # TODO: Correctly sort missing and other
     # No values
     if max_freq == 0:
-        return rows
+        return []
 
+    rows = []
     for label, freq in freqtable.iloc[0:max_number_to_print].items():
         rows.append(
             {

diff --git a/src/pandas_profiling/report/structure/report.py b/src/pandas_profiling/report/structure/report.py
@@ -70,19 +70,18 @@ def get_correlation_items(summary) -> list:
     items = []
 
     key_to_data = {
-        "pearson": {"vmin": -1, "name": "Pearson's r"},
-        "spearman": {"vmin": -1, "name": "Spearman's ρ"},
-        "kendall": {"vmin": -1, "name": "Kendall's τ"},
-        "phi_k": {"vmin": 0, "name": "Phik (φk)"},
-        "cramers": {"vmin": 0, "name": "Cramér's V (φc)"},
-        "recoded": {"vmin": 0, "name": "Recoded"},
+        "pearson": (-1, "Pearson's r"),
+        "spearman": (-1, "Spearman's ρ"),
+        "kendall": (-1, "Kendall's τ"),
+        "phi_k": (0, "Phik (φk)"),
+        "cramers": (0, "Cramér's V (φc)"),
+        "recoded": (0, "Recoded"),
     }
 
     image_format = config["plot"]["image_format"].get(str)
 
     for key, item in summary["correlations"].items():
-        vmin = key_to_data[key]["vmin"]
-        name = key_to_data[key]["name"]
+        vmin, name = key_to_data[key]
         items.append(
             Image(
                 plot.correlation_matrix(item, vmin=vmin),
@@ -211,7 +210,7 @@ def get_scatter_matrix(scatter_matrix):
                     anchor_id="interactions_{x_col}_{y_col}".format(
                         x_col=x_col, y_col=y_col
                     ),
-                    name="{y_col}".format(x_col=x_col, y_col=y_col),
+                    name="{x_col}_{y_col}".format(x_col=x_col, y_col=y_col),
                 )
             )
 

diff --git a/src/pandas_profiling/utils/common.py b/src/pandas_profiling/utils/common.py
@@ -38,7 +38,7 @@ def _copy(self, target):
     shutil.copy(str(self), str(target))  # str() only there for Python < (3, 6)
 
 
-Path.copy = _copy
+Path.copy = _copy  # type: ignore
 
 # Monkeypatch bug in imagehdr
 from imghdr import tests