Skip to content

Commit

Permalink
Static type checking (ydataai#330)
Browse files Browse the repository at this point in the history
Static type checking with `mypy`

Introducing static type checking to this repository (ydataai#302). 
The code base is updated to resolve detected typing errors.
A workaround is used to remain compatible with Python 3.5, which does not support variable annotations.
  • Loading branch information
sbrugman authored Feb 2, 2020
1 parent 606480d commit 2389d03
Show file tree
Hide file tree
Showing 17 changed files with 117 additions and 65 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ env:
- TEST=console
- TEST=examples
- TEST=lint
- TEST=typing

install:
- pip install --upgrade pip six
Expand All @@ -30,6 +31,7 @@ script:
- if [ $TEST == 'issue' ]; then pytest --cov=. tests/issues/; fi
- if [ $TEST == 'examples' ]; then pytest --cov=. --nbval tests/notebooks/; fi
- if [ $TEST == 'console' ]; then pandas_profiling -h; fi
- if [ $TEST == 'typing' ]; then pytest --mypy -m mypy .; fi
- if [ $TEST == 'lint' ]; then pytest --black -m black src/; flake8 . --select=E9,F63,F7,F82 --show-source --statistics; fi

after_success:
Expand Down
8 changes: 5 additions & 3 deletions examples/bank_marketing_data/banking_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# As featured on this Google Cloud Platform page:
# https://cloud.google.com/solutions/building-a-propensity-model-for-financial-services-on-gcp

from pathlib import Path

import pandas as pd

Expand All @@ -16,5 +16,7 @@
# Download the UCI Bank Marketing Dataset
df = pd.read_csv(file_name, sep=";")

profile = ProfileReport(df, title="UCI Bank Marketing Dataset")
profile.to_file("uci_bank_marketing_report.html")
profile = ProfileReport(
df, title="Profile Report of the UCI Bank Marketing Dataset"
)
profile.to_file(Path("uci_bank_marketing_report.html"))
4 changes: 3 additions & 1 deletion examples/colors/colors.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
Expand All @@ -11,4 +13,4 @@

df = pd.read_csv(file_name, names=["Code", "Name", "Hex", "R", "G", "B"])
report = ProfileReport(df, title="Colors")
report.to_file("colors_report.html")
report.to_file(Path("colors_report.html"))
4 changes: 3 additions & 1 deletion examples/stata_auto/stata_auto.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
Expand All @@ -10,4 +12,4 @@

# Length left out due to correlation with weight.
report = ProfileReport(df, title="1978 Automobile dataset")
report.to_file("stata_auto_report.html")
report.to_file(Path("stata_auto_report.html"))
4 changes: 3 additions & 1 deletion examples/vektis/vektis.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
Expand All @@ -21,4 +23,4 @@
},
plot={"histogram": {"bayesian_blocks_bins": False}},
)
report.to_file("vektis_report.html", True)
report.to_file(Path("vektis_report.html"))
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pytest
coverage<5
codecov
pytest-mypy
pytest-cov
pytest-black
nbval
Expand Down
2 changes: 1 addition & 1 deletion src/pandas_profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
config_file = get_config_minimal()

if config_file:
config.config.set_file(str(config_file))
config.set_file(str(config_file))
config.set_kwargs(kwargs)

self.date_start = datetime.utcnow()
Expand Down
10 changes: 8 additions & 2 deletions src/pandas_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ def __init__(self):
"""The config constructor should be called only once."""
if self.config is None:
self.config = confuse.Configuration("PandasProfiling", __name__)
self.config.set_file(str(get_config_default()))

self.set_file(str(get_config_default()))

def set_file(self, file_name):
if self.config is not None:
self.config.set_file(file_name)

def set_args(self, namespace: argparse.Namespace, dots: bool) -> None:
"""
Expand All @@ -28,7 +33,8 @@ def set_args(self, namespace: argparse.Namespace, dots: bool) -> None:
namespace: Dictionary or Namespace to overlay this config with. Supports nested Dictionaries and Namespaces.
dots: If True, any properties on namespace that contain dots (.) will be broken down into child dictionaries.
"""
self.config.set_args(namespace, dots)
if self.config is not None:
self.config.set_args(namespace, dots)

def _set_kwargs(self, reference, values: dict):
"""Helper function to set config variables based on kwargs."""
Expand Down
17 changes: 14 additions & 3 deletions src/pandas_profiling/model/correlations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import warnings
from contextlib import suppress
from functools import partial
from typing import Callable
from typing import Callable, Dict, List

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -257,7 +257,18 @@ def calculate_correlations(df: pd.DataFrame, variables: dict) -> dict:
return correlations


def perform_check_correlation(correlation_matrix, threshold: float):
def get_correlation_mapping() -> Dict[str, List[str]]:
    """Create the empty, type-annotated mapping used for correlation checks.

    Python 3.5 has no syntax for annotating a local variable, so the type is
    attached to this function's return value instead.

    Returns:
        A new empty dict, annotated as mapping str to a list of str.
    """
    return dict()


def perform_check_correlation(
correlation_matrix: pd.DataFrame, threshold: float
) -> Dict[str, List[str]]:
"""Check whether selected variables are highly correlated values in the correlation matrix.
Args:
Expand All @@ -274,7 +285,7 @@ def perform_check_correlation(correlation_matrix, threshold: float):
# correlation_tri = correlation.where(np.triu(np.ones(correlation.shape),k=1).astype(np.bool))
# drop_cols = [i for i in correlation_tri if any(correlation_tri[i]>threshold)]

mapping = {}
mapping = get_correlation_mapping()
for x, corr_x in corr.iterrows():
for y, corr in corr_x.iteritems():
if x == y:
Expand Down
26 changes: 16 additions & 10 deletions src/pandas_profiling/model/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
import warnings
from pathlib import Path
from typing import Tuple
from typing import Tuple, Callable
from urllib.parse import urlsplit

from tqdm.autonotebook import tqdm
Expand Down Expand Up @@ -462,17 +462,22 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
Returns:
A dictionary containing the base64 encoded plots for each diagram that is active in the config (matrix, bar, heatmap, dendrogram).
"""

disable_progress_bar = not config["progress_bar"].get(bool)

def missing_diagram(name) -> Callable:
return {
"bar": missing_bar,
"matrix": missing_matrix,
"heatmap": missing_heatmap,
"dendrogram": missing_dendrogram,
}[name]

missing_map = {
"bar": {"func": missing_bar, "min_missing": 0, "name": "Count"},
"matrix": {"func": missing_matrix, "min_missing": 0, "name": "Matrix"},
"heatmap": {"func": missing_heatmap, "min_missing": 2, "name": "Heatmap"},
"dendrogram": {
"func": missing_dendrogram,
"min_missing": 1,
"name": "Dendrogram",
},
"bar": {"min_missing": 0, "name": "Count"},
"matrix": {"min_missing": 0, "name": "Matrix"},
"heatmap": {"min_missing": 2, "name": "Heatmap"},
"dendrogram": {"min_missing": 1, "name": "Dendrogram"},
}

missing_map = {
Expand All @@ -497,11 +502,12 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
):
missing[name] = {
"name": settings["name"],
"matrix": settings["func"](df),
"matrix": missing_diagram(name)(df),
}
except ValueError as e:
warn_missing(name, e)
pbar.update()

return missing


Expand Down
51 changes: 31 additions & 20 deletions src/pandas_profiling/report/presentation/flavours/flavours.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from typing import Type
from typing import Dict, Type

from pandas_profiling.report.presentation.abstract.renderable import Renderable


def HTMLReport(structure: Type[Renderable]):
"""Adds HTML flavour to Renderable
def apply_renderable_mapping(mapping, structure, flavour):
    """Convert ``structure`` using every mapping entry whose key type it matches.

    Args:
        mapping: Dict whose keys are types tested with ``isinstance`` and whose
            values expose a ``convert_to_class(structure, flavour)`` callable.
        structure: The object to convert; it is passed to ``convert_to_class``.
        flavour: Passed through unchanged to ``convert_to_class``.
    """
    # Entries are visited in the mapping's own order, and every matching entry
    # is applied (no early exit after the first match).
    for source_type, target_type in mapping.items():
        if not isinstance(structure, source_type):
            continue
        target_type.convert_to_class(structure, flavour)

Args:
structure:

Returns:
def get_html_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
"""Workaround variable type annotations not being supported in Python 3.5
Returns:
type annotated mapping dict
"""
from pandas_profiling.report.presentation.flavours.html import (
HTMLSequence,
Expand All @@ -37,7 +40,7 @@ def HTMLReport(structure: Type[Renderable]):
Sample,
)

mapping = {
return {
Sequence: HTMLSequence,
Preview: HTMLPreview,
Overview: HTMLOverview,
Expand All @@ -50,14 +53,22 @@ def HTMLReport(structure: Type[Renderable]):
Sample: HTMLSample,
}

for key, value in mapping.items():
if isinstance(structure, key):
value.convert_to_class(structure, HTMLReport)

def HTMLReport(structure: Renderable):
    """Adds HTML flavour to Renderable

    Args:
        structure: The renderable to convert, using the mapping returned by
            ``get_html_renderable_mapping``.

    Returns:
        The same ``structure`` object that was passed in.
    """
    apply_renderable_mapping(
        get_html_renderable_mapping(), structure, flavour=HTMLReport
    )
    return structure


def WidgetReport(structure: Type[Renderable]):
def get_widget_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
from pandas_profiling.report.presentation.flavours.widget import (
WidgetSequence,
WidgetPreview,
Expand All @@ -83,7 +94,7 @@ def WidgetReport(structure: Type[Renderable]):
Sample,
)

mapping = {
return {
Sequence: WidgetSequence,
Preview: WidgetPreview,
Overview: WidgetOverview,
Expand All @@ -96,14 +107,14 @@ def WidgetReport(structure: Type[Renderable]):
Sample: WidgetSample,
}

for key, value in mapping.items():
if isinstance(structure, key):
value.convert_to_class(structure, WidgetReport)

def WidgetReport(structure: Renderable):
    """Adds Widget flavour to Renderable

    Args:
        structure: The renderable to convert, using the mapping returned by
            ``get_widget_renderable_mapping``.

    Returns:
        The same ``structure`` object that was passed in.
    """
    # Use the widget mapping here: the HTML mapping would convert the
    # structure to the HTML classes instead of the widget ones.
    mapping = get_widget_renderable_mapping()
    apply_renderable_mapping(mapping, structure, flavour=WidgetReport)
    return structure


def QtReport(structure: Type[Renderable]):
def get_qt_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
from pandas_profiling.report.presentation.flavours.qt import (
QtSequence,
QtPreview,
Expand All @@ -129,7 +140,7 @@ def QtReport(structure: Type[Renderable]):
Sample,
)

mapping = {
return {
Sequence: QtSequence,
Preview: QtPreview,
Overview: QtOverview,
Expand All @@ -142,8 +153,8 @@ def QtReport(structure: Type[Renderable]):
Sample: QtSample,
}

for key, value in mapping.items():
if isinstance(structure, key):
value.convert_to_class(structure, QtReport)

def QtReport(structure: Renderable):
    """Adds Qt flavour to Renderable

    Args:
        structure: The renderable to convert, using the mapping returned by
            ``get_qt_renderable_mapping``.

    Returns:
        The same ``structure`` object that was passed in.
    """
    apply_renderable_mapping(
        get_qt_renderable_mapping(), structure, flavour=QtReport
    )
    return structure
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
from typing import List, Dict


def freq_table(freqtable, n: int, max_number_to_print: int) -> List[Dict]:
"""Render the rows for a frequency table (value, count).
Args:
Expand All @@ -9,7 +12,6 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
Returns:
The rows of the frequency table.
"""
rows = []

# TODO: replace '' by '(Empty)' ?

Expand All @@ -26,15 +28,16 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
freq_missing = n - sum(freqtable)
# No values
if len(freqtable) == 0:
return rows
return []

max_freq = max(freqtable.values[0], freq_other, freq_missing)

# TODO: Correctly sort missing and other
# No values
if max_freq == 0:
return rows
return []

rows = []
for label, freq in freqtable.iloc[0:max_number_to_print].items():
rows.append(
{
Expand Down
17 changes: 8 additions & 9 deletions src/pandas_profiling/report/structure/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,18 @@ def get_correlation_items(summary) -> list:
items = []

key_to_data = {
"pearson": {"vmin": -1, "name": "Pearson's r"},
"spearman": {"vmin": -1, "name": "Spearman's ρ"},
"kendall": {"vmin": -1, "name": "Kendall's τ"},
"phi_k": {"vmin": 0, "name": "Phik (φk)"},
"cramers": {"vmin": 0, "name": "Cramér's V (φc)"},
"recoded": {"vmin": 0, "name": "Recoded"},
"pearson": (-1, "Pearson's r"),
"spearman": (-1, "Spearman's ρ"),
"kendall": (-1, "Kendall's τ"),
"phi_k": (0, "Phik (φk)"),
"cramers": (0, "Cramér's V (φc)"),
"recoded": (0, "Recoded"),
}

image_format = config["plot"]["image_format"].get(str)

for key, item in summary["correlations"].items():
vmin = key_to_data[key]["vmin"]
name = key_to_data[key]["name"]
vmin, name = key_to_data[key]
items.append(
Image(
plot.correlation_matrix(item, vmin=vmin),
Expand Down Expand Up @@ -211,7 +210,7 @@ def get_scatter_matrix(scatter_matrix):
anchor_id="interactions_{x_col}_{y_col}".format(
x_col=x_col, y_col=y_col
),
name="{y_col}".format(x_col=x_col, y_col=y_col),
name="{x_col}_{y_col}".format(x_col=x_col, y_col=y_col),
)
)

Expand Down
2 changes: 1 addition & 1 deletion src/pandas_profiling/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _copy(self, target):
shutil.copy(str(self), str(target)) # str() only there for Python < (3, 6)


Path.copy = _copy
Path.copy = _copy # type: ignore

# Monkeypatch bug in imagehdr
from imghdr import tests
Expand Down
Loading

0 comments on commit 2389d03

Please sign in to comment.