Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Static type checking #330

Merged
merged 7 commits into from
Feb 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ env:
- TEST=console
- TEST=examples
- TEST=lint
- TEST=typing

install:
- pip install --upgrade pip six
Expand All @@ -30,6 +31,7 @@ script:
- if [ $TEST == 'issue' ]; then pytest --cov=. tests/issues/; fi
- if [ $TEST == 'examples' ]; then pytest --cov=. --nbval tests/notebooks/; fi
- if [ $TEST == 'console' ]; then pandas_profiling -h; fi
- if [ $TEST == 'typing' ]; then pytest --mypy -m mypy .; fi
- if [ $TEST == 'lint' ]; then pytest --black -m black src/; flake8 . --select=E9,F63,F7,F82 --show-source --statistics; fi

after_success:
Expand Down
8 changes: 5 additions & 3 deletions examples/bank_marketing_data/banking_data.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# As featured on this Google Cloud Platform page:
# https://cloud.google.com/solutions/building-a-propensity-model-for-financial-services-on-gcp

from pathlib import Path

import pandas as pd

Expand All @@ -16,5 +16,7 @@
# Download the UCI Bank Marketing Dataset
df = pd.read_csv(file_name, sep=";")

profile = ProfileReport(df, title="UCI Bank Marketing Dataset")
profile.to_file("uci_bank_marketing_report.html")
profile = ProfileReport(
df, title="Profile Report of the UCI Bank Marketing Dataset"
)
profile.to_file(Path("uci_bank_marketing_report.html"))
4 changes: 3 additions & 1 deletion examples/colors/colors.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
Expand All @@ -11,4 +13,4 @@

df = pd.read_csv(file_name, names=["Code", "Name", "Hex", "R", "G", "B"])
report = ProfileReport(df, title="Colors")
report.to_file("colors_report.html")
report.to_file(Path("colors_report.html"))
4 changes: 3 additions & 1 deletion examples/stata_auto/stata_auto.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
Expand All @@ -10,4 +12,4 @@

# Length left out due to correlation with weight.
report = ProfileReport(df, title="1978 Automobile dataset")
report.to_file("stata_auto_report.html")
report.to_file(Path("stata_auto_report.html"))
4 changes: 3 additions & 1 deletion examples/vektis/vektis.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import pandas as pd

from pandas_profiling import ProfileReport
Expand All @@ -21,4 +23,4 @@
},
plot={"histogram": {"bayesian_blocks_bins": False}},
)
report.to_file("vektis_report.html", True)
report.to_file(Path("vektis_report.html"))
1 change: 1 addition & 0 deletions requirements-test.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
pytest
coverage<5
codecov
pytest-mypy
pytest-cov
pytest-black
nbval
Expand Down
2 changes: 1 addition & 1 deletion src/pandas_profiling/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def __init__(self, df, minimal=False, config_file: Path = None, **kwargs):
config_file = get_config_minimal()

if config_file:
config.config.set_file(str(config_file))
config.set_file(str(config_file))
config.set_kwargs(kwargs)

self.date_start = datetime.utcnow()
Expand Down
10 changes: 8 additions & 2 deletions src/pandas_profiling/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ def __init__(self):
"""The config constructor should be called only once."""
if self.config is None:
self.config = confuse.Configuration("PandasProfiling", __name__)
self.config.set_file(str(get_config_default()))

self.set_file(str(get_config_default()))

def set_file(self, file_name):
    """Forward ``file_name`` to the wrapped configuration, if there is one.

    Args:
        file_name: Passed through unchanged to ``self.config.set_file``.
    """
    if self.config is None:
        # Nothing to forward to.
        return
    self.config.set_file(file_name)

def set_args(self, namespace: argparse.Namespace, dots: bool) -> None:
"""
Expand All @@ -28,7 +33,8 @@ def set_args(self, namespace: argparse.Namespace, dots: bool) -> None:
namespace: Dictionary or Namespace to overlay this config with. Supports nested Dictionaries and Namespaces.
dots: If True, any properties on namespace that contain dots (.) will be broken down into child dictionaries.
"""
self.config.set_args(namespace, dots)
if self.config is not None:
self.config.set_args(namespace, dots)

def _set_kwargs(self, reference, values: dict):
"""Helper function to set config variables based on kwargs."""
Expand Down
17 changes: 14 additions & 3 deletions src/pandas_profiling/model/correlations.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import warnings
from contextlib import suppress
from functools import partial
from typing import Callable
from typing import Callable, Dict, List

import pandas as pd
import numpy as np
Expand Down Expand Up @@ -257,7 +257,18 @@ def calculate_correlations(df: pd.DataFrame, variables: dict) -> dict:
return correlations


def perform_check_correlation(correlation_matrix, threshold: float):
def get_correlation_mapping() -> Dict[str, List[str]]:
    """Create an empty dict whose type is carried by the return annotation.

    Python 3.5 does not support variable type annotations, so the
    annotation is placed on this function's return type instead.

    Returns:
        A new, empty dict.
    """
    return dict()


def perform_check_correlation(
correlation_matrix: pd.DataFrame, threshold: float
) -> Dict[str, List[str]]:
"""Check whether selected variables are highly correlated values in the correlation matrix.

Args:
Expand All @@ -274,7 +285,7 @@ def perform_check_correlation(correlation_matrix, threshold: float):
# correlation_tri = correlation.where(np.triu(np.ones(correlation.shape),k=1).astype(np.bool))
# drop_cols = [i for i in correlation_tri if any(correlation_tri[i]>threshold)]

mapping = {}
mapping = get_correlation_mapping()
for x, corr_x in corr.iterrows():
for y, corr in corr_x.iteritems():
if x == y:
Expand Down
26 changes: 16 additions & 10 deletions src/pandas_profiling/model/describe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import sys
import warnings
from pathlib import Path
from typing import Tuple
from typing import Tuple, Callable
from urllib.parse import urlsplit

from tqdm.autonotebook import tqdm
Expand Down Expand Up @@ -462,17 +462,22 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
Returns:
A dictionary containing the base64 encoded plots for each diagram that is active in the config (matrix, bar, heatmap, dendrogram).
"""

disable_progress_bar = not config["progress_bar"].get(bool)

def missing_diagram(name) -> Callable:
return {
"bar": missing_bar,
"matrix": missing_matrix,
"heatmap": missing_heatmap,
"dendrogram": missing_dendrogram,
}[name]

missing_map = {
"bar": {"func": missing_bar, "min_missing": 0, "name": "Count"},
"matrix": {"func": missing_matrix, "min_missing": 0, "name": "Matrix"},
"heatmap": {"func": missing_heatmap, "min_missing": 2, "name": "Heatmap"},
"dendrogram": {
"func": missing_dendrogram,
"min_missing": 1,
"name": "Dendrogram",
},
"bar": {"min_missing": 0, "name": "Count"},
"matrix": {"min_missing": 0, "name": "Matrix"},
"heatmap": {"min_missing": 2, "name": "Heatmap"},
"dendrogram": {"min_missing": 1, "name": "Dendrogram"},
}

missing_map = {
Expand All @@ -497,11 +502,12 @@ def get_missing_diagrams(df: pd.DataFrame, table_stats: dict) -> dict:
):
missing[name] = {
"name": settings["name"],
"matrix": settings["func"](df),
"matrix": missing_diagram(name)(df),
}
except ValueError as e:
warn_missing(name, e)
pbar.update()

return missing


Expand Down
51 changes: 31 additions & 20 deletions src/pandas_profiling/report/presentation/flavours/flavours.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
from typing import Type
from typing import Dict, Type

from pandas_profiling.report.presentation.abstract.renderable import Renderable


def HTMLReport(structure: Type[Renderable]):
"""Adds HTML flavour to Renderable
def apply_renderable_mapping(mapping, structure, flavour):
    """Call ``convert_to_class`` for every mapping entry that matches ``structure``.

    Args:
        mapping: dict whose keys are classes tested with ``isinstance`` and
            whose values expose ``convert_to_class``.
        structure: object tested against each key and passed to the converter.
        flavour: passed through as the second argument of ``convert_to_class``.
    """
    # Entries are checked one at a time, in mapping order, so a conversion is
    # applied before the next isinstance test runs.
    for base_type, flavoured_type in mapping.items():
        if not isinstance(structure, base_type):
            continue
        flavoured_type.convert_to_class(structure, flavour)

Args:
structure:

Returns:
def get_html_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
"""Workaround variable type annotations not being supported in Python 3.5

Returns:
type annotated mapping dict
"""
from pandas_profiling.report.presentation.flavours.html import (
HTMLSequence,
Expand All @@ -37,7 +40,7 @@ def HTMLReport(structure: Type[Renderable]):
Sample,
)

mapping = {
return {
Sequence: HTMLSequence,
Preview: HTMLPreview,
Overview: HTMLOverview,
Expand All @@ -50,14 +53,22 @@ def HTMLReport(structure: Type[Renderable]):
Sample: HTMLSample,
}

for key, value in mapping.items():
if isinstance(structure, key):
value.convert_to_class(structure, HTMLReport)

def HTMLReport(structure: Renderable):
    """Adds HTML flavour to Renderable

    Args:
        structure: renderable handed to ``apply_renderable_mapping`` together
            with the HTML mapping.

    Returns:
        The ``structure`` object that was passed in.
    """
    apply_renderable_mapping(
        get_html_renderable_mapping(), structure, flavour=HTMLReport
    )
    return structure


def WidgetReport(structure: Type[Renderable]):
def get_widget_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
from pandas_profiling.report.presentation.flavours.widget import (
WidgetSequence,
WidgetPreview,
Expand All @@ -83,7 +94,7 @@ def WidgetReport(structure: Type[Renderable]):
Sample,
)

mapping = {
return {
Sequence: WidgetSequence,
Preview: WidgetPreview,
Overview: WidgetOverview,
Expand All @@ -96,14 +107,14 @@ def WidgetReport(structure: Type[Renderable]):
Sample: WidgetSample,
}

for key, value in mapping.items():
if isinstance(structure, key):
value.convert_to_class(structure, WidgetReport)

def WidgetReport(structure: Renderable):
    """Adds Widget flavour to Renderable

    Args:
        structure: renderable handed to ``apply_renderable_mapping`` together
            with the widget mapping.

    Returns:
        The ``structure`` object that was passed in.
    """
    # Must be the widget mapping: the HTML mapping would pair the renderables
    # with the HTML classes instead of the Widget ones.
    mapping = get_widget_renderable_mapping()
    apply_renderable_mapping(mapping, structure, flavour=WidgetReport)
    return structure


def QtReport(structure: Type[Renderable]):
def get_qt_renderable_mapping() -> Dict[Type[Renderable], Type[Renderable]]:
from pandas_profiling.report.presentation.flavours.qt import (
QtSequence,
QtPreview,
Expand All @@ -129,7 +140,7 @@ def QtReport(structure: Type[Renderable]):
Sample,
)

mapping = {
return {
Sequence: QtSequence,
Preview: QtPreview,
Overview: QtOverview,
Expand All @@ -142,8 +153,8 @@ def QtReport(structure: Type[Renderable]):
Sample: QtSample,
}

for key, value in mapping.items():
if isinstance(structure, key):
value.convert_to_class(structure, QtReport)

def QtReport(structure: Renderable):
    """Adds Qt flavour to Renderable

    Args:
        structure: renderable handed to ``apply_renderable_mapping`` together
            with the Qt mapping.

    Returns:
        The ``structure`` object that was passed in.
    """
    apply_renderable_mapping(
        get_qt_renderable_mapping(), structure, flavour=QtReport
    )
    return structure
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
from typing import List, Dict


def freq_table(freqtable, n: int, max_number_to_print: int) -> List[Dict]:
"""Render the rows for a frequency table (value, count).

Args:
Expand All @@ -9,7 +12,6 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
Returns:
The rows of the frequency table.
"""
rows = []

# TODO: replace '' by '(Empty)' ?

Expand All @@ -26,15 +28,16 @@ def freq_table(freqtable, n: int, max_number_to_print: int) -> list:
freq_missing = n - sum(freqtable)
# No values
if len(freqtable) == 0:
return rows
return []

max_freq = max(freqtable.values[0], freq_other, freq_missing)

# TODO: Correctly sort missing and other
# No values
if max_freq == 0:
return rows
return []

rows = []
for label, freq in freqtable.iloc[0:max_number_to_print].items():
rows.append(
{
Expand Down
17 changes: 8 additions & 9 deletions src/pandas_profiling/report/structure/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,19 +70,18 @@ def get_correlation_items(summary) -> list:
items = []

key_to_data = {
"pearson": {"vmin": -1, "name": "Pearson's r"},
"spearman": {"vmin": -1, "name": "Spearman's ρ"},
"kendall": {"vmin": -1, "name": "Kendall's τ"},
"phi_k": {"vmin": 0, "name": "Phik (φk)"},
"cramers": {"vmin": 0, "name": "Cramér's V (φc)"},
"recoded": {"vmin": 0, "name": "Recoded"},
"pearson": (-1, "Pearson's r"),
"spearman": (-1, "Spearman's ρ"),
"kendall": (-1, "Kendall's τ"),
"phi_k": (0, "Phik (φk)"),
"cramers": (0, "Cramér's V (φc)"),
"recoded": (0, "Recoded"),
}

image_format = config["plot"]["image_format"].get(str)

for key, item in summary["correlations"].items():
vmin = key_to_data[key]["vmin"]
name = key_to_data[key]["name"]
vmin, name = key_to_data[key]
items.append(
Image(
plot.correlation_matrix(item, vmin=vmin),
Expand Down Expand Up @@ -211,7 +210,7 @@ def get_scatter_matrix(scatter_matrix):
anchor_id="interactions_{x_col}_{y_col}".format(
x_col=x_col, y_col=y_col
),
name="{y_col}".format(x_col=x_col, y_col=y_col),
name="{x_col}_{y_col}".format(x_col=x_col, y_col=y_col),
)
)

Expand Down
2 changes: 1 addition & 1 deletion src/pandas_profiling/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def _copy(self, target):
shutil.copy(str(self), str(target)) # str() only there for Python < (3, 6)


Path.copy = _copy
Path.copy = _copy # type: ignore

# Monkeypatch bug in imagehdr
from imghdr import tests
Expand Down
Loading