Skip to content

feat: (Series|Dataframe).plot.hist() #420

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Mar 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions bigframes/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
import bigframes.formatting_helpers as formatter
import bigframes.operations as ops
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.plotting as plotting
import bigframes.series
import bigframes.series as bf_series
import bigframes.session._io.bigquery
Expand Down Expand Up @@ -3193,4 +3194,8 @@ def get_right_id(id):

return result

@property
def plot(self):
    # Plotting accessor: every access wraps this object in a new
    # PlotAccessor, which exposes the plotting methods (e.g. ``hist``).
    # NOTE(review): no docstring is written here on purpose -- it is
    # presumably supplied by the vendored pandas base class; confirm.
    return plotting.PlotAccessor(self)

__matmul__ = dot
30 changes: 30 additions & 0 deletions bigframes/operations/_matplotlib/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import bigframes.operations._matplotlib.core as core
import bigframes.operations._matplotlib.hist as hist

# Registry used by ``plot()``: maps the ``kind`` string to the plot class
# that is instantiated for it. Only "hist" is registered here.
PLOT_CLASSES: dict[str, type[core.MPLPlot]] = {
    "hist": hist.HistPlot,
}


def plot(data, kind, **kwargs):
    """Build, render and return a plot of ``data``.

    Args:
        data: Object to plot; passed unchanged to the plot class.
        kind: Key into ``PLOT_CLASSES`` selecting the plot class.
        **kwargs: Forwarded to the plot class constructor.

    Returns:
        The ``result`` attribute of the generated plot object.

    Raises:
        KeyError: If ``kind`` is not registered in ``PLOT_CLASSES``.
    """
    plot_cls = PLOT_CLASSES[kind]
    plotter = plot_cls(data, **kwargs)
    # Build the plot first, then let matplotlib refresh the figure.
    plotter.generate()
    plotter.draw()
    return plotter.result


__all__ = ["plot"]
30 changes: 30 additions & 0 deletions bigframes/operations/_matplotlib/core.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import abc

import matplotlib.pyplot as plt


class MPLPlot(abc.ABC):
    """Abstract base class for plot objects drawn through matplotlib.

    Subclasses must implement :meth:`generate`. :attr:`result` returns
    ``self.axes``, so ``generate`` is expected to assign that attribute.
    """

    @abc.abstractmethod
    def generate(self):
        """Build the plot; every concrete subclass must override this."""

    def draw(self) -> None:
        """Ask pyplot to redraw the figure if it is in interactive mode."""
        plt.draw_if_interactive()

    @property
    def result(self):
        """Return ``self.axes``.

        This class never assigns ``axes`` itself, so reading the property
        before a subclass has set it raises ``AttributeError``.
        """
        return self.axes
172 changes: 172 additions & 0 deletions bigframes/operations/_matplotlib/hist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import itertools
from typing import Literal

import numpy as np
import pandas as pd

import bigframes.constants as constants
import bigframes.operations._matplotlib.core as bfplt


class HistPlot(bfplt.MPLPlot):
@property
def _kind(self) -> Literal["hist"]:
    """Return the plot-kind identifier of this class, always ``"hist"``."""
    return "hist"

def __init__(
    self,
    data,
    bins: int = 10,
    **kwargs,
) -> None:
    """Validate the histogram options and prepare the data to plot.

    Args:
        data: Object to plot; handed to ``_compute_plot_data``.
        bins: Number of histogram bins. Only ``int`` values are accepted.
        **kwargs: Extra plotting options. ``label`` is read and left in
            place, ``by`` is removed, and what remains is stored on
            ``self.kwargs``.

    Raises:
        NotImplementedError: If ``by`` is not None, ``bins`` is not an
            ``int``, or a non-None ``weight``/``weights`` is supplied.
        TypeError: Raised by ``_compute_plot_data`` when ``data`` has no
            numeric columns.
    """
    self.bins = bins
    self.label = kwargs.get("label", None)
    self.by = kwargs.pop("by", None)
    self.kwargs = kwargs

    if self.by is not None:
        raise NotImplementedError(
            f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}"
        )
    if not isinstance(self.bins, int):
        raise NotImplementedError(
            f"Only integer values are supported for the `bins` argument. {constants.FEEDBACK_LINK}"
        )
    # pandas/matplotlib spell this option `weights`. The historical
    # `weight` spelling is still rejected so existing callers see the same
    # error as before.
    for option in ("weight", "weights"):
        if kwargs.get(option, None) is not None:
            raise NotImplementedError(
                f"Non-none `{option}` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )
    # `generate` always passes its own `weights=` next to `**self.kwargs`;
    # an explicit `weights=None` left in place would trigger a
    # duplicate-keyword TypeError there. `self.kwargs` is this same dict.
    kwargs.pop("weights", None)

    self.data = self._compute_plot_data(data)

def generate(self) -> None:
"""
Calculates weighted histograms through BigQuery and plots them through pandas
native histogram plot.
"""
hist_bars = self._calculate_hist_bars(self.data, self.bins)
bin_edges = self._calculate_bin_edges(
hist_bars, self.bins, self.kwargs.get("range", None)
)

weights = {
col_name: hist_bar.values for col_name, hist_bar in hist_bars.items()
}
hist_x = {
col_name: pd.Series(
(
hist_bar.index.get_level_values("left_exclusive")
+ hist_bar.index.get_level_values("right_inclusive")
)
/ 2.0
)
for col_name, hist_bar in hist_bars.items()
}

# Align DataFrames for plotting despite potential differences in column
# lengths, filling shorter columns with zeros.
hist_x_pd = pd.DataFrame(
list(itertools.zip_longest(*hist_x.values())), columns=list(hist_x.keys())
).sort_index(axis=1)[self.data.columns.values]
weights_pd = pd.DataFrame(
list(itertools.zip_longest(*weights.values())), columns=list(weights.keys())
).sort_index(axis=1)[self.data.columns.values]

# Prevents pandas from dropping NA values and causing length mismatches by
# filling them with zeros.
hist_x_pd.fillna(0, inplace=True)
weights_pd.fillna(0, inplace=True)

self.axes = hist_x_pd.plot.hist(
bins=bin_edges,
weights=np.array(weights_pd.values),
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this method still not working on pandas 1.5 or did the fillna solve that?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The fillna solves the issue in pandas 1.5. I can add a comment above the call.

**self.kwargs,
) # type: ignore

def _compute_plot_data(self, data):
    """
    Reduce the input to its numeric columns so it can be plotted.

    A Series is first converted with ``to_frame``. The column is named
    after ``self.label`` when a label was given, an empty string is used
    when neither a label nor a Series name exists, and otherwise
    ``to_frame()`` is called without a name.

    Raises:
        TypeError: If the input data contains no numeric columns.
    """
    # Deferred import: importing at the top of the file causes a circular
    # import.
    import bigframes.series as series

    if isinstance(data, series.Series):
        column_name = self.label
        if column_name is None and data.name is None:
            column_name = ""
        data = (
            data.to_frame()
            if column_name is None
            else data.to_frame(name=column_name)
        )

    # TODO(chelsealin): Support timestamp/date types here.
    numeric_only = data.select_dtypes(include=["number"])

    try:
        nothing_to_plot = numeric_only.columns.empty
    except AttributeError:
        # Fallback for objects whose ``columns`` has no ``empty`` attribute.
        nothing_to_plot = not len(numeric_only)
    if nothing_to_plot:
        raise TypeError("no numeric data to plot")

    return numeric_only

@staticmethod
def _calculate_hist_bars(data, bins):
    """
    Compute the histogram bars of every column of a BigFrames DataFrame.

    Returns a dictionary keyed by column name. Each value is a pandas
    Series holding the bin heights, indexed by a multi-index whose levels
    'left_exclusive' and 'right_inclusive' are the bin edges, sorted by
    'left_exclusive'.
    """
    import bigframes.pandas as bpd

    def _bars_for(column_name):
        # Bin the column, unpack the interval struct into its edge fields,
        # count rows per bin, then bring the counts back as pandas.
        binned = bpd.cut(data[column_name], bins=bins, labels=None)
        counts = binned.struct.explode().value_counts()
        return counts.to_pandas().sort_index(level="left_exclusive")

    # TODO: Optimize this by batching multiple jobs into one.
    return {column_name: _bars_for(column_name) for column_name in data.columns}

@staticmethod
def _calculate_bin_edges(hist_bars, bins, range):
    """
    Derive one shared set of bin edges from all histogram bars.

    The 'left_exclusive' and 'right_inclusive' index levels of every entry
    in ``hist_bars`` are unioned, and the edges are then taken from
    ``np.histogram`` over that union with the given ``bins`` and ``range``.

    Returns:
        The array of bin edges, or None when ``hist_bars`` is empty.
    """
    combined_edges = None
    for bars in hist_bars.values():
        bar_index = bars.index
        own_edges = bar_index.get_level_values("left_exclusive").union(
            bar_index.get_level_values("right_inclusive")
        )
        combined_edges = (
            own_edges
            if combined_edges is None
            else own_edges.union(combined_edges)
        )

    if combined_edges is None:
        return None

    # np.histogram returns (counts, edges); only the edges are needed.
    return np.histogram(combined_edges, bins=bins, range=range)[1]
34 changes: 34 additions & 0 deletions bigframes/operations/plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Optional, Sequence

import bigframes.constants as constants
import bigframes.operations._matplotlib as bfplt
import third_party.bigframes_vendored.pandas.plotting._core as vendordt


class PlotAccessor:
    # The class documentation is copied from the vendored pandas accessor.
    __doc__ = vendordt.PlotAccessor.__doc__

    def __init__(self, data) -> None:
        # Object this accessor plots; a copy of it is taken on every
        # plotting call.
        self._parent = data

    def hist(self, by: Optional[Sequence[str]] = None, bins: int = 10, **kwargs):
        """Draw a histogram of the parent object's data with matplotlib.

        Args:
            by: Forwarded to the plotting backend as ``by``.
            bins: Forwarded to the plotting backend as ``bins``.
            **kwargs: Extra options forwarded to the plotting backend. A
                ``backend`` entry is consumed here and not forwarded.

        Returns:
            Whatever the matplotlib backend's ``plot`` function returns.

        Raises:
            NotImplementedError: If ``backend`` names anything other than
                matplotlib.
        """
        backend = kwargs.pop("backend", None)
        # matplotlib is the one supported backend, so naming it explicitly
        # is accepted as well as leaving ``backend`` unset.
        if backend is not None and backend != "matplotlib":
            raise NotImplementedError(
                f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}"
            )
        # Calls matplotlib backend to plot the data.
        return bfplt.plot(self._parent.copy(), kind="hist", by=by, bins=bins, **kwargs)
5 changes: 5 additions & 0 deletions bigframes/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import bigframes.operations.aggregations as agg_ops
import bigframes.operations.base
import bigframes.operations.datetimes as dt
import bigframes.operations.plotting as plotting
import bigframes.operations.strings as strings
import bigframes.operations.structs as structs
import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series
Expand Down Expand Up @@ -1557,6 +1558,10 @@ def __array_ufunc__(
def str(self) -> strings.StringMethods:
return strings.StringMethods(self._block)

@property
def plot(self):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! Added PlotAccessor to series.rst and frame.rst.
As for the docstring for def plot, it is defined in third_party/bigframes_vendored/pandas/core/series.py

return plotting.PlotAccessor(self)

def _slice(
self,
start: typing.Optional[int] = None,
Expand Down
11 changes: 11 additions & 0 deletions docs/reference/bigframes.pandas/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,14 @@ DataFrame
:members:
:inherited-members:
:undoc-members:

Accessors
---------

Plotting handling
^^^^^^^^^^^^^^^^^

.. automodule:: bigframes.operations.plotting
:members:
:inherited-members:
:undoc-members:
9 changes: 9 additions & 0 deletions docs/reference/bigframes.pandas/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,12 @@ Struct handling
:members:
:inherited-members:
:undoc-members:

Plotting handling
^^^^^^^^^^^^^^^^^

.. automodule:: bigframes.operations.plotting
:members:
:inherited-members:
:undoc-members:
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for checking! Fixed.

:noindex:
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
"tabulate >= 0.9",
"ipywidgets >=7.7.1",
"humanize >= 4.6.0",
"matplotlib >= 3.7.1",
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add this to https://github.com/googleapis/python-bigquery-dataframes/blob/main/testing/constraints-3.10.txt too so that we know we always test against our advertised minimum version in at least one test session.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

matplotlib==3.7.1 was added to this file earlier. The system-3.10 tests are not triggered according to

SYSTEM_TEST_PYTHON_VERSIONS = ["3.9", "3.11"]

If so, should I also add matplotlib to constraints-3.9.txt?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, you're right. Yes, add it to constraints-3.9.txt. I was looking at the list of files sorted alphabetically.

]
extras = {
# Optional test dependencies packages. If they're missed, may skip some tests.
Expand Down
1 change: 1 addition & 0 deletions testing/constraints-3.9.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,6 @@ sqlglot==20.8.0
tabulate==0.9
ipywidgets==7.7.1
humanize==4.6.0
matplotlib==3.7.1
# extras
pandas-gbq==0.19.0
Loading