Skip to content

Commit 06fb1e6

Browse files
committed
feat: (Series|Dataframe).plot.hist()
1 parent 38bd2ba commit 06fb1e6

File tree

11 files changed

+442
-0
lines changed

11 files changed

+442
-0
lines changed

bigframes/dataframe.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,7 @@
5959
import bigframes.formatting_helpers as formatter
6060
import bigframes.operations as ops
6161
import bigframes.operations.aggregations as agg_ops
62+
import bigframes.operations.plot as plot
6263
import bigframes.series
6364
import bigframes.series as bf_series
6465
import bigframes.session._io.bigquery
@@ -3190,4 +3191,8 @@ def get_right_id(id):
31903191

31913192
return result
31923193

3194+
@property
def plot(self):
    """Return a ``plot.PlotAccessor`` bound to this object."""
    # ``plot`` here resolves to the imported ``bigframes.operations.plot``
    # module, not to this property: class attributes are not in scope
    # inside a method body.
    accessor = plot.PlotAccessor(self)
    return accessor
3197+
31933198
__matmul__ = dot
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from bigframes.operations._matplotlib.core import MPLPlot
16+
from bigframes.operations._matplotlib.hist import HistPlot
17+
18+
# Registry mapping each supported ``kind`` string to the class that draws it.
PLOT_CLASSES: dict[str, type[MPLPlot]] = {"hist": HistPlot}


def plot(data, kind, **kwargs):
    """Build the plot registered under ``kind`` for ``data`` and return its result.

    Args:
        data: The object to plot; passed as the first argument to the plot class.
        kind: Key into ``PLOT_CLASSES`` selecting the plot class.
        **kwargs: Forwarded unchanged to the plot class constructor.

    Returns:
        The ``result`` attribute of the plot object after ``generate()`` and
        ``draw()`` have been called on it.

    Raises:
        KeyError: If ``kind`` is not a key of ``PLOT_CLASSES``.
    """
    plot_cls = PLOT_CLASSES[kind]
    plotter = plot_cls(data, **kwargs)
    plotter.generate()
    plotter.draw()
    return plotter.result
28+
29+
30+
__all__ = ["plot"]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from abc import ABC, abstractmethod
16+
17+
import matplotlib.pyplot as plt
18+
19+
20+
class MPLPlot(ABC):
    """Abstract base for plot objects.

    Subclasses implement ``generate``; ``result`` reads ``self.axes``, which
    this base class never assigns, so ``generate`` is expected to set it.
    """

    @abstractmethod
    def generate(self):
        """Produce the plot. Must be overridden by subclasses."""

    def draw(self) -> None:
        """Ask pyplot to redraw if it is in interactive mode."""
        plt.draw_if_interactive()

    @property
    def result(self):
        """Return the ``axes`` attribute stored on this plot object."""
        return self.axes
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
import itertools
16+
from typing import Literal
17+
18+
import numpy as np
19+
import pandas as pd
20+
21+
import bigframes.constants as constants
22+
from bigframes.operations._matplotlib.core import MPLPlot
23+
24+
25+
class HistPlot(MPLPlot):
    """Histogram plot.

    Per-column bin counts are computed with ``bigframes.pandas.cut`` and
    downloaded with ``to_pandas()``; the bin mid-points, weighted by those
    counts, are then drawn through pandas' ``DataFrame.plot.hist``.
    """

    @property
    def _kind(self) -> Literal["hist"]:
        return "hist"

    def __init__(
        self,
        data,
        bins: int = 10,
        **kwargs,
    ) -> None:
        """Validate the options and prepare the numeric data to plot.

        Args:
            data: A ``bigframes.series.Series`` or an object exposing
                ``select_dtypes`` (see ``_compute_plot_data``).
            bins: Passed as ``bins`` to both ``bigframes.pandas.cut`` and
                ``numpy.histogram``.
            **kwargs: ``by`` is removed and must be None. ``label`` is read to
                name a Series column. ``weight``/``weights`` must be None.
                Everything left is forwarded to ``DataFrame.plot.hist``.

        Raises:
            NotImplementedError: If ``by``, ``weight`` or ``weights`` is not None.
            TypeError: If ``data`` has no numeric columns.
        """
        self.bins = bins
        self.label = kwargs.get("label", None)
        self.by = kwargs.pop("by", None)
        self.kwargs = kwargs

        # Reject unsupported options before doing any work on the data.
        if self.by is not None:
            raise NotImplementedError(
                f"Non-none `by` argument is not yet supported. {constants.FEEDBACK_LINK}"
            )
        # ``weights`` is the keyword generate() itself passes to the pandas
        # call, so a caller-supplied value would otherwise fail there with
        # "got multiple values for keyword argument 'weights'".
        for unsupported in ("weight", "weights"):
            if kwargs.get(unsupported, None) is not None:
                raise NotImplementedError(
                    f"Non-none `{unsupported}` argument is not yet supported. {constants.FEEDBACK_LINK}"
                )
        # An explicit ``weights=None`` would still collide in generate(); drop it.
        kwargs.pop("weights", None)

        self.data = self._compute_plot_data(data)

    def generate(self) -> None:
        """Compute the histogram bars and draw them, storing the result in ``self.axes``."""
        hist_bars = self._calculate_hist_bar(self.data, self.bins)

        bin_edges = []
        hist_x = {}
        weights = {}
        for col_name, hist_bar in hist_bars.items():
            left = hist_bar.index.get_level_values("left_exclusive")
            right = hist_bar.index.get_level_values("right_inclusive")

            # Each bar is represented by its mid-point, weighted by its count.
            hist_x[col_name] = pd.Series((left + right) / 2.0)
            weights[col_name] = hist_bar.values
            # Accumulate the union of every column's bin edges.
            bin_edges = left.union(right).union(bin_edges)

        # Only the edges returned by numpy are used; the counts are discarded.
        _, bins = np.histogram(
            bin_edges, bins=self.bins, range=self.kwargs.get("range", None)
        )

        # Fills with NA values when items have different lengths.
        ordered_cols = list(self.data.columns)
        hist_x_pd = pd.DataFrame(
            list(itertools.zip_longest(*hist_x.values())), columns=hist_x.keys()
        ).sort_index(axis=1)
        weights_pd = pd.DataFrame(
            list(itertools.zip_longest(*weights.values())), columns=weights.keys()
        ).sort_index(axis=1)

        self.axes = hist_x_pd[ordered_cols].plot.hist(
            bins=bins, weights=weights_pd[ordered_cols].values, **self.kwargs
        )

    def _compute_plot_data(self, data):
        """Return the numeric columns of ``data``, converting a Series to a frame.

        A Series becomes a one-column frame named by ``self.label``; when both
        the label and the Series name are None, the column is named ``""``.

        Raises:
            TypeError: If no numeric data remains after ``select_dtypes``.
        """
        # Importing at the top of the file causes a circular import.
        import bigframes.series as series

        if isinstance(data, series.Series):
            label = self.label
            if label is None and data.name is None:
                label = ""
            if label is None:
                data = data.to_frame()
            else:
                data = data.to_frame(name=label)

        # TODO(chelsealin): Support timestamp/date types here.
        include_type = ["number"]
        numeric_data = data.select_dtypes(include=include_type)
        try:
            is_empty = numeric_data.columns.empty
        except AttributeError:
            # Fall back to the length when ``columns`` has no ``empty`` attribute.
            is_empty = not len(numeric_data)

        if is_empty:
            raise TypeError("no numeric data to plot")

        return numeric_data

    @staticmethod
    def _calculate_hist_bar(data, bins):
        """Return a dict mapping each column of ``data`` to its bin counts.

        Each value is the ``to_pandas()`` result of ``value_counts()`` over the
        exploded ``cut`` intervals, sorted on the ``left_exclusive`` index level.
        """
        import bigframes.pandas as bpd

        # TODO: Optimize this by batching multiple jobs into one.
        hist_bar = {}
        for col in data.columns:
            cutted_data = bpd.cut(data[col], bins=bins, labels=None)
            hist_bar[col] = (
                cutted_data.struct.explode()
                .value_counts()
                .to_pandas()
                .sort_index(level="left_exclusive")
            )
        return hist_bar

bigframes/operations/plot.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
# Copyright 2023 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from typing import Sequence
16+
17+
import bigframes.constants as constants
18+
import bigframes.operations._matplotlib as plotbackend
19+
import third_party.bigframes_vendored.pandas.plotting._core as vendordt
20+
21+
22+
class PlotAccessor:
    # The class docstring is reused from the vendored pandas PlotAccessor.
    __doc__ = vendordt.PlotAccessor.__doc__

    def __init__(self, data) -> None:
        """Store ``data`` as the object whose contents will be plotted."""
        self._parent = data

    # The ``by`` annotation is quoted on purpose: an unquoted
    # ``Sequence[str] | None`` is evaluated when this ``def`` runs and raises
    # TypeError on Python versions earlier than 3.10.
    def hist(self, by: "Sequence[str] | None" = None, bins: int = 10, **kwargs):
        """Plot a histogram of the parent object through the plotting backend.

        Args:
            by: Forwarded to the backend as the ``by`` keyword.
            bins: Forwarded to the backend as the ``bins`` keyword.
            **kwargs: Extra options forwarded to the backend. ``backend`` is
                removed first and must be absent or None.

        Returns:
            Whatever ``plotbackend.plot`` returns for ``kind="hist"``.

        Raises:
            NotImplementedError: If a non-None ``backend`` is supplied.
        """
        if kwargs.pop("backend", None) is not None:
            raise NotImplementedError(
                f"Only support matplotlib backend for now. {constants.FEEDBACK_LINK}"
            )
        kwargs["by"] = by
        kwargs["bins"] = bins
        # The backend receives a copy of the parent, not the parent itself.
        return plotbackend.plot(self._parent.copy(), kind="hist", **kwargs)

bigframes/series.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import bigframes.operations.aggregations as agg_ops
5151
import bigframes.operations.base
5252
import bigframes.operations.datetimes as dt
53+
import bigframes.operations.plot as plot
5354
import bigframes.operations.strings as strings
5455
import bigframes.operations.structs as structs
5556
import third_party.bigframes_vendored.pandas.core.series as vendored_pandas_series
@@ -1557,6 +1558,10 @@ def __array_ufunc__(
15571558
def str(self) -> strings.StringMethods:
15581559
return strings.StringMethods(self._block)
15591560

1561+
@property
def plot(self):
    """Return a ``plot.PlotAccessor`` bound to this object."""
    # ``plot`` here resolves to the imported ``bigframes.operations.plot``
    # module, not to this property: class attributes are not in scope
    # inside a method body.
    accessor = plot.PlotAccessor(self)
    return accessor
1564+
15601565
def _slice(
15611566
self,
15621567
start: typing.Optional[int] = None,

setup.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@
5858
"tabulate >= 0.9",
5959
"ipywidgets >=7.7.1",
6060
"humanize >= 4.6.0",
61+
"matplotlib >= 3.7.1",
6162
]
6263
extras = {
6364
# Optional test dependencies packages. If they're missed, may skip some tests.

0 commit comments

Comments
 (0)