Skip to content

Commit a4205f8

Browse files
authored
feat: add describe() method to Series (#1827)
* feat: add describe() method to Series * fix test * update docs * fix tests
1 parent 7bc7f36 commit a4205f8

File tree

9 files changed

+465
-261
lines changed

9 files changed

+465
-261
lines changed

bigframes/dataframe.py

Lines changed: 2 additions & 96 deletions
Original file line numberDiff line numberDiff line change
@@ -562,17 +562,6 @@ def select_dtypes(self, include=None, exclude=None) -> DataFrame:
562562
)
563563
return DataFrame(self._block.select_columns(selected_columns))
564564

565-
def _select_exact_dtypes(
566-
self, dtypes: Sequence[bigframes.dtypes.Dtype]
567-
) -> DataFrame:
568-
"""Selects columns without considering inheritance relationships."""
569-
columns = [
570-
col_id
571-
for col_id, dtype in zip(self._block.value_columns, self._block.dtypes)
572-
if dtype in dtypes
573-
]
574-
return DataFrame(self._block.select_columns(columns))
575-
576565
def _set_internal_query_job(self, query_job: Optional[bigquery.QueryJob]):
577566
self._query_job = query_job
578567

@@ -3079,92 +3068,9 @@ def melt(
30793068
)
30803069

30813070
def describe(self, include: None | Literal["all"] = None) -> DataFrame:
3082-
if include is None:
3083-
numeric_df = self._select_exact_dtypes(
3084-
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
3085-
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
3086-
)
3087-
if len(numeric_df.columns) == 0:
3088-
# Describe eligible non-numeric columns
3089-
return self._describe_non_numeric()
3090-
3091-
# Otherwise, only describe numeric columns
3092-
return self._describe_numeric()
3071+
from bigframes.pandas.core.methods import describe
30933072

3094-
elif include == "all":
3095-
numeric_result = self._describe_numeric()
3096-
non_numeric_result = self._describe_non_numeric()
3097-
3098-
if len(numeric_result.columns) == 0:
3099-
return non_numeric_result
3100-
elif len(non_numeric_result.columns) == 0:
3101-
return numeric_result
3102-
else:
3103-
import bigframes.core.reshape.api as rs
3104-
3105-
# Use reindex after join to preserve the original column order.
3106-
return rs.concat(
3107-
[non_numeric_result, numeric_result], axis=1
3108-
)._reindex_columns(self.columns)
3109-
3110-
else:
3111-
raise ValueError(f"Unsupported include type: {include}")
3112-
3113-
def _describe_numeric(self) -> DataFrame:
3114-
number_df_result = typing.cast(
3115-
DataFrame,
3116-
self._select_exact_dtypes(
3117-
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
3118-
).agg(
3119-
[
3120-
"count",
3121-
"mean",
3122-
"std",
3123-
"min",
3124-
"25%",
3125-
"50%",
3126-
"75%",
3127-
"max",
3128-
]
3129-
),
3130-
)
3131-
temporal_df_result = typing.cast(
3132-
DataFrame,
3133-
self._select_exact_dtypes(
3134-
bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
3135-
).agg(["count"]),
3136-
)
3137-
3138-
if len(number_df_result.columns) == 0:
3139-
return temporal_df_result
3140-
elif len(temporal_df_result.columns) == 0:
3141-
return number_df_result
3142-
else:
3143-
import bigframes.core.reshape.api as rs
3144-
3145-
original_columns = self._select_exact_dtypes(
3146-
bigframes.dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
3147-
+ bigframes.dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES
3148-
).columns
3149-
3150-
# Use reindex after join to preserve the original column order.
3151-
return rs.concat(
3152-
[number_df_result, temporal_df_result],
3153-
axis=1,
3154-
)._reindex_columns(original_columns)
3155-
3156-
def _describe_non_numeric(self) -> DataFrame:
3157-
return typing.cast(
3158-
DataFrame,
3159-
self._select_exact_dtypes(
3160-
[
3161-
bigframes.dtypes.STRING_DTYPE,
3162-
bigframes.dtypes.BOOL_DTYPE,
3163-
bigframes.dtypes.BYTES_DTYPE,
3164-
bigframes.dtypes.TIME_DTYPE,
3165-
]
3166-
).agg(["count", "nunique"]),
3167-
)
3073+
return typing.cast(DataFrame, describe.describe(self, include))
31683074

31693075
def skew(self, *, numeric_only: bool = False):
31703076
if not numeric_only:
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
from __future__ import annotations
16+
17+
import typing
18+
19+
from bigframes import dataframe, dtypes, series
20+
from bigframes.core.reshape import api as rs
21+
22+
23+
def describe(
    input: dataframe.DataFrame | series.Series,
    include: None | typing.Literal["all"],
) -> dataframe.DataFrame | series.Series:
    """Compute summary statistics for a DataFrame or a Series.

    Args:
        input: The DataFrame or Series to summarize.
        include: ``None`` to describe only the numeric/temporal columns
            (falling back to the eligible non-numeric columns when there
            are none), or ``"all"`` to describe both groups together.

    Returns:
        A DataFrame when ``input`` is a DataFrame, a Series when ``input``
        is a Series.

    Raises:
        TypeError: If ``input`` is neither a Series nor a DataFrame.
        ValueError: If ``include`` is neither ``None`` nor ``"all"``.
    """
    if isinstance(input, series.Series):
        # A series is described through its frame form; the block of the
        # resulting frame is then wrapped back into a series.
        frame_result = describe(input.to_frame(), include)
        return series.Series(frame_result._block)

    if not isinstance(input, dataframe.DataFrame):
        raise TypeError(f"Unsupported type: {type(input)}")

    if include is None:
        numeric_like = _select_dtypes(
            input,
            dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
            + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES,
        )
        if len(numeric_like.columns) == 0:
            # Nothing numeric or temporal: describe the eligible
            # non-numeric columns instead.
            return _describe_non_numeric(input)
        return _describe_numeric(input)

    if include == "all":
        numeric_result = _describe_numeric(input)
        non_numeric_result = _describe_non_numeric(input)

        if len(numeric_result.columns) == 0:
            return non_numeric_result
        if len(non_numeric_result.columns) == 0:
            return numeric_result

        # Reindex after the join so the columns keep the input's order.
        combined = rs.concat([non_numeric_result, numeric_result], axis=1)
        return combined._reindex_columns(input.columns)

    raise ValueError(f"Unsupported include type: {include}")
62+
63+
64+
def _describe_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
    """Describe the numeric and temporal columns of ``df``.

    Columns whose dtype is in ``NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE`` are
    aggregated with count, mean, std, min, the 25%/50%/75% quantiles and
    max. Columns whose dtype is in ``TEMPORAL_NUMERIC_BIGFRAMES_TYPES`` are
    aggregated with count only.

    Args:
        df: The DataFrame to summarize.

    Returns:
        The aggregated DataFrame. When both groups are present, the two
        results are concatenated column-wise and reindexed to the order in
        which the selected columns appear in ``df``.
    """
    number_df_result = typing.cast(
        dataframe.DataFrame,
        _select_dtypes(df, dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE).agg(
            [
                "count",
                "mean",
                "std",
                "min",
                "25%",
                "50%",
                "75%",
                "max",
            ]
        ),
    )
    temporal_df_result = typing.cast(
        dataframe.DataFrame,
        _select_dtypes(df, dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES).agg(["count"]),
    )

    if len(number_df_result.columns) == 0:
        return temporal_df_result
    if len(temporal_df_result.columns) == 0:
        return number_df_result

    # `rs` is the module-level reshape API import; no function-local
    # re-import is needed here.
    original_columns = _select_dtypes(
        df,
        dtypes.NUMERIC_BIGFRAMES_TYPES_RESTRICTIVE
        + dtypes.TEMPORAL_NUMERIC_BIGFRAMES_TYPES,
    ).columns

    # Use reindex after join to preserve the original column order.
    return rs.concat(
        [number_df_result, temporal_df_result],
        axis=1,
    )._reindex_columns(original_columns)
103+
104+
105+
def _describe_non_numeric(df: dataframe.DataFrame) -> dataframe.DataFrame:
    """Describe the string, bool, bytes and time columns of ``df``.

    Args:
        df: The DataFrame to summarize.

    Returns:
        A DataFrame holding the ``count`` and ``nunique`` aggregations of
        the selected columns.
    """
    eligible_dtypes = [
        dtypes.STRING_DTYPE,
        dtypes.BOOL_DTYPE,
        dtypes.BYTES_DTYPE,
        dtypes.TIME_DTYPE,
    ]
    eligible_df = _select_dtypes(df, eligible_dtypes)
    summary = eligible_df.agg(["count", "nunique"])
    return typing.cast(dataframe.DataFrame, summary)
118+
119+
120+
def _select_dtypes(
    df: dataframe.DataFrame, dtypes: typing.Sequence[dtypes.Dtype]
) -> dataframe.DataFrame:
    """Select the columns of ``df`` whose dtype is a member of ``dtypes``.

    Membership is tested with ``in``, so inheritance relationships between
    dtypes are not considered.

    Args:
        df: The DataFrame to select columns from.
        dtypes: The dtypes to keep.

    Returns:
        A new DataFrame containing only the matching value columns.
    """
    block = df._block
    dtype_by_column = zip(block.value_columns, block.dtypes)
    selected_ids = [
        column_id
        for column_id, column_dtype in dtype_by_column
        if column_dtype in dtypes
    ]
    return dataframe.DataFrame(block.select_columns(selected_ids))

bigframes/series.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1293,6 +1293,11 @@ def agg(self, func: str | typing.Sequence[str]) -> scalars.Scalar | Series:
12931293
aggregate = agg
12941294
aggregate.__doc__ = inspect.getdoc(vendored_pandas_series.Series.agg)
12951295

1296+
def describe(self) -> Series:
    """Return descriptive statistics for this series.

    Delegates to the shared ``describe`` implementation with
    ``include="all"``.
    """
    # Imported at call time rather than at module import
    # (presumably to avoid a circular import -- TODO confirm).
    from bigframes.pandas.core.methods import describe as describe_impl

    summary = describe_impl.describe(self, include="all")
    return cast(Series, summary)
1300+
12961301
def skew(self):
12971302
count = self.count()
12981303
if count < 3:
Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Copyright 2025 Google LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.

0 commit comments

Comments
 (0)