Commit 89b9503

chore: sync changes from internal repo (#10)
feat: support `Series.corr`
fix: raise AttributeError for unimplemented pandas methods
feat: support `DataFrame.stack`
feat: support `np.arcsin`, `np.arccos`, `np.arctan`, `np.sinh`, `np.cosh`, `np.tanh`, `np.arcsinh`, `np.arccosh`, `np.arctanh`, `np.exp` with Series argument
fix: align column names with pandas in `DataFrame.agg` results
docs: set `options.bigquery.project` in sample code
chore: unit test internal `get_standardized_ids` method
fix: include survey link in abstract `NotImplementedError` exception messages
perf: lazily instantiate client library objects
fix: allow (but still not recommended) `ORDER BY` in `read_gbq` input when an `index_col` is defined
feat: support `read_json` with `engine=bigquery` for newline-delimited JSON files
chore: remove unneeded `types-retry` reference
feat: support `np.sin`, `np.cos`, `np.tan`, `np.log`, `np.log10`, `np.sqrt`, `np.abs` with Series argument
fix: label temp table creation jobs with `source=bigquery-dataframes-temp` label
fix: support spaces in column names in `DataFrame` initializer
chore: fix permissions on publish docs script
feat: support `df[my_column] = [a python list]`
feat: add `components_`, `explained_variance_`, and `explained_variance_ratio_` properties to `bigframes.ml.decomposition.PCA`
chore: add execute permissions on publish docs script
docs: fix link to GitHub
chore: fix docs build
fix: check for IAM role on the BigQuery connection when initializing a `remote_function`
chore: revert pin to maximum pytest-retry plugin version in tests
1 parent a32b747 commit 89b9503
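A hedged sketch of a few of these features as a user might call them; the project id, public table, and column names below are illustrative assumptions, not part of this commit:

import numpy as np

import bigframes.pandas as bpd

# Placeholder project id -- substitute your own GCP project.
bpd.options.bigquery.project = "your-gcp-project"

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# feat: support `Series.corr` -- Pearson correlation, computed in BigQuery.
r = df["culmen_length_mm"].corr(df["culmen_depth_mm"])

# feat: support `np.sin` (and other ufuncs) with a Series argument.
sines = np.sin(df["culmen_length_mm"])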


70 files changed: +3599, -774 lines changed

.kokoro/docs/common.cfg

Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ env_vars: {
 }
 env_vars: {
     key: "TRAMPOLINE_BUILD_FILE"
-    value: "git/bigframes/.kokoro/publish-docs.sh"
+    value: ".kokoro/publish-docs.sh"
 }

 env_vars: {

.kokoro/publish-docs.sh

File mode changed: 100644 -> 100755.

README.rst

Lines changed: 6 additions & 3 deletions

@@ -41,6 +41,7 @@ method accepts either a fully-qualified table ID or a SQL query.
 
     import bigframes.pandas as bpd
 
+    bpd.options.bigquery.project = your_gcp_project_id
     df1 = bpd.read_gbq("project.dataset.table")
     df2 = bpd.read_gbq("SELECT a, b, c, FROM `project.dataset.table`")
 
@@ -260,7 +261,7 @@ To view and manage Cloud Functions functions, use the
 `Functions <https://console.cloud.google.com/functions/list?env=gen2>`_
 page and use the project picker to select the project in which you
 created the function. For easy identification, the names of the functions
-created by BigQuery DataFrames are prefixed by ``bigframes-``.
+created by BigQuery DataFrames are prefixed by ``bigframes``.
 
 **Requirements**
 
@@ -283,7 +284,9 @@ following IAM roles:
 * BigQuery Data Editor (roles/bigquery.dataEditor)
 * BigQuery Connection Admin (roles/bigquery.connectionAdmin)
 * Cloud Functions Developer (roles/cloudfunctions.developer)
-* Service Account User (roles/iam.serviceAccountUser)
+* Service Account User (roles/iam.serviceAccountUser) on the
+  `service account <https://cloud.google.com/functions/docs/reference/iam/roles#additional-configuration>`_
+  ``PROJECT_NUMBER-compute@developer.gserviceaccount.com``
 * Storage Object Viewer (roles/storage.objectViewer)
 * Project IAM Admin (roles/resourcemanager.projectIamAdmin)
 
@@ -330,7 +333,7 @@ Data processing location
 
 BigQuery DataFrames is designed for scale, which it achieves by keeping data
 and processing on the BigQuery service. However, you can bring data into the
-memory of your client machine by calling ``.execute()`` on a DataFrame or Series
+memory of your client machine by calling ``.to_pandas()`` on a DataFrame or Series
 object. If you choose to do this, the memory limitation of your client machine
 applies.
 
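Following the README's switch from `.execute()` to `.to_pandas()`, a minimal sketch of that workflow (project id and table are placeholders):

import bigframes.pandas as bpd

bpd.options.bigquery.project = "your-gcp-project"  # placeholder

# Computation stays on the BigQuery service while df is built up.
df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# .to_pandas() materializes the result into client memory as an ordinary
# pandas DataFrame, so the client machine's memory limits apply from here.
local_df = df.to_pandas()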

bigframes/constants.py

Lines changed: 2 additions & 0 deletions

@@ -21,3 +21,5 @@
     "Share your usecase with the BigQuery DataFrames team at the "
     "https://bit.ly/bigframes-feedback survey."
 )
+
+ABSTRACT_METHOD_ERROR_MESSAGE = f"Abstract method. You have likely encountered a bug. Please share this stacktrace and how you reached it with the BigQuery DataFrames team. {FEEDBACK_LINK}"
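This constant backs the "include survey link in abstract `NotImplementedError` exception messages" fix. A minimal sketch of the intended pattern; the class and method names are hypothetical, not taken from this commit:

import abc

from bigframes.constants import ABSTRACT_METHOD_ERROR_MESSAGE


class SomeBase(abc.ABC):  # hypothetical base class, for illustration only
    @abc.abstractmethod
    def _compile(self):
        # Reaching this line means a subclass failed to override _compile;
        # the message includes the feedback-survey link for bug reports.
        raise NotImplementedError(ABSTRACT_METHOD_ERROR_MESSAGE)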

bigframes/core/__init__.py

Lines changed: 86 additions & 16 deletions

@@ -35,6 +35,7 @@
     reencode_order_string,
     StringEncoding,
 )
+import bigframes.core.utils as utils
 import bigframes.dtypes
 import bigframes.operations as ops
 import bigframes.operations.aggregations as agg_ops
@@ -562,6 +563,36 @@ def aggregate(
             ordering=ordering,
         )
 
+    def corr_aggregate(
+        self, corr_aggregations: typing.Sequence[typing.Tuple[str, str, str]]
+    ) -> ArrayValue:
+        """
+        Get correlations between each left_column_id and right_column_id, stored in the respective output_column_id.
+        This uses BigQuery's CORR under the hood, and thus only Pearson's method is used.
+        Arguments:
+            corr_aggregations: left_column_id, right_column_id, output_column_id tuples
+        """
+        table = self.to_ibis_expr(ordering_mode="unordered")
+        stats = {
+            col_out: table[col_left].corr(table[col_right], how="pop")
+            for col_left, col_right, col_out in corr_aggregations
+        }
+        aggregates = {**stats, ORDER_ID_COLUMN: ibis_types.literal(0)}
+        result = table.aggregate(**aggregates)
+        # Ordering is irrelevant for single-row output, but set ordering id regardless as other ops (join etc.) expect it.
+        ordering = ExpressionOrdering(
+            ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
+            total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
+            integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
+        )
+        return ArrayValue(
+            self._session,
+            result,
+            columns=[result[col_id] for col_id in [*stats.keys()]],
+            hidden_ordering_columns=[result[ORDER_ID_COLUMN]],
+            ordering=ordering,
+        )
+
     def project_window_op(
         self,
         column_name: str,
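The aggregation that the new `corr_aggregate` constructs can be previewed with plain ibis outside BigQuery. A minimal sketch, assuming `ibis-framework` with its default DuckDB backend is installed; note the commit passes `how="pop"` (population correlation) to BigQuery's CORR, while the sample variant is used here for backend portability:

import ibis

t = ibis.memtable({"x": [1.0, 2.0, 3.0, 4.0], "y": [2.0, 4.0, 6.0, 8.0]})

# Same shape as the aggregation corr_aggregate builds: a single-row
# Pearson correlation per (left, right, output) triple.
expr = t.aggregate(xy_corr=t["x"].corr(t["y"], how="sample"))
print(expr.execute())  # xy_corr == 1.0 for perfectly linear data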
@@ -852,52 +883,91 @@ def _ibis_window_from_spec(self, window_spec: WindowSpec, allow_ties: bool = Fal
             group_by=group_by,
         )
 
-    def unpivot_single_row(
+    def unpivot(
         self,
         row_labels: typing.Sequence[typing.Hashable],
-        unpivot_columns: typing.Sequence[typing.Tuple[str, typing.Sequence[str]]],
+        unpivot_columns: typing.Sequence[
+            typing.Tuple[str, typing.Sequence[typing.Optional[str]]]
+        ],
         *,
+        passthrough_columns: typing.Sequence[str] = (),
         index_col_id: str = "index",
-        dtype=pandas.Float64Dtype(),
+        dtype: typing.Union[
+            bigframes.dtypes.Dtype, typing.Sequence[bigframes.dtypes.Dtype]
+        ] = pandas.Float64Dtype(),
     ) -> ArrayValue:
-        """Unpivot a single row."""
-        # TODO: Generalize to multiple row input
-        table = self.to_ibis_expr(ordering_mode="unordered")
+        """
+        Unpivot ArrayValue columns.
+
+        Args:
+            row_labels: Identifies the source of the row. Must be equal in length to the source column lists in the unpivot_columns argument.
+            unpivot_columns: Mapping of column id to list of input column ids. Lists of input columns may use None.
+            passthrough_columns: Columns that will not be unpivoted. Column id will be preserved.
+            index_col_id (str): The column id to be used for the row labels.
+            dtype (dtype or list of dtype): Dtype to use for the unpivot columns. If list, must be equal in number to unpivot_columns.
+
+        Returns:
+            ArrayValue: The unpivoted ArrayValue
+        """
+        table = self.to_ibis_expr(ordering_mode="offset_col")
         sub_expressions = []
 
-        # TODO: validate all columns are equal length, as well as row labels
+        # Use ibis memtable to infer the type of row_labels (if possible)
+        # TODO: Allow caller to specify dtype
+        labels_ibis_type = ibis.memtable({"col": row_labels})["col"].type()
+        labels_dtype = bigframes.dtypes.ibis_dtype_to_bigframes_dtype(labels_ibis_type)
+
         row_n = len(row_labels)
         if not all(
             len(source_columns) == row_n for _, source_columns in unpivot_columns
         ):
             raise ValueError("Columns and row labels must all be same length.")
 
-        # Select each column
         for i in range(row_n):
             values = []
-            for result_col, source_cols in unpivot_columns:
-                values.append(
-                    ops.AsTypeOp(dtype)._as_ibis(table[source_cols[i]]).name(result_col)
-                )
-
+            for j in range(len(unpivot_columns)):
+                result_col, source_cols = unpivot_columns[j]
+                col_dtype = dtype[j] if utils.is_list_like(dtype) else dtype
+                if source_cols[i] is not None:
+                    values.append(
+                        ops.AsTypeOp(col_dtype)
+                        ._as_ibis(table[source_cols[i]])
+                        .name(result_col)
+                    )
+                else:
+                    values.append(
+                        bigframes.dtypes.literal_to_ibis_scalar(
+                            None, force_dtype=col_dtype
+                        ).name(result_col)
+                    )
+            offsets_value = (
+                ((table[ORDER_ID_COLUMN] * row_n) + i)
+                .cast(ibis_dtypes.int64)
+                .name(ORDER_ID_COLUMN),
+            )
             sub_expr = table.select(
-                ibis_types.literal(row_labels[i]).name(index_col_id),
+                passthrough_columns,
+                bigframes.dtypes.literal_to_ibis_scalar(
+                    row_labels[i], force_dtype=labels_dtype  # type:ignore
+                ).name(index_col_id),
                 *values,
-                ibis_types.literal(i).name(ORDER_ID_COLUMN),
+                offsets_value,
             )
             sub_expressions.append(sub_expr)
         rotated_table = ibis.union(*sub_expressions)
 
         value_columns = [
             rotated_table[value_col_id] for value_col_id, _ in unpivot_columns
         ]
+        passthrough_values = [rotated_table[col] for col in passthrough_columns]
         return ArrayValue(
             session=self._session,
             table=rotated_table,
-            columns=[rotated_table[index_col_id], *value_columns],
+            columns=[rotated_table[index_col_id], *value_columns, *passthrough_values],
             hidden_ordering_columns=[rotated_table[ORDER_ID_COLUMN]],
             ordering=ExpressionOrdering(
                 ordering_value_columns=[OrderingColumnReference(ORDER_ID_COLUMN)],
+                integer_encoding=IntegerEncoding(is_encoded=True, is_sequential=True),
                 total_ordering_columns=frozenset([ORDER_ID_COLUMN]),
             ),
         )
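One detail worth noting in the new `unpivot`: every input row fans out into `row_n` output rows, and the offset expression `(order_id * row_n) + i` keeps the output totally ordered, dense, and sequential (hence `is_sequential=True` in the ordering). A tiny plain-Python check of that arithmetic:

# Source rows with order ids 0..2, each expanded into row_n output rows;
# the computed offsets are dense and strictly increasing.
row_n = 3
source_order_ids = [0, 1, 2]
offsets = [(oid * row_n) + i for oid in source_order_ids for i in range(row_n)]
assert offsets == list(range(len(source_order_ids) * row_n))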

bigframes/core/block_transforms.py

Lines changed: 32 additions & 0 deletions

@@ -197,3 +197,35 @@ def rank(
     )
 
     return block.select_columns(rownum_col_ids).with_column_labels(labels)
+
+
+def dropna(block: blocks.Block, how: typing.Literal["all", "any"] = "any"):
+    """
+    Drop na entries from block
+    """
+    if how == "any":
+        filtered_block = block
+        for column in block.value_columns:
+            filtered_block, result_id = filtered_block.apply_unary_op(
+                column, ops.notnull_op
+            )
+            filtered_block = filtered_block.filter(result_id)
+            filtered_block = filtered_block.drop_columns([result_id])
+        return filtered_block
+    else:  # "all"
+        filtered_block = block
+        predicate = None
+        for column in block.value_columns:
+            filtered_block, partial_predicate = filtered_block.apply_unary_op(
+                column, ops.notnull_op
+            )
+            if predicate:
+                filtered_block, predicate = filtered_block.apply_binary_op(
+                    partial_predicate, predicate, ops.or_op
+                )
+            else:
+                predicate = partial_predicate
+        if predicate:
+            filtered_block = filtered_block.filter(predicate)
+        filtered_block = filtered_block.select_columns(block.value_columns)
+        return filtered_block
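For intuition, the `how="all"` branch ORs together per-column notnull predicates, so a row is kept if any value is non-null and dropped only when every value is null. A plain-pandas sketch of the same semantics (pandas only, not the block API):

import pandas as pd

df = pd.DataFrame({"a": [1, None, None], "b": [None, 2, None]})

# OR together per-column notnull predicates, mirroring the "all" branch.
keep = pd.Series(False, index=df.index)
for col in df.columns:
    keep = keep | df[col].notnull()

# Only the all-null row is dropped -- matching pandas' dropna(how="all").
assert df[keep].equals(df.dropna(how="all"))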
