Skip to content

Commit 962b152

Browse files
authored
feat: (Preview) Support arithmetic between dates and timedeltas (#1413)
* [WIP] Implement date_diff, date_add, and date_sub. Some tests are broken.
* fix format
* change implementation to emulate PyArrow
* fix mypy
* fix format
* fix tests
* fix more tests
* bump pyarrow dependency to 15.0.2
* raise pandas version in system-3.12 tests
* fix format
* restore constraints-3.12
* raise pandas version to 2.2.0 in 3.12 testing env
* remove pandas constraints for 3.12
* fix merge error
* cast factor to int instead of calling floor()
1 parent c598c0a commit 962b152

File tree

10 files changed

+252
-5
lines changed

10 files changed

+252
-5
lines changed

bigframes/core/compile/scalar_op_compiler.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -740,6 +740,21 @@ def timestamp_sub_op_impl(x: ibis_types.TimestampValue, y: ibis_types.IntegerVal
740740
return x - y.to_interval("us")
741741

742742

743+
@scalar_op_compiler.register_binary_op(ops.date_diff_op)
def date_diff_op_impl(x: ibis_types.DateValue, y: ibis_types.DateValue):
    """Compile ``date - date`` into a scaled whole-day difference.

    The difference between ``x`` and ``y`` is taken in days and multiplied by
    the integer-cast ``"d"`` entry of ``UNIT_TO_US_CONVERSION_FACTORS``
    (presumably the number of microseconds per day -- confirm against the
    constant's definition).
    """
    whole_days = x.delta(y, "day")  # type: ignore
    # The factor is cast to int so the product stays an integer expression.
    return whole_days * int(UNIT_TO_US_CONVERSION_FACTORS["d"])  # type: ignore
746+
747+
748+
@scalar_op_compiler.register_binary_op(ops.date_add_op)
def date_add_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue):
    """Compile ``date + timedelta`` into a timestamp expression.

    ``x`` is cast to a timestamp and ``y`` is converted to an interval in
    microsecond units before the two are added.
    """
    start = x.cast("timestamp")
    offset = y.to_interval("us")
    return start + offset  # type: ignore
751+
752+
753+
@scalar_op_compiler.register_binary_op(ops.date_sub_op)
def date_sub_op_impl(x: ibis_types.DateValue, y: ibis_types.IntegerValue):
    """Compile ``date - timedelta`` into a timestamp expression.

    ``x`` is cast to a timestamp and ``y`` is converted to an interval in
    microsecond units before it is subtracted.
    """
    start = x.cast("timestamp")
    offset = y.to_interval("us")
    return start - offset  # type: ignore
756+
757+
743758
@scalar_op_compiler.register_unary_op(ops.FloorDtOp, pass_op=True)
744759
def floor_dt_op_impl(x: ibis_types.Value, op: ops.FloorDtOp):
745760
supported_freqs = ["Y", "Q", "M", "W", "D", "h", "min", "s", "ms", "us", "ns"]

bigframes/core/rewrite/timedeltas.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,12 @@ def _rewrite_sub_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr:
151151
if dtypes.is_datetime_like(left.dtype) and right.dtype is dtypes.TIMEDELTA_DTYPE:
152152
return _TypedExpr.create_op_expr(ops.timestamp_sub_op, left, right)
153153

154+
if left.dtype == dtypes.DATE_DTYPE and right.dtype == dtypes.DATE_DTYPE:
155+
return _TypedExpr.create_op_expr(ops.date_diff_op, left, right)
156+
157+
if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE:
158+
return _TypedExpr.create_op_expr(ops.date_sub_op, left, right)
159+
154160
return _TypedExpr.create_op_expr(ops.sub_op, left, right)
155161

156162

@@ -163,6 +169,14 @@ def _rewrite_add_op(left: _TypedExpr, right: _TypedExpr) -> _TypedExpr:
163169
# always on the right.
164170
return _TypedExpr.create_op_expr(ops.timestamp_add_op, right, left)
165171

172+
if left.dtype == dtypes.DATE_DTYPE and right.dtype is dtypes.TIMEDELTA_DTYPE:
173+
return _TypedExpr.create_op_expr(ops.date_add_op, left, right)
174+
175+
if left.dtype is dtypes.TIMEDELTA_DTYPE and right.dtype == dtypes.DATE_DTYPE:
176+
# Re-arrange operands such that date is always on the left and timedelta is
177+
# always on the right.
178+
return _TypedExpr.create_op_expr(ops.date_add_op, right, left)
179+
166180
return _TypedExpr.create_op_expr(ops.add_op, left, right)
167181

168182

bigframes/operations/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@
3939
ne_op,
4040
)
4141
from bigframes.operations.date_ops import (
42+
date_diff_op,
4243
day_op,
4344
dayofweek_op,
4445
month_op,
@@ -184,6 +185,8 @@
184185
from bigframes.operations.struct_ops import StructFieldOp, StructOp
185186
from bigframes.operations.time_ops import hour_op, minute_op, normalize_op, second_op
186187
from bigframes.operations.timedelta_ops import (
188+
date_add_op,
189+
date_sub_op,
187190
timedelta_floor_op,
188191
timestamp_add_op,
189192
timestamp_sub_op,
@@ -249,6 +252,7 @@
249252
"upper_op",
250253
"ZfillOp",
251254
# Date ops
255+
"date_diff_op",
252256
"day_op",
253257
"month_op",
254258
"year_op",
@@ -260,6 +264,8 @@
260264
"second_op",
261265
"normalize_op",
262266
# Timedelta ops
267+
"date_add_op",
268+
"date_sub_op",
263269
"timedelta_floor_op",
264270
"timestamp_add_op",
265271
"timestamp_sub_op",

bigframes/operations/date_ops.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
import dataclasses
16+
import typing
17+
18+
from bigframes import dtypes
1519
from bigframes.operations import base_ops
1620
import bigframes.operations.type as op_typing
1721

@@ -39,3 +43,22 @@
3943
name="quarter",
4044
type_signature=op_typing.DATELIKE_ACCESSOR,
4145
)
46+
47+
48+
@dataclasses.dataclass(frozen=True)
class DateDiffOp(base_ops.BinaryOp):
    """Binary operation for ``date - date``; its result type is a timedelta."""

    name: typing.ClassVar[str] = "date_diff"

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        """Return the timedelta dtype when both operands are dates.

        Args:
            *input_types: The left and right operand types, in that order.

        Returns:
            ``dtypes.TIMEDELTA_DTYPE``.

        Raises:
            TypeError: If the two operand types differ, or if they are not
                the date dtype.
        """
        # Compare by equality rather than identity: two dtype objects that
        # describe the same type are not guaranteed to be the same instance,
        # and an identity check would reject them with a misleading
        # "different types" error. This also matches the equality check below.
        if input_types[0] != input_types[1]:
            raise TypeError(
                f"two inputs have different types. left: {input_types[0]}, right: {input_types[1]}"
            )

        if input_types[0] != dtypes.DATE_DTYPE:
            raise TypeError("expected date input")

        return dtypes.TIMEDELTA_DTYPE


date_diff_op = DateDiffOp()

bigframes/operations/numeric_ops.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -123,12 +123,18 @@ def output_type(self, *input_types):
123123
# String addition
124124
return input_types[0]
125125

126-
# Timestamp addition.
126+
# Temporal addition.
127127
if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE:
128128
return left_type
129129
if left_type is dtypes.TIMEDELTA_DTYPE and dtypes.is_datetime_like(right_type):
130130
return right_type
131131

132+
if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE:
133+
return dtypes.DATETIME_DTYPE
134+
135+
if left_type == dtypes.TIMEDELTA_DTYPE and right_type == dtypes.DATE_DTYPE:
136+
return dtypes.DATETIME_DTYPE
137+
132138
if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE:
133139
return dtypes.TIMEDELTA_DTYPE
134140

@@ -155,9 +161,15 @@ def output_type(self, *input_types):
155161
if dtypes.is_datetime_like(left_type) and dtypes.is_datetime_like(right_type):
156162
return dtypes.TIMEDELTA_DTYPE
157163

164+
if left_type == dtypes.DATE_DTYPE and right_type == dtypes.DATE_DTYPE:
165+
return dtypes.TIMEDELTA_DTYPE
166+
158167
if dtypes.is_datetime_like(left_type) and right_type is dtypes.TIMEDELTA_DTYPE:
159168
return left_type
160169

170+
if left_type == dtypes.DATE_DTYPE and right_type == dtypes.TIMEDELTA_DTYPE:
171+
return dtypes.DATETIME_DTYPE
172+
161173
if left_type is dtypes.TIMEDELTA_DTYPE and right_type is dtypes.TIMEDELTA_DTYPE:
162174
return dtypes.TIMEDELTA_DTYPE
163175

bigframes/operations/timedelta_ops.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
7979
timestamp_add_op = TimestampAddOp()
8080

8181

82+
@dataclasses.dataclass(frozen=True)
8283
class TimestampSubOp(base_ops.BinaryOp):
8384
name: typing.ClassVar[str] = "timestamp_sub"
8485

@@ -96,3 +97,49 @@ def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionT
9697

9798

9899
timestamp_sub_op = TimestampSubOp()
100+
101+
102+
@dataclasses.dataclass(frozen=True)
class DateAddOp(base_ops.BinaryOp):
    """Binary operation adding a date and a timedelta, in either order."""

    name: typing.ClassVar[str] = "date_add"

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        """Return the datetime dtype for a date/timedelta operand pair.

        Raises:
            TypeError: If the operands are not one date and one timedelta.
        """
        lhs, rhs = input_types[0], input_types[1]

        # Both orderings yield a timestamp without timezone:
        #   date + timedelta, and timedelta + date.
        accepted_pairs = (
            (dtypes.DATE_DTYPE, dtypes.TIMEDELTA_DTYPE),
            (dtypes.TIMEDELTA_DTYPE, dtypes.DATE_DTYPE),
        )
        for want_lhs, want_rhs in accepted_pairs:
            if lhs == want_lhs and rhs == want_rhs:
                return dtypes.DATETIME_DTYPE

        raise TypeError(
            f"unsupported types for date_add. left: {input_types[0]} right: {input_types[1]}"
        )


date_add_op = DateAddOp()
126+
127+
128+
@dataclasses.dataclass(frozen=True)
class DateSubOp(base_ops.BinaryOp):
    """Binary operation subtracting a timedelta from a date."""

    name: typing.ClassVar[str] = "date_sub"

    def output_type(self, *input_types: dtypes.ExpressionType) -> dtypes.ExpressionType:
        """Return the datetime dtype for ``date - timedelta``.

        Raises:
            TypeError: If the left operand is not a date or the right operand
                is not a timedelta.
        """
        lhs, rhs = input_types[0], input_types[1]

        # date - timedelta => timestamp without timezone
        is_date_minus_timedelta = (
            lhs == dtypes.DATE_DTYPE and rhs == dtypes.TIMEDELTA_DTYPE
        )
        if not is_date_minus_timedelta:
            raise TypeError(
                f"unsupported types for date_sub. left: {input_types[0]} right: {input_types[1]}"
            )

        return dtypes.DATETIME_DTYPE


date_sub_op = DateSubOp()

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@
5252
"numpy >=1.24.0",
5353
"pandas >=1.5.3",
5454
"pandas-gbq >=0.26.0",
55-
"pyarrow >=10.0.1",
55+
"pyarrow >=15.0.2",
5656
"pydata-google-auth >=1.8.2",
5757
"requests >=2.27.1",
5858
"sqlglot >=23.6.3",

testing/constraints-3.9.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ jellyfish==0.8.9
1616
numpy==1.24.0
1717
pandas==1.5.3
1818
pandas-gbq==0.26.0
19-
pyarrow==10.0.1
19+
pyarrow==15.0.2
2020
pydata-google-auth==1.8.2
2121
requests==2.27.1
2222
scikit-learn==1.2.2

tests/system/small/operations/test_dates.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,56 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15+
16+
import datetime
17+
18+
import pandas as pd
1519
import pandas.testing
1620

1721
from bigframes import dtypes
1822

1923

24+
def test_date_diff_between_series(session):
    """Subtracting one date Series from another matches the pandas result."""
    raw_dates = {
        "col_1": [datetime.date(2025, 1, 2), datetime.date(2025, 2, 1)],
        "col_2": [datetime.date(2024, 1, 2), datetime.date(2026, 1, 30)],
    }
    pd_df = pd.DataFrame(raw_dates).astype(dtypes.DATE_DTYPE)
    bf_df = session.read_pandas(pd_df)

    bf_diff = (bf_df["col_1"] - bf_df["col_2"]).to_pandas()

    # The pandas difference is cast to the timedelta dtype before comparing.
    pd_diff = (pd_df["col_1"] - pd_df["col_2"]).astype(dtypes.TIMEDELTA_DTYPE)
    pandas.testing.assert_series_equal(bf_diff, pd_diff, check_index_type=False)
39+
40+
41+
def test_date_diff_literal_sub_series(scalars_dfs):
    """A date literal minus a date Series matches the pandas result."""
    bf_df, pd_df = scalars_dfs
    minuend = datetime.date(2030, 5, 20)

    bf_diff = (minuend - bf_df["date_col"]).to_pandas()

    # The pandas difference is cast to the timedelta dtype before comparing.
    pd_diff = (minuend - pd_df["date_col"]).astype(dtypes.TIMEDELTA_DTYPE)
    pandas.testing.assert_series_equal(bf_diff, pd_diff, check_index_type=False)
51+
52+
53+
def test_date_diff_series_sub_literal(scalars_dfs):
    """A date Series minus a date literal matches the pandas result."""
    bf_df, pd_df = scalars_dfs
    subtrahend = datetime.date(1980, 5, 20)

    bf_diff = (bf_df["date_col"] - subtrahend).to_pandas()

    # The pandas difference is cast to the timedelta dtype before comparing.
    pd_diff = (pd_df["date_col"] - subtrahend).astype(dtypes.TIMEDELTA_DTYPE)
    pandas.testing.assert_series_equal(bf_diff, pd_diff, check_index_type=False)
63+
64+
2065
def test_date_series_diff_agg(scalars_dfs):
2166
bf_df, pd_df = scalars_dfs
2267

0 commit comments

Comments
 (0)