feat(datafusion): add problem 180

IndexSeek · IndexSeek · commit d03c597a3632 · 2024-12-22T19:21:44.000Z
diff --git a/README.md b/README.md
@@ -58,7 +58,7 @@ Fiddling around with DataFusion, pandas, and PyArrow.
 | 1731 | [The Number of Employees Which Report to Each Employee](https://leetcode.com/problems/the-number-of-employees-which-report-to-each-employee) | Easy | ✅ | ❌ | ❌ |
 | 1789 | [Primary Department for Each Employee](https://leetcode.com/problems/primary-department-for-each-employee) | Easy | ✅ | ❌ | ❌ |
 | 610  | [Triangle Judgement](https://leetcode.com/problems/triangle-judgement) | Easy | ✅ | ❌ | ❌ |
-| 180  | [Consecutive Numbers](https://leetcode.com/problems/consecutive-numbers) | Medium | ✅ | ❌ | ✅ |
+| 180  | [Consecutive Numbers](https://leetcode.com/problems/consecutive-numbers) | Medium | ✅ | ✅ | ✅ |
 | 1164 | [Product Price at a Given Date](https://leetcode.com/problems/product-price-at-a-given-date) | Medium | ✅ | ❌ | ❌ |
 | 1204 | [Last Person to Fit in the Bus](https://leetcode.com/problems/last-person-to-fit-in-the-bus) | Medium | ✅ | ❌ | ❌ |
 | 1907 | [Count Salary Categories](https://leetcode.com/problems/count-salary-categories) | Medium | ✅ | ❌ | ❌ |
diff --git a/problems/datafusion.py b/problems/datafusion.py
@@ -38,6 +38,64 @@ def problem_176(employee: pa.Table) -> datafusion.dataframe.DataFrame:
     return t
 
 
+def problem_180(logs: pa.Table) -> datafusion.dataframe.DataFrame:
+    """Find all numbers that appear at least three times consecutively.
+
+    Rows are ordered by ``id``; a number qualifies when it equals the values
+    in the two preceding rows. Return the result table in any order.
+
+    Parameters
+    ----------
+    logs : pa.Table
+        A table containing sequential ids (``id``) and numbers (``num``).
+
+    Returns
+    -------
+    datafusion.dataframe.DataFrame
+        The distinct qualifying numbers in a ``ConsecutiveNums`` column, or a
+        single null row when no number qualifies.
+
+    Examples
+    --------
+    >>> import pyarrow as pa
+    >>> from problems.datafusion import problem_180
+    >>> from problems.datasets import load_problem_180
+    >>> logs = pa.table(load_problem_180())
+    >>> problem_180(logs)
+    DataFrame()
+    +-----------------+
+    | ConsecutiveNums |
+    +-----------------+
+    | 1               |
+    +-----------------+
+
+    """
+    ctx = datafusion.SessionContext()
+    num = F.col("num")
+    by_id = [F.col("id")]
+    with_lags = ctx.from_arrow(logs).select(
+        num,
+        F.lag(num, order_by=by_id).alias("num_lag_1"),
+        F.lag(num, 2, order_by=by_id).alias("num_lag_2"),
+    )
+    # Rows without two predecessors have a NULL lag, so the condition is never
+    # true for them and the filter drops them.
+    matches = (
+        with_lags.filter((num == F.col("num_lag_1")) & (num == F.col("num_lag_2")))
+        .select(num.alias("ConsecutiveNums"))
+        .distinct()
+    )
+    # Execute the plan once and reuse the materialized rows for both the
+    # emptiness check and the returned frame, instead of running it twice.
+    table = matches.to_arrow_table()
+    if table.num_rows == 0:
+        # Callers (and the tests) expect one NULL row rather than an empty
+        # result when no number repeats three times in a row.
+        nulls = pa.array([None], type=pa.int64())
+        table = pa.table({"ConsecutiveNums": nulls})
+    return ctx.from_arrow(table)
+
+
 def problem_584(customer: pa.Table) -> datafusion.dataframe.DataFrame:
     """Find names of customers not referred by the customer with ID = 2.
 
diff --git a/problems/datasets.py b/problems/datasets.py
@@ -1,5 +1,7 @@
 """Functions to load LeetCode problem datasets to pandas DataFrames."""
 
+from typing import Tuple
+
 import pandas as pd
 
 
@@ -17,7 +19,7 @@ def load_problem_180() -> pd.DataFrame:
     )
 
 
-def load_problem_185() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_185() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [
         [1, "Joe", 85000, 1],
         [2, "Henry", 80000, 2],
@@ -92,7 +94,7 @@ def load_problem_570() -> pd.DataFrame:
     )
 
 
-def load_problem_577() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_577() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [
         [3, "Brad", None, 4000],
         [1, "John", 3, 1000],
@@ -233,7 +235,7 @@ def load_problem_626() -> pd.DataFrame:
     )
 
 
-def load_problem_1045() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1045() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [[1, 5], [2, 6], [3, 5], [3, 6], [1, 6]]
     customer = pd.DataFrame(data, columns=["customer_id", "product_key"]).astype(
         {"customer_id": "Int64", "product_key": "Int64"}
@@ -269,7 +271,7 @@ def load_problem_1068() -> pd.DataFrame:
     return sales, product
 
 
-def load_problem_1070() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1070() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [
         [1, 100, 2008, 10, 5000],
         [2, 100, 2009, 12, 5000],
@@ -293,7 +295,7 @@ def load_problem_1070() -> tuple(pd.DataFrame, pd.DataFrame):
     return sales, product
 
 
-def load_problem_1075() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1075() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [[1, 1], [1, 2], [1, 3], [2, 1], [2, 4]]
     project = pd.DataFrame(data, columns=["project_id", "employee_id"]).astype(
         {"project_id": "Int64", "employee_id": "Int64"}
@@ -459,7 +461,7 @@ def load_problem_1211() -> pd.DataFrame:
     )
 
 
-def load_problem_1251() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1251() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [
         [1, "2019-02-17", "2019-02-28", 5],
         [1, "2019-03-01", "2019-03-22", 20],
@@ -490,7 +492,7 @@ def load_problem_1251() -> tuple(pd.DataFrame, pd.DataFrame):
     return prices, units_sold
 
 
-def load_problem_1280() -> tuple(pd.DataFrame, pd.DataFrame, pd.DataFrame):
+def load_problem_1280() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     data = [[1, "Alice"], [2, "Bob"], [13, "John"], [6, "Alex"]]
     students = pd.DataFrame(data, columns=["student_id", "student_name"]).astype(
         {"student_id": "Int64", "student_name": "object"}
@@ -544,7 +546,7 @@ def load_problem_1321() -> pd.DataFrame:
     )
 
 
-def load_problem_1327() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1327() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [
         [1, "Leetcode Solutions", "Book"],
         [2, "Jewels of Stringology", "Book"],
@@ -577,7 +579,7 @@ def load_problem_1327() -> tuple(pd.DataFrame, pd.DataFrame):
     return products, orders
 
 
-def load_problem_1341() -> tuple(pd.DataFrame, pd.DataFrame, pd.DataFrame):
+def load_problem_1341() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
     data = [[1, "Avengers"], [2, "Frozen 2"], [3, "Joker"]]
     movies = pd.DataFrame(data, columns=["movie_id", "title"]).astype(
         {"movie_id": "Int64", "title": "object"}
@@ -610,7 +612,7 @@ def load_problem_1341() -> tuple(pd.DataFrame, pd.DataFrame, pd.DataFrame):
     return movies, users, movie_rating
 
 
-def load_problem_1378() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1378() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [[1, "Alice"], [7, "Bob"], [11, "Meir"], [90, "Winston"], [3, "Jonathan"]]
     employees = pd.DataFrame(data, columns=["id", "name"]).astype(
         {"id": "int64", "name": "object"}
@@ -665,7 +667,7 @@ def load_problem_1527() -> pd.DataFrame:
     ).astype({"patient_id": "int64", "patient_name": "object", "conditions": "object"})
 
 
-def load_problem_1581() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1581() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [[1, 23], [2, 9], [4, 30], [5, 54], [6, 96], [7, 54], [8, 54]]
     visits = pd.DataFrame(data, columns=["visit_id", "customer_id"]).astype(
         {"visit_id": "Int64", "customer_id": "Int64"}
@@ -677,7 +679,7 @@ def load_problem_1581() -> tuple(pd.DataFrame, pd.DataFrame):
     return visits, transactions
 
 
-def load_problem_1633() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1633() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [[6, "Alice"], [2, "Bob"], [7, "Alex"]]
     users = pd.DataFrame(data, columns=["user_id", "user_name"]).astype(
         {"user_id": "Int64", "user_name": "object"}
@@ -806,7 +808,7 @@ def load_problem_1907() -> pd.DataFrame:
     )
 
 
-def load_problem_1934() -> tuple(pd.DataFrame, pd.DataFrame):
+def load_problem_1934() -> Tuple[pd.DataFrame, pd.DataFrame]:
     data = [
         [3, "2020-03-21 10:16:13"],
         [7, "2020-01-04 13:57:59"],
diff --git a/tests/test_datafusion.py b/tests/test_datafusion.py
@@ -5,6 +5,7 @@
 
 from problems.datafusion import (
     problem_176,
+    problem_180,
     problem_584,
     problem_595,
     problem_620,
@@ -45,6 +46,63 @@ def test_problem_176(input_data, expected_data):
     assert result.to_arrow_table().equals(expected_table)
 
 
+@pytest.mark.parametrize(
+    "input_data, expected_data",
+    [
+        # A run of exactly three equal values is reported.
+        pytest.param(
+            {"id": [1, 2, 3, 4, 5, 6, 7, 8], "num": [1, 2, 3, 1, 1, 1, 4, 5]},
+            {"ConsecutiveNums": [1]},
+            id="one_consecutive_number_three_times",
+        ),
+        # A run of four is still reported only once.
+        pytest.param(
+            {"id": [1, 2, 3, 4, 5, 6, 7, 8], "num": [1, 2, 3, 1, 1, 1, 1, 5]},
+            {"ConsecutiveNums": [1]},
+            id="one_consecutive_number_four_times",
+        ),
+        # No run of three: a single null row is expected.
+        pytest.param(
+            {"id": [1, 2, 3, 4, 5], "num": [1, 2, 3, 4, 5]},
+            {"ConsecutiveNums": [None]},
+            id="no_consecutive_numbers",
+        ),
+        # Empty input is also expected to yield the single null row.
+        pytest.param(
+            {"id": [], "num": []},
+            {"ConsecutiveNums": [None]},
+            id="empty_table",
+        ),
+    ],
+)
+def test_problem_180(input_data, expected_data):
+    """Check ``problem_180`` against hand-written expected results.
+
+    Parameters
+    ----------
+    input_data : dict
+        Column values for the ``id`` and ``num`` columns of the logs table.
+    expected_data : dict
+        Column values for the expected ``ConsecutiveNums`` column.
+
+    """
+    # Explicit int64 schemas keep the empty and all-null columns typed the
+    # same way as the populated ones.
+    logs_schema = pa.schema(
+        [
+            pa.field("id", pa.int64()),
+            pa.field("num", pa.int64()),
+        ]
+    )
+    expected_schema = pa.schema([pa.field("ConsecutiveNums", pa.int64())])
+    logs = pa.Table.from_pydict(input_data, schema=logs_schema)
+    expected = pa.Table.from_pydict(expected_data, schema=expected_schema)
+
+    actual = problem_180(logs).to_arrow_table()
+
+    assert actual.equals(expected)
+
+
 @pytest.mark.parametrize(
     "input_data, expected_data",
     [