Implementing to_iceberg

datapythonista · datapythonista · commit b31ae80710a5 · 2025-05-27T23:32:12.000+02:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -3547,6 +3547,54 @@ def to_xml(
 
         return xml_formatter.write_output()
 
+    def to_iceberg(
+        self,
+        table_identifier: str,
+        catalog_name: str | None = None,
+        *,
+        catalog_properties: dict[str, Any] | None = None,
+        location: str | None = None,
+        snapshot_properties: dict[str, str] | None = None,
+    ) -> None:
+        """
+        Write a DataFrame to an Apache Iceberg table.
+
+        .. versionadded:: 3.0.0
+
+        Parameters
+        ----------
+        table_identifier : str
+            Table identifier. The table is created if it does not exist.
+        catalog_name : str, optional
+            The name of the catalog.
+        catalog_properties : dict of {str: str}, optional
+            The properties that are used next to the catalog configuration.
+        location : str, optional
+            Location for the table.
+        snapshot_properties : dict of {str: str}, optional
+            Custom properties to be added to the snapshot summary.
+
+        See Also
+        --------
+        read_iceberg : Read an Apache Iceberg table.
+        DataFrame.to_parquet : Write a DataFrame in Parquet format.
+
+        Examples
+        --------
+        >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+        >>> df.to_iceberg("my_table", catalog_name="my_catalog")  # doctest: +SKIP
+        """
+        from pandas.io.iceberg import to_iceberg
+
+        to_iceberg(
+            self,
+            table_identifier,
+            catalog_name,
+            catalog_properties=catalog_properties,
+            location=location,
+            snapshot_properties=snapshot_properties,
+        )
+
     # ----------------------------------------------------------------------
     @doc(INFO_DOCSTRING, **frame_sub_kwargs)
     def info(
diff --git a/pandas/io/iceberg.py b/pandas/io/iceberg.py
@@ -7,9 +7,17 @@
 from pandas import DataFrame
 
 
+def _get_catalog(catalog_name: str | None, catalog_properties: dict[str, Any] | None):
+    # Shared by read_iceberg and to_iceberg so both resolve the catalog the same way.
+    load_catalog = import_optional_dependency("pyiceberg.catalog").load_catalog
+    # ``None`` means that no extra properties are forwarded to ``load_catalog``.
+    return load_catalog(catalog_name, **(catalog_properties or {}))
+
+
 def read_iceberg(
     table_identifier: str,
     catalog_name: str | None = None,
+    *,
     catalog_properties: dict[str, Any] | None = None,
     row_filter: str | None = None,
     selected_fields: tuple[str] | None = None,
@@ -69,12 +77,8 @@ def read_iceberg(
     ...     selected_fields=("VendorID", "tpep_pickup_datetime"),
     ... )  # doctest: +SKIP
     """
-    pyiceberg_catalog = import_optional_dependency("pyiceberg.catalog")
+    catalog = _get_catalog(catalog_name, catalog_properties)
     pyiceberg_expressions = import_optional_dependency("pyiceberg.expressions")
-
-    if catalog_properties is None:
-        catalog_properties = {}
-    catalog = pyiceberg_catalog.load_catalog(catalog_name, **catalog_properties)
     table = catalog.load_table(table_identifier)
     if row_filter is None:
         row_filter = pyiceberg_expressions.AlwaysTrue()
@@ -91,3 +95,56 @@ def read_iceberg(
         limit=limit,
     )
     return result.to_pandas()
+
+
+def to_iceberg(
+    df: DataFrame,
+    table_identifier: str,
+    catalog_name: str | None = None,
+    *,
+    catalog_properties: dict[str, Any] | None = None,
+    location: str | None = None,
+    snapshot_properties: dict[str, str] | None = None,
+) -> None:
+    """
+    Write a DataFrame to an Apache Iceberg table.
+
+    .. versionadded:: 3.0.0
+
+    Parameters
+    ----------
+    df : DataFrame
+        DataFrame whose rows are appended to the table.
+    table_identifier : str
+        Table identifier. The table is created if it does not exist.
+    catalog_name : str, optional
+        The name of the catalog.
+    catalog_properties : dict of {str: str}, optional
+        The properties that are used next to the catalog configuration.
+    location : str, optional
+        Location for the table.
+    snapshot_properties : dict of {str: str}, optional
+        Custom properties to be added to the snapshot summary.
+
+    See Also
+    --------
+    read_iceberg : Read an Apache Iceberg table.
+    DataFrame.to_parquet : Write a DataFrame in Parquet format.
+
+    Examples
+    --------
+    >>> df = pd.DataFrame(data={"col1": [1, 2], "col2": [4, 3]})
+    >>> df.to_iceberg("my_table", catalog_name="my_catalog")  # doctest: +SKIP
+    """
+    pa = import_optional_dependency("pyarrow")
+    catalog = _get_catalog(catalog_name, catalog_properties)
+    arrow_table = pa.Table.from_pandas(df)
+    table = catalog.create_table_if_not_exists(
+        identifier=table_identifier,
+        schema=arrow_table.schema,
+        location=location,
+        # we could add `partition_spec`, `sort_order` and `properties` in the
+        # future, but it may not be trivial without exposing PyIceberg objects
+    )
+    # ``None`` is replaced by an empty mapping before it is handed to ``append``.
+    table.append(arrow_table, snapshot_properties=snapshot_properties or {})
diff --git a/pandas/tests/io/test_iceberg.py b/pandas/tests/io/test_iceberg.py
@@ -22,7 +22,7 @@
 pyiceberg_catalog = pytest.importorskip("pyiceberg.catalog")
 pq = pytest.importorskip("pyarrow.parquet")
 
-Catalog = collections.namedtuple("Catalog", ["name", "uri"])
+Catalog = collections.namedtuple("Catalog", ["name", "uri", "warehouse"])
 
 
 @pytest.fixture
@@ -58,7 +58,7 @@ def catalog(request, tmp_path):
 
         importlib.reload(pyiceberg_catalog)  # needed to reload the config file
 
-    yield Catalog(name=catalog_name or "default", uri=uri)
+    yield Catalog(name=catalog_name or "default", uri=uri, warehouse=warehouse)
 
     if catalog_name is not None:
         config_path.unlink()
@@ -141,3 +141,39 @@ def test_read_with_limit(self, catalog):
             limit=2,
         )
         tm.assert_frame_equal(result, expected)
+
+    def test_write(self, catalog):
+        # Round trip: write the frame with ``to_iceberg``, read it back, compare.
+        expected = pd.DataFrame({"A": [1, 2, 3], "B": ["foo", "foo", "foo"]})
+        table_name = "ns.new_table"
+        # No catalog name is passed; only the ``uri`` property identifies the catalog.
+        properties = {"uri": catalog.uri}
+
+        expected.to_iceberg(
+            table_name,
+            catalog_properties=properties,
+            location=catalog.warehouse,
+        )
+        round_trip = read_iceberg(
+            table_name,
+            catalog_properties=properties,
+        )
+        tm.assert_frame_equal(round_trip, expected)
+
+    @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
+    def test_write_by_catalog_name(self, catalog):
+        # Runs once per catalog name supplied through the indirect ``catalog`` fixture.
+        # The catalog is looked up by name only; no catalog properties are passed.
+        expected = pd.DataFrame({"A": [1, 2, 3], "B": ["foo", "foo", "foo"]})
+        table_name = "ns.new_table"
+
+        expected.to_iceberg(
+            table_name,
+            catalog_name=catalog.name,
+        )
+        round_trip = read_iceberg(
+            table_name,
+            catalog_name=catalog.name,
+        )
+
+        tm.assert_frame_equal(round_trip, expected)