10 changes: 10 additions & 0 deletions airflow/providers/google/CHANGELOG.rst
@@ -27,6 +27,16 @@
Changelog
---------

.. note::
  The default value of ``parquet_row_group_size`` in ``BaseSQLToGCSOperator`` has changed from 1 to
  100000, so that the default provides better compression efficiency and read performance for the
  output Parquet files. In many cases, the previous value of 1 resulted in very large files, long
  task durations and out-of-memory issues. The new default of 100000 may require more memory to
  execute the operator, in which case users can override the ``parquet_row_group_size`` parameter
  in the operator. All operators derived from ``BaseSQLToGCSOperator`` are affected when
  ``export_format`` is ``parquet``: ``MySQLToGCSOperator``, ``PrestoToGCSOperator``,
  ``OracleToGCSOperator``, ``TrinoToGCSOperator``, ``MSSQLToGCSOperator`` and ``PostgresToGCSOperator``.
  Because of the above, we treat this change as a bug fix.
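
  For example, a minimal sketch of lowering the row group size on one of the affected operators
  (the task ID, SQL, and bucket below are illustrative placeholders, not part of this change)::

      from airflow.providers.google.cloud.transfers.postgres_to_gcs import PostgresToGCSOperator

      export_orders = PostgresToGCSOperator(
          task_id="export_orders",          # placeholder task ID
          sql="SELECT * FROM orders",       # placeholder query
          bucket="my-export-bucket",        # placeholder bucket name
          filename="orders/part-{}.parquet",
          export_format="parquet",
          # Override the new default (100000) if the worker runs out of memory.
          parquet_row_group_size=10000,
      )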

10.13.1
.......

4 changes: 2 additions & 2 deletions airflow/providers/google/cloud/transfers/sql_to_gcs.py
@@ -85,7 +85,7 @@ class BaseSQLToGCSOperator(BaseOperator):
:param parquet_row_group_size: The approximate number of rows in each row group
when using parquet format. Using a large row group size can reduce the file size
and improve the performance of reading the data, but it needs more memory to
-        execute the operator. (default: 1)
+        execute the operator. (default: 100000)
"""

template_fields: Sequence[str] = (
@@ -123,7 +123,7 @@ def __init__(
exclude_columns: set | None = None,
partition_columns: list | None = None,
write_on_empty: bool = False,
-        parquet_row_group_size: int = 1,
+        parquet_row_group_size: int = 100000,
**kwargs,
) -> None:
super().__init__(**kwargs)
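A minimal sketch of why the row group size matters, using pyarrow directly (illustrative only;
an assumption about the underlying mechanism, not the operator's actual writer code)::

    import pyarrow as pa
    import pyarrow.parquet as pq

    table = pa.table({"id": list(range(1_000_000))})

    # With row_group_size=1, the writer emits one row group per row, bloating
    # file metadata and slowing reads. With 100000 rows per group, this table
    # is written as roughly ten row groups.
    pq.write_table(table, "/tmp/example.parquet", row_group_size=100_000)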