ENH: Add sort_columns parameter to combine_first

pandas-dev · U-S-jun · Nov 28, 2024 · Nov 28, 2024 · Nov 28, 2024 · Nov 28, 2024
commit 6333c3b906d86b5bf2072012fa910ea05c766c40
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -8712,7 +8712,7 @@ def combine(
         frame_result = self._constructor(result, index=new_index, columns=new_columns)
         return frame_result.__finalize__(self, method="combine")
 
-    def combine_first(self, other: DataFrame) -> DataFrame:
+    def combine_first(self, other: DataFrame, sort_columns: bool = True) -> DataFrame:
         """
         Update null elements with value in the same location in `other`.
 
@@ -8728,6 +8728,10 @@ def combine_first(self, other: DataFrame) -> DataFrame:
         ----------
         other : DataFrame
             Provided DataFrame to use to fill null values.
+        sort_columns : bool, default True
+            Whether to sort the columns in the result DataFrame. If False, the
+            order of the columns in `self` is preserved.
+
 
         Returns
         -------
@@ -8741,13 +8745,25 @@ def combine_first(self, other: DataFrame) -> DataFrame:
 
         Examples
         --------
+        Default behavior (`sort_columns=True`):
+
         >>> df1 = pd.DataFrame({"A": [None, 0], "B": [None, 4]})
         >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
         >>> df1.combine_first(df2)
              A    B
         0  1.0  3.0
         1  0.0  4.0
 
+
+        Preserving the column order of `self` with `sort_columns=False`:
+
+        >>> df1 = pd.DataFrame({"B": [None, 4], "A": [0, None]})
+        >>> df2 = pd.DataFrame({"A": [1, 1], "B": [3, 3]})
+        >>> df1.combine_first(df2, sort_columns=False)
+             B    A
+        0  3.0  0.0
+        1  4.0  1.0
+
         Null values still persist if the location of that null value
         does not exist in `other`
 
@@ -8773,6 +8789,13 @@ def combiner(x: Series, y: Series):
                 return y_values
 
             return expressions.where(mask, y_values, x_values)
+
+        # Column labels of the result: the union of both frames. With
+        # sort_columns=False keep self's order and append the labels found only
+        # in `other`, so that no column of `other` is dropped.
+        all_columns = self.columns.union(
+            other.columns, sort=None if sort_columns else False
+        )
 
         if len(other) == 0:
             combined = self.reindex(
@@ -8790,6 +8813,8 @@ def combiner(x: Series, y: Series):
 
         if dtypes:
             combined = combined.astype(dtypes)
+
+        combined = combined.reindex(columns=all_columns, fill_value=None)
 
         return combined.__finalize__(self, method="combine_first")
 

diff --git a/pandas/tests/frame/methods/test_combine_first.py b/pandas/tests/frame/methods/test_combine_first.py
@@ -560,3 +560,13 @@ def test_combine_first_empty_columns():
     result = left.combine_first(right)
     expected = DataFrame(columns=["a", "b", "c"])
     tm.assert_frame_equal(result, expected)
+
+
+def test_combine_first_column_order():
+    # sort_columns=False must keep the caller's column order ("B" before "A").
+    df1 = DataFrame({"B": [1, 2], "A": [3, 4]})
+    df2 = DataFrame({"A": [5]}, index=[1])
+
+    result = df1.combine_first(df2, sort_columns=False)
+    expected = DataFrame({"B": [1, 2], "A": [3, 4]})
+    tm.assert_frame_equal(result, expected)