Skip to content

⚡️ Speed up function drop_duplicates by 485% #4

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed

Conversation

codeflash-ai[bot]
Copy link

@codeflash-ai codeflash-ai bot commented Apr 20, 2025

📄 485% (4.85x) speedup for drop_duplicates in src/numpy_pandas/dataframe_operations.py

⏱️ Runtime: 34.2 milliseconds → 5.85 milliseconds (best of 62 runs)

📝 Explanation and details

To improve the performance of this code, we can leverage the built-in drop_duplicates method provided by pandas, which is optimized for such operations. Using this built-in method is both faster and more concise. Here is the optimized version of the function.

This optimized version leverages the efficient internal implementation of drop_duplicates provided by pandas, significantly improving the runtime.

Correctness verification report:

Test Status
⚙️ Existing Unit Tests 🔘 None Found
🌀 Generated Regression Tests 40 Passed
⏪ Replay Tests 🔘 None Found
🔎 Concolic Coverage Tests 🔘 None Found
📊 Tests Coverage 100.0%
🌀 Generated Regression Tests Details
from typing import List

import pandas as pd
# imports
import pytest  # used for our unit tests
from src.numpy_pandas.dataframe_operations import drop_duplicates

# unit tests

def test_single_column_with_duplicates():
    """Repeated values in a one-column frame collapse to one row per value."""
    frame = pd.DataFrame({'A': [1, 1, 2, 3, 3]})
    deduplicated = pd.DataFrame({'A': [1, 2, 3]})
    # Keep the name `codeflash_output`: the harness compares it across runs.
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_single_column_all_unique():
    """A one-column frame whose values are all distinct comes back unchanged."""
    frame = pd.DataFrame({'A': [1, 2, 3, 4, 5]})
    codeflash_output = drop_duplicates(frame)
    # The input itself is the expected output: nothing should be removed.
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_multiple_columns_with_duplicates():
    """A fully repeated (A, B) row is dropped from a two-column frame."""
    frame = pd.DataFrame({'A': [1, 2, 2, 3], 'B': ['x', 'y', 'y', 'z']})
    deduplicated = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_multiple_columns_all_unique():
    """A two-column frame with no repeated rows comes back unchanged."""
    frame = pd.DataFrame({'A': [1, 2, 3, 4], 'B': ['w', 'x', 'y', 'z']})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_empty_dataframe():
    """A frame with neither rows nor columns is expected to round-trip as-is."""
    frame = pd.DataFrame()
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_empty_dataframe_with_columns():
    """A frame that has column labels but zero rows is returned unchanged."""
    frame = pd.DataFrame(columns=['A', 'B'])
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_single_row_dataframe():
    """A 1x1 frame has nothing to deduplicate and is returned unchanged."""
    frame = pd.DataFrame({'A': [1]})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_single_row_multiple_columns():
    """A single row spanning two columns is returned unchanged."""
    frame = pd.DataFrame({'A': [1], 'B': ['x']})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_single_column_multiple_rows():
    """An adjacent repeated value in a one-column frame is kept only once."""
    frame = pd.DataFrame({'A': [1, 2, 2, 3]})
    deduplicated = pd.DataFrame({'A': [1, 2, 3]})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_subset_specified():
    """Deduplicating on subset ['A'] removes the row whose 'A' value repeats."""
    frame = pd.DataFrame({'A': [1, 2, 2, 3], 'B': ['x', 'y', 'y', 'z']})
    deduplicated = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z']})
    codeflash_output = drop_duplicates(frame, subset=['A'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_subset_all_unique():
    """With subset ['A'] holding only distinct values, no row is removed."""
    frame = pd.DataFrame({'A': [1, 2, 3, 4], 'B': ['w', 'x', 'y', 'z']})
    codeflash_output = drop_duplicates(frame, subset=['A'])
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_different_data_types():
    """Rows mixing int, float, str and datetime columns deduplicate as a unit."""
    frame = pd.DataFrame({
        'A': [1, 2, 2, 3],
        'B': [1.1, 2.2, 2.2, 3.3],
        'C': ['x', 'y', 'y', 'z'],
        'D': [pd.NaT, pd.Timestamp('20200101'), pd.Timestamp('20200101'), pd.Timestamp('20200201')]
    })
    # Rows 1 and 2 agree in every column, so only the first of them survives.
    deduplicated = pd.DataFrame({
        'A': [1, 2, 3],
        'B': [1.1, 2.2, 3.3],
        'C': ['x', 'y', 'z'],
        'D': [pd.NaT, pd.Timestamp('20200101'), pd.Timestamp('20200201')]
    }).reset_index(drop=True)
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_large_dataframe():
    """A 1000-row, two-column frame with no repeated rows comes back unchanged."""
    frame = pd.DataFrame({'A': range(1000), 'B': range(1000)})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, frame)

def test_large_dataframe_single_column():
    """1000 identical rows in one column reduce to a single row."""
    frame = pd.DataFrame({'A': [1] * 1000})
    deduplicated = pd.DataFrame({'A': [1]})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_special_characters():
    """Strings differing only by whitespace or letter case stay distinct rows."""
    frame = pd.DataFrame({'A': ['a', 'a ', 'a', 'A'], 'B': ['x', ' x', 'x', 'X']})
    # Only the exact repeat ('a', 'x') at position 2 is expected to disappear.
    deduplicated = pd.DataFrame({'A': ['a', 'a ', 'A'], 'B': ['x', ' x', 'X']})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_small_subset():
    """Deduplicating a three-column frame on subset ['A', 'B'] drops the repeat."""
    frame = pd.DataFrame({'A': [1, 2, 2, 3], 'B': ['x', 'y', 'y', 'z'], 'C': [10, 20, 20, 30]})
    deduplicated = pd.DataFrame({'A': [1, 2, 3], 'B': ['x', 'y', 'z'], 'C': [10, 20, 30]})
    codeflash_output = drop_duplicates(frame, subset=['A', 'B'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_mixed_dataframe():
    """Only the row repeating an (A, B) pair is removed; the rest are kept."""
    frame = pd.DataFrame({'A': [1, 2, 2, 3, 4], 'B': ['x', 'y', 'y', 'z', 'x'], 'C': [10, 20, 20, 30, 40]})
    # (4, 'x') shares 'B' with the first row but not 'A', so it must survive.
    deduplicated = pd.DataFrame({'A': [1, 2, 3, 4], 'B': ['x', 'y', 'z', 'x'], 'C': [10, 20, 30, 40]})
    codeflash_output = drop_duplicates(frame, subset=['A', 'B'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

from typing import List  # used for type hinting

import pandas as pd  # used for DataFrame manipulation
# imports
import pytest  # used for our unit tests
from src.numpy_pandas.dataframe_operations import drop_duplicates


# unit tests
def test_basic_functionality():
    """Check explicit-subset deduplication on one- and two-column frames."""
    # One column, all values distinct: the frame is expected back unchanged.
    frame = pd.DataFrame({'col1': ['A', 'B', 'C']})
    codeflash_output = drop_duplicates(frame, subset=['col1'])
    pd.testing.assert_frame_equal(codeflash_output, frame)

    # One column, trailing repeat of 'A': only the first 'A' is kept.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A']})
    deduplicated = pd.DataFrame({'col1': ['A', 'B']})
    codeflash_output = drop_duplicates(frame, subset=['col1'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Two columns, every (col1, col2) pair distinct: unchanged.
    frame = pd.DataFrame({'col1': ['A', 'B', 'C'], 'col2': [1, 2, 3]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, frame)

    # Two columns, the pair ('A', 1) occurs twice: the later one is dropped.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': ['A', 'B'], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_edge_cases():
    """Cover the zero-row, single-row and all-rows-identical boundaries."""
    # Column labels but no rows: expected back unchanged.
    frame = pd.DataFrame(columns=['col1', 'col2'])
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, frame)

    # Exactly one row: nothing can be a duplicate.
    frame = pd.DataFrame({'col1': ['A'], 'col2': [1]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, frame)

    # Three copies of the same row: a single row remains.
    frame = pd.DataFrame({'col1': ['A', 'A', 'A'], 'col2': [1, 1, 1]})
    deduplicated = pd.DataFrame({'col1': ['A'], 'col2': [1]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_subset_parameter():
    """Exercise the `subset` argument: omitted, one column, two columns, unknown."""
    # No subset passed: rows are compared across all columns.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': ['A', 'B'], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame)
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Subset of one column: rows 0 and 2 differ in col2 but still collapse.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2, 3]})
    deduplicated = pd.DataFrame({'col1': ['A', 'B'], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Subset naming both columns explicitly.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': ['A', 'B'], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Subset naming a column the frame does not have: a KeyError is expected.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2, 1]})
    with pytest.raises(KeyError):
        drop_duplicates(frame, subset=['col3'])

def test_data_types():
    """Deduplicate frames whose columns hold mixed, numeric and string values."""
    # col2 mixes int, float and str; no (col1, col2) pair repeats, so no change.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2.0, '1']})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, frame)

    # Purely numeric columns with the pair (1, 2) repeated.
    frame = pd.DataFrame({'col1': [1, 3, 1], 'col2': [2, 4, 2]})
    deduplicated = pd.DataFrame({'col1': [1, 3], 'col2': [2, 4]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Purely string columns with the pair ('A', 'B') repeated.
    frame = pd.DataFrame({'col1': ['A', 'C', 'A'], 'col2': ['B', 'D', 'B']})
    deduplicated = pd.DataFrame({'col1': ['A', 'C'], 'col2': ['B', 'D']})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_performance_and_scalability():
    """Deduplicate 1000-row frames, with and without repeated rows.

    The expected value for the duplicate-heavy case is built by hand rather
    than by calling ``DataFrame.drop_duplicates``: deriving the oracle from
    the same pandas call the function under test may delegate to would make
    the assertion unable to fail.
    """
    # 1000 rows, every (col1, col2) pair distinct: the frame comes back as-is.
    frame = pd.DataFrame({'col1': list(range(1000)), 'col2': list(range(1, 1001))})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, frame)

    # 1000 rows in which each pair (k, k + 1) appears twice in a row, for
    # k = 0..499. Keeping first occurrences leaves exactly those 500 pairs,
    # in order, on a fresh 0..499 index.
    frame = pd.DataFrame({'col1': [i // 2 for i in range(1000)], 'col2': [i // 2 + 1 for i in range(1000)]})
    deduplicated = pd.DataFrame({'col1': list(range(500)), 'col2': list(range(1, 501))})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_complex_scenarios():
    """Deduplicate frames containing missing values and datetime columns."""
    # A missing value in the numeric column; only the ('A', 1) repeat goes.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, None, 1]})
    deduplicated = pd.DataFrame({'col1': ['A', 'B'], 'col2': [1, None]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # A missing value in the string column; again only ('A', 1) repeats.
    frame = pd.DataFrame({'col1': ['A', None, 'A'], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': ['A', None], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Timestamp keys: the repeated (2023-01-01, 1) row is dropped.
    frame = pd.DataFrame({'col1': [pd.Timestamp('2023-01-01'), pd.Timestamp('2023-01-02'), pd.Timestamp('2023-01-01')], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': [pd.Timestamp('2023-01-01'), pd.Timestamp('2023-01-02')], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

def test_rare_edge_cases():
    """Exercise unusual inputs: odd value types, odd labels and odd indexes.

    Three sub-cases use expectations that follow from Python value semantics
    rather than from a 2-row result:

    * list cells are unhashable, so hash-based deduplication raises
      ``TypeError``;
    * separate ``object()`` instances never compare equal, so no row is a
      duplicate;
    * ``'B'`` and ``b'B'`` are different values, so no row is a duplicate.
    """
    # Strings containing punctuation.
    frame = pd.DataFrame({'col1': ['A@', 'B#', 'A@'], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': ['A@', 'B#'], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Boolean key column.
    frame = pd.DataFrame({'col1': [True, False, True], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': [True, False], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # List cells cannot be hashed, so deduplicating on them is an error.
    # NOTE(review): assumes the implementation hashes cell values (as pandas'
    # own drop_duplicates does) -- confirm against the function under test.
    frame = pd.DataFrame({'col1': [[1, 2], [3, 4], [1, 2]], 'col2': ['A', 'B', 'A']})
    with pytest.raises(TypeError):
        drop_duplicates(frame, subset=['col1', 'col2'])

    # A single column mixing str and int values.
    frame = pd.DataFrame({'col1': ['A', 1, 'A'], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': ['A', 1], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Two columns sharing the label 'col1'.
    # NOTE(review): behaviour with duplicate labels depends on how the
    # implementation selects the subset columns -- confirm.
    frame = pd.DataFrame([['A', 1], ['B', 2], ['A', 1]], columns=['col1', 'col1'])
    deduplicated = pd.DataFrame([['A', 1], ['B', 2]], columns=['col1', 'col1'])
    codeflash_output = drop_duplicates(frame, subset=['col1'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Two-level index on the input; the expected frame carries a plain
    # 0..n-1 index, i.e. the original index is not preserved.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2, 1]}, index=[['x', 'y', 'x'], [1, 2, 1]])
    deduplicated = pd.DataFrame({'col1': ['A', 'B'], 'col2': [1, 2]}, index=[['x', 'y'], [1, 2]]).reset_index(drop=True)
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Timezone-aware timestamps as keys.
    frame = pd.DataFrame({'col1': [pd.Timestamp('2023-01-01', tz='UTC'), pd.Timestamp('2023-01-02', tz='UTC'), pd.Timestamp('2023-01-01', tz='UTC')], 'col2': [1, 2, 1]})
    deduplicated = pd.DataFrame({'col1': [pd.Timestamp('2023-01-01', tz='UTC'), pd.Timestamp('2023-01-02', tz='UTC')], 'col2': [1, 2]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # Three distinct object() instances: each equals only itself, so rows 0
    # and 2 differ in col1 even though col2 matches. Nothing is removed.
    frame = pd.DataFrame({'col1': [object(), object(), object()], 'col2': [1, 2, 1]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, frame)

    # Named, non-default index on the input; expected frame has a plain index.
    frame = pd.DataFrame({'col1': ['A', 'B', 'A'], 'col2': [1, 2, 1]}, index=pd.Index([1, 2, 3], name='custom_index'))
    deduplicated = pd.DataFrame({'col1': ['A', 'B'], 'col2': [1, 2]}, index=pd.Index([1, 2], name='custom_index')).reset_index(drop=True)
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, deduplicated)

    # str 'B' versus bytes b'B': different values, so ('A', 'B') and
    # ('A', b'B') are distinct rows and nothing is removed.
    frame = pd.DataFrame({'col1': ['A', 'C', 'A'], 'col2': ['B', 'D', 'B'.encode('utf-8')]})
    codeflash_output = drop_duplicates(frame, subset=['col1', 'col2'])
    pd.testing.assert_frame_equal(codeflash_output, frame)
# codeflash_output is used to check that the output of the original code is the same as that of the optimized code.

To edit these changes, run `git checkout codeflash/optimize-drop_duplicates-m9piw4rh` and push.

Codeflash

To improve the performance of this code, we can leverage the built-in `drop_duplicates` method provided by pandas, which is optimized for such operations. Using this built-in method is both faster and more concise. Here is the optimized version of the function.



This optimized version leverages the efficient internal implementation of `drop_duplicates` provided by pandas, significantly improving the runtime.
@codeflash-ai codeflash-ai bot added the ⚡️ codeflash Optimization PR opened by Codeflash AI label Apr 20, 2025
@codeflash-ai codeflash-ai bot requested a review from KRRT7 April 20, 2025 10:47
@codeflash-ai codeflash-ai bot deleted the codeflash/optimize-drop_duplicates-m9piw4rh branch May 20, 2025 05:34
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
⚡️ codeflash Optimization PR opened by Codeflash AI
Projects
None yet
Development

Successfully merging this pull request may close these issues.

1 participant