From ea79b79a8f2c4bef935ee65c7a5678b0a25d5c34 Mon Sep 17 00:00:00 2001
From: Dmitry Chigarev
Date: Wed, 11 Oct 2023 12:26:03 +0000
Subject: [PATCH] adjust threshold

Signed-off-by: Dmitry Chigarev
---
 .../pandas_on_ray/partitioning/partition_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py
index 95a0cf652ce..7a5e1aa0c75 100644
--- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py
+++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition_manager.py
@@ -73,11 +73,11 @@ def split_pandas_df_into_partitions(
         # 3. The distributed splitting consumes more memory that the sequential one.
         #    It was estimated that it requires ~2.5x of the dataframe size, so to avoid
         #    OOM problems, we fall back to sequential implementation in case it doesn't
-        #    fit into memory (using 3x threshold to be on the safe side).
+        #    fit into memory (using 3.5x threshold to be on the safe side).
         enough_elements = (len(df) * len(df.columns)) > 6_000_000
         all_numeric_types = all(is_numeric_dtype(dtype) for dtype in df.dtypes)
         three_copies_fits_into_memory = psutil.virtual_memory().available > (
-            df.memory_usage().sum() * 3
+            df.memory_usage().sum() * 3.5
         )
         distributed_splitting = (
             enough_elements and all_numeric_types and three_copies_fits_into_memory
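
For reference, below is a minimal standalone sketch of the heuristic this patch tunes. It is not Modin's actual API; the function name should_split_in_parallel and the module-level constants are hypothetical, and the real check lives inside split_pandas_df_into_partitions in partition_manager.py. It only illustrates the decision: use distributed splitting when the frame is large, all-numeric, and available memory exceeds 3.5x the frame's in-memory size (the patch raises the multiplier from 3 to 3.5 for extra headroom over the observed ~2.5x peak usage).

    # Sketch only: mirrors the patched condition, not Modin's internals.
    import pandas as pd
    import psutil
    from pandas.api.types import is_numeric_dtype

    MIN_ELEMENTS = 6_000_000   # same element-count threshold as in the patch
    MEMORY_HEADROOM = 3.5      # the new, more conservative multiplier

    def should_split_in_parallel(df: pd.DataFrame) -> bool:
        """Return True when distributed splitting is expected to be safe and worthwhile."""
        enough_elements = len(df) * len(df.columns) > MIN_ELEMENTS
        all_numeric = all(is_numeric_dtype(dtype) for dtype in df.dtypes)
        fits_in_memory = (
            psutil.virtual_memory().available > df.memory_usage().sum() * MEMORY_HEADROOM
        )
        return enough_elements and all_numeric and fits_in_memory

    if __name__ == "__main__":
        df = pd.DataFrame({"a": range(10), "b": range(10)})
        print(should_split_in_parallel(df))  # False: far below the element threshold

The only behavioral change in the patch is the multiplier in the memory check; the element-count and dtype conditions are unchanged.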