Implemented DataFrame splitting tool and its unit test

petrobras · Reinaldo-Kn · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024 · Oct 15, 2024
commit 72be3cc0430c5feec687c12909c3afea9ced9a51
diff --git a/.gitignore b/.gitignore
@@ -172,3 +172,5 @@ dmypy.json
 # Pyre type checker
 .pyre/
 
+# Temporary files
+temp.py
diff --git a/bibmon/_bibmon_tools.py b/bibmon/_bibmon_tools.py
@@ -739,4 +739,40 @@ def find_df_transitions(
                 transitions.append(i)
                 previous_event = df[label].iloc[i]
 
-    return transitions
+    return transitions
+
+###############################################################################
+
+def split_df_percentages(df: pd.DataFrame, percentages: list[float]) -> list[pd.DataFrame]:
+    """
+    Splits a DataFrame into consecutive DataFrames according to the given
+    percentages, which must sum to 1.
+
+    For example, if percentages = [0.6, 0.2, 0.2], the function returns a
+    list with three DataFrames: the first one with 60% of the rows, the
+    second one with the next 20% and the third one with the following 20%.
+
+    Warning: each chunk holds int(percentage * len(df)) rows (the product is
+    truncated), so trailing rows are dropped when the split cannot be done
+    exactly according to the percentages.
+
+    Parameters
+    ----------
+    df: pandas.DataFrame
+        Data to be split.
+    percentages: list of floats
+        Fractions of the rows assigned to each chunk, in order.
+
+    Returns
+    ----------
+    : list of pandas.DataFrames
+        List with the split DataFrames, sliced positionally with ``iloc``.
+
+    Raises
+    ----------
+    ValueError
+        If the percentages do not sum to 1 (within floating-point tolerance).
+    """
+
+    import math
+
+    # An exact `!= 1` comparison rejects valid inputs such as
+    # [0.7, 0.2, 0.1], whose floating-point sum is 0.9999999999999999.
+    if not math.isclose(sum(percentages), 1.0):
+        raise ValueError("The sum of the percentages must be 1.")
+
+    n_rows = len(df)
+    split_dfs = []
+    start = 0
+
+    for percentage in percentages:
+        end = start + int(percentage * n_rows)
+        split_dfs.append(df.iloc[start:end])
+        start = end
+
+    return split_dfs
diff --git a/test/test_tools.py b/test/test_tools.py
@@ -68,4 +68,21 @@ def test_find_df_transitions():
 
     transitions = bibmon._bibmon_tools.find_df_transitions(data, 1, "number", "tag101")
 
-    assert transitions == [99, 101, 102, 103, 104, 106, 107, 108, 243]
+    assert transitions == [99, 101, 102, 103, 104, 106, 107, 108, 243]
+
+def test_split_df_percentages():
+    """A 60/20/20 split of the real data yields 1901, 633 and 633 rows."""
+    real_data = bibmon.load_real_data()
+    chunks = bibmon._bibmon_tools.split_df_percentages(real_data, [0.6, 0.2, 0.2])
+
+    # Check the chunks in order, one expected row count per position.
+    for position, expected_rows in enumerate((1901, 633, 633)):
+        assert chunks[position].shape[0] == expected_rows
+
+def test_split_df_percentages_error():
+    """Percentages that do not sum to 1 must be rejected with ValueError."""
+    data = bibmon.load_real_data()
+
+    try:
+        bibmon._bibmon_tools.split_df_percentages(data, [0.6, 0.2])
+    except ValueError:
+        # Expected outcome: 0.6 + 0.2 does not add up to 1.
+        pass
+    else:
+        # Without this branch the test would pass even if no error was raised.
+        raise AssertionError(
+            "split_df_percentages did not raise ValueError for percentages "
+            "that do not sum to 1"
+        )
-Original file line number
+Diff line change
@@ Expand Up / @@ -172,3 +172,5 @@ dmypy.json @@
     # Pyre type checker
     .pyre/
+    # Temporary files
+    temp.py