
Commit 398a4a1

kunalgosar authored and devin-petersohn committed
Adding reproducible benchmarks (#26)
* add read_csv benchmarks
* add to_csv benchmark
* set custom log location
* bug fix
* Adding an arithmetic benchmark
* adding in scripts to run benchmarks
* cleaning log file name
* specify variable number of runs
* adding future imports
* Join and merge benchmark
* Adding join and merge benchmark
* Updating run file
* Addressing comments
* ray.wait
* clean up join benchmark
* add timeout to benchmarks and fix join
* small fix
* add pandas benchmarking scripts
* fixing pandas benchmarks
* small fix
* Add rw_benchmark and parsing
* add groupby benchmarks
* revert gitignore
* remove duplicate file
* Add plots and fix flake (#2)
* Add some plotting
* Fix flake8
* small fixes
* rename file
* small fixes
* Fix indexing with _blk_to_series
* rename log file
* bug fix
* ray.wait on groupby
* create plots
* Add transpose operation
* remove accidently commited file
* modify benchmarks
* cleanup
* update gitignore
* cleanup
* remove parsing notebook
* revert blocks to series changes
1 parent e11135b · commit 398a4a1

17 files changed: 717 additions & 0 deletions
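
Each benchmark script below imports a time_logger context manager from a utils module that is not among the files shown on this page. A minimal sketch of such a helper, assuming it logs elapsed wall-clock time through the standard logging module (the commit's actual implementation may differ):

    # Hypothetical sketch of the time_logger helper the benchmarks import;
    # the actual utils.py in this commit is not shown in this view.
    from contextlib import contextmanager
    import logging
    import time


    @contextmanager
    def time_logger(name):
        # Log wall-clock time for the enclosed block under the given label.
        start = time.time()
        try:
            yield
        finally:
            logging.info("%s; Time: %s seconds", name, time.time() - start)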

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ venv.bak/
 .idea/**/usage.statistics.xml
 .idea/**/dictionaries
 .idea/**/shelf
+*.DS_Store
 
 # Sensitive or high-churn files
 .idea/**/dataSources/

benchmarks/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
*.csv
*.png

benchmarks/arithmetic_benchmark.py

Lines changed: 62 additions & 0 deletions
@@ -0,0 +1,62 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import argparse
import ray
import os
import modin.pandas as pd

from utils import time_logger


parser = argparse.ArgumentParser(description='arithmetic benchmark')
parser.add_argument('--path', dest='path', help='path to the csv data file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file = args.path
file_size = os.path.getsize(file)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

# Materialize the frame before timing: wait on every Ray block partition.
df = pd.read_csv(file)
blocks = df._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

with time_logger("Transpose: {}; Size: {} bytes".format(file, file_size)):
    # Wait on the transposed frame's block partitions so the transpose
    # actually executes inside the timed region.
    blocks = df.T._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))

with time_logger("Sum on axis=0: {}; Size: {} bytes".format(file, file_size)):
    df.sum()

with time_logger("Sum on axis=1: {}; Size: {} bytes".format(file, file_size)):
    df.sum(axis=1)

with time_logger("Median on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.median()

with time_logger("Median on axis=1: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.median(axis=1)

with time_logger("nunique on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.nunique()

with time_logger("nunique on axis=1: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.nunique(axis=1)

with time_logger("Sum UDF on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.apply(lambda df: df.sum())

with time_logger("Sum UDF on axis=1: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.apply(lambda df: df.sum(), axis=1)
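
A typical invocation, assuming one of the data files produced by benchmarks/generate_data.py below (the logs path is an arbitrary choice, not taken from the commit; the script creates the log directory if it is missing):

    python benchmarks/arithmetic_benchmark.py --path benchmarks/data/test-data-100-1000.csv --logfile benchmarks/logs/arithmetic.log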

benchmarks/df_op_benchmark.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import argparse
import ray
import os
import modin.pandas as pd

from utils import time_logger
import numpy as np

parser = argparse.ArgumentParser(description='dataframe op benchmark')
parser.add_argument('--path', dest='path', help='path to the csv data file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file = args.path
file_size = os.path.getsize(file)

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df = pd.read_csv(file)
blocks = df._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

num_rows, num_cols = df.shape
new_row = np.random.randint(0, 100, size=num_cols)
new_col = np.random.randint(0, 100, size=num_rows)


def rand_row_loc():
    return np.random.randint(0, num_rows)


def rand_col_loc():
    return np.random.randint(0, num_cols)


# row/col r/w; read results are discarded, only access time is measured
with time_logger("read a column: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[:, rand_col_loc()]

with time_logger("read a row: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), :]

with time_logger("write a column: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[:, rand_col_loc()] = new_col

with time_logger("write a row: {}; Size: {} bytes".format(file, file_size)):
    df.iloc[rand_row_loc(), :] = new_row

# element r/w
with time_logger("read an element: {}; Size: {} bytes".format(file,
                                                               file_size)):
    df.iloc[rand_row_loc(), rand_col_loc()]

with time_logger("write an element: {}; Size: {} bytes".format(
        file, file_size)):
    df.iloc[rand_row_loc(), rand_col_loc()] = np.random.randint(0, 100)

# appending
with time_logger("append a row: {}; Size: {} bytes".format(file, file_size)):
    df.append(pd.Series(new_row), ignore_index=True)

with time_logger("append a column: {}; Size: {} bytes".format(file,
                                                               file_size)):
    df['new'] = new_col

benchmarks/generate_data.py

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import modin.pandas as pd
import numpy as np
import os

num_rows = [100, 10000, 100000, 150000, 200000, 350000, 500000]
num_cols = [1000]

path_to_data = "benchmarks/data/"
if not os.path.exists(path_to_data):
    os.makedirs(path_to_data)

for r in num_rows:
    for c in num_cols:
        df = pd.DataFrame(np.random.randint(0, 100, size=(r, c)))
        df.to_csv(path_to_data + "test-data-{}-{}.csv".format(r, c))

# Files for multi df tests
num_rows = [100, 1000, 100000, 1000000]
num_cols = [1000]

path_to_data = "benchmarks/data/multi/"
if not os.path.exists(path_to_data):
    os.makedirs(path_to_data)

for r in num_rows:
    for c in num_cols:
        df = pd.DataFrame(np.random.randint(0, 100, size=(r, c)))
        df.to_csv(path_to_data + "test-data-{}-{}.csv".format(r, c))
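
The commit message mentions scripts to run the benchmarks and a run file; those are among the 17 changed files not shown on this page. A hypothetical runner that sweeps the generated files through one of the scripts above might look like this (the file layout and log path are illustrative assumptions, not taken from the commit):

    # Hypothetical runner sketch; the actual run scripts in this commit
    # are not shown here.
    import glob
    import subprocess

    for csv_file in sorted(glob.glob("benchmarks/data/test-data-*.csv")):
        subprocess.check_call([
            "python", "benchmarks/arithmetic_benchmark.py",
            "--path", csv_file,
            "--logfile", "benchmarks/logs/arithmetic.log",
        ])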

benchmarks/groupby_benchmark.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import argparse
import ray
import os
import modin.pandas as pd

from utils import time_logger


parser = argparse.ArgumentParser(description='groupby benchmark')
parser.add_argument('--path', dest='path', help='path to the csv data file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file = args.path
file_size = os.path.getsize(file)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df = pd.read_csv(file)
blocks = df._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

# The first timed block includes building the groupby object; the second
# reuses it, so only the mean aggregation is measured there.
with time_logger("Groupby + sum aggregation on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    df_groupby = df.groupby('1')
    blocks = df_groupby.sum()._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))

with time_logger("Groupby mean on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    blocks = df_groupby.mean()._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))
benchmarks/io_benchmark.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import argparse
import os
import ray
import modin.pandas as pd

from utils import time_logger


parser = argparse.ArgumentParser(description='read_csv benchmark')
parser.add_argument('--path', dest='path', help='path to the csv file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file = args.path
file_size = os.path.getsize(file)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

with time_logger("Read csv file: {}; Size: {} bytes".format(file, file_size)):
    df = pd.read_csv(file)
    blocks = df._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))
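
Every Modin script here ends its timed regions with the same two lines: flatten the private _block_partitions grid into a list of Ray object IDs, then ray.wait on all of them, so the lazily scheduled work actually finishes inside the timer. A small helper consolidating the pattern, as an illustration (wait_on_all is a name introduced here, not part of the commit):

    # Illustrative helper, not part of the commit: block until every Ray
    # partition backing a Modin DataFrame has been computed.
    import ray


    def wait_on_all(df):
        blocks = df._block_partitions.flatten().tolist()
        ray.wait(blocks, num_returns=len(blocks))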

benchmarks/join_merge_benchmark.py

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import argparse
import ray
import os
import modin.pandas as pd

from utils import time_logger


parser = argparse.ArgumentParser(description='join and merge benchmark')
parser.add_argument('--left', dest='left', help='path to the left csv data '
                                                'file')
parser.add_argument('--right', dest='right', help='path to the right csv data '
                                                  'file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file_left = args.left
file_size_left = os.path.getsize(file_left)

file_right = args.right
file_size_right = os.path.getsize(file_right)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df_left = pd.read_csv(file_left)
df_right = pd.read_csv(file_right)

blocks = df_left._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))
blocks = df_right._block_partitions.flatten().tolist()
ray.wait(blocks, len(blocks))

with time_logger("Inner Join: {} & {}; Left Size: {} bytes; Right Size: {} "
                 "bytes".format(file_left, file_right, file_size_left,
                                file_size_right)):
    result = df_left.join(df_right, how="inner", lsuffix='left_')
    # Wait on all result partitions so the full join completes inside
    # the timer.
    blocks = result._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))

with time_logger("Outer Join: {} & {}; Left Size: {} bytes; Right Size: {} "
                 "bytes".format(file_left, file_right, file_size_left,
                                file_size_right)):
    result = df_left.join(df_right, how="outer", lsuffix='left_')
    blocks = result._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))

with time_logger("Inner Merge: {} & {}; Left Size: {} bytes; Right Size: {} "
                 "bytes".format(file_left, file_right, file_size_left,
                                file_size_right)):
    result = df_left.merge(df_right, how="inner",
                           left_index=True, right_index=True)
    blocks = result._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))

with time_logger("Outer Merge: {} & {}; Left Size: {} bytes; Right Size: {} "
                 "bytes".format(file_left, file_right, file_size_left,
                                file_size_right)):
    result = df_left.merge(df_right, how="outer",
                           left_index=True, right_index=True)
    blocks = result._block_partitions.flatten().tolist()
    ray.wait(blocks, len(blocks))
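
The next file's name is cut off in this view; it mirrors arithmetic_benchmark.py using stock pandas in place of modin.pandas, and appears to be one of the pandas benchmarking scripts the commit message mentions, serving as the single-node baseline.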
Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import logging
import argparse
import os
import pandas as pd

from utils import time_logger


parser = argparse.ArgumentParser(description='arithmetic benchmark')
parser.add_argument('--path', dest='path', help='path to the csv data file')
parser.add_argument('--logfile', dest='logfile', help='path to the log file')
args = parser.parse_args()
file = args.path
file_size = os.path.getsize(file)

if not os.path.exists(os.path.split(args.logfile)[0]):
    os.makedirs(os.path.split(args.logfile)[0])

logging.basicConfig(filename=args.logfile, level=logging.INFO)

df = pd.read_csv(file)

with time_logger("Transpose: {}; Size: {} bytes".format(file, file_size)):
    df.T

with time_logger("Sum on axis=0: {}; Size: {} bytes".format(file, file_size)):
    df.sum()

with time_logger("Sum on axis=1: {}; Size: {} bytes".format(file, file_size)):
    df.sum(axis=1)

with time_logger("Median on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.median()

with time_logger("Median on axis=1: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.median(axis=1)

with time_logger("nunique on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.nunique()

with time_logger("nunique on axis=1: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.nunique(axis=1)

with time_logger("Sum UDF on axis=0: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.apply(lambda df: df.sum())

with time_logger("Sum UDF on axis=1: {}; Size: {} bytes"
                 .format(file, file_size)):
    df.apply(lambda df: df.sum(), axis=1)
