diff --git a/.github/actions/mamba-env/action.yml b/.github/actions/mamba-env/action.yml
new file mode 100644
index 00000000000..6badcf68e15
--- /dev/null
+++ b/.github/actions/mamba-env/action.yml
@@ -0,0 +1,44 @@
+name: "Install environment using Mamba"
+description: "Prepare the environment to run Modin"
+inputs:
+ python-version:
+ description: "Python version to install"
+ default: "3.8"
+ environment-file:
+ description: "Conda environment yml"
+ required: true
+ activate-environment:
+ description: "Conda environment to activate"
+ default: "modin"
+
+runs:
+ using: "composite"
+ steps:
+ - name: Get current week
+ id: get-week
+      # use the current week as part of the cache key so the cache refreshes
+      # periodically: the key is otherwise based only on the requirements
+      # file, while dependencies publish new versions at an irregular pace
+ run: echo "thisweek=$(/bin/date -u '+%Y.w%W')" >> $GITHUB_OUTPUT
+ shell: bash
+ - name: Cache conda
+ id: cache-conda
+ uses: actions/cache@v3
+ with:
+ path: |
+ ~/conda_pkgs_dir
+ ~/.cache/pip
+ key:
+ ${{ runner.os }}-conda-${{ steps.get-week.outputs.thisweek }}-${{ hashFiles(inputs.environment-file) }}
+ - uses: conda-incubator/setup-miniconda@v2
+ with:
+ miniforge-variant: Mambaforge
+ miniforge-version: latest
+ use-mamba: true
+ activate-environment: ${{ inputs.activate-environment }}
+ environment-file: ${{ inputs.environment-file }}
+ python-version: ${{ inputs.python-version }}
+ channel-priority: strict
+ # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
+ # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
+ use-only-tar-bz2: false
diff --git a/.github/actions/python-only/action.yml b/.github/actions/python-only/action.yml
new file mode 100644
index 00000000000..2fe3d23c4fc
--- /dev/null
+++ b/.github/actions/python-only/action.yml
@@ -0,0 +1,15 @@
+name: "Install Python only"
+description: "Prepare the environment to run simple tasks"
+inputs:
+ python-version:
+ description: "Python version to install"
+ default: "3.8.x"
+
+runs:
+ using: "composite"
+ steps:
+ - uses: actions/setup-python@v4
+ with:
+ python-version: ${{ inputs.python-version }}
+ architecture: "x64"
+ cache: 'pip'
diff --git a/.github/actions/run-core-tests/action.yml b/.github/actions/run-core-tests/action.yml
new file mode 100644
index 00000000000..ae0f21e08d6
--- /dev/null
+++ b/.github/actions/run-core-tests/action.yml
@@ -0,0 +1,29 @@
+name: "Run core Modin tests"
+description: "Run core Modin tests like dataframe or groupby"
+inputs:
+ runner:
+ description: "Runner for tests"
+ default: "python -m pytest"
+ parallel:
+ description: "How to run tests in parallel"
+ default: "-n 2"
+
+runs:
+ using: "composite"
+ steps:
+ - uses: ./.github/actions/run-core-tests/group_1
+ with:
+ runner: ${{ inputs.runner }}
+ parallel: ${{ inputs.parallel }}
+ - uses: ./.github/actions/run-core-tests/group_2
+ with:
+ runner: ${{ inputs.runner }}
+ parallel: ${{ inputs.parallel }}
+ - uses: ./.github/actions/run-core-tests/group_3
+ with:
+ runner: ${{ inputs.runner }}
+ parallel: ${{ inputs.parallel }}
+ - uses: ./.github/actions/run-core-tests/group_4
+ with:
+ runner: ${{ inputs.runner }}
+ parallel: ${{ inputs.parallel }}
diff --git a/.github/actions/run-core-tests/group_1/action.yml b/.github/actions/run-core-tests/group_1/action.yml
new file mode 100644
index 00000000000..a338209e656
--- /dev/null
+++ b/.github/actions/run-core-tests/group_1/action.yml
@@ -0,0 +1,21 @@
+name: "Run core Modin tests - group 1"
+description: "Run core Modin tests like dataframe or groupby"
+inputs:
+ runner:
+ description: "Runner for tests"
+ default: "python -m pytest"
+ parallel:
+ description: "How to run tests in parallel"
+ default: "-n 2"
+
+runs:
+ using: "composite"
+ steps:
+ - run: |
+ echo "::group::Running dataframe tests (group 1)..."
+ ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_binary.py \
+ modin/pandas/test/dataframe/test_default.py \
+ modin/pandas/test/dataframe/test_indexing.py \
+ modin/pandas/test/dataframe/test_iter.py
+ echo "::endgroup::"
+ shell: bash -l {0}
diff --git a/.github/actions/run-core-tests/group_2/action.yml b/.github/actions/run-core-tests/group_2/action.yml
new file mode 100644
index 00000000000..d330e65061a
--- /dev/null
+++ b/.github/actions/run-core-tests/group_2/action.yml
@@ -0,0 +1,22 @@
+name: "Run core Modin tests - group 2"
+description: "Run core Modin tests like dataframe or groupby"
+inputs:
+ runner:
+ description: "Runner for tests"
+ default: "python -m pytest"
+ parallel:
+ description: "How to run tests in parallel"
+ default: "-n 2"
+
+runs:
+ using: "composite"
+ steps:
+ - run: |
+ echo "::group::Running dataframe tests (group 2)..."
+ ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_join_sort.py \
+ modin/pandas/test/dataframe/test_reduce.py \
+ modin/pandas/test/dataframe/test_udf.py \
+ modin/pandas/test/dataframe/test_window.py \
+ modin/pandas/test/dataframe/test_pickle.py
+ echo "::endgroup::"
+ shell: bash -l {0}
diff --git a/.github/actions/run-core-tests/group_3/action.yml b/.github/actions/run-core-tests/group_3/action.yml
new file mode 100644
index 00000000000..578673326f9
--- /dev/null
+++ b/.github/actions/run-core-tests/group_3/action.yml
@@ -0,0 +1,24 @@
+name: "Run core Modin tests - group 3"
+description: "Run core Modin tests like dataframe or groupby"
+inputs:
+ runner:
+ description: "Runner for tests"
+ default: "python -m pytest"
+ parallel:
+ description: "How to run tests in parallel"
+ default: "-n 2"
+
+runs:
+ using: "composite"
+ steps:
+ - run: |
+ echo "::group::Running tests (group 3)..."
+ ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_series.py \
+ modin/pandas/test/dataframe/test_map_metadata.py
+ echo "::endgroup::"
+ shell: bash -l {0}
+ - run: |
+ echo "::group::Running experimental groupby tests (group 3)..."
+ MODIN_EXPERIMENTAL_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py
+ echo "::endgroup::"
+ shell: bash -l {0}
diff --git a/.github/actions/run-core-tests/group_4/action.yml b/.github/actions/run-core-tests/group_4/action.yml
new file mode 100644
index 00000000000..a3588b1469b
--- /dev/null
+++ b/.github/actions/run-core-tests/group_4/action.yml
@@ -0,0 +1,27 @@
+name: "Run core Modin tests - group 4"
+description: "Run core Modin tests like dataframe or groupby"
+inputs:
+ runner:
+ description: "Runner for tests"
+ default: "python -m pytest"
+ parallel:
+ description: "How to run tests in parallel"
+ default: "-n 2"
+
+runs:
+ using: "composite"
+ steps:
+ - run: |
+ echo "::group::Running tests (group 4)..."
+ ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_rolling.py \
+ modin/pandas/test/test_expanding.py \
+ modin/pandas/test/test_groupby.py \
+ modin/pandas/test/test_reshape.py \
+ modin/pandas/test/test_general.py
+ echo "::endgroup::"
+ shell: bash -l {0}
+ - run: |
+ echo "::group::Running concat tests (group 4)..."
+        ${{ inputs.runner }} modin/pandas/test/test_concat.py  # Ray and Dask versions fail with -n 2
+ echo "::endgroup::"
+ shell: bash -l {0}
diff --git a/.github/workflows/upload-coverage/action.yml b/.github/actions/upload-coverage/action.yml
similarity index 100%
rename from .github/workflows/upload-coverage/action.yml
rename to .github/actions/upload-coverage/action.yml
diff --git a/.github/workflows/ci-notebooks.yml b/.github/workflows/ci-notebooks.yml
index 196a421edbc..b632604aa28 100644
--- a/.github/workflows/ci-notebooks.yml
+++ b/.github/workflows/ci-notebooks.yml
@@ -7,6 +7,7 @@ on:
- .github/workflows/ci-notebooks.yml
- setup.cfg
- setup.py
+ - requirements/env_hdk.yml
concurrency:
# Cancel other jobs in the same branch. We don't care whether CI passes
# on old commits.
@@ -26,41 +27,19 @@ jobs:
execution: [pandas_on_ray, pandas_on_dask, pandas_on_unidist, hdk_on_native]
steps:
- uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - uses: actions/setup-python@v4
- with:
- python-version: "3.8.x"
- architecture: "x64"
+ - uses: ./.github/actions/python-only
if: matrix.execution != 'hdk_on_native'
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('requirements/env_hdk.yml') }}
- if: matrix.execution == 'hdk_on_native'
- - uses: conda-incubator/setup-miniconda@v2
+ - uses: ./.github/actions/mamba-env
with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin_on_hdk
environment-file: requirements/env_hdk.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
+ activate-environment: modin_on_hdk
if: matrix.execution == 'hdk_on_native'
- name: Cache datasets
uses: actions/cache@v2
with:
path: taxi.csv
# update cache only if notebooks require it to be changed
- key: hashFiles("examples/tutorial/jupyter/**")
+ key: taxi-csv-dataset-${{ hashFiles('examples/tutorial/jupyter/**') }}
# replace modin with . in the tutorial requirements file for `pandas_on_ray` and
# `pandas_on_dask` since we need Modin built from sources
- run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt
diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml
index 8b74e67eaaf..4a07a2aea82 100644
--- a/.github/workflows/ci-required.yml
+++ b/.github/workflows/ci-required.yml
@@ -5,7 +5,11 @@ concurrency:
# on old commits.
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
+env:
+ MODIN_GITHUB_CI: true
+
jobs:
+
check-pr-title:
runs-on: ubuntu-latest
steps:
@@ -14,6 +18,7 @@ jobs:
# NOTE: If you change the allowed prefixes here, update
# the documentation about them in /docs/development/contributing.rst
regexp: '^(?:FEAT|DOCS|FIX|REFACTOR|TEST|PERF)-#\d+:'
+
build-docs:
name: build docs
runs-on: ubuntu-latest
@@ -29,5 +34,77 @@ jobs:
cache-dependency-path: '**/requirements-doc.txt'
- run: pip install -r docs/requirements-doc.txt
- run: cd docs && sphinx-build -T -E -W -b html . build
-env:
- MODIN_GITHUB_CI: true
+
+ lint-pydocstyle:
+ name: lint (pydocstyle)
+ runs-on: ubuntu-latest
+ steps:
+ - uses: actions/checkout@v3
+ - uses: ./.github/actions/python-only
+ # The `numpydoc` version here MUST match the versions in the dev requirements files.
+ - run: pip install pytest pytest-cov pydocstyle numpydoc==1.1.0 xgboost
+ - run: python -m pytest scripts/test
+ - run: pip install -e ".[all]"
+ - run: |
+ python scripts/doc_checker.py --add-ignore=D101,D102,D103,D105 --disable-numpydoc \
+ modin/pandas/dataframe.py modin/pandas/series.py \
+ modin/pandas/groupby.py \
+ modin/pandas/series_utils.py modin/pandas/general.py \
+ modin/pandas/plotting.py modin/pandas/utils.py \
+ modin/pandas/iterator.py modin/pandas/indexing.py \
+ - run: python scripts/doc_checker.py modin/core/dataframe
+ - run: python scripts/doc_checker.py modin/core/execution/dask
+ - run: |
+ python scripts/doc_checker.py \
+ modin/pandas/accessor.py modin/pandas/general.py \
+ modin/pandas/groupby.py modin/pandas/indexing.py \
+ modin/pandas/iterator.py modin/pandas/plotting.py \
+ modin/pandas/series_utils.py modin/pandas/utils.py \
+ modin/pandas/base.py \
+ modin/pandas/io.py \
+ asv_bench/benchmarks/utils \
+ asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \
+ asv_bench/benchmarks/scalability/__init__.py \
+ modin/core/io \
+ modin/experimental/core/execution/ray/implementations/pandas_on_ray \
+ modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \
+ modin/pandas/series.py \
+ modin/core/execution/python \
+ modin/pandas/dataframe.py \
+ modin/config/__init__.py \
+ modin/config/__main__.py \
+ modin/config/envvars.py \
+ modin/config/pubsub.py
+ - run: python scripts/doc_checker.py modin/distributed
+ - run: python scripts/doc_checker.py modin/utils.py
+ - run: python scripts/doc_checker.py modin/experimental/sklearn
+ - run: |
+ python scripts/doc_checker.py modin/experimental/xgboost/__init__.py \
+ modin/experimental/xgboost/utils.py modin/experimental/xgboost/xgboost.py \
+ modin/experimental/xgboost/xgboost_ray.py
+ - run: python scripts/doc_checker.py modin/core/execution/ray
+ - run: |
+ python scripts/doc_checker.py modin/core/execution/dispatching/factories/factories.py \
+ modin/core/execution/dispatching/factories/dispatcher.py \
+ - run: python scripts/doc_checker.py scripts/doc_checker.py
+ - run: |
+ python scripts/doc_checker.py modin/experimental/pandas/io.py \
+ modin/experimental/pandas/numpy_wrap.py modin/experimental/pandas/__init__.py
+ - run: python scripts/doc_checker.py modin/core/storage_formats/base
+ - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow
+ - run: python scripts/doc_checker.py modin/core/storage_formats/pandas
+ - run: |
+ python scripts/doc_checker.py \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/io \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py \
+ modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py \
+ - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/hdk
+ - run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol
+ - run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py
+ - run: python scripts/doc_checker.py modin/logging
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a1ad7c5076a..4a7b4f8cd53 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -5,6 +5,7 @@ on:
# NOTE: keep these paths in sync with the paths that trigger the
# fuzzydata Github Actions in .github/workflows/fuzzydata-test.yml
- .github/workflows/**
+ - .github/actions/**
- '!.github/workflows/push-to-master.yml'
- asv_bench/**
- modin/**
@@ -23,18 +24,14 @@ concurrency:
cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }}
env:
MODIN_GITHUB_CI: true
+
jobs:
lint-black:
name: lint (black)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - uses: actions/setup-python@v4
- with:
- python-version: "3.8.x"
- architecture: "x64"
+ - uses: ./.github/actions/python-only
- run: pip install black
# NOTE: keep the black command here in sync with the pre-commit hook in
# /contributing/pre-commit
@@ -45,106 +42,16 @@ jobs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - uses: actions/setup-python@v4
- with:
- python-version: "3.8.x"
- architecture: "x64"
+ - uses: ./.github/actions/python-only
- run: pip install -r requirements-dev.txt
- run: mypy --config-file mypy.ini
- lint-pydocstyle:
- if: github.event_name == 'pull_request'
- name: lint (pydocstyle)
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - uses: actions/setup-python@v4
- with:
- python-version: "3.8.x"
- architecture: "x64"
- # The `numpydoc` version here MUST match the versions in the dev requirements files.
- - run: pip install pytest pytest-cov pydocstyle numpydoc==1.1.0 xgboost
- - run: python -m pytest scripts/test
- - run: pip install -e ".[all]"
- - run: |
- python scripts/doc_checker.py --add-ignore=D101,D102,D103,D105 --disable-numpydoc \
- modin/pandas/dataframe.py modin/pandas/series.py \
- modin/pandas/groupby.py \
- modin/pandas/series_utils.py modin/pandas/general.py \
- modin/pandas/plotting.py modin/pandas/utils.py \
- modin/pandas/iterator.py modin/pandas/indexing.py \
- - run: python scripts/doc_checker.py modin/core/dataframe
- - run: python scripts/doc_checker.py modin/core/execution/dask
- - run: |
- python scripts/doc_checker.py \
- modin/pandas/accessor.py modin/pandas/general.py \
- modin/pandas/groupby.py modin/pandas/indexing.py \
- modin/pandas/iterator.py modin/pandas/plotting.py \
- modin/pandas/series_utils.py modin/pandas/utils.py \
- modin/pandas/base.py \
- modin/pandas/io.py \
- asv_bench/benchmarks/utils \
- asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \
- asv_bench/benchmarks/scalability/__init__.py \
- modin/core/io \
- modin/experimental/core/execution/ray/implementations/pandas_on_ray \
- modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \
- modin/pandas/series.py \
- modin/core/execution/python \
- modin/pandas/dataframe.py \
- modin/config/__init__.py \
- modin/config/__main__.py \
- modin/config/envvars.py \
- modin/config/pubsub.py
- - run: python scripts/doc_checker.py modin/distributed
- - run: python scripts/doc_checker.py modin/utils.py
- - run: python scripts/doc_checker.py modin/experimental/sklearn
- - run: |
- python scripts/doc_checker.py modin/experimental/xgboost/__init__.py \
- modin/experimental/xgboost/utils.py modin/experimental/xgboost/xgboost.py \
- modin/experimental/xgboost/xgboost_ray.py
- - run: python scripts/doc_checker.py modin/core/execution/ray
- - run: |
- python scripts/doc_checker.py modin/core/execution/dispatching/factories/factories.py \
- modin/core/execution/dispatching/factories/dispatcher.py \
- - run: python scripts/doc_checker.py scripts/doc_checker.py
- - run: |
- python scripts/doc_checker.py modin/experimental/pandas/io.py \
- modin/experimental/pandas/numpy_wrap.py modin/experimental/pandas/__init__.py
- - run: python scripts/doc_checker.py modin/core/storage_formats/base
- - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow
- - run: python scripts/doc_checker.py modin/core/storage_formats/pandas
- - run: |
- python scripts/doc_checker.py \
- modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe \
- modin/experimental/core/execution/native/implementations/hdk_on_native/io \
- modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning \
- modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py \
- modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py \
- modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py \
- modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py \
- modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py \
- modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py \
- - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/hdk
- - run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol
- - run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py
- - run: python scripts/doc_checker.py modin/logging
-
lint-flake8:
name: lint (flake8)
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - uses: actions/setup-python@v4
- with:
- python-version: "3.8.x"
- architecture: "x64"
+ - uses: ./.github/actions/python-only
# NOTE: If you are changing the set of packages installed here, make sure that
# the dev requirements match them.
- run: pip install flake8 flake8-print flake8-no-implicit-concat
@@ -152,130 +59,47 @@ jobs:
# /contributing/pre-commit
- run: flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py
- test-api:
+ test-api-and-no-engine:
+ name: Test API, headers and no-engine mode
runs-on: ubuntu-latest
- name: test api
defaults:
run:
- # `shell: bash -l {0}` - special way to activate modin environment
shell: bash -l {0}
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
- environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- - run: sudo apt update && sudo apt install -y libhdf5-dev
- - name: Api tests
- run: python -m pytest modin/pandas/test/test_api.py
- - name: Executions Api tests
- run: python -m pytest modin/test/test_executions_api.py
-
- test-headers:
- runs-on: ubuntu-latest
- name: test-headers
- defaults:
- run:
- shell: bash -l {0}
- steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
- environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- - name: Headers tests
- run: python -m pytest modin/test/test_headers.py
-
- test-clean-install-ubuntu:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
- runs-on: ubuntu-latest
- defaults:
- run:
- shell: bash -l {0}
- name: test-clean-install-ubuntu
- steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - uses: actions/setup-python@v4
- with:
- python-version: "3.8.x"
- architecture: "x64"
- - name: Clean install and run
- run: |
- python -m pip install -e ".[all]"
- MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
- MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
- MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
+ environment-file: requirements/requirements-no-engine.yml
+ - run: python -m pytest modin/pandas/test/test_api.py
+ - run: python -m pytest modin/test/test_executions_api.py
+ - run: python -m pytest modin/test/test_headers.py
+ - run: python -m pytest modin/core/execution/dispatching/factories/test/test_dispatcher.py::test_add_option
+ - uses: ./.github/actions/upload-coverage
- test-clean-install-windows:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
- runs-on: windows-latest
+ test-clean-install:
+ needs: [lint-flake8, lint-black]
+ strategy:
+ matrix:
+ os:
+ - ubuntu
+ - windows
+ runs-on: ${{ matrix.os }}-latest
defaults:
run:
shell: bash -l {0}
- name: test-clean-install-windows
+ name: test-clean-install-${{ matrix.os }}
steps:
- uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - uses: actions/setup-python@v4
- with:
- python-version: "3.8.x"
- architecture: "x64"
- - name: Clean install and run
+ - uses: ./.github/actions/python-only
+ - run: python -m pip install -e ".[all]"
+ - name: Ensure all engines start up
run: |
- python -m pip install -e ".[all]"
MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))"
test-internals:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -283,32 +107,9 @@ jobs:
name: test-internals
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- name: Internals tests
run: python -m pytest modin/core/execution/dispatching/factories/test/test_dispatcher.py modin/experimental/cloud/test/test_cloud.py
- run: python -m pytest modin/config/test
@@ -320,46 +121,10 @@ jobs:
- run: python -m pytest asv_bench/test/test_utils.py
- run: python -m pytest modin/test/interchange/dataframe_protocol/base
- run: python -m pytest modin/test/test_logging.py
- - uses: ./.github/workflows/upload-coverage
-
- test-no-engine:
- runs-on: ubuntu-latest
- defaults:
- run:
- shell: bash -l {0}
- steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('requirements-no-engine.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
- environment-file: requirements/requirements-no-engine.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- - run: python -m pytest modin/core/execution/dispatching/factories/test/test_dispatcher.py::test_add_option
- - uses: ./.github/workflows/upload-coverage
+ - uses: ./.github/actions/upload-coverage
test-defaults:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -372,58 +137,20 @@ jobs:
name: Test ${{ matrix.execution }} execution, Python 3.8
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 2
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
- run: python -m pytest modin/experimental/xgboost/test/test_default.py --execution=${{ matrix.execution }}
- run: python -m pytest -n 2 modin/test/storage_formats/base/test_internals.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_binary.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_default.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_indexing.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_iter.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_join_sort.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_reduce.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_udf.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_window.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/test_series.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/test_expanding.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/test_concat.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }}
- - run: MODIN_EXPERIMENTAL_GROUPBY=1 python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py --execution=${{ matrix.execution }}
- - run: python -m pytest -n 2 modin/pandas/test/test_general.py --execution=${{ matrix.execution }}
- - uses: ./.github/workflows/upload-coverage
+ - uses: ./.github/actions/run-core-tests
+ with:
+ runner: python -m pytest --execution=${{ matrix.execution }}
+ - uses: ./.github/actions/upload-coverage
test-hdk:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -443,32 +170,10 @@ jobs:
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 2
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('requirements/env_hdk.yml') }}
- - name: Setting up Modin environment
- uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin_on_hdk
environment-file: requirements/env_hdk.yml
- python-version: 3.8
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
+ activate-environment: modin_on_hdk
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
- run: python -m pytest modin/test/storage_formats/hdk/test_internals.py
@@ -498,11 +203,11 @@ jobs:
examples/data/plasticc_training_set_metadata_1k.csv \
examples/data/plasticc_test_set_metadata_1k.csv \
-no-ml
- - uses: ./.github/workflows/upload-coverage
+ - uses: ./.github/actions/upload-coverage
test-asv-benchmarks:
if: github.event_name == 'pull_request'
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -528,14 +233,15 @@ jobs:
pip install git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9
- name: Running benchmarks
run: |
- # ASV correctly creates environments for testing only from the branch
- # with `master` name
- git checkout -b master
- cd asv_bench
- asv check -v
git remote add upstream https://github.com/modin-project/modin.git
git fetch upstream
if git diff upstream/master --name-only | grep -q "^asv_bench/"; then
+ # ASV correctly creates environments for testing only from the branch
+ # with `master` name
+ git checkout -b master
+ cd asv_bench
+ asv check -v
+
asv machine --yes
# check Modin on Ray
@@ -567,8 +273,43 @@ jobs:
path: asv_bench/benchmarks.log
if: failure()
+ execution-filter:
+ # see if execution backend-specific changes were made
+ runs-on: ubuntu-latest
+ outputs:
+ ray: ${{ steps.filter.outputs.ray }}
+ dask: ${{ steps.filter.outputs.dask }}
+ unidist: ${{ steps.filter.outputs.unidist }}
+ engines: ${{ steps.engines.outputs.engines }}
+ experimental: ${{ steps.experimental.outputs.experimental }}
+ steps:
+ - uses: actions/checkout@v3
+ - uses: dorny/paths-filter@v2
+ id: filter
+ with:
+ filters: |
+ shared: &shared
+ - 'modin/core/execution/dispatching/**'
+ ray:
+ - *shared
+ - 'modin/core/execution/ray/**'
+ dask:
+ - *shared
+ - 'modin/core/execution/dask/**'
+ unidist:
+ - *shared
+ - 'modin/core/execution/unidist/**'
+ experimental:
+ - 'modin/experimental/**'
+ - uses: actions/setup-python@v4
+ - id: engines
+ run: |
+ python -c "import sys, json; print('engines=' + json.dumps(['python'] + (sys.argv[1] == 'true' and ['ray'] or []) + (sys.argv[2] == 'true' and ['dask'] or []) ))" \
+ "${{ steps.filter.outputs.ray }}" "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT
+
test-all-unidist:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black, execution-filter]
+ if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true'
runs-on: ubuntu-latest
defaults:
run:
@@ -596,32 +337,11 @@ jobs:
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 2
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('requirements/env_unidist.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin_on_unidist
environment-file: requirements/env_unidist.yml
+ activate-environment: modin_on_unidist
python-version: ${{matrix.python-version}}
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
- name: Set up postgres
@@ -630,36 +350,14 @@ jobs:
run: |
sudo docker pull postgres
sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres
- - run: MODIN_BENCHMARK_MODE=True mpiexec -n 1 python -m pytest modin/pandas/test/internals/test_benchmark_mode.py
+ - run: mpiexec -n 1 python -m pytest modin/pandas/test/internals/test_benchmark_mode.py
- run: mpiexec -n 1 python -m pytest modin/pandas/test/internals/test_repartition.py
- run: mpiexec -n 1 python -m pytest modin/test/test_partition_api.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_binary.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_default.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_indexing.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_iter.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_map_metadata.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_reduce.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_udf.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_window.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_pickle.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_series.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_rolling.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_expanding.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_concat.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_groupby.py
- - run: MODIN_EXPERIMENTAL_GROUPBY=1 mpiexec -n 1 python -m pytest modin/pandas/test/test_groupby.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_reshape.py
- - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_general.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_creation.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_arithmetic.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_axis_functions.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_logic.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_linalg.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_indexing.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_math.py
- - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_shaping.py
+ - uses: ./.github/actions/run-core-tests
+ with:
+ runner: mpiexec -n 1 python -m pytest
+ parallel: ""
+ - run: mpiexec -n 1 python -m pytest modin/numpy/test
- run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
# need an extra argument "genv" to set environment variables for mpiexec. We need
@@ -672,18 +370,17 @@ jobs:
- run: |
python -m pip install lazy_import
mpiexec -n 1 python -m pytest modin/pandas/test/integrations/
- - uses: ./.github/workflows/upload-coverage
+ - uses: ./.github/actions/upload-coverage
test-all:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
- runs-on: ubuntu-latest
- defaults:
- run:
- shell: bash -l {0}
+ needs: [lint-flake8, lint-black, execution-filter]
strategy:
matrix:
+ os:
+ - ubuntu
+ - windows
python-version: ["3.8"]
- engine: ["python", "ray", "dask"]
+ engine: ${{ fromJSON( github.event_name == 'push' && '["python", "ray", "dask"]' || needs.execution-filter.outputs.engines ) }}
test_task:
- group_1
- group_2
@@ -696,21 +393,23 @@ jobs:
test_task: "group_3"
- engine: "python"
test_task: "group_4"
+ runs-on: ${{ matrix.os }}-latest
+ defaults:
+ run:
+ shell: bash -l {0}
env:
MODIN_ENGINE: ${{matrix.engine}}
# Only test reading from SQL server and postgres on ubuntu for now.
# Eventually, we should test on Windows, too, but we will have to set up
# the servers differently.
- MODIN_TEST_READ_FROM_SQL_SERVER: true
- MODIN_TEST_READ_FROM_POSTGRES: true
- name: test-ubuntu (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}})
+ MODIN_TEST_READ_FROM_SQL_SERVER: ${{ matrix.os == 'ubuntu' }}
+ MODIN_TEST_READ_FROM_POSTGRES: ${{ matrix.os == 'ubuntu' }}
+ name: test-${{ matrix.os }} (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}})
services:
- # This service only needs to run for test_task group_4; however, GitHub does not
- # currently support conditionally running services. This issue:
- # is open https://github.com/actions/runner/issues/822 - until GitHub implements this feature,
- # we will just have to run `moto` for all groups.
+ # Using workaround https://github.com/actions/runner/issues/822#issuecomment-1524826092
moto:
- image: motoserver/moto
+ # we only need moto service on Ubuntu and for group_4 task or python engine
+ image: ${{ (matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')) && 'motoserver/moto' || '' }}
ports:
- 5000:5000
env:
@@ -719,43 +418,36 @@ jobs:
steps:
- name: Limit ray memory
run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV
- if: matrix.engine == 'ray'
+ if: matrix.os == 'ubuntu' && matrix.engine == 'ray'
+ - name: Tell Modin to use existing ray cluster
+ run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV
+ if: matrix.os == 'windows' && matrix.engine == 'ray'
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 2
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
+ - name: Start local ray cluster
+ # Try a few times to start ray to work around
+ # https://github.com/modin-project/modin/issues/4562
+ uses: nick-fields/retry@v2
+ with:
+ timeout_minutes: 5
+ max_attempts: 5
+ command: ray start --head --port=6379 --object-store-memory=1000000000
+ if: matrix.os == 'windows' && matrix.engine == 'ray'
- name: Install HDF5
run: sudo apt update && sudo apt install -y libhdf5-dev
+ if: matrix.os == 'ubuntu'
- name: Set up postgres
# Locally, specifying port 2345:5432 works, but 2345:2345 and 5432:5432 do not. This solution is from
# https://stackoverflow.com/questions/36415654/cant-connect-docker-postgresql-9-3
run: |
sudo docker pull postgres
sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres
- - run: MODIN_BENCHMARK_MODE=True python -m pytest modin/pandas/test/internals/test_benchmark_mode.py
+ if: matrix.os == 'ubuntu'
+
+ - run: python -m pytest modin/pandas/test/internals/test_benchmark_mode.py
if: matrix.engine == 'python' || matrix.test_task == 'group_1'
- run: python -m pytest modin/pandas/test/internals/test_repartition.py
if: matrix.engine == 'python' || matrix.test_task == 'group_1'
@@ -764,76 +456,33 @@ jobs:
- run: python -m pytest -n 2 modin/experimental/xgboost/test/test_default.py
if: matrix.engine == 'python' || matrix.test_task == 'group_1'
- run: python -m pytest -n 2 modin/experimental/xgboost/test/test_xgboost.py
- if: matrix.engine == 'ray' && matrix.test_task == 'group_1'
+ if: matrix.os == 'ubuntu' && matrix.engine == 'ray' && matrix.test_task == 'group_1'
- run: python -m pytest -n 2 modin/experimental/xgboost/test/test_dmatrix.py
if: matrix.engine == 'ray' && matrix.test_task == 'group_1'
- run: python -m pytest -n 2 modin/experimental/batch/test/test_pipeline.py
if: matrix.engine == 'python' || matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_binary.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_default.py
+ - uses: ./.github/actions/run-core-tests/group_1
if: matrix.engine == 'python' || matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_indexing.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_iter.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_join_sort.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_reduce.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_udf.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_window.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py
+ - uses: ./.github/actions/run-core-tests/group_2
if: matrix.engine == 'python' || matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_3'
- - run: python -m pytest -n 2 modin/pandas/test/test_series.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_3'
- - run: MODIN_EXPERIMENTAL_GROUPBY=1 python -m pytest -n 2 modin/pandas/test/test_groupby.py
+ - uses: ./.github/actions/run-core-tests/group_3
if: matrix.engine == 'python' || matrix.test_task == 'group_3'
- - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_expanding.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_creation.py
+ - uses: ./.github/actions/run-core-tests/group_4
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_arithmetic.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_axis_functions.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_logic.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_linalg.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_indexing.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_math.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_shaping.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_general.py
+ - run: python -m pytest -n 2 modin/numpy/test
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
+ if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')
- run: ./.github/workflows/sql_server/set_up_sql_server.sh
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
+ if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')
# Do not add parallelism (`-n` argument) here - it will cause mock S3 service to fail.
- run: python -m pytest modin/pandas/test/test_io.py --verbose
+ timeout-minutes: 60
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- run: python -m pytest modin/experimental/pandas/test/test_io_exp.py
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- run: pip install "dfsql>=0.4.2" "pyparsing<=2.4.7" && python -m pytest modin/experimental/sql/test/test_sql.py
- if: matrix.engine == 'python' || matrix.test_task == 'group_4'
+ if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')
- run: python -m pytest modin/test/interchange/dataframe_protocol/test_general.py
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- run: python -m pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
@@ -842,10 +491,139 @@ jobs:
python -m pip install lazy_import
python -m pytest modin/pandas/test/integrations/
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- - uses: ./.github/workflows/upload-coverage
+
+ - uses: ./.github/actions/upload-coverage
+ - name: Stop local ray cluster
+ run: ray stop
+ if: matrix.os == 'windows' && matrix.engine == 'ray'
+ - name: Rename the dirs with conda packages so it won't be deleted, it's too slow on Windows.
+ run: |
+ mkdir -p "${CONDA_PKGS_DIR}_do_not_cache" && \
+ find "${CONDA_PKGS_DIR}" -mindepth 1 -maxdepth 1 -type d -exec mv {} "${CONDA_PKGS_DIR}_do_not_cache" \;
+ if: matrix.os == 'windows'
+
+ test-sanity:
+ needs: [lint-flake8, lint-black, execution-filter]
+ if: github.event_name == 'pull_request'
+ strategy:
+ matrix:
+ os:
+ - ubuntu
+ - windows
+ python-version: ["3.8"]
+ execution:
+ - name: ray
+ shell-ex: "python -m pytest"
+ if: needs.execution-filter.outputs.ray != 'true'
+ - name: dask
+ shell-ex: "python -m pytest"
+ if: needs.execution-filter.outputs.dask != 'true'
+ - name: unidist
+ shell-ex: "mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest"
+ if: needs.execution-filter.outputs.unidist != 'true'
+ runs-on: ${{ matrix.os }}-latest
+ defaults:
+ run:
+ shell: bash -l {0}
+ env:
+ MODIN_ENGINE: ${{ matrix.execution.name }}
+ UNIDIST_BACKEND: "mpi"
+ PARALLEL: ${{ matrix.execution.name != 'unidist' && matrix.os != 'windows' && '-n 2' || '' }}
+ name: test-${{ matrix.os }}-sanity (engine ${{ matrix.execution.name }}, python ${{matrix.python-version}})
+ services:
+ moto:
+ image: ${{ matrix.os != 'windows' && 'motoserver/moto' || '' }}
+ ports:
+ - 5000:5000
+ env:
+ AWS_ACCESS_KEY_ID: foobar_key
+ AWS_SECRET_ACCESS_KEY: foobar_secret
+ steps:
+ - uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
+ with:
+ environment-file: ${{ matrix.execution.name == 'unidist' && 'requirements/env_unidist.yml' || 'environment-dev.yml' }}
+ activate-environment: ${{ matrix.execution.name == 'unidist' && 'modin_on_unidist' || 'modin' }}
+ python-version: ${{matrix.python-version}}
+ - name: Install HDF5
+ run: sudo apt update && sudo apt install -y libhdf5-dev
+ if: matrix.os != 'windows'
+ - name: Limit ray memory
+ run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV
+ if: matrix.os != 'windows' && matrix.execution.name == 'ray'
+ - name: Tell Modin to use existing ray cluster
+ run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV
+ if: matrix.os == 'windows' && matrix.execution.name == 'ray'
+ - name: Start local ray cluster
+ # Try a few times to start ray to work around
+ # https://github.com/modin-project/modin/issues/4562
+ uses: nick-fields/retry@v2
+ with:
+ timeout_minutes: 5
+ max_attempts: 5
+ command: ray start --head --port=6379 --object-store-memory=1000000000
+ if: matrix.os == 'windows' && matrix.execution.name == 'ray'
+ - run: MODIN_BENCHMARK_MODE=True ${{ matrix.execution.shell-ex }} modin/pandas/test/internals/test_benchmark_mode.py
+ - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/pandas/test/internals/test_repartition.py
+ - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/test_partition_api.py
+ - name: xgboost tests
+ run: |
+ ${{ matrix.execution.shell-ex }} $PARALLEL \
+ modin/experimental/xgboost/test/test_default.py \
+ modin/experimental/xgboost/test/test_xgboost.py \
+ modin/experimental/xgboost/test/test_dmatrix.py
+ if: matrix.os != 'windows' && matrix.execution.name == 'ray' && needs.execution-filter.outputs.experimental == 'true'
+ - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/experimental/batch/test/test_pipeline.py
+ if: matrix.os != 'windows' && matrix.execution.name != 'unidist' && needs.execution-filter.outputs.experimental == 'true'
+ - name: "test DF: binary, default, iter"
+ run: |
+ ${{ matrix.execution.shell-ex }} $PARALLEL \
+ modin/pandas/test/dataframe/test_binary.py \
+ modin/pandas/test/dataframe/test_default.py \
+ modin/pandas/test/dataframe/test_iter.py
+ if: matrix.os != 'windows'
+ - name: "test DF: reduce, udf, window, pickle"
+ run: |
+ ${{ matrix.execution.shell-ex }} $PARALLEL \
+ modin/pandas/test/dataframe/test_reduce.py \
+ modin/pandas/test/dataframe/test_udf.py \
+ modin/pandas/test/dataframe/test_window.py \
+ modin/pandas/test/dataframe/test_pickle.py
+ if: matrix.os != 'windows'
+ - run: ${{ matrix.execution.shell-ex }} modin/pandas/test/test_series.py
+ if: matrix.execution.name == 'ray'
+ - run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_series.py
+ if: matrix.execution.name != 'ray'
+ - run: ${{ matrix.execution.shell-ex }} modin/pandas/test/dataframe/test_map_metadata.py
+ if: matrix.execution.name == 'ray'
+ - run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/dataframe/test_map_metadata.py
+ if: matrix.execution.name != 'ray'
+ - name: "test rolling, expanding, reshape, general, concat"
+ run: |
+ ${{ matrix.execution.shell-ex }} $PARALLEL \
+ modin/pandas/test/test_rolling.py \
+ modin/pandas/test/test_expanding.py \
+ modin/pandas/test/test_reshape.py \
+ modin/pandas/test/test_general.py \
+ modin/pandas/test/test_concat.py
+ if: matrix.os != 'windows'
+ - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/numpy/test
+ - run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose
+ - run: ${{ matrix.execution.shell-ex }} modin/experimental/pandas/test/test_io_exp.py
+ - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/test_general.py
+ - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
+ - name: Stop local ray cluster
+ run: ray stop
+ if: matrix.os == 'windows' && matrix.execution.name == 'ray'
+ - name: Rename the dirs with conda packages so it won't be deleted, it's too slow on Windows.
+ run: |
+ mkdir -p "${CONDA_PKGS_DIR}_do_not_cache" && \
+ find "${CONDA_PKGS_DIR}" -mindepth 1 -maxdepth 1 -type d -exec mv {} "${CONDA_PKGS_DIR}_do_not_cache" \;
+ if: matrix.os == 'windows'
+ - uses: ./.github/actions/upload-coverage
test-experimental:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -864,40 +642,17 @@ jobs:
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 2
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py
- run: python -m pytest -n 2 modin/pandas/test/test_series.py
# Do not add parallelism (`-n` argument) here - it will cause mock S3 service to fail.
- run: python -m pytest modin/pandas/test/test_io.py --verbose
- - uses: ./.github/workflows/upload-coverage
+ - uses: ./.github/actions/upload-coverage
test-cloud:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -916,157 +671,19 @@ jobs:
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 2
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
# TODO(https://github.com/modin-project/modin/issues/4004): Re-add
# "python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py --verbose"
# once that test stops crashing.
- run: python -m pytest --simulate-cloud=normal modin/pandas/test/dataframe/test_default.py::test_kurt_kurtosis --verbose
- # When running without parameters, some of the tests fail
run: python -m pytest --simulate-cloud=normal modin/pandas/test/dataframe/test_binary.py::test_math_functions[add-rows-scalar]
- - uses: ./.github/workflows/upload-coverage
-
- test-windows:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
- runs-on: windows-latest
- defaults:
- run:
- shell: bash -l {0}
- strategy:
- matrix:
- python-version: ["3.8"]
- engine: ["ray", "dask"]
- test_task:
- - group_1
- - group_2
- - group_3
- - group_4
- env:
- MODIN_ENGINE: ${{matrix.engine}}
- name: test-windows (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}})
- steps:
- - uses: actions/checkout@v3
- with:
- fetch-depth: 2
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
- environment-file: environment-dev.yml
- python-version: ${{matrix.python-version}}
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- - name: Start local ray cluster
- # Try a few times to start ray to work around
- # https://github.com/modin-project/modin/issues/4562
- uses: nick-fields/retry@v2
- with:
- timeout_minutes: 5
- max_attempts: 5
- command: |
- ray start --head --port=6379 --object-store-memory=1000000000
- if: matrix.engine == 'ray'
- - name: Tell Modin to use existing ray cluster
- run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV
- if: matrix.engine == 'ray'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_binary.py
- if: matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_default.py
- if: matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_indexing.py
- if: matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_iter.py
- if: matrix.test_task == 'group_1'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_join_sort.py
- if: matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_reduce.py
- if: matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_udf.py
- if: matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_window.py
- if: matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py
- if: matrix.test_task == 'group_2'
- - run: python -m pytest -n 2 modin/pandas/test/test_series.py
- if: matrix.test_task == 'group_3'
- - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py
- if: matrix.test_task == 'group_3'
- - run: MODIN_EXPERIMENTAL_GROUPBY=1 python -m pytest -n 2 modin/pandas/test/test_groupby.py
- if: matrix.test_task == 'group_3'
- - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_expanding.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_creation.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_arithmetic.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_axis_functions.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_logic.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_linalg.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_indexing.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_math.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/numpy/test/test_array_shaping.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py
- if: matrix.test_task == 'group_4'
- - run: python -m pytest -n 2 modin/pandas/test/test_general.py
- if: matrix.test_task == 'group_4'
- - timeout-minutes: 60
- run: python -m pytest modin/pandas/test/test_io.py --verbose
- if: matrix.test_task == 'group_4'
- - uses: ./.github/workflows/upload-coverage
- - name: Stop local ray cluster
- run: ray stop
- if: matrix.engine == 'ray'
- - name: Rename the folder with conda packages so it won't be deleted, it's too slow on Windows.
- run: mv "${CONDA_PKGS_DIR}" "${CONDA_PKGS_DIR}_do_not_cache"
+ - uses: ./.github/actions/upload-coverage
test-pyarrow:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -1088,37 +705,15 @@ jobs:
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- run: sudo apt update && sudo apt install -y libhdf5-dev
- run: python -m pytest modin/pandas/test/test_io.py::TestCsv --verbose
test-spreadsheet:
- needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers]
+ needs: [lint-flake8, lint-black]
runs-on: ubuntu-latest
defaults:
run:
@@ -1133,49 +728,26 @@ jobs:
name: test-spreadsheet (engine ${{matrix.engine}}, python ${{matrix.python-version}})
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
python-version: ${{matrix.python-version}}
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- run: python -m pytest modin/experimental/spreadsheet/test/test_general.py
upload-coverage:
- needs: [test-internals, test-no-engine, test-defaults, test-hdk, test-all-unidist, test-all, test-experimental, test-cloud, test-windows]
+ needs: [test-internals, test-api-and-no-engine, test-defaults, test-hdk, test-all-unidist, test-all, test-experimental, test-cloud, test-sanity]
+ if: always() # we need to run it regardless of some job being skipped, like in PR
runs-on: ubuntu-latest
defaults:
run:
shell: bash -l {0}
steps:
- - uses: actions/checkout@v2
- with:
- fetch-depth: 1
+ - uses: actions/checkout@v3
+ - uses: ./.github/actions/python-only
- name: Download coverage data
uses: actions/download-artifact@v3.0.2
with:
name: coverage-data
- - uses: actions/setup-python@v4
- run: pip install coverage
- name: Combine coverage
run: python -m coverage combine
@@ -1183,4 +755,4 @@ jobs:
run: python -m coverage xml
- uses: codecov/codecov-action@v3
with:
- fail_ci_if_error: true
+ fail_ci_if_error: ${{ github.event_name == 'push' }} # do not care about uploads in PR
diff --git a/.github/workflows/fuzzydata-test.yml b/.github/workflows/fuzzydata-test.yml
index f16a6f18e7b..b0407c16137 100644
--- a/.github/workflows/fuzzydata-test.yml
+++ b/.github/workflows/fuzzydata-test.yml
@@ -34,32 +34,10 @@ jobs:
engine: ["ray", "dask"]
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
+ python-version: ${{matrix.python-version}}
- name: test-fuzzydata (engine ${{matrix.engine}}, python ${{matrix.python-version}})
run: python -m pytest modin/experimental/fuzzydata/test/test_fuzzydata.py -Wignore::UserWarning --log-file=/tmp/fuzzydata-test-wf-${{matrix.engine}}/run.log --log-file-level=INFO
env:
diff --git a/.github/workflows/push-to-master.yml b/.github/workflows/push-to-master.yml
index fc3b8bdc451..d5152bd98d3 100644
--- a/.github/workflows/push-to-master.yml
+++ b/.github/workflows/push-to-master.yml
@@ -27,28 +27,9 @@ jobs:
AWS_SECRET_ACCESS_KEY: foobar_secret
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- name: install Ray nightly build
# Use --force-reinstall to always reinstall ray and its dependencies.
# botocore isn't compatible with urllib3>=2; see #6094 for details
@@ -97,32 +78,9 @@ jobs:
name: test docs
steps:
- uses: actions/checkout@v3
+ - uses: ./.github/actions/mamba-env
with:
- fetch-depth: 1
- - name: Cache conda
- uses: actions/cache@v3
- with:
- path: |
- ~/conda_pkgs_dir
- ~/.cache/pip
- key:
- ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }}
- - uses: conda-incubator/setup-miniconda@v2
- with:
- miniforge-variant: Mambaforge
- miniforge-version: latest
- use-mamba: true
- activate-environment: modin
environment-file: environment-dev.yml
- python-version: 3.8
- channel-priority: strict
- # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed
- # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264
- use-only-tar-bz2: false
- - name: Conda environment
- run: |
- conda info
- conda list
- run: sudo apt update && sudo apt install -y libhdf5-dev
- name: Docstring URL validity check
run: python -m pytest modin/test/test_docstring_urls.py
diff --git a/README.md b/README.md
index 5a118cbd602..4029350db18 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,7 @@
+
diff --git a/asv_bench/asv.conf.dask.json b/asv_bench/asv.conf.dask.json
index cc12302b149..30e44e14821 100644
--- a/asv_bench/asv.conf.dask.json
+++ b/asv_bench/asv.conf.dask.json
@@ -48,7 +48,7 @@
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
- //"install_timeout": 600,
+ "install_timeout": 6000,
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",
diff --git a/asv_bench/asv.conf.hdk.json b/asv_bench/asv.conf.hdk.json
index 7d8b947ced1..317becb9fc8 100644
--- a/asv_bench/asv.conf.hdk.json
+++ b/asv_bench/asv.conf.hdk.json
@@ -25,6 +25,10 @@
// variable.
"environment_type": "conda",
+ // timeout in seconds for installing any dependencies in environment
+ // defaults to 10 min
+ "install_timeout": 6000,
+
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",
diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json
index fc7a3d99525..234004dbbb2 100644
--- a/asv_bench/asv.conf.json
+++ b/asv_bench/asv.conf.json
@@ -48,7 +48,7 @@
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
- //"install_timeout": 600,
+ "install_timeout": 6000,
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",
diff --git a/asv_bench/asv.conf.unidist.json b/asv_bench/asv.conf.unidist.json
index df011617ea3..b8e04c5bb1e 100644
--- a/asv_bench/asv.conf.unidist.json
+++ b/asv_bench/asv.conf.unidist.json
@@ -48,7 +48,7 @@
// timeout in seconds for installing any dependencies in environment
// defaults to 10 min
- //"install_timeout": 600,
+ "install_timeout": 6000,
// the base URL to show a commit for the project.
"show_commit_url": "https://github.com/modin-project/modin/commit/",
diff --git a/asv_bench/benchmarks/scalability/scalability_benchmarks.py b/asv_bench/benchmarks/scalability/scalability_benchmarks.py
index 05a465b2ab8..f9850ff1999 100644
--- a/asv_bench/benchmarks/scalability/scalability_benchmarks.py
+++ b/asv_bench/benchmarks/scalability/scalability_benchmarks.py
@@ -17,7 +17,7 @@
from modin.pandas.utils import from_pandas
try:
- from modin.utils import to_pandas
+ from modin.utils import to_pandas, to_numpy
except ImportError:
# This provides compatibility with older versions of the Modin, allowing us to test old commits.
from modin.pandas.utils import to_pandas
@@ -70,4 +70,22 @@ def time_to_pandas(self, shape, cpus):
to_pandas(self.data)
+class TimeToNumPy:
+ param_names = ["shape", "cpus"]
+ params = [
+ get_benchmark_shapes("TimeToNumPy"),
+ [4, 16, 32],
+ ]
+
+ def setup(self, shape, cpus):
+ from modin.config import NPartitions
+
+ NPartitions.get = lambda: cpus
+ self.data = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH, impl="modin")
+
+ def time_to_numpy(self, shape, cpus):
+ # to_numpy is already synchronous
+ to_numpy(self.data)
+
+
from ..utils import setup # noqa: E402, F401
diff --git a/asv_bench/benchmarks/utils/common.py b/asv_bench/benchmarks/utils/common.py
index e67ca677de9..9a0a1dab276 100644
--- a/asv_bench/benchmarks/utils/common.py
+++ b/asv_bench/benchmarks/utils/common.py
@@ -459,17 +459,8 @@ def trigger_import(*dfs):
if ASV_USE_STORAGE_FORMAT != "hdk" or ASV_USE_IMPL == "pandas":
return
- from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
- DbWorker,
- )
-
for df in dfs:
- df.shape # to trigger real execution
- df._query_compiler._modin_frame._partitions[0][
- 0
- ].frame_id = DbWorker().import_arrow_table(
- df._query_compiler._modin_frame._partitions[0][0].get()
- ) # to trigger real execution
+ df._query_compiler._modin_frame.force_import()
def execute(
diff --git a/asv_bench/benchmarks/utils/data_shapes.py b/asv_bench/benchmarks/utils/data_shapes.py
index 98d58aa4291..af3ce71014f 100644
--- a/asv_bench/benchmarks/utils/data_shapes.py
+++ b/asv_bench/benchmarks/utils/data_shapes.py
@@ -116,6 +116,7 @@
# Scalability benchmarks
"TimeFromPandas",
"TimeToPandas",
+ "TimeToNumPy",
],
),
(
diff --git a/codecov.yml b/codecov.yml
index 69cb76019a4..95adf7b6b26 100644
--- a/codecov.yml
+++ b/codecov.yml
@@ -1 +1,11 @@
comment: false
+coverage:
+ status:
+ project:
+ default:
+ branches:
+ - master
+ target: 85%
+ patch:
+ default:
+ target: 30%
diff --git a/docs/conf.py b/docs/conf.py
index 1e6ac43a891..9cf86535675 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -13,6 +13,7 @@
import ray
+
# stub ray.remote to be a no-op so it doesn't shadow docstrings
def noop_decorator(*args, **kwargs):
if len(args) == 1 and len(kwargs) == 0 and callable(args[0]):
@@ -24,7 +25,7 @@ def noop_decorator(*args, **kwargs):
ray.remote = noop_decorator
# fake modules if they're missing
-for mod_name in ("cudf", "cupy", "pyarrow.gandiva", "pyhdk"):
+for mod_name in ("cudf", "cupy", "pyarrow.gandiva", "pyhdk", "pyhdk.hdk"):
try:
__import__(mod_name)
except ImportError:
@@ -37,6 +38,17 @@ def noop_decorator(*args, **kwargs):
sys.modules["cupy"].ndarray = type("ndarray", (object,), {})
if not hasattr(sys.modules["pyhdk"], "PyDbEngine"):
sys.modules["pyhdk"].PyDbEngine = type("PyDbEngine", (object,), {})
+if not hasattr(sys.modules["pyhdk.hdk"], "HDK"):
+ sys.modules["pyhdk.hdk"].HDK = type("HDK", (object,), {})
+if not hasattr(sys.modules["pyhdk.hdk"], "QueryNode"):
+ sys.modules["pyhdk.hdk"].QueryNode = type("QueryNode", (object,), {})
+if not hasattr(sys.modules["pyhdk.hdk"], "ExecutionResult"):
+ sys.modules["pyhdk.hdk"].ExecutionResult = type("ExecutionResult", (object,), {})
+if not hasattr(sys.modules["pyhdk.hdk"], "RelAlgExecutor"):
+ sys.modules["pyhdk.hdk"].RelAlgExecutor = type("RelAlgExecutor", (object,), {})
+if not hasattr(sys.modules["pyhdk"], "__version__"):
+ # Show all known pyhdk config options in documentation
+ sys.modules["pyhdk"].__version__ = "999"
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
import modin
diff --git a/docs/development/using_hdk.rst b/docs/development/using_hdk.rst
index 63455178462..86a6b39ffc5 100644
--- a/docs/development/using_hdk.rst
+++ b/docs/development/using_hdk.rst
@@ -34,4 +34,22 @@ If for some reasons ``Native`` engine is explicitly set using ``modin.config`` o
If you encounter ``LLVM ERROR: inconsistency in registered CommandLine options`` error when using HDK,
please refer to the respective section in :doc:`Troubleshooting ` page to avoid the issue.
-.. _HDK: https://github.com/intel-ai/hdk
\ No newline at end of file
+
+Running on a GPU
+----------------
+
+Prerequisites:
+
+* HDK's GPU mode is currently supported on Linux and Intel GPU only.
+* HDK supports Gen9 architecture and higher (including Xe & Arc).
+* HDK's GPU mode requires proper driver installation. Follow this guide_ to set up your system. Make sure to install the compute runtime packages: ``intel-opencl-icd``, ``intel-level-zero-gpu``, ``level-zero``.
+* Make sure your GPU is visible and accessible.
+
+.. note::
+ You can use ``hwinfo`` and ``clinfo`` utilities to verify the driver installation and device accessibility.
+
+HDK supports a heterogeneous execution mode (experimental) that is disabled by default in Modin. Starting with pyHDK version 0.7 Modin can run the workload on Intel GPU.
+Run on a GPU via ``MODIN_HDK_LAUNCH_PARAMETERS="cpu_only=0" python ``.
+
+.. _HDK: https://github.com/intel-ai/hdk
+.. _guide: https://dgpu-docs.intel.com/driver/installation.html
\ No newline at end of file
diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst
index 8a4e98e2bf9..e3911d36bd0 100644
--- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst
+++ b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst
@@ -46,12 +46,16 @@ engine itself and we don't need to manage multiple partitions.
:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe`
always has a single partition.
-A partition holds data in either ``pandas.DataFrame`` or ``pyarrow.Table``
+A partition holds data in either ``pandas.DataFrame``, ``pyarrow.Table`` or ``DbTable``
format. ``pandas.DataFrame`` is preferred only when we detect unsupported
data type and therefore have to use ``pandas`` framework for processing.
-In other cases ``pyarrow.Table`` format is preferred. Arrow tables can be
-zero-copy imported into HDK. A query execution result is also
-returned as an Arrow table.
+The ``pyarrow.Table`` format is used when a ``DataFrame`` is created and until the
+table is imported into HDK. When it's imported, the partition data is replaced with
+a ``DbTable``. ``DbTable`` represents a table in the HDK database and provides basic
+information about the table: table name, column names, shape. It also allows
+exporting the data into the ``pyarrow.Table`` format. Depending on the data types,
+a ``pyarrow.Table`` import/export could be performed zero-copy. A query execution
+result is also returned as a ``DbTable``.
Data Ingress
------------
@@ -173,15 +177,15 @@ Arrow execution
For simple operations which don't include actual computations, execution can use
Arrow API. We can use it to rename columns, drop columns and concatenate
-frames. Arrow execution is preferable since it doesn't require actual data import/export
-from/to HDK.
+frames. Arrow execution is performed if we have an arrow table in the partition
+and it's preferable since it doesn't require actual data import into HDK.
HDK execution
'''''''''''''
To execute a query in the HDK engine we need to import data first. We should
find all leaves of an operation tree and import their Arrow tables. Partitions
-with imported tables hold corresponding table names used to refer to them in
+with ``DbTable`` hold corresponding table names used to refer to them in
queries.
HDK executes queries expressed in HDK-specific intermediate representation (IR) format.
@@ -215,9 +219,9 @@ The building of Calcite query (starting from the conversion to the Calcite Algeb
the forming JSON query) is orchestrated by
:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.partitioning.partition_manager.HdkOnNativeDataframePartitionManager`.
-An execution result is a new Arrow table which is used to form a new
-partition. This partition is assigned to the executed frame. The frame's
-operation tree is replaced with
+An execution result is a new table in the HDK database, that is represented by ``DbTable``,
+which is used to form a new partition. This partition is assigned to the executed frame.
+The frame's operation tree is replaced with
:py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.FrameNode` operation.
Rowid column and sub-queries
@@ -261,11 +265,11 @@ class.
Column name mangling
""""""""""""""""""""
-In ``pandas.DataFrame`` columns might have names not allowed in SQL (e. g.
-an empty string). To handle this we simply add '`F_`' prefix to
-column names. Index labels are more tricky because they might be non-unique.
-Indexes are represented as regular columns, and we have to perform a special
-mangling to get valid and unique column names. Demangling is done when we
+In ``pandas.DataFrame`` columns might have names of non-string types or not allowed
+in SQL (e. g. an empty string). To handle this we use an internal encoder, that
+makes the names SQL-compatible. Index labels are more tricky because they might be
+non-unique. Indexes are represented as regular columns, and we have to perform a
+special mangling to get valid and unique column names. Demangling is done when we
transform our frame (i.e. its Arrow table) into ``pandas.DataFrame`` format.
.. toctree::
diff --git a/docs/img/hdk/hdk_calcite_serialization_flow.svg b/docs/img/hdk/hdk_calcite_serialization_flow.svg
index 01719049c4f..6914ef81755 100644
--- a/docs/img/hdk/hdk_calcite_serialization_flow.svg
+++ b/docs/img/hdk/hdk_calcite_serialization_flow.svg
@@ -1,4 +1,4 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/docs/img/hdk/hdk_import.svg b/docs/img/hdk/hdk_import.svg
index 34791905cc5..6350102ca58 100644
--- a/docs/img/hdk/hdk_import.svg
+++ b/docs/img/hdk/hdk_import.svg
@@ -1,4 +1,4 @@
-
\ No newline at end of file
+
\ No newline at end of file
diff --git a/docs/requirements-doc.txt b/docs/requirements-doc.txt
index 74626d8d610..620f8a7a560 100644
--- a/docs/requirements-doc.txt
+++ b/docs/requirements-doc.txt
@@ -12,7 +12,10 @@ pyyaml
recommonmark
sphinx<6.0.0
sphinx-click
-ray[default]>=1.13.0
+# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
+ray[default]>=1.13.0,!=2.5.0
+# https://github.com/modin-project/modin/issues/6336
+pydantic<2
# Override to latest version of modin-spreadsheet
git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
sphinxcontrib_plantuml
diff --git a/environment-dev.yml b/environment-dev.yml
index a4b70551534..f1cc505ab4d 100644
--- a/environment-dev.yml
+++ b/environment-dev.yml
@@ -2,21 +2,30 @@ name: modin
channels:
- conda-forge
dependencies:
+ - pip
+
+ # required dependencies
- pandas>=2,<2.1
- numpy>=1.18.5
- - ray-default>=1.13.0
+ - fsspec
+ - packaging
+ - psutil
+
+ # optional dependencies
+ # ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
+ - ray-default>=1.13.0,!=2.5.0
+ # https://github.com/modin-project/modin/issues/6336
+ - pydantic<2
- pyarrow
# workaround for https://github.com/conda/conda/issues/11744
- grpcio!=1.45.*
- grpcio!=1.46.*
- dask>=2.22.0
- distributed>=2.22.0
- - fsspec
# TODO: uncomment after Modin switch to python>=3.9
# - xarray
- Jinja2
- scipy
- - pip
- s3fs>=2021.8
- feather-format
- lxml
@@ -27,43 +36,47 @@ dependencies:
- pandas-gbq
- pytables
- msgpack-python
- - psutil
- - pytest>=6.0.1
- - pytest-benchmark
- - pytest-cov>=2.10.1
- - pytest-xdist>=2.1.0
- - packaging
- - coverage
- - pygithub
- - rpyc==4.1.5
- - cloudpickle
- - boto3
- - moto
- scikit-learn
- pymssql
- psycopg2
- # Mypy 0.990 doesn't work: https://github.com/modin-project/modin/issues/5206
- - mypy!=0.990
- - pandas-stubs
- fastparquet<2023.1.0
- # for release script
- - pygit2
# TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
# when we use collective instead of rabit.
- xgboost>=1.7.1,<2.0.0
- tqdm
+
+ ## modin in the cloud dependencies
+ - boto3
+ - cloudpickle
+ - rpyc==4.1.5
+
+ # dependencies for making release
+ - pygithub>=v1.58.0
+ - pygit2>=1.9.2
+
+ # test dependencies
+ - coverage>=7.1.0
+ - moto>=4.1.0
+ - pytest>=7.2.1
+ - pytest-benchmark>=4.0.0
+ - pytest-cov>=4.0.0
+ - pytest-xdist>=3.2.0
+
# code linters
- - black
- - flake8
- - flake8-no-implicit-concat
- - flake8-print
+ - black>=23.1.0
+ - flake8>=6.0.0
+ - flake8-no-implicit-concat>=0.3.4
+ - flake8-print>=5.0.0
+ - mypy>=1.0.0
+ - pandas-stubs>=2.0.0
+
- pip:
+ # no conda package for windows so we install it with pip
+ - connectorx>=0.2.6a4
+ # experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
+ - fuzzydata>=0.0.6
# Fixes breaking ipywidgets changes, but didn't release yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
- git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9
- # no conda package for windows so we install it with pip
- - connectorx>=0.2.6a4
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
- numpydoc==1.1.0
- # experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
- - fuzzydata>=0.0.6
diff --git a/examples/docker/modin-hdk/census-hdk.py b/examples/docker/modin-hdk/census-hdk.py
index 7d5337c93ce..4d325320a58 100644
--- a/examples/docker/modin-hdk/census-hdk.py
+++ b/examples/docker/modin-hdk/census-hdk.py
@@ -14,16 +14,8 @@
import sys
from utils import measure
import modin.pandas as pd
-from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
- DbWorker,
-)
-from sklearn import config_context
-import sklearnex
-sklearnex.patch_sklearn()
-from sklearn.model_selection import train_test_split
-import sklearn.linear_model as lm
import numpy as np
@@ -131,12 +123,8 @@ def read(filename):
skiprows=1,
)
- df.shape # to trigger real execution
- df._query_compiler._modin_frame._partitions[0][
- 0
- ].frame_id = DbWorker().import_arrow_table(
- df._query_compiler._modin_frame._partitions[0][0].get()
- ) # to trigger real execution
+ # to trigger real execution and table import
+ df._query_compiler._modin_frame.force_import()
return df
@@ -203,6 +191,14 @@ def cod(y_test, y_pred):
def ml(X, y, random_state, n_runs, test_size):
+ # to not install ML dependencies unless required
+ from sklearn import config_context
+ import sklearnex
+
+ sklearnex.patch_sklearn()
+ from sklearn.model_selection import train_test_split
+ import sklearn.linear_model as lm
+
clf = lm.Ridge()
X = np.ascontiguousarray(X, dtype=np.float64)
diff --git a/examples/docker/modin-hdk/nyc-taxi-hdk.py b/examples/docker/modin-hdk/nyc-taxi-hdk.py
index ba8ba9de55f..b13d3ab9b23 100644
--- a/examples/docker/modin-hdk/nyc-taxi-hdk.py
+++ b/examples/docker/modin-hdk/nyc-taxi-hdk.py
@@ -15,9 +15,6 @@
from utils import measure
import modin.pandas as pd
from modin.pandas.test.utils import df_equals
-from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
- DbWorker,
-)
from modin.experimental.sql import query
@@ -145,12 +142,8 @@ def read(filename):
parse_dates=dates_only,
)
- df.shape # to trigger real execution
- df._query_compiler._modin_frame._partitions[0][
- 0
- ].frame_id = DbWorker().import_arrow_table(
- df._query_compiler._modin_frame._partitions[0][0].get()
- ) # to trigger real execution
+ # to trigger real execution and table import
+ df._query_compiler._modin_frame.force_import()
return df
diff --git a/examples/docker/modin-hdk/plasticc-hdk.py b/examples/docker/modin-hdk/plasticc-hdk.py
index 7f7a98f4ced..9a4632ec1ea 100644
--- a/examples/docker/modin-hdk/plasticc-hdk.py
+++ b/examples/docker/modin-hdk/plasticc-hdk.py
@@ -18,13 +18,6 @@
import modin.pandas as pd
import numpy as np
-import xgboost as xgb
-
-import sklearnex
-
-sklearnex.patch_sklearn()
-from sklearn.model_selection import train_test_split
-from sklearn.preprocessing import LabelEncoder
################ helper functions ###############################
@@ -81,6 +74,9 @@ def all_etl(train, train_meta, test, test_meta):
def split_step(train_final, test_final):
+ from sklearn.model_selection import train_test_split
+ from sklearn.preprocessing import LabelEncoder
+
X = train_final.drop(["object_id", "target"], axis=1).values
Xt = test_final.drop(["object_id"], axis=1).values
@@ -197,6 +193,13 @@ def etl(df, df_meta):
def ml(train_final, test_final):
+ # to not install ML dependencies unless required
+ import xgboost as xgb
+ import sklearnex
+
+ sklearnex.patch_sklearn()
+
+
X_train, y_train, X_test, y_test, Xt, classes, class_weights = split_step(
train_final, test_final
)
diff --git a/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt b/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt
index 8cc9c67fb3b..d19ba10cd43 100644
--- a/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt
+++ b/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt
@@ -3,4 +3,6 @@ jupyterlab
ipywidgets
tqdm
modin[ray]
+# https://github.com/modin-project/modin/issues/6336
+pydantic<2
modin[spreadsheet]
diff --git a/modin/config/envvars.py b/modin/config/envvars.py
index 87a3f97fdaa..eb54441ae4c 100644
--- a/modin/config/envvars.py
+++ b/modin/config/envvars.py
@@ -517,14 +517,6 @@ class HdkLaunchParameters(EnvironmentVariable, type=dict):
"""
varname = "MODIN_HDK_LAUNCH_PARAMETERS"
- default = {
- "enable_union": 1,
- "enable_columnar_output": 1,
- "enable_lazy_fetch": 0,
- "null_div_by_zero": 1,
- "enable_watchdog": 0,
- "enable_thrift_logs": 0,
- }
@classmethod
def get(cls) -> dict:
@@ -557,12 +549,44 @@ def _get(cls) -> dict:
Decoded and verified config value.
"""
custom_parameters = super().get()
- result = cls.default.copy()
+ result = cls._get_default().copy()
result.update(
{key.replace("-", "_"): value for key, value in custom_parameters.items()}
)
return result
+ @classmethod
+ def _get_default(cls) -> Any:
+ """
+ Get default value of the config. Checks the pyhdk version and omits variables unsupported in prior versions.
+
+ Returns
+ -------
+ dict
+ Config keys and corresponding values.
+ """
+ if (default := getattr(cls, "default", None)) is None:
+ cls.default = default = {
+ "enable_union": 1,
+ "enable_columnar_output": 1,
+ "enable_lazy_fetch": 0,
+ "null_div_by_zero": 1,
+ "enable_watchdog": 0,
+ "enable_thrift_logs": 0,
+ "cpu_only": 1,
+ }
+
+ try:
+ import pyhdk
+
+ if version.parse(pyhdk.__version__) >= version.parse("0.6.1"):
+ default["enable_lazy_dict_materialization"] = 0
+ default["log_dir"] = "pyhdk_log"
+ except ImportError:
+ # if pyhdk is not available, do not show any additional options
+ pass
+ return default
+
class OmnisciLaunchParameters(HdkLaunchParameters, type=dict):
"""
diff --git a/modin/config/test/test_envvars.py b/modin/config/test/test_envvars.py
index 01ed1c9304f..0abfa6dc5ac 100644
--- a/modin/config/test/test_envvars.py
+++ b/modin/config/test/test_envvars.py
@@ -16,6 +16,8 @@
import modin.config as cfg
from modin.config.envvars import EnvironmentVariable, _check_vars, ExactStr
+from packaging import version
+
@pytest.fixture
def make_unknown_env():
@@ -63,9 +65,22 @@ def test_custom_help(make_custom_envvar):
def test_hdk_envvar():
+ try:
+ import pyhdk
+
+ defaults = cfg.HdkLaunchParameters.get()
+ assert defaults["enable_union"] == 1
+ if version.parse(pyhdk.__version__) >= version.parse("0.6.1"):
+ assert defaults["log_dir"] == "pyhdk_log"
+ del cfg.HdkLaunchParameters._value
+ except ImportError:
+ # This test is intended to check pyhdk internals. If pyhdk is not available, skip the version check test.
+ pass
+
os.environ[
cfg.OmnisciLaunchParameters.varname
] = "enable_union=2,enable_thrift_logs=3"
+ del cfg.OmnisciLaunchParameters._value
params = cfg.OmnisciLaunchParameters.get()
assert params["enable_union"] == 2
assert params["enable_thrift_logs"] == 3
@@ -74,11 +89,27 @@ def test_hdk_envvar():
assert params["enable_union"] == 2
assert params["enable_thrift_logs"] == 3
- os.environ[cfg.HdkLaunchParameters.varname] = "enable_union=4,enable_thrift_logs=5"
+ os.environ[cfg.HdkLaunchParameters.varname] = "unsupported=X"
+ params = cfg.HdkLaunchParameters.get()
+ assert params["unsupported"] == "X"
+ try:
+ import pyhdk
+
+ pyhdk.buildConfig(**cfg.HdkLaunchParameters.get())
+ except RuntimeError as e:
+ assert str(e) == "unrecognised option '--unsupported'"
+ except ImportError:
+ # This test is intended to check pyhdk internals. If pyhdk is not available, skip the version check test.
+ pass
+
+ os.environ[
+ cfg.HdkLaunchParameters.varname
+ ] = "enable_union=4,enable_thrift_logs=5,enable_lazy_dict_materialization=6"
del cfg.HdkLaunchParameters._value
params = cfg.HdkLaunchParameters.get()
assert params["enable_union"] == 4
assert params["enable_thrift_logs"] == 5
+ assert params["enable_lazy_dict_materialization"] == 6
params = cfg.OmnisciLaunchParameters.get()
assert params["enable_union"] == 2
diff --git a/modin/conftest.py b/modin/conftest.py
index 0647e8360fd..789d5ca331e 100644
--- a/modin/conftest.py
+++ b/modin/conftest.py
@@ -62,6 +62,7 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url):
CIAWSAccessKeyID,
CIAWSSecretAccessKey,
AsyncReadMode,
+ BenchmarkMode,
)
import uuid # noqa: E402
@@ -558,6 +559,14 @@ def set_num_partitions(request):
NPartitions.put(old_num_partitions)
+@pytest.fixture()
+def set_benchmark_mode(request):
+ old_benchmark_mode = BenchmarkMode.get()
+ BenchmarkMode.put(request.param)
+ yield
+ BenchmarkMode.put(old_benchmark_mode)
+
+
@pytest.fixture
def set_async_read_mode(request):
old_async_read_mode = AsyncReadMode.get()
diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py
index b3d011dfff4..cdffd3c9bb0 100644
--- a/modin/core/dataframe/algebra/binary.py
+++ b/modin/core/dataframe/algebra/binary.py
@@ -382,6 +382,7 @@ def caller(
lambda x, y: func(x, y, *args, **kwargs),
[other._modin_frame],
join_type=join_type,
+ labels=labels,
dtypes=dtypes,
),
shape_hint=shape_hint,
diff --git a/modin/core/dataframe/algebra/default2pandas/groupby.py b/modin/core/dataframe/algebra/default2pandas/groupby.py
index 1c6256cd24b..d5bd9cd4a20 100644
--- a/modin/core/dataframe/algebra/default2pandas/groupby.py
+++ b/modin/core/dataframe/algebra/default2pandas/groupby.py
@@ -601,10 +601,13 @@ def register(cls, func, **kwargs):
# 2. `.apply(func)` applies func to a DataFrames, holding a whole group (group-wise).
# 3. `.transform(func)` is the same as `.apply()` but also broadcast the `func`
# result to the group's original shape.
+ # 4. 'direct' mode means that the passed `func` has to be applied directly
+ # to the `pandas.DataFrameGroupBy` object.
_aggregation_methods_dict = {
"axis_wise": pandas.core.groupby.DataFrameGroupBy.aggregate,
"group_wise": pandas.core.groupby.DataFrameGroupBy.apply,
"transform": pandas.core.groupby.DataFrameGroupBy.transform,
+ "direct": lambda grp, func, *args, **kwargs: func(grp, *args, **kwargs),
}
@classmethod
@@ -637,4 +640,5 @@ class SeriesGroupByDefault(GroupByDefault):
"axis_wise": pandas.core.groupby.SeriesGroupBy.aggregate,
"group_wise": pandas.core.groupby.SeriesGroupBy.apply,
"transform": pandas.core.groupby.SeriesGroupBy.transform,
+ "direct": lambda grp, func, *args, **kwargs: func(grp, *args, **kwargs),
}
diff --git a/modin/core/dataframe/algebra/default2pandas/rolling.py b/modin/core/dataframe/algebra/default2pandas/rolling.py
index 0259849225e..cdc6c7f5480 100644
--- a/modin/core/dataframe/algebra/default2pandas/rolling.py
+++ b/modin/core/dataframe/algebra/default2pandas/rolling.py
@@ -37,9 +37,9 @@ def _build_rolling(cls, func):
Function that takes pandas DataFrame and applies `func` on a rolling window.
"""
- def fn(df, rolling_args, *args, **kwargs):
+ def fn(df, rolling_kwargs, *args, **kwargs):
"""Create rolling window for the passed frame and execute specified `func` on it."""
- roller = df.rolling(*rolling_args)
+ roller = df.rolling(**rolling_kwargs)
if type(func) == property:
return func.fget(roller)
diff --git a/modin/core/dataframe/base/dataframe/dataframe.py b/modin/core/dataframe/base/dataframe/dataframe.py
index 44c8efa8695..07274788977 100644
--- a/modin/core/dataframe/base/dataframe/dataframe.py
+++ b/modin/core/dataframe/base/dataframe/dataframe.py
@@ -209,7 +209,7 @@ def window(
Notes
-----
- The user-defined reduce function must reduce each window’s column
+ The user-defined reduce function must reduce each window's column
(row if axis=1) down to a single value.
"""
pass
@@ -467,7 +467,7 @@ def from_labels(self) -> "ModinDataframe":
Notes
-----
- In the case that the dataframe has hierarchical labels, all label "levels” are inserted into the dataframe
+ In the case that the dataframe has hierarchical labels, all label "levels" are inserted into the dataframe
in the order they occur in the labels, with the outermost being in position 0.
"""
pass
diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py
index 44736aa6753..db15c489a4a 100644
--- a/modin/core/dataframe/pandas/dataframe/dataframe.py
+++ b/modin/core/dataframe/pandas/dataframe/dataframe.py
@@ -23,7 +23,11 @@
import datetime
from pandas.api.types import is_object_dtype
from pandas.core.indexes.api import Index, RangeIndex
-from pandas.core.dtypes.common import is_numeric_dtype, is_list_like
+from pandas.core.dtypes.common import (
+ is_numeric_dtype,
+ is_list_like,
+ is_categorical_dtype,
+)
from pandas._libs.lib import no_default
from typing import List, Hashable, Optional, Callable, Union, Dict, TYPE_CHECKING
@@ -2896,20 +2900,6 @@ def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all):
passed_len += len(internal)
return result_dict
- def __make_init_labels_args(self, partitions, index, columns) -> dict:
- kw = {}
- kw["index"], kw["row_lengths"] = (
- self._compute_axis_labels_and_lengths(0, partitions)
- if index is None
- else (index, None)
- )
- kw["columns"], kw["column_widths"] = (
- self._compute_axis_labels_and_lengths(1, partitions)
- if columns is None
- else (columns, None)
- )
- return kw
-
@lazy_metadata_decorator(apply_axis="both")
def broadcast_apply_select_indices(
self,
@@ -2988,9 +2978,9 @@ def broadcast_apply_select_indices(
broadcasted_dict,
keep_remaining,
)
-
- kw = self.__make_init_labels_args(new_partitions, new_index, new_columns)
- return self.__constructor__(new_partitions, **kw)
+ return self.__constructor__(
+ new_partitions, index=new_index, columns=new_columns
+ )
@lazy_metadata_decorator(apply_axis="both")
def broadcast_apply_full_axis(
@@ -3280,6 +3270,7 @@ def n_ary_op(
right_frames: list,
join_type="outer",
copartition_along_columns=True,
+ labels="replace",
dtypes=None,
):
"""
@@ -3296,6 +3287,9 @@ def n_ary_op(
copartition_along_columns : bool, default: True
Whether to perform copartitioning along columns or not.
For some ops this isn't needed (e.g., `fillna`).
+ labels : {"replace", "drop"}, default: "replace"
+ Whether to use labels from the joined DataFrame or drop them
+ altogether so that they are computed lazily later.
dtypes : series, default: None
Dtypes of the resultant dataframe, this argument will be
received if the resultant dtypes of n-opary operation is precomputed.
@@ -3346,6 +3340,8 @@ def n_ary_op(
left_parts, op, list_of_right_parts
)
)
+ if labels == "drop":
+ joined_index = joined_columns = row_lengths = column_widths = None
return self.__constructor__(
new_frame,
@@ -3544,7 +3540,7 @@ def groupby(
by = [by]
def apply_func(df): # pragma: no cover
- if any(dtype == "category" for dtype in df.dtypes[by].values):
+ if any(is_categorical_dtype(dtype) for dtype in df.dtypes[by].values):
raise NotImplementedError(
"Reshuffling groupby is not yet supported when grouping on a categorical column. "
+ "https://github.com/modin-project/modin/issues/5925"
@@ -3564,7 +3560,7 @@ def apply_func(df): # pragma: no cover
return result
- @lazy_metadata_decorator(apply_axis="opposite", axis_arg=0)
+ @lazy_metadata_decorator(apply_axis="both")
def groupby_reduce(
self,
axis,
@@ -3612,11 +3608,16 @@ def groupby_reduce(
self._get_dict_of_block_index(axis ^ 1, numeric_indices).keys()
)
+ if by_parts is not None:
+ # inplace operation
+ if by_parts.shape[axis] != self._partitions.shape[axis]:
+ self._filter_empties(compute_metadata=False)
new_partitions = self._partition_mgr_cls.groupby_reduce(
axis, self._partitions, by_parts, map_func, reduce_func, apply_indices
)
- kw = self.__make_init_labels_args(new_partitions, new_index, new_columns)
- return self.__constructor__(new_partitions, **kw)
+ return self.__constructor__(
+ new_partitions, index=new_index, columns=new_columns
+ )
@classmethod
def from_pandas(cls, df):
@@ -3724,6 +3725,8 @@ def to_pandas(self):
df = self._partition_mgr_cls.to_pandas(self._partitions)
if df.empty:
df = pandas.DataFrame(columns=self.columns, index=self.index)
+ if len(df.columns) and self.has_materialized_dtypes:
+ df = df.astype(self.dtypes)
else:
for axis, has_external_index in enumerate(
["has_materialized_index", "has_materialized_columns"]
diff --git a/modin/core/dataframe/pandas/partitioning/partition.py b/modin/core/dataframe/pandas/partitioning/partition.py
index 2c00983c65a..6f7666212d8 100644
--- a/modin/core/dataframe/pandas/partitioning/partition.py
+++ b/modin/core/dataframe/pandas/partitioning/partition.py
@@ -38,6 +38,24 @@ class PandasDataframePartition(ABC): # pragma: no cover
_width_cache = None
_identity_cache = None
_data = None
+ execution_wrapper = None
+
+ # this variable is intentionally initialized at runtime
+ # so as not to initialize the engine during import
+ _iloc_func = None
+
+ def __init__(self):
+ if type(self)._iloc_func is None:
+ # Places `_iloc` function into the storage to speed up
+ # remote function calls and caches the result.
+ # It also postpones engine initialization, which happens
+ # implicitly when `execution_wrapper.put` is called.
+ if self.execution_wrapper is not None:
+ type(self)._iloc_func = staticmethod(
+ self.execution_wrapper.put(self._iloc)
+ )
+ else:
+ type(self)._iloc_func = staticmethod(self._iloc)
@cache_readonly
def __constructor__(self):
@@ -236,7 +254,7 @@ def is_full_axis_mask(index, axis_length):
):
return copy(self)
- new_obj = self.add_to_apply_calls(self._iloc, row_labels, col_labels)
+ new_obj = self.add_to_apply_calls(self._iloc_func, row_labels, col_labels)
def try_recompute_cache(indices, previous_cache):
"""Compute new axis-length cache for the masked frame based on its previous cache."""
diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py
index 3a5f639c866..7dce9702f22 100644
--- a/modin/core/dataframe/pandas/partitioning/partition_manager.py
+++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py
@@ -241,6 +241,13 @@ def groupby_reduce(
)
if by is not None:
+ # need to make sure that the partitioning of the following objects
+ # coincides along the required axis, because `partition_manager.broadcast_apply`
+ # doesn't call `_copartition` unlike `modin_frame.broadcast_apply`
+ assert partitions.shape[axis] == by.shape[axis], (
+ f"the number of partitions along {axis=} is not equal: "
+ + f"{partitions.shape[axis]} != {by.shape[axis]}"
+ )
mapped_partitions = cls.broadcast_apply(
axis, map_func, left=partitions, right=by
)
@@ -1546,6 +1553,7 @@ def rebalance_partitions(cls, partitions):
return new_partitions, lengths
@classmethod
+ @wait_computations_if_benchmark_mode
def shuffle_partitions(
cls, partitions, index, shuffle_functions, final_shuffle_func
):
diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py
index c7101b62100..6173da7b94b 100644
--- a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py
+++ b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py
@@ -43,6 +43,7 @@ class PandasOnDaskDataframePartition(PandasDataframePartition):
execution_wrapper = DaskWrapper
def __init__(self, data, length=None, width=None, ip=None, call_queue=None):
+ super().__init__()
assert isinstance(data, Future)
self._data = data
if call_queue is None:
diff --git a/modin/core/execution/dispatching/factories/dispatcher.py b/modin/core/execution/dispatching/factories/dispatcher.py
index c42f856d838..64323f5b928 100644
--- a/modin/core/execution/dispatching/factories/dispatcher.py
+++ b/modin/core/execution/dispatching/factories/dispatcher.py
@@ -155,7 +155,16 @@ def _update_factory(cls, _):
raise FactoryNotFoundError(msg.format(factory_name))
cls.__factory = StubFactory.set_failing_name(factory_name)
else:
- cls.__factory.prepare()
+ try:
+ cls.__factory.prepare()
+ except ModuleNotFoundError as err:
+ # incorrectly initialized, should be reset to None again
+ # so that an unobvious error does not appear in the following code:
+ # "AttributeError: 'NoneType' object has no attribute 'from_non_pandas'"
+ cls.__factory = None
+ raise ModuleNotFoundError(
+ f"Make sure all required packages are installed: {str(err)}"
+ ) from err
@classmethod
@_inherit_docstrings(factories.BaseFactory._from_pandas)
diff --git a/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py b/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py
index ae3601da162..059f3ae2286 100644
--- a/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py
+++ b/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py
@@ -45,6 +45,7 @@ class PandasOnPythonDataframePartition(PandasDataframePartition):
execution_wrapper = PythonWrapper
def __init__(self, data, length=None, width=None, call_queue=None):
+ super().__init__()
if hasattr(data, "copy"):
data = data.copy()
self._data = data
diff --git a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py
index c391d3b54f9..75dd47f0ba9 100644
--- a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py
+++ b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py
@@ -43,6 +43,7 @@ class cuDFOnRayDataframePartition(PandasDataframePartition):
"""
def __init__(self, gpu_manager, key, length=None, width=None):
+ super().__init__()
self.gpu_manager = gpu_manager
self.key = key
self._length_cache = length
diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py
index 1804ba67dab..a7caabf2f9d 100644
--- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py
+++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py
@@ -46,6 +46,7 @@ class PandasOnRayDataframePartition(PandasDataframePartition):
execution_wrapper = RayWrapper
def __init__(self, data, length=None, width=None, ip=None, call_queue=None):
+ super().__init__()
assert isinstance(data, ObjectIDType)
self._data = data
if call_queue is None:
@@ -173,10 +174,6 @@ def __copy__(self):
call_queue=self.call_queue,
)
- # If Ray has not been initialized yet by Modin,
- # it will be initialized when calling `RayWrapper.put`.
- _iloc = execution_wrapper.put(PandasDataframePartition._iloc)
-
def mask(self, row_labels, col_labels):
"""
Lazily create a mask that extracts the indices provided.
diff --git a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py
index c880e1dfe50..fa1c71993cd 100644
--- a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py
+++ b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py
@@ -45,6 +45,7 @@ class PandasOnUnidistDataframePartition(PandasDataframePartition):
execution_wrapper = UnidistWrapper
def __init__(self, data, length=None, width=None, ip=None, call_queue=None):
+ super().__init__()
assert unidist.is_object_ref(data)
self._data = data
self.call_queue = call_queue if call_queue is not None else []
@@ -150,11 +151,6 @@ def wait(self):
self.drain_call_queue()
UnidistWrapper.wait(self._data)
- # If unidist has not been initialized yet by Modin,
- # unidist itself handles initialization when calling `unidist.put`,
- # which is called inside of `UnidistWrapper.put`.
- _iloc = execution_wrapper.put(PandasDataframePartition._iloc)
-
def mask(self, row_labels, col_labels):
"""
Lazily create a mask that extracts the indices provided.
diff --git a/modin/core/io/text/excel_dispatcher.py b/modin/core/io/text/excel_dispatcher.py
index eb1fd884d1c..4fd40b6424f 100644
--- a/modin/core/io/text/excel_dispatcher.py
+++ b/modin/core/io/text/excel_dispatcher.py
@@ -147,7 +147,7 @@ def _read(cls, io, **kwargs):
ex.shared_strings,
False,
)
- if cls.need_rich_text_param:
+ if cls.need_rich_text_param():
reader = WorksheetReader(*common_args, rich_text=False)
else:
reader = WorksheetReader(*common_args)
diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py
index 14447fc7b11..158212b37ec 100644
--- a/modin/core/io/text/text_file_dispatcher.py
+++ b/modin/core/io/text/text_file_dispatcher.py
@@ -1105,6 +1105,14 @@ def _read(cls, filepath_or_buffer, **kwargs):
if can_compute_metadata_while_skipping_rows:
pd_df_metadata = pd_df_metadata_temp
+ # compute dtypes if possible
+ common_dtypes = None
+ if kwargs["dtype"] is None:
+ most_common_dtype = (object,)
+ common_dtypes = {}
+ for col, dtype in pd_df_metadata.dtypes.to_dict().items():
+ if dtype in most_common_dtype:
+ common_dtypes[col] = dtype
column_names = pd_df_metadata.columns
column_widths, num_splits = cls._define_metadata(pd_df_metadata, column_names)
# kwargs that will be passed to the workers
@@ -1117,6 +1125,7 @@ def _read(cls, filepath_or_buffer, **kwargs):
skiprows=None,
nrows=None,
compression=compression_infered,
+ common_dtypes=common_dtypes,
)
# this is done mostly for performance; see PR#5678 for details
filepath_or_buffer_md_ref = cls.put(filepath_or_buffer_md)
diff --git a/modin/core/storage_formats/base/doc_utils.py b/modin/core/storage_formats/base/doc_utils.py
index 847ddaecd2f..8b58af073ad 100644
--- a/modin/core/storage_formats/base/doc_utils.py
+++ b/modin/core/storage_formats/base/doc_utils.py
@@ -598,11 +598,11 @@ def doc_window_method(
if action is None:
action = f"compute {result}"
if win_type == "rolling window":
- window_args_name = "rolling_args"
+ window_args_name = "rolling_kwargs"
elif win_type == "expanding window":
window_args_name = "expanding_args"
else:
- window_args_name = "window_args"
+ window_args_name = "window_kwargs"
# We need that `params` value ended with new line to have
# an empty line between "parameters" and "return" sections
diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py
index a535c4ca52d..188960a19a1 100644
--- a/modin/core/storage_formats/base/query_compiler.py
+++ b/modin/core/storage_formats/base/query_compiler.py
@@ -579,6 +579,10 @@ def combine_first(self, other, **kwargs): # noqa: PR02
def eq(self, other, **kwargs): # noqa: PR02
return BinaryDefault.register(pandas.DataFrame.eq)(self, other=other, **kwargs)
+ @doc_utils.add_refer_to("DataFrame.equals")
+ def equals(self, other): # noqa: PR01, RT01
+ return BinaryDefault.register(pandas.DataFrame.equals)(self, other=other)
+
@doc_utils.doc_binary_method(operation="integer division", sign="//")
def floordiv(self, other, **kwargs): # noqa: PR02
return BinaryDefault.register(pandas.DataFrame.floordiv)(
@@ -3021,6 +3025,68 @@ def groupby_size(
result.columns = result.columns[:-1].append(pandas.Index(["size"]))
return result
+ @doc_utils.add_refer_to("GroupBy.rolling")
+ def groupby_rolling(
+ self,
+ by,
+ agg_func,
+ axis,
+ groupby_kwargs,
+ rolling_kwargs,
+ agg_args,
+ agg_kwargs,
+ drop=False,
+ ):
+ """
+ Group QueryCompiler data and apply passed aggregation function to a rolling window in each group.
+
+ Parameters
+ ----------
+ by : BaseQueryCompiler, column or index label, Grouper or list of such
+ Object that determines groups.
+ agg_func : str, dict or callable(Series | DataFrame) -> scalar | Series | DataFrame
+ Function to apply to the GroupBy object.
+ axis : {0, 1}
+ Axis to group and apply aggregation function along.
+ 0 is for index, while 1 is for columns.
+ groupby_kwargs : dict
+ GroupBy parameters as expected by ``modin.pandas.DataFrame.groupby`` signature.
+ rolling_kwargs : dict
+ Parameters to build a rolling window as expected by ``modin.pandas.window.RollingGroupby`` signature.
+ agg_args : list-like
+ Positional arguments to pass to the `agg_func`.
+ agg_kwargs : dict
+ Keyword arguments to pass to the `agg_func`.
+ drop : bool, default: False
+ If `by` is a QueryCompiler indicates whether or not by-data came
+ from the `self`.
+
+ Returns
+ -------
+ BaseQueryCompiler
+ QueryCompiler containing the result of groupby aggregation.
+ """
+ if isinstance(agg_func, str):
+ str_func = agg_func
+
+ def agg_func(window, *args, **kwargs):
+ return getattr(window, str_func)(*args, **kwargs)
+
+ else:
+ assert callable(agg_func)
+ return self.groupby_agg(
+ by=by,
+ agg_func=lambda grp, *args, **kwargs: agg_func(
+ grp.rolling(**rolling_kwargs), *args, **kwargs
+ ),
+ axis=axis,
+ groupby_kwargs=groupby_kwargs,
+ agg_args=agg_args,
+ agg_kwargs=agg_kwargs,
+ how="direct",
+ drop=drop,
+ )
+
@doc_utils.add_refer_to("GroupBy.aggregate")
def groupby_agg(
self,
@@ -5760,9 +5826,9 @@ def str_casefold(self):
**kwargs : dict""",
build_rules="udf_aggregation",
)
- def rolling_aggregate(self, fold_axis, rolling_args, func, *args, **kwargs):
+ def rolling_aggregate(self, fold_axis, rolling_kwargs, func, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.aggregate)(
- self, rolling_args, func, *args, **kwargs
+ self, rolling_kwargs, func, *args, **kwargs
)
# FIXME: at the query compiler method `rolling_apply` is an alias for `rolling_aggregate`,
@@ -5787,7 +5853,7 @@ def rolling_aggregate(self, fold_axis, rolling_args, func, *args, **kwargs):
def rolling_apply(
self,
fold_axis,
- rolling_args,
+ rolling_kwargs,
func,
raw=False,
engine=None,
@@ -5796,7 +5862,7 @@ def rolling_apply(
kwargs=None,
):
return RollingDefault.register(pandas.core.window.rolling.Rolling.apply)(
- self, rolling_args, func, raw, engine, engine_kwargs, args, kwargs
+ self, rolling_kwargs, func, raw, engine, engine_kwargs, args, kwargs
)
@doc_utils.doc_window_method(
@@ -5810,18 +5876,18 @@ def rolling_apply(
**kwargs : dict""",
)
def rolling_corr(
- self, fold_axis, rolling_args, other=None, pairwise=None, *args, **kwargs
+ self, fold_axis, rolling_kwargs, other=None, pairwise=None, *args, **kwargs
):
return RollingDefault.register(pandas.core.window.rolling.Rolling.corr)(
- self, rolling_args, other, pairwise, *args, **kwargs
+ self, rolling_kwargs, other, pairwise, *args, **kwargs
)
@doc_utils.doc_window_method(
window_cls_name="Rolling", result="number of non-NA values", refer_to="count"
)
- def rolling_count(self, fold_axis, rolling_args):
+ def rolling_count(self, fold_axis, rolling_kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.count)(
- self, rolling_args
+ self, rolling_kwargs
)
@doc_utils.doc_window_method(
@@ -5835,10 +5901,10 @@ def rolling_count(self, fold_axis, rolling_args):
**kwargs : dict""",
)
def rolling_cov(
- self, fold_axis, rolling_args, other=None, pairwise=None, ddof=1, **kwargs
+ self, fold_axis, rolling_kwargs, other=None, pairwise=None, ddof=1, **kwargs
):
return RollingDefault.register(pandas.core.window.rolling.Rolling.cov)(
- self, rolling_args, other, pairwise, ddof, **kwargs
+ self, rolling_kwargs, other, pairwise, ddof, **kwargs
)
@doc_utils.doc_window_method(
@@ -5847,9 +5913,9 @@ def rolling_cov(
refer_to="kurt",
params="**kwargs : dict",
)
- def rolling_kurt(self, fold_axis, rolling_args, **kwargs):
+ def rolling_kurt(self, fold_axis, rolling_kwargs, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.kurt)(
- self, rolling_args, **kwargs
+ self, rolling_kwargs, **kwargs
)
@doc_utils.doc_window_method(
@@ -5860,9 +5926,9 @@ def rolling_kurt(self, fold_axis, rolling_args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def rolling_max(self, fold_axis, rolling_args, *args, **kwargs):
+ def rolling_max(self, fold_axis, rolling_kwargs, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.max)(
- self, rolling_args, *args, **kwargs
+ self, rolling_kwargs, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -5873,9 +5939,9 @@ def rolling_max(self, fold_axis, rolling_args, *args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def rolling_mean(self, fold_axis, rolling_args, *args, **kwargs):
+ def rolling_mean(self, fold_axis, rolling_kwargs, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.mean)(
- self, rolling_args, *args, **kwargs
+ self, rolling_kwargs, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -5884,9 +5950,9 @@ def rolling_mean(self, fold_axis, rolling_args, *args, **kwargs):
refer_to="median",
params="**kwargs : dict",
)
- def rolling_median(self, fold_axis, rolling_args, **kwargs):
+ def rolling_median(self, fold_axis, rolling_kwargs, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.median)(
- self, rolling_args, **kwargs
+ self, rolling_kwargs, **kwargs
)
@doc_utils.doc_window_method(
@@ -5897,9 +5963,9 @@ def rolling_median(self, fold_axis, rolling_args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def rolling_min(self, fold_axis, rolling_args, *args, **kwargs):
+ def rolling_min(self, fold_axis, rolling_kwargs, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.min)(
- self, rolling_args, *args, **kwargs
+ self, rolling_kwargs, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -5912,10 +5978,10 @@ def rolling_min(self, fold_axis, rolling_args, *args, **kwargs):
**kwargs : dict""",
)
def rolling_quantile(
- self, fold_axis, rolling_args, quantile, interpolation="linear", **kwargs
+ self, fold_axis, rolling_kwargs, quantile, interpolation="linear", **kwargs
):
return RollingDefault.register(pandas.core.window.rolling.Rolling.quantile)(
- self, rolling_args, quantile, interpolation, **kwargs
+ self, rolling_kwargs, quantile, interpolation, **kwargs
)
@doc_utils.doc_window_method(
@@ -5924,9 +5990,9 @@ def rolling_quantile(
refer_to="skew",
params="**kwargs : dict",
)
- def rolling_skew(self, fold_axis, rolling_args, **kwargs):
+ def rolling_skew(self, fold_axis, rolling_kwargs, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.skew)(
- self, rolling_args, **kwargs
+ self, rolling_kwargs, **kwargs
)
@doc_utils.doc_window_method(
@@ -5938,9 +6004,9 @@ def rolling_skew(self, fold_axis, rolling_args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def rolling_std(self, fold_axis, rolling_args, ddof=1, *args, **kwargs):
+ def rolling_std(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.std)(
- self, rolling_args, ddof, *args, **kwargs
+ self, rolling_kwargs, ddof, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -5951,9 +6017,9 @@ def rolling_std(self, fold_axis, rolling_args, ddof=1, *args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def rolling_sum(self, fold_axis, rolling_args, *args, **kwargs):
+ def rolling_sum(self, fold_axis, rolling_kwargs, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.sum)(
- self, rolling_args, *args, **kwargs
+ self, rolling_kwargs, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -5964,9 +6030,9 @@ def rolling_sum(self, fold_axis, rolling_args, *args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def rolling_sem(self, fold_axis, rolling_args, *args, **kwargs):
+ def rolling_sem(self, fold_axis, rolling_kwargs, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.sem)(
- self, rolling_args, *args, **kwargs
+ self, rolling_kwargs, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -5978,9 +6044,9 @@ def rolling_sem(self, fold_axis, rolling_args, *args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def rolling_var(self, fold_axis, rolling_args, ddof=1, *args, **kwargs):
+ def rolling_var(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwargs):
return RollingDefault.register(pandas.core.window.rolling.Rolling.var)(
- self, rolling_args, ddof, *args, **kwargs
+ self, rolling_kwargs, ddof, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -5998,7 +6064,7 @@ def rolling_var(self, fold_axis, rolling_args, ddof=1, *args, **kwargs):
def rolling_rank(
self,
fold_axis,
- rolling_args,
+ rolling_kwargs,
method="average",
ascending=True,
pct=False,
@@ -6008,7 +6074,7 @@ def rolling_rank(
):
return RollingDefault.register(pandas.core.window.rolling.Rolling.rank)(
self,
- rolling_args,
+ rolling_kwargs,
method=method,
ascending=ascending,
pct=pct,
@@ -6371,9 +6437,9 @@ def expanding_rank(
*args : iterable
**kwargs : dict""",
)
- def window_mean(self, fold_axis, window_args, *args, **kwargs):
+ def window_mean(self, fold_axis, window_kwargs, *args, **kwargs):
return RollingDefault.register(pandas.core.window.Window.mean)(
- self, window_args, *args, **kwargs
+ self, window_kwargs, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -6386,9 +6452,9 @@ def window_mean(self, fold_axis, window_args, *args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def window_std(self, fold_axis, window_args, ddof=1, *args, **kwargs):
+ def window_std(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs):
return RollingDefault.register(pandas.core.window.Window.std)(
- self, window_args, ddof, *args, **kwargs
+ self, window_kwargs, ddof, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -6400,9 +6466,9 @@ def window_std(self, fold_axis, window_args, ddof=1, *args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def window_sum(self, fold_axis, window_args, *args, **kwargs):
+ def window_sum(self, fold_axis, window_kwargs, *args, **kwargs):
return RollingDefault.register(pandas.core.window.Window.sum)(
- self, window_args, *args, **kwargs
+ self, window_kwargs, *args, **kwargs
)
@doc_utils.doc_window_method(
@@ -6415,9 +6481,9 @@ def window_sum(self, fold_axis, window_args, *args, **kwargs):
*args : iterable
**kwargs : dict""",
)
- def window_var(self, fold_axis, window_args, ddof=1, *args, **kwargs):
+ def window_var(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs):
return RollingDefault.register(pandas.core.window.Window.var)(
- self, window_args, ddof, *args, **kwargs
+ self, window_kwargs, ddof, *args, **kwargs
)
# End of Window methods
@@ -6527,6 +6593,7 @@ def repartition(self, axis=None):
lambda df: df,
new_index=self._modin_frame.copy_index_cache(),
new_columns=self._modin_frame.copy_columns_cache(),
+ dtypes=self._modin_frame.copy_dtypes_cache(),
keep_partitioning=False,
sync_labels=False,
)
diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py
index 207f5e775a9..5139a651bf6 100644
--- a/modin/core/storage_formats/pandas/parsers.py
+++ b/modin/core/storage_formats/pandas/parsers.py
@@ -162,6 +162,7 @@ def generic_parse(fname, **kwargs):
start = kwargs.pop("start", None)
end = kwargs.pop("end", None)
header_size = kwargs.pop("header_size", 0)
+ common_dtypes = kwargs.pop("common_dtypes", None)
encoding = kwargs.get("encoding", None)
callback = kwargs.pop("callback")
if start is None or end is None:
@@ -208,6 +209,8 @@ def generic_parse(fname, **kwargs):
if "memory_map" in kwargs:
kwargs = kwargs.copy()
del kwargs["memory_map"]
+ if common_dtypes is not None:
+ kwargs["dtype"] = common_dtypes
pandas_df = callback(BytesIO(to_read), **kwargs)
index = (
pandas_df.index
@@ -557,8 +560,8 @@ def _convert_cell(cls, cell, convert_float):
return cell.value
- @property
- def need_rich_text_param(self):
+ @staticmethod
+ def need_rich_text_param():
"""
Determine whether a required `rich_text` parameter should be specified for the ``WorksheetReader`` constructor.
diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py
index e5668e26cc4..243ba712cff 100644
--- a/modin/core/storage_formats/pandas/query_compiler.py
+++ b/modin/core/storage_formats/pandas/query_compiler.py
@@ -21,7 +21,6 @@
import re
import numpy as np
import pandas
-import functools
from pandas.api.types import is_scalar
from pandas.core.common import is_bool_indexer
from pandas.core.indexing import check_bool_indexer
@@ -32,6 +31,7 @@
is_datetime_or_timedelta_dtype,
is_datetime64_any_dtype,
is_bool_dtype,
+ is_categorical_dtype,
)
from pandas.core.dtypes.cast import find_common_type
from pandas.errors import DataError, MergeError
@@ -390,6 +390,12 @@ def to_numpy(self, **kwargs):
combine = Binary.register(pandas.DataFrame.combine, infer_dtypes="common_cast")
combine_first = Binary.register(pandas.DataFrame.combine_first, infer_dtypes="bool")
eq = Binary.register(pandas.DataFrame.eq, infer_dtypes="bool")
+ equals = Binary.register(
+ lambda df, other: pandas.DataFrame([[df.equals(other)]]),
+ join_type=None,
+ labels="drop",
+ infer_dtypes="bool",
+ )
floordiv = Binary.register(pandas.DataFrame.floordiv, infer_dtypes="common_cast")
ge = Binary.register(pandas.DataFrame.ge, infer_dtypes="bool")
gt = Binary.register(pandas.DataFrame.gt, infer_dtypes="bool")
@@ -1378,81 +1384,83 @@ def expanding_corr(
)
window_mean = Fold.register(
- lambda df, rolling_args, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).mean(*args, **kwargs)
+ lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).mean(*args, **kwargs)
)
)
window_sum = Fold.register(
- lambda df, rolling_args, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).sum(*args, **kwargs)
+ lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).sum(*args, **kwargs)
)
)
window_var = Fold.register(
- lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).var(ddof=ddof, *args, **kwargs)
+ lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).var(ddof=ddof, *args, **kwargs)
)
)
window_std = Fold.register(
- lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).std(ddof=ddof, *args, **kwargs)
+ lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).std(ddof=ddof, *args, **kwargs)
)
)
rolling_count = Fold.register(
- lambda df, rolling_args: pandas.DataFrame(df.rolling(*rolling_args).count())
+ lambda df, rolling_kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).count()
+ )
)
rolling_sum = Fold.register(
- lambda df, rolling_args, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).sum(*args, **kwargs)
+ lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).sum(*args, **kwargs)
)
)
rolling_sem = Fold.register(
- lambda df, rolling_args, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).sem(*args, **kwargs)
+ lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).sem(*args, **kwargs)
)
)
rolling_mean = Fold.register(
- lambda df, rolling_args, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).mean(*args, **kwargs)
+ lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).mean(*args, **kwargs)
)
)
rolling_median = Fold.register(
- lambda df, rolling_args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).median(**kwargs)
+ lambda df, rolling_kwargs, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).median(**kwargs)
)
)
rolling_var = Fold.register(
- lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).var(ddof=ddof, *args, **kwargs)
+ lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).var(ddof=ddof, *args, **kwargs)
)
)
rolling_std = Fold.register(
- lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).std(ddof=ddof, *args, **kwargs)
+ lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).std(ddof=ddof, *args, **kwargs)
)
)
rolling_min = Fold.register(
- lambda df, rolling_args, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).min(*args, **kwargs)
+ lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).min(*args, **kwargs)
)
)
rolling_max = Fold.register(
- lambda df, rolling_args, *args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).max(*args, **kwargs)
+ lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).max(*args, **kwargs)
)
)
rolling_skew = Fold.register(
- lambda df, rolling_args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).skew(**kwargs)
+ lambda df, rolling_kwargs, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).skew(**kwargs)
)
)
rolling_kurt = Fold.register(
- lambda df, rolling_args, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).kurt(**kwargs)
+ lambda df, rolling_kwargs, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).kurt(**kwargs)
)
)
rolling_apply = Fold.register(
- lambda df, rolling_args, func, raw, engine, engine_kwargs, args, kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).apply(
+ lambda df, rolling_kwargs, func, raw, engine, engine_kwargs, args, kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).apply(
func=func,
raw=raw,
engine=engine,
@@ -1463,15 +1471,15 @@ def expanding_corr(
)
)
rolling_quantile = Fold.register(
- lambda df, rolling_args, quantile, interpolation, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).quantile(
+ lambda df, rolling_kwargs, quantile, interpolation, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).quantile(
quantile=quantile, interpolation=interpolation, **kwargs
)
)
)
rolling_rank = Fold.register(
- lambda df, rolling_args, method, ascending, pct, numeric_only, **kwargs: pandas.DataFrame(
- df.rolling(*rolling_args).rank(
+ lambda df, rolling_kwargs, method, ascending, pct, numeric_only, **kwargs: pandas.DataFrame(
+ df.rolling(**rolling_kwargs).rank(
method=method,
ascending=ascending,
pct=pct,
@@ -1481,43 +1489,43 @@ def expanding_corr(
)
)
- def rolling_corr(self, axis, rolling_args, other, pairwise, *args, **kwargs):
+ def rolling_corr(self, axis, rolling_kwargs, other, pairwise, *args, **kwargs):
if len(self.columns) > 1:
return self.default_to_pandas(
- lambda df: pandas.DataFrame.rolling(df, *rolling_args).corr(
+ lambda df: pandas.DataFrame.rolling(df, **rolling_kwargs).corr(
other=other, pairwise=pairwise, *args, **kwargs
)
)
else:
return Fold.register(
lambda df: pandas.DataFrame(
- df.rolling(*rolling_args).corr(
+ df.rolling(**rolling_kwargs).corr(
other=other, pairwise=pairwise, *args, **kwargs
)
)
)(self, axis)
- def rolling_cov(self, axis, rolling_args, other, pairwise, ddof, **kwargs):
+ def rolling_cov(self, axis, rolling_kwargs, other, pairwise, ddof, **kwargs):
if len(self.columns) > 1:
return self.default_to_pandas(
- lambda df: pandas.DataFrame.rolling(df, *rolling_args).cov(
+ lambda df: pandas.DataFrame.rolling(df, **rolling_kwargs).cov(
other=other, pairwise=pairwise, ddof=ddof, **kwargs
)
)
else:
return Fold.register(
lambda df: pandas.DataFrame(
- df.rolling(*rolling_args).cov(
+ df.rolling(**rolling_kwargs).cov(
other=other, pairwise=pairwise, ddof=ddof, **kwargs
)
)
)(self, axis)
- def rolling_aggregate(self, axis, rolling_args, func, *args, **kwargs):
+ def rolling_aggregate(self, axis, rolling_kwargs, func, *args, **kwargs):
new_modin_frame = self._modin_frame.apply_full_axis(
axis,
lambda df: pandas.DataFrame(
- df.rolling(*rolling_args).aggregate(func=func, *args, **kwargs)
+ df.rolling(**rolling_kwargs).aggregate(func=func, *args, **kwargs)
),
new_index=self.index,
)
@@ -2454,8 +2462,10 @@ def rank(self, **kwargs):
new_modin_frame = self._modin_frame.apply_full_axis(
axis,
lambda df: df.rank(**kwargs),
- new_index=self.index,
- new_columns=self.columns if not numeric_only else None,
+ new_index=self._modin_frame.copy_index_cache(),
+ new_columns=self._modin_frame.copy_columns_cache()
+ if not numeric_only
+ else None,
dtypes=np.float64,
)
return self.__constructor__(new_modin_frame)
@@ -3410,7 +3420,7 @@ def _groupby_shuffle(
# So this check works only if we have dtypes cache materialized, otherwise the exception will be thrown
# inside the kernel and so it will be uncatchable. TODO: figure out a better way to handle this.
if self._modin_frame._dtypes is not None and any(
- dtype == "category" for dtype in self.dtypes[by].values
+ is_categorical_dtype(dtype) for dtype in self.dtypes[by].values
):
raise NotImplementedError(
"Reshuffling groupby is not yet supported when grouping on a categorical column. "
@@ -3439,9 +3449,11 @@ def _groupby_shuffle(
else:
obj = self
- agg_func = functools.partial(
- GroupByDefault.get_aggregation_method(how), func=agg_func
- )
+ agg_method = GroupByDefault.get_aggregation_method(how)
+ original_agg_func = agg_func
+
+ def agg_func(grp, *args, **kwargs):
+ return agg_method(grp, original_agg_func, *args, **kwargs)
result = obj._modin_frame.groupby(
axis=axis,
@@ -3504,6 +3516,51 @@ def groupby_cov(
drop=drop,
)
+ def groupby_rolling(
+ self,
+ by,
+ agg_func,
+ axis,
+ groupby_kwargs,
+ rolling_kwargs,
+ agg_args,
+ agg_kwargs,
+ drop=False,
+ ):
+ # 'corr' and 'cov' require knowledge about the whole row axis (all columns have
+ # to be available in the same partitions), this requirement is not being satisfied
+ # in the current groupby implementation
+ unsupported_groupby = (
+ agg_func in ("corr", "cov") or rolling_kwargs.get("on") is not None
+ )
+
+ if isinstance(agg_func, str):
+ str_func = agg_func
+
+ def agg_func(window, *args, **kwargs):
+ return getattr(window, str_func)(*args, **kwargs)
+
+ else:
+ assert callable(agg_func)
+
+ if unsupported_groupby:
+ obj = super(PandasQueryCompiler, self)
+ else:
+ obj = self
+
+ return obj.groupby_agg(
+ by=by,
+ agg_func=lambda grp, *args, **kwargs: agg_func(
+ grp.rolling(**rolling_kwargs), *args, **kwargs
+ ),
+ axis=axis,
+ groupby_kwargs=groupby_kwargs,
+ agg_args=agg_args,
+ agg_kwargs=agg_kwargs,
+ how="direct",
+ drop=drop,
+ )
+
def groupby_agg(
self,
by,
@@ -3559,12 +3616,12 @@ def groupby_agg(
how == "axis_wise"
), f"Only 'axis_wise' aggregation is supported with dictionary functions, got: {how}"
else:
- agg_func = functools.partial(
- (
- SeriesGroupByDefault if series_groupby else GroupByDefault
- ).get_aggregation_method(how),
- func=agg_func,
- )
+ agg_method = (
+ SeriesGroupByDefault if series_groupby else GroupByDefault
+ ).get_aggregation_method(how)
+
+ def agg_func(grp, *args, **kwargs):
+ return agg_method(grp, original_agg_func, *args, **kwargs)
# since we're going to modify `groupby_kwargs` dict in a `groupby_agg_builder`,
# we want to copy it to not propagate these changes into source dict, in case
@@ -4081,7 +4138,7 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs):
def cat_codes(self):
def func(df: pandas.DataFrame) -> pandas.DataFrame:
ser = df.iloc[:, 0]
- assert ser.dtype == "category"
+ assert is_categorical_dtype(ser.dtype)
return ser.cat.codes.to_frame(name=MODIN_UNNAMED_SERIES_LABEL)
res = self._modin_frame.map(func=func, new_columns=[MODIN_UNNAMED_SERIES_LABEL])
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py
index 75c96d59557..4327b0c277b 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py
@@ -15,15 +15,70 @@
import abc
import uuid
-import os
+from typing import Tuple, List
import pyarrow as pa
import numpy as np
-from modin.config import OmnisciFragmentSize, HdkFragmentSize
from modin.error_message import ErrorMessage
+class DbTable(abc.ABC):
+ """
+ Base class representing a table in the HDK database.
+
+ Attributes
+ ----------
+ name : str
+ Table name.
+ """
+
+ @property
+ @abc.abstractmethod
+ def shape(self) -> Tuple[int, int]:
+ """
+ Return a tuple with the number of rows and columns.
+
+ Returns
+ -------
+ tuple of int
+ """
+ pass
+
+ @property
+ @abc.abstractmethod
+ def column_names(self) -> List[str]:
+ """
+ Return a list of the table column names.
+
+ Returns
+ -------
+ list of str
+ """
+ pass
+
+ @abc.abstractmethod
+ def to_arrow(self) -> pa.Table:
+ """
+ Convert this table to arrow.
+
+ Returns
+ -------
+ pyarrow.Table
+ """
+ pass
+
+ def __len__(self):
+ """
+ Return the number of rows in the table.
+
+ Returns
+ -------
+ int
+ """
+ return self.shape[0]
+
+
class BaseDbWorker(abc.ABC):
"""Base class for HDK storage format based execution engine ."""
@@ -53,7 +108,7 @@ def executeDML(cls, query):
Returns
-------
- pyarrow.Table
+ DbTable
Execution result.
"""
pass
@@ -71,7 +126,7 @@ def executeRA(cls, query):
Returns
-------
- pyarrow.Table
+ DbTable
Execution result.
"""
pass
@@ -184,36 +239,6 @@ def cast_to_compatible_types(table):
return table
- @classmethod
- def compute_fragment_size(cls, table):
- """
- Compute fragment size to be used for table import.
-
- Parameters
- ----------
- table : pyarrow.Table
- A table to import.
-
- Returns
- -------
- int
- Fragment size to use for import.
- """
- fragment_size = HdkFragmentSize.get()
- if fragment_size is None:
- fragment_size = OmnisciFragmentSize.get()
- if fragment_size is None:
- cpu_count = os.cpu_count()
- if cpu_count is not None:
- fragment_size = table.num_rows // cpu_count
- fragment_size = min(fragment_size, 2**25)
- fragment_size = max(fragment_size, 2**18)
- else:
- fragment_size = 0
- else:
- fragment_size = int(fragment_size)
- return fragment_size
-
@classmethod
@abc.abstractmethod
def import_arrow_table(cls, table, name=None):
@@ -229,8 +254,8 @@ def import_arrow_table(cls, table, name=None):
Returns
-------
- str
- Imported table name.
+ DbTable
+ Imported table.
"""
pass
@@ -248,7 +273,7 @@ def import_pandas_dataframe(cls, df, name=None):
Returns
-------
- str
- Imported table name.
+ DbTable
+ Imported table.
"""
return cls.import_arrow_table(pa.Table.from_pandas(df), name=name)
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py
index 46c42f79fef..8929890d11c 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py
@@ -20,6 +20,7 @@
import abc
+from .db_worker import DbTable
from .dataframe.utils import ColNameCodec
from .expr import BaseExpr
@@ -181,9 +182,10 @@ class CalciteScanNode(CalciteBaseNode):
def __init__(self, modin_frame):
assert modin_frame._partitions is not None
- assert modin_frame._partitions[0][0].frame_id is not None
+ table = modin_frame._partitions[0][0].get()
+ assert isinstance(table, DbTable)
super(CalciteScanNode, self).__init__("EnumerableTableScan")
- self.table = ["hdk", modin_frame._partitions[0][0].frame_id]
+ self.table = ["hdk", table.name]
self.fieldNames = [
ColNameCodec.encode(col) for col in modin_frame._table_cols
] + ["rowid"]
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py
index 6f122339327..bd56ac7bab0 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py
@@ -342,6 +342,7 @@ class InputContext:
"min": "MIN",
"size": "COUNT",
"count": "COUNT",
+ "median": "APPROX_QUANTILE",
}
_no_arg_aggregates = {"size"}
@@ -737,6 +738,18 @@ def _push(self, node):
node : CalciteBaseNode
A node to add.
"""
+ if (
+ len(self.res) != 0
+ and isinstance(node, CalciteProjectionNode)
+ and isinstance(self.res[-1], CalciteProjectionNode)
+ and all(isinstance(expr, CalciteInputRefExpr) for expr in node.exprs)
+ ):
+ # Replace the last CalciteProjectionNode with this one and
+ # translate the input refs.
+ exprs = self.res.pop().exprs
+ node = CalciteProjectionNode(
+ node.fields, [exprs[expr.input] for expr in node.exprs]
+ )
self.res.append(node)
def _last(self):
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py
index f3d7c34d6f4..a92e5afd212 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py
@@ -13,6 +13,8 @@
"""Module provides ``CalciteSerializer`` class."""
+from pandas.core.dtypes.common import is_datetime64_dtype
+
from .expr import (
BaseExpr,
LiteralExpr,
@@ -65,6 +67,7 @@ class CalciteSerializer:
"bool": "BOOLEAN",
"float32": "FLOAT",
"float64": "DOUBLE",
+ "datetime64": "TIMESTAMP",
}
_INT_OPTS = {
@@ -79,6 +82,16 @@ class CalciteSerializer:
int: ("BIGINT", 19),
}
+ _TIMESTAMP_PRECISION = {
+ "s": 0,
+ "ms": 3,
+ "us": 6,
+ "ns": 9,
+ }
+ _DTYPE_STRINGS.update(
+ {f"datetime64[{u}]": "TIMESTAMP" for u in _TIMESTAMP_PRECISION}
+ )
+
def serialize(self, plan):
"""
Serialize a sequence of Calcite nodes into JSON format.
@@ -327,6 +340,20 @@ def serialize_literal(self, literal):
"type_scale": -2147483648,
"type_precision": 1,
}
+ if isinstance(val, np.datetime64):
+ unit = np.datetime_data(val)[0]
+ precision = self._TIMESTAMP_PRECISION.get(unit, None)
+ if precision is not None:
+ return {
+ "literal": int(val.astype(np.int64)),
+ "type": "TIMESTAMP",
+ "target_type": "TIMESTAMP",
+ "scale": -2147483648,
+ "precision": precision,
+ "type_scale": -2147483648,
+ "type_precision": precision,
+ }
+
raise NotImplementedError(f"Can not serialize {type(val).__name__}")
def opts_for_int_type(self, int_type):
@@ -367,7 +394,11 @@ def serialize_dtype(self, dtype):
"""
_warn_if_unsigned(dtype)
try:
- return {"type": self._DTYPE_STRINGS[dtype.name], "nullable": True}
+ type_info = {"type": self._DTYPE_STRINGS[dtype.name], "nullable": True}
+ if is_datetime64_dtype(dtype):
+ unit = np.datetime_data(dtype)[0]
+ type_info["precision"] = self._TIMESTAMP_PRECISION[unit]
+ return type_info
except KeyError:
raise TypeError(f"Unsupported dtype: {dtype}")
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
index ad57dba7c8a..e9a0fe2bb2e 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py
@@ -45,12 +45,14 @@
)
from .utils import (
ColNameCodec,
+ maybe_range,
arrow_to_pandas,
check_join_supported,
check_cols_to_join,
get_data_for_join_by_index,
build_categorical_from_at,
)
+from ..db_worker import DbTable
from ..partitioning.partition_manager import HdkOnNativeDataframePartitionManager
from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype
from modin.error_message import ErrorMessage
@@ -80,6 +82,7 @@
is_cmp_op,
)
from modin.pandas.utils import check_both_not_none
+from modin.pandas.indexing import is_range_like
IDX_COL_NAME = ColNameCodec.IDX_COL_NAME
ROWID_COL_NAME = ColNameCodec.ROWID_COL_NAME
@@ -167,8 +170,8 @@ class HdkOnNativeDataframe(PandasDataframe):
_force_execution_mode : str or None
Used by tests to control frame's execution process. Value "lazy"
is used to raise RuntimeError if execution is triggered for the frame.
- Value "arrow" is used to raise RuntimeError execution is triggered
- and cannot be done using Arrow API (have to use HDK for execution).
+ The values "arrow" and "hdk" are used to force the corresponding
+ execution mode.
"""
_query_compiler_cls = DFAlgQueryCompiler
@@ -250,6 +253,7 @@ def copy(
dtypes=no_default,
op=no_default,
index_cols=no_default,
+ uses_rowid=no_default,
):
"""
Copy this DataFrame.
@@ -271,6 +275,9 @@ def copy(
index_cols : list of str, optional
A list of columns included into the frame's index. None value means
a default index (row id is used as an index).
+ uses_rowid : bool, optional
+ True for frames which require access to the virtual 'rowid' column
+ for its execution.
Returns
-------
@@ -289,6 +296,8 @@ def copy(
dtypes = self.copy_dtypes_cache()
if index_cols is no_default:
index_cols = self._index_cols
+ if uses_rowid is no_default:
+ uses_rowid = self._uses_rowid
return self.__constructor__(
partitions=partitions,
index=index,
@@ -298,7 +307,7 @@ def copy(
dtypes=dtypes,
op=op,
index_cols=index_cols,
- uses_rowid=self._uses_rowid,
+ uses_rowid=uses_rowid,
force_execution_mode=self._force_execution_mode,
has_unsupported_data=self._has_unsupported_data,
)
@@ -419,15 +428,50 @@ def take_2d_labels_or_positional(
if row_labels is not None:
raise NotImplementedError("Row labels masking is not yet supported")
- if row_positions is not None:
- base = base._maybe_materialize_rowid()
- op = MaskNode(base, row_labels=row_labels, row_positions=row_positions)
- return self.__constructor__(
- columns=base.columns,
- dtypes=base.copy_dtypes_cache(),
- op=op,
- index_cols=base._index_cols,
- force_execution_mode=base._force_execution_mode,
+ if row_positions is None:
+ return base
+
+ row_positions = maybe_range(row_positions)
+ base = base._maybe_materialize_rowid()
+ op = MaskNode(base, row_labels=row_labels, row_positions=row_positions)
+ base = self.__constructor__(
+ columns=base.columns,
+ dtypes=base.copy_dtypes_cache(),
+ op=op,
+ index_cols=base._index_cols,
+ force_execution_mode=base._force_execution_mode,
+ )
+
+ # Reverse the frame rows, if performing a reverse order selection via HDK.
+ if (
+ is_range_like(row_positions) and row_positions.step < 0
+ ) and not base._can_execute_arrow():
+ cols = base.columns
+ table_cols = base._table_cols
+ # Add the rowid column
+ rowid_col = "__tmp_rowid__"
+ while rowid_col in table_cols:
+ rowid_col += "1"
+ exprs = base._index_exprs()
+ exprs[rowid_col] = base.ref(ROWID_COL_NAME)
+ for col in cols:
+ exprs[col] = base.ref(col)
+ base = base.copy(
+ columns=[rowid_col] + base.columns.tolist(),
+ dtypes=base._dtypes_for_exprs(exprs),
+ op=TransformNode(base, exprs),
+ uses_rowid=True,
+ )
+ # Sort by the rowid column
+ base = base.copy(op=SortNode(base, [rowid_col], [False], "last"))
+ # Remove the rowid column
+ exprs = OrderedDict()
+ for col in table_cols:
+ exprs[col] = base.ref(col)
+ base = base.copy(
+ columns=cols,
+ dtypes=base._dtypes_for_exprs(exprs),
+ op=TransformNode(base, exprs),
)
return base
@@ -463,8 +507,10 @@ def _dtypes_for_exprs(self, exprs):
def _maybe_update_proxies(self, dtypes, new_parent=None):
if new_parent is not None:
super()._maybe_update_proxies(dtypes, new_parent)
- elif self._has_arrow_table():
- table = self._partitions[0, 0].get()
+ if self._partitions is None:
+ return
+ table = self._partitions[0][0].get()
+ if isinstance(table, pyarrow.Table):
super()._maybe_update_proxies(dtypes, new_parent=table)
def groupby_agg(self, by, axis, agg, groupby_args, **kwargs):
@@ -1373,10 +1419,15 @@ def _join_arrow_columns(self, other_modin_frames):
if all(
f._index_cols is None
# Make sure all the frames have an arrow table in partitions.
- and isinstance(f._execute(), pyarrow.Table)
+ and isinstance(f._execute(), (DbTable, pyarrow.Table))
for f in frames
):
- tables = [f._partitions[0][0].get() for f in frames]
+ tables = [
+ t
+ if isinstance(t := f._partitions[0][0].get(), pyarrow.Table)
+ else t.to_arrow()
+ for f in frames
+ ]
column_names = [c for t in tables for c in t.column_names]
if len(column_names) != len(set(column_names)):
raise NotImplementedError("Duplicate column names")
@@ -1616,7 +1667,7 @@ def cat_codes(self):
The new frame.
"""
assert len(self.columns) == 1
- assert self._dtypes[-1] == "category"
+ assert is_categorical_dtype(self._dtypes[-1])
exprs = self._index_exprs()
col_expr = self.ref(self.columns[-1])
@@ -1818,6 +1869,20 @@ def filter(self, key):
force_execution_mode=self._force_execution_mode,
)
+ def force_import(self) -> DbTable:
+ """
+ Force table import.
+
+ Returns
+ -------
+ DbTable
+ The imported table.
+ """
+ if self._has_unsupported_data:
+ raise NotImplementedError("Unable to import a frame with unsupported data")
+ self._execute()
+ return self._partition_mgr_cls.import_table(self)
+
def _maybe_materialize_rowid(self):
"""
Materialize virtual 'rowid' column if frame uses it as an index.
@@ -1923,7 +1988,7 @@ def _execute(self):
Returns
-------
- pyarrow.Table or pandas.Dataframe
+ DbTable or pyarrow.Table or pandas.Dataframe
"""
if isinstance(self._op, FrameNode):
return self._op.execute_arrow()
@@ -1938,7 +2003,7 @@ def _execute(self):
if isinstance(frame._op, FrameNode):
result = frame._op.execute_arrow()
continue
- if not frame._op.can_execute_hdk():
+ if not frame._op.can_execute_hdk() and stack[-1] != frame._materialize:
stack.append(frame._materialize)
if frame._uses_rowid or frame._op.require_executed_base():
for i in reversed(frame._op.input):
@@ -1955,24 +2020,25 @@ def _materialize(self):
Returns
-------
- pyarrow.Table
+ DbTable or pyarrow.Table
"""
- assert (
- self._force_execution_mode != "lazy"
- ), "Unexpected execution triggered on lazy frame!"
+ mode = self._force_execution_mode
+ assert mode != "lazy", "Unexpected execution triggered on lazy frame!"
+
+ if isinstance(self._op, FrameNode):
+ return self._op.execute_arrow()
- if self._force_execution_mode != "hdk" and self._can_execute_arrow():
+ if (
+ mode == "arrow"
+ or not self._op.can_execute_hdk()
+ or (self._can_execute_arrow() and mode != "hdk")
+ ):
new_table = self._execute_arrow()
partitions = self._partition_mgr_cls.from_arrow(
new_table, unsupported_cols=[], encode_col_names=False
)[0]
else:
- assert (
- self._force_execution_mode != "arrow"
- ), "Forced arrow execution failed!"
- partitions = self._partition_mgr_cls.run_exec_plan(
- self._op, self._table_cols
- )
+ partitions = self._partition_mgr_cls.run_exec_plan(self._op)
self._partitions = partitions
self._op = FrameNode(self)
@@ -1989,6 +2055,9 @@ def _can_execute_arrow(self):
-------
bool
"""
+ if self._force_execution_mode == "hdk":
+ return False
+
stack = [self]
while stack:
op = stack.pop()._op
@@ -2014,7 +2083,8 @@ def _execute_arrow(self):
frame = stack.pop()
if callable(frame):
- result = frame(result)
+ if isinstance(result := frame(result), DbTable):
+ result = result.to_arrow()
elif input := getattr(frame._op, "input", None):
if len(input) == 1:
stack.append(frame._op.execute_arrow)
@@ -2045,45 +2115,37 @@ def to_arrow(result, op=frame._op, tables=[], frames=iter(input)):
return result
to_arrow(result)
- else:
- result = frame._op.execute_arrow(result)
+ elif isinstance(result := frame._op.execute_arrow(result), DbTable):
+ result = result.to_arrow()
return result
def _build_index_cache(self):
- """
- Materialize index and store it in the cache.
-
- Can only be called for materialized frames.
- """
- assert isinstance(self._op, FrameNode)
+ """Materialize index and store it in the cache."""
+ obj = self._execute()
- if self._partitions is None:
- self.set_index_cache(Index.__new__(Index))
+ if self._index_cols is None:
+ self.set_index_cache(Index.__new__(RangeIndex, data=range(len(obj))))
+ return
+ if isinstance(obj, DbTable):
+ # TODO: Get the index columns only
+ obj = obj.to_arrow()
+ if isinstance(obj, pyarrow.Table):
+ # The index columns must be in the beginning of the list
+ col_names = obj.column_names[len(self._index_cols) :]
+ index_at = obj.drop(col_names)
+ index_df = index_at.to_pandas()
+ index_df.set_index(self._index_cols, inplace=True)
+ idx = index_df.index
+ idx.rename(demangle_index_names(self._index_cols), inplace=True)
+ if (
+ isinstance(idx, (pd.DatetimeIndex, pd.TimedeltaIndex))
+ and len(idx) >= 3 # infer_freq() requires at least 3 values
+ ):
+ idx.freq = pd.infer_freq(idx)
+ self.set_index_cache(idx)
else:
- obj = self._partitions[0][0].get()
- if isinstance(obj, (pd.DataFrame, pd.Series)):
- self.set_index_cache(obj.index)
- else:
- assert isinstance(obj, pyarrow.Table)
- if self._index_cols is None:
- self.set_index_cache(
- Index.__new__(RangeIndex, data=range(obj.num_rows))
- )
- else:
- # The index columns must be in the beginning of the list
- col_names = obj.column_names[len(self._index_cols) :]
- index_at = obj.drop(col_names)
- index_df = index_at.to_pandas()
- index_df.set_index(self._index_cols, inplace=True)
- idx = index_df.index
- idx.rename(demangle_index_names(self._index_cols), inplace=True)
- if (
- isinstance(idx, (pd.DatetimeIndex, pd.TimedeltaIndex))
- and len(idx) >= 3 # infer_freq() requires at least 3 values
- ):
- idx.freq = pd.infer_freq(idx)
- self.set_index_cache(idx)
+ self.set_index_cache(obj.index)
def _get_index(self):
"""
@@ -2095,7 +2157,6 @@ def _get_index(self):
-------
pandas.Index
"""
- self._execute()
if not self.has_index_cache:
self._build_index_cache()
return self._index_cache.get()
@@ -2125,9 +2186,7 @@ def _set_index(self, new_index):
"HdkOnNativeDataframe._set_index is not yet suported"
)
else:
- assert isinstance(obj, pyarrow.Table)
-
- at = obj
+ at = obj if isinstance(obj, pyarrow.Table) else obj.to_arrow()
if self._index_cols:
at = at.drop(self._index_cols)
@@ -2493,6 +2552,8 @@ def to_pandas(self):
obj = self._execute()
+ if isinstance(obj, DbTable):
+ obj = obj.to_arrow()
if isinstance(obj, pyarrow.Table):
# If the table is exported from HDK, the string columns are converted
# to dictionary. On conversion to pandas, these columns will be of type
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py
index 161b9be8998..5fb2d01dfe8 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py
@@ -29,6 +29,7 @@
import pyarrow as pa
from pyarrow.types import is_dictionary
+from modin.pandas.indexing import is_range_like
from modin.utils import MODIN_UNNAMED_SERIES_LABEL
EMPTY_ARROW_TABLE = pa.Table.from_pandas(pandas.DataFrame({}))
@@ -428,12 +429,12 @@ def to_empty_pandas_df(df):
index_cols = None
else:
index_cols = ColNameCodec.mangle_index_names(merged.index.names)
- for orig_name, mangled_name in zip(merged.index.names, index_cols):
+ for name in index_cols:
# Using _dtypes here since it contains all column names,
# including the index.
- df = left if mangled_name in left._dtypes else right
- exprs[orig_name] = df.ref(mangled_name)
- new_dtypes.append(df._dtypes[mangled_name])
+ df = left if name in left._dtypes else right
+ exprs[name] = df.ref(name)
+ new_dtypes.append(df._dtypes[name])
left_col_names = set(left.columns)
right_col_names = set(right.columns)
@@ -465,6 +466,30 @@ def to_empty_pandas_df(df):
return index_cols, exprs, new_dtypes, merged.columns
+def maybe_range(numbers: Union[List[int], range]) -> Union[List[int], range]:
+ """
+ Try to convert the specified sequence of numbers to a range.
+
+ Parameters
+ ----------
+ numbers : list of ints or range
+
+ Returns
+ -------
+ list of ints or range
+ """
+ if len(numbers) > 2 and not is_range_like(numbers):
+ diff = numbers[1] - numbers[0]
+ is_range = True
+ for i in range(2, len(numbers)):
+ if (numbers[i] - numbers[i - 1]) != diff:
+ is_range = False
+ break
+ if is_range:
+ numbers = range(numbers[0], numbers[-1] + diff, diff)
+ return numbers
+
+
def to_arrow_type(dtype) -> pa.lib.DataType:
"""
Convert the specified dtype to arrow.
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py
index 02ce4f7f68c..80bb792b7cd 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py
@@ -12,6 +12,7 @@
# governing permissions and limitations under the License.
"""Module chooses a proper worker class."""
+from .base_worker import DbTable
from .hdk_worker import HdkWorker as DbWorker
-__all__ = ["DbWorker"]
+__all__ = ["DbTable", "DbWorker"]
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py
index 82a56ba563b..da5e068eac0 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py
@@ -30,6 +30,7 @@
from .expr import InputRefExpr, LiteralExpr, OpExpr
from .dataframe.utils import ColNameCodec, EMPTY_ARROW_TABLE, get_common_arrow_type
+from .db_worker import DbTable
if TYPE_CHECKING:
from .dataframe.dataframe import HdkOnNativeDataframe
@@ -416,12 +417,12 @@ def __init__(self, modin_frame: "HdkOnNativeDataframe"):
def can_execute_arrow(self) -> bool:
return self.modin_frame._has_arrow_table()
- def execute_arrow(self, ignore=None) -> Union[pa.Table, pandas.DataFrame]:
+ def execute_arrow(self, ignore=None) -> Union[DbTable, pa.Table, pandas.DataFrame]:
"""
Materialized frame.
If `can_execute_arrow` returns True, this method returns an arrow table,
- otherwise - a pandas Dataframe.
+ otherwise - a pandas DataFrame or DbTable.
Parameters
----------
@@ -429,7 +430,7 @@ def execute_arrow(self, ignore=None) -> Union[pa.Table, pandas.DataFrame]:
Returns
-------
- pa.Table or pandas.Dataframe
+ DbTable or pa.Table or pandas.DataFrame
"""
frame = self.modin_frame
if frame._partitions is not None:
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py
index 698db1be538..58addb5c7ac 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py
@@ -31,8 +31,10 @@
is_categorical_dtype,
is_datetime64_any_dtype,
is_bool_dtype,
+ is_datetime64_dtype,
)
+from modin.pandas.indexing import is_range_like
from modin.utils import _inherit_docstrings
from .dataframe.utils import ColNameCodec, to_arrow_type
@@ -65,6 +67,8 @@ def _get_common_dtype(lhs_dtype, rhs_dtype):
return get_dtype(float)
if is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype):
return get_dtype(int)
+ if is_datetime64_dtype(lhs_dtype) and is_datetime64_dtype(rhs_dtype):
+ return np.promote_types(lhs_dtype, rhs_dtype)
raise NotImplementedError(
f"Cannot perform operation on types: {lhs_dtype}, {rhs_dtype}"
)
@@ -72,7 +76,7 @@ def _get_common_dtype(lhs_dtype, rhs_dtype):
_aggs_preserving_numeric_type = {"sum", "min", "max"}
_aggs_with_int_result = {"count", "size"}
-_aggs_with_float_result = {"mean", "std", "skew"}
+_aggs_with_float_result = {"mean", "median", "std", "skew"}
def _agg_dtype(agg, dtype):
@@ -801,45 +805,48 @@ class LiteralExpr(BaseExpr):
Parameters
----------
- val : int, np.int, float, bool, str or None
+ val : int, np.int, float, bool, str, np.datetime64 or None
Literal value.
dtype : None or dtype, default: None
Value dtype.
Attributes
----------
- val : int, np.int, float, bool, str or None
+ val : int, np.int, float, bool, str, np.datetime64 or None
Literal value.
_dtype : dtype
Literal data type.
"""
def __init__(self, val, dtype=None):
- if dtype is None:
- if val is not None and not isinstance(
- val,
- (
- int,
- float,
- bool,
- str,
- np.int8,
- np.int16,
- np.int32,
- np.int64,
- np.uint8,
- np.uint16,
- np.uint32,
- np.uint64,
- ),
- ):
- raise NotImplementedError(f"Literal value {val} of type {type(val)}")
- if val is None:
- dtype = get_dtype(float)
- else:
- dtype = get_dtype(type(val))
+ if val is not None and not isinstance(
+ val,
+ (
+ int,
+ float,
+ bool,
+ str,
+ np.int8,
+ np.int16,
+ np.int32,
+ np.int64,
+ np.uint8,
+ np.uint16,
+ np.uint32,
+ np.uint64,
+ np.datetime64,
+ ),
+ ):
+ raise NotImplementedError(f"Literal value {val} of type {type(val)}")
self.val = val
- self._dtype = dtype
+ if dtype is not None:
+ self._dtype = dtype
+ elif val is None:
+ self._dtype = get_dtype(float)
+ else:
+ self._dtype = (
+ val.dtype if isinstance(val, np.generic) else get_dtype(type(val))
+ )
def copy(self):
"""
@@ -857,8 +864,21 @@ def fold(self):
@_inherit_docstrings(BaseExpr.cast)
def cast(self, res_type):
- dtype = np.dtype(res_type)
- return LiteralExpr(dtype.type(self.val), dtype)
+ val = self.val
+ if val is not None:
+ if isinstance(val, np.generic):
+ val = val.astype(res_type)
+ elif is_integer_dtype(res_type):
+ val = int(val)
+ elif is_float_dtype(res_type):
+ val = float(val)
+ elif is_bool_dtype(res_type):
+ val = bool(val)
+ elif is_string_dtype(res_type):
+ val = str(val)
+ else:
+ raise TypeError(f"Cannot cast '{val}' to '{res_type}'")
+ return LiteralExpr(val, res_type)
@_inherit_docstrings(BaseExpr.is_null)
def is_null(self):
@@ -1308,8 +1328,17 @@ def build_row_idx_filter_expr(row_idx, row_col):
if not is_list_like(row_idx):
return row_col.eq(row_idx)
- if isinstance(row_idx, (pandas.RangeIndex, range)) and row_idx.step == 1:
- exprs = [row_col.ge(row_idx[0]), row_col.le(row_idx[-1])]
+ if is_range_like(row_idx):
+ start = row_idx[0]
+ stop = row_idx[-1]
+ step = row_idx.step
+ if step < 0:
+ start, stop = stop, start
+ step = -step
+ exprs = [row_col.ge(start), row_col.le(stop)]
+ if step > 1:
+ mod = OpExpr("MOD", [row_col, LiteralExpr(step)], get_dtype(int))
+ exprs.append(mod.eq(0))
return OpExpr("AND", exprs, get_dtype(bool))
exprs = [row_col.eq(idx) for idx in row_idx]
@@ -1336,6 +1365,11 @@ def build_if_then_else(cond, then_val, else_val, res_type):
BaseExpr
The conditional operator expression.
"""
+ if is_datetime64_dtype(res_type):
+ if then_val._dtype != res_type:
+ then_val = then_val.cast(res_type)
+ if else_val._dtype != res_type:
+ else_val = else_val.cast(res_type)
return OpExpr("CASE", [cond, then_val, else_val], res_type)
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py
index 24e930ac46d..619e9359c7c 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py
@@ -12,94 +12,171 @@
# governing permissions and limitations under the License.
"""Module provides ``HdkWorker`` class."""
+from typing import Optional, Tuple, List, Union
-import pyhdk
+import pyarrow as pa
+import os
-from .base_worker import BaseDbWorker
+from pyhdk.hdk import HDK, QueryNode, ExecutionResult, RelAlgExecutor
+
+from .base_worker import DbTable, BaseDbWorker
from modin.utils import _inherit_docstrings
-from modin.config import HdkLaunchParameters
+from modin.config import HdkLaunchParameters, OmnisciFragmentSize, HdkFragmentSize
+
+
+class HdkTable(DbTable):
+ """
+ Represents a table in the HDK database.
+
+ Parameters
+ ----------
+ table : QueryNode or ExecutionResult
+ """
+
+ def __init__(self, table: Union[QueryNode, ExecutionResult]):
+ self.name = table.table_name
+ self._table = table
+
+ def __del__(self):
+ """Drop table."""
+ # The ExecutionResults are cleared by HDK.
+ if not isinstance(self._table, ExecutionResult):
+ HdkWorker.dropTable(self.name)
+
+ @property
+ @_inherit_docstrings(DbTable.shape)
+ def shape(self) -> Tuple[int, int]:
+ shape = getattr(self, "_shape", None)
+ if shape is None:
+ self._shape = shape = self.scan().shape
+ return shape
+
+ @property
+ @_inherit_docstrings(DbTable.column_names)
+ def column_names(self) -> List[str]:
+ names = getattr(self, "_column_names", None)
+ if names is None:
+ self._column_names = names = list(self.scan().schema)
+ return names
+
+ @_inherit_docstrings(DbTable.to_arrow)
+ def to_arrow(self) -> pa.Table:
+ return (
+ self._table.to_arrow()
+ if isinstance(self._table, ExecutionResult)
+ else self._table.run().to_arrow()
+ )
+
+ def scan(self):
+ """
+ Return a scan query node referencing this table.
+
+ Returns
+ -------
+ QueryNode
+ """
+ if isinstance(self._table, QueryNode):
+ return self._table
+ scan = getattr(self, "_scan", None)
+ if scan is None:
+ self._scan = scan = HdkWorker._hdk().scan(self.name)
+ return scan
@_inherit_docstrings(BaseDbWorker)
-class HdkWorker(BaseDbWorker):
+class HdkWorker(BaseDbWorker): # noqa: PR01
"""PyHDK based wrapper class for HDK storage format."""
- _config = None
- _storage = None
- _data_mgr = None
- _calcite = None
- _executor = None
+ def __new__(cls, *args, **kwargs):
+ instance = getattr(cls, "_instance", None)
+ if instance is None:
+ cls._instance = instance = object.__new__(cls)
+ return instance
@classmethod
- def setup_engine(cls):
- """
- Initialize PyHDK.
+ def dropTable(cls, name: str):
+ cls.dropTable = cls._hdk().drop_table
+ cls.dropTable(name)
- Do nothing if it is initiliazed already.
- """
- if cls._executor is None:
- cls._config = pyhdk.buildConfig(**HdkLaunchParameters.get())
- cls._storage = pyhdk.storage.ArrowStorage(1)
- cls._data_mgr = pyhdk.storage.DataMgr(cls._config)
- cls._data_mgr.registerDataProvider(cls._storage)
-
- cls._calcite = pyhdk.sql.Calcite(cls._storage, cls._config)
- cls._executor = pyhdk.Executor(cls._data_mgr, cls._config)
+ @classmethod
+ def executeDML(cls, query: str):
+ return cls.executeRA(query, True)
- def __init__(self):
- """Initialize HDK storage format."""
- self.setup_engine()
+ @classmethod
+ def executeRA(cls, query: str, exec_calcite=False):
+ hdk = cls._hdk()
+ if exec_calcite or query.startswith("execute calcite"):
+ ra = hdk._calcite.process(query, db_name="hdk", legacy_syntax=True)
+ else:
+ ra = query
+ ra_executor = RelAlgExecutor(hdk._executor, hdk._schema_mgr, hdk._data_mgr, ra)
+ return HdkTable(ra_executor.execute(device_type=cls._preferred_device))
@classmethod
- def dropTable(cls, name):
- cls._storage.dropTable(name)
+ def import_arrow_table(cls, table: pa.Table, name: Optional[str] = None):
+ name = cls._genName(name)
+ table = cls.cast_to_compatible_types(table)
+ fragment_size = cls.compute_fragment_size(table)
+ return HdkTable(cls._hdk().import_arrow(table, name, fragment_size))
@classmethod
- def _executeRelAlgJson(cls, ra):
+ def compute_fragment_size(cls, table):
"""
- Execute RelAlg JSON query.
+ Compute fragment size to be used for table import.
Parameters
----------
- ra : str
- RelAlg JSON string.
+ table : pyarrow.Table
+ A table to import.
Returns
-------
- pyarrow.Table
- Execution result.
+ int
+ Fragment size to use for import.
"""
- rel_alg_executor = pyhdk.sql.RelAlgExecutor(
- cls._executor, cls._storage, cls._data_mgr, ra
- )
- res = rel_alg_executor.execute()
- return res.to_arrow()
-
- @classmethod
- def executeDML(cls, query):
- ra = cls._calcite.process(query, db_name="hdk")
- return cls._executeRelAlgJson(ra)
-
- @classmethod
- def executeRA(cls, query):
- if query.startswith("execute relalg"):
- # 14 == len("execute relalg")
- ra = query[14:]
+ fragment_size = HdkFragmentSize.get()
+ if fragment_size is None:
+ fragment_size = OmnisciFragmentSize.get()
+ if fragment_size is None:
+ if bool(HdkLaunchParameters.get()["cpu_only"]):
+ cpu_count = os.cpu_count()
+ if cpu_count is not None:
+ fragment_size = table.num_rows // cpu_count
+ fragment_size = min(fragment_size, 2**25)
+ fragment_size = max(fragment_size, 2**18)
+ else:
+ fragment_size = 0
+ else:
+ fragment_size = 2**25
else:
- assert query.startswith("execute calcite")
- ra = cls._calcite.process(query, db_name="hdk")
-
- return cls._executeRelAlgJson(ra)
+ fragment_size = int(fragment_size)
+ return fragment_size
@classmethod
- def import_arrow_table(cls, table, name=None):
- name = cls._genName(name)
+ def _hdk(cls) -> HDK:
+ """
+ Initialize and return an HDK instance.
- table = cls.cast_to_compatible_types(table)
- fragment_size = cls.compute_fragment_size(table)
+ Returns
+ -------
+ HDK
+ """
+ params = HdkLaunchParameters.get()
+ cls._preferred_device = (
+ "CPU" if bool(HdkLaunchParameters.get()["cpu_only"]) else "GPU"
+ )
+ cls._hdk_instance = HDK(**params)
+ cls._hdk = cls._get_hdk_instance
+ return cls._hdk()
- opt = pyhdk.storage.TableOptions(fragment_size)
- cls._storage.importArrowTable(table, name, opt)
+ @classmethod
+ def _get_hdk_instance(cls) -> HDK:
+ """
+ Return the initialized HDK instance.
- return name
+ Returns
+ -------
+ HDK
+ """
+ return cls._hdk_instance
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py
index 88af4ff509b..3003eef70fc 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py
@@ -210,6 +210,8 @@ def _pyarrow_table(self) -> pa.Table:
pyarrow.Table
"""
at = self._df._execute()
+ if not isinstance(at, pa.Table):
+ at = at.to_arrow()
assert at is not None
return at
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py
index 12c57db9673..bb5702f2af3 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py
@@ -12,7 +12,7 @@
# governing permissions and limitations under the License.
"""Module provides a partition class for ``HdkOnNativeDataframe`` frame."""
-from typing import Optional, Union
+from typing import Union
import pandas
@@ -20,29 +20,24 @@
from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition
from ..dataframe.utils import arrow_to_pandas
-from ..db_worker import DbWorker
+from ..db_worker import DbTable
class HdkOnNativeDataframePartition(PandasDataframePartition):
"""
A partition of ``HdkOnNativeDataframe`` frame.
- Class holds either a ``pandas.DataFrame`` or ``pyarrow.Table``.
+ Class holds either a ``DbTable`` or ``pandas.DataFrame`` or ``pyarrow.Table``.
Parameters
----------
- data : pandas.DataFrame or pyarrow.Table
+ data : DbTable or pandas.DataFrame or pyarrow.Table
Partition data in either pandas or PyArrow format.
- frame_id : str, optional
- A corresponding HDK table name or None.
Attributes
----------
- _data : pandas.DataFrame or pyarrow.Table
+ _data : DbTable or pandas.DataFrame or pyarrow.Table
Partition data in either pandas or PyArrow format.
- frame_id : str
- A corresponding HDK table name if partition was imported
- into HDK. Otherwise None.
_length_cache : int
Length of the partition.
_width_cache : int
@@ -51,23 +46,11 @@ class HdkOnNativeDataframePartition(PandasDataframePartition):
def __init__(
self,
- data: Union[pa.Table, pandas.DataFrame],
- frame_id: Optional[str] = None,
+ data: Union[DbTable, pa.Table, pandas.DataFrame],
):
+ super().__init__()
+ assert isinstance(data, (DbTable, pa.Table, pandas.DataFrame))
self._data = data
- self.frame_id = frame_id
- if isinstance(data, pa.Table):
- self._length_cache = data.num_rows
- self._width_cache = data.num_columns
- else:
- assert isinstance(data, pandas.DataFrame)
- self._length_cache = len(data)
- self._width_cache = len(data.columns)
-
- def __del__(self):
- """Deallocate HDK resources related to the partition."""
- if self.frame_id is not None:
- DbWorker.dropTable(self.frame_id)
def to_pandas(self):
"""
@@ -80,7 +63,8 @@ def to_pandas(self):
obj = self.get()
if isinstance(obj, pandas.DataFrame):
return obj
- assert isinstance(obj, pa.Table)
+ if isinstance(obj, DbTable):
+ obj = obj.to_arrow()
return arrow_to_pandas(obj)
def to_numpy(self, **kwargs):
@@ -104,18 +88,18 @@ def get(self):
Returns
-------
- pandas.DataFrame or pyarrow.Table
+ DbTable or pandas.DataFrame or pyarrow.Table
"""
return self._data
@classmethod
def put(cls, obj):
"""
- Create partition from ``pandas.DataFrame`` or ``pyarrow.Table``.
+ Create partition from ``DbTable`` or ``pandas.DataFrame`` or ``pyarrow.Table``.
Parameters
----------
- obj : pandas.DataFrame or pyarrow.Table
+ obj : DbTable or pandas.DataFrame or pyarrow.Table
Source frame.
Returns
@@ -124,3 +108,28 @@ def put(cls, obj):
The new partition.
"""
return cls(obj)
+
+ @property
+ def _length_cache(self):
+ """
+ Number of rows.
+
+ Returns
+ -------
+ int
+ """
+ return len(self._data)
+
+ @property
+ def _width_cache(self):
+ """
+ Number of columns.
+
+ Returns
+ -------
+ int
+ """
+ if isinstance(self._data, pa.Table):
+ return self._data.num_columns
+ else:
+ return self._data.shape[1]
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py
index ad7c995a532..dd242bee9d3 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py
@@ -22,7 +22,7 @@
)
from ..dataframe.utils import ColNameCodec
from ..partitioning.partition import HdkOnNativeDataframePartition
-from ..db_worker import DbWorker
+from ..db_worker import DbTable, DbWorker
from ..calcite_builder import CalciteBuilder
from ..calcite_serializer import CalciteSerializer
from modin.config import DoUseCalcite
@@ -227,7 +227,7 @@ def is_supported_dtype(dtype):
)
@classmethod
- def run_exec_plan(cls, plan, columns):
+ def run_exec_plan(cls, plan):
"""
Run execution plan in HDK storage format to materialize frame.
@@ -235,61 +235,64 @@ def run_exec_plan(cls, plan, columns):
----------
plan : DFAlgNode
A root of an execution plan tree.
- columns : list of str
- A frame column names.
Returns
-------
np.array
Created frame's partitions.
"""
- omniSession = DbWorker()
+ worker = DbWorker()
# First step is to make sure all partitions are in HDK.
frames = plan.collect_frames()
for frame in frames:
- for p in frame._partitions.flatten():
- if p.frame_id is None:
- obj = p.get()
- if isinstance(obj, (pandas.DataFrame, pandas.Series)):
- p.frame_id = omniSession.import_pandas_dataframe(obj)
- else:
- assert isinstance(obj, pyarrow.Table)
- if obj.num_columns == 0:
- # Tables without columns are not supported.
- # Creating an empty table with index columns only.
- idx_names = (
- frame.index.names
- if frame.has_materialized_index
- else [None]
- )
- idx_names = ColNameCodec.mangle_index_names(idx_names)
- obj = pyarrow.table(
- {n: [] for n in idx_names},
- schema=pyarrow.schema(
- {n: pyarrow.int64() for n in idx_names}
- ),
- )
- p.frame_id = omniSession.import_arrow_table(obj)
+ cls.import_table(frame, worker)
calcite_plan = CalciteBuilder().build(plan)
calcite_json = CalciteSerializer().serialize(calcite_plan)
-
- cmd_prefix = "execute relalg "
-
if DoUseCalcite.get():
- cmd_prefix = "execute calcite "
-
- at = omniSession.executeRA(cmd_prefix + calcite_json)
+ calcite_json = "execute calcite " + calcite_json
+ table = worker.executeRA(calcite_json)
res = np.empty((1, 1), dtype=np.dtype(object))
- # workaround for https://github.com/modin-project/modin/issues/1851
- if DoUseCalcite.get():
- at = at.rename_columns([ColNameCodec.encode(c) for c in columns])
- res[0][0] = cls._partition_class(at)
+ res[0][0] = cls._partition_class(table)
return res
+ @classmethod
+ def import_table(cls, frame, worker=DbWorker()) -> DbTable:
+ """
+ Import the frame's partition data, if required.
+
+ Parameters
+ ----------
+ frame : HdkOnNativeDataframe
+ worker : DbWorker, optional
+
+ Returns
+ -------
+ DbTable
+ """
+ table = frame._partitions[0][0].get()
+ if isinstance(table, pandas.DataFrame):
+ table = worker.import_pandas_dataframe(table)
+ frame._partitions[0][0] = cls._partition_class(table)
+ elif isinstance(table, pyarrow.Table):
+ if table.num_columns == 0:
+ # Tables without columns are not supported.
+ # Creating an empty table with index columns only.
+ idx_names = (
+ frame.index.names if frame.has_materialized_index else [None]
+ )
+ idx_names = ColNameCodec.mangle_index_names(idx_names)
+ table = pyarrow.table(
+ {n: [] for n in idx_names},
+ schema=pyarrow.schema({n: pyarrow.int64() for n in idx_names}),
+ )
+ table = worker.import_arrow_table(table)
+ frame._partitions[0][0] = cls._partition_class(table)
+ return table
+
@classmethod
def _names_from_index_cols(cls, cols):
"""
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
index 1d42b977485..35dbeb752af 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py
@@ -20,7 +20,7 @@
from pandas._testing import ensure_clean
-from modin.config import StorageFormat, DoUseCalcite
+from modin.config import StorageFormat
from modin.pandas.test.utils import (
io_ops_bad_exc,
default_to_pandas_ignore_string,
@@ -54,6 +54,9 @@
from modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra import (
FrameNode,
)
+from modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_serializer import (
+ CalciteSerializer,
+)
# Our configuration in pytest.ini requires that we explicitly catch all
@@ -221,9 +224,6 @@ def test_null_col(self, null_dtype):
with ForceHdkImport(exp):
exp = to_pandas(exp)
exp["c"] = exp["c"].astype("string")
- # The arrow table contains empty strings, when reading as category.
- assert all(v == "" for v in exp["c"])
- exp["c"] = None
df_equals(ref, exp)
@@ -939,7 +939,7 @@ def groupby(df, **kwargs):
run_and_compare(groupby, data=self.data)
@pytest.mark.parametrize("by", [["a"], ["a", "b", "c"]])
- @pytest.mark.parametrize("agg", ["sum", "size", "mean"])
+ @pytest.mark.parametrize("agg", ["sum", "size", "mean", "median"])
@pytest.mark.parametrize("as_index", [True, False])
def test_groupby_agg_by_col(self, by, agg, as_index):
def simple_agg(df, **kwargs):
@@ -1263,19 +1263,7 @@ def groupby(df, **kwargs):
@pytest.mark.parametrize("invert", [True, False])
@pytest.mark.parametrize("select", [True, False])
@pytest.mark.parametrize("ascending", [None, True, False])
- @pytest.mark.parametrize(
- "use_calcite",
- [
- False,
- pytest.param(
- True,
- marks=pytest.mark.xfail(
- reason="Function ROW_NUMBER() is not yet supported by Calcite"
- ),
- ),
- ],
- )
- def test_head_tail(self, op, n, invert, select, ascending, use_calcite):
+ def test_head_tail(self, op, n, invert, select, ascending):
def head(df, **kwargs):
if invert:
df = df[~df["col3"].isna()]
@@ -1287,13 +1275,8 @@ def head(df, **kwargs):
df = getattr(df, op)(n)
return df.sort_values(list(df.columns))
- orig_value = DoUseCalcite.get()
- DoUseCalcite._value = use_calcite
- try:
- # When invert is false, the rowid column is materialized.
- run_and_compare(head, data=test_data["int_data"], force_lazy=invert)
- finally:
- DoUseCalcite._value = orig_value
+ # When invert is false, the rowid column is materialized.
+ run_and_compare(head, data=test_data["int_data"], force_lazy=invert)
class TestAgg:
@@ -1978,16 +1961,7 @@ def compute(df, operation, **kwargs):
force_hdk_execute=force_hdk,
)
- @pytest.mark.parametrize(
- "force_hdk",
- [
- False,
- pytest.param(
- True,
- marks=pytest.mark.xfail(reason="Invert is not yet supported by HDK"),
- ),
- ],
- )
+ @pytest.mark.parametrize("force_hdk", [False, True])
def test_invert_op(self, force_hdk):
def invert(df, **kwargs):
return ~df
@@ -2037,6 +2011,33 @@ def dt_hour(df, **kwargs):
run_and_compare(dt_hour, data=self.datetime_data)
+ @pytest.mark.parametrize("cast", [True, False])
+ @pytest.mark.parametrize("unit", CalciteSerializer._TIMESTAMP_PRECISION.keys())
+ def test_dt_serialization(self, cast, unit):
+ fill_value = np.datetime64(3, unit)
+
+ def serialize(df, **kwargs):
+ if cast:
+ df = df.astype(f"datetime64[{unit}]")
+ return df.fillna(fill_value)
+
+ def cmp(df1, df2):
+ assert df1["date"].max().asm8 == fill_value
+ assert df2["date"].max().asm8 == fill_value
+ df_equals(df1, df2)
+
+ run_and_compare(
+ serialize,
+ data={
+ "date": [
+ np.datetime64(1, unit),
+ np.datetime64(2, unit),
+ None,
+ ]
+ },
+ comparator=cmp,
+ )
+
class TestCategory:
data = {
diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py
index 0dc5a5134d6..1fa289535cd 100644
--- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py
+++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py
@@ -18,18 +18,12 @@
import datetime
import numpy as np
from pandas.api.types import is_datetime64_any_dtype
-import pyarrow as pa
from modin.pandas.test.utils import (
df_equals,
io_ops_bad_exc,
eval_io as general_eval_io,
)
-from ..df_algebra import FrameNode
-
-from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import (
- DbWorker,
-)
def eval_io(
@@ -58,6 +52,15 @@ def hdk_comparator(df1, df2, **kwargs):
# Aligning DateTime dtypes because of the bug related to the `parse_dates` parameter:
# https://github.com/modin-project/modin/issues/3485
df1, df2 = align_datetime_dtypes(df1, df2)
+
+ # 1. Replace NA with empty strings. HDK treats empty strings and NA equally.
+ # 2. HdkWorker.cast_to_compatible_types() converts all categorical columns to string.
+ for dtype in ("object", "category"):
+ for df in (df1, df2):
+ sdf = df.select_dtypes(dtype)
+ if len(sdf.columns) != 0:
+ sdf = sdf.fillna("") if dtype == "object" else sdf.astype(str)
+ df[sdf.columns] = sdf[sdf.columns]
comparator(df1, df2, **kwargs)
general_eval_io(
@@ -158,27 +161,14 @@ def __init__(self, *dfs):
for df in dfs:
if not isinstance(df, (pd.DataFrame, pd.Series)):
continue
- df.shape # to trigger real execution
if df.empty:
continue
- modin_frame = df._query_compiler._modin_frame
- partition = modin_frame._partitions[0][0]
- if partition.frame_id is not None:
- continue
- frame = partition.get()
- if isinstance(frame, (pandas.DataFrame, pandas.Series)):
- frame = pa.Table.from_pandas(frame)
- if isinstance(frame, pa.Table):
- _, cols = modin_frame._partition_mgr_cls._get_unsupported_cols(frame)
- if len(cols) != 0:
- continue
- frame_id = DbWorker().import_arrow_table(frame)
- else:
- raise TypeError(
- f"Unexpected storage format, expected pandas.DataFrame or pyarrow.Table, got: {type(frame)}."
- )
- partition.frame_id = frame_id
- self._imported_frames.append((df, frame_id))
+ try:
+ modin_frame = df._query_compiler._modin_frame
+ modin_frame.force_import()
+ self._imported_frames.append(df)
+ except NotImplementedError:
+ ...
def __enter__(self):
return self
@@ -194,28 +184,20 @@ def export_frames(self):
that was just exported from HDK.
"""
result = []
- for df, frame_id in self._imported_frames:
+ for df in self._imported_frames:
# Append `TransformNode`` selecting all the columns (SELECT * FROM frame_id)
df = df[df.columns.tolist()]
modin_frame = df._query_compiler._modin_frame
- # Forcibly executing plan via HDK. We can't use `modin_frame._execute()` here
- # as it has a chance of running via pyarrow bypassing HDK
- new_partitions = modin_frame._partition_mgr_cls.run_exec_plan(
- modin_frame._op,
- modin_frame._table_cols,
- )
- modin_frame._partitions = new_partitions
- modin_frame._op = FrameNode(modin_frame)
+ # Forcibly executing plan via HDK.
+ mode = modin_frame._force_execution_mode
+ modin_frame._force_execution_mode = "hdk"
+ modin_frame._execute()
+ modin_frame._force_execution_mode = mode
result.append(df)
return result
def __exit__(self, exc_type, exc_val, exc_tb):
- for df, frame_id in self._imported_frames:
- actual_frame_id = df._query_compiler._modin_frame._partitions[0][0].frame_id
- DbWorker().dropTable(frame_id)
- if actual_frame_id == frame_id:
- df._query_compiler._modin_frame._partitions[0][0].frame_id = None
- self._imported_frames = []
+ self._imported_frames.clear()
def set_execution_mode(frame, mode, recursive=False):
diff --git a/modin/experimental/sql/hdk/query.py b/modin/experimental/sql/hdk/query.py
index 6ede4493d25..ded9a93fa7e 100644
--- a/modin/experimental/sql/hdk/query.py
+++ b/modin/experimental/sql/hdk/query.py
@@ -69,10 +69,10 @@ def hdk_query(query: str, **kwargs) -> pd.DataFrame:
modin.pandas.DataFrame
Execution result.
"""
- worker = HdkWorker()
if len(kwargs) > 0:
- query = _build_query(query, kwargs, worker.import_arrow_table)
- df = from_arrow(worker.executeDML(query))
+ query = _build_query(query, kwargs)
+ table = HdkWorker().executeDML(query)
+ df = from_arrow(table.to_arrow())
mdf = df._query_compiler._modin_frame
schema = mdf._partitions[0][0].get().schema
# HDK returns strings as dictionary. For the proper conversion to
@@ -87,7 +87,7 @@ def hdk_query(query: str, **kwargs) -> pd.DataFrame:
return df
-def _build_query(query: str, frames: dict, import_table: callable) -> str:
+def _build_query(query: str, frames: dict) -> str:
"""
Build query to be executed.
@@ -100,8 +100,6 @@ def _build_query(query: str, frames: dict, import_table: callable) -> str:
SQL query to be processed.
frames : dict
DataFrames referenced by the query.
- import_table : callable
- Used to import tables and assign the table names.
Returns
-------
@@ -112,22 +110,14 @@ def _build_query(query: str, frames: dict, import_table: callable) -> str:
for name, df in frames.items():
assert isinstance(df._query_compiler, DFAlgQueryCompiler)
mf = df._query_compiler._modin_frame
- if not mf._has_arrow_table():
- mf._execute()
- assert mf._has_arrow_table()
- part = mf._partitions[0][0]
- at = part.get()
-
- if part.frame_id is None:
- part.frame_id = import_table(at)
-
+ table = mf.force_import()
alias.append("WITH " if len(alias) == 0 else "\n),\n")
alias.extend((name, " AS (\n", " SELECT\n"))
- for i, col in enumerate(at.column_names):
+ for i, col in enumerate(table.column_names):
alias.append(" " if i == 0 else ",\n ")
alias.extend(('"', col, '"', " AS ", '"', ColNameCodec.decode(col), '"'))
- alias.extend(("\n FROM\n ", part.frame_id))
+ alias.extend(("\n FROM\n ", table.name))
alias.extend(("\n)\n", query))
return "".join(alias)
diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py
index 42f79f0a365..dffc1188d90 100644
--- a/modin/numpy/arr.py
+++ b/modin/numpy/arr.py
@@ -2239,7 +2239,7 @@ def argmax(self, axis=None, out=None, keepdims=None):
na_mask = self._query_compiler.isna().any(axis=apply_axis)
if na_mask.any(axis=apply_axis ^ 1).to_numpy()[0, 0]:
na_idxs = self._query_compiler.isna().idxmax(axis=apply_axis)
- result = na_mask.where(na_idxs, result)
+ result = na_idxs.where(na_mask, result)
new_ndim = self._ndim - 1 if not keepdims else self._ndim
if new_ndim == 0:
result = result.to_numpy()[0, 0]
@@ -2304,7 +2304,7 @@ def argmin(self, axis=None, out=None, keepdims=None):
na_mask = self._query_compiler.isna().any(axis=apply_axis)
if na_mask.any(axis=apply_axis ^ 1).to_numpy()[0, 0]:
na_idxs = self._query_compiler.isna().idxmax(axis=apply_axis)
- result = na_mask.where(na_idxs, result)
+ result = na_idxs.where(na_mask, result)
new_ndim = self._ndim - 1 if not keepdims else self._ndim
if new_ndim == 0:
result = result.to_numpy()[0, 0]
diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py
index f96962741d0..15284225ad2 100644
--- a/modin/numpy/test/test_array.py
+++ b/modin/numpy/test/test_array.py
@@ -271,6 +271,16 @@ def test_array_where():
assert_scalar_or_array_equal(modin_result, numpy_result)
+@pytest.mark.parametrize("method", ["argmax", "argmin"])
+def test_argmax_argmin(method):
+ numpy_arr = numpy.array([[1, 2, 3], [4, 5, np.NaN]])
+ modin_arr = np.array(numpy_arr)
+ assert_scalar_or_array_equal(
+ getattr(np, method)(modin_arr, axis=1),
+ getattr(numpy, method)(numpy_arr, axis=1),
+ )
+
+
def test_flatten():
numpy_flat_arr = numpy.random.randint(-100, 100, size=100)
modin_flat_arr = np.array(numpy_flat_arr)
diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py
index 91457b0c47b..dd04f00f052 100644
--- a/modin/pandas/__init__.py
+++ b/modin/pandas/__init__.py
@@ -13,15 +13,22 @@
import pandas
import warnings
+from packaging import version
-__pandas_version__ = "2.0.2"
+__pandas_version__ = "2.0"
-if pandas.__version__ != __pandas_version__:
+if (
+ version.parse(pandas.__version__).release[:2]
+ != version.parse(__pandas_version__).release[:2]
+):
warnings.warn(
f"The pandas version installed ({pandas.__version__}) does not match the supported pandas version in"
- + f" Modin ({__pandas_version__}). This may cause undesired side effects!"
+ + f" Modin ({__pandas_version__}.X). This may cause undesired side effects!"
)
+# to not pollute namespace
+del version
+
with warnings.catch_warnings():
warnings.simplefilter("ignore")
from pandas import (
@@ -149,6 +156,11 @@ def _update_engine(publisher: Parameter):
initialize_unidist()
elif publisher.get() == "Cloudray":
+ warnings.warn(
+ "Cloud feature is deprecated and will be removed in 0.24.0 release",
+ DeprecationWarning,
+ )
+
from modin.experimental.cloud import get_connection
conn = get_connection()
@@ -174,10 +186,20 @@ def init_remote_ray(partition):
else:
get_connection().modules["modin"].set_execution("Ray", StorageFormat.get())
elif publisher.get() == "Cloudpython":
+ warnings.warn(
+ "Cloud feature is deprecated and will be removed in 0.24.0 release",
+ DeprecationWarning,
+ )
+
from modin.experimental.cloud import get_connection
get_connection().modules["modin"].set_execution("Python")
elif publisher.get() == "Cloudnative":
+ warnings.warn(
+ "Cloud feature is deprecated and will be removed in 0.24.0 release",
+ DeprecationWarning,
+ )
+
from modin.experimental.cloud import get_connection
assert (
diff --git a/modin/pandas/base.py b/modin/pandas/base.py
index d6a513851f2..e6eb94d64c7 100644
--- a/modin/pandas/base.py
+++ b/modin/pandas/base.py
@@ -1083,8 +1083,6 @@ def clip(
if upper is not None and np.any(np.isnan(upper)):
upper = None
if is_list_like(lower) or is_list_like(upper):
- if axis is None:
- raise ValueError("Must specify axis = 0 or 1")
lower = self._validate_other(lower, axis)
upper = self._validate_other(upper, axis)
# FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility
diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py
index 938625c550f..e0738263346 100644
--- a/modin/pandas/dataframe.py
+++ b/modin/pandas/dataframe.py
@@ -759,11 +759,18 @@ def equals(self, other): # noqa: PR01, RT01, D200
if isinstance(other, pandas.DataFrame):
# Copy into a Modin DataFrame to simplify logic below
other = self.__constructor__(other)
- return (
- self.index.equals(other.index)
- and self.columns.equals(other.columns)
- and self.eq(other).all().all()
+
+ if (
+ type(self) != type(other)
+ or not self.index.equals(other.index)
+ or not self.columns.equals(other.columns)
+ ):
+ return False
+
+ result = self.__constructor__(
+ query_compiler=self._query_compiler.equals(other._query_compiler)
)
+ return result.all(axis=None)
def _update_var_dicts_in_kwargs(self, expr, kwargs):
"""
diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py
index 3d208bc02be..3ef68f17c2e 100644
--- a/modin/pandas/groupby.py
+++ b/modin/pandas/groupby.py
@@ -39,6 +39,7 @@
from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy
from modin.config import IsExperimental
from .series import Series
+from .window import RollingGroupby
from .utils import is_label
@@ -135,7 +136,20 @@ def __init__(
}
self._kwargs.update(kwargs)
- def __override(self, **kwargs):
+ def _override(self, **kwargs):
+ """
+ Override groupby parameters.
+
+ Parameters
+ ----------
+ **kwargs : dict
+ Parameters to override.
+
+ Returns
+ -------
+ DataFrameGroupBy
+ A groupby object with new parameters.
+ """
new_kw = dict(
df=self._df,
by=self._by,
@@ -836,7 +850,7 @@ def do_relabel(obj_to_relabel):
# for list-list aggregation pandas always puts
# groups as index in the result, ignoring as_index,
# so we have to reset it to default value
- res = self.__override(as_index=True)._wrap_aggregation(
+ res = self._override(as_index=True)._wrap_aggregation(
qc_method=type(self._query_compiler).groupby_agg,
numeric_only=False,
agg_func=func,
@@ -916,7 +930,7 @@ def var(self, ddof=1, engine=None, engine_kwargs=None, numeric_only=False):
)
def get_group(self, name, obj=None):
- work_object = self.__override(
+ work_object = self._override(
df=obj if obj is not None else self._df, as_index=True
)
@@ -1067,7 +1081,7 @@ def median(self, numeric_only=False):
def head(self, n=5):
# groupby().head()/.tail() ignore as_index, so override it to True
- work_object = self.__override(as_index=True)
+ work_object = self._override(as_index=True)
return work_object._check_index(
work_object._wrap_aggregation(
@@ -1184,7 +1198,7 @@ def cumcount(self, ascending=True):
def tail(self, n=5):
# groupby().head()/.tail() ignore as_index, so override it to True
- work_object = self.__override(as_index=True)
+ work_object = self._override(as_index=True)
return work_object._check_index(
work_object._wrap_aggregation(
type(work_object._query_compiler).groupby_tail,
@@ -1199,7 +1213,7 @@ def expanding(self, *args, **kwargs):
return self._default_to_pandas(lambda df: df.expanding(*args, **kwargs))
def rolling(self, *args, **kwargs):
- return self._default_to_pandas(lambda df: df.rolling(*args, **kwargs))
+ return RollingGroupby(self, *args, **kwargs)
def hist(
self,
diff --git a/modin/pandas/series.py b/modin/pandas/series.py
index d873ab7f488..aa061598657 100644
--- a/modin/pandas/series.py
+++ b/modin/pandas/series.py
@@ -23,6 +23,7 @@
from pandas.core.dtypes.common import (
is_dict_like,
is_list_like,
+ is_categorical_dtype,
)
from pandas.core.series import _coerce_method
from pandas._libs.lib import no_default, NoDefault
@@ -421,7 +422,7 @@ def __repr__(self):
if (
isinstance(temp_df, pandas.Series)
and temp_df.name is not None
- and temp_df.dtype == "category"
+ and is_categorical_dtype(temp_df.dtype)
):
maxsplit = 2
return temp_str.rsplit("\n", maxsplit)[0] + "\n{}{}{}{}".format(
@@ -926,11 +927,26 @@ def equals(self, other): # noqa: PR01, RT01, D200
"""
Test whether two objects contain the same elements.
"""
- return (
- self.name == other.name
- and self.index.equals(other.index)
- and self.eq(other).all()
- )
+ if isinstance(other, pandas.Series):
+ # Copy into a Modin Series to simplify logic below
+ other = self.__constructor__(other)
+
+ if type(self) != type(other) or not self.index.equals(other.index):
+ return False
+
+ old_name_self = self.name
+ old_name_other = other.name
+ try:
+ self.name = "temp_name_for_equals_op"
+ other.name = "temp_name_for_equals_op"
+ # this function should return only scalar
+ res = self.__constructor__(
+ query_compiler=self._query_compiler.equals(other._query_compiler)
+ )
+ finally:
+ self.name = old_name_self
+ other.name = old_name_other
+ return res.all()
def explode(self, ignore_index: bool = False): # noqa: PR01, RT01, D200
"""
@@ -1397,6 +1413,22 @@ def reindex(
fill_value=fill_value,
)
+ def rename_axis(
+ self,
+ mapper=no_default,
+ *,
+ index=no_default,
+ axis=0,
+ copy=True,
+ inplace=False,
+ ): # noqa: PR01, RT01, D200
+ """
+ Set the name of the axis for the index or columns.
+ """
+ return super().rename_axis(
+ mapper=mapper, index=index, axis=axis, copy=copy, inplace=inplace
+ )
+
def rename(
self,
index=None,
diff --git a/modin/pandas/test/data/issue_1930.csv b/modin/pandas/test/data/issue_1930.csv
new file mode 100644
index 00000000000..d61b5455de7
--- /dev/null
+++ b/modin/pandas/test/data/issue_1930.csv
@@ -0,0 +1,5 @@
+,col1,col2,col3,col4,col5
+0,0,4,8,12,0
+1,1,5,9,13,0
+2,2,6,10,14,0
+3,3,7,11,15,0
diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py
index 92d5c25f6e9..c5b9e775d6c 100644
--- a/modin/pandas/test/dataframe/test_binary.py
+++ b/modin/pandas/test/dataframe/test_binary.py
@@ -13,6 +13,7 @@
import pytest
import pandas
+import numpy as np
import matplotlib
import modin.pandas as pd
@@ -223,28 +224,81 @@ def test_multi_level_comparison(data, op):
getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1)
-def test_equals():
- frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 4, 1]}
- modin_df1 = pd.DataFrame(frame_data)
- modin_df2 = pd.DataFrame(frame_data)
-
- assert modin_df1.equals(modin_df2)
-
- df_equals(modin_df1, modin_df2)
- df_equals(modin_df1, pd.DataFrame(modin_df1))
+@pytest.mark.parametrize(
+ "frame1_data,frame2_data,expected_pandas_equals",
+ [
+ pytest.param({}, {}, True, id="two_empty_dataframes"),
+ pytest.param([[1]], [[0]], False, id="single_unequal_values"),
+ pytest.param([[None]], [[None]], True, id="single_none_values"),
+ pytest.param([[np.NaN]], [[np.NaN]], True, id="single_nan_values"),
+ pytest.param({1: [10]}, {1.0: [10]}, True, id="different_column_types"),
+ pytest.param({1: [10]}, {2: [10]}, False, id="different_columns"),
+ pytest.param(
+ pandas.DataFrame({1: [10]}, index=[1]),
+ pandas.DataFrame({1: [10]}, index=[1.0]),
+ True,
+ id="different_index_types",
+ ),
+ pytest.param(
+ pandas.DataFrame({1: [10]}, index=[1]),
+ pandas.DataFrame({1: [10]}, index=[2]),
+ False,
+ id="different_indexes",
+ ),
+ pytest.param({1: [10]}, {1: [10.0]}, False, id="different_value_types"),
+ pytest.param(
+ [[1, 2], [3, 4]],
+ [[1, 2], [3, 4]],
+ True,
+ id="equal_two_by_two_dataframes",
+ ),
+ pytest.param(
+ [[1, 2], [3, 4]],
+ [[5, 2], [3, 4]],
+ False,
+ id="unequal_two_by_two_dataframes",
+ ),
+ pytest.param(
+ [[1, 1]],
+ [[1]],
+ False,
+ id="different_row_lengths",
+ ),
+ pytest.param(
+ [[1], [1]],
+ [[1]],
+ False,
+ id="different_column_lengths",
+ ),
+ ],
+)
+def test_equals(frame1_data, frame2_data, expected_pandas_equals):
+ modin_df1 = pd.DataFrame(frame1_data)
+ pandas_df1 = pandas.DataFrame(frame1_data)
+ modin_df2 = pd.DataFrame(frame2_data)
+ pandas_df2 = pandas.DataFrame(frame2_data)
+
+ pandas_equals = pandas_df1.equals(pandas_df2)
+ assert pandas_equals == expected_pandas_equals, (
+ "Test expected pandas to say the dataframes were"
+ + f"{'' if expected_pandas_equals else ' not'} equal, but they were"
+ + f"{' not' if expected_pandas_equals else ''} equal."
+ )
- frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 5, 1]}
- modin_df3 = pd.DataFrame(frame_data, index=list("abcd"))
+ assert modin_df1.equals(modin_df2) == pandas_equals
+ assert modin_df1.equals(pandas_df2) == pandas_equals
- assert not modin_df1.equals(modin_df3)
- with pytest.raises(AssertionError):
- df_equals(modin_df3, modin_df1)
+def test_equals_several_partitions():
+ modin_series1 = pd.concat([pd.DataFrame([0, 1]), pd.DataFrame([None, 1])])
+ modin_series2 = pd.concat([pd.DataFrame([0, 1]), pd.DataFrame([1, None])])
+ assert not modin_series1.equals(modin_series2)
- with pytest.raises(AssertionError):
- df_equals(modin_df3, modin_df2)
- assert modin_df1.equals(modin_df2._query_compiler.to_pandas())
+def test_equals_with_nans():
+ df1 = pd.DataFrame([0, 1, None], dtype="uint8[pyarrow]")
+ df2 = pd.DataFrame([None, None, None], dtype="uint8[pyarrow]")
+ assert not df1.equals(df2)
@pytest.mark.parametrize("is_more_other_partitions", [True, False])
diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py
index 66208d07004..8844e8d1766 100644
--- a/modin/pandas/test/dataframe/test_map_metadata.py
+++ b/modin/pandas/test/dataframe/test_map_metadata.py
@@ -320,6 +320,12 @@ def test_copy(data):
df_equals(modin_df, modin_df_cp)
+def test_copy_empty_dataframe():
+ df = pd.DataFrame(range(3))
+ res = df[:0].copy()
+ assert res.dtypes.equals(df.dtypes)
+
+
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_dtypes(data):
modin_df = pd.DataFrame(data)
@@ -757,6 +763,7 @@ def test_infer_objects_single_partition():
@pytest.mark.parametrize(
"convert_floating", bool_arg_values, ids=arg_keys("convert_floating", bool_arg_keys)
)
+@pytest.mark.exclude_in_sanity
def test_convert_dtypes_single_partition(
infer_objects, convert_string, convert_integer, convert_boolean, convert_floating
):
@@ -844,6 +851,7 @@ def test_convert_dtypes_5653():
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("axis", axis_values, ids=axis_keys)
@pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"])
+@pytest.mark.exclude_in_sanity
def test_clip(request, data, axis, bound_type):
modin_df = pd.DataFrame(data)
pandas_df = pandas.DataFrame(data)
@@ -893,6 +901,12 @@ def test_clip(request, data, axis, bound_type):
modin_df.clip(lower=[1, 2, 3], axis=None)
+def test_clip_4485():
+ modin_result = pd.DataFrame([1]).clip([3])
+ pandas_result = pandas.DataFrame([1]).clip([3])
+ df_equals(modin_result, pandas_result)
+
+
def test_drop():
frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}
simple = pandas.DataFrame(frame_data)
@@ -1037,6 +1051,7 @@ def test_droplevel():
ids=["None", "string", "name", "tuple", "list"],
)
@pytest.mark.parametrize("ignore_index", [True, False], ids=["True", "False"])
+@pytest.mark.exclude_in_sanity
def test_drop_duplicates(data, keep, subset, ignore_index):
modin_df = pd.DataFrame(data)
pandas_df = pandas.DataFrame(data)
@@ -1690,6 +1705,7 @@ def test___round__():
],
)
@pytest.mark.parametrize("dtype", [None, "str"])
+@pytest.mark.exclude_in_sanity
def test_constructor_from_modin_series(get_index, get_columns, dtype):
modin_df, pandas_df = create_test_dfs(test_data_values[0])
diff --git a/modin/pandas/test/internals/test_benchmark_mode.py b/modin/pandas/test/internals/test_benchmark_mode.py
index e42e4ee76c6..8d67503ff33 100644
--- a/modin/pandas/test/internals/test_benchmark_mode.py
+++ b/modin/pandas/test/internals/test_benchmark_mode.py
@@ -13,9 +13,10 @@
import unittest.mock as mock
+import pytest
+
import modin.pandas as pd
-from modin.pandas.test.utils import test_data_values
-from modin.config import BenchmarkMode, Engine
+from modin.config import Engine
engine = Engine.get()
@@ -46,26 +47,17 @@
)
-def test_from_environment_variable():
- assert BenchmarkMode.get()
- with mock.patch(wait_method) as wait:
- pd.DataFrame(test_data_values[0]).mean()
-
- wait.assert_called()
-
-
-def test_turn_off():
+@pytest.mark.parametrize("set_benchmark_mode", [False], indirect=True)
+def test_turn_off(set_benchmark_mode):
df = pd.DataFrame([0])
- BenchmarkMode.put(False)
with mock.patch(wait_method) as wait:
df.dropna()
wait.assert_not_called()
-def test_turn_on():
- BenchmarkMode.put(False)
+@pytest.mark.parametrize("set_benchmark_mode", [True], indirect=True)
+def test_turn_on(set_benchmark_mode):
df = pd.DataFrame([0])
- BenchmarkMode.put(True)
with mock.patch(wait_method) as wait:
df.dropna()
wait.assert_called()
diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py
index 58f8088ae63..37984eb3d10 100644
--- a/modin/pandas/test/test_api.py
+++ b/modin/pandas/test/test_api.py
@@ -272,8 +272,6 @@ def test_series_api_equality():
# These have to be checked manually
allowed_different = ["to_hdf", "hist"]
- # skip verifying .rename_axis() due to https://github.com/modin-project/modin/issues/5077
- allowed_different.append("rename_axis")
assert_parameters_eq((pandas.Series, pd.Series), modin_dir, allowed_different)
diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py
index 6edeabb4157..2348fb8abdf 100644
--- a/modin/pandas/test/test_groupby.py
+++ b/modin/pandas/test/test_groupby.py
@@ -2651,3 +2651,135 @@ def test_groupby_pct_change_diff_6194():
df._to_pandas(),
lambda df: df.groupby(by="by").diff(),
)
+
+
+def eval_rolling(md_window, pd_window):
+ eval_general(md_window, pd_window, lambda window: window.count())
+ eval_general(md_window, pd_window, lambda window: window.sum())
+ eval_general(md_window, pd_window, lambda window: window.mean())
+ eval_general(md_window, pd_window, lambda window: window.median())
+ eval_general(md_window, pd_window, lambda window: window.var())
+ eval_general(md_window, pd_window, lambda window: window.std())
+ eval_general(md_window, pd_window, lambda window: window.min())
+ eval_general(md_window, pd_window, lambda window: window.max())
+ eval_general(md_window, pd_window, lambda window: window.corr())
+ eval_general(md_window, pd_window, lambda window: window.cov())
+ eval_general(md_window, pd_window, lambda window: window.skew())
+ eval_general(md_window, pd_window, lambda window: window.kurt())
+ eval_general(
+ md_window, pd_window, lambda window: window.apply(lambda df: (df + 10).sum())
+ )
+ eval_general(md_window, pd_window, lambda window: window.agg("sum"))
+ eval_general(md_window, pd_window, lambda window: window.quantile(0.2))
+ eval_general(md_window, pd_window, lambda window: window.rank())
+
+ if not md_window._as_index:
+ # There's a mismatch in group columns when 'as_index=False'
+ # see: https://github.com/modin-project/modin/issues/6291
+ by_cols = list(md_window._groupby_obj._internal_by)
+ eval_general(
+ md_window,
+ pd_window,
+ lambda window: window.sem().drop(columns=by_cols, errors="ignore"),
+ )
+ else:
+ eval_general(
+ md_window,
+ pd_window,
+ lambda window: window.sem(),
+ )
+
+
+@pytest.mark.parametrize("center", [True, False])
+@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"])
+@pytest.mark.parametrize("as_index", [True, False])
+def test_rolling_int_window(center, closed, as_index):
+ col_part1 = pd.DataFrame(
+ {
+ "by": np.tile(np.arange(15), 10),
+ "col1": np.arange(150),
+ "col2": np.arange(10, 160),
+ }
+ )
+ col_part2 = pd.DataFrame({"col3": np.arange(20, 170)})
+
+ md_df = pd.concat([col_part1, col_part2], axis=1)
+ pd_df = md_df._to_pandas()
+
+ if StorageFormat.get() == "Pandas":
+ assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2
+
+ md_window = md_df.groupby("by", as_index=as_index).rolling(
+ 3, center=center, closed=closed
+ )
+ pd_window = pd_df.groupby("by", as_index=as_index).rolling(
+ 3, center=center, closed=closed
+ )
+ eval_rolling(md_window, pd_window)
+
+
+@pytest.mark.parametrize("center", [True, False])
+@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"])
+@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize("on", [None, "col4"])
+def test_rolling_timedelta_window(center, closed, as_index, on):
+ col_part1 = pd.DataFrame(
+ {
+ "by": np.tile(np.arange(15), 10),
+ "col1": np.arange(150),
+ "col2": np.arange(10, 160),
+ }
+ )
+ col_part2 = pd.DataFrame({"col3": np.arange(20, 170)})
+
+ if on is not None:
+ col_part2[on] = pandas.DatetimeIndex(
+ [
+ datetime.date(2020, 1, 1) + datetime.timedelta(hours=12) * i
+ for i in range(150)
+ ]
+ )
+
+ md_df = pd.concat([col_part1, col_part2], axis=1)
+ md_df.index = pandas.DatetimeIndex(
+ [datetime.date(2020, 1, 1) + datetime.timedelta(days=1) * i for i in range(150)]
+ )
+
+ pd_df = md_df._to_pandas()
+
+ if StorageFormat.get() == "Pandas":
+ assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2
+
+ md_window = md_df.groupby("by", as_index=as_index).rolling(
+ datetime.timedelta(days=3), center=center, closed=closed, on=on
+ )
+ pd_window = pd_df.groupby("by", as_index=as_index).rolling(
+ datetime.timedelta(days=3), center=center, closed=closed, on=on
+ )
+ eval_rolling(md_window, pd_window)
+
+
+@pytest.mark.parametrize(
+ "func",
+ [
+ pytest.param("sum", id="map_reduce_func"),
+ pytest.param("median", id="full_axis_func"),
+ ],
+)
+def test_groupby_deferred_index(func):
+ # the test is copied from the issue:
+ # https://github.com/modin-project/modin/issues/6368
+
+ def perform(lib):
+ df1 = lib.DataFrame({"a": [1, 1, 2, 2]})
+ df2 = lib.DataFrame({"b": [3, 4, 5, 6], "c": [7, 5, 4, 3]})
+
+ df = lib.concat([df1, df2], axis=1)
+ df.index = [10, 11, 12, 13]
+
+ grp = df.groupby("a")
+ grp.indices
+
+ return getattr(grp, func)()
+
+ eval_general(pd, pandas, perform)
diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py
index b14cf3d130a..ff25601fccf 100644
--- a/modin/pandas/test/test_io.py
+++ b/modin/pandas/test/test_io.py
@@ -950,6 +950,7 @@ def test_read_csv_google_cloud_storage(self):
],
],
)
+ @pytest.mark.exclude_in_sanity
def test_read_csv_parse_dates(
self, names, header, index_col, parse_dates, encoding, encoding_errors
):
@@ -1124,6 +1125,7 @@ def test_read_csv_wrong_path(self):
condition="config.getoption('--simulate-cloud').lower() != 'off'",
reason="The reason of tests fail in `cloud` mode is unknown for now - issue #2340",
)
+ @pytest.mark.exclude_in_sanity
def test_to_csv(
self,
tmp_path,
@@ -1337,6 +1339,16 @@ def test_read_csv_issue_5150(self, set_async_read_mode):
if not AsyncReadMode.get():
df_equals(expected_pandas_df, actual_pandas_df)
+ @pytest.mark.parametrize("usecols", [None, [0, 1, 2, 3, 4]])
+ def test_read_csv_1930(self, usecols):
+ eval_io(
+ fn_name="read_csv",
+ # read_csv kwargs
+ filepath_or_buffer="modin/pandas/test/data/issue_1930.csv",
+ names=["c1", "c2", "c3", "c4", "c5"],
+ usecols=usecols,
+ )
+
class TestTable:
def test_read_table(self, make_csv_file):
@@ -1474,6 +1486,7 @@ def test_read_parquet_indexing_by_column(self, tmp_path, engine, make_parquet_fi
condition="config.getoption('--simulate-cloud').lower() != 'off'",
reason="The reason of tests fail in `cloud` mode is unknown for now - issue #3264",
)
+ @pytest.mark.exclude_in_sanity
def test_read_parquet_directory(
self, engine, make_parquet_dir, columns, row_group_size, rows_per_file
):
@@ -2032,10 +2045,6 @@ def test_read_excel_every_other_nan(self):
io="modin/pandas/test/data/every_other_row_nan.xlsx",
)
- @pytest.mark.xfail(
- StorageFormat.get() == "Hdk",
- reason="The frame contains different dtypes in the same column and could not be converted to arrow",
- )
@check_file_leaks
def test_read_excel_header_none(self):
eval_io(
diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py
index 45eb9145c3a..513dceec7d2 100644
--- a/modin/pandas/test/test_series.py
+++ b/modin/pandas/test/test_series.py
@@ -567,12 +567,14 @@ def test___repr__4186():
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.exclude_in_sanity
def test___round__(data):
modin_series, pandas_series = create_test_series(data)
df_equals(round(modin_series), round(pandas_series))
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.exclude_in_sanity
def test___setitem__(data):
modin_series, pandas_series = create_test_series(data)
for key in modin_series.keys():
@@ -1409,6 +1411,12 @@ def test_copy(data):
df_equals(modin_series.copy(), pandas_series.copy())
+def test_copy_empty_series():
+ ser = pd.Series(range(3))
+ res = ser[:0].copy()
+ assert res.dtype == ser.dtype
+
+
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
def test_corr(data):
modin_series, pandas_series = create_test_series(data)
@@ -1897,26 +1905,74 @@ def test_eq(data):
inter_df_math_helper(modin_series, pandas_series, "eq")
-def test_equals():
- series_data = [2.9, 3, 3, 3]
- modin_df1 = pd.Series(series_data)
- modin_df2 = pd.Series(series_data)
+@pytest.mark.parametrize(
+ "series1_data,series2_data,expected_pandas_equals",
+ [
+ pytest.param([1], [0], False, id="single_unequal_values"),
+ pytest.param([None], [None], True, id="single_none_values"),
+ pytest.param(
+ pandas.Series(1, name="series1"),
+ pandas.Series(1, name="series2"),
+ True,
+ id="different_names",
+ ),
+ pytest.param(
+ pandas.Series([1], index=[1]),
+ pandas.Series([1], index=[1.0]),
+ True,
+ id="different_index_types",
+ ),
+ pytest.param(
+ pandas.Series([1], index=[1]),
+ pandas.Series([1], index=[2]),
+ False,
+ id="different_index_values",
+ ),
+ pytest.param([1], [1.0], False, id="different_value_types"),
+ pytest.param(
+ [1, 2],
+ [1, 2],
+ True,
+ id="equal_series_of_length_two",
+ ),
+ pytest.param(
+ [1, 2],
+ [1, 3],
+ False,
+ id="unequal_series_of_length_two",
+ ),
+ pytest.param(
+ [[1, 2]],
+ [[1]],
+ False,
+ id="different_lengths",
+ ),
+ ],
+)
+def test_equals(series1_data, series2_data, expected_pandas_equals):
+ modin_series1, pandas_df1 = create_test_series(series1_data)
+ modin_series2, pandas_df2 = create_test_series(series2_data)
- assert modin_df1.equals(modin_df2)
- assert modin_df1.equals(pd.Series(modin_df1))
- df_equals(modin_df1, modin_df2)
- df_equals(modin_df1, pd.Series(modin_df1))
+ pandas_equals = pandas_df1.equals(pandas_df2)
+ assert pandas_equals == expected_pandas_equals, (
+ "Test expected pandas to say the series were"
+ + f"{'' if expected_pandas_equals else ' not'} equal, but they were"
+ + f"{' not' if expected_pandas_equals else ''} equal."
+ )
+ assert modin_series1.equals(modin_series2) == pandas_equals
+ assert modin_series1.equals(pandas_df2) == pandas_equals
- series_data = [2, 3, 5, 1]
- modin_df3 = pd.Series(series_data, index=list("abcd"))
- assert not modin_df1.equals(modin_df3)
+def test_equals_several_partitions():
+ modin_series1 = pd.concat([pd.Series([0, 1]), pd.Series([None, 1])])
+ modin_series2 = pd.concat([pd.Series([0, 1]), pd.Series([1, None])])
+ assert not modin_series1.equals(modin_series2)
- with pytest.raises(AssertionError):
- df_equals(modin_df3, modin_df1)
- with pytest.raises(AssertionError):
- df_equals(modin_df3, modin_df2)
+def test_equals_with_nans():
+ ser1 = pd.Series([0, 1, None], dtype="uint8[pyarrow]")
+ ser2 = pd.Series([None, None, None], dtype="uint8[pyarrow]")
+ assert not ser1.equals(ser2)
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@@ -1954,6 +2010,7 @@ def test_ffill(data):
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
@pytest.mark.parametrize("reindex", [None, 2, -2])
@pytest.mark.parametrize("limit", [None, 1, 2, 0.5, -1, -2, 1.5])
+@pytest.mark.exclude_in_sanity
def test_fillna(data, reindex, limit):
modin_series, pandas_series = create_test_series(data)
index = pandas_series.index
@@ -2376,6 +2433,10 @@ def test_map(data, na_values):
)
+@pytest.mark.xfail(
+ StorageFormat.get() == "Hdk",
+ reason="https://github.com/intel-ai/hdk/issues/542",
+)
def test_mask():
modin_series = pd.Series(np.arange(10))
m = modin_series % 3 == 0
@@ -2850,6 +2911,12 @@ def test_repeat_lists(data, repeats):
)
+def test_clip_4485():
+ modin_result = pd.Series([1]).clip([3])
+ pandas_result = pandas.Series([1]).clip([3])
+ df_equals(modin_result, pandas_result)
+
+
def test_replace():
modin_series = pd.Series([0, 1, 2, 3, 4])
pandas_series = pandas.Series([0, 1, 2, 3, 4])
@@ -2865,6 +2932,7 @@ def test_replace():
@pytest.mark.parametrize("closed", ["left", "right"])
@pytest.mark.parametrize("label", ["right", "left"])
@pytest.mark.parametrize("level", [None, 1])
+@pytest.mark.exclude_in_sanity
def test_resample(closed, label, level):
rule = "5T"
freq = "H"
@@ -3050,6 +3118,7 @@ def test_sample(data):
@pytest.mark.parametrize("values_number", [1, 2, 5])
@pytest.mark.parametrize("side", ["left", "right"])
@pytest.mark.parametrize("data", test_data_values, ids=test_data_keys)
+@pytest.mark.exclude_in_sanity
def test_searchsorted(
data, side, values_number, sorter, use_multiindex, single_value_data
):
@@ -3312,6 +3381,7 @@ def test_subtract(data):
@pytest.mark.parametrize(
"min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys)
)
+@pytest.mark.exclude_in_sanity
def test_sum(data, axis, skipna, numeric_only, min_count):
eval_general(
*create_test_series(data),
@@ -3670,6 +3740,7 @@ def test_update(data, other_data):
],
)
@pytest.mark.parametrize("ascending", bool_arg_values, ids=bool_arg_keys)
+@pytest.mark.exclude_in_sanity
def test_value_counts(sort, normalize, bins, dropna, ascending):
def sort_sensitive_comparator(df1, df2):
# We sort indices for Modin and pandas result because of issue #1650
diff --git a/modin/pandas/window.py b/modin/pandas/window.py
index 473b2a91d97..4e05cece9e4 100644
--- a/modin/pandas/window.py
+++ b/modin/pandas/window.py
@@ -20,6 +20,7 @@
from modin.logging import ClassLogger
from modin.utils import _inherit_docstrings
from modin.pandas.utils import cast_function_modin2pandas
+from modin.error_message import ErrorMessage
@_inherit_docstrings(pandas.core.window.rolling.Window)
@@ -39,44 +40,44 @@ def __init__(
):
self._dataframe = dataframe
self._query_compiler = dataframe._query_compiler
- self.window_args = [
- window,
- min_periods,
- center,
- win_type,
- on,
- axis,
- closed,
- step,
- method,
- ]
+ self.window_kwargs = {
+ "window": window,
+ "min_periods": min_periods,
+ "center": center,
+ "win_type": win_type,
+ "on": on,
+ "axis": axis,
+ "closed": closed,
+ "step": step,
+ "method": method,
+ }
self.axis = axis
def mean(self, *args, **kwargs):
return self._dataframe.__constructor__(
query_compiler=self._query_compiler.window_mean(
- self.axis, self.window_args, *args, **kwargs
+ self.axis, self.window_kwargs, *args, **kwargs
)
)
def sum(self, *args, **kwargs):
return self._dataframe.__constructor__(
query_compiler=self._query_compiler.window_sum(
- self.axis, self.window_args, *args, **kwargs
+ self.axis, self.window_kwargs, *args, **kwargs
)
)
def var(self, ddof=1, *args, **kwargs):
return self._dataframe.__constructor__(
query_compiler=self._query_compiler.window_var(
- self.axis, self.window_args, ddof, *args, **kwargs
+ self.axis, self.window_kwargs, ddof, *args, **kwargs
)
)
def std(self, ddof=1, *args, **kwargs):
return self._dataframe.__constructor__(
query_compiler=self._query_compiler.window_std(
- self.axis, self.window_args, ddof, *args, **kwargs
+ self.axis, self.window_kwargs, ddof, *args, **kwargs
)
)
@@ -103,81 +104,87 @@ def __init__(
raise NotImplementedError("step parameter is not implemented yet.")
self._dataframe = dataframe
self._query_compiler = dataframe._query_compiler
- self.rolling_args = [
- window,
- min_periods,
- center,
- win_type,
- on,
- axis,
- closed,
- step,
- method,
- ]
+ self.rolling_kwargs = {
+ "window": window,
+ "min_periods": min_periods,
+ "center": center,
+ "win_type": win_type,
+ "on": on,
+ "axis": axis,
+ "closed": closed,
+ "step": step,
+ "method": method,
+ }
self.axis = axis
+ def _call_qc_method(self, method_name, *args, **kwargs):
+ """
+ Call a query compiler method for the specified rolling aggregation.
+
+ Parameters
+ ----------
+ method_name : str
+ Name of the aggregation.
+ *args : tuple
+ Positional arguments to pass to the query compiler method.
+ **kwargs : dict
+ Keyword arguments to pass to the query compiler method.
+
+ Returns
+ -------
+ BaseQueryCompiler
+ QueryCompiler holding the result of the aggregation.
+ """
+ qc_method = getattr(self._query_compiler, f"rolling_{method_name}")
+ return qc_method(self.axis, self.rolling_kwargs, *args, **kwargs)
+
+ def _aggregate(self, method_name, *args, **kwargs):
+ """
+ Run the specified rolling aggregation.
+
+ Parameters
+ ----------
+ method_name : str
+ Name of the aggregation.
+ *args : tuple
+ Positional arguments to pass to the aggregation.
+ **kwargs : dict
+ Keyword arguments to pass to the aggregation.
+
+ Returns
+ -------
+ DataFrame or Series
+ Result of the aggregation.
+ """
+ qc_result = self._call_qc_method(method_name, *args, **kwargs)
+ return self._dataframe.__constructor__(query_compiler=qc_result)
+
def count(self):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_count(
- self.axis, self.rolling_args
- )
- )
+ return self._aggregate("count")
def sem(self, *args, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_sem(
- self.axis, self.rolling_args, *args, **kwargs
- )
- )
+ return self._aggregate("sem", *args, **kwargs)
def sum(self, *args, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_sum(
- self.axis, self.rolling_args, *args, **kwargs
- )
- )
+ return self._aggregate("sum", *args, **kwargs)
def mean(self, *args, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_mean(
- self.axis, self.rolling_args, *args, **kwargs
- )
- )
+ return self._aggregate("mean", *args, **kwargs)
def median(self, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_median(
- self.axis, self.rolling_args, **kwargs
- )
- )
+ return self._aggregate("median", **kwargs)
def var(self, ddof=1, *args, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_var(
- self.axis, self.rolling_args, ddof, *args, **kwargs
- )
- )
+ return self._aggregate("var", ddof, *args, **kwargs)
def std(self, ddof=1, *args, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_std(
- self.axis, self.rolling_args, ddof, *args, **kwargs
- )
- )
+ return self._aggregate("std", ddof, *args, **kwargs)
def min(self, *args, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_min(
- self.axis, self.rolling_args, *args, **kwargs
- )
- )
+ return self._aggregate("min", *args, **kwargs)
def max(self, *args, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_max(
- self.axis, self.rolling_args, *args, **kwargs
- )
- )
+ return self._aggregate("max", *args, **kwargs)
def corr(self, other=None, pairwise=None, *args, **kwargs):
from .dataframe import DataFrame
@@ -188,11 +195,7 @@ def corr(self, other=None, pairwise=None, *args, **kwargs):
elif isinstance(other, Series):
other = other._query_compiler.to_pandas().squeeze()
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_corr(
- self.axis, self.rolling_args, other, pairwise, *args, **kwargs
- )
- )
+ return self._aggregate("corr", other, pairwise, *args, **kwargs)
def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs):
from .dataframe import DataFrame
@@ -203,25 +206,13 @@ def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs):
elif isinstance(other, Series):
other = other._query_compiler.to_pandas().squeeze()
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_cov(
- self.axis, self.rolling_args, other, pairwise, ddof, **kwargs
- )
- )
+ return self._aggregate("cov", other, pairwise, ddof, **kwargs)
def skew(self, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_skew(
- self.axis, self.rolling_args, **kwargs
- )
- )
+ return self._aggregate("skew", **kwargs)
def kurt(self, **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_kurt(
- self.axis, self.rolling_args, **kwargs
- )
- )
+ return self._aggregate("kurt", **kwargs)
def apply(
self,
@@ -233,18 +224,7 @@ def apply(
kwargs=None,
):
func = cast_function_modin2pandas(func)
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_apply(
- self.axis,
- self.rolling_args,
- func,
- raw,
- engine,
- engine_kwargs,
- args,
- kwargs,
- )
- )
+ return self._aggregate("apply", func, raw, engine, engine_kwargs, args, kwargs)
def aggregate(
self,
@@ -255,9 +235,8 @@ def aggregate(
from .dataframe import DataFrame
dataframe = DataFrame(
- query_compiler=self._query_compiler.rolling_aggregate(
- self.axis,
- self.rolling_args,
+ query_compiler=self._call_qc_method(
+ "aggregate",
func,
*args,
**kwargs,
@@ -265,7 +244,7 @@ def aggregate(
)
if isinstance(self._dataframe, DataFrame):
return dataframe
- elif is_list_like(func):
+ elif is_list_like(func) and dataframe.columns.nlevels > 1:
dataframe.columns = dataframe.columns.droplevel()
return dataframe
else:
@@ -274,26 +253,86 @@ def aggregate(
agg = aggregate
def quantile(self, quantile, interpolation="linear", **kwargs):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_quantile(
- self.axis, self.rolling_args, quantile, interpolation, **kwargs
- )
- )
+ return self._aggregate("quantile", quantile, interpolation, **kwargs)
def rank(
self, method="average", ascending=True, pct=False, numeric_only=False, **kwargs
):
- return self._dataframe.__constructor__(
- query_compiler=self._query_compiler.rolling_rank(
- self.axis,
- self.rolling_args,
- method,
- ascending,
- pct,
- numeric_only,
- **kwargs,
- )
+ return self._aggregate("rank", method, ascending, pct, numeric_only, **kwargs)
+
+
+@_inherit_docstrings(Rolling)
+class RollingGroupby(Rolling):
+ def __init__(self, groupby_obj, *args, **kwargs):
+ self._as_index = groupby_obj._kwargs.get("as_index", True)
+ self._groupby_obj = (
+ groupby_obj if self._as_index else groupby_obj._override(as_index=True)
)
+ super().__init__(self._groupby_obj._df, *args, **kwargs)
+
+ def sem(self, *args, **kwargs):
+ ErrorMessage.missmatch_with_pandas(
+ operation="RollingGroupby.sem() when 'as_index=False'",
+ message=(
+ "The group columns won't be involved in the aggregation.\n"
+ + "See this gh-issue for more information: https://github.com/modin-project/modin/issues/6291"
+ ),
+ )
+ return super().sem(*args, **kwargs)
+
+ def corr(self, other=None, pairwise=None, *args, **kwargs):
+ # pandas behavior is that it always assumes that 'as_index=True' for the '.corr()' method
+ return super().corr(
+ *args, as_index=True, other=other, pairwise=pairwise, **kwargs
+ )
+
+ def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs):
+ # pandas behavior is that it always assumes that 'as_index=True' for the '.cov()' method
+ return super().cov(as_index=True, other=other, pairwise=pairwise, **kwargs)
+
+ def _aggregate(self, method_name, *args, as_index=None, **kwargs):
+ """
+ Run the specified rolling aggregation.
+
+ Parameters
+ ----------
+ method_name : str
+ Name of the aggregation.
+ *args : tuple
+ Positional arguments to pass to the aggregation.
+ as_index : bool, optional
+ Whether the result should have the group labels as index levels or as columns.
+ If not specified the parameter value will be taken from groupby kwargs.
+ **kwargs : dict
+ Keyword arguments to pass to the aggregation.
+
+ Returns
+ -------
+ DataFrame or Series
+ Result of the aggregation.
+ """
+ res = self._groupby_obj._wrap_aggregation(
+ qc_method=type(self._query_compiler).groupby_rolling,
+ numeric_only=False,
+ agg_args=args,
+ agg_kwargs=kwargs,
+ agg_func=method_name,
+ rolling_kwargs=self.rolling_kwargs,
+ )
+
+ if as_index is None:
+ as_index = self._as_index
+
+ if not as_index:
+ res = res.reset_index(
+ level=[i for i in range(len(self._groupby_obj._internal_by))],
+ drop=False,
+ )
+
+ return res
+
+ def _call_qc_method(self, method_name, *args, **kwargs):
+ return self._aggregate(method_name, *args, **kwargs)._query_compiler
@_inherit_docstrings(
diff --git a/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py b/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py
index b52f20b8b7e..2a116c7143b 100644
--- a/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py
+++ b/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py
@@ -47,6 +47,16 @@ def test_simple_export(data_has_nulls, from_hdk, n_chunks):
md_df = pd.DataFrame(data)
exported_df = export_frame(md_df, from_hdk, n_chunks=n_chunks)
+
+ # export_frame() splits the frame into multiple chunks. When it's
+ # split with HDK, each categorical column will have a different
+ # set of categories. When concatenating the chunks, the categorical
+ # column will be of type object.
+ cat_cols = md_df.select_dtypes(include=["category"]).columns
+ with warns_that_defaulting_to_pandas():
+ md_df[cat_cols] = md_df[cat_cols].astype(str)
+ exported_df[cat_cols] = exported_df[cat_cols].astype(str)
+
df_equals(md_df, exported_df)
diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py
index 6f484248ffd..210ea0dc1b1 100644
--- a/modin/test/storage_formats/pandas/test_internals.py
+++ b/modin/test/storage_formats/pandas/test_internals.py
@@ -726,6 +726,26 @@ def test_merge_partitioning(
)
+def test_groupby_with_empty_partition():
+ # see #5461 for details
+ md_df = construct_modin_df_by_scheme(
+ pandas_df=pandas.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]}),
+ partitioning_scheme={"row_lengths": [2, 2], "column_widths": [2]},
+ )
+ md_res = md_df.query("a > 1")
+ grp_obj = md_res.groupby("a")
+ # check index error due to partitioning mismatching
+ grp_obj.count()
+
+ md_df = construct_modin_df_by_scheme(
+ pandas_df=pandas.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]}),
+ partitioning_scheme={"row_lengths": [2, 2], "column_widths": [2]},
+ )
+ md_res = md_df.query("a > 1")
+ grp_obj = md_res.groupby(md_res["a"])
+ grp_obj.count()
+
+
@pytest.mark.parametrize("set_num_partitions", [2], indirect=True)
def test_repartitioning(set_num_partitions):
"""
diff --git a/modin/utils.py b/modin/utils.py
index daac8abde6c..db32b39a1ad 100644
--- a/modin/utils.py
+++ b/modin/utils.py
@@ -79,7 +79,7 @@ def _to_numpy(self) -> Any: # noqa: GL08
pass
-MIN_RAY_VERSION = version.parse("1.4.0")
+MIN_RAY_VERSION = version.parse("1.13.0")
MIN_DASK_VERSION = version.parse("2.22.0")
MIN_UNIDIST_VERSION = version.parse("0.2.1")
diff --git a/requirements-dev.txt b/requirements-dev.txt
index fb3884472ea..606c14ca4f0 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,57 +1,70 @@
+## required dependencies
pandas>=2,<2.1
numpy>=1.18.5
+fsspec
+packaging
+psutil
+
+## optional dependencies
+# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
+ray[default]>=1.13.0,!=2.5.0
+# https://github.com/modin-project/modin/issues/6336
+pydantic<2
+pyarrow
dask[complete]>=2.22.0
distributed>=2.22.0
-ray[default]>=1.13.0
-pyarrow
-psutil
-fsspec
xarray
Jinja2
-tables
scipy
s3fs>=2021.8
-pytest
-pytest-benchmark
-coverage
-pytest-cov
-pytest-xdist
feather-format
lxml
openpyxl
xlrd
matplotlib
sqlalchemy>=1.4.0,<1.4.46
+pandas-gbq
+tables
msgpack
-pandas_gbq
-cloudpickle
-rpyc==4.1.5
scikit-learn
-git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9
+pymssql
+psycopg2
+connectorx>=0.2.6a4
+fastparquet<2023.1.0
+flask-cors
# TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
# when we use collective instead of rabit.
xgboost>=1.7.1,<2.0.0
tqdm
# Latest modin-spreadsheet with widget fix
git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
-pymssql
-psycopg2
-connectorx>=0.2.6a4
-black
-flake8
-flake8-no-implicit-concat
-flake8-print
-# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
-numpydoc==1.1.0
+
+## modin in the cloud dependencies
+boto3
+cloudpickle
+rpyc==4.1.5
+
+## dependencies for making release
+PyGithub>=1.58.0
+pygit2>=1.9.2
+
+## test dependencies
+coverage>=7.1.0
# experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies
fuzzydata>=0.0.6
-# Mypy 0.990 doesn't work: https://github.com/modin-project/modin/issues/5206
-mypy!=0.990
-pandas-stubs
-fastparquet<2023.1.0
-# for release script
-PyGithub
-pygit2
-moto
-flask-cors
-boto3
\ No newline at end of file
+git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9
+# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
+numpydoc==1.1.0
+moto>=4.1.0
+pytest>=7.2.1
+pytest-benchmark>=4.0.0
+pytest-cov>=4.0.0
+pytest-xdist>=3.2.0
+
+## code linters
+black>=23.1.0
+flake8>=6.0.0
+flake8-no-implicit-concat>=0.3.4
+flake8-print>=5.0.0
+mypy>=1.0.0
+pandas-stubs>=2.0.0
diff --git a/requirements/env_hdk.yml b/requirements/env_hdk.yml
index 65c83dae087..690a3090c42 100644
--- a/requirements/env_hdk.yml
+++ b/requirements/env_hdk.yml
@@ -2,37 +2,46 @@ name: modin_on_hdk
channels:
- conda-forge
dependencies:
+ - pip
+
+ # required dependencies
- pandas>=2,<2.1
- - pyarrow
- numpy>=1.18.5
+ - pyhdk==0.7
- fsspec
- - pip
- - pytest>=6.0.1
- - pytest-cov>=2.10.1
- - pytest-xdist>=2.1.0
- - coverage
- - pygithub
- - pyhdk==0.6
- - s3fs>=2021.8
+ - packaging
- psutil
+
+ # optional dependencies
+ - s3fs>=2021.8
- openpyxl
- xlrd
- sqlalchemy>=1.4.0,<1.4.46
- scipy
- - xgboost>=1.7.1,<2.0.0
- - scikit-learn-intelex
- matplotlib
# TODO: uncomment after Modin switch to python>=3.9
# - xarray
- pytables
- fastparquet
- # code linters
- - black
- - flake8
- - flake8-no-implicit-concat
- - flake8-print
+
+ # dependencies for making release
+ - pygithub
+
+ # test dependencies
- boto3
- - moto
+ - coverage>=7.1.0
+ - moto>=4.1.0
+ - pytest>=7.2.1
+ - pytest-cov>=4.0.0
+ - pytest-xdist>=3.2.0
+
+ # code linters
+ - black>=23.1.0
+ - flake8>=6.0.0
+ - flake8-no-implicit-concat>=0.3.4
+ - flake8-print>=5.0.0
+ - mypy>=1.0.0
+
- pip:
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
- numpydoc==1.1.0
diff --git a/requirements/env_unidist.yml b/requirements/env_unidist.yml
index 54e39ac6b45..e8b4ca88d3c 100644
--- a/requirements/env_unidist.yml
+++ b/requirements/env_unidist.yml
@@ -2,16 +2,22 @@ name: modin_on_unidist
channels:
- conda-forge
dependencies:
- - unidist-mpi>=0.2.1
+ - pip
+
+ # required dependencies
- pandas>=2,<2.1
- numpy>=1.18.5
- - pyarrow
+ - unidist-mpi>=0.2.1
- fsspec
+ - packaging
+ - psutil
+
+ # optional dependencies
+ - pyarrow
# TODO: uncomment after Modin switch to python>=3.9
# - xarray
- Jinja2
- scipy
- - pip
- s3fs>=2021.8
- feather-format
- lxml
@@ -22,32 +28,36 @@ dependencies:
- pandas-gbq
- pytables
- msgpack-python
- - psutil
- - pytest>=6.0.1
- - pytest-benchmark
- - pytest-cov>=2.10.1
- - pytest-xdist>=2.1.0
- - packaging
- - coverage
- - pygithub
- - rpyc==4.1.5
- - cloudpickle
- - boto3
- scikit-learn
- pymssql
- psycopg2
- - mypy
- - pandas-stubs
- fastparquet<2023.1.0
- tqdm
- # for release script
- - pygit2
+
+ # modin in the cloud dependencies
+ - boto3
+ - cloudpickle
+ - rpyc==4.1.5
+
+ # dependencies for making release
+ - pygithub>=1.58.0
+ - pygit2>=1.9.2
+
+ # test dependencies
+ - coverage>=7.1.0
+ - moto>=4.1.0
+ - pytest>=7.2.1
+ - pytest-cov>=4.0.0
+ - pytest-xdist>=3.2.0
+
# code linters
- - black
- - flake8
- - flake8-no-implicit-concat
- - flake8-print
- - moto
+ - black>=23.1.0
+ - flake8>=6.0.0
+ - flake8-no-implicit-concat>=0.3.4
+ - flake8-print>=5.0.0
+ - mypy>=1.0.0
+ - pandas-stubs>=2.0.0
+
- pip:
# Fixes breaking ipywidgets changes, but didn't release yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml
index 4cc13cce2b9..9d2255902c3 100644
--- a/requirements/requirements-no-engine.yml
+++ b/requirements/requirements-no-engine.yml
@@ -1,15 +1,21 @@
channels:
- conda-forge
dependencies:
+ - pip
+
+ # required dependencies
- pandas>=2,<2.1
- numpy>=1.18.5
- - pyarrow
- fsspec
+ - packaging
+ - psutil
+
+ # optional dependencies
+ - pyarrow
# TODO: uncomment after Modin switch to python>=3.9
# - xarray
- Jinja2
- scipy
- - pip
- s3fs>=2021.8
- feather-format
- lxml
@@ -20,31 +26,38 @@ dependencies:
- pandas-gbq
- pytables
- msgpack-python
- - psutil
- - pytest>=6.0.1
- - pytest-benchmark
- - pytest-cov>=2.10.1
- - pytest-xdist>=2.1.0
- - coverage
- - pygithub
- - rpyc==4.1.5
- - cloudpickle
- - boto3
- - moto
# TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost
# when we use collective instead of rabit.
- xgboost>=1.7.1,<2.0.0
- tqdm
+
+ # modin in the cloud dependencies
+ - boto3
+ - cloudpickle
+ - rpyc==4.1.5
+
+ # dependencies for making release
+ - pygithub>=1.58.0
+ - pygit2>=1.9.2
+
+ # test dependencies
+ - coverage>=7.1.0
+ - moto>=4.1.0
+ - pytest>=7.2.1
+ - pytest-cov>=4.0.0
+ - pytest-xdist>=3.2.0
+
# code linters
- - black
- - flake8
- - flake8-no-implicit-concat
- - flake8-print
+ - black>=23.1.0
+ - flake8>=6.0.0
+ - flake8-no-implicit-concat>=0.3.4
+ - flake8-print>=5.0.0
+
- pip:
+ # no conda package for windows
+ - connectorx>=0.2.6a4
# Fixes breaking ipywidgets changes, but didn't release yet.
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5
- git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9
- # no conda package for windows
- - connectorx>=0.2.6a4
# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI.
- numpydoc==1.1.0
diff --git a/scripts/doc_checker.py b/scripts/doc_checker.py
index 3f68509bd8c..2eb7083f8c8 100644
--- a/scripts/doc_checker.py
+++ b/scripts/doc_checker.py
@@ -527,8 +527,9 @@ def load_obj(name, old_load_obj=Docstring._load_obj):
Docstring._load_obj = staticmethod(load_obj)
# for testing hdk-engine docs without `pyhdk` installation
- # TODO: check if we could remove these lines
sys.modules["pyhdk"] = Mock()
+ sys.modules["pyhdk.hdk"] = Mock()
+ sys.modules["pyhdk._sql"] = Mock()
# enable docs testing on windows
sys.getdlopenflags = Mock()
sys.setdlopenflags = Mock()
diff --git a/scripts/release.py b/scripts/release.py
index cc383480a30..455bb56aef3 100644
--- a/scripts/release.py
+++ b/scripts/release.py
@@ -113,14 +113,26 @@ def __init__(self):
def is_on_master(self):
return self.repo.references["refs/heads/master"] == self.repo.head
+ @staticmethod
+ def __get_tag_version(entry):
+ try:
+ return version.parse(entry.lstrip("refs/tags/"))
+ except version.InvalidVersion as ex:
+ return f'&lt;invalid version "{entry}": {ex}&gt;'
+
def get_previous_release(self, rel_type):
tags = [
- (entry, version.parse(entry.lstrip("refs/tags/")))
+ (entry, self.__get_tag_version(entry))
for entry in self.repo.references
if entry.startswith("refs/tags/")
]
- # filter away legacy versions (which aren't following the proper naming schema)
- tags = [(entry, ver) for entry, ver in tags if isinstance(ver, version.Version)]
+ # filter away legacy versions (which aren't following the proper naming schema);
+ # also skip pre-releases
+ tags = [
+ (entry, ver)
+ for entry, ver in tags
+ if isinstance(ver, version.Version) and not ver.pre
+ ]
if rel_type == "minor":
# leave only minor releases
tags = [(entry, ver) for entry, ver in tags if ver.micro == 0]
diff --git a/setup.cfg b/setup.cfg
index 0d6e97bc180..8f1cf929dd8 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -17,6 +17,7 @@ xfail_strict=true
markers =
xfail_executions
skip_executions
+ exclude_in_sanity
filterwarnings =
error:.*defaulting to pandas.*:UserWarning
diff --git a/setup.py b/setup.py
index 259e2f04a6f..43d67ee1fb7 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,9 @@
long_description = fh.read()
dask_deps = ["dask>=2.22.0", "distributed>=2.22.0"]
-ray_deps = ["ray[default]>=1.13.0", "pyarrow"]
+# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100
+# pydantic<2: https://github.com/modin-project/modin/issues/6336
+ray_deps = ["ray[default]>=1.13.0,!=2.5.0", "pyarrow", "pydantic<2"]
unidist_deps = ["unidist[mpi]>=0.2.1"]
remote_deps = ["rpyc==4.1.5", "cloudpickle", "boto3"]
spreadsheet_deps = ["modin-spreadsheet>=0.1.0"]