diff --git a/.github/actions/mamba-env/action.yml b/.github/actions/mamba-env/action.yml new file mode 100644 index 00000000000..6badcf68e15 --- /dev/null +++ b/.github/actions/mamba-env/action.yml @@ -0,0 +1,44 @@ +name: "Install environment using Mamba" +description: "Prepare the environment to run Modin" +inputs: + python-version: + description: "Python version to install" + default: "3.8" + environment-file: + description: "Conda environment yml" + required: true + activate-environment: + description: "Conda environment to activate" + default: "modin" + +runs: + using: "composite" + steps: + - name: Get current week + id: get-week + # use current week as cache key to periodically refresh the cache, + # as cache is based on requirements, but dependencies push + # updated versions at some irregular pace + run: echo "thisweek=$(/bin/date -u '+%Y.w%W')" >> $GITHUB_OUTPUT + shell: bash + - name: Cache conda + id: cache-conda + uses: actions/cache@v3 + with: + path: | + ~/conda_pkgs_dir + ~/.cache/pip + key: + ${{ runner.os }}-conda-${{ steps.get-week.outputs.thisweek }}-${{ hashFiles(inputs.environment-file) }} + - uses: conda-incubator/setup-miniconda@v2 + with: + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true + activate-environment: ${{ inputs.activate-environment }} + environment-file: ${{ inputs.environment-file }} + python-version: ${{ inputs.python-version }} + channel-priority: strict + # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed + # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 + use-only-tar-bz2: false diff --git a/.github/actions/python-only/action.yml b/.github/actions/python-only/action.yml new file mode 100644 index 00000000000..2fe3d23c4fc --- /dev/null +++ b/.github/actions/python-only/action.yml @@ -0,0 +1,15 @@ +name: "Install Python only" +description: "Prepare the environment to run simple tasks" +inputs: + python-version: + description: "Python version to install" + default: "3.8.x" + +runs: + using: "composite" + steps: + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + architecture: "x64" + cache: 'pip' diff --git a/.github/actions/run-core-tests/action.yml b/.github/actions/run-core-tests/action.yml new file mode 100644 index 00000000000..ae0f21e08d6 --- /dev/null +++ b/.github/actions/run-core-tests/action.yml @@ -0,0 +1,29 @@ +name: "Run core Modin tests" +description: "Run core Modin tests like dataframe or groupby" +inputs: + runner: + description: "Runner for tests" + default: "python -m pytest" + parallel: + description: "How to run tests in parallel" + default: "-n 2" + +runs: + using: "composite" + steps: + - uses: ./.github/actions/run-core-tests/group_1 + with: + runner: ${{ inputs.runner }} + parallel: ${{ inputs.parallel }} + - uses: ./.github/actions/run-core-tests/group_2 + with: + runner: ${{ inputs.runner }} + parallel: ${{ inputs.parallel }} + - uses: ./.github/actions/run-core-tests/group_3 + with: + runner: ${{ inputs.runner }} + parallel: ${{ inputs.parallel }} + - uses: ./.github/actions/run-core-tests/group_4 + with: + runner: ${{ inputs.runner }} + parallel: ${{ inputs.parallel }} diff --git a/.github/actions/run-core-tests/group_1/action.yml b/.github/actions/run-core-tests/group_1/action.yml new file mode 100644 index 00000000000..a338209e656 --- /dev/null +++ b/.github/actions/run-core-tests/group_1/action.yml @@ -0,0 +1,21 @@ +name: "Run core Modin tests - group 1" +description: "Run 
core Modin tests like dataframe or groupby" +inputs: + runner: + description: "Runner for tests" + default: "python -m pytest" + parallel: + description: "How to run tests in parallel" + default: "-n 2" + +runs: + using: "composite" + steps: + - run: | + echo "::group::Running dataframe tests (group 1)..." + ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_binary.py \ + modin/pandas/test/dataframe/test_default.py \ + modin/pandas/test/dataframe/test_indexing.py \ + modin/pandas/test/dataframe/test_iter.py + echo "::endgroup::" + shell: bash -l {0} diff --git a/.github/actions/run-core-tests/group_2/action.yml b/.github/actions/run-core-tests/group_2/action.yml new file mode 100644 index 00000000000..d330e65061a --- /dev/null +++ b/.github/actions/run-core-tests/group_2/action.yml @@ -0,0 +1,22 @@ +name: "Run core Modin tests - group 2" +description: "Run core Modin tests like dataframe or groupby" +inputs: + runner: + description: "Runner for tests" + default: "python -m pytest" + parallel: + description: "How to run tests in parallel" + default: "-n 2" + +runs: + using: "composite" + steps: + - run: | + echo "::group::Running dataframe tests (group 2)..." + ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/dataframe/test_join_sort.py \ + modin/pandas/test/dataframe/test_reduce.py \ + modin/pandas/test/dataframe/test_udf.py \ + modin/pandas/test/dataframe/test_window.py \ + modin/pandas/test/dataframe/test_pickle.py + echo "::endgroup::" + shell: bash -l {0} diff --git a/.github/actions/run-core-tests/group_3/action.yml b/.github/actions/run-core-tests/group_3/action.yml new file mode 100644 index 00000000000..578673326f9 --- /dev/null +++ b/.github/actions/run-core-tests/group_3/action.yml @@ -0,0 +1,24 @@ +name: "Run core Modin tests - group 3" +description: "Run core Modin tests like dataframe or groupby" +inputs: + runner: + description: "Runner for tests" + default: "python -m pytest" + parallel: + description: "How to run tests in parallel" + default: "-n 2" + +runs: + using: "composite" + steps: + - run: | + echo "::group::Running tests (group 3)..." + ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_series.py \ + modin/pandas/test/dataframe/test_map_metadata.py + echo "::endgroup::" + shell: bash -l {0} + - run: | + echo "::group::Running experimental groupby tests (group 3)..." + MODIN_EXPERIMENTAL_GROUPBY=1 ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_groupby.py + echo "::endgroup::" + shell: bash -l {0} diff --git a/.github/actions/run-core-tests/group_4/action.yml b/.github/actions/run-core-tests/group_4/action.yml new file mode 100644 index 00000000000..a3588b1469b --- /dev/null +++ b/.github/actions/run-core-tests/group_4/action.yml @@ -0,0 +1,27 @@ +name: "Run core Modin tests - group 4" +description: "Run core Modin tests like dataframe or groupby" +inputs: + runner: + description: "Runner for tests" + default: "python -m pytest" + parallel: + description: "How to run tests in parallel" + default: "-n 2" + +runs: + using: "composite" + steps: + - run: | + echo "::group::Running tests (group 4)..." + ${{ inputs.runner }} ${{ inputs.parallel }} modin/pandas/test/test_rolling.py \ + modin/pandas/test/test_expanding.py \ + modin/pandas/test/test_groupby.py \ + modin/pandas/test/test_reshape.py \ + modin/pandas/test/test_general.py + echo "::endgroup::" + shell: bash -l {0} + - run: | + echo "::group::Running concat tests (group 4)..." 
+ ${{ inputs.runner }} modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2 + echo "::endgroup::" + shell: bash -l {0} diff --git a/.github/workflows/upload-coverage/action.yml b/.github/actions/upload-coverage/action.yml similarity index 100% rename from .github/workflows/upload-coverage/action.yml rename to .github/actions/upload-coverage/action.yml diff --git a/.github/workflows/ci-notebooks.yml b/.github/workflows/ci-notebooks.yml index 196a421edbc..b632604aa28 100644 --- a/.github/workflows/ci-notebooks.yml +++ b/.github/workflows/ci-notebooks.yml @@ -7,6 +7,7 @@ on: - .github/workflows/ci-notebooks.yml - setup.cfg - setup.py + - requirements/env_hdk.yml concurrency: # Cancel other jobs in the same branch. We don't care whether CI passes # on old commits. @@ -26,41 +27,19 @@ jobs: execution: [pandas_on_ray, pandas_on_dask, pandas_on_unidist, hdk_on_native] steps: - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - uses: actions/setup-python@v4 - with: - python-version: "3.8.x" - architecture: "x64" + - uses: ./.github/actions/python-only if: matrix.execution != 'hdk_on_native' - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('requirements/env_hdk.yml') }} - if: matrix.execution == 'hdk_on_native' - - uses: conda-incubator/setup-miniconda@v2 + - uses: ./.github/actions/mamba-env with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin_on_hdk environment-file: requirements/env_hdk.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false + activate-environment: modin_on_hdk if: matrix.execution == 'hdk_on_native' - name: Cache datasets uses: actions/cache@v2 with: path: taxi.csv # update cache only if notebooks require it to be changed - key: hashFiles("examples/tutorial/jupyter/**") + key: taxi-csv-dataset-${{ hashFiles('examples/tutorial/jupyter/**') }} # replace modin with . in the tutorial requirements file for `pandas_on_ray` and # `pandas_on_dask` since we need Modin built from sources - run: sed -i 's/modin/./g' examples/tutorial/jupyter/execution/${{ matrix.execution }}/requirements.txt diff --git a/.github/workflows/ci-required.yml b/.github/workflows/ci-required.yml index 8b74e67eaaf..4a07a2aea82 100644 --- a/.github/workflows/ci-required.yml +++ b/.github/workflows/ci-required.yml @@ -5,7 +5,11 @@ concurrency: # on old commits. group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} +env: + MODIN_GITHUB_CI: true + jobs: + check-pr-title: runs-on: ubuntu-latest steps: @@ -14,6 +18,7 @@ jobs: # NOTE: If you change the allowed prefixes here, update # the documentation about them in /docs/development/contributing.rst regexp: '^(?:FEAT|DOCS|FIX|REFACTOR|TEST|PERF)-#\d+:' + build-docs: name: build docs runs-on: ubuntu-latest @@ -29,5 +34,77 @@ jobs: cache-dependency-path: '**/requirements-doc.txt' - run: pip install -r docs/requirements-doc.txt - run: cd docs && sphinx-build -T -E -W -b html . 
build -env: - MODIN_GITHUB_CI: true + + lint-pydocstyle: + name: lint (pydocstyle) + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/python-only + # The `numpydoc` version here MUST match the versions in the dev requirements files. + - run: pip install pytest pytest-cov pydocstyle numpydoc==1.1.0 xgboost + - run: python -m pytest scripts/test + - run: pip install -e ".[all]" + - run: | + python scripts/doc_checker.py --add-ignore=D101,D102,D103,D105 --disable-numpydoc \ + modin/pandas/dataframe.py modin/pandas/series.py \ + modin/pandas/groupby.py \ + modin/pandas/series_utils.py modin/pandas/general.py \ + modin/pandas/plotting.py modin/pandas/utils.py \ + modin/pandas/iterator.py modin/pandas/indexing.py \ + - run: python scripts/doc_checker.py modin/core/dataframe + - run: python scripts/doc_checker.py modin/core/execution/dask + - run: | + python scripts/doc_checker.py \ + modin/pandas/accessor.py modin/pandas/general.py \ + modin/pandas/groupby.py modin/pandas/indexing.py \ + modin/pandas/iterator.py modin/pandas/plotting.py \ + modin/pandas/series_utils.py modin/pandas/utils.py \ + modin/pandas/base.py \ + modin/pandas/io.py \ + asv_bench/benchmarks/utils \ + asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \ + asv_bench/benchmarks/scalability/__init__.py \ + modin/core/io \ + modin/experimental/core/execution/ray/implementations/pandas_on_ray \ + modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \ + modin/pandas/series.py \ + modin/core/execution/python \ + modin/pandas/dataframe.py \ + modin/config/__init__.py \ + modin/config/__main__.py \ + modin/config/envvars.py \ + modin/config/pubsub.py + - run: python scripts/doc_checker.py modin/distributed + - run: python scripts/doc_checker.py modin/utils.py + - run: python scripts/doc_checker.py modin/experimental/sklearn + - run: | + python scripts/doc_checker.py modin/experimental/xgboost/__init__.py \ + modin/experimental/xgboost/utils.py modin/experimental/xgboost/xgboost.py \ + modin/experimental/xgboost/xgboost_ray.py + - run: python scripts/doc_checker.py modin/core/execution/ray + - run: | + python scripts/doc_checker.py modin/core/execution/dispatching/factories/factories.py \ + modin/core/execution/dispatching/factories/dispatcher.py \ + - run: python scripts/doc_checker.py scripts/doc_checker.py + - run: | + python scripts/doc_checker.py modin/experimental/pandas/io.py \ + modin/experimental/pandas/numpy_wrap.py modin/experimental/pandas/__init__.py + - run: python scripts/doc_checker.py modin/core/storage_formats/base + - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow + - run: python scripts/doc_checker.py modin/core/storage_formats/pandas + - run: | + python scripts/doc_checker.py \ + modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe \ + modin/experimental/core/execution/native/implementations/hdk_on_native/io \ + modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning \ + modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py \ + modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py \ + modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py \ + modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py \ + modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py \ + 
modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py \ + - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/hdk + - run: python scripts/doc_checker.py modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol + - run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py + - run: python scripts/doc_checker.py modin/logging diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a1ad7c5076a..4a7b4f8cd53 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -5,6 +5,7 @@ on: # NOTE: keep these paths in sync with the paths that trigger the # fuzzydata Github Actions in .github/workflows/fuzzydata-test.yml - .github/workflows/** + - .github/actions/** - '!.github/workflows/push-to-master.yml' - asv_bench/** - modin/** @@ -23,18 +24,14 @@ concurrency: cancel-in-progress: ${{ startsWith(github.ref, 'refs/pull/') }} env: MODIN_GITHUB_CI: true + jobs: lint-black: name: lint (black) runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - uses: actions/setup-python@v4 - with: - python-version: "3.8.x" - architecture: "x64" + - uses: ./.github/actions/python-only - run: pip install black # NOTE: keep the black command here in sync with the pre-commit hook in # /contributing/pre-commit @@ -45,106 +42,16 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - uses: actions/setup-python@v4 - with: - python-version: "3.8.x" - architecture: "x64" + - uses: ./.github/actions/python-only - run: pip install -r requirements-dev.txt - run: mypy --config-file mypy.ini - lint-pydocstyle: - if: github.event_name == 'pull_request' - name: lint (pydocstyle) - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - uses: actions/setup-python@v4 - with: - python-version: "3.8.x" - architecture: "x64" - # The `numpydoc` version here MUST match the versions in the dev requirements files. 
- - run: pip install pytest pytest-cov pydocstyle numpydoc==1.1.0 xgboost - - run: python -m pytest scripts/test - - run: pip install -e ".[all]" - - run: | - python scripts/doc_checker.py --add-ignore=D101,D102,D103,D105 --disable-numpydoc \ - modin/pandas/dataframe.py modin/pandas/series.py \ - modin/pandas/groupby.py \ - modin/pandas/series_utils.py modin/pandas/general.py \ - modin/pandas/plotting.py modin/pandas/utils.py \ - modin/pandas/iterator.py modin/pandas/indexing.py \ - - run: python scripts/doc_checker.py modin/core/dataframe - - run: python scripts/doc_checker.py modin/core/execution/dask - - run: | - python scripts/doc_checker.py \ - modin/pandas/accessor.py modin/pandas/general.py \ - modin/pandas/groupby.py modin/pandas/indexing.py \ - modin/pandas/iterator.py modin/pandas/plotting.py \ - modin/pandas/series_utils.py modin/pandas/utils.py \ - modin/pandas/base.py \ - modin/pandas/io.py \ - asv_bench/benchmarks/utils \ - asv_bench/benchmarks/__init__.py asv_bench/benchmarks/io/__init__.py \ - asv_bench/benchmarks/scalability/__init__.py \ - modin/core/io \ - modin/experimental/core/execution/ray/implementations/pandas_on_ray \ - modin/experimental/core/execution/ray/implementations/pyarrow_on_ray \ - modin/pandas/series.py \ - modin/core/execution/python \ - modin/pandas/dataframe.py \ - modin/config/__init__.py \ - modin/config/__main__.py \ - modin/config/envvars.py \ - modin/config/pubsub.py - - run: python scripts/doc_checker.py modin/distributed - - run: python scripts/doc_checker.py modin/utils.py - - run: python scripts/doc_checker.py modin/experimental/sklearn - - run: | - python scripts/doc_checker.py modin/experimental/xgboost/__init__.py \ - modin/experimental/xgboost/utils.py modin/experimental/xgboost/xgboost.py \ - modin/experimental/xgboost/xgboost_ray.py - - run: python scripts/doc_checker.py modin/core/execution/ray - - run: | - python scripts/doc_checker.py modin/core/execution/dispatching/factories/factories.py \ - modin/core/execution/dispatching/factories/dispatcher.py \ - - run: python scripts/doc_checker.py scripts/doc_checker.py - - run: | - python scripts/doc_checker.py modin/experimental/pandas/io.py \ - modin/experimental/pandas/numpy_wrap.py modin/experimental/pandas/__init__.py - - run: python scripts/doc_checker.py modin/core/storage_formats/base - - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/pyarrow - - run: python scripts/doc_checker.py modin/core/storage_formats/pandas - - run: | - python scripts/doc_checker.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe \ - modin/experimental/core/execution/native/implementations/hdk_on_native/io \ - modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning \ - modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py \ - modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py \ - - run: python scripts/doc_checker.py modin/experimental/core/storage_formats/hdk - - run: python scripts/doc_checker.py 
modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol - - run: python scripts/doc_checker.py modin/experimental/batch/pipeline.py - - run: python scripts/doc_checker.py modin/logging - lint-flake8: name: lint (flake8) runs-on: ubuntu-latest steps: - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - uses: actions/setup-python@v4 - with: - python-version: "3.8.x" - architecture: "x64" + - uses: ./.github/actions/python-only # NOTE: If you are changing the set of packages installed here, make sure that # the dev requirements match them. - run: pip install flake8 flake8-print flake8-no-implicit-concat @@ -152,130 +59,47 @@ jobs: # /contributing/pre-commit - run: flake8 modin/ asv_bench/benchmarks scripts/doc_checker.py - test-api: + test-api-and-no-engine: + name: Test API, headers and no-engine mode runs-on: ubuntu-latest - name: test api defaults: run: - # `shell: bash -l {0}` - special way to activate modin environment shell: bash -l {0} steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin - environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - - run: sudo apt update && sudo apt install -y libhdf5-dev - - name: Api tests - run: python -m pytest modin/pandas/test/test_api.py - - name: Executions Api tests - run: python -m pytest modin/test/test_executions_api.py - - test-headers: - runs-on: ubuntu-latest - name: test-headers - defaults: - run: - shell: bash -l {0} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin - environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - - name: Headers tests - run: python -m pytest modin/test/test_headers.py - - test-clean-install-ubuntu: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - name: test-clean-install-ubuntu - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - uses: actions/setup-python@v4 - with: - python-version: "3.8.x" - architecture: "x64" - - name: Clean install and run - run: | - python -m pip install -e ".[all]" - MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" - MODIN_ENGINE=ray python -c "import modin.pandas as pd; 
print(pd.DataFrame([1,2,3]))" - MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" + environment-file: requirements/requirements-no-engine.yml + - run: python -m pytest modin/pandas/test/test_api.py + - run: python -m pytest modin/test/test_executions_api.py + - run: python -m pytest modin/test/test_headers.py + - run: python -m pytest modin/core/execution/dispatching/factories/test/test_dispatcher.py::test_add_option + - uses: ./.github/actions/upload-coverage - test-clean-install-windows: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] - runs-on: windows-latest + test-clean-install: + needs: [lint-flake8, lint-black] + strategy: + matrix: + os: + - ubuntu + - windows + runs-on: ${{ matrix.os }}-latest defaults: run: shell: bash -l {0} - name: test-clean-install-windows + name: test-clean-install-${{ matrix.os }} steps: - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - uses: actions/setup-python@v4 - with: - python-version: "3.8.x" - architecture: "x64" - - name: Clean install and run + - uses: ./.github/actions/python-only + - run: python -m pip install -e ".[all]" + - name: Ensure all engines start up run: | - python -m pip install -e ".[all]" MODIN_ENGINE=dask python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" MODIN_ENGINE=ray python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" MODIN_ENGINE=unidist UNIDIST_BACKEND=mpi mpiexec -n 1 python -c "import modin.pandas as pd; print(pd.DataFrame([1,2,3]))" test-internals: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -283,32 +107,9 @@ jobs: name: test-internals steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - name: Internals tests run: python -m pytest modin/core/execution/dispatching/factories/test/test_dispatcher.py modin/experimental/cloud/test/test_cloud.py - run: python -m pytest modin/config/test @@ -320,46 +121,10 @@ jobs: - run: python -m pytest asv_bench/test/test_utils.py - run: python -m pytest modin/test/interchange/dataframe_protocol/base - run: python -m pytest modin/test/test_logging.py - - uses: ./.github/workflows/upload-coverage - - test-no-engine: - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('requirements-no-engine.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin - environment-file: requirements/requirements-no-engine.yml - 
python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - - run: python -m pytest modin/core/execution/dispatching/factories/test/test_dispatcher.py::test_add_option - - uses: ./.github/workflows/upload-coverage + - uses: ./.github/actions/upload-coverage test-defaults: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -372,58 +137,20 @@ jobs: name: Test ${{ matrix.execution }} execution, Python 3.8 steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 2 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev - run: python -m pytest modin/experimental/xgboost/test/test_default.py --execution=${{ matrix.execution }} - run: python -m pytest -n 2 modin/test/storage_formats/base/test_internals.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_binary.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_default.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_indexing.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_iter.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_join_sort.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_reduce.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_udf.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_window.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/test_series.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/test_expanding.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/test_concat.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }} - - run: MODIN_EXPERIMENTAL_GROUPBY=1 python -m pytest -n 2 
modin/pandas/test/test_groupby.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py --execution=${{ matrix.execution }} - - run: python -m pytest -n 2 modin/pandas/test/test_general.py --execution=${{ matrix.execution }} - - uses: ./.github/workflows/upload-coverage + - uses: ./.github/actions/run-core-tests + with: + runner: python -m pytest --execution=${{ matrix.execution }} + - uses: ./.github/actions/upload-coverage test-hdk: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -443,32 +170,10 @@ jobs: AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 2 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('requirements/env_hdk.yml') }} - - name: Setting up Modin environment - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin_on_hdk environment-file: requirements/env_hdk.yml - python-version: 3.8 - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list + activate-environment: modin_on_hdk - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev - run: python -m pytest modin/test/storage_formats/hdk/test_internals.py @@ -498,11 +203,11 @@ jobs: examples/data/plasticc_training_set_metadata_1k.csv \ examples/data/plasticc_test_set_metadata_1k.csv \ -no-ml - - uses: ./.github/workflows/upload-coverage + - uses: ./.github/actions/upload-coverage test-asv-benchmarks: if: github.event_name == 'pull_request' - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -528,14 +233,15 @@ jobs: pip install git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9 - name: Running benchmarks run: | - # ASV correctly creates environments for testing only from the branch - # with `master` name - git checkout -b master - cd asv_bench - asv check -v git remote add upstream https://github.com/modin-project/modin.git git fetch upstream if git diff upstream/master --name-only | grep -q "^asv_bench/"; then + # ASV correctly creates environments for testing only from the branch + # with `master` name + git checkout -b master + cd asv_bench + asv check -v + asv machine --yes # check Modin on Ray @@ -567,8 +273,43 @@ jobs: path: asv_bench/benchmarks.log if: failure() + execution-filter: + # see if execution backend-specific changes were made + runs-on: ubuntu-latest + outputs: + ray: ${{ steps.filter.outputs.ray }} + dask: ${{ steps.filter.outputs.dask }} + unidist: ${{ steps.filter.outputs.unidist }} + engines: ${{ steps.engines.outputs.engines }} + experimental: ${{ steps.experimental.outputs.experimental }} + steps: + - uses: actions/checkout@v3 + - uses: dorny/paths-filter@v2 + id: filter + with: + filters: | + shared: &shared + - 'modin/core/execution/dispatching/**' + ray: + - *shared + - 'modin/core/execution/ray/**' + dask: + - *shared + - 'modin/core/execution/dask/**' + unidist: + - *shared + - 'modin/core/execution/unidist/**' + 
experimental: + - 'modin/experimental/**' + - uses: actions/setup-python@v4 + - id: engines + run: | + python -c "import sys, json; print('engines=' + json.dumps(['python'] + (sys.argv[1] == 'true' and ['ray'] or []) + (sys.argv[2] == 'true' and ['dask'] or []) ))" \ + "${{ steps.filter.outputs.ray }}" "${{ steps.filter.outputs.dask }}" >> $GITHUB_OUTPUT + test-all-unidist: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black, execution-filter] + if: github.event_name == 'push' || needs.execution-filter.outputs.unidist == 'true' runs-on: ubuntu-latest defaults: run: @@ -596,32 +337,11 @@ jobs: AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 2 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('requirements/env_unidist.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin_on_unidist environment-file: requirements/env_unidist.yml + activate-environment: modin_on_unidist python-version: ${{matrix.python-version}} - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev - name: Set up postgres @@ -630,36 +350,14 @@ jobs: run: | sudo docker pull postgres sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres - - run: MODIN_BENCHMARK_MODE=True mpiexec -n 1 python -m pytest modin/pandas/test/internals/test_benchmark_mode.py + - run: mpiexec -n 1 python -m pytest modin/pandas/test/internals/test_benchmark_mode.py - run: mpiexec -n 1 python -m pytest modin/pandas/test/internals/test_repartition.py - run: mpiexec -n 1 python -m pytest modin/test/test_partition_api.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_binary.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_default.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_indexing.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_iter.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_join_sort.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_map_metadata.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_reduce.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_udf.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_window.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/dataframe/test_pickle.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_series.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_rolling.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_expanding.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_concat.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_groupby.py - - run: MODIN_EXPERIMENTAL_GROUPBY=1 mpiexec -n 1 python -m pytest modin/pandas/test/test_groupby.py - - run: mpiexec -n 1 
python -m pytest modin/pandas/test/test_reshape.py - - run: mpiexec -n 1 python -m pytest modin/pandas/test/test_general.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_creation.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_arithmetic.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_axis_functions.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_logic.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_linalg.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_indexing.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_math.py - - run: mpiexec -n 1 python -m pytest modin/numpy/test/test_array_shaping.py + - uses: ./.github/actions/run-core-tests + with: + runner: mpiexec -n 1 python -m pytest + parallel: "" + - run: mpiexec -n 1 python -m pytest modin/numpy/test - run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh - run: ./.github/workflows/sql_server/set_up_sql_server.sh # need an extra argument "genv" to set environment variables for mpiexec. We need @@ -672,18 +370,17 @@ jobs: - run: | python -m pip install lazy_import mpiexec -n 1 python -m pytest modin/pandas/test/integrations/ - - uses: ./.github/workflows/upload-coverage + - uses: ./.github/actions/upload-coverage test-all: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] - runs-on: ubuntu-latest - defaults: - run: - shell: bash -l {0} + needs: [lint-flake8, lint-black, execution-filter] strategy: matrix: + os: + - ubuntu + - windows python-version: ["3.8"] - engine: ["python", "ray", "dask"] + engine: ${{ fromJSON( github.event_name == 'push' && '["python", "ray", "dask"]' || needs.execution-filter.outputs.engines ) }} test_task: - group_1 - group_2 @@ -696,21 +393,23 @@ jobs: test_task: "group_3" - engine: "python" test_task: "group_4" + runs-on: ${{ matrix.os }}-latest + defaults: + run: + shell: bash -l {0} env: MODIN_ENGINE: ${{matrix.engine}} # Only test reading from SQL server and postgres on ubuntu for now. # Eventually, we should test on Windows, too, but we will have to set up # the servers differently. - MODIN_TEST_READ_FROM_SQL_SERVER: true - MODIN_TEST_READ_FROM_POSTGRES: true - name: test-ubuntu (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}}) + MODIN_TEST_READ_FROM_SQL_SERVER: ${{ matrix.os == 'ubuntu' }} + MODIN_TEST_READ_FROM_POSTGRES: ${{ matrix.os == 'ubuntu' }} + name: test-${{ matrix.os }} (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}}) services: - # This service only needs to run for test_task group_4; however, GitHub does not - # currently support conditionally running services. This issue: - # is open https://github.com/actions/runner/issues/822 - until GitHub implements this feature, - # we will just have to run `moto` for all groups. 
+ # Using workaround https://github.com/actions/runner/issues/822#issuecomment-1524826092 moto: - image: motoserver/moto + # we only need moto service on Ubuntu and for group_4 task or python engine + image: ${{ (matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')) && 'motoserver/moto' || '' }} ports: - 5000:5000 env: @@ -719,43 +418,36 @@ jobs: steps: - name: Limit ray memory run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV - if: matrix.engine == 'ray' + if: matrix.os == 'ubuntu' && matrix.engine == 'ray' + - name: Tell Modin to use existing ray cluster + run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV + if: matrix.os == 'windows' && matrix.engine == 'ray' - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 2 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list + - name: Start local ray cluster + # Try a few times to start ray to work around + # https://github.com/modin-project/modin/issues/4562 + uses: nick-fields/retry@v2 + with: + timeout_minutes: 5 + max_attempts: 5 + command: ray start --head --port=6379 --object-store-memory=1000000000 + if: matrix.os == 'windows' && matrix.engine == 'ray' - name: Install HDF5 run: sudo apt update && sudo apt install -y libhdf5-dev + if: matrix.os == 'ubuntu' - name: Set up postgres # Locally, specifying port 2345:5432 works, but 2345:2345 and 5432:5432 do not. 
This solution is from # https://stackoverflow.com/questions/36415654/cant-connect-docker-postgresql-9-3 run: | sudo docker pull postgres sudo docker run --name some-postgres -e POSTGRES_USER=sa -e POSTGRES_PASSWORD=Strong.Pwd-123 -e POSTGRES_DB=postgres -d -p 2345:5432 postgres - - run: MODIN_BENCHMARK_MODE=True python -m pytest modin/pandas/test/internals/test_benchmark_mode.py + if: matrix.os == 'ubuntu' + + - run: python -m pytest modin/pandas/test/internals/test_benchmark_mode.py if: matrix.engine == 'python' || matrix.test_task == 'group_1' - run: python -m pytest modin/pandas/test/internals/test_repartition.py if: matrix.engine == 'python' || matrix.test_task == 'group_1' @@ -764,76 +456,33 @@ jobs: - run: python -m pytest -n 2 modin/experimental/xgboost/test/test_default.py if: matrix.engine == 'python' || matrix.test_task == 'group_1' - run: python -m pytest -n 2 modin/experimental/xgboost/test/test_xgboost.py - if: matrix.engine == 'ray' && matrix.test_task == 'group_1' + if: matrix.os == 'ubuntu' && matrix.engine == 'ray' && matrix.test_task == 'group_1' - run: python -m pytest -n 2 modin/experimental/xgboost/test/test_dmatrix.py if: matrix.engine == 'ray' && matrix.test_task == 'group_1' - run: python -m pytest -n 2 modin/experimental/batch/test/test_pipeline.py if: matrix.engine == 'python' || matrix.test_task == 'group_1' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_binary.py - if: matrix.engine == 'python' || matrix.test_task == 'group_1' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_default.py + - uses: ./.github/actions/run-core-tests/group_1 if: matrix.engine == 'python' || matrix.test_task == 'group_1' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_indexing.py - if: matrix.engine == 'python' || matrix.test_task == 'group_1' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_iter.py - if: matrix.engine == 'python' || matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_join_sort.py - if: matrix.engine == 'python' || matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_reduce.py - if: matrix.engine == 'python' || matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_udf.py - if: matrix.engine == 'python' || matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_window.py - if: matrix.engine == 'python' || matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py + - uses: ./.github/actions/run-core-tests/group_2 if: matrix.engine == 'python' || matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py - if: matrix.engine == 'python' || matrix.test_task == 'group_3' - - run: python -m pytest -n 2 modin/pandas/test/test_series.py - if: matrix.engine == 'python' || matrix.test_task == 'group_3' - - run: MODIN_EXPERIMENTAL_GROUPBY=1 python -m pytest -n 2 modin/pandas/test/test_groupby.py + - uses: ./.github/actions/run-core-tests/group_3 if: matrix.engine == 'python' || matrix.test_task == 'group_3' - - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_expanding.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2 - 
if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_creation.py + - uses: ./.github/actions/run-core-tests/group_4 if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_arithmetic.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_axis_functions.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_logic.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_linalg.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_indexing.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_math.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_shaping.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_general.py + - run: python -m pytest -n 2 modin/numpy/test if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: chmod +x ./.github/workflows/sql_server/set_up_sql_server.sh - if: matrix.engine == 'python' || matrix.test_task == 'group_4' + if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4') - run: ./.github/workflows/sql_server/set_up_sql_server.sh - if: matrix.engine == 'python' || matrix.test_task == 'group_4' + if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4') # Do not add parallelism (`-n` argument) here - it will cause mock S3 service to fail. - run: python -m pytest modin/pandas/test/test_io.py --verbose + timeout-minutes: 60 if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: python -m pytest modin/experimental/pandas/test/test_io_exp.py if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: pip install "dfsql>=0.4.2" "pyparsing<=2.4.7" && python -m pytest modin/experimental/sql/test/test_sql.py - if: matrix.engine == 'python' || matrix.test_task == 'group_4' + if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4') - run: python -m pytest modin/test/interchange/dataframe_protocol/test_general.py if: matrix.engine == 'python' || matrix.test_task == 'group_4' - run: python -m pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py @@ -842,10 +491,139 @@ jobs: python -m pip install lazy_import python -m pytest modin/pandas/test/integrations/ if: matrix.engine == 'python' || matrix.test_task == 'group_4' - - uses: ./.github/workflows/upload-coverage + + - uses: ./.github/actions/upload-coverage + - name: Stop local ray cluster + run: ray stop + if: matrix.os == 'windows' && matrix.engine == 'ray' + - name: Rename the dirs with conda packages so it won't be deleted, it's too slow on Windows. 
+ run: | + mkdir -p "${CONDA_PKGS_DIR}_do_not_cache" && \ + find "${CONDA_PKGS_DIR}" -mindepth 1 -maxdepth 1 -type d -exec mv {} "${CONDA_PKGS_DIR}_do_not_cache" \; + if: matrix.os == 'windows' + + test-sanity: + needs: [lint-flake8, lint-black, execution-filter] + if: github.event_name == 'pull_request' + strategy: + matrix: + os: + - ubuntu + - windows + python-version: ["3.8"] + execution: + - name: ray + shell-ex: "python -m pytest" + if: needs.execution-filter.ray != 'true' + - name: dask + shell-ex: "python -m pytest" + if: needs.execution-filter.dask != 'true' + - name: unidist + shell-ex: "mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest" + if: needs.execution-filter.unidist != 'true' + runs-on: ${{ matrix.os }}-latest + defaults: + run: + shell: bash -l {0} + env: + MODIN_ENGINE: ${{ matrix.execution.name }} + UNIDIST_BACKEND: "mpi" + PARALLEL: ${{ matrix.execution.name != 'unidist' && matrix.os != 'windows' && '-n 2' || '' }} + name: test-${{ matrix.os }}-sanity (engine ${{ matrix.execution.name }}, python ${{matrix.python-version}}) + services: + moto: + image: ${{ matrix.os != 'windows' && 'motoserver/moto' || '' }} + ports: + - 5000:5000 + env: + AWS_ACCESS_KEY_ID: foobar_key + AWS_SECRET_ACCESS_KEY: foobar_secret + steps: + - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env + with: + environment-file: ${{ matrix.execution.name == 'unidist' && 'requirements/env_unidist.yml' || 'environment-dev.yml' }} + activate-environment: ${{ matrix.execution.name == 'unidist' && 'modin_on_unidist' || 'modin' }} + python-version: ${{matrix.python-version}} + - name: Install HDF5 + run: sudo apt update && sudo apt install -y libhdf5-dev + if: matrix.os != 'windows' + - name: Limit ray memory + run: echo "MODIN_MEMORY=1000000000" >> $GITHUB_ENV + if: matrix.os != 'windows' && matrix.execution.name == 'ray' + - name: Tell Modin to use existing ray cluster + run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV + if: matrix.os == 'windows' && matrix.execution.name == 'ray' + - name: Start local ray cluster + # Try a few times to start ray to work around + # https://github.com/modin-project/modin/issues/4562 + uses: nick-fields/retry@v2 + with: + timeout_minutes: 5 + max_attempts: 5 + command: ray start --head --port=6379 --object-store-memory=1000000000 + if: matrix.os == 'windows' && matrix.execution.name == 'ray' + - run: MODIN_BENCHMARK_MODE=True ${{ matrix.execution.shell-ex }} modin/pandas/test/internals/test_benchmark_mode.py + - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/pandas/test/internals/test_repartition.py + - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/test_partition_api.py + - name: xgboost tests + run: | + ${{ matrix.execution.shell-ex }} $PARALLEL \ + modin/experimental/xgboost/test/test_default.py \ + modin/experimental/xgboost/test/test_xgboost.py \ + modin/experimental/xgboost/test/test_dmatrix.py + if: matrix.os != 'windows' && matrix.execution.name == 'ray' && needs.execution-filter.experimental == 'true' + - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/experimental/batch/test/test_pipeline.py + if: matrix.os != 'windows' && matrix.execution.name != 'unidist' && needs.execution-filter.experimental == 'true' + - name: "test DF: binary, default, iter" + run: | + ${{ matrix.execution.shell-ex }} $PARALLEL \ + modin/pandas/test/dataframe/test_binary.py \ + modin/pandas/test/dataframe/test_default.py \ + modin/pandas/test/dataframe/test_iter.py + if: matrix.os != 'windows' + 
- name: "test DF: reduce, udf, window, pickle" + run: | + ${{ matrix.execution.shell-ex }} $PARALLEL \ + modin/pandas/test/dataframe/test_reduce.py \ + modin/pandas/test/dataframe/test_udf.py \ + modin/pandas/test/dataframe/test_window.py \ + modin/pandas/test/dataframe/test_pickle.py + if: matrix.os != 'windows' + - run: ${{ matrix.execution.shell-ex }} modin/pandas/test/test_series.py + if: matrix.execution.name == 'ray' + - run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_series.py + if: matrix.execution.name != 'ray' + - run: ${{ matrix.execution.shell-ex }} modin/pandas/test/dataframe/test_map_metadata.py + if: matrix.execution.name == 'ray' + - run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/dataframe/test_map_metadata.py + if: matrix.execution.name != 'ray' + - name: "test rolling, expanding, reshape, general, concat" + run: | + ${{ matrix.execution.shell-ex }} $PARALLEL \ + modin/pandas/test/test_rolling.py \ + modin/pandas/test/test_expanding.py \ + modin/pandas/test/test_reshape.py \ + modin/pandas/test/test_general.py \ + modin/pandas/test/test_concat.py + if: matrix.os != 'windows' + - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/numpy/test + - run: ${{ matrix.execution.shell-ex }} -m "not exclude_in_sanity" modin/pandas/test/test_io.py --verbose + - run: ${{ matrix.execution.shell-ex }} modin/experimental/pandas/test/test_io_exp.py + - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/test_general.py + - run: ${{ matrix.execution.shell-ex }} $PARALLEL modin/test/interchange/dataframe_protocol/pandas/test_protocol.py + - name: Stop local ray cluster + run: ray stop + if: matrix.os == 'windows' && matrix.execution.name == 'ray' + - name: Rename the dirs with conda packages so it won't be deleted, it's too slow on Windows. + run: | + mkdir -p "${CONDA_PKGS_DIR}_do_not_cache" && \ + find "${CONDA_PKGS_DIR}" -mindepth 1 -maxdepth 1 -type d -exec mv {} "${CONDA_PKGS_DIR}_do_not_cache" \; + if: matrix.os == 'windows' + - uses: ./.github/actions/upload-coverage test-experimental: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -864,40 +642,17 @@ jobs: AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 2 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py - run: python -m pytest -n 2 modin/pandas/test/test_series.py # Do not add parallelism (`-n` argument) here - it will cause mock S3 service to fail. 
- run: python -m pytest modin/pandas/test/test_io.py --verbose - - uses: ./.github/workflows/upload-coverage + - uses: ./.github/actions/upload-coverage test-cloud: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -916,157 +671,19 @@ jobs: AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 2 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list # TODO(https://github.com/modin-project/modin/issues/4004): Re-add # "python -m pytest --simulate-cloud=normal modin/pandas/test/test_io.py --verbose" # once that test stops crashing. - run: python -m pytest --simulate-cloud=normal modin/pandas/test/dataframe/test_default.py::test_kurt_kurtosis --verbose - # When running without parameters, some of the tests fail run: python -m pytest --simulate-cloud=normal modin/pandas/test/dataframe/test_binary.py::test_math_functions[add-rows-scalar] - - uses: ./.github/workflows/upload-coverage - - test-windows: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] - runs-on: windows-latest - defaults: - run: - shell: bash -l {0} - strategy: - matrix: - python-version: ["3.8"] - engine: ["ray", "dask"] - test_task: - - group_1 - - group_2 - - group_3 - - group_4 - env: - MODIN_ENGINE: ${{matrix.engine}} - name: test-windows (engine ${{matrix.engine}}, python ${{matrix.python-version}}, ${{matrix.test_task}}) - steps: - - uses: actions/checkout@v3 - with: - fetch-depth: 2 - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin - environment-file: environment-dev.yml - python-version: ${{matrix.python-version}} - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - - name: Start local ray cluster - # Try a few times to start ray to work around - # https://github.com/modin-project/modin/issues/4562 - uses: nick-fields/retry@v2 - with: - timeout_minutes: 5 - max_attempts: 5 - command: | - ray start --head --port=6379 --object-store-memory=1000000000 - if: matrix.engine == 'ray' - - name: Tell Modin to use existing ray cluster - run: echo "MODIN_RAY_CLUSTER=True" >> $GITHUB_ENV - if: matrix.engine == 'ray' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_binary.py - if: matrix.test_task == 'group_1' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_default.py - if: matrix.test_task == 'group_1' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_indexing.py - if: matrix.test_task == 'group_1' - - run: python -m 
pytest -n 2 modin/pandas/test/dataframe/test_iter.py - if: matrix.test_task == 'group_1' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_join_sort.py - if: matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_reduce.py - if: matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_udf.py - if: matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_window.py - if: matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_pickle.py - if: matrix.test_task == 'group_2' - - run: python -m pytest -n 2 modin/pandas/test/test_series.py - if: matrix.test_task == 'group_3' - - run: python -m pytest -n 2 modin/pandas/test/dataframe/test_map_metadata.py - if: matrix.test_task == 'group_3' - - run: MODIN_EXPERIMENTAL_GROUPBY=1 python -m pytest -n 2 modin/pandas/test/test_groupby.py - if: matrix.test_task == 'group_3' - - run: python -m pytest -n 2 modin/pandas/test/test_rolling.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_expanding.py - if: matrix.test_task == 'group_4' - - run: python -m pytest modin/pandas/test/test_concat.py # Ray and Dask versions fails with -n 2 - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_creation.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_arithmetic.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_axis_functions.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_logic.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_linalg.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_indexing.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_math.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/numpy/test/test_array_shaping.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_groupby.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_reshape.py - if: matrix.test_task == 'group_4' - - run: python -m pytest -n 2 modin/pandas/test/test_general.py - if: matrix.test_task == 'group_4' - - timeout-minutes: 60 - run: python -m pytest modin/pandas/test/test_io.py --verbose - if: matrix.test_task == 'group_4' - - uses: ./.github/workflows/upload-coverage - - name: Stop local ray cluster - run: ray stop - if: matrix.engine == 'ray' - - name: Rename the folder with conda packages so it won't be deleted, it's too slow on Windows. 
- run: mv "${CONDA_PKGS_DIR}" "${CONDA_PKGS_DIR}_do_not_cache" + - uses: ./.github/actions/upload-coverage test-pyarrow: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -1088,37 +705,15 @@ jobs: AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - run: sudo apt update && sudo apt install -y libhdf5-dev - run: python -m pytest modin/pandas/test/test_io.py::TestCsv --verbose test-spreadsheet: - needs: [lint-flake8, lint-black, lint-mypy, test-api, test-headers] + needs: [lint-flake8, lint-black] runs-on: ubuntu-latest defaults: run: @@ -1133,49 +728,26 @@ jobs: name: test-spreadsheet (engine ${{matrix.engine}}, python ${{matrix.python-version}}) steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml python-version: ${{matrix.python-version}} - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - run: python -m pytest modin/experimental/spreadsheet/test/test_general.py upload-coverage: - needs: [test-internals, test-no-engine, test-defaults, test-hdk, test-all-unidist, test-all, test-experimental, test-cloud, test-windows] + needs: [test-internals, test-api-and-no-engine, test-defaults, test-hdk, test-all-unidist, test-all, test-experimental, test-cloud, test-sanity] + if: always() # we need to run it regardless of some job being skipped, like in PR runs-on: ubuntu-latest defaults: run: shell: bash -l {0} steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 1 + - uses: actions/checkout@v3 + - uses: ./.github/actions/python-only - name: Download coverage data uses: actions/download-artifact@v3.0.2 with: name: coverage-data - - uses: actions/setup-python@v4 - run: pip install coverage - name: Combine coverage run: python -m coverage combine @@ -1183,4 +755,4 @@ jobs: run: python -m coverage xml - uses: codecov/codecov-action@v3 with: - fail_ci_if_error: true + fail_ci_if_error: ${{ github.event_name == 'push' }} # do not care about uploads in PR diff --git a/.github/workflows/fuzzydata-test.yml b/.github/workflows/fuzzydata-test.yml index f16a6f18e7b..b0407c16137 
100644 --- a/.github/workflows/fuzzydata-test.yml +++ b/.github/workflows/fuzzydata-test.yml @@ -34,32 +34,10 @@ jobs: engine: ["ray", "dask"] steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list + python-version: ${{matrix.python-version}} - name: test-fuzzydata (engine ${{matrix.engine}}, python ${{matrix.python-version}}) run: python -m pytest modin/experimental/fuzzydata/test/test_fuzzydata.py -Wignore::UserWarning --log-file=/tmp/fuzzydata-test-wf-${{matrix.engine}}/run.log --log-file-level=INFO env: diff --git a/.github/workflows/push-to-master.yml b/.github/workflows/push-to-master.yml index fc3b8bdc451..d5152bd98d3 100644 --- a/.github/workflows/push-to-master.yml +++ b/.github/workflows/push-to-master.yml @@ -27,28 +27,9 @@ jobs: AWS_SECRET_ACCESS_KEY: foobar_secret steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - name: install Ray nightly build # Use --force-reinstall to always reinstall ray and its dependencies. # botocore isn't compatible with urllib3>=2; see #6094 for details @@ -97,32 +78,9 @@ jobs: name: test docs steps: - uses: actions/checkout@v3 + - uses: ./.github/actions/mamba-env with: - fetch-depth: 1 - - name: Cache conda - uses: actions/cache@v3 - with: - path: | - ~/conda_pkgs_dir - ~/.cache/pip - key: - ${{ runner.os }}-conda-${{ hashFiles('environment-dev.yml') }} - - uses: conda-incubator/setup-miniconda@v2 - with: - miniforge-variant: Mambaforge - miniforge-version: latest - use-mamba: true - activate-environment: modin environment-file: environment-dev.yml - python-version: 3.8 - channel-priority: strict - # we set use-only-tar-bz2 to false in order for conda to properly find new packages to be installed - # for more info see https://github.com/conda-incubator/setup-miniconda/issues/264 - use-only-tar-bz2: false - - name: Conda environment - run: | - conda info - conda list - run: sudo apt update && sudo apt install -y libhdf5-dev - name: Docstring URL validity check run: python -m pytest modin/test/test_docstring_urls.py diff --git a/README.md b/README.md index 5a118cbd602..4029350db18 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ + PyPI version

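For reference, the upload-coverage job above downloads each job's coverage artifact and merges it before uploading to codecov; the workflow itself only runs the CLI (`coverage combine`, `coverage xml`), but a minimal sketch of the same combine-then-report step driven through the coverage.py API looks like this (the output filename "coverage.xml" is an assumption for illustration, not taken from the workflow):

    import coverage

    cov = coverage.Coverage()
    cov.combine()   # merge the per-job .coverage.* data files pulled from the artifact
    cov.save()      # write the merged .coverage file
    cov.xml_report(outfile="coverage.xml")  # same result as `python -m coverage xml`
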
diff --git a/asv_bench/asv.conf.dask.json b/asv_bench/asv.conf.dask.json index cc12302b149..30e44e14821 100644 --- a/asv_bench/asv.conf.dask.json +++ b/asv_bench/asv.conf.dask.json @@ -48,7 +48,7 @@ // timeout in seconds for installing any dependencies in environment // defaults to 10 min - //"install_timeout": 600, + "install_timeout": 6000, // the base URL to show a commit for the project. "show_commit_url": "https://github.com/modin-project/modin/commit/", diff --git a/asv_bench/asv.conf.hdk.json b/asv_bench/asv.conf.hdk.json index 7d8b947ced1..317becb9fc8 100644 --- a/asv_bench/asv.conf.hdk.json +++ b/asv_bench/asv.conf.hdk.json @@ -25,6 +25,10 @@ // variable. "environment_type": "conda", + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + "install_timeout": 6000, + // the base URL to show a commit for the project. "show_commit_url": "https://github.com/modin-project/modin/commit/", diff --git a/asv_bench/asv.conf.json b/asv_bench/asv.conf.json index fc7a3d99525..234004dbbb2 100644 --- a/asv_bench/asv.conf.json +++ b/asv_bench/asv.conf.json @@ -48,7 +48,7 @@ // timeout in seconds for installing any dependencies in environment // defaults to 10 min - //"install_timeout": 600, + "install_timeout": 6000, // the base URL to show a commit for the project. "show_commit_url": "https://github.com/modin-project/modin/commit/", diff --git a/asv_bench/asv.conf.unidist.json b/asv_bench/asv.conf.unidist.json index df011617ea3..b8e04c5bb1e 100644 --- a/asv_bench/asv.conf.unidist.json +++ b/asv_bench/asv.conf.unidist.json @@ -48,7 +48,7 @@ // timeout in seconds for installing any dependencies in environment // defaults to 10 min - //"install_timeout": 600, + "install_timeout": 6000, // the base URL to show a commit for the project. "show_commit_url": "https://github.com/modin-project/modin/commit/", diff --git a/asv_bench/benchmarks/scalability/scalability_benchmarks.py b/asv_bench/benchmarks/scalability/scalability_benchmarks.py index 05a465b2ab8..f9850ff1999 100644 --- a/asv_bench/benchmarks/scalability/scalability_benchmarks.py +++ b/asv_bench/benchmarks/scalability/scalability_benchmarks.py @@ -17,7 +17,7 @@ from modin.pandas.utils import from_pandas try: - from modin.utils import to_pandas + from modin.utils import to_pandas, to_numpy except ImportError: # This provides compatibility with older versions of the Modin, allowing us to test old commits. 
from modin.pandas.utils import to_pandas @@ -70,4 +70,22 @@ def time_to_pandas(self, shape, cpus): to_pandas(self.data) +class TimeToNumPy: + param_names = ["shape", "cpus"] + params = [ + get_benchmark_shapes("TimeToNumPy"), + [4, 16, 32], + ] + + def setup(self, shape, cpus): + from modin.config import NPartitions + + NPartitions.get = lambda: cpus + self.data = generate_dataframe("int", *shape, RAND_LOW, RAND_HIGH, impl="modin") + + def time_to_numpy(self, shape, cpus): + # to_numpy is already synchronous + to_numpy(self.data) + + from ..utils import setup # noqa: E402, F401 diff --git a/asv_bench/benchmarks/utils/common.py b/asv_bench/benchmarks/utils/common.py index e67ca677de9..9a0a1dab276 100644 --- a/asv_bench/benchmarks/utils/common.py +++ b/asv_bench/benchmarks/utils/common.py @@ -459,17 +459,8 @@ def trigger_import(*dfs): if ASV_USE_STORAGE_FORMAT != "hdk" or ASV_USE_IMPL == "pandas": return - from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( - DbWorker, - ) - for df in dfs: - df.shape # to trigger real execution - df._query_compiler._modin_frame._partitions[0][ - 0 - ].frame_id = DbWorker().import_arrow_table( - df._query_compiler._modin_frame._partitions[0][0].get() - ) # to trigger real execution + df._query_compiler._modin_frame.force_import() def execute( diff --git a/asv_bench/benchmarks/utils/data_shapes.py b/asv_bench/benchmarks/utils/data_shapes.py index 98d58aa4291..af3ce71014f 100644 --- a/asv_bench/benchmarks/utils/data_shapes.py +++ b/asv_bench/benchmarks/utils/data_shapes.py @@ -116,6 +116,7 @@ # Scalability benchmarks "TimeFromPandas", "TimeToPandas", + "TimeToNumPy", ], ), ( diff --git a/codecov.yml b/codecov.yml index 69cb76019a4..95adf7b6b26 100644 --- a/codecov.yml +++ b/codecov.yml @@ -1 +1,11 @@ comment: false +coverage: + status: + project: + default: + branches: + - master + target: 85% + patch: + default: + target: 30% diff --git a/docs/conf.py b/docs/conf.py index 1e6ac43a891..9cf86535675 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -13,6 +13,7 @@ import ray + # stub ray.remote to be a no-op so it doesn't shadow docstrings def noop_decorator(*args, **kwargs): if len(args) == 1 and len(kwargs) == 0 and callable(args[0]): @@ -24,7 +25,7 @@ def noop_decorator(*args, **kwargs): ray.remote = noop_decorator # fake modules if they're missing -for mod_name in ("cudf", "cupy", "pyarrow.gandiva", "pyhdk"): +for mod_name in ("cudf", "cupy", "pyarrow.gandiva", "pyhdk", "pyhdk.hdk"): try: __import__(mod_name) except ImportError: @@ -37,6 +38,17 @@ def noop_decorator(*args, **kwargs): sys.modules["cupy"].ndarray = type("ndarray", (object,), {}) if not hasattr(sys.modules["pyhdk"], "PyDbEngine"): sys.modules["pyhdk"].PyDbEngine = type("PyDbEngine", (object,), {}) +if not hasattr(sys.modules["pyhdk.hdk"], "HDK"): + sys.modules["pyhdk.hdk"].HDK = type("HDK", (object,), {}) +if not hasattr(sys.modules["pyhdk.hdk"], "QueryNode"): + sys.modules["pyhdk.hdk"].QueryNode = type("QueryNode", (object,), {}) +if not hasattr(sys.modules["pyhdk.hdk"], "ExecutionResult"): + sys.modules["pyhdk.hdk"].ExecutionResult = type("ExecutionResult", (object,), {}) +if not hasattr(sys.modules["pyhdk.hdk"], "RelAlgExecutor"): + sys.modules["pyhdk.hdk"].RelAlgExecutor = type("RelAlgExecutor", (object,), {}) +if not hasattr(sys.modules["pyhdk"], "__version__"): + # Show all known pyhdk config options in documentation + sys.modules["pyhdk"].__version__ = "999" sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))) import 
modin diff --git a/docs/development/using_hdk.rst b/docs/development/using_hdk.rst index 63455178462..86a6b39ffc5 100644 --- a/docs/development/using_hdk.rst +++ b/docs/development/using_hdk.rst @@ -34,4 +34,22 @@ If for some reasons ``Native`` engine is explicitly set using ``modin.config`` o If you encounter ``LLVM ERROR: inconsistency in registered CommandLine options`` error when using HDK, please refer to the respective section in :doc:`Troubleshooting ` page to avoid the issue. -.. _HDK: https://github.com/intel-ai/hdk \ No newline at end of file + +Running on a GPU +---------------- + +Prerequisites: + +* HDK's GPU mode is currently supported on Linux and Intel GPU only. +* HDK supports Gen9 architecture and higher (including Xe & Arc). +* HDK's GPU mode requires proper driver installation. Follow this guide_ to set up your system. Make sure to install the compute runtime packages: ``intel-opencl-icd``, ``intel-level-zero-gpu``, ``level-zero``. +* Make sure your GPU is visible and accessible. + +.. note:: + You can use ``hwinfo`` and ``clinfo`` utilities to verify the driver installation and device accessibility. + +HDK supports a heterogeneous execution mode (experimental) that is disabled by default in Modin. Starting with pyHDK version 0.7 Modin can run the workload on Intel GPU. +Run on a GPU via ``MODIN_HDK_LAUNCH_PARAMETERS="cpu_only=0" python ``. + +.. _HDK: https://github.com/intel-ai/hdk +.. _guide: https://dgpu-docs.intel.com/driver/installation.html \ No newline at end of file diff --git a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst index 8a4e98e2bf9..e3911d36bd0 100644 --- a/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst +++ b/docs/flow/modin/experimental/core/execution/native/implementations/hdk_on_native/index.rst @@ -46,12 +46,16 @@ engine itself and we don't need to manage multiple partitions. :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.dataframe.HdkOnNativeDataframe` always has a single partition. -A partition holds data in either ``pandas.DataFrame`` or ``pyarrow.Table`` +A partition holds data in either ``pandas.DataFrame``, ``pyarrow.Table`` or ``DbTable`` format. ``pandas.DataFrame`` is preferred only when we detect unsupported data type and therefore have to use ``pandas`` framework for processing. -In other cases ``pyarrow.Table`` format is preferred. Arrow tables can be -zero-copy imported into HDK. A query execution result is also -returned as an Arrow table. +The ``pyarrow.Table`` format is used when a ``DataFrame`` is created and until the +table is imported into HDK. When it's imported, the partition data is replaced with +a ``DbTable``. ``DbTable`` represents a table in the HDK database and provides basic +information about the table: table name, column names, shape. It also allows +exporting the data into the ``pyarrow.Table`` format. Depending on the data types, +a ``pyarrow.Table`` import/export could be performed zero-copy. A query execution +result is also returned as a ``DbTable``. Data Ingress ------------ @@ -173,15 +177,15 @@ Arrow execution For simple operations which don't include actual computations, execution can use Arrow API. We can use it to rename columns, drop columns and concatenate -frames. Arrow execution is preferable since it doesn't require actual data import/export -from/to HDK. +frames. 
Arrow execution is performed if we have an arrow table in the partition +and it's preferable since it doesn't require actual data import into HDK. HDK execution ''''''''''''' To execute a query in the HDK engine we need to import data first. We should find all leaves of an operation tree and import their Arrow tables. Partitions -with imported tables hold corresponding table names used to refer to them in +with ``DbTable`` hold corresponding table names used to refer to them in queries. HDK executes queries expressed in HDK-specific intermediate representation (IR) format. @@ -215,9 +219,9 @@ The building of Calcite query (starting from the conversion to the Calcite Algeb the forming JSON query) is orchestrated by :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.partitioning.partition_manager.HdkOnNativeDataframePartitionManager`. -An execution result is a new Arrow table which is used to form a new -partition. This partition is assigned to the executed frame. The frame's -operation tree is replaced with +An execution result is a new table in the HDK database, that is represented by ``DbTable``, +which is used to form a new partition. This partition is assigned to the executed frame. +The frame's operation tree is replaced with :py:class:`~modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra.FrameNode` operation. Rowid column and sub-queries @@ -261,11 +265,11 @@ class. Column name mangling """""""""""""""""""" -In ``pandas.DataFrame`` columns might have names not allowed in SQL (e. g. -an empty string). To handle this we simply add '`F_`' prefix to -column names. Index labels are more tricky because they might be non-unique. -Indexes are represented as regular columns, and we have to perform a special -mangling to get valid and unique column names. Demangling is done when we +In ``pandas.DataFrame`` columns might have names of non-string types or not allowed +in SQL (e. g. an empty string). To handle this we use an internal encoder, that +makes the names SQL-compatible. Index labels are more tricky because they might be +non-unique. Indexes are represented as regular columns, and we have to perform a +special mangling to get valid and unique column names. Demangling is done when we transform our frame (i.e. its Arrow table) into ``pandas.DataFrame`` format. .. toctree:: diff --git a/docs/img/hdk/hdk_calcite_serialization_flow.svg b/docs/img/hdk/hdk_calcite_serialization_flow.svg index 01719049c4f..6914ef81755 100644 --- a/docs/img/hdk/hdk_calcite_serialization_flow.svg +++ b/docs/img/hdk/hdk_calcite_serialization_flow.svg @@ -1,4 +1,4 @@ -
[SVG diagram text; markup lost in extraction: HdkOnNativeDataframe -> PartitionManager.run_execution_plan() -> CalciteBuilder.build() -> CalciteSerializer.serialize() -> JSON query for DB -> HDK Engine -> Arrow Table -> partition holding the resulted Arrow Table]
\ No newline at end of file +
[SVG diagram text; markup lost in extraction: same serialization flow, but the HDK Engine result is now a DbTable and the partition holds the resulted DbTable]
\ No newline at end of file diff --git a/docs/img/hdk/hdk_import.svg b/docs/img/hdk/hdk_import.svg index 34791905cc5..6350102ca58 100644 --- a/docs/img/hdk/hdk_import.svg +++ b/docs/img/hdk/hdk_import.svg @@ -1,4 +1,4 @@ -
[SVG diagram text; markup lost in extraction: HdkOnNativeDataframe -> PartitionManager -> Partition (PyArrow Table / pandas DataFrame) -> request to import a table -> HDK Engine -> imported table name]
\ No newline at end of file +
[SVG diagram text; markup lost in extraction: same import flow, but the HDK Engine now returns a DbTable]
\ No newline at end of file diff --git a/docs/requirements-doc.txt b/docs/requirements-doc.txt index 74626d8d610..620f8a7a560 100644 --- a/docs/requirements-doc.txt +++ b/docs/requirements-doc.txt @@ -12,7 +12,10 @@ pyyaml recommonmark sphinx<6.0.0 sphinx-click -ray[default]>=1.13.0 +# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100 +ray[default]>=1.13.0,!=2.5.0 +# https://github.com/modin-project/modin/issues/6336 +pydantic<2 # Override to latest version of modin-spreadsheet git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 sphinxcontrib_plantuml diff --git a/environment-dev.yml b/environment-dev.yml index a4b70551534..f1cc505ab4d 100644 --- a/environment-dev.yml +++ b/environment-dev.yml @@ -2,21 +2,30 @@ name: modin channels: - conda-forge dependencies: + - pip + + # required dependencies - pandas>=2,<2.1 - numpy>=1.18.5 - - ray-default>=1.13.0 + - fsspec + - packaging + - psutil + + # optional dependencies + # ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100 + - ray-default>=1.13.0,!=2.5.0 + # https://github.com/modin-project/modin/issues/6336 + - pydantic<2 - pyarrow # workaround for https://github.com/conda/conda/issues/11744 - grpcio!=1.45.* - grpcio!=1.46.* - dask>=2.22.0 - distributed>=2.22.0 - - fsspec # TODO: uncomment after Modin switch to python>=3.9 # - xarray - Jinja2 - scipy - - pip - s3fs>=2021.8 - feather-format - lxml @@ -27,43 +36,47 @@ dependencies: - pandas-gbq - pytables - msgpack-python - - psutil - - pytest>=6.0.1 - - pytest-benchmark - - pytest-cov>=2.10.1 - - pytest-xdist>=2.1.0 - - packaging - - coverage - - pygithub - - rpyc==4.1.5 - - cloudpickle - - boto3 - - moto - scikit-learn - pymssql - psycopg2 - # Mypy 0.990 doesn't work: https://github.com/modin-project/modin/issues/5206 - - mypy!=0.990 - - pandas-stubs - fastparquet<2023.1.0 - # for release script - - pygit2 # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost # when we use collective instead of rabit. - xgboost>=1.7.1,<2.0.0 - tqdm + + ## modin in the cloud dependencies + - boto3 + - cloudpickle + - rpyc==4.1.5 + + # dependencies for making release + - pygithub>=v1.58.0 + - pygit2>=1.9.2 + + # test dependencies + - coverage>=7.1.0 + - moto>=4.1.0 + - pytest>=7.2.1 + - pytest-benchmark>=4.0.0 + - pytest-cov>=4.0.0 + - pytest-xdist>=3.2.0 + # code linters - - black - - flake8 - - flake8-no-implicit-concat - - flake8-print + - black>=23.1.0 + - flake8>=6.0.0 + - flake8-no-implicit-concat>=0.3.4 + - flake8-print>=5.0.0 + - mypy>=1.0.0 + - pandas-stubs>=2.0.0 + - pip: + # no conda package for windows so we install it with pip + - connectorx>=0.2.6a4 + # experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies + - fuzzydata>=0.0.6 # Fixes breaking ipywidgets changes, but didn't release yet. - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 - git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9 - # no conda package for windows so we install it with pip - - connectorx>=0.2.6a4 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. 
- numpydoc==1.1.0 - # experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies - - fuzzydata>=0.0.6 diff --git a/examples/docker/modin-hdk/census-hdk.py b/examples/docker/modin-hdk/census-hdk.py index 7d5337c93ce..4d325320a58 100644 --- a/examples/docker/modin-hdk/census-hdk.py +++ b/examples/docker/modin-hdk/census-hdk.py @@ -14,16 +14,8 @@ import sys from utils import measure import modin.pandas as pd -from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( - DbWorker, -) -from sklearn import config_context -import sklearnex -sklearnex.patch_sklearn() -from sklearn.model_selection import train_test_split -import sklearn.linear_model as lm import numpy as np @@ -131,12 +123,8 @@ def read(filename): skiprows=1, ) - df.shape # to trigger real execution - df._query_compiler._modin_frame._partitions[0][ - 0 - ].frame_id = DbWorker().import_arrow_table( - df._query_compiler._modin_frame._partitions[0][0].get() - ) # to trigger real execution + # to trigger real execution and table import + df._query_compiler._modin_frame.force_import() return df @@ -203,6 +191,14 @@ def cod(y_test, y_pred): def ml(X, y, random_state, n_runs, test_size): + # to not install ML dependencies unless required + from sklearn import config_context + import sklearnex + + sklearnex.patch_sklearn() + from sklearn.model_selection import train_test_split + import sklearn.linear_model as lm + clf = lm.Ridge() X = np.ascontiguousarray(X, dtype=np.float64) diff --git a/examples/docker/modin-hdk/nyc-taxi-hdk.py b/examples/docker/modin-hdk/nyc-taxi-hdk.py index ba8ba9de55f..b13d3ab9b23 100644 --- a/examples/docker/modin-hdk/nyc-taxi-hdk.py +++ b/examples/docker/modin-hdk/nyc-taxi-hdk.py @@ -15,9 +15,6 @@ from utils import measure import modin.pandas as pd from modin.pandas.test.utils import df_equals -from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( - DbWorker, -) from modin.experimental.sql import query @@ -145,12 +142,8 @@ def read(filename): parse_dates=dates_only, ) - df.shape # to trigger real execution - df._query_compiler._modin_frame._partitions[0][ - 0 - ].frame_id = DbWorker().import_arrow_table( - df._query_compiler._modin_frame._partitions[0][0].get() - ) # to trigger real execution + # to trigger real execution and table import + df._query_compiler._modin_frame.force_import() return df diff --git a/examples/docker/modin-hdk/plasticc-hdk.py b/examples/docker/modin-hdk/plasticc-hdk.py index 7f7a98f4ced..9a4632ec1ea 100644 --- a/examples/docker/modin-hdk/plasticc-hdk.py +++ b/examples/docker/modin-hdk/plasticc-hdk.py @@ -18,13 +18,6 @@ import modin.pandas as pd import numpy as np -import xgboost as xgb - -import sklearnex - -sklearnex.patch_sklearn() -from sklearn.model_selection import train_test_split -from sklearn.preprocessing import LabelEncoder ################ helper functions ############################### @@ -81,6 +74,9 @@ def all_etl(train, train_meta, test, test_meta): def split_step(train_final, test_final): + from sklearn.model_selection import train_test_split + from sklearn.preprocessing import LabelEncoder + X = train_final.drop(["object_id", "target"], axis=1).values Xt = test_final.drop(["object_id"], axis=1).values @@ -197,6 +193,13 @@ def etl(df, df_meta): def ml(train_final, test_final): + # to not install ML dependencies unless required + import xgboost as xgb + import sklearnex + + sklearnex.patch_sklearn() + + X_train, y_train, X_test, y_test, Xt, classes, 
class_weights = split_step( train_final, test_final ) diff --git a/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt b/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt index 8cc9c67fb3b..d19ba10cd43 100644 --- a/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt +++ b/examples/tutorial/jupyter/execution/pandas_on_ray/requirements.txt @@ -3,4 +3,6 @@ jupyterlab ipywidgets tqdm modin[ray] +# https://github.com/modin-project/modin/issues/6336 +pydantic<2 modin[spreadsheet] diff --git a/modin/config/envvars.py b/modin/config/envvars.py index 87a3f97fdaa..eb54441ae4c 100644 --- a/modin/config/envvars.py +++ b/modin/config/envvars.py @@ -517,14 +517,6 @@ class HdkLaunchParameters(EnvironmentVariable, type=dict): """ varname = "MODIN_HDK_LAUNCH_PARAMETERS" - default = { - "enable_union": 1, - "enable_columnar_output": 1, - "enable_lazy_fetch": 0, - "null_div_by_zero": 1, - "enable_watchdog": 0, - "enable_thrift_logs": 0, - } @classmethod def get(cls) -> dict: @@ -557,12 +549,44 @@ def _get(cls) -> dict: Decoded and verified config value. """ custom_parameters = super().get() - result = cls.default.copy() + result = cls._get_default().copy() result.update( {key.replace("-", "_"): value for key, value in custom_parameters.items()} ) return result + @classmethod + def _get_default(cls) -> Any: + """ + Get default value of the config. Checks the pyhdk version and omits variables unsupported in prior versions. + + Returns + ------- + dict + Config keys and corresponding values. + """ + if (default := getattr(cls, "default", None)) is None: + cls.default = default = { + "enable_union": 1, + "enable_columnar_output": 1, + "enable_lazy_fetch": 0, + "null_div_by_zero": 1, + "enable_watchdog": 0, + "enable_thrift_logs": 0, + "cpu_only": 1, + } + + try: + import pyhdk + + if version.parse(pyhdk.__version__) >= version.parse("0.6.1"): + default["enable_lazy_dict_materialization"] = 0 + default["log_dir"] = "pyhdk_log" + except ImportError: + # if pyhdk is not available, do not show any additional options + pass + return default + class OmnisciLaunchParameters(HdkLaunchParameters, type=dict): """ diff --git a/modin/config/test/test_envvars.py b/modin/config/test/test_envvars.py index 01ed1c9304f..0abfa6dc5ac 100644 --- a/modin/config/test/test_envvars.py +++ b/modin/config/test/test_envvars.py @@ -16,6 +16,8 @@ import modin.config as cfg from modin.config.envvars import EnvironmentVariable, _check_vars, ExactStr +from packaging import version + @pytest.fixture def make_unknown_env(): @@ -63,9 +65,22 @@ def test_custom_help(make_custom_envvar): def test_hdk_envvar(): + try: + import pyhdk + + defaults = cfg.HdkLaunchParameters.get() + assert defaults["enable_union"] == 1 + if version.parse(pyhdk.__version__) >= version.parse("0.6.1"): + assert defaults["log_dir"] == "pyhdk_log" + del cfg.HdkLaunchParameters._value + except ImportError: + # This test is intended to check pyhdk internals. If pyhdk is not available, skip the version check test. 
+ pass + os.environ[ cfg.OmnisciLaunchParameters.varname ] = "enable_union=2,enable_thrift_logs=3" + del cfg.OmnisciLaunchParameters._value params = cfg.OmnisciLaunchParameters.get() assert params["enable_union"] == 2 assert params["enable_thrift_logs"] == 3 @@ -74,11 +89,27 @@ def test_hdk_envvar(): assert params["enable_union"] == 2 assert params["enable_thrift_logs"] == 3 - os.environ[cfg.HdkLaunchParameters.varname] = "enable_union=4,enable_thrift_logs=5" + os.environ[cfg.HdkLaunchParameters.varname] = "unsupported=X" + params = cfg.HdkLaunchParameters.get() + assert params["unsupported"] == "X" + try: + import pyhdk + + pyhdk.buildConfig(**cfg.HdkLaunchParameters.get()) + except RuntimeError as e: + assert str(e) == "unrecognised option '--unsupported'" + except ImportError: + # This test is intended to check pyhdk internals. If pyhdk is not available, skip the version check test. + pass + + os.environ[ + cfg.HdkLaunchParameters.varname + ] = "enable_union=4,enable_thrift_logs=5,enable_lazy_dict_materialization=6" del cfg.HdkLaunchParameters._value params = cfg.HdkLaunchParameters.get() assert params["enable_union"] == 4 assert params["enable_thrift_logs"] == 5 + assert params["enable_lazy_dict_materialization"] == 6 params = cfg.OmnisciLaunchParameters.get() assert params["enable_union"] == 2 diff --git a/modin/conftest.py b/modin/conftest.py index 0647e8360fd..789d5ca331e 100644 --- a/modin/conftest.py +++ b/modin/conftest.py @@ -62,6 +62,7 @@ def _saving_make_api_url(token, _make_api_url=modin.utils._make_api_url): CIAWSAccessKeyID, CIAWSSecretAccessKey, AsyncReadMode, + BenchmarkMode, ) import uuid # noqa: E402 @@ -558,6 +559,14 @@ def set_num_partitions(request): NPartitions.put(old_num_partitions) +@pytest.fixture() +def set_benchmark_mode(request): + old_benchmark_mode = BenchmarkMode.get() + BenchmarkMode.put(request.param) + yield + BenchmarkMode.put(old_benchmark_mode) + + @pytest.fixture def set_async_read_mode(request): old_async_read_mode = AsyncReadMode.get() diff --git a/modin/core/dataframe/algebra/binary.py b/modin/core/dataframe/algebra/binary.py index b3d011dfff4..cdffd3c9bb0 100644 --- a/modin/core/dataframe/algebra/binary.py +++ b/modin/core/dataframe/algebra/binary.py @@ -382,6 +382,7 @@ def caller( lambda x, y: func(x, y, *args, **kwargs), [other._modin_frame], join_type=join_type, + labels=labels, dtypes=dtypes, ), shape_hint=shape_hint, diff --git a/modin/core/dataframe/algebra/default2pandas/groupby.py b/modin/core/dataframe/algebra/default2pandas/groupby.py index 1c6256cd24b..d5bd9cd4a20 100644 --- a/modin/core/dataframe/algebra/default2pandas/groupby.py +++ b/modin/core/dataframe/algebra/default2pandas/groupby.py @@ -601,10 +601,13 @@ def register(cls, func, **kwargs): # 2. `.apply(func)` applies func to a DataFrames, holding a whole group (group-wise). # 3. `.transform(func)` is the same as `.apply()` but also broadcast the `func` # result to the group's original shape. + # 4. 'direct' mode means that the passed `func` has to be applied directly + # to the `pandas.DataFrameGroupBy` object. 
_aggregation_methods_dict = { "axis_wise": pandas.core.groupby.DataFrameGroupBy.aggregate, "group_wise": pandas.core.groupby.DataFrameGroupBy.apply, "transform": pandas.core.groupby.DataFrameGroupBy.transform, + "direct": lambda grp, func, *args, **kwargs: func(grp, *args, **kwargs), } @classmethod @@ -637,4 +640,5 @@ class SeriesGroupByDefault(GroupByDefault): "axis_wise": pandas.core.groupby.SeriesGroupBy.aggregate, "group_wise": pandas.core.groupby.SeriesGroupBy.apply, "transform": pandas.core.groupby.SeriesGroupBy.transform, + "direct": lambda grp, func, *args, **kwargs: func(grp, *args, **kwargs), } diff --git a/modin/core/dataframe/algebra/default2pandas/rolling.py b/modin/core/dataframe/algebra/default2pandas/rolling.py index 0259849225e..cdc6c7f5480 100644 --- a/modin/core/dataframe/algebra/default2pandas/rolling.py +++ b/modin/core/dataframe/algebra/default2pandas/rolling.py @@ -37,9 +37,9 @@ def _build_rolling(cls, func): Function that takes pandas DataFrame and applies `func` on a rolling window. """ - def fn(df, rolling_args, *args, **kwargs): + def fn(df, rolling_kwargs, *args, **kwargs): """Create rolling window for the passed frame and execute specified `func` on it.""" - roller = df.rolling(*rolling_args) + roller = df.rolling(**rolling_kwargs) if type(func) == property: return func.fget(roller) diff --git a/modin/core/dataframe/base/dataframe/dataframe.py b/modin/core/dataframe/base/dataframe/dataframe.py index 44c8efa8695..07274788977 100644 --- a/modin/core/dataframe/base/dataframe/dataframe.py +++ b/modin/core/dataframe/base/dataframe/dataframe.py @@ -209,7 +209,7 @@ def window( Notes ----- - The user-defined reduce function must reduce each window’s column + The user-defined reduce function must reduce each window's column (row if axis=1) down to a single value. """ pass @@ -467,7 +467,7 @@ def from_labels(self) -> "ModinDataframe": Notes ----- - In the case that the dataframe has hierarchical labels, all label "levels” are inserted into the dataframe + In the case that the dataframe has hierarchical labels, all label "levels" are inserted into the dataframe in the order they occur in the labels, with the outermost being in position 0. 
""" pass diff --git a/modin/core/dataframe/pandas/dataframe/dataframe.py b/modin/core/dataframe/pandas/dataframe/dataframe.py index 44736aa6753..db15c489a4a 100644 --- a/modin/core/dataframe/pandas/dataframe/dataframe.py +++ b/modin/core/dataframe/pandas/dataframe/dataframe.py @@ -23,7 +23,11 @@ import datetime from pandas.api.types import is_object_dtype from pandas.core.indexes.api import Index, RangeIndex -from pandas.core.dtypes.common import is_numeric_dtype, is_list_like +from pandas.core.dtypes.common import ( + is_numeric_dtype, + is_list_like, + is_categorical_dtype, +) from pandas._libs.lib import no_default from typing import List, Hashable, Optional, Callable, Union, Dict, TYPE_CHECKING @@ -2896,20 +2900,6 @@ def _prepare_frame_to_broadcast(self, axis, indices, broadcast_all): passed_len += len(internal) return result_dict - def __make_init_labels_args(self, partitions, index, columns) -> dict: - kw = {} - kw["index"], kw["row_lengths"] = ( - self._compute_axis_labels_and_lengths(0, partitions) - if index is None - else (index, None) - ) - kw["columns"], kw["column_widths"] = ( - self._compute_axis_labels_and_lengths(1, partitions) - if columns is None - else (columns, None) - ) - return kw - @lazy_metadata_decorator(apply_axis="both") def broadcast_apply_select_indices( self, @@ -2988,9 +2978,9 @@ def broadcast_apply_select_indices( broadcasted_dict, keep_remaining, ) - - kw = self.__make_init_labels_args(new_partitions, new_index, new_columns) - return self.__constructor__(new_partitions, **kw) + return self.__constructor__( + new_partitions, index=new_index, columns=new_columns + ) @lazy_metadata_decorator(apply_axis="both") def broadcast_apply_full_axis( @@ -3280,6 +3270,7 @@ def n_ary_op( right_frames: list, join_type="outer", copartition_along_columns=True, + labels="replace", dtypes=None, ): """ @@ -3296,6 +3287,9 @@ def n_ary_op( copartition_along_columns : bool, default: True Whether to perform copartitioning along columns or not. For some ops this isn't needed (e.g., `fillna`). + labels : {"replace", "drop"}, default: "replace" + Whether use labels from joined DataFrame or drop altogether to make + them be computed lazily later. dtypes : series, default: None Dtypes of the resultant dataframe, this argument will be received if the resultant dtypes of n-opary operation is precomputed. @@ -3346,6 +3340,8 @@ def n_ary_op( left_parts, op, list_of_right_parts ) ) + if labels == "drop": + joined_index = joined_columns = row_lengths = column_widths = None return self.__constructor__( new_frame, @@ -3544,7 +3540,7 @@ def groupby( by = [by] def apply_func(df): # pragma: no cover - if any(dtype == "category" for dtype in df.dtypes[by].values): + if any(is_categorical_dtype(dtype) for dtype in df.dtypes[by].values): raise NotImplementedError( "Reshuffling groupby is not yet supported when grouping on a categorical column. 
" + "https://github.com/modin-project/modin/issues/5925" @@ -3564,7 +3560,7 @@ def apply_func(df): # pragma: no cover return result - @lazy_metadata_decorator(apply_axis="opposite", axis_arg=0) + @lazy_metadata_decorator(apply_axis="both") def groupby_reduce( self, axis, @@ -3612,11 +3608,16 @@ def groupby_reduce( self._get_dict_of_block_index(axis ^ 1, numeric_indices).keys() ) + if by_parts is not None: + # inplace operation + if by_parts.shape[axis] != self._partitions.shape[axis]: + self._filter_empties(compute_metadata=False) new_partitions = self._partition_mgr_cls.groupby_reduce( axis, self._partitions, by_parts, map_func, reduce_func, apply_indices ) - kw = self.__make_init_labels_args(new_partitions, new_index, new_columns) - return self.__constructor__(new_partitions, **kw) + return self.__constructor__( + new_partitions, index=new_index, columns=new_columns + ) @classmethod def from_pandas(cls, df): @@ -3724,6 +3725,8 @@ def to_pandas(self): df = self._partition_mgr_cls.to_pandas(self._partitions) if df.empty: df = pandas.DataFrame(columns=self.columns, index=self.index) + if len(df.columns) and self.has_materialized_dtypes: + df = df.astype(self.dtypes) else: for axis, has_external_index in enumerate( ["has_materialized_index", "has_materialized_columns"] diff --git a/modin/core/dataframe/pandas/partitioning/partition.py b/modin/core/dataframe/pandas/partitioning/partition.py index 2c00983c65a..6f7666212d8 100644 --- a/modin/core/dataframe/pandas/partitioning/partition.py +++ b/modin/core/dataframe/pandas/partitioning/partition.py @@ -38,6 +38,24 @@ class PandasDataframePartition(ABC): # pragma: no cover _width_cache = None _identity_cache = None _data = None + execution_wrapper = None + + # these variables are intentionally initialized at runtime + # so as not to initialize the engine during import + _iloc_func = None + + def __init__(self): + if type(self)._iloc_func is None: + # Places `_iloc` function into the storage to speed up + # remote function calls and caches the result. + # It also postpones engine initialization, which happens + # implicitly when `execution_wrapper.put` is called. 
+ if self.execution_wrapper is not None: + type(self)._iloc_func = staticmethod( + self.execution_wrapper.put(self._iloc) + ) + else: + type(self)._iloc_func = staticmethod(self._iloc) @cache_readonly def __constructor__(self): @@ -236,7 +254,7 @@ def is_full_axis_mask(index, axis_length): ): return copy(self) - new_obj = self.add_to_apply_calls(self._iloc, row_labels, col_labels) + new_obj = self.add_to_apply_calls(self._iloc_func, row_labels, col_labels) def try_recompute_cache(indices, previous_cache): """Compute new axis-length cache for the masked frame based on its previous cache.""" diff --git a/modin/core/dataframe/pandas/partitioning/partition_manager.py b/modin/core/dataframe/pandas/partitioning/partition_manager.py index 3a5f639c866..7dce9702f22 100644 --- a/modin/core/dataframe/pandas/partitioning/partition_manager.py +++ b/modin/core/dataframe/pandas/partitioning/partition_manager.py @@ -241,6 +241,13 @@ def groupby_reduce( ) if by is not None: + # need to make sure that the partitioning of the following objects + # coincides in the required axis, because `partition_manager.broadcast_apply` + # doesn't call `_copartition` unlike `modin_frame.broadcast_apply` + assert partitions.shape[axis] == by.shape[axis], ( + f"the number of partitions along {axis=} is not equal: " + + f"{partitions.shape[axis]} != {by.shape[axis]}" + ) mapped_partitions = cls.broadcast_apply( axis, map_func, left=partitions, right=by ) @@ -1546,6 +1553,7 @@ def rebalance_partitions(cls, partitions): return new_partitions, lengths @classmethod + @wait_computations_if_benchmark_mode def shuffle_partitions( cls, partitions, index, shuffle_functions, final_shuffle_func ): diff --git a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py index c7101b62100..6173da7b94b 100644 --- a/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py +++ b/modin/core/execution/dask/implementations/pandas_on_dask/partitioning/partition.py @@ -43,6 +43,7 @@ class PandasOnDaskDataframePartition(PandasDataframePartition): execution_wrapper = DaskWrapper def __init__(self, data, length=None, width=None, ip=None, call_queue=None): + super().__init__() assert isinstance(data, Future) self._data = data if call_queue is None: diff --git a/modin/core/execution/dispatching/factories/dispatcher.py b/modin/core/execution/dispatching/factories/dispatcher.py index c42f856d838..64323f5b928 100644 --- a/modin/core/execution/dispatching/factories/dispatcher.py +++ b/modin/core/execution/dispatching/factories/dispatcher.py @@ -155,7 +155,16 @@ def _update_factory(cls, _): raise FactoryNotFoundError(msg.format(factory_name)) cls.__factory = StubFactory.set_failing_name(factory_name) else: - cls.__factory.prepare() + try: + cls.__factory.prepare() + except ModuleNotFoundError as err: + # incorrectly initialized, should be reset to None again + # so that an unobvious error does not appear in the following code: + # "AttributeError: 'NoneType' object has no attribute 'from_non_pandas'" + cls.__factory = None + raise ModuleNotFoundError( + f"Make sure all required packages are installed: {str(err)}" + ) from err @classmethod @_inherit_docstrings(factories.BaseFactory._from_pandas) diff --git a/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py b/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py index ae3601da162..059f3ae2286 100644 --- 
a/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py +++ b/modin/core/execution/python/implementations/pandas_on_python/partitioning/partition.py @@ -45,6 +45,7 @@ class PandasOnPythonDataframePartition(PandasDataframePartition): execution_wrapper = PythonWrapper def __init__(self, data, length=None, width=None, call_queue=None): + super().__init__() if hasattr(data, "copy"): data = data.copy() self._data = data diff --git a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py index c391d3b54f9..75dd47f0ba9 100644 --- a/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py +++ b/modin/core/execution/ray/implementations/cudf_on_ray/partitioning/partition.py @@ -43,6 +43,7 @@ class cuDFOnRayDataframePartition(PandasDataframePartition): """ def __init__(self, gpu_manager, key, length=None, width=None): + super().__init__() self.gpu_manager = gpu_manager self.key = key self._length_cache = length diff --git a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py index 1804ba67dab..a7caabf2f9d 100644 --- a/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py +++ b/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/partition.py @@ -46,6 +46,7 @@ class PandasOnRayDataframePartition(PandasDataframePartition): execution_wrapper = RayWrapper def __init__(self, data, length=None, width=None, ip=None, call_queue=None): + super().__init__() assert isinstance(data, ObjectIDType) self._data = data if call_queue is None: @@ -173,10 +174,6 @@ def __copy__(self): call_queue=self.call_queue, ) - # If Ray has not been initialized yet by Modin, - # it will be initialized when calling `RayWrapper.put`. - _iloc = execution_wrapper.put(PandasDataframePartition._iloc) - def mask(self, row_labels, col_labels): """ Lazily create a mask that extracts the indices provided. diff --git a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py index c880e1dfe50..fa1c71993cd 100644 --- a/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py +++ b/modin/core/execution/unidist/implementations/pandas_on_unidist/partitioning/partition.py @@ -45,6 +45,7 @@ class PandasOnUnidistDataframePartition(PandasDataframePartition): execution_wrapper = UnidistWrapper def __init__(self, data, length=None, width=None, ip=None, call_queue=None): + super().__init__() assert unidist.is_object_ref(data) self._data = data self.call_queue = call_queue if call_queue is not None else [] @@ -150,11 +151,6 @@ def wait(self): self.drain_call_queue() UnidistWrapper.wait(self._data) - # If unidist has not been initialized yet by Modin, - # unidist itself handles initialization when calling `unidist.put`, - # which is called inside of `UnidistWrapper.put`. - _iloc = execution_wrapper.put(PandasDataframePartition._iloc) - def mask(self, row_labels, col_labels): """ Lazily create a mask that extracts the indices provided. 
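The partition classes above now call super().__init__(), which lets the base PandasDataframePartition place _iloc into the engine's object store once per class instead of at import time. A condensed, engine-free sketch of that caching pattern follows; the wrapper here is a hypothetical stand-in, not one of Modin's real engine wrappers, and real wrappers would put the function into the Ray/Dask/unidist object store rather than return it unchanged:

    import pandas as pd

    class FakeWrapper:
        @staticmethod
        def put(obj):
            # stand-in: a real execution wrapper would store `obj` remotely here
            return obj

    class Partition:
        execution_wrapper = FakeWrapper
        _iloc_func = None  # filled on first instantiation, not at import time

        def __init__(self):
            if type(self)._iloc_func is None:
                # caching the stored handle once avoids re-serializing `_iloc`
                # for every remote call and delays engine start-up until a
                # partition actually exists
                type(self)._iloc_func = staticmethod(
                    self.execution_wrapper.put(self._iloc)
                )

        @staticmethod
        def _iloc(df, row_labels, col_labels):
            return df.iloc[row_labels, col_labels]

    # usage: the cached handle behaves like the original static method
    Partition()
    print(Partition._iloc_func(pd.DataFrame({"a": [1, 2, 3]}), [0, 2], [0]))
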
diff --git a/modin/core/io/text/excel_dispatcher.py b/modin/core/io/text/excel_dispatcher.py index eb1fd884d1c..4fd40b6424f 100644 --- a/modin/core/io/text/excel_dispatcher.py +++ b/modin/core/io/text/excel_dispatcher.py @@ -147,7 +147,7 @@ def _read(cls, io, **kwargs): ex.shared_strings, False, ) - if cls.need_rich_text_param: + if cls.need_rich_text_param(): reader = WorksheetReader(*common_args, rich_text=False) else: reader = WorksheetReader(*common_args) diff --git a/modin/core/io/text/text_file_dispatcher.py b/modin/core/io/text/text_file_dispatcher.py index 14447fc7b11..158212b37ec 100644 --- a/modin/core/io/text/text_file_dispatcher.py +++ b/modin/core/io/text/text_file_dispatcher.py @@ -1105,6 +1105,14 @@ def _read(cls, filepath_or_buffer, **kwargs): if can_compute_metadata_while_skipping_rows: pd_df_metadata = pd_df_metadata_temp + # compute dtypes if possible + common_dtypes = None + if kwargs["dtype"] is None: + most_common_dtype = (object,) + common_dtypes = {} + for col, dtype in pd_df_metadata.dtypes.to_dict().items(): + if dtype in most_common_dtype: + common_dtypes[col] = dtype column_names = pd_df_metadata.columns column_widths, num_splits = cls._define_metadata(pd_df_metadata, column_names) # kwargs that will be passed to the workers @@ -1117,6 +1125,7 @@ def _read(cls, filepath_or_buffer, **kwargs): skiprows=None, nrows=None, compression=compression_infered, + common_dtypes=common_dtypes, ) # this is done mostly for performance; see PR#5678 for details filepath_or_buffer_md_ref = cls.put(filepath_or_buffer_md) diff --git a/modin/core/storage_formats/base/doc_utils.py b/modin/core/storage_formats/base/doc_utils.py index 847ddaecd2f..8b58af073ad 100644 --- a/modin/core/storage_formats/base/doc_utils.py +++ b/modin/core/storage_formats/base/doc_utils.py @@ -598,11 +598,11 @@ def doc_window_method( if action is None: action = f"compute {result}" if win_type == "rolling window": - window_args_name = "rolling_args" + window_args_name = "rolling_kwargs" elif win_type == "expanding window": window_args_name = "expanding_args" else: - window_args_name = "window_args" + window_args_name = "window_kwargs" # We need that `params` value ended with new line to have # an empty line between "parameters" and "return" sections diff --git a/modin/core/storage_formats/base/query_compiler.py b/modin/core/storage_formats/base/query_compiler.py index a535c4ca52d..188960a19a1 100644 --- a/modin/core/storage_formats/base/query_compiler.py +++ b/modin/core/storage_formats/base/query_compiler.py @@ -579,6 +579,10 @@ def combine_first(self, other, **kwargs): # noqa: PR02 def eq(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.eq)(self, other=other, **kwargs) + @doc_utils.add_refer_to("DataFrame.equals") + def equals(self, other): # noqa: PR01, RT01 + return BinaryDefault.register(pandas.DataFrame.equals)(self, other=other) + @doc_utils.doc_binary_method(operation="integer division", sign="//") def floordiv(self, other, **kwargs): # noqa: PR02 return BinaryDefault.register(pandas.DataFrame.floordiv)( @@ -3021,6 +3025,68 @@ def groupby_size( result.columns = result.columns[:-1].append(pandas.Index(["size"])) return result + @doc_utils.add_refer_to("GroupBy.rolling") + def groupby_rolling( + self, + by, + agg_func, + axis, + groupby_kwargs, + rolling_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + """ + Group QueryCompiler data and apply passed aggregation function to a rolling window in each group. 
+ + Parameters + ---------- + by : BaseQueryCompiler, column or index label, Grouper or list of such + Object that determine groups. + agg_func : str, dict or callable(Series | DataFrame) -> scalar | Series | DataFrame + Function to apply to the GroupBy object. + axis : {0, 1} + Axis to group and apply aggregation function along. + 0 is for index, when 1 is for columns. + groupby_kwargs : dict + GroupBy parameters as expected by ``modin.pandas.DataFrame.groupby`` signature. + rolling_kwargs : dict + Parameters to build a rolling window as expected by ``modin.pandas.window.RollingGroupby`` signature. + agg_args : list-like + Positional arguments to pass to the `agg_func`. + agg_kwargs : dict + Key arguments to pass to the `agg_func`. + drop : bool, default: False + If `by` is a QueryCompiler indicates whether or not by-data came + from the `self`. + + Returns + ------- + BaseQueryCompiler + QueryCompiler containing the result of groupby aggregation. + """ + if isinstance(agg_func, str): + str_func = agg_func + + def agg_func(window, *args, **kwargs): + return getattr(window, str_func)(*args, **kwargs) + + else: + assert callable(agg_func) + return self.groupby_agg( + by=by, + agg_func=lambda grp, *args, **kwargs: agg_func( + grp.rolling(**rolling_kwargs), *args, **kwargs + ), + axis=axis, + groupby_kwargs=groupby_kwargs, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + how="direct", + drop=drop, + ) + @doc_utils.add_refer_to("GroupBy.aggregate") def groupby_agg( self, @@ -5760,9 +5826,9 @@ def str_casefold(self): **kwargs : dict""", build_rules="udf_aggregation", ) - def rolling_aggregate(self, fold_axis, rolling_args, func, *args, **kwargs): + def rolling_aggregate(self, fold_axis, rolling_kwargs, func, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.aggregate)( - self, rolling_args, func, *args, **kwargs + self, rolling_kwargs, func, *args, **kwargs ) # FIXME: at the query compiler method `rolling_apply` is an alias for `rolling_aggregate`, @@ -5787,7 +5853,7 @@ def rolling_aggregate(self, fold_axis, rolling_args, func, *args, **kwargs): def rolling_apply( self, fold_axis, - rolling_args, + rolling_kwargs, func, raw=False, engine=None, @@ -5796,7 +5862,7 @@ def rolling_apply( kwargs=None, ): return RollingDefault.register(pandas.core.window.rolling.Rolling.apply)( - self, rolling_args, func, raw, engine, engine_kwargs, args, kwargs + self, rolling_kwargs, func, raw, engine, engine_kwargs, args, kwargs ) @doc_utils.doc_window_method( @@ -5810,18 +5876,18 @@ def rolling_apply( **kwargs : dict""", ) def rolling_corr( - self, fold_axis, rolling_args, other=None, pairwise=None, *args, **kwargs + self, fold_axis, rolling_kwargs, other=None, pairwise=None, *args, **kwargs ): return RollingDefault.register(pandas.core.window.rolling.Rolling.corr)( - self, rolling_args, other, pairwise, *args, **kwargs + self, rolling_kwargs, other, pairwise, *args, **kwargs ) @doc_utils.doc_window_method( window_cls_name="Rolling", result="number of non-NA values", refer_to="count" ) - def rolling_count(self, fold_axis, rolling_args): + def rolling_count(self, fold_axis, rolling_kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.count)( - self, rolling_args + self, rolling_kwargs ) @doc_utils.doc_window_method( @@ -5835,10 +5901,10 @@ def rolling_count(self, fold_axis, rolling_args): **kwargs : dict""", ) def rolling_cov( - self, fold_axis, rolling_args, other=None, pairwise=None, ddof=1, **kwargs + self, fold_axis, rolling_kwargs, other=None, 
pairwise=None, ddof=1, **kwargs ): return RollingDefault.register(pandas.core.window.rolling.Rolling.cov)( - self, rolling_args, other, pairwise, ddof, **kwargs + self, rolling_kwargs, other, pairwise, ddof, **kwargs ) @doc_utils.doc_window_method( @@ -5847,9 +5913,9 @@ def rolling_cov( refer_to="kurt", params="**kwargs : dict", ) - def rolling_kurt(self, fold_axis, rolling_args, **kwargs): + def rolling_kurt(self, fold_axis, rolling_kwargs, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.kurt)( - self, rolling_args, **kwargs + self, rolling_kwargs, **kwargs ) @doc_utils.doc_window_method( @@ -5860,9 +5926,9 @@ def rolling_kurt(self, fold_axis, rolling_args, **kwargs): *args : iterable **kwargs : dict""", ) - def rolling_max(self, fold_axis, rolling_args, *args, **kwargs): + def rolling_max(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.max)( - self, rolling_args, *args, **kwargs + self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( @@ -5873,9 +5939,9 @@ def rolling_max(self, fold_axis, rolling_args, *args, **kwargs): *args : iterable **kwargs : dict""", ) - def rolling_mean(self, fold_axis, rolling_args, *args, **kwargs): + def rolling_mean(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.mean)( - self, rolling_args, *args, **kwargs + self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( @@ -5884,9 +5950,9 @@ def rolling_mean(self, fold_axis, rolling_args, *args, **kwargs): refer_to="median", params="**kwargs : dict", ) - def rolling_median(self, fold_axis, rolling_args, **kwargs): + def rolling_median(self, fold_axis, rolling_kwargs, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.median)( - self, rolling_args, **kwargs + self, rolling_kwargs, **kwargs ) @doc_utils.doc_window_method( @@ -5897,9 +5963,9 @@ def rolling_median(self, fold_axis, rolling_args, **kwargs): *args : iterable **kwargs : dict""", ) - def rolling_min(self, fold_axis, rolling_args, *args, **kwargs): + def rolling_min(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.min)( - self, rolling_args, *args, **kwargs + self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( @@ -5912,10 +5978,10 @@ def rolling_min(self, fold_axis, rolling_args, *args, **kwargs): **kwargs : dict""", ) def rolling_quantile( - self, fold_axis, rolling_args, quantile, interpolation="linear", **kwargs + self, fold_axis, rolling_kwargs, quantile, interpolation="linear", **kwargs ): return RollingDefault.register(pandas.core.window.rolling.Rolling.quantile)( - self, rolling_args, quantile, interpolation, **kwargs + self, rolling_kwargs, quantile, interpolation, **kwargs ) @doc_utils.doc_window_method( @@ -5924,9 +5990,9 @@ def rolling_quantile( refer_to="skew", params="**kwargs : dict", ) - def rolling_skew(self, fold_axis, rolling_args, **kwargs): + def rolling_skew(self, fold_axis, rolling_kwargs, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.skew)( - self, rolling_args, **kwargs + self, rolling_kwargs, **kwargs ) @doc_utils.doc_window_method( @@ -5938,9 +6004,9 @@ def rolling_skew(self, fold_axis, rolling_args, **kwargs): *args : iterable **kwargs : dict""", ) - def rolling_std(self, fold_axis, rolling_args, ddof=1, *args, **kwargs): + def rolling_std(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwargs): 
return RollingDefault.register(pandas.core.window.rolling.Rolling.std)( - self, rolling_args, ddof, *args, **kwargs + self, rolling_kwargs, ddof, *args, **kwargs ) @doc_utils.doc_window_method( @@ -5951,9 +6017,9 @@ def rolling_std(self, fold_axis, rolling_args, ddof=1, *args, **kwargs): *args : iterable **kwargs : dict""", ) - def rolling_sum(self, fold_axis, rolling_args, *args, **kwargs): + def rolling_sum(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.sum)( - self, rolling_args, *args, **kwargs + self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( @@ -5964,9 +6030,9 @@ def rolling_sum(self, fold_axis, rolling_args, *args, **kwargs): *args : iterable **kwargs : dict""", ) - def rolling_sem(self, fold_axis, rolling_args, *args, **kwargs): + def rolling_sem(self, fold_axis, rolling_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.sem)( - self, rolling_args, *args, **kwargs + self, rolling_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( @@ -5978,9 +6044,9 @@ def rolling_sem(self, fold_axis, rolling_args, *args, **kwargs): *args : iterable **kwargs : dict""", ) - def rolling_var(self, fold_axis, rolling_args, ddof=1, *args, **kwargs): + def rolling_var(self, fold_axis, rolling_kwargs, ddof=1, *args, **kwargs): return RollingDefault.register(pandas.core.window.rolling.Rolling.var)( - self, rolling_args, ddof, *args, **kwargs + self, rolling_kwargs, ddof, *args, **kwargs ) @doc_utils.doc_window_method( @@ -5998,7 +6064,7 @@ def rolling_var(self, fold_axis, rolling_args, ddof=1, *args, **kwargs): def rolling_rank( self, fold_axis, - rolling_args, + rolling_kwargs, method="average", ascending=True, pct=False, @@ -6008,7 +6074,7 @@ def rolling_rank( ): return RollingDefault.register(pandas.core.window.rolling.Rolling.rank)( self, - rolling_args, + rolling_kwargs, method=method, ascending=ascending, pct=pct, @@ -6371,9 +6437,9 @@ def expanding_rank( *args : iterable **kwargs : dict""", ) - def window_mean(self, fold_axis, window_args, *args, **kwargs): + def window_mean(self, fold_axis, window_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.Window.mean)( - self, window_args, *args, **kwargs + self, window_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( @@ -6386,9 +6452,9 @@ def window_mean(self, fold_axis, window_args, *args, **kwargs): *args : iterable **kwargs : dict""", ) - def window_std(self, fold_axis, window_args, ddof=1, *args, **kwargs): + def window_std(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs): return RollingDefault.register(pandas.core.window.Window.std)( - self, window_args, ddof, *args, **kwargs + self, window_kwargs, ddof, *args, **kwargs ) @doc_utils.doc_window_method( @@ -6400,9 +6466,9 @@ def window_std(self, fold_axis, window_args, ddof=1, *args, **kwargs): *args : iterable **kwargs : dict""", ) - def window_sum(self, fold_axis, window_args, *args, **kwargs): + def window_sum(self, fold_axis, window_kwargs, *args, **kwargs): return RollingDefault.register(pandas.core.window.Window.sum)( - self, window_args, *args, **kwargs + self, window_kwargs, *args, **kwargs ) @doc_utils.doc_window_method( @@ -6415,9 +6481,9 @@ def window_sum(self, fold_axis, window_args, *args, **kwargs): *args : iterable **kwargs : dict""", ) - def window_var(self, fold_axis, window_args, ddof=1, *args, **kwargs): + def window_var(self, fold_axis, window_kwargs, ddof=1, *args, **kwargs): return 
RollingDefault.register(pandas.core.window.Window.var)( - self, window_args, ddof, *args, **kwargs + self, window_kwargs, ddof, *args, **kwargs ) # End of Window methods @@ -6527,6 +6593,7 @@ def repartition(self, axis=None): lambda df: df, new_index=self._modin_frame.copy_index_cache(), new_columns=self._modin_frame.copy_columns_cache(), + dtypes=self._modin_frame.copy_dtypes_cache(), keep_partitioning=False, sync_labels=False, ) diff --git a/modin/core/storage_formats/pandas/parsers.py b/modin/core/storage_formats/pandas/parsers.py index 207f5e775a9..5139a651bf6 100644 --- a/modin/core/storage_formats/pandas/parsers.py +++ b/modin/core/storage_formats/pandas/parsers.py @@ -162,6 +162,7 @@ def generic_parse(fname, **kwargs): start = kwargs.pop("start", None) end = kwargs.pop("end", None) header_size = kwargs.pop("header_size", 0) + common_dtypes = kwargs.pop("common_dtypes", None) encoding = kwargs.get("encoding", None) callback = kwargs.pop("callback") if start is None or end is None: @@ -208,6 +209,8 @@ def generic_parse(fname, **kwargs): if "memory_map" in kwargs: kwargs = kwargs.copy() del kwargs["memory_map"] + if common_dtypes is not None: + kwargs["dtype"] = common_dtypes pandas_df = callback(BytesIO(to_read), **kwargs) index = ( pandas_df.index @@ -557,8 +560,8 @@ def _convert_cell(cls, cell, convert_float): return cell.value - @property - def need_rich_text_param(self): + @staticmethod + def need_rich_text_param(): """ Determine whether a required `rich_text` parameter should be specified for the ``WorksheetReader`` constructor. diff --git a/modin/core/storage_formats/pandas/query_compiler.py b/modin/core/storage_formats/pandas/query_compiler.py index e5668e26cc4..243ba712cff 100644 --- a/modin/core/storage_formats/pandas/query_compiler.py +++ b/modin/core/storage_formats/pandas/query_compiler.py @@ -21,7 +21,6 @@ import re import numpy as np import pandas -import functools from pandas.api.types import is_scalar from pandas.core.common import is_bool_indexer from pandas.core.indexing import check_bool_indexer @@ -32,6 +31,7 @@ is_datetime_or_timedelta_dtype, is_datetime64_any_dtype, is_bool_dtype, + is_categorical_dtype, ) from pandas.core.dtypes.cast import find_common_type from pandas.errors import DataError, MergeError @@ -390,6 +390,12 @@ def to_numpy(self, **kwargs): combine = Binary.register(pandas.DataFrame.combine, infer_dtypes="common_cast") combine_first = Binary.register(pandas.DataFrame.combine_first, infer_dtypes="bool") eq = Binary.register(pandas.DataFrame.eq, infer_dtypes="bool") + equals = Binary.register( + lambda df, other: pandas.DataFrame([[df.equals(other)]]), + join_type=None, + labels="drop", + infer_dtypes="bool", + ) floordiv = Binary.register(pandas.DataFrame.floordiv, infer_dtypes="common_cast") ge = Binary.register(pandas.DataFrame.ge, infer_dtypes="bool") gt = Binary.register(pandas.DataFrame.gt, infer_dtypes="bool") @@ -1378,81 +1384,83 @@ def expanding_corr( ) window_mean = Fold.register( - lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).mean(*args, **kwargs) + lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).mean(*args, **kwargs) ) ) window_sum = Fold.register( - lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).sum(*args, **kwargs) + lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).sum(*args, **kwargs) ) ) window_var = Fold.register( - lambda df, rolling_args, ddof, *args, **kwargs: 
pandas.DataFrame( - df.rolling(*rolling_args).var(ddof=ddof, *args, **kwargs) + lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).var(ddof=ddof, *args, **kwargs) ) ) window_std = Fold.register( - lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).std(ddof=ddof, *args, **kwargs) + lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).std(ddof=ddof, *args, **kwargs) ) ) rolling_count = Fold.register( - lambda df, rolling_args: pandas.DataFrame(df.rolling(*rolling_args).count()) + lambda df, rolling_kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).count() + ) ) rolling_sum = Fold.register( - lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).sum(*args, **kwargs) + lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).sum(*args, **kwargs) ) ) rolling_sem = Fold.register( - lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).sem(*args, **kwargs) + lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).sem(*args, **kwargs) ) ) rolling_mean = Fold.register( - lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).mean(*args, **kwargs) + lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).mean(*args, **kwargs) ) ) rolling_median = Fold.register( - lambda df, rolling_args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).median(**kwargs) + lambda df, rolling_kwargs, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).median(**kwargs) ) ) rolling_var = Fold.register( - lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).var(ddof=ddof, *args, **kwargs) + lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).var(ddof=ddof, *args, **kwargs) ) ) rolling_std = Fold.register( - lambda df, rolling_args, ddof, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).std(ddof=ddof, *args, **kwargs) + lambda df, rolling_kwargs, ddof, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).std(ddof=ddof, *args, **kwargs) ) ) rolling_min = Fold.register( - lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).min(*args, **kwargs) + lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).min(*args, **kwargs) ) ) rolling_max = Fold.register( - lambda df, rolling_args, *args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).max(*args, **kwargs) + lambda df, rolling_kwargs, *args, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).max(*args, **kwargs) ) ) rolling_skew = Fold.register( - lambda df, rolling_args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).skew(**kwargs) + lambda df, rolling_kwargs, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).skew(**kwargs) ) ) rolling_kurt = Fold.register( - lambda df, rolling_args, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).kurt(**kwargs) + lambda df, rolling_kwargs, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).kurt(**kwargs) ) ) rolling_apply = Fold.register( - lambda df, rolling_args, func, raw, engine, engine_kwargs, args, kwargs: pandas.DataFrame( - df.rolling(*rolling_args).apply( + lambda df, rolling_kwargs, func, raw, engine, engine_kwargs, args, kwargs: 
pandas.DataFrame( + df.rolling(**rolling_kwargs).apply( func=func, raw=raw, engine=engine, @@ -1463,15 +1471,15 @@ def expanding_corr( ) ) rolling_quantile = Fold.register( - lambda df, rolling_args, quantile, interpolation, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).quantile( + lambda df, rolling_kwargs, quantile, interpolation, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).quantile( quantile=quantile, interpolation=interpolation, **kwargs ) ) ) rolling_rank = Fold.register( - lambda df, rolling_args, method, ascending, pct, numeric_only, **kwargs: pandas.DataFrame( - df.rolling(*rolling_args).rank( + lambda df, rolling_kwargs, method, ascending, pct, numeric_only, **kwargs: pandas.DataFrame( + df.rolling(**rolling_kwargs).rank( method=method, ascending=ascending, pct=pct, @@ -1481,43 +1489,43 @@ def expanding_corr( ) ) - def rolling_corr(self, axis, rolling_args, other, pairwise, *args, **kwargs): + def rolling_corr(self, axis, rolling_kwargs, other, pairwise, *args, **kwargs): if len(self.columns) > 1: return self.default_to_pandas( - lambda df: pandas.DataFrame.rolling(df, *rolling_args).corr( + lambda df: pandas.DataFrame.rolling(df, **rolling_kwargs).corr( other=other, pairwise=pairwise, *args, **kwargs ) ) else: return Fold.register( lambda df: pandas.DataFrame( - df.rolling(*rolling_args).corr( + df.rolling(**rolling_kwargs).corr( other=other, pairwise=pairwise, *args, **kwargs ) ) )(self, axis) - def rolling_cov(self, axis, rolling_args, other, pairwise, ddof, **kwargs): + def rolling_cov(self, axis, rolling_kwargs, other, pairwise, ddof, **kwargs): if len(self.columns) > 1: return self.default_to_pandas( - lambda df: pandas.DataFrame.rolling(df, *rolling_args).cov( + lambda df: pandas.DataFrame.rolling(df, **rolling_kwargs).cov( other=other, pairwise=pairwise, ddof=ddof, **kwargs ) ) else: return Fold.register( lambda df: pandas.DataFrame( - df.rolling(*rolling_args).cov( + df.rolling(**rolling_kwargs).cov( other=other, pairwise=pairwise, ddof=ddof, **kwargs ) ) )(self, axis) - def rolling_aggregate(self, axis, rolling_args, func, *args, **kwargs): + def rolling_aggregate(self, axis, rolling_kwargs, func, *args, **kwargs): new_modin_frame = self._modin_frame.apply_full_axis( axis, lambda df: pandas.DataFrame( - df.rolling(*rolling_args).aggregate(func=func, *args, **kwargs) + df.rolling(**rolling_kwargs).aggregate(func=func, *args, **kwargs) ), new_index=self.index, ) @@ -2454,8 +2462,10 @@ def rank(self, **kwargs): new_modin_frame = self._modin_frame.apply_full_axis( axis, lambda df: df.rank(**kwargs), - new_index=self.index, - new_columns=self.columns if not numeric_only else None, + new_index=self._modin_frame.copy_index_cache(), + new_columns=self._modin_frame.copy_columns_cache() + if not numeric_only + else None, dtypes=np.float64, ) return self.__constructor__(new_modin_frame) @@ -3410,7 +3420,7 @@ def _groupby_shuffle( # So this check works only if we have dtypes cache materialized, otherwise the exception will be thrown # inside the kernel and so it will be uncatchable. TODO: figure out a better way to handle this. if self._modin_frame._dtypes is not None and any( - dtype == "category" for dtype in self.dtypes[by].values + is_categorical_dtype(dtype) for dtype in self.dtypes[by].values ): raise NotImplementedError( "Reshuffling groupby is not yet supported when grouping on a categorical column. 
" @@ -3439,9 +3449,11 @@ def _groupby_shuffle( else: obj = self - agg_func = functools.partial( - GroupByDefault.get_aggregation_method(how), func=agg_func - ) + agg_method = GroupByDefault.get_aggregation_method(how) + original_agg_func = agg_func + + def agg_func(grp, *args, **kwargs): + return agg_method(grp, original_agg_func, *args, **kwargs) result = obj._modin_frame.groupby( axis=axis, @@ -3504,6 +3516,51 @@ def groupby_cov( drop=drop, ) + def groupby_rolling( + self, + by, + agg_func, + axis, + groupby_kwargs, + rolling_kwargs, + agg_args, + agg_kwargs, + drop=False, + ): + # 'corr' and 'cov' require knowledge about the whole row axis (all columns have + # to be available in the same partitions), this requirement is not being satisfied + # in the current groupby implementation + unsupported_groupby = ( + agg_func in ("corr", "cov") or rolling_kwargs.get("on") is not None + ) + + if isinstance(agg_func, str): + str_func = agg_func + + def agg_func(window, *args, **kwargs): + return getattr(window, str_func)(*args, **kwargs) + + else: + assert callable(agg_func) + + if unsupported_groupby: + obj = super(PandasQueryCompiler, self) + else: + obj = self + + return obj.groupby_agg( + by=by, + agg_func=lambda grp, *args, **kwargs: agg_func( + grp.rolling(**rolling_kwargs), *args, **kwargs + ), + axis=axis, + groupby_kwargs=groupby_kwargs, + agg_args=agg_args, + agg_kwargs=agg_kwargs, + how="direct", + drop=drop, + ) + def groupby_agg( self, by, @@ -3559,12 +3616,12 @@ def groupby_agg( how == "axis_wise" ), f"Only 'axis_wise' aggregation is supported with dictionary functions, got: {how}" else: - agg_func = functools.partial( - ( - SeriesGroupByDefault if series_groupby else GroupByDefault - ).get_aggregation_method(how), - func=agg_func, - ) + agg_method = ( + SeriesGroupByDefault if series_groupby else GroupByDefault + ).get_aggregation_method(how) + + def agg_func(grp, *args, **kwargs): + return agg_method(grp, original_agg_func, *args, **kwargs) # since we're going to modify `groupby_kwargs` dict in a `groupby_agg_builder`, # we want to copy it to not propagate these changes into source dict, in case @@ -4081,7 +4138,7 @@ def sort_columns_by_row_values(self, rows, ascending=True, **kwargs): def cat_codes(self): def func(df: pandas.DataFrame) -> pandas.DataFrame: ser = df.iloc[:, 0] - assert ser.dtype == "category" + assert is_categorical_dtype(ser.dtype) return ser.cat.codes.to_frame(name=MODIN_UNNAMED_SERIES_LABEL) res = self._modin_frame.map(func=func, new_columns=[MODIN_UNNAMED_SERIES_LABEL]) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py index 75c96d59557..4327b0c277b 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/base_worker.py @@ -15,15 +15,70 @@ import abc import uuid -import os +from typing import Tuple, List import pyarrow as pa import numpy as np -from modin.config import OmnisciFragmentSize, HdkFragmentSize from modin.error_message import ErrorMessage +class DbTable(abc.ABC): + """ + Base class, representing a table in the HDK database. + + Attributes + ---------- + name : str + Table name. + """ + + @property + @abc.abstractmethod + def shape(self) -> Tuple[int, int]: + """ + Return a tuple with the number of rows and columns. 
+ + Returns + ------- + tuple of int + """ + pass + + @property + @abc.abstractmethod + def column_names(self) -> List[str]: + """ + Return a list of the table column names. + + Returns + ------- + list of str + """ + pass + + @abc.abstractmethod + def to_arrow(self) -> pa.Table: + """ + Convert this table to arrow. + + Returns + ------- + pyarrow.Table + """ + pass + + def __len__(self): + """ + Return the number of rows in the table. + + Returns + ------- + int + """ + return self.shape[0] + + class BaseDbWorker(abc.ABC): """Base class for HDK storage format based execution engine .""" @@ -53,7 +108,7 @@ def executeDML(cls, query): Returns ------- - pyarrow.Table + DbTable Execution result. """ pass @@ -71,7 +126,7 @@ def executeRA(cls, query): Returns ------- - pyarrow.Table + DbTable Execution result. """ pass @@ -184,36 +239,6 @@ def cast_to_compatible_types(table): return table - @classmethod - def compute_fragment_size(cls, table): - """ - Compute fragment size to be used for table import. - - Parameters - ---------- - table : pyarrow.Table - A table to import. - - Returns - ------- - int - Fragment size to use for import. - """ - fragment_size = HdkFragmentSize.get() - if fragment_size is None: - fragment_size = OmnisciFragmentSize.get() - if fragment_size is None: - cpu_count = os.cpu_count() - if cpu_count is not None: - fragment_size = table.num_rows // cpu_count - fragment_size = min(fragment_size, 2**25) - fragment_size = max(fragment_size, 2**18) - else: - fragment_size = 0 - else: - fragment_size = int(fragment_size) - return fragment_size - @classmethod @abc.abstractmethod def import_arrow_table(cls, table, name=None): @@ -229,8 +254,8 @@ def import_arrow_table(cls, table, name=None): Returns ------- - str - Imported table name. + DbTable + Imported table.
""" return cls.import_arrow_table(pa.Table.from_pandas(df), name=name) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py index 46c42f79fef..8929890d11c 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_algebra.py @@ -20,6 +20,7 @@ import abc +from .db_worker import DbTable from .dataframe.utils import ColNameCodec from .expr import BaseExpr @@ -181,9 +182,10 @@ class CalciteScanNode(CalciteBaseNode): def __init__(self, modin_frame): assert modin_frame._partitions is not None - assert modin_frame._partitions[0][0].frame_id is not None + table = modin_frame._partitions[0][0].get() + assert isinstance(table, DbTable) super(CalciteScanNode, self).__init__("EnumerableTableScan") - self.table = ["hdk", modin_frame._partitions[0][0].frame_id] + self.table = ["hdk", table.name] self.fieldNames = [ ColNameCodec.encode(col) for col in modin_frame._table_cols ] + ["rowid"] diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py index 6f122339327..bd56ac7bab0 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_builder.py @@ -342,6 +342,7 @@ class InputContext: "min": "MIN", "size": "COUNT", "count": "COUNT", + "median": "APPROX_QUANTILE", } _no_arg_aggregates = {"size"} @@ -737,6 +738,18 @@ def _push(self, node): node : CalciteBaseNode A node to add. """ + if ( + len(self.res) != 0 + and isinstance(node, CalciteProjectionNode) + and isinstance(self.res[-1], CalciteProjectionNode) + and all(isinstance(expr, CalciteInputRefExpr) for expr in node.exprs) + ): + # Replace the last CalciteProjectionNode with this one and + # translate the input refs. + exprs = self.res.pop().exprs + node = CalciteProjectionNode( + node.fields, [exprs[expr.input] for expr in node.exprs] + ) self.res.append(node) def _last(self): diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py index f3d7c34d6f4..a92e5afd212 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py @@ -13,6 +13,8 @@ """Module provides ``CalciteSerializer`` class.""" +from pandas.core.dtypes.common import is_datetime64_dtype + from .expr import ( BaseExpr, LiteralExpr, @@ -65,6 +67,7 @@ class CalciteSerializer: "bool": "BOOLEAN", "float32": "FLOAT", "float64": "DOUBLE", + "datetime64": "TIMESTAMP", } _INT_OPTS = { @@ -79,6 +82,16 @@ class CalciteSerializer: int: ("BIGINT", 19), } + _TIMESTAMP_PRECISION = { + "s": 0, + "ms": 3, + "us": 6, + "ns": 9, + } + _DTYPE_STRINGS.update( + {f"datetime64[{u}]": "TIMESTAMP" for u in _TIMESTAMP_PRECISION} + ) + def serialize(self, plan): """ Serialize a sequence of Calcite nodes into JSON format. 
@@ -327,6 +340,20 @@ def serialize_literal(self, literal): "type_scale": -2147483648, "type_precision": 1, } + if isinstance(val, np.datetime64): + unit = np.datetime_data(val)[0] + precision = self._TIMESTAMP_PRECISION.get(unit, None) + if precision is not None: + return { + "literal": int(val.astype(np.int64)), + "type": "TIMESTAMP", + "target_type": "TIMESTAMP", + "scale": -2147483648, + "precision": precision, + "type_scale": -2147483648, + "type_precision": precision, + } + raise NotImplementedError(f"Can not serialize {type(val).__name__}") def opts_for_int_type(self, int_type): @@ -367,7 +394,11 @@ def serialize_dtype(self, dtype): """ _warn_if_unsigned(dtype) try: - return {"type": self._DTYPE_STRINGS[dtype.name], "nullable": True} + type_info = {"type": self._DTYPE_STRINGS[dtype.name], "nullable": True} + if is_datetime64_dtype(dtype): + unit = np.datetime_data(dtype)[0] + type_info["precision"] = self._TIMESTAMP_PRECISION[unit] + return type_info except KeyError: raise TypeError(f"Unsupported dtype: {dtype}") diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py index ad57dba7c8a..e9a0fe2bb2e 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py @@ -45,12 +45,14 @@ ) from .utils import ( ColNameCodec, + maybe_range, arrow_to_pandas, check_join_supported, check_cols_to_join, get_data_for_join_by_index, build_categorical_from_at, ) +from ..db_worker import DbTable from ..partitioning.partition_manager import HdkOnNativeDataframePartitionManager from modin.core.dataframe.pandas.metadata import LazyProxyCategoricalDtype from modin.error_message import ErrorMessage @@ -80,6 +82,7 @@ is_cmp_op, ) from modin.pandas.utils import check_both_not_none +from modin.pandas.indexing import is_range_like IDX_COL_NAME = ColNameCodec.IDX_COL_NAME ROWID_COL_NAME = ColNameCodec.ROWID_COL_NAME @@ -167,8 +170,8 @@ class HdkOnNativeDataframe(PandasDataframe): _force_execution_mode : str or None Used by tests to control frame's execution process. Value "lazy" is used to raise RuntimeError if execution is triggered for the frame. - Value "arrow" is used to raise RuntimeError execution is triggered - and cannot be done using Arrow API (have to use HDK for execution). + The values "arrow" and "hdk" are used to force the corresponding + execution mode. """ _query_compiler_cls = DFAlgQueryCompiler @@ -250,6 +253,7 @@ def copy( dtypes=no_default, op=no_default, index_cols=no_default, + uses_rowid=no_default, ): """ Copy this DataFrame. @@ -271,6 +275,9 @@ def copy( index_cols : list of str, optional A list of columns included into the frame's index. None value means a default index (row id is used as an index). + uses_rowid : bool, optional + True for frames which require access to the virtual 'rowid' column + for its execution. 
Returns ------- @@ -289,6 +296,8 @@ def copy( dtypes = self.copy_dtypes_cache() if index_cols is no_default: index_cols = self._index_cols + if uses_rowid is no_default: + uses_rowid = self._uses_rowid return self.__constructor__( partitions=partitions, index=index, @@ -298,7 +307,7 @@ def copy( dtypes=dtypes, op=op, index_cols=index_cols, - uses_rowid=self._uses_rowid, + uses_rowid=uses_rowid, force_execution_mode=self._force_execution_mode, has_unsupported_data=self._has_unsupported_data, ) @@ -419,15 +428,50 @@ def take_2d_labels_or_positional( if row_labels is not None: raise NotImplementedError("Row labels masking is not yet supported") - if row_positions is not None: - base = base._maybe_materialize_rowid() - op = MaskNode(base, row_labels=row_labels, row_positions=row_positions) - return self.__constructor__( - columns=base.columns, - dtypes=base.copy_dtypes_cache(), - op=op, - index_cols=base._index_cols, - force_execution_mode=base._force_execution_mode, + if row_positions is None: + return base + + row_positions = maybe_range(row_positions) + base = base._maybe_materialize_rowid() + op = MaskNode(base, row_labels=row_labels, row_positions=row_positions) + base = self.__constructor__( + columns=base.columns, + dtypes=base.copy_dtypes_cache(), + op=op, + index_cols=base._index_cols, + force_execution_mode=base._force_execution_mode, + ) + + # Reverse the frame rows, if performing a reverse order selection via HDK. + if ( + is_range_like(row_positions) and row_positions.step < 0 + ) and not base._can_execute_arrow(): + cols = base.columns + table_cols = base._table_cols + # Add the rowid column + rowid_col = "__tmp_rowid__" + while rowid_col in table_cols: + rowid_col += "1" + exprs = base._index_exprs() + exprs[rowid_col] = base.ref(ROWID_COL_NAME) + for col in cols: + exprs[col] = base.ref(col) + base = base.copy( + columns=[rowid_col] + base.columns.tolist(), + dtypes=base._dtypes_for_exprs(exprs), + op=TransformNode(base, exprs), + uses_rowid=True, + ) + # Sort by the rowid column + base = base.copy(op=SortNode(base, [rowid_col], [False], "last")) + # Remove the rowid column + exprs = OrderedDict() + for col in table_cols: + exprs[col] = base.ref(col) + base = base.copy( + columns=cols, + dtypes=base._dtypes_for_exprs(exprs), + op=TransformNode(base, exprs), ) return base @@ -463,8 +507,10 @@ def _dtypes_for_exprs(self, exprs): def _maybe_update_proxies(self, dtypes, new_parent=None): if new_parent is not None: super()._maybe_update_proxies(dtypes, new_parent) - elif self._has_arrow_table(): - table = self._partitions[0, 0].get() + if self._partitions is None: + return + table = self._partitions[0][0].get() + if isinstance(table, pyarrow.Table): super()._maybe_update_proxies(dtypes, new_parent=table) def groupby_agg(self, by, axis, agg, groupby_args, **kwargs): @@ -1373,10 +1419,15 @@ def _join_arrow_columns(self, other_modin_frames): if all( f._index_cols is None # Make sure all the frames have an arrow table in partitions. - and isinstance(f._execute(), pyarrow.Table) + and isinstance(f._execute(), (DbTable, pyarrow.Table)) for f in frames ): - tables = [f._partitions[0][0].get() for f in frames] + tables = [ + t + if isinstance(t := f._partitions[0][0].get(), pyarrow.Table) + else t.to_arrow() + for f in frames + ] column_names = [c for t in tables for c in t.column_names] if len(column_names) != len(set(column_names)): raise NotImplementedError("Duplicate column names") @@ -1616,7 +1667,7 @@ def cat_codes(self): The new frame. 
""" assert len(self.columns) == 1 - assert self._dtypes[-1] == "category" + assert is_categorical_dtype(self._dtypes[-1]) exprs = self._index_exprs() col_expr = self.ref(self.columns[-1]) @@ -1818,6 +1869,20 @@ def filter(self, key): force_execution_mode=self._force_execution_mode, ) + def force_import(self) -> DbTable: + """ + Force table import. + + Returns + ------- + DbTable + The imported table. + """ + if self._has_unsupported_data: + raise NotImplementedError("Unable to import a frame with unsupported data") + self._execute() + return self._partition_mgr_cls.import_table(self) + def _maybe_materialize_rowid(self): """ Materialize virtual 'rowid' column if frame uses it as an index. @@ -1923,7 +1988,7 @@ def _execute(self): Returns ------- - pyarrow.Table or pandas.Dataframe + DbTable or pyarrow.Table or pandas.Dataframe """ if isinstance(self._op, FrameNode): return self._op.execute_arrow() @@ -1938,7 +2003,7 @@ def _execute(self): if isinstance(frame._op, FrameNode): result = frame._op.execute_arrow() continue - if not frame._op.can_execute_hdk(): + if not frame._op.can_execute_hdk() and stack[-1] != frame._materialize: stack.append(frame._materialize) if frame._uses_rowid or frame._op.require_executed_base(): for i in reversed(frame._op.input): @@ -1955,24 +2020,25 @@ def _materialize(self): Returns ------- - pyarrow.Table + DbTable or pyarrow.Table """ - assert ( - self._force_execution_mode != "lazy" - ), "Unexpected execution triggered on lazy frame!" + mode = self._force_execution_mode + assert mode != "lazy", "Unexpected execution triggered on lazy frame!" + + if isinstance(self._op, FrameNode): + return self._op.execute_arrow() - if self._force_execution_mode != "hdk" and self._can_execute_arrow(): + if ( + mode == "arrow" + or not self._op.can_execute_hdk() + or (self._can_execute_arrow() and mode != "hdk") + ): new_table = self._execute_arrow() partitions = self._partition_mgr_cls.from_arrow( new_table, unsupported_cols=[], encode_col_names=False )[0] else: - assert ( - self._force_execution_mode != "arrow" - ), "Forced arrow execution failed!" - partitions = self._partition_mgr_cls.run_exec_plan( - self._op, self._table_cols - ) + partitions = self._partition_mgr_cls.run_exec_plan(self._op) self._partitions = partitions self._op = FrameNode(self) @@ -1989,6 +2055,9 @@ def _can_execute_arrow(self): ------- bool """ + if self._force_execution_mode == "hdk": + return False + stack = [self] while stack: op = stack.pop()._op @@ -2014,7 +2083,8 @@ def _execute_arrow(self): frame = stack.pop() if callable(frame): - result = frame(result) + if isinstance(result := frame(result), DbTable): + result = result.to_arrow() elif input := getattr(frame._op, "input", None): if len(input) == 1: stack.append(frame._op.execute_arrow) @@ -2045,45 +2115,37 @@ def to_arrow(result, op=frame._op, tables=[], frames=iter(input)): return result to_arrow(result) - else: - result = frame._op.execute_arrow(result) + elif isinstance(result := frame._op.execute_arrow(result), DbTable): + result = result.to_arrow() return result def _build_index_cache(self): - """ - Materialize index and store it in the cache. - - Can only be called for materialized frames. 
- """ - assert isinstance(self._op, FrameNode) + """Materialize index and store it in the cache.""" + obj = self._execute() - if self._partitions is None: - self.set_index_cache(Index.__new__(Index)) + if self._index_cols is None: + self.set_index_cache(Index.__new__(RangeIndex, data=range(len(obj)))) + return + if isinstance(obj, DbTable): + # TODO: Get the index columns only + obj = obj.to_arrow() + if isinstance(obj, pyarrow.Table): + # The index columns must be in the beginning of the list + col_names = obj.column_names[len(self._index_cols) :] + index_at = obj.drop(col_names) + index_df = index_at.to_pandas() + index_df.set_index(self._index_cols, inplace=True) + idx = index_df.index + idx.rename(demangle_index_names(self._index_cols), inplace=True) + if ( + isinstance(idx, (pd.DatetimeIndex, pd.TimedeltaIndex)) + and len(idx) >= 3 # infer_freq() requires at least 3 values + ): + idx.freq = pd.infer_freq(idx) + self.set_index_cache(idx) else: - obj = self._partitions[0][0].get() - if isinstance(obj, (pd.DataFrame, pd.Series)): - self.set_index_cache(obj.index) - else: - assert isinstance(obj, pyarrow.Table) - if self._index_cols is None: - self.set_index_cache( - Index.__new__(RangeIndex, data=range(obj.num_rows)) - ) - else: - # The index columns must be in the beginning of the list - col_names = obj.column_names[len(self._index_cols) :] - index_at = obj.drop(col_names) - index_df = index_at.to_pandas() - index_df.set_index(self._index_cols, inplace=True) - idx = index_df.index - idx.rename(demangle_index_names(self._index_cols), inplace=True) - if ( - isinstance(idx, (pd.DatetimeIndex, pd.TimedeltaIndex)) - and len(idx) >= 3 # infer_freq() requires at least 3 values - ): - idx.freq = pd.infer_freq(idx) - self.set_index_cache(idx) + self.set_index_cache(obj.index) def _get_index(self): """ @@ -2095,7 +2157,6 @@ def _get_index(self): ------- pandas.Index """ - self._execute() if not self.has_index_cache: self._build_index_cache() return self._index_cache.get() @@ -2125,9 +2186,7 @@ def _set_index(self, new_index): "HdkOnNativeDataframe._set_index is not yet suported" ) else: - assert isinstance(obj, pyarrow.Table) - - at = obj + at = obj if isinstance(obj, pyarrow.Table) else obj.to_arrow() if self._index_cols: at = at.drop(self._index_cols) @@ -2493,6 +2552,8 @@ def to_pandas(self): obj = self._execute() + if isinstance(obj, DbTable): + obj = obj.to_arrow() if isinstance(obj, pyarrow.Table): # If the table is exported from HDK, the string columns are converted # to dictionary. 
On conversion to pandas, these columns will be of type diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py index 161b9be8998..5fb2d01dfe8 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/utils.py @@ -29,6 +29,7 @@ import pyarrow as pa from pyarrow.types import is_dictionary +from modin.pandas.indexing import is_range_like from modin.utils import MODIN_UNNAMED_SERIES_LABEL EMPTY_ARROW_TABLE = pa.Table.from_pandas(pandas.DataFrame({})) @@ -428,12 +429,12 @@ def to_empty_pandas_df(df): index_cols = None else: index_cols = ColNameCodec.mangle_index_names(merged.index.names) - for orig_name, mangled_name in zip(merged.index.names, index_cols): + for name in index_cols: # Using _dtypes here since it contains all column names, # including the index. - df = left if mangled_name in left._dtypes else right - exprs[orig_name] = df.ref(mangled_name) - new_dtypes.append(df._dtypes[mangled_name]) + df = left if name in left._dtypes else right + exprs[name] = df.ref(name) + new_dtypes.append(df._dtypes[name]) left_col_names = set(left.columns) right_col_names = set(right.columns) @@ -465,6 +466,30 @@ def to_empty_pandas_df(df): return index_cols, exprs, new_dtypes, merged.columns +def maybe_range(numbers: Union[List[int], range]) -> Union[List[int], range]: + """ + Try to convert the specified sequence of numbers to a range. + + Parameters + ---------- + numbers : list of ints or range + + Returns + ------- + list of ints or range + """ + if len(numbers) > 2 and not is_range_like(numbers): + diff = numbers[1] - numbers[0] + is_range = True + for i in range(2, len(numbers)): + if (numbers[i] - numbers[i - 1]) != diff: + is_range = False + break + if is_range: + numbers = range(numbers[0], numbers[-1] + diff, diff) + return numbers + + def to_arrow_type(dtype) -> pa.lib.DataType: """ Convert the specified dtype to arrow. diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py index 02ce4f7f68c..80bb792b7cd 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/db_worker.py @@ -12,6 +12,7 @@ # governing permissions and limitations under the License. 
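The maybe_range helper added to dataframe/utils.py above converts an evenly spaced list of positions into a range; a few sample calls (assuming the module import below):

from modin.experimental.core.execution.native.implementations.hdk_on_native.dataframe.utils import (
    maybe_range,
)

maybe_range([2, 4, 6, 8])  # range(2, 10, 2): evenly spaced, so a range is returned
maybe_range([1, 2, 4])     # [1, 2, 4]: the spacing is irregular, the list is kept
maybe_range(range(5))      # range(0, 5): already range-like, returned unchanged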
"""Module chooses a proper worker class.""" +from .base_worker import DbTable from .hdk_worker import HdkWorker as DbWorker -__all__ = ["DbWorker"] +__all__ = ["DbTable", "DbWorker"] diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py index 82a56ba563b..da5e068eac0 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/df_algebra.py @@ -30,6 +30,7 @@ from .expr import InputRefExpr, LiteralExpr, OpExpr from .dataframe.utils import ColNameCodec, EMPTY_ARROW_TABLE, get_common_arrow_type +from .db_worker import DbTable if TYPE_CHECKING: from .dataframe.dataframe import HdkOnNativeDataframe @@ -416,12 +417,12 @@ def __init__(self, modin_frame: "HdkOnNativeDataframe"): def can_execute_arrow(self) -> bool: return self.modin_frame._has_arrow_table() - def execute_arrow(self, ignore=None) -> Union[pa.Table, pandas.DataFrame]: + def execute_arrow(self, ignore=None) -> Union[DbTable, pa.Table, pandas.DataFrame]: """ Materialized frame. If `can_execute_arrow` returns True, this method returns an arrow table, - otherwise - a pandas Dataframe. + otherwise - a pandas Dataframe or DbTable. Parameters ---------- @@ -429,7 +430,7 @@ def execute_arrow(self, ignore=None) -> Union[pa.Table, pandas.DataFrame]: Returns ------- - pa.Table or pandas.Dataframe + DbTable or pa.Table or pandas.Dataframe """ frame = self.modin_frame if frame._partitions is not None: diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py index 698db1be538..58addb5c7ac 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/expr.py @@ -31,8 +31,10 @@ is_categorical_dtype, is_datetime64_any_dtype, is_bool_dtype, + is_datetime64_dtype, ) +from modin.pandas.indexing import is_range_like from modin.utils import _inherit_docstrings from .dataframe.utils import ColNameCodec, to_arrow_type @@ -65,6 +67,8 @@ def _get_common_dtype(lhs_dtype, rhs_dtype): return get_dtype(float) if is_integer_dtype(lhs_dtype) and is_integer_dtype(rhs_dtype): return get_dtype(int) + if is_datetime64_dtype(lhs_dtype) and is_datetime64_dtype(rhs_dtype): + return np.promote_types(lhs_dtype, rhs_dtype) raise NotImplementedError( f"Cannot perform operation on types: {lhs_dtype}, {rhs_dtype}" ) @@ -72,7 +76,7 @@ def _get_common_dtype(lhs_dtype, rhs_dtype): _aggs_preserving_numeric_type = {"sum", "min", "max"} _aggs_with_int_result = {"count", "size"} -_aggs_with_float_result = {"mean", "std", "skew"} +_aggs_with_float_result = {"mean", "median", "std", "skew"} def _agg_dtype(agg, dtype): @@ -801,45 +805,48 @@ class LiteralExpr(BaseExpr): Parameters ---------- - val : int, np.int, float, bool, str or None + val : int, np.int, float, bool, str, np.datetime64 or None Literal value. dtype : None or dtype, default: None Value dtype. Attributes ---------- - val : int, np.int, float, bool, str or None + val : int, np.int, float, bool, str, np.datetime64 or None Literal value. _dtype : dtype Literal data type. 
""" def __init__(self, val, dtype=None): - if dtype is None: - if val is not None and not isinstance( - val, - ( - int, - float, - bool, - str, - np.int8, - np.int16, - np.int32, - np.int64, - np.uint8, - np.uint16, - np.uint32, - np.uint64, - ), - ): - raise NotImplementedError(f"Literal value {val} of type {type(val)}") - if val is None: - dtype = get_dtype(float) - else: - dtype = get_dtype(type(val)) + if val is not None and not isinstance( + val, + ( + int, + float, + bool, + str, + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.datetime64, + ), + ): + raise NotImplementedError(f"Literal value {val} of type {type(val)}") self.val = val - self._dtype = dtype + if dtype is not None: + self._dtype = dtype + elif val is None: + self._dtype = get_dtype(float) + else: + self._dtype = ( + val.dtype if isinstance(val, np.generic) else get_dtype(type(val)) + ) def copy(self): """ @@ -857,8 +864,21 @@ def fold(self): @_inherit_docstrings(BaseExpr.cast) def cast(self, res_type): - dtype = np.dtype(res_type) - return LiteralExpr(dtype.type(self.val), dtype) + val = self.val + if val is not None: + if isinstance(val, np.generic): + val = val.astype(res_type) + elif is_integer_dtype(res_type): + val = int(val) + elif is_float_dtype(res_type): + val = float(val) + elif is_bool_dtype(res_type): + val = bool(val) + elif is_string_dtype(res_type): + val = str(val) + else: + raise TypeError(f"Cannot cast '{val}' to '{res_type}'") + return LiteralExpr(val, res_type) @_inherit_docstrings(BaseExpr.is_null) def is_null(self): @@ -1308,8 +1328,17 @@ def build_row_idx_filter_expr(row_idx, row_col): if not is_list_like(row_idx): return row_col.eq(row_idx) - if isinstance(row_idx, (pandas.RangeIndex, range)) and row_idx.step == 1: - exprs = [row_col.ge(row_idx[0]), row_col.le(row_idx[-1])] + if is_range_like(row_idx): + start = row_idx[0] + stop = row_idx[-1] + step = row_idx.step + if step < 0: + start, stop = stop, start + step = -step + exprs = [row_col.ge(start), row_col.le(stop)] + if step > 1: + mod = OpExpr("MOD", [row_col, LiteralExpr(step)], get_dtype(int)) + exprs.append(mod.eq(0)) return OpExpr("AND", exprs, get_dtype(bool)) exprs = [row_col.eq(idx) for idx in row_idx] @@ -1336,6 +1365,11 @@ def build_if_then_else(cond, then_val, else_val, res_type): BaseExpr The conditional operator expression. """ + if is_datetime64_dtype(res_type): + if then_val._dtype != res_type: + then_val = then_val.cast(res_type) + if else_val._dtype != res_type: + else_val = else_val.cast(res_type) return OpExpr("CASE", [cond, then_val, else_val], res_type) diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py index 24e930ac46d..619e9359c7c 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/hdk_worker.py @@ -12,94 +12,171 @@ # governing permissions and limitations under the License. 
"""Module provides ``HdkWorker`` class.""" +from typing import Optional, Tuple, List, Union -import pyhdk +import pyarrow as pa +import os -from .base_worker import BaseDbWorker +from pyhdk.hdk import HDK, QueryNode, ExecutionResult, RelAlgExecutor + +from .base_worker import DbTable, BaseDbWorker from modin.utils import _inherit_docstrings -from modin.config import HdkLaunchParameters +from modin.config import HdkLaunchParameters, OmnisciFragmentSize, HdkFragmentSize + + +class HdkTable(DbTable): + """ + Represents a table in the HDK database. + + Parameters + ---------- + table : QueryNode or ExecutionResult + """ + + def __init__(self, table: Union[QueryNode, ExecutionResult]): + self.name = table.table_name + self._table = table + + def __del__(self): + """Drop table.""" + # The ExecutionResults are cleared by HDK. + if not isinstance(self._table, ExecutionResult): + HdkWorker.dropTable(self.name) + + @property + @_inherit_docstrings(DbTable.shape) + def shape(self) -> Tuple[int, int]: + shape = getattr(self, "_shape", None) + if shape is None: + self._shape = shape = self.scan().shape + return shape + + @property + @_inherit_docstrings(DbTable.column_names) + def column_names(self) -> List[str]: + names = getattr(self, "_column_names", None) + if names is None: + self._column_names = names = list(self.scan().schema) + return names + + @_inherit_docstrings(DbTable.to_arrow) + def to_arrow(self) -> pa.Table: + return ( + self._table.to_arrow() + if isinstance(self._table, ExecutionResult) + else self._table.run().to_arrow() + ) + + def scan(self): + """ + Return a scan query node referencing this table. + + Returns + ------- + QueryNode + """ + if isinstance(self._table, QueryNode): + return self._table + scan = getattr(self, "_scan", None) + if scan is None: + self._scan = scan = HdkWorker._hdk().scan(self.name) + return scan @_inherit_docstrings(BaseDbWorker) -class HdkWorker(BaseDbWorker): +class HdkWorker(BaseDbWorker): # noqa: PR01 """PyHDK based wrapper class for HDK storage format.""" - _config = None - _storage = None - _data_mgr = None - _calcite = None - _executor = None + def __new__(cls, *args, **kwargs): + instance = getattr(cls, "_instance", None) + if instance is None: + cls._instance = instance = object.__new__(cls) + return instance @classmethod - def setup_engine(cls): - """ - Initialize PyHDK. + def dropTable(cls, name: str): + cls.dropTable = cls._hdk().drop_table + cls.dropTable(name) - Do nothing if it is initiliazed already. 
- """ - if cls._executor is None: - cls._config = pyhdk.buildConfig(**HdkLaunchParameters.get()) - cls._storage = pyhdk.storage.ArrowStorage(1) - cls._data_mgr = pyhdk.storage.DataMgr(cls._config) - cls._data_mgr.registerDataProvider(cls._storage) - - cls._calcite = pyhdk.sql.Calcite(cls._storage, cls._config) - cls._executor = pyhdk.Executor(cls._data_mgr, cls._config) + @classmethod + def executeDML(cls, query: str): + return cls.executeRA(query, True) - def __init__(self): - """Initialize HDK storage format.""" - self.setup_engine() + @classmethod + def executeRA(cls, query: str, exec_calcite=False): + hdk = cls._hdk() + if exec_calcite or query.startswith("execute calcite"): + ra = hdk._calcite.process(query, db_name="hdk", legacy_syntax=True) + else: + ra = query + ra_executor = RelAlgExecutor(hdk._executor, hdk._schema_mgr, hdk._data_mgr, ra) + return HdkTable(ra_executor.execute(device_type=cls._preferred_device)) @classmethod - def dropTable(cls, name): - cls._storage.dropTable(name) + def import_arrow_table(cls, table: pa.Table, name: Optional[str] = None): + name = cls._genName(name) + table = cls.cast_to_compatible_types(table) + fragment_size = cls.compute_fragment_size(table) + return HdkTable(cls._hdk().import_arrow(table, name, fragment_size)) @classmethod - def _executeRelAlgJson(cls, ra): + def compute_fragment_size(cls, table): """ - Execute RelAlg JSON query. + Compute fragment size to be used for table import. Parameters ---------- - ra : str - RelAlg JSON string. + table : pyarrow.Table + A table to import. Returns ------- - pyarrow.Table - Execution result. + int + Fragment size to use for import. """ - rel_alg_executor = pyhdk.sql.RelAlgExecutor( - cls._executor, cls._storage, cls._data_mgr, ra - ) - res = rel_alg_executor.execute() - return res.to_arrow() - - @classmethod - def executeDML(cls, query): - ra = cls._calcite.process(query, db_name="hdk") - return cls._executeRelAlgJson(ra) - - @classmethod - def executeRA(cls, query): - if query.startswith("execute relalg"): - # 14 == len("execute relalg") - ra = query[14:] + fragment_size = HdkFragmentSize.get() + if fragment_size is None: + fragment_size = OmnisciFragmentSize.get() + if fragment_size is None: + if bool(HdkLaunchParameters.get()["cpu_only"]): + cpu_count = os.cpu_count() + if cpu_count is not None: + fragment_size = table.num_rows // cpu_count + fragment_size = min(fragment_size, 2**25) + fragment_size = max(fragment_size, 2**18) + else: + fragment_size = 0 + else: + fragment_size = 2**25 else: - assert query.startswith("execute calcite") - ra = cls._calcite.process(query, db_name="hdk") - - return cls._executeRelAlgJson(ra) + fragment_size = int(fragment_size) + return fragment_size @classmethod - def import_arrow_table(cls, table, name=None): - name = cls._genName(name) + def _hdk(cls) -> HDK: + """ + Initialize and return an HDK instance. - table = cls.cast_to_compatible_types(table) - fragment_size = cls.compute_fragment_size(table) + Returns + ------- + HDK + """ + params = HdkLaunchParameters.get() + cls._preferred_device = ( + "CPU" if bool(HdkLaunchParameters.get()["cpu_only"]) else "GPU" + ) + cls._hdk_instance = HDK(**params) + cls._hdk = cls._get_hdk_instance + return cls._hdk() - opt = pyhdk.storage.TableOptions(fragment_size) - cls._storage.importArrowTable(table, name, opt) + @classmethod + def _get_hdk_instance(cls) -> HDK: + """ + Return the initialized HDK instance. 
- return name + Returns + ------- + HDK + """ + return cls._hdk_instance diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py index 88af4ff509b..3003eef70fc 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/interchange/dataframe_protocol/dataframe.py @@ -210,6 +210,8 @@ def _pyarrow_table(self) -> pa.Table: pyarrow.Table """ at = self._df._execute() + if not isinstance(at, pa.Table): + at = at.to_arrow() assert at is not None return at diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py index 12c57db9673..bb5702f2af3 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition.py @@ -12,7 +12,7 @@ # governing permissions and limitations under the License. """Module provides a partition class for ``HdkOnNativeDataframe`` frame.""" -from typing import Optional, Union +from typing import Union import pandas @@ -20,29 +20,24 @@ from modin.core.dataframe.pandas.partitioning.partition import PandasDataframePartition from ..dataframe.utils import arrow_to_pandas -from ..db_worker import DbWorker +from ..db_worker import DbTable class HdkOnNativeDataframePartition(PandasDataframePartition): """ A partition of ``HdkOnNativeDataframe`` frame. - Class holds either a ``pandas.DataFrame`` or ``pyarrow.Table``. + Class holds either a ``DbTable`` or ``pandas.DataFrame`` or ``pyarrow.Table``. Parameters ---------- - data : pandas.DataFrame or pyarrow.Table + data : DbTable or pandas.DataFrame or pyarrow.Table Partition data in either pandas or PyArrow format. - frame_id : str, optional - A corresponding HDK table name or None. Attributes ---------- - _data : pandas.DataFrame or pyarrow.Table + _data : DbTable or pandas.DataFrame or pyarrow.Table Partition data in either pandas or PyArrow format. - frame_id : str - A corresponding HDK table name if partition was imported - into HDK. Otherwise None. _length_cache : int Length of the partition. 
_width_cache : int @@ -51,23 +46,11 @@ class HdkOnNativeDataframePartition(PandasDataframePartition): def __init__( self, - data: Union[pa.Table, pandas.DataFrame], - frame_id: Optional[str] = None, + data: Union[DbTable, pa.Table, pandas.DataFrame], ): + super().__init__() + assert isinstance(data, (DbTable, pa.Table, pandas.DataFrame)) self._data = data - self.frame_id = frame_id - if isinstance(data, pa.Table): - self._length_cache = data.num_rows - self._width_cache = data.num_columns - else: - assert isinstance(data, pandas.DataFrame) - self._length_cache = len(data) - self._width_cache = len(data.columns) - - def __del__(self): - """Deallocate HDK resources related to the partition.""" - if self.frame_id is not None: - DbWorker.dropTable(self.frame_id) def to_pandas(self): """ @@ -80,7 +63,8 @@ def to_pandas(self): obj = self.get() if isinstance(obj, pandas.DataFrame): return obj - assert isinstance(obj, pa.Table) + if isinstance(obj, DbTable): + obj = obj.to_arrow() return arrow_to_pandas(obj) def to_numpy(self, **kwargs): @@ -104,18 +88,18 @@ def get(self): Returns ------- - pandas.DataFrame or pyarrow.Table + DbTable or pandas.DataFrame or pyarrow.Table """ return self._data @classmethod def put(cls, obj): """ - Create partition from ``pandas.DataFrame`` or ``pyarrow.Table``. + Create partition from ``DbTable`` or ``pandas.DataFrame`` or ``pyarrow.Table``. Parameters ---------- - obj : pandas.DataFrame or pyarrow.Table + obj : DbTable or pandas.DataFrame or pyarrow.Table Source frame. Returns @@ -124,3 +108,28 @@ def put(cls, obj): The new partition. """ return cls(obj) + + @property + def _length_cache(self): + """ + Number of rows. + + Returns + ------- + int + """ + return len(self._data) + + @property + def _width_cache(self): + """ + Number of columns. + + Returns + ------- + int + """ + if isinstance(self._data, pa.Table): + return self._data.num_columns + else: + return self._data.shape[1] diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py index ad7c995a532..dd242bee9d3 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py @@ -22,7 +22,7 @@ ) from ..dataframe.utils import ColNameCodec from ..partitioning.partition import HdkOnNativeDataframePartition -from ..db_worker import DbWorker +from ..db_worker import DbTable, DbWorker from ..calcite_builder import CalciteBuilder from ..calcite_serializer import CalciteSerializer from modin.config import DoUseCalcite @@ -227,7 +227,7 @@ def is_supported_dtype(dtype): ) @classmethod - def run_exec_plan(cls, plan, columns): + def run_exec_plan(cls, plan): """ Run execution plan in HDK storage format to materialize frame. @@ -235,61 +235,64 @@ def run_exec_plan(cls, plan, columns): ---------- plan : DFAlgNode A root of an execution plan tree. - columns : list of str - A frame column names. Returns ------- np.array Created frame's partitions. """ - omniSession = DbWorker() + worker = DbWorker() # First step is to make sure all partitions are in HDK. 
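Since a partition now holds a DbTable, a pyarrow.Table or a pandas.DataFrame, the new _length_cache/_width_cache properties resolve as sketched below (partition_shape and the sample data are hypothetical):

import pandas
import pyarrow as pa

def partition_shape(data):
    # len() covers all three cases: DbTable.__len__ returns shape[0], and both
    # pandas.DataFrame and pyarrow.Table define __len__ as the row count.
    length = len(data)
    width = data.num_columns if isinstance(data, pa.Table) else data.shape[1]
    return length, width

partition_shape(pandas.DataFrame({"a": [1, 2]}))       # (2, 1)
partition_shape(pa.table({"a": [1, 2], "b": [3, 4]}))  # (2, 2)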
frames = plan.collect_frames() for frame in frames: - for p in frame._partitions.flatten(): - if p.frame_id is None: - obj = p.get() - if isinstance(obj, (pandas.DataFrame, pandas.Series)): - p.frame_id = omniSession.import_pandas_dataframe(obj) - else: - assert isinstance(obj, pyarrow.Table) - if obj.num_columns == 0: - # Tables without columns are not supported. - # Creating an empty table with index columns only. - idx_names = ( - frame.index.names - if frame.has_materialized_index - else [None] - ) - idx_names = ColNameCodec.mangle_index_names(idx_names) - obj = pyarrow.table( - {n: [] for n in idx_names}, - schema=pyarrow.schema( - {n: pyarrow.int64() for n in idx_names} - ), - ) - p.frame_id = omniSession.import_arrow_table(obj) + cls.import_table(frame, worker) calcite_plan = CalciteBuilder().build(plan) calcite_json = CalciteSerializer().serialize(calcite_plan) - - cmd_prefix = "execute relalg " - if DoUseCalcite.get(): - cmd_prefix = "execute calcite " - - at = omniSession.executeRA(cmd_prefix + calcite_json) + calcite_json = "execute calcite " + calcite_json + table = worker.executeRA(calcite_json) res = np.empty((1, 1), dtype=np.dtype(object)) - # workaround for https://github.com/modin-project/modin/issues/1851 - if DoUseCalcite.get(): - at = at.rename_columns([ColNameCodec.encode(c) for c in columns]) - res[0][0] = cls._partition_class(at) + res[0][0] = cls._partition_class(table) return res + @classmethod + def import_table(cls, frame, worker=DbWorker()) -> DbTable: + """ + Import the frame's partition data, if required. + + Parameters + ---------- + frame : HdkOnNativeDataframe + worker : DbWorker, optional + + Returns + ------- + DbTable + """ + table = frame._partitions[0][0].get() + if isinstance(table, pandas.DataFrame): + table = worker.import_pandas_dataframe(table) + frame._partitions[0][0] = cls._partition_class(table) + elif isinstance(table, pyarrow.Table): + if table.num_columns == 0: + # Tables without columns are not supported. + # Creating an empty table with index columns only. 
+ idx_names = ( + frame.index.names if frame.has_materialized_index else [None] + ) + idx_names = ColNameCodec.mangle_index_names(idx_names) + table = pyarrow.table( + {n: [] for n in idx_names}, + schema=pyarrow.schema({n: pyarrow.int64() for n in idx_names}), + ) + table = worker.import_arrow_table(table) + frame._partitions[0][0] = cls._partition_class(table) + return table + @classmethod def _names_from_index_cols(cls, cols): """ diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py index 1d42b977485..35dbeb752af 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/test_dataframe.py @@ -20,7 +20,7 @@ from pandas._testing import ensure_clean -from modin.config import StorageFormat, DoUseCalcite +from modin.config import StorageFormat from modin.pandas.test.utils import ( io_ops_bad_exc, default_to_pandas_ignore_string, @@ -54,6 +54,9 @@ from modin.experimental.core.execution.native.implementations.hdk_on_native.df_algebra import ( FrameNode, ) +from modin.experimental.core.execution.native.implementations.hdk_on_native.calcite_serializer import ( + CalciteSerializer, +) # Our configuration in pytest.ini requires that we explicitly catch all @@ -221,9 +224,6 @@ def test_null_col(self, null_dtype): with ForceHdkImport(exp): exp = to_pandas(exp) exp["c"] = exp["c"].astype("string") - # The arrow table contains empty strings, when reading as category. - assert all(v == "" for v in exp["c"]) - exp["c"] = None df_equals(ref, exp) @@ -939,7 +939,7 @@ def groupby(df, **kwargs): run_and_compare(groupby, data=self.data) @pytest.mark.parametrize("by", [["a"], ["a", "b", "c"]]) - @pytest.mark.parametrize("agg", ["sum", "size", "mean"]) + @pytest.mark.parametrize("agg", ["sum", "size", "mean", "median"]) @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_agg_by_col(self, by, agg, as_index): def simple_agg(df, **kwargs): @@ -1263,19 +1263,7 @@ def groupby(df, **kwargs): @pytest.mark.parametrize("invert", [True, False]) @pytest.mark.parametrize("select", [True, False]) @pytest.mark.parametrize("ascending", [None, True, False]) - @pytest.mark.parametrize( - "use_calcite", - [ - False, - pytest.param( - True, - marks=pytest.mark.xfail( - reason="Function ROW_NUMBER() is not yet supported by Calcite" - ), - ), - ], - ) - def test_head_tail(self, op, n, invert, select, ascending, use_calcite): + def test_head_tail(self, op, n, invert, select, ascending): def head(df, **kwargs): if invert: df = df[~df["col3"].isna()] @@ -1287,13 +1275,8 @@ def head(df, **kwargs): df = getattr(df, op)(n) return df.sort_values(list(df.columns)) - orig_value = DoUseCalcite.get() - DoUseCalcite._value = use_calcite - try: - # When invert is false, the rowid column is materialized. - run_and_compare(head, data=test_data["int_data"], force_lazy=invert) - finally: - DoUseCalcite._value = orig_value + # When invert is false, the rowid column is materialized. 
+ run_and_compare(head, data=test_data["int_data"], force_lazy=invert) class TestAgg: @@ -1978,16 +1961,7 @@ def compute(df, operation, **kwargs): force_hdk_execute=force_hdk, ) - @pytest.mark.parametrize( - "force_hdk", - [ - False, - pytest.param( - True, - marks=pytest.mark.xfail(reason="Invert is not yet supported by HDK"), - ), - ], - ) + @pytest.mark.parametrize("force_hdk", [False, True]) def test_invert_op(self, force_hdk): def invert(df, **kwargs): return ~df @@ -2037,6 +2011,33 @@ def dt_hour(df, **kwargs): run_and_compare(dt_hour, data=self.datetime_data) + @pytest.mark.parametrize("cast", [True, False]) + @pytest.mark.parametrize("unit", CalciteSerializer._TIMESTAMP_PRECISION.keys()) + def test_dt_serialization(self, cast, unit): + fill_value = np.datetime64(3, unit) + + def serialize(df, **kwargs): + if cast: + df = df.astype(f"datetime64[{unit}]") + return df.fillna(fill_value) + + def cmp(df1, df2): + assert df1["date"].max().asm8 == fill_value + assert df2["date"].max().asm8 == fill_value + df_equals(df1, df2) + + run_and_compare( + serialize, + data={ + "date": [ + np.datetime64(1, unit), + np.datetime64(2, unit), + None, + ] + }, + comparator=cmp, + ) + class TestCategory: data = { diff --git a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py index 0dc5a5134d6..1fa289535cd 100644 --- a/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py +++ b/modin/experimental/core/execution/native/implementations/hdk_on_native/test/utils.py @@ -18,18 +18,12 @@ import datetime import numpy as np from pandas.api.types import is_datetime64_any_dtype -import pyarrow as pa from modin.pandas.test.utils import ( df_equals, io_ops_bad_exc, eval_io as general_eval_io, ) -from ..df_algebra import FrameNode - -from modin.experimental.core.execution.native.implementations.hdk_on_native.db_worker import ( - DbWorker, -) def eval_io( @@ -58,6 +52,15 @@ def hdk_comparator(df1, df2, **kwargs): # Aligning DateTime dtypes because of the bug related to the `parse_dates` parameter: # https://github.com/modin-project/modin/issues/3485 df1, df2 = align_datetime_dtypes(df1, df2) + + # 1. Replace NA with empty strings. HDK treats empty strings and NA equally. + # 2. HdkWorker.cast_to_compatible_types() converts all categorical columns to string. + for dtype in ("object", "category"): + for df in (df1, df2): + sdf = df.select_dtypes(dtype) + if len(sdf.columns) != 0: + sdf = sdf.fillna("") if dtype == "object" else sdf.astype(str) + df[sdf.columns] = sdf[sdf.columns] comparator(df1, df2, **kwargs) general_eval_io( @@ -158,27 +161,14 @@ def __init__(self, *dfs): for df in dfs: if not isinstance(df, (pd.DataFrame, pd.Series)): continue - df.shape # to trigger real execution if df.empty: continue - modin_frame = df._query_compiler._modin_frame - partition = modin_frame._partitions[0][0] - if partition.frame_id is not None: - continue - frame = partition.get() - if isinstance(frame, (pandas.DataFrame, pandas.Series)): - frame = pa.Table.from_pandas(frame) - if isinstance(frame, pa.Table): - _, cols = modin_frame._partition_mgr_cls._get_unsupported_cols(frame) - if len(cols) != 0: - continue - frame_id = DbWorker().import_arrow_table(frame) - else: - raise TypeError( - f"Unexpected storage format, expected pandas.DataFrame or pyarrow.Table, got: {type(frame)}." 
- ) - partition.frame_id = frame_id - self._imported_frames.append((df, frame_id)) + try: + modin_frame = df._query_compiler._modin_frame + modin_frame.force_import() + self._imported_frames.append(df) + except NotImplementedError: + ... def __enter__(self): return self @@ -194,28 +184,20 @@ def export_frames(self): that was just exported from HDK. """ result = [] - for df, frame_id in self._imported_frames: + for df in self._imported_frames: # Append `TransformNode`` selecting all the columns (SELECT * FROM frame_id) df = df[df.columns.tolist()] modin_frame = df._query_compiler._modin_frame - # Forcibly executing plan via HDK. We can't use `modin_frame._execute()` here - # as it has a chance of running via pyarrow bypassing HDK - new_partitions = modin_frame._partition_mgr_cls.run_exec_plan( - modin_frame._op, - modin_frame._table_cols, - ) - modin_frame._partitions = new_partitions - modin_frame._op = FrameNode(modin_frame) + # Forcibly executing plan via HDK. + mode = modin_frame._force_execution_mode + modin_frame._force_execution_mode = "hdk" + modin_frame._execute() + modin_frame._force_execution_mode = mode result.append(df) return result def __exit__(self, exc_type, exc_val, exc_tb): - for df, frame_id in self._imported_frames: - actual_frame_id = df._query_compiler._modin_frame._partitions[0][0].frame_id - DbWorker().dropTable(frame_id) - if actual_frame_id == frame_id: - df._query_compiler._modin_frame._partitions[0][0].frame_id = None - self._imported_frames = [] + self._imported_frames.clear() def set_execution_mode(frame, mode, recursive=False): diff --git a/modin/experimental/sql/hdk/query.py b/modin/experimental/sql/hdk/query.py index 6ede4493d25..ded9a93fa7e 100644 --- a/modin/experimental/sql/hdk/query.py +++ b/modin/experimental/sql/hdk/query.py @@ -69,10 +69,10 @@ def hdk_query(query: str, **kwargs) -> pd.DataFrame: modin.pandas.DataFrame Execution result. """ - worker = HdkWorker() if len(kwargs) > 0: - query = _build_query(query, kwargs, worker.import_arrow_table) - df = from_arrow(worker.executeDML(query)) + query = _build_query(query, kwargs) + table = HdkWorker().executeDML(query) + df = from_arrow(table.to_arrow()) mdf = df._query_compiler._modin_frame schema = mdf._partitions[0][0].get().schema # HDK returns strings as dictionary. For the proper conversion to @@ -87,7 +87,7 @@ def hdk_query(query: str, **kwargs) -> pd.DataFrame: return df -def _build_query(query: str, frames: dict, import_table: callable) -> str: +def _build_query(query: str, frames: dict) -> str: """ Build query to be executed. @@ -100,8 +100,6 @@ def _build_query(query: str, frames: dict, import_table: callable) -> str: SQL query to be processed. frames : dict DataFrames referenced by the query. - import_table : callable - Used to import tables and assign the table names. 
Returns ------- @@ -112,22 +110,14 @@ def _build_query(query: str, frames: dict, import_table: callable) -> str: for name, df in frames.items(): assert isinstance(df._query_compiler, DFAlgQueryCompiler) mf = df._query_compiler._modin_frame - if not mf._has_arrow_table(): - mf._execute() - assert mf._has_arrow_table() - part = mf._partitions[0][0] - at = part.get() - - if part.frame_id is None: - part.frame_id = import_table(at) - + table = mf.force_import() alias.append("WITH " if len(alias) == 0 else "\n),\n") alias.extend((name, " AS (\n", " SELECT\n")) - for i, col in enumerate(at.column_names): + for i, col in enumerate(table.column_names): alias.append(" " if i == 0 else ",\n ") alias.extend(('"', col, '"', " AS ", '"', ColNameCodec.decode(col), '"')) - alias.extend(("\n FROM\n ", part.frame_id)) + alias.extend(("\n FROM\n ", table.name)) alias.extend(("\n)\n", query)) return "".join(alias) diff --git a/modin/numpy/arr.py b/modin/numpy/arr.py index 42f79f0a365..dffc1188d90 100644 --- a/modin/numpy/arr.py +++ b/modin/numpy/arr.py @@ -2239,7 +2239,7 @@ def argmax(self, axis=None, out=None, keepdims=None): na_mask = self._query_compiler.isna().any(axis=apply_axis) if na_mask.any(axis=apply_axis ^ 1).to_numpy()[0, 0]: na_idxs = self._query_compiler.isna().idxmax(axis=apply_axis) - result = na_mask.where(na_idxs, result) + result = na_idxs.where(na_mask, result) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: result = result.to_numpy()[0, 0] @@ -2304,7 +2304,7 @@ def argmin(self, axis=None, out=None, keepdims=None): na_mask = self._query_compiler.isna().any(axis=apply_axis) if na_mask.any(axis=apply_axis ^ 1).to_numpy()[0, 0]: na_idxs = self._query_compiler.isna().idxmax(axis=apply_axis) - result = na_mask.where(na_idxs, result) + result = na_idxs.where(na_mask, result) new_ndim = self._ndim - 1 if not keepdims else self._ndim if new_ndim == 0: result = result.to_numpy()[0, 0] diff --git a/modin/numpy/test/test_array.py b/modin/numpy/test/test_array.py index f96962741d0..15284225ad2 100644 --- a/modin/numpy/test/test_array.py +++ b/modin/numpy/test/test_array.py @@ -271,6 +271,16 @@ def test_array_where(): assert_scalar_or_array_equal(modin_result, numpy_result) +@pytest.mark.parametrize("method", ["argmax", "argmin"]) +def test_argmax_argmin(method): + numpy_arr = numpy.array([[1, 2, 3], [4, 5, np.NaN]]) + modin_arr = np.array(numpy_arr) + assert_scalar_or_array_equal( + getattr(np, method)(modin_arr, axis=1), + getattr(numpy, method)(numpy_arr, axis=1), + ) + + def test_flatten(): numpy_flat_arr = numpy.random.randint(-100, 100, size=100) modin_flat_arr = np.array(numpy_flat_arr) diff --git a/modin/pandas/__init__.py b/modin/pandas/__init__.py index 91457b0c47b..dd04f00f052 100644 --- a/modin/pandas/__init__.py +++ b/modin/pandas/__init__.py @@ -13,15 +13,22 @@ import pandas import warnings +from packaging import version -__pandas_version__ = "2.0.2" +__pandas_version__ = "2.0" -if pandas.__version__ != __pandas_version__: +if ( + version.parse(pandas.__version__).release[:2] + != version.parse(__pandas_version__).release[:2] +): warnings.warn( f"The pandas version installed ({pandas.__version__}) does not match the supported pandas version in" - + f" Modin ({__pandas_version__}). This may cause undesired side effects!" + + f" Modin ({__pandas_version__}.X). This may cause undesired side effects!" 
) +# to not pollute namespace +del version + with warnings.catch_warnings(): warnings.simplefilter("ignore") from pandas import ( @@ -149,6 +156,11 @@ def _update_engine(publisher: Parameter): initialize_unidist() elif publisher.get() == "Cloudray": + warnings.warn( + "Cloud feature is deprecated and will be removed in 0.24.0 release", + DeprecationWarning, + ) + from modin.experimental.cloud import get_connection conn = get_connection() @@ -174,10 +186,20 @@ def init_remote_ray(partition): else: get_connection().modules["modin"].set_execution("Ray", StorageFormat.get()) elif publisher.get() == "Cloudpython": + warnings.warn( + "Cloud feature is deprecated and will be removed in 0.24.0 release", + DeprecationWarning, + ) + from modin.experimental.cloud import get_connection get_connection().modules["modin"].set_execution("Python") elif publisher.get() == "Cloudnative": + warnings.warn( + "Cloud feature is deprecated and will be removed in 0.24.0 release", + DeprecationWarning, + ) + from modin.experimental.cloud import get_connection assert ( diff --git a/modin/pandas/base.py b/modin/pandas/base.py index d6a513851f2..e6eb94d64c7 100644 --- a/modin/pandas/base.py +++ b/modin/pandas/base.py @@ -1083,8 +1083,6 @@ def clip( if upper is not None and np.any(np.isnan(upper)): upper = None if is_list_like(lower) or is_list_like(upper): - if axis is None: - raise ValueError("Must specify axis = 0 or 1") lower = self._validate_other(lower, axis) upper = self._validate_other(upper, axis) # FIXME: Judging by pandas docs `*args` and `**kwargs` serves only compatibility diff --git a/modin/pandas/dataframe.py b/modin/pandas/dataframe.py index 938625c550f..e0738263346 100644 --- a/modin/pandas/dataframe.py +++ b/modin/pandas/dataframe.py @@ -759,11 +759,18 @@ def equals(self, other): # noqa: PR01, RT01, D200 if isinstance(other, pandas.DataFrame): # Copy into a Modin DataFrame to simplify logic below other = self.__constructor__(other) - return ( - self.index.equals(other.index) - and self.columns.equals(other.columns) - and self.eq(other).all().all() + + if ( + type(self) != type(other) + or not self.index.equals(other.index) + or not self.columns.equals(other.columns) + ): + return False + + result = self.__constructor__( + query_compiler=self._query_compiler.equals(other._query_compiler) ) + return result.all(axis=None) def _update_var_dicts_in_kwargs(self, expr, kwargs): """ diff --git a/modin/pandas/groupby.py b/modin/pandas/groupby.py index 3d208bc02be..3ef68f17c2e 100644 --- a/modin/pandas/groupby.py +++ b/modin/pandas/groupby.py @@ -39,6 +39,7 @@ from modin.core.dataframe.algebra.default2pandas.groupby import GroupBy from modin.config import IsExperimental from .series import Series +from .window import RollingGroupby from .utils import is_label @@ -135,7 +136,20 @@ def __init__( } self._kwargs.update(kwargs) - def __override(self, **kwargs): + def _override(self, **kwargs): + """ + Override groupby parameters. + + Parameters + ---------- + **kwargs : dict + Parameters to override. + + Returns + ------- + DataFrameGroupBy + A groupby object with new parameters. 
+ """ new_kw = dict( df=self._df, by=self._by, @@ -836,7 +850,7 @@ def do_relabel(obj_to_relabel): # for list-list aggregation pandas always puts # groups as index in the result, ignoring as_index, # so we have to reset it to default value - res = self.__override(as_index=True)._wrap_aggregation( + res = self._override(as_index=True)._wrap_aggregation( qc_method=type(self._query_compiler).groupby_agg, numeric_only=False, agg_func=func, @@ -916,7 +930,7 @@ def var(self, ddof=1, engine=None, engine_kwargs=None, numeric_only=False): ) def get_group(self, name, obj=None): - work_object = self.__override( + work_object = self._override( df=obj if obj is not None else self._df, as_index=True ) @@ -1067,7 +1081,7 @@ def median(self, numeric_only=False): def head(self, n=5): # groupby().head()/.tail() ignore as_index, so override it to True - work_object = self.__override(as_index=True) + work_object = self._override(as_index=True) return work_object._check_index( work_object._wrap_aggregation( @@ -1184,7 +1198,7 @@ def cumcount(self, ascending=True): def tail(self, n=5): # groupby().head()/.tail() ignore as_index, so override it to True - work_object = self.__override(as_index=True) + work_object = self._override(as_index=True) return work_object._check_index( work_object._wrap_aggregation( type(work_object._query_compiler).groupby_tail, @@ -1199,7 +1213,7 @@ def expanding(self, *args, **kwargs): return self._default_to_pandas(lambda df: df.expanding(*args, **kwargs)) def rolling(self, *args, **kwargs): - return self._default_to_pandas(lambda df: df.rolling(*args, **kwargs)) + return RollingGroupby(self, *args, **kwargs) def hist( self, diff --git a/modin/pandas/series.py b/modin/pandas/series.py index d873ab7f488..aa061598657 100644 --- a/modin/pandas/series.py +++ b/modin/pandas/series.py @@ -23,6 +23,7 @@ from pandas.core.dtypes.common import ( is_dict_like, is_list_like, + is_categorical_dtype, ) from pandas.core.series import _coerce_method from pandas._libs.lib import no_default, NoDefault @@ -421,7 +422,7 @@ def __repr__(self): if ( isinstance(temp_df, pandas.Series) and temp_df.name is not None - and temp_df.dtype == "category" + and is_categorical_dtype(temp_df.dtype) ): maxsplit = 2 return temp_str.rsplit("\n", maxsplit)[0] + "\n{}{}{}{}".format( @@ -926,11 +927,26 @@ def equals(self, other): # noqa: PR01, RT01, D200 """ Test whether two objects contain the same elements. """ - return ( - self.name == other.name - and self.index.equals(other.index) - and self.eq(other).all() - ) + if isinstance(other, pandas.Series): + # Copy into a Modin Series to simplify logic below + other = self.__constructor__(other) + + if type(self) != type(other) or not self.index.equals(other.index): + return False + + old_name_self = self.name + old_name_other = other.name + try: + self.name = "temp_name_for_equals_op" + other.name = "temp_name_for_equals_op" + # this function should return only scalar + res = self.__constructor__( + query_compiler=self._query_compiler.equals(other._query_compiler) + ) + finally: + self.name = old_name_self + other.name = old_name_other + return res.all() def explode(self, ignore_index: bool = False): # noqa: PR01, RT01, D200 """ @@ -1397,6 +1413,22 @@ def reindex( fill_value=fill_value, ) + def rename_axis( + self, + mapper=no_default, + *, + index=no_default, + axis=0, + copy=True, + inplace=False, + ): # noqa: PR01, RT01, D200 + """ + Set the name of the axis for the index or columns. 
+ """ + return super().rename_axis( + mapper=mapper, index=index, axis=axis, copy=copy, inplace=inplace + ) + def rename( self, index=None, diff --git a/modin/pandas/test/data/issue_1930.csv b/modin/pandas/test/data/issue_1930.csv new file mode 100644 index 00000000000..d61b5455de7 --- /dev/null +++ b/modin/pandas/test/data/issue_1930.csv @@ -0,0 +1,5 @@ +,col1,col2,col3,col4,col5 +0,0,4,8,12,0 +1,1,5,9,13,0 +2,2,6,10,14,0 +3,3,7,11,15,0 diff --git a/modin/pandas/test/dataframe/test_binary.py b/modin/pandas/test/dataframe/test_binary.py index 92d5c25f6e9..c5b9e775d6c 100644 --- a/modin/pandas/test/dataframe/test_binary.py +++ b/modin/pandas/test/dataframe/test_binary.py @@ -13,6 +13,7 @@ import pytest import pandas +import numpy as np import matplotlib import modin.pandas as pd @@ -223,28 +224,81 @@ def test_multi_level_comparison(data, op): getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1) -def test_equals(): - frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 4, 1]} - modin_df1 = pd.DataFrame(frame_data) - modin_df2 = pd.DataFrame(frame_data) - - assert modin_df1.equals(modin_df2) - - df_equals(modin_df1, modin_df2) - df_equals(modin_df1, pd.DataFrame(modin_df1)) +@pytest.mark.parametrize( + "frame1_data,frame2_data,expected_pandas_equals", + [ + pytest.param({}, {}, True, id="two_empty_dataframes"), + pytest.param([[1]], [[0]], False, id="single_unequal_values"), + pytest.param([[None]], [[None]], True, id="single_none_values"), + pytest.param([[np.NaN]], [[np.NaN]], True, id="single_nan_values"), + pytest.param({1: [10]}, {1.0: [10]}, True, id="different_column_types"), + pytest.param({1: [10]}, {2: [10]}, False, id="different_columns"), + pytest.param( + pandas.DataFrame({1: [10]}, index=[1]), + pandas.DataFrame({1: [10]}, index=[1.0]), + True, + id="different_index_types", + ), + pytest.param( + pandas.DataFrame({1: [10]}, index=[1]), + pandas.DataFrame({1: [10]}, index=[2]), + False, + id="different_indexes", + ), + pytest.param({1: [10]}, {1: [10.0]}, False, id="different_value_types"), + pytest.param( + [[1, 2], [3, 4]], + [[1, 2], [3, 4]], + True, + id="equal_two_by_two_dataframes", + ), + pytest.param( + [[1, 2], [3, 4]], + [[5, 2], [3, 4]], + False, + id="unequal_two_by_two_dataframes", + ), + pytest.param( + [[1, 1]], + [[1]], + False, + id="different_row_lengths", + ), + pytest.param( + [[1], [1]], + [[1]], + False, + id="different_column_lengths", + ), + ], +) +def test_equals(frame1_data, frame2_data, expected_pandas_equals): + modin_df1 = pd.DataFrame(frame1_data) + pandas_df1 = pandas.DataFrame(frame1_data) + modin_df2 = pd.DataFrame(frame2_data) + pandas_df2 = pandas.DataFrame(frame2_data) + + pandas_equals = pandas_df1.equals(pandas_df2) + assert pandas_equals == expected_pandas_equals, ( + "Test expected pandas to say the dataframes were" + + f"{'' if expected_pandas_equals else ' not'} equal, but they were" + + f"{' not' if expected_pandas_equals else ''} equal." 
+ ) - frame_data = {"col1": [2.9, 3, 3, 3], "col2": [2, 3, 5, 1]} - modin_df3 = pd.DataFrame(frame_data, index=list("abcd")) + assert modin_df1.equals(modin_df2) == pandas_equals + assert modin_df1.equals(pandas_df2) == pandas_equals - assert not modin_df1.equals(modin_df3) - with pytest.raises(AssertionError): - df_equals(modin_df3, modin_df1) +def test_equals_several_partitions(): + modin_series1 = pd.concat([pd.DataFrame([0, 1]), pd.DataFrame([None, 1])]) + modin_series2 = pd.concat([pd.DataFrame([0, 1]), pd.DataFrame([1, None])]) + assert not modin_series1.equals(modin_series2) - with pytest.raises(AssertionError): - df_equals(modin_df3, modin_df2) - assert modin_df1.equals(modin_df2._query_compiler.to_pandas()) +def test_equals_with_nans(): + df1 = pd.DataFrame([0, 1, None], dtype="uint8[pyarrow]") + df2 = pd.DataFrame([None, None, None], dtype="uint8[pyarrow]") + assert not df1.equals(df2) @pytest.mark.parametrize("is_more_other_partitions", [True, False]) diff --git a/modin/pandas/test/dataframe/test_map_metadata.py b/modin/pandas/test/dataframe/test_map_metadata.py index 66208d07004..8844e8d1766 100644 --- a/modin/pandas/test/dataframe/test_map_metadata.py +++ b/modin/pandas/test/dataframe/test_map_metadata.py @@ -320,6 +320,12 @@ def test_copy(data): df_equals(modin_df, modin_df_cp) +def test_copy_empty_dataframe(): + df = pd.DataFrame(range(3)) + res = df[:0].copy() + assert res.dtypes.equals(df.dtypes) + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_dtypes(data): modin_df = pd.DataFrame(data) @@ -757,6 +763,7 @@ def test_infer_objects_single_partition(): @pytest.mark.parametrize( "convert_floating", bool_arg_values, ids=arg_keys("convert_floating", bool_arg_keys) ) +@pytest.mark.exclude_in_sanity def test_convert_dtypes_single_partition( infer_objects, convert_string, convert_integer, convert_boolean, convert_floating ): @@ -844,6 +851,7 @@ def test_convert_dtypes_5653(): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("axis", axis_values, ids=axis_keys) @pytest.mark.parametrize("bound_type", ["list", "series"], ids=["list", "series"]) +@pytest.mark.exclude_in_sanity def test_clip(request, data, axis, bound_type): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) @@ -893,6 +901,12 @@ def test_clip(request, data, axis, bound_type): modin_df.clip(lower=[1, 2, 3], axis=None) +def test_clip_4485(): + modin_result = pd.DataFrame([1]).clip([3]) + pandas_result = pandas.DataFrame([1]).clip([3]) + df_equals(modin_result, pandas_result) + + def test_drop(): frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} simple = pandas.DataFrame(frame_data) @@ -1037,6 +1051,7 @@ def test_droplevel(): ids=["None", "string", "name", "tuple", "list"], ) @pytest.mark.parametrize("ignore_index", [True, False], ids=["True", "False"]) +@pytest.mark.exclude_in_sanity def test_drop_duplicates(data, keep, subset, ignore_index): modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) @@ -1690,6 +1705,7 @@ def test___round__(): ], ) @pytest.mark.parametrize("dtype", [None, "str"]) +@pytest.mark.exclude_in_sanity def test_constructor_from_modin_series(get_index, get_columns, dtype): modin_df, pandas_df = create_test_dfs(test_data_values[0]) diff --git a/modin/pandas/test/internals/test_benchmark_mode.py b/modin/pandas/test/internals/test_benchmark_mode.py index e42e4ee76c6..8d67503ff33 100644 --- a/modin/pandas/test/internals/test_benchmark_mode.py +++ b/modin/pandas/test/internals/test_benchmark_mode.py 
@@ -13,9 +13,10 @@ import unittest.mock as mock +import pytest + import modin.pandas as pd -from modin.pandas.test.utils import test_data_values -from modin.config import BenchmarkMode, Engine +from modin.config import Engine engine = Engine.get() @@ -46,26 +47,17 @@ ) -def test_from_environment_variable(): - assert BenchmarkMode.get() - with mock.patch(wait_method) as wait: - pd.DataFrame(test_data_values[0]).mean() - - wait.assert_called() - - -def test_turn_off(): +@pytest.mark.parametrize("set_benchmark_mode", [False], indirect=True) +def test_turn_off(set_benchmark_mode): df = pd.DataFrame([0]) - BenchmarkMode.put(False) with mock.patch(wait_method) as wait: df.dropna() wait.assert_not_called() -def test_turn_on(): - BenchmarkMode.put(False) +@pytest.mark.parametrize("set_benchmark_mode", [True], indirect=True) +def test_turn_on(set_benchmark_mode): df = pd.DataFrame([0]) - BenchmarkMode.put(True) with mock.patch(wait_method) as wait: df.dropna() wait.assert_called() diff --git a/modin/pandas/test/test_api.py b/modin/pandas/test/test_api.py index 58f8088ae63..37984eb3d10 100644 --- a/modin/pandas/test/test_api.py +++ b/modin/pandas/test/test_api.py @@ -272,8 +272,6 @@ def test_series_api_equality(): # These have to be checked manually allowed_different = ["to_hdf", "hist"] - # skip verifying .rename_axis() due to https://github.com/modin-project/modin/issues/5077 - allowed_different.append("rename_axis") assert_parameters_eq((pandas.Series, pd.Series), modin_dir, allowed_different) diff --git a/modin/pandas/test/test_groupby.py b/modin/pandas/test/test_groupby.py index 6edeabb4157..2348fb8abdf 100644 --- a/modin/pandas/test/test_groupby.py +++ b/modin/pandas/test/test_groupby.py @@ -2651,3 +2651,135 @@ def test_groupby_pct_change_diff_6194(): df._to_pandas(), lambda df: df.groupby(by="by").diff(), ) + + +def eval_rolling(md_window, pd_window): + eval_general(md_window, pd_window, lambda window: window.count()) + eval_general(md_window, pd_window, lambda window: window.sum()) + eval_general(md_window, pd_window, lambda window: window.mean()) + eval_general(md_window, pd_window, lambda window: window.median()) + eval_general(md_window, pd_window, lambda window: window.var()) + eval_general(md_window, pd_window, lambda window: window.std()) + eval_general(md_window, pd_window, lambda window: window.min()) + eval_general(md_window, pd_window, lambda window: window.max()) + eval_general(md_window, pd_window, lambda window: window.corr()) + eval_general(md_window, pd_window, lambda window: window.cov()) + eval_general(md_window, pd_window, lambda window: window.skew()) + eval_general(md_window, pd_window, lambda window: window.kurt()) + eval_general( + md_window, pd_window, lambda window: window.apply(lambda df: (df + 10).sum()) + ) + eval_general(md_window, pd_window, lambda window: window.agg("sum")) + eval_general(md_window, pd_window, lambda window: window.quantile(0.2)) + eval_general(md_window, pd_window, lambda window: window.rank()) + + if not md_window._as_index: + # There's a mismatch in group columns when 'as_index=False' + # see: https://github.com/modin-project/modin/issues/6291 + by_cols = list(md_window._groupby_obj._internal_by) + eval_general( + md_window, + pd_window, + lambda window: window.sem().drop(columns=by_cols, errors="ignore"), + ) + else: + eval_general( + md_window, + pd_window, + lambda window: window.sem(), + ) + + +@pytest.mark.parametrize("center", [True, False]) +@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) 
+@pytest.mark.parametrize("as_index", [True, False]) +def test_rolling_int_window(center, closed, as_index): + col_part1 = pd.DataFrame( + { + "by": np.tile(np.arange(15), 10), + "col1": np.arange(150), + "col2": np.arange(10, 160), + } + ) + col_part2 = pd.DataFrame({"col3": np.arange(20, 170)}) + + md_df = pd.concat([col_part1, col_part2], axis=1) + pd_df = md_df._to_pandas() + + if StorageFormat.get() == "Pandas": + assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2 + + md_window = md_df.groupby("by", as_index=as_index).rolling( + 3, center=center, closed=closed + ) + pd_window = pd_df.groupby("by", as_index=as_index).rolling( + 3, center=center, closed=closed + ) + eval_rolling(md_window, pd_window) + + +@pytest.mark.parametrize("center", [True, False]) +@pytest.mark.parametrize("closed", ["right", "left", "both", "neither"]) +@pytest.mark.parametrize("as_index", [True, False]) +@pytest.mark.parametrize("on", [None, "col4"]) +def test_rolling_timedelta_window(center, closed, as_index, on): + col_part1 = pd.DataFrame( + { + "by": np.tile(np.arange(15), 10), + "col1": np.arange(150), + "col2": np.arange(10, 160), + } + ) + col_part2 = pd.DataFrame({"col3": np.arange(20, 170)}) + + if on is not None: + col_part2[on] = pandas.DatetimeIndex( + [ + datetime.date(2020, 1, 1) + datetime.timedelta(hours=12) * i + for i in range(150) + ] + ) + + md_df = pd.concat([col_part1, col_part2], axis=1) + md_df.index = pandas.DatetimeIndex( + [datetime.date(2020, 1, 1) + datetime.timedelta(days=1) * i for i in range(150)] + ) + + pd_df = md_df._to_pandas() + + if StorageFormat.get() == "Pandas": + assert md_df._query_compiler._modin_frame._partitions.shape[1] == 2 + + md_window = md_df.groupby("by", as_index=as_index).rolling( + datetime.timedelta(days=3), center=center, closed=closed, on=on + ) + pd_window = pd_df.groupby("by", as_index=as_index).rolling( + datetime.timedelta(days=3), center=center, closed=closed, on=on + ) + eval_rolling(md_window, pd_window) + + +@pytest.mark.parametrize( + "func", + [ + pytest.param("sum", id="map_reduce_func"), + pytest.param("median", id="full_axis_func"), + ], +) +def test_groupby_deferred_index(func): + # the test is copied from the issue: + # https://github.com/modin-project/modin/issues/6368 + + def perform(lib): + df1 = lib.DataFrame({"a": [1, 1, 2, 2]}) + df2 = lib.DataFrame({"b": [3, 4, 5, 6], "c": [7, 5, 4, 3]}) + + df = lib.concat([df1, df2], axis=1) + df.index = [10, 11, 12, 13] + + grp = df.groupby("a") + grp.indices + + return getattr(grp, func)() + + eval_general(pd, pandas, perform) diff --git a/modin/pandas/test/test_io.py b/modin/pandas/test/test_io.py index b14cf3d130a..ff25601fccf 100644 --- a/modin/pandas/test/test_io.py +++ b/modin/pandas/test/test_io.py @@ -950,6 +950,7 @@ def test_read_csv_google_cloud_storage(self): ], ], ) + @pytest.mark.exclude_in_sanity def test_read_csv_parse_dates( self, names, header, index_col, parse_dates, encoding, encoding_errors ): @@ -1124,6 +1125,7 @@ def test_read_csv_wrong_path(self): condition="config.getoption('--simulate-cloud').lower() != 'off'", reason="The reason of tests fail in `cloud` mode is unknown for now - issue #2340", ) + @pytest.mark.exclude_in_sanity def test_to_csv( self, tmp_path, @@ -1337,6 +1339,16 @@ def test_read_csv_issue_5150(self, set_async_read_mode): if not AsyncReadMode.get(): df_equals(expected_pandas_df, actual_pandas_df) + @pytest.mark.parametrize("usecols", [None, [0, 1, 2, 3, 4]]) + def test_read_csv_1930(self, usecols): + eval_io( + fn_name="read_csv", + # 
read_csv kwargs + filepath_or_buffer="modin/pandas/test/data/issue_1930.csv", + names=["c1", "c2", "c3", "c4", "c5"], + usecols=usecols, + ) + class TestTable: def test_read_table(self, make_csv_file): @@ -1474,6 +1486,7 @@ def test_read_parquet_indexing_by_column(self, tmp_path, engine, make_parquet_fi condition="config.getoption('--simulate-cloud').lower() != 'off'", reason="The reason of tests fail in `cloud` mode is unknown for now - issue #3264", ) + @pytest.mark.exclude_in_sanity def test_read_parquet_directory( self, engine, make_parquet_dir, columns, row_group_size, rows_per_file ): @@ -2032,10 +2045,6 @@ def test_read_excel_every_other_nan(self): io="modin/pandas/test/data/every_other_row_nan.xlsx", ) - @pytest.mark.xfail( - StorageFormat.get() == "Hdk", - reason="The frame contains different dtypes in the same column and could not be converted to arrow", - ) @check_file_leaks def test_read_excel_header_none(self): eval_io( diff --git a/modin/pandas/test/test_series.py b/modin/pandas/test/test_series.py index 45eb9145c3a..513dceec7d2 100644 --- a/modin/pandas/test/test_series.py +++ b/modin/pandas/test/test_series.py @@ -567,12 +567,14 @@ def test___repr__4186(): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.exclude_in_sanity def test___round__(data): modin_series, pandas_series = create_test_series(data) df_equals(round(modin_series), round(pandas_series)) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.exclude_in_sanity def test___setitem__(data): modin_series, pandas_series = create_test_series(data) for key in modin_series.keys(): @@ -1409,6 +1411,12 @@ def test_copy(data): df_equals(modin_series.copy(), pandas_series.copy()) +def test_copy_empty_series(): + ser = pd.Series(range(3)) + res = ser[:0].copy() + assert res.dtype == ser.dtype + + @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) def test_corr(data): modin_series, pandas_series = create_test_series(data) @@ -1897,26 +1905,74 @@ def test_eq(data): inter_df_math_helper(modin_series, pandas_series, "eq") -def test_equals(): - series_data = [2.9, 3, 3, 3] - modin_df1 = pd.Series(series_data) - modin_df2 = pd.Series(series_data) +@pytest.mark.parametrize( + "series1_data,series2_data,expected_pandas_equals", + [ + pytest.param([1], [0], False, id="single_unequal_values"), + pytest.param([None], [None], True, id="single_none_values"), + pytest.param( + pandas.Series(1, name="series1"), + pandas.Series(1, name="series2"), + True, + id="different_names", + ), + pytest.param( + pandas.Series([1], index=[1]), + pandas.Series([1], index=[1.0]), + True, + id="different_index_types", + ), + pytest.param( + pandas.Series([1], index=[1]), + pandas.Series([1], index=[2]), + False, + id="different_index_values", + ), + pytest.param([1], [1.0], False, id="different_value_types"), + pytest.param( + [1, 2], + [1, 2], + True, + id="equal_series_of_length_two", + ), + pytest.param( + [1, 2], + [1, 3], + False, + id="unequal_series_of_length_two", + ), + pytest.param( + [[1, 2]], + [[1]], + False, + id="different_lengths", + ), + ], +) +def test_equals(series1_data, series2_data, expected_pandas_equals): + modin_series1, pandas_df1 = create_test_series(series1_data) + modin_series2, pandas_df2 = create_test_series(series2_data) - assert modin_df1.equals(modin_df2) - assert modin_df1.equals(pd.Series(modin_df1)) - df_equals(modin_df1, modin_df2) - df_equals(modin_df1, pd.Series(modin_df1)) + pandas_equals = pandas_df1.equals(pandas_df2) + 
assert pandas_equals == expected_pandas_equals, ( + "Test expected pandas to say the series were" + + f"{'' if expected_pandas_equals else ' not'} equal, but they were" + + f"{' not' if expected_pandas_equals else ''} equal." + ) + assert modin_series1.equals(modin_series2) == pandas_equals + assert modin_series1.equals(pandas_df2) == pandas_equals - series_data = [2, 3, 5, 1] - modin_df3 = pd.Series(series_data, index=list("abcd")) - assert not modin_df1.equals(modin_df3) +def test_equals_several_partitions(): + modin_series1 = pd.concat([pd.Series([0, 1]), pd.Series([None, 1])]) + modin_series2 = pd.concat([pd.Series([0, 1]), pd.Series([1, None])]) + assert not modin_series1.equals(modin_series2) - with pytest.raises(AssertionError): - df_equals(modin_df3, modin_df1) - with pytest.raises(AssertionError): - df_equals(modin_df3, modin_df2) +def test_equals_with_nans(): + ser1 = pd.Series([0, 1, None], dtype="uint8[pyarrow]") + ser2 = pd.Series([None, None, None], dtype="uint8[pyarrow]") + assert not ser1.equals(ser2) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @@ -1954,6 +2010,7 @@ def test_ffill(data): @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) @pytest.mark.parametrize("reindex", [None, 2, -2]) @pytest.mark.parametrize("limit", [None, 1, 2, 0.5, -1, -2, 1.5]) +@pytest.mark.exclude_in_sanity def test_fillna(data, reindex, limit): modin_series, pandas_series = create_test_series(data) index = pandas_series.index @@ -2376,6 +2433,10 @@ def test_map(data, na_values): ) +@pytest.mark.xfail( + StorageFormat.get() == "Hdk", + reason="https://github.com/intel-ai/hdk/issues/542", +) def test_mask(): modin_series = pd.Series(np.arange(10)) m = modin_series % 3 == 0 @@ -2850,6 +2911,12 @@ def test_repeat_lists(data, repeats): ) +def test_clip_4485(): + modin_result = pd.Series([1]).clip([3]) + pandas_result = pandas.Series([1]).clip([3]) + df_equals(modin_result, pandas_result) + + def test_replace(): modin_series = pd.Series([0, 1, 2, 3, 4]) pandas_series = pandas.Series([0, 1, 2, 3, 4]) @@ -2865,6 +2932,7 @@ def test_replace(): @pytest.mark.parametrize("closed", ["left", "right"]) @pytest.mark.parametrize("label", ["right", "left"]) @pytest.mark.parametrize("level", [None, 1]) +@pytest.mark.exclude_in_sanity def test_resample(closed, label, level): rule = "5T" freq = "H" @@ -3050,6 +3118,7 @@ def test_sample(data): @pytest.mark.parametrize("values_number", [1, 2, 5]) @pytest.mark.parametrize("side", ["left", "right"]) @pytest.mark.parametrize("data", test_data_values, ids=test_data_keys) +@pytest.mark.exclude_in_sanity def test_searchsorted( data, side, values_number, sorter, use_multiindex, single_value_data ): @@ -3312,6 +3381,7 @@ def test_subtract(data): @pytest.mark.parametrize( "min_count", int_arg_values, ids=arg_keys("min_count", int_arg_keys) ) +@pytest.mark.exclude_in_sanity def test_sum(data, axis, skipna, numeric_only, min_count): eval_general( *create_test_series(data), @@ -3670,6 +3740,7 @@ def test_update(data, other_data): ], ) @pytest.mark.parametrize("ascending", bool_arg_values, ids=bool_arg_keys) +@pytest.mark.exclude_in_sanity def test_value_counts(sort, normalize, bins, dropna, ascending): def sort_sensitive_comparator(df1, df2): # We sort indices for Modin and pandas result because of issue #1650 diff --git a/modin/pandas/window.py b/modin/pandas/window.py index 473b2a91d97..4e05cece9e4 100644 --- a/modin/pandas/window.py +++ b/modin/pandas/window.py @@ -20,6 +20,7 @@ from modin.logging import ClassLogger from 
modin.utils import _inherit_docstrings from modin.pandas.utils import cast_function_modin2pandas +from modin.error_message import ErrorMessage @_inherit_docstrings(pandas.core.window.rolling.Window) @@ -39,44 +40,44 @@ def __init__( ): self._dataframe = dataframe self._query_compiler = dataframe._query_compiler - self.window_args = [ - window, - min_periods, - center, - win_type, - on, - axis, - closed, - step, - method, - ] + self.window_kwargs = { + "window": window, + "min_periods": min_periods, + "center": center, + "win_type": win_type, + "on": on, + "axis": axis, + "closed": closed, + "step": step, + "method": method, + } self.axis = axis def mean(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.window_mean( - self.axis, self.window_args, *args, **kwargs + self.axis, self.window_kwargs, *args, **kwargs ) ) def sum(self, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.window_sum( - self.axis, self.window_args, *args, **kwargs + self.axis, self.window_kwargs, *args, **kwargs ) ) def var(self, ddof=1, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.window_var( - self.axis, self.window_args, ddof, *args, **kwargs + self.axis, self.window_kwargs, ddof, *args, **kwargs ) ) def std(self, ddof=1, *args, **kwargs): return self._dataframe.__constructor__( query_compiler=self._query_compiler.window_std( - self.axis, self.window_args, ddof, *args, **kwargs + self.axis, self.window_kwargs, ddof, *args, **kwargs ) ) @@ -103,81 +104,87 @@ def __init__( raise NotImplementedError("step parameter is not implemented yet.") self._dataframe = dataframe self._query_compiler = dataframe._query_compiler - self.rolling_args = [ - window, - min_periods, - center, - win_type, - on, - axis, - closed, - step, - method, - ] + self.rolling_kwargs = { + "window": window, + "min_periods": min_periods, + "center": center, + "win_type": win_type, + "on": on, + "axis": axis, + "closed": closed, + "step": step, + "method": method, + } self.axis = axis + def _call_qc_method(self, method_name, *args, **kwargs): + """ + Call a query compiler method for the specified rolling aggregation. + + Parameters + ---------- + method_name : str + Name of the aggregation. + *args : tuple + Positional arguments to pass to the query compiler method. + **kwargs : dict + Keyword arguments to pass to the query compiler method. + + Returns + ------- + BaseQueryCompiler + QueryCompiler holding the result of the aggregation. + """ + qc_method = getattr(self._query_compiler, f"rolling_{method_name}") + return qc_method(self.axis, self.rolling_kwargs, *args, **kwargs) + + def _aggregate(self, method_name, *args, **kwargs): + """ + Run the specified rolling aggregation. + + Parameters + ---------- + method_name : str + Name of the aggregation. + *args : tuple + Positional arguments to pass to the aggregation. + **kwargs : dict + Keyword arguments to pass to the aggregation. + + Returns + ------- + DataFrame or Series + Result of the aggregation. 
+ """ + qc_result = self._call_qc_method(method_name, *args, **kwargs) + return self._dataframe.__constructor__(query_compiler=qc_result) + def count(self): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_count( - self.axis, self.rolling_args - ) - ) + return self._aggregate("count") def sem(self, *args, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_sem( - self.axis, self.rolling_args, *args, **kwargs - ) - ) + return self._aggregate("sem", *args, **kwargs) def sum(self, *args, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_sum( - self.axis, self.rolling_args, *args, **kwargs - ) - ) + return self._aggregate("sum", *args, **kwargs) def mean(self, *args, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_mean( - self.axis, self.rolling_args, *args, **kwargs - ) - ) + return self._aggregate("mean", *args, **kwargs) def median(self, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_median( - self.axis, self.rolling_args, **kwargs - ) - ) + return self._aggregate("median", **kwargs) def var(self, ddof=1, *args, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_var( - self.axis, self.rolling_args, ddof, *args, **kwargs - ) - ) + return self._aggregate("var", ddof, *args, **kwargs) def std(self, ddof=1, *args, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_std( - self.axis, self.rolling_args, ddof, *args, **kwargs - ) - ) + return self._aggregate("std", ddof, *args, **kwargs) def min(self, *args, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_min( - self.axis, self.rolling_args, *args, **kwargs - ) - ) + return self._aggregate("min", *args, **kwargs) def max(self, *args, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_max( - self.axis, self.rolling_args, *args, **kwargs - ) - ) + return self._aggregate("max", *args, **kwargs) def corr(self, other=None, pairwise=None, *args, **kwargs): from .dataframe import DataFrame @@ -188,11 +195,7 @@ def corr(self, other=None, pairwise=None, *args, **kwargs): elif isinstance(other, Series): other = other._query_compiler.to_pandas().squeeze() - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_corr( - self.axis, self.rolling_args, other, pairwise, *args, **kwargs - ) - ) + return self._aggregate("corr", other, pairwise, *args, **kwargs) def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs): from .dataframe import DataFrame @@ -203,25 +206,13 @@ def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs): elif isinstance(other, Series): other = other._query_compiler.to_pandas().squeeze() - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_cov( - self.axis, self.rolling_args, other, pairwise, ddof, **kwargs - ) - ) + return self._aggregate("cov", other, pairwise, ddof, **kwargs) def skew(self, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_skew( - self.axis, self.rolling_args, **kwargs - ) - ) + return self._aggregate("skew", **kwargs) def kurt(self, **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_kurt( - 
self.axis, self.rolling_args, **kwargs - ) - ) + return self._aggregate("kurt", **kwargs) def apply( self, @@ -233,18 +224,7 @@ def apply( kwargs=None, ): func = cast_function_modin2pandas(func) - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_apply( - self.axis, - self.rolling_args, - func, - raw, - engine, - engine_kwargs, - args, - kwargs, - ) - ) + return self._aggregate("apply", func, raw, engine, engine_kwargs, args, kwargs) def aggregate( self, @@ -255,9 +235,8 @@ def aggregate( from .dataframe import DataFrame dataframe = DataFrame( - query_compiler=self._query_compiler.rolling_aggregate( - self.axis, - self.rolling_args, + query_compiler=self._call_qc_method( + "aggregate", func, *args, **kwargs, @@ -265,7 +244,7 @@ def aggregate( ) if isinstance(self._dataframe, DataFrame): return dataframe - elif is_list_like(func): + elif is_list_like(func) and dataframe.columns.nlevels > 1: dataframe.columns = dataframe.columns.droplevel() return dataframe else: @@ -274,26 +253,86 @@ def aggregate( agg = aggregate def quantile(self, quantile, interpolation="linear", **kwargs): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_quantile( - self.axis, self.rolling_args, quantile, interpolation, **kwargs - ) - ) + return self._aggregate("quantile", quantile, interpolation, **kwargs) def rank( self, method="average", ascending=True, pct=False, numeric_only=False, **kwargs ): - return self._dataframe.__constructor__( - query_compiler=self._query_compiler.rolling_rank( - self.axis, - self.rolling_args, - method, - ascending, - pct, - numeric_only, - **kwargs, - ) + return self._aggregate("rank", method, ascending, pct, numeric_only, **kwargs) + + +@_inherit_docstrings(Rolling) +class RollingGroupby(Rolling): + def __init__(self, groupby_obj, *args, **kwargs): + self._as_index = groupby_obj._kwargs.get("as_index", True) + self._groupby_obj = ( + groupby_obj if self._as_index else groupby_obj._override(as_index=True) ) + super().__init__(self._groupby_obj._df, *args, **kwargs) + + def sem(self, *args, **kwargs): + ErrorMessage.missmatch_with_pandas( + operation="RollingGroupby.sem() when 'as_index=False'", + message=( + "The group columns won't be involved in the aggregation.\n" + + "See this gh-issue for more information: https://github.com/modin-project/modin/issues/6291" + ), + ) + return super().sem(*args, **kwargs) + + def corr(self, other=None, pairwise=None, *args, **kwargs): + # pandas behavior is that it always assumes that 'as_index=True' for the '.corr()' method + return super().corr( + *args, as_index=True, other=other, pairwise=pairwise, **kwargs + ) + + def cov(self, other=None, pairwise=None, ddof: Optional[int] = 1, **kwargs): + # pandas behavior is that it always assumes that 'as_index=True' for the '.cov()' method + return super().cov(as_index=True, other=other, pairwise=pairwise, **kwargs) + + def _aggregate(self, method_name, *args, as_index=None, **kwargs): + """ + Run the specified rolling aggregation. + + Parameters + ---------- + method_name : str + Name of the aggregation. + *args : tuple + Positional arguments to pass to the aggregation. + as_index : bool, optional + Whether the result should have the group labels as index levels or as columns. + If not specified the parameter value will be taken from groupby kwargs. + **kwargs : dict + Keyword arguments to pass to the aggregation. + + Returns + ------- + DataFrame or Series + Result of the aggregation. 
+ """ + res = self._groupby_obj._wrap_aggregation( + qc_method=type(self._query_compiler).groupby_rolling, + numeric_only=False, + agg_args=args, + agg_kwargs=kwargs, + agg_func=method_name, + rolling_kwargs=self.rolling_kwargs, + ) + + if as_index is None: + as_index = self._as_index + + if not as_index: + res = res.reset_index( + level=[i for i in range(len(self._groupby_obj._internal_by))], + drop=False, + ) + + return res + + def _call_qc_method(self, method_name, *args, **kwargs): + return self._aggregate(method_name, *args, **kwargs)._query_compiler @_inherit_docstrings( diff --git a/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py b/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py index b52f20b8b7e..2a116c7143b 100644 --- a/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py +++ b/modin/test/interchange/dataframe_protocol/hdk/test_protocol.py @@ -47,6 +47,16 @@ def test_simple_export(data_has_nulls, from_hdk, n_chunks): md_df = pd.DataFrame(data) exported_df = export_frame(md_df, from_hdk, n_chunks=n_chunks) + + # export_frame() splits the frame into multiple chunks. When it's + # split with HDK, each categorical column will have a different + # set of categories. When concatenating the chunks, the categorical + # column will be of type object. + cat_cols = md_df.select_dtypes(include=["category"]).columns + with warns_that_defaulting_to_pandas(): + md_df[cat_cols] = md_df[cat_cols].astype(str) + exported_df[cat_cols] = exported_df[cat_cols].astype(str) + df_equals(md_df, exported_df) diff --git a/modin/test/storage_formats/pandas/test_internals.py b/modin/test/storage_formats/pandas/test_internals.py index 6f484248ffd..210ea0dc1b1 100644 --- a/modin/test/storage_formats/pandas/test_internals.py +++ b/modin/test/storage_formats/pandas/test_internals.py @@ -726,6 +726,26 @@ def test_merge_partitioning( ) +def test_groupby_with_empty_partition(): + # see #5461 for details + md_df = construct_modin_df_by_scheme( + pandas_df=pandas.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]}), + partitioning_scheme={"row_lengths": [2, 2], "column_widths": [2]}, + ) + md_res = md_df.query("a > 1") + grp_obj = md_res.groupby("a") + # check index error due to partitioning missmatching + grp_obj.count() + + md_df = construct_modin_df_by_scheme( + pandas_df=pandas.DataFrame({"a": [1, 1, 2, 2], "b": [3, 4, 5, 6]}), + partitioning_scheme={"row_lengths": [2, 2], "column_widths": [2]}, + ) + md_res = md_df.query("a > 1") + grp_obj = md_res.groupby(md_res["a"]) + grp_obj.count() + + @pytest.mark.parametrize("set_num_partitions", [2], indirect=True) def test_repartitioning(set_num_partitions): """ diff --git a/modin/utils.py b/modin/utils.py index daac8abde6c..db32b39a1ad 100644 --- a/modin/utils.py +++ b/modin/utils.py @@ -79,7 +79,7 @@ def _to_numpy(self) -> Any: # noqa: GL08 pass -MIN_RAY_VERSION = version.parse("1.4.0") +MIN_RAY_VERSION = version.parse("1.13.0") MIN_DASK_VERSION = version.parse("2.22.0") MIN_UNIDIST_VERSION = version.parse("0.2.1") diff --git a/requirements-dev.txt b/requirements-dev.txt index fb3884472ea..606c14ca4f0 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,57 +1,70 @@ +## required dependencies pandas>=2,<2.1 numpy>=1.18.5 +fsspec +packaging +psutil + +## optional dependencies +# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100 +ray[default]>=1.13.0,!=2.5.0 +# https://github.com/modin-project/modin/issues/6336 +pydantic<2 +pyarrow dask[complete]>=2.22.0 distributed>=2.22.0 
-ray[default]>=1.13.0 -pyarrow -psutil -fsspec xarray Jinja2 -tables scipy s3fs>=2021.8 -pytest -pytest-benchmark -coverage -pytest-cov -pytest-xdist feather-format lxml openpyxl xlrd matplotlib sqlalchemy>=1.4.0,<1.4.46 +pandas-gbq +tables msgpack -pandas_gbq -cloudpickle -rpyc==4.1.5 scikit-learn -git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9 +pymssql +psycopg2 +connectorx>=0.2.6a4 +fastparquet<2023.1.0 +flask-cors # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost # when we use collective instead of rabit. xgboost>=1.7.1,<2.0.0 tqdm # Latest modin-spreadsheet with widget fix git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 -pymssql -psycopg2 -connectorx>=0.2.6a4 -black -flake8 -flake8-no-implicit-concat -flake8-print -# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. -numpydoc==1.1.0 + +## modin in the cloud dependencies +boto3 +cloudpickle +rpyc==4.1.5 + +## dependencies for making release +PyGithub>=1.58.0 +pygit2>=1.9.2 + +## test dependencies +coverage>=7.1.0 # experimental version of fuzzydata requires at least 0.0.6 to successfully resolve all dependencies fuzzydata>=0.0.6 -# Mypy 0.990 doesn't work: https://github.com/modin-project/modin/issues/5206 -mypy!=0.990 -pandas-stubs -fastparquet<2023.1.0 -# for release script -PyGithub -pygit2 -moto -flask-cors -boto3 \ No newline at end of file +git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9 +# The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. +numpydoc==1.1.0 +moto>=4.1.0 +pytest>=7.2.1 +pytest-benchmark>=4.0.0 +pytest-cov>=4.0.0 +pytest-xdist>=3.2.0 + +## code linters +black>=23.1.0 +flake8>=6.0.0 +flake8-no-implicit-concat>=0.3.4 +flake8-print>=5.0.0 +mypy>=1.0.0 +pandas-stubs>=2.0.0 diff --git a/requirements/env_hdk.yml b/requirements/env_hdk.yml index 65c83dae087..690a3090c42 100644 --- a/requirements/env_hdk.yml +++ b/requirements/env_hdk.yml @@ -2,37 +2,46 @@ name: modin_on_hdk channels: - conda-forge dependencies: + - pip + + # required dependencies - pandas>=2,<2.1 - - pyarrow - numpy>=1.18.5 + - pyhdk==0.7 - fsspec - - pip - - pytest>=6.0.1 - - pytest-cov>=2.10.1 - - pytest-xdist>=2.1.0 - - coverage - - pygithub - - pyhdk==0.6 - - s3fs>=2021.8 + - packaging - psutil + + # optional dependencies + - s3fs>=2021.8 - openpyxl - xlrd - sqlalchemy>=1.4.0,<1.4.46 - scipy - - xgboost>=1.7.1,<2.0.0 - - scikit-learn-intelex - matplotlib # TODO: uncomment after Modin switch to python>=3.9 # - xarray - pytables - fastparquet - # code linters - - black - - flake8 - - flake8-no-implicit-concat - - flake8-print + + # dependencies for making release + - pygithub + + # test dependencies - boto3 - - moto + - coverage>=7.1.0 + - moto>=4.1.0 + - pytest>=7.2.1 + - pytest-cov>=4.0.0 + - pytest-xdist>=3.2.0 + + # code linters + - black>=23.1.0 + - flake8>=6.0.0 + - flake8-no-implicit-concat>=0.3.4 + - flake8-print>=5.0.0 + - mypy>=1.0.0 + - pip: # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. 
- numpydoc==1.1.0 diff --git a/requirements/env_unidist.yml b/requirements/env_unidist.yml index 54e39ac6b45..e8b4ca88d3c 100644 --- a/requirements/env_unidist.yml +++ b/requirements/env_unidist.yml @@ -2,16 +2,22 @@ name: modin_on_unidist channels: - conda-forge dependencies: - - unidist-mpi>=0.2.1 + - pip + + # required dependencies - pandas>=2,<2.1 - numpy>=1.18.5 - - pyarrow + - unidist-mpi>=0.2.1 - fsspec + - packaging + - psutil + + # optional dependencies + - pyarrow # TODO: uncomment after Modin switch to python>=3.9 # - xarray - Jinja2 - scipy - - pip - s3fs>=2021.8 - feather-format - lxml @@ -22,32 +28,36 @@ dependencies: - pandas-gbq - pytables - msgpack-python - - psutil - - pytest>=6.0.1 - - pytest-benchmark - - pytest-cov>=2.10.1 - - pytest-xdist>=2.1.0 - - packaging - - coverage - - pygithub - - rpyc==4.1.5 - - cloudpickle - - boto3 - scikit-learn - pymssql - psycopg2 - - mypy - - pandas-stubs - fastparquet<2023.1.0 - tqdm - # for release script - - pygit2 + + # modin in the cloud dependencies + - boto3 + - cloudpickle + - rpyc==4.1.5 + + # dependencies for making release + - pygithub>=v1.58.0 + - pygit2>=1.9.2 + + # test dependencies + - coverage>=7.1.0 + - moto>=4.1.0 + - pytest>=7.2.1 + - pytest-cov>=4.0.0 + - pytest-xdist>=3.2.0 + # code linters - - black - - flake8 - - flake8-no-implicit-concat - - flake8-print - - moto + - black>=23.1.0 + - flake8>=6.0.0 + - flake8-no-implicit-concat>=0.3.4 + - flake8-print>=5.0.0 + - mypy>=1.0.0 + - pandas-stubs>=2.0.0 + - pip: # Fixes breaking ipywidgets changes, but didn't release yet. - git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 diff --git a/requirements/requirements-no-engine.yml b/requirements/requirements-no-engine.yml index 4cc13cce2b9..9d2255902c3 100644 --- a/requirements/requirements-no-engine.yml +++ b/requirements/requirements-no-engine.yml @@ -1,15 +1,21 @@ channels: - conda-forge dependencies: + - pip + + # required dependencies - pandas>=2,<2.1 - numpy>=1.18.5 - - pyarrow - fsspec + - packaging + - psutil + + # optional dependencies + - pyarrow # TODO: uncomment after Modin switch to python>=3.9 # - xarray - Jinja2 - scipy - - pip - s3fs>=2021.8 - feather-format - lxml @@ -20,31 +26,38 @@ dependencies: - pandas-gbq - pytables - msgpack-python - - psutil - - pytest>=6.0.1 - - pytest-benchmark - - pytest-cov>=2.10.1 - - pytest-xdist>=2.1.0 - - coverage - - pygithub - - rpyc==4.1.5 - - cloudpickle - - boto3 - - moto # TODO(https://github.com/modin-project/modin/issues/5194): Uncap xgboost # when we use collective instead of rabit. - xgboost>=1.7.1,<2.0.0 - tqdm + + # modin in the cloud dependencies + - boto3 + - cloudpickle + - rpyc==4.1.5 + + # dependencies for making release + - pygithub>=v1.58.0 + - pygit2>=1.9.2 + + # test dependencies + - coverage>=7.1.0 + - moto>=4.1.0 + - pytest>=7.2.1 + - pytest-cov>=4.0.0 + - pytest-xdist>=3.2.0 + # code linters - - black - - flake8 - - flake8-no-implicit-concat - - flake8-print + - black>=23.1.0 + - flake8>=6.0.0 + - flake8-no-implicit-concat>=0.3.4 + - flake8-print>=5.0.0 + - pip: + # no conda package for windows + - connectorx>=0.2.6a4 # Fixes breaking ipywidgets changes, but didn't release yet. 
- git+https://github.com/modin-project/modin-spreadsheet.git@49ffd89f683f54c311867d602c55443fb11bf2a5 - git+https://github.com/airspeed-velocity/asv.git@ef016e233cb9a0b19d517135104f49e0a3c380e9 - # no conda package for windows - - connectorx>=0.2.6a4 # The `numpydoc` version should match the version installed in the `lint-pydocstyle` job of the CI. - numpydoc==1.1.0 diff --git a/scripts/doc_checker.py b/scripts/doc_checker.py index 3f68509bd8c..2eb7083f8c8 100644 --- a/scripts/doc_checker.py +++ b/scripts/doc_checker.py @@ -527,8 +527,9 @@ def load_obj(name, old_load_obj=Docstring._load_obj): Docstring._load_obj = staticmethod(load_obj) # for testing hdk-engine docs without `pyhdk` installation - # TODO: check if we could remove these lines sys.modules["pyhdk"] = Mock() + sys.modules["pyhdk.hdk"] = Mock() + sys.modules["pyhdk._sql"] = Mock() # enable docs testing on windows sys.getdlopenflags = Mock() sys.setdlopenflags = Mock() diff --git a/scripts/release.py b/scripts/release.py index cc383480a30..455bb56aef3 100644 --- a/scripts/release.py +++ b/scripts/release.py @@ -113,14 +113,26 @@ def __init__(self): def is_on_master(self): return self.repo.references["refs/heads/master"] == self.repo.head + @staticmethod + def __get_tag_version(entry): + try: + return version.parse(entry.lstrip("refs/tags/")) + except version.InvalidVersion as ex: + return f'<bad version "{entry}": {ex}>' + def get_previous_release(self, rel_type): tags = [ - (entry, version.parse(entry.lstrip("refs/tags/"))) + (entry, self.__get_tag_version(entry)) for entry in self.repo.references if entry.startswith("refs/tags/") ] - # filter away legacy versions (which aren't following the proper naming schema) - tags = [(entry, ver) for entry, ver in tags if isinstance(ver, version.Version)] + # filter away legacy versions (which aren't following the proper naming schema); + # also skip pre-releases + tags = [ + (entry, ver) + for entry, ver in tags + if isinstance(ver, version.Version) and not ver.pre + ] if rel_type == "minor": # leave only minor releases tags = [(entry, ver) for entry, ver in tags if ver.micro == 0] diff --git a/setup.cfg b/setup.cfg index 0d6e97bc180..8f1cf929dd8 100644 --- a/setup.cfg +++ b/setup.cfg @@ -17,6 +17,7 @@ xfail_strict=true markers = xfail_executions skip_executions + exclude_in_sanity filterwarnings = error:.*defaulting to pandas.*:UserWarning diff --git a/setup.py b/setup.py index 259e2f04a6f..43d67ee1fb7 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,9 @@ long_description = fh.read() dask_deps = ["dask>=2.22.0", "distributed>=2.22.0"] -ray_deps = ["ray[default]>=1.13.0", "pyarrow"] +# ray==2.5.0 broken: https://github.com/conda-forge/ray-packages-feedstock/issues/100 +# pydantic<2: https://github.com/modin-project/modin/issues/6336 +ray_deps = ["ray[default]>=1.13.0,!=2.5.0", "pyarrow", "pydantic<2"] unidist_deps = ["unidist[mpi]>=0.2.1"] remote_deps = ["rpyc==4.1.5", "cloudpickle", "boto3"] spreadsheet_deps = ["modin-spreadsheet>=0.1.0"]
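A minimal, standalone sketch of the tag filtering that the scripts/release.py hunk above performs (not the script itself): it assumes packaging>=22, where version.parse raises InvalidVersion on malformed tags, strips the "refs/tags/" prefix by slicing instead of lstrip, and uses a made-up list of tag names purely for illustration.

from packaging import version

# Hypothetical tag list; a real run would walk self.repo.references as the script does.
refs = [
    "refs/tags/0.22.0",        # regular minor release
    "refs/tags/0.23.0rc1",     # pre-release, should be skipped
    "refs/tags/0.23.1",        # patch release, skipped when looking for a minor release
    "refs/tags/list-of-apis",  # legacy tag that is not a version at all
]

def tag_version(entry):
    try:
        return version.parse(entry[len("refs/tags/"):])
    except version.InvalidVersion:
        return None  # marker for "not a real version", filtered out below

tags = [(entry, tag_version(entry)) for entry in refs]
# keep only proper, non-pre-release versions
tags = [(e, v) for e, v in tags if isinstance(v, version.Version) and not v.pre]
# for a minor release, only tags with micro == 0 are considered
minor_tags = [(e, v) for e, v in tags if v.micro == 0]
print(max(minor_tags, key=lambda t: t[1]))  # ('refs/tags/0.22.0', <Version('0.22.0')>)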
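The effect of the new ray pin in setup.py (">=1.13.0,!=2.5.0") can be sanity-checked with packaging.specifiers; the candidate versions below are arbitrary examples, not versions mentioned in the change.

from packaging.specifiers import SpecifierSet

# Same constraint as ray_deps in setup.py; broken 2.5.0 is excluded, newer releases pass.
ray_spec = SpecifierSet(">=1.13.0,!=2.5.0")
for candidate in ("1.12.0", "2.5.0", "2.6.3"):
    print(candidate, candidate in ray_spec)
# 1.12.0 False, 2.5.0 False, 2.6.3 True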