Skip to content

Commit

Permalink
FIX-modin-project#6549: remove usage of dfsql module (modin-project…
Browse files Browse the repository at this point in the history
…#6550)

Signed-off-by: Anatoly Myachev <anatoly.myachev@intel.com>
Co-authored-by: Vasily Litvinov <fam1ly.n4me@yandex.ru>
  • Loading branch information
anmyachev and vnlitvinov authored Sep 12, 2023
1 parent 7ec9fdb commit 9886c01
Show file tree
Hide file tree
Showing 6 changed files with 16 additions and 79 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -378,7 +378,7 @@ jobs:
# these variables to test writing to the mock s3 filesystem.
- run: mpiexec -n 1 -genv AWS_ACCESS_KEY_ID foobar_key -genv AWS_SECRET_ACCESS_KEY foobar_secret python -m pytest modin/pandas/test/test_io.py --verbose
- run: mpiexec -n 1 python -m pytest modin/experimental/pandas/test/test_io_exp.py
- run: pip install "dfsql>=0.4.2" "pyparsing<=2.4.7" && mpiexec -n 1 python -m pytest modin/experimental/sql/test/test_sql.py
- run: mpiexec -n 1 python -m pytest modin/experimental/sql/test/test_sql.py
- run: mpiexec -n 1 python -m pytest modin/test/interchange/dataframe_protocol/test_general.py
- run: mpiexec -n 1 python -m pytest modin/test/interchange/dataframe_protocol/pandas/test_protocol.py
- run: |
Expand Down Expand Up @@ -499,7 +499,7 @@ jobs:
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- run: python -m pytest modin/experimental/pandas/test/test_io_exp.py
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
- run: pip install "dfsql>=0.4.2" "pyparsing<=2.4.7" && python -m pytest modin/experimental/sql/test/test_sql.py
- run: python -m pytest modin/experimental/sql/test/test_sql.py
if: matrix.os == 'ubuntu' && (matrix.engine == 'python' || matrix.test_task == 'group_4')
- run: python -m pytest modin/test/interchange/dataframe_protocol/test_general.py
if: matrix.engine == 'python' || matrix.test_task == 'group_4'
Expand Down
11 changes: 3 additions & 8 deletions docs/usage_guide/advanced_usage/modin_sql.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,9 @@ due to the architecture of Modin. Currently, Modin has a query compiler that act
intermediate layer between the query language (e.g. SQL, pandas) and the execution
(See :doc:`architecture </development/architecture>` documentation for details).

To execute SQL queries, Modin uses either dfsql third-party library or, in case of HDK
engine (See :doc:`Using HDK </development/using_hdk>` documentation for details)
the queries are executed directly by HDK. Thus, to execute SQL queries, either dfsql
or pyhdk module must be installed.
To execute SQL queries, Modin uses the HDK engine
(see :doc:`Using HDK </development/using_hdk>` documentation for details).
Thus, to execute SQL queries, the pyhdk module must be installed.


A Short Example Using the Google Play Store
Expand Down Expand Up @@ -42,10 +41,6 @@ App, Category, and Rating, where Price is ‘0’.
# you can also ignore the FROM part in the query string:
sql_str = "SELECT App, Category, Rating WHERE Price = '0' "
# DataFrame.sql() can take query strings without FROM statement
# NOTE: this method requires the dfsql module to be installed!
result_df = gstore_apps_df.sql(sql_str)
Writing Complex Queries
"""""""""""""""""""""""

Expand Down
18 changes: 2 additions & 16 deletions modin/experimental/sql/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@

def query(sql: str, *args, **kwargs) -> pd.DataFrame:
"""
Execute SQL query using either HDK engine or dfsql.
Execute SQL query using HDK engine.
Parameters
----------
Expand All @@ -42,20 +42,6 @@ def query(sql: str, *args, **kwargs) -> pd.DataFrame:
if cfg.StorageFormat.get() == "Hdk":
from modin.experimental.sql.hdk.query import hdk_query as _query_impl
else:
from dfsql import sql_query as _query_impl
raise NotImplementedError

return _query_impl(sql, *args, **kwargs)


# dfsql adds the sql() method to the DataFrame class.
# This code is used for lazy dfsql extensions initialization.
if not hasattr(pd.DataFrame, "sql"):

def dfsql_init(df, query):
delattr(pd.DataFrame, "sql")
import modin.experimental.sql.dfsql.query # noqa: F401

df.sql = pd.DataFrame.sql(df)
return df.sql(query)

pd.DataFrame.sql = dfsql_init
28 changes: 0 additions & 28 deletions modin/experimental/sql/dfsql/query.py

This file was deleted.

32 changes: 9 additions & 23 deletions modin/experimental/sql/test/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,8 +13,8 @@

import pandas
import modin.pandas as pd
import modin.config as cfg
from modin.pandas.test.utils import default_to_pandas_ignore_string, df_equals
from modin.config import StorageFormat

import io
import pytest
Expand All @@ -34,6 +34,10 @@
"""


@pytest.mark.skipif(
StorageFormat.get() != "Hdk",
reason="Lack of implementation for other storage formats.",
)
def test_sql_query():
from modin.experimental.sql import query

Expand All @@ -52,28 +56,10 @@ def test_sql_query():
assert (values_left == values_right).all()


def test_sql_extension():
# This test is for DataFrame.sql() method, that is injected by
# dfsql.extensions. In the HDK environment, there is no dfsql
# module and, thus, this test fails.
if cfg.StorageFormat.get() == "Hdk":
return

import modin.experimental.sql # noqa: F401

df = pd.read_csv(io.StringIO(titanic_snippet))

expected_df = df[df["survived"] == 1][["passenger_id", "survived"]]

sql = "SELECT passenger_id, survived WHERE survived = 1"
query_result = df.sql(sql)
assert list(query_result.columns) == ["passenger_id", "survived"]
values_left = expected_df.values
values_right = query_result.values
assert values_left.shape == values_right.shape
assert (values_left == values_right).all()


@pytest.mark.skipif(
StorageFormat.get() != "Hdk",
reason="Lack of implementation for other storage formats.",
)
def test_string_cast():
from modin.experimental.sql import query

Expand Down
2 changes: 0 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
ray_deps = ["ray[default]>=1.13.0,!=2.5.0", "pyarrow>=7.0.0", "pydantic<2"]
unidist_deps = ["unidist[mpi]>=0.2.1"]
spreadsheet_deps = ["modin-spreadsheet>=0.1.0"]
sql_deps = ["dfsql>=0.4.2", "pyparsing<=2.4.7"]
all_deps = dask_deps + ray_deps + unidist_deps + spreadsheet_deps

# Distribute 'modin-autoimport-pandas.pth' along with binary and source distributions.
Expand Down Expand Up @@ -60,7 +59,6 @@ def make_distribution(self):
"ray": ray_deps,
"unidist": unidist_deps,
"spreadsheet": spreadsheet_deps,
"sql": sql_deps,
"all": all_deps,
},
python_requires=">=3.9",
Expand Down

0 comments on commit 9886c01

Please sign in to comment.