Skip to content

Commit

Permalink
Add Isthmus producer (#6)
Browse files Browse the repository at this point in the history
* feat: adding isthmus producer
  • Loading branch information
richtia authored Dec 19, 2022
1 parent 1e3808f commit b0c9a87
Show file tree
Hide file tree
Showing 28 changed files with 541 additions and 176 deletions.
16 changes: 15 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,28 @@ which is located in `tests/conftest.py`.
# Setup
Create and activate your conda environment with python3.9:
```commandline
conda create -y -n substrait_consumer_testing -c conda-forge python=3.9
conda create -y -n substrait_consumer_testing -c conda-forge python=3.9 openjdk
conda activate substrait_consumer_testing
```
*Note: Java is used by Jpype to access the Isthmus producer.
JPype should work with all versions of Java but to see details on which versions are
officially supported see https://jpype.readthedocs.io/en/latest/install.html*

Install requirements from the top level directory:
```commandline
pip install -r requirements.txt
```

Get the java dependencies needed by the Isthmus Substrait producer:
1. Clone the substrait-java repo
2. From the consumer-testing repo, run the build-and-copy-isthmus-shadow-jar.sh script
```commandline
git clone https://github.com/substrait-io/substrait-java.git
cd consumer-testing
sh build-and-copy-isthmus-shadow-jar.sh
```
*This shell script may not work on Windows environments.*

# How to Run Tests
TPCH tests are located in the `tests/integration` folder and substrait function tests
are located in the `tests/functional` folder.
Expand Down
8 changes: 8 additions & 0 deletions build-and-copy-isthmus-shadow-jar.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

echo "Enter the absolute path of the substrait-java repo"
read substrait_java_path
cd ${substrait_java_path}/isthmus; ../gradlew shadowJar
cd -
mkdir -p jars
cp ${substrait_java_path}/isthmus/build/libs/*all.jar ./jars/
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ duckdb
filelock
ibis-framework
ibis-substrait
JPype1
protobuf
--extra-index-url https://pypi.fury.io/arrow-nightlies --prefer-binary --pre
pyarrow
Expand Down
4 changes: 2 additions & 2 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from filelock import FileLock
from tests.consumers import AceroConsumer, DuckDBConsumer
from tests.producers import DuckDBProducer, IbisProducer
from tests.producers import DuckDBProducer, IbisProducer, IsthmusProducer


@pytest.fixture(scope="session")
Expand Down Expand Up @@ -43,7 +43,7 @@ def pytest_addoption(parser):
)


PRODUCERS = [DuckDBProducer, IbisProducer]
PRODUCERS = [DuckDBProducer, IbisProducer, IsthmusProducer]
CONSUMERS = [AceroConsumer, DuckDBConsumer]


Expand Down
2 changes: 1 addition & 1 deletion tests/consumers.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ class AceroConsumer:
def __init__(self):
self.created_tables = set()
self.tables = {}
self.table_provider = lambda names: self.tables[names[0]]
self.table_provider = lambda names: self.tables[names[0].lower()]

def setup(self, db_connection, file_names: Iterable[str]):
if len(file_names) > 0:
Expand Down
57 changes: 57 additions & 0 deletions tests/context.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from pathlib import Path

import jpype.imports

import tests.java_definitions as java

REPO_DIR = Path(__file__).parent.parent
from com.google.protobuf.util import JsonFormat as json_formatter

schema_file = Path.joinpath(REPO_DIR, "tests/data/tpch_parquet/schema.sql")


def produce_isthmus_substrait(sql_string, schema_list):
"""
Produce the substrait plan using Isthmus.
Parameters:
sql_string:
SQL query.
schema_list:
List of schemas.
Returns:
Substrait plan in json format.
"""
sql_to_substrait = java.SqlToSubstraitClass()
java_sql_string = jpype.java.lang.String(sql_string)
plan = sql_to_substrait.execute(java_sql_string, schema_list)
json_plan = json_formatter.printer().print_(plan)
return json_plan


def get_schema(file_names):
"""
Create the list of schemas based on the given file names. If there are no files
give, a custom schema for the data is used.
Parameters:
file_names: List of file names.
Returns:
List of all schemas as a java list.
"""
arr = java.ArrayListClass()
if file_names:
text_schema_file = open(schema_file)
schema_string = text_schema_file.read().replace("\n", " ").split(";")[:-1]
for create_table in schema_string:
java_obj = jpype.JObject @ jpype.JString(create_table)
arr.add(java_obj)
else:
java_obj = jpype.JObject @ jpype.JString(
"CREATE TABLE T(a integer, b integer, c boolean, d boolean)"
)
arr.add(java_obj)

return java.ListClass @ arr
15 changes: 10 additions & 5 deletions tests/functional/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def substrait_function_test(
db_con: DuckDBPyConnection,
created_tables: set,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down Expand Up @@ -69,9 +69,10 @@ def substrait_function_test(
"""
producer.set_db_connection(db_con)
consumer.setup(db_con, file_names)
supported_producers = sql_query[1]

# Load the parquet files into DuckDB and return all the table names as a list
sql_query = producer.format_sql(created_tables, sql_query, file_names)
sql_query = producer.format_sql(created_tables, sql_query[0], file_names)

# Convert the SQL/Ibis expression to a substrait query plan
if type(producer).__name__ == "IbisProducer":
Expand All @@ -80,9 +81,13 @@ def substrait_function_test(
sql_query, consumer, ibis_expr(*args)
)
else:
pytest.skip("ibis expression currently undefined")
pytest.xfail("ibis expression currently undefined")
else:
substrait_plan = producer.produce_substrait(sql_query, consumer)
if type(producer) in supported_producers:
substrait_plan = producer.produce_substrait(sql_query, consumer)
else:
pytest.xfail(f"{type(producer).__name__} does not support the following SQL: "
f"{sql_query}")

actual_result = consumer.run_substrait_query(substrait_plan)
expected_result = db_con.query(f"{sql_query}").arrow()
Expand All @@ -97,7 +102,7 @@ def substrait_function_test(


def load_custom_duckdb_table(db_connection):
db_connection.execute("create table t (a int, b int, c boolean, d boolean)")
db_connection.execute("create table t (a BIGINT, b BIGINT, c boolean, d boolean)")
db_connection.execute(
"INSERT INTO t VALUES "
"(1, 1, TRUE, TRUE), (2, 1, FALSE, TRUE), (3, 1, TRUE, TRUE), "
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_approximation_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def test_arithmetic_decimal_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def test_arithmetic_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class TestBooleanFunctions:
"""

@staticmethod
@pytest.fixture(scope="function", autouse=True)
def setup_teardown_function(request):
@pytest.fixture(scope="class", autouse=True)
def setup_teardown_class(request):
cls = request.cls

cls.db_connection = duckdb.connect()
Expand All @@ -41,10 +41,11 @@ def test_boolean_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
partsupp
) -> None:
substrait_function_test(
self.db_connection,
Expand All @@ -54,5 +55,6 @@ def test_boolean_functions(
ibis_expr,
producer,
consumer,
partsupp,
self.table_t,
)
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class TestComparisonFunctions:
"""

@staticmethod
@pytest.fixture(scope="function", autouse=True)
def setup_teardown_function(request):
@pytest.fixture(scope="class", autouse=True)
def setup_teardown_class(request):
cls = request.cls

cls.db_connection = duckdb.connect()
Expand All @@ -37,7 +37,7 @@ def test_comparison_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ class TestDatetimeFunctions:
"""

@staticmethod
@pytest.fixture(scope="function", autouse=True)
def setup_teardown_function(request):
@pytest.fixture(scope="class", autouse=True)
def setup_teardown_class(request):
cls = request.cls

cls.db_connection = duckdb.connect()
Expand All @@ -35,7 +35,7 @@ def test_datetime_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_logarithmic_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_rounding_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_string_functions(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
consumer,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,7 @@ def test_rounding_function_names(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
partsupp,
Expand All @@ -157,7 +157,7 @@ def run_function_name_test(
self,
test_name: str,
file_names: Iterable[str],
sql_query: str,
sql_query: tuple,
ibis_expr: Callable[[Table], Table],
producer,
*args
Expand Down Expand Up @@ -185,7 +185,7 @@ def run_function_name_test(
producer.set_db_connection(self.db_connection)

# Load the parquet files into DuckDB and return all the table names as a list
sql_query = producer.format_sql(self.created_tables, sql_query, file_names)
sql_query = producer.format_sql(self.created_tables, sql_query[0], file_names)

# Grab the json representation of the produced substrait plan to verify
# the proper substrait function name.
Expand Down
6 changes: 5 additions & 1 deletion tests/functional/queries/sql/approximation_functions_sql.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
from tests.producers import *

SQL_AGGREGATE = {
"approx_count_distinct":
"approx_count_distinct": (
"""
SELECT approx_count_distinct(l_comment)
FROM '{}';
""",
[DuckDBProducer],
),
}
Loading

0 comments on commit b0c9a87

Please sign in to comment.