Skip to content

Commit

Permalink
add tests for subsets writing
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilippeMoussalli committed May 4, 2023
1 parent 924a59d commit d4f852f
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 6 deletions.
2 changes: 1 addition & 1 deletion tests/example_data/input_manifest.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"metadata": {
"base_path": "/home/philippe/Scripts/express/tests/example_data/subsets_input",
"base_path": "tests/example_data/subsets_input",
"run_id": "12345",
"component_id": "67890"
},
Expand Down
2 changes: 1 addition & 1 deletion tests/example_data/output_manifest.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"metadata": {
"base_path": "/home/philippe/Scripts/express/tests/example_data/subsets_output",
"base_path": "tests/example_data/subsets_output",
"run_id": "12345",
"component_id": "67890"
},
Expand Down
4 changes: 2 additions & 2 deletions tests/example_data/raw/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
import dask.dataframe as dd

data_path = Path(__file__).parent
output_path = Path(__file__).parent.parent / "subsets/"
output_path = Path(__file__).parent.parent / "subsets_input/"


def split_into_subsets():
# read in complete dataset
master_df = dd.read_parquet(path=data_path / "testset.parquet")
master_df = dd.read_parquet(path=data_path / "testset.parquet")
master_df = master_df.astype({"source": "string"})
# create index subset
index_df = master_df[["id", "source"]]
index_df.set_index("id")
Expand Down
Binary file modified tests/example_data/subsets_input/index/part.0.parquet
Binary file not shown.
Binary file modified tests/example_data/subsets_input/properties/part.0.parquet
Binary file not shown.
Binary file modified tests/example_data/subsets_input/types/part.0.parquet
Binary file not shown.
36 changes: 34 additions & 2 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import json
import tempfile
import pytest
import dask.dataframe as dd
from pathlib import Path
Expand All @@ -8,6 +8,7 @@
from fondant.component_spec import FondantComponentSpec

input_manifest_path = Path(__file__).parent / "example_data/input_manifest.json"
output_manifest_path = Path(__file__).parent / "example_data/output_manifest.json"
component_spec_path = Path(__file__).parent / "example_data/components/1.yaml"


Expand All @@ -16,6 +17,11 @@ def input_manifest():
return Manifest.from_file(input_manifest_path)


@pytest.fixture
def output_manifest():
    """Manifest describing the expected output dataset (example_data/output_manifest.json).

    Bug fix: this fixture previously loaded ``input_manifest_path``, which made
    it an exact duplicate of the ``input_manifest`` fixture and left the output
    manifest file completely unexercised by the tests.
    """
    return Manifest.from_file(output_manifest_path)


@pytest.fixture
def component_spec():
    """Parse the shared example component spec from disk."""
    spec_file = component_spec_path
    return FondantComponentSpec.from_file(spec_file)
Expand All @@ -37,4 +43,30 @@ def test_merge_subsets(input_manifest, component_spec):
"properties_HP",
"types_Type 1",
"types_Type 2",
]
]


def test_write_subsets(input_manifest, output_manifest, component_spec):
    """Round-trip test: load the subsets via the input manifest, write them out
    via the output manifest, then verify each written subset's row count and
    column names by reading the parquet data back with dask.
    """
    # Expected subsets to write, mapped to the columns each must contain.
    subset_columns_dict = {
        "index": ["id", "source"],
        "properties": ["Name", "HP", "id", "source"],
        "types": ["Type 1", "Type 2", "id", "source"],
    }

    # Load dataframe from input manifest
    input_fds = FondantDataset(manifest=input_manifest)
    df = input_fds.load_dataframe(spec=component_spec)

    # Write dataframe based on the output manifest and component spec
    output_fds = FondantDataset(manifest=output_manifest)
    output_base_path = Path(output_fds.manifest.base_path)

    # NOTE(review): the temporary directory is created under the base path but
    # its name is never used -- the subsets are written to and read from
    # `output_base_path` itself, so nothing is actually cleaned up when the
    # context exits. Confirm whether the writes should target the temp dir.
    with tempfile.TemporaryDirectory(dir=output_base_path):
        output_fds.write_index(df)
        output_fds.write_subsets(df, spec=component_spec)
        for subset, expected_columns in subset_columns_dict.items():
            # Bug fix: the original reassigned `df` here, clobbering the
            # source dataframe loaded above; use a distinct name instead.
            subset_df = dd.read_parquet(output_base_path / subset)
            assert len(subset_df) == 151
            assert list(subset_df.columns) == expected_columns

0 comments on commit d4f852f

Please sign in to comment.