Skip to content

Commit

Permalink
add tests for subsets writing
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilippeMoussalli committed May 4, 2023
1 parent 924a59d commit d4f852f
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 6 deletions.
2 changes: 1 addition & 1 deletion tests/example_data/input_manifest.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"metadata": {
"base_path": "/home/philippe/Scripts/express/tests/example_data/subsets_input",
"base_path": "tests/example_data/subsets_input",
"run_id": "12345",
"component_id": "67890"
},
Expand Down
2 changes: 1 addition & 1 deletion tests/example_data/output_manifest.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"metadata": {
"base_path": "/home/philippe/Scripts/express/tests/example_data/subsets_output",
"base_path": "tests/example_data/subsets_output",
"run_id": "12345",
"component_id": "67890"
},
Expand Down
4 changes: 2 additions & 2 deletions tests/example_data/raw/split.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@
import dask.dataframe as dd

data_path = Path(__file__).parent
output_path = Path(__file__).parent.parent / "subsets/"
output_path = Path(__file__).parent.parent / "subsets_input/"


def split_into_subsets():
# read in complete dataset
master_df = dd.read_parquet(path=data_path / "testset.parquet")
master_df = dd.read_parquet(path=data_path / "testset.parquet")
master_df = master_df.astype({"source": "string"})
# create index subset
index_df = master_df[["id", "source"]]
index_df.set_index("id")
Expand Down
Binary file modified tests/example_data/subsets_input/index/part.0.parquet
Binary file not shown.
Binary file modified tests/example_data/subsets_input/properties/part.0.parquet
Binary file not shown.
Binary file modified tests/example_data/subsets_input/types/part.0.parquet
Binary file not shown.
36 changes: 34 additions & 2 deletions tests/test_dataset.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import json
import tempfile
import pytest
import dask.dataframe as dd
from pathlib import Path
Expand All @@ -8,6 +8,7 @@
from fondant.component_spec import FondantComponentSpec

input_manifest_path = Path(__file__).parent / "example_data/input_manifest.json"
output_manifest_path = Path(__file__).parent / "example_data/output_manifest.json"
component_spec_path = Path(__file__).parent / "example_data/components/1.yaml"


Expand All @@ -16,6 +17,11 @@ def input_manifest():
return Manifest.from_file(input_manifest_path)


@pytest.fixture
def output_manifest():
    """Manifest describing the expected output dataset (example_data/output_manifest.json).

    Bug fix: this fixture previously loaded ``input_manifest_path``, which made
    it an exact duplicate of the ``input_manifest`` fixture and left the output
    manifest file completely unexercised by the tests.
    """
    return Manifest.from_file(output_manifest_path)


@pytest.fixture
def component_spec():
    """Parse the shared example component spec from disk."""
    spec_file = component_spec_path
    return FondantComponentSpec.from_file(spec_file)
Expand All @@ -37,4 +43,30 @@ def test_merge_subsets(input_manifest, component_spec):
"properties_HP",
"types_Type 1",
"types_Type 2",
]
]


def test_write_subsets(input_manifest, output_manifest, component_spec):
    """Round-trip test: load the subsets via the input manifest, write them out
    via the output manifest, then verify each written subset's row count and
    column names by reading the parquet data back with dask.
    """
    # Expected subsets to write, mapped to the columns each must contain.
    subset_columns_dict = {
        "index": ["id", "source"],
        "properties": ["Name", "HP", "id", "source"],
        "types": ["Type 1", "Type 2", "id", "source"],
    }

    # Load dataframe from input manifest
    input_fds = FondantDataset(manifest=input_manifest)
    df = input_fds.load_dataframe(spec=component_spec)

    # Write dataframe based on the output manifest and component spec
    output_fds = FondantDataset(manifest=output_manifest)
    output_base_path = Path(output_fds.manifest.base_path)

    # NOTE(review): the temporary directory is created under the base path but
    # its name is never used -- the subsets are written to and read from
    # `output_base_path` itself, so nothing is actually cleaned up when the
    # context exits. Confirm whether the writes should target the temp dir.
    with tempfile.TemporaryDirectory(dir=output_base_path):
        output_fds.write_index(df)
        output_fds.write_subsets(df, spec=component_spec)
        for subset, expected_columns in subset_columns_dict.items():
            # Bug fix: the original reassigned `df` here, clobbering the
            # source dataframe loaded above; use a distinct name instead.
            subset_df = dd.read_parquet(output_base_path / subset)
            assert len(subset_df) == 151
            assert list(subset_df.columns) == expected_columns

0 comments on commit d4f852f

Please sign in to comment.