diff --git a/nvtabular/ops.py b/nvtabular/ops.py index fd0e5e3b86a..a39a6ce0006 100644 --- a/nvtabular/ops.py +++ b/nvtabular/ops.py @@ -208,7 +208,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): @annotate("MinMax_finalize", color="green", domain="nvt_python") def finalize(self, stats): - for col in stats["mins"].index: + for col in stats["mins"].index.values_host: self.mins[col] = stats["mins"][col] self.maxs[col] = stats["maxs"][col] @@ -264,7 +264,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): @annotate("Moments_finalize", color="green", domain="nvt_python") def finalize(self, dask_stats): - for col in dask_stats["count"].index: + for col in dask_stats["count"].index.values_host: self.counts[col] = float(dask_stats["count"][col]) self.means[col] = float(dask_stats["mean"][col]) self.stds[col] = float(dask_stats["std"][col]) @@ -317,7 +317,7 @@ def stat_logic(self, ddf, columns_ctx, input_cols, target_cols): @annotate("Median_finalize", color="green", domain="nvt_python") def finalize(self, dask_stats): - for col in dask_stats.index: + for col in dask_stats.index.values_host: self.medians[col] = float(dask_stats[col]) def registered_stats(self): diff --git a/tests/conftest.py b/tests/conftest.py index ca91507c2a5..7214322a874 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -151,6 +151,6 @@ def get_cats(processor, col, stat_name="categories"): filename = processor.stats[stat_name][col] gdf = cudf.read_parquet(filename) gdf.reset_index(drop=True, inplace=True) - return gdf[col].values_to_string() + return gdf[col].values_host else: - return processor.stats["encoders"][col].get_cats().values_to_string() + return processor.stats["encoders"][col].get_cats().values_host diff --git a/tests/unit/test_ops.py b/tests/unit/test_ops.py index b5cc70805c3..84567fd6402 100644 --- a/tests/unit/test_ops.py +++ b/tests/unit/test_ops.py @@ -109,13 +109,13 @@ def test_encoder(tmpdir, df, dataset, gpu_memory_frac, engine, op_columns): processor.update_stats(dataset) if engine == "parquet" and not op_columns: - cats_expected0 = df["name-cat"].unique().values_to_string() + cats_expected0 = df["name-cat"].unique().values_host cats0 = get_cats(processor, "name-cat") - assert cats0 == ["None"] + cats_expected0 + assert cats0.tolist() == [None] + cats_expected0.tolist() - cats_expected1 = df["name-string"].unique().values_to_string() + cats_expected1 = df["name-string"].unique().values_host cats1 = get_cats(processor, "name-string") - assert cats1 == ["None"] + cats_expected1 + assert cats1.tolist() == [None] + cats_expected1.tolist() @pytest.mark.parametrize("gpu_memory_frac", [0.01, 0.1]) diff --git a/tests/unit/test_torch_dataloader.py b/tests/unit/test_torch_dataloader.py index babe82dc47d..054b8beb672 100644 --- a/tests/unit/test_torch_dataloader.py +++ b/tests/unit/test_torch_dataloader.py @@ -126,12 +126,12 @@ def get_norms(tar: cudf.Series): # Check that categories match if engine == "parquet": - cats_expected0 = df["name-cat"].unique().values_to_string() + cats_expected0 = df["name-cat"].unique().values_host cats0 = get_cats(processor, "name-cat") - assert cats0 == ["None"] + cats_expected0 - cats_expected1 = df["name-string"].unique().values_to_string() + assert cats0.tolist() == [None] + cats_expected0.tolist() + cats_expected1 = df["name-string"].unique().values_host cats1 = get_cats(processor, "name-string") - assert cats1 == ["None"] + cats_expected1 + assert cats1.tolist() == [None] + cats_expected1.tolist() # Write to new "shuffled" and "processed" dataset processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True) diff --git a/tests/unit/test_workflow.py b/tests/unit/test_workflow.py index a0ec79ea347..64bafed2947 100644 --- a/tests/unit/test_workflow.py +++ b/tests/unit/test_workflow.py @@ -79,14 +79,14 @@ def get_norms(tar: cudf.Series): # Check that categories match if engine == "parquet": - cats_expected0 = df["name-cat"].unique().values_to_string() + cats_expected0 = df["name-cat"].unique().values_host cats0 = get_cats(processor, "name-cat") # adding the None entry as a string because of move from gpu - assert cats0 == ["None"] + cats_expected0 - cats_expected1 = df["name-string"].unique().values_to_string() + assert cats0.tolist() == [None] + cats_expected0.tolist() + cats_expected1 = df["name-string"].unique().values_host cats1 = get_cats(processor, "name-string") # adding the None entry as a string because of move from gpu - assert cats1 == ["None"] + cats_expected1 + assert cats1.tolist() == [None] + cats_expected1.tolist() # Write to new "shuffled" and "processed" dataset processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True) @@ -155,14 +155,14 @@ def get_norms(tar: cudf.Series): # Check that categories match if engine == "parquet": - cats_expected0 = df["name-cat"].unique().values_to_string() + cats_expected0 = df["name-cat"].unique().values_host cats0 = get_cats(processor, "name-cat") # adding the None entry as a string because of move from gpu - assert cats0 == ["None"] + cats_expected0 - cats_expected1 = df["name-string"].unique().values_to_string() + assert cats0.tolist() == [None] + cats_expected0.tolist() + cats_expected1 = df["name-string"].unique().values_host cats1 = get_cats(processor, "name-string") # adding the None entry as a string because of move from gpu - assert cats1 == ["None"] + cats_expected1 + assert cats1.tolist() == [None] + cats_expected1.tolist() # Write to new "shuffled" and "processed" dataset processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True) @@ -236,17 +236,16 @@ def get_norms(tar: cudf.Series): assert math.isclose( get_norms(df.y).std(), processor.stats["stds"]["y" + concat_ops], rel_tol=1e-1 ) - # Check that categories match if engine == "parquet": - cats_expected0 = df["name-cat"].unique().values_to_string() + cats_expected0 = df["name-cat"].unique().values_host cats0 = get_cats(processor, "name-cat") # adding the None entry as a string because of move from gpu - assert cats0 == ["None"] + cats_expected0 - cats_expected1 = df["name-string"].unique().values_to_string() + assert cats0.tolist() == [None] + cats_expected0.tolist() + cats_expected1 = df["name-string"].unique().values_host cats1 = get_cats(processor, "name-string") # adding the None entry as a string because of move from gpu - assert cats1 == ["None"] + cats_expected1 + assert cats1.tolist() == [None] + cats_expected1.tolist() # Write to new "shuffled" and "processed" dataset processor.write_to_dataset(tmpdir, dataset, nfiles=10, shuffle=True, apply_ops=True)