apache · alamb · Aug 21, 2023 · Aug 18, 2023 · Aug 18, 2023 · alamb
diff --git a/datafusion/common/src/test_util.rs b/datafusion/common/src/test_util.rs
@@ -17,6 +17,8 @@
 
 //! Utility functions to make testing DataFusion based crates easier
 
+use std::{error::Error, path::PathBuf};
+
 /// A macro to assert that one string is contained within another with
 /// a nice error message if they are not.
 ///
@@ -62,3 +64,152 @@ macro_rules! assert_not_contains {
         );
     };
 }
+
+/// Returns the arrow test data directory, which is by default stored
+/// in a git submodule rooted at `testing/data`.
+///
+/// The default can be overridden by the optional environment
+/// variable `ARROW_TEST_DATA`
+///
+/// panics when the directory can not be found.
+///
+/// Example:
+/// ```
+/// let testdata = datafusion_common::test_util::arrow_test_data();
+/// let csvdata = format!("{}/csv/aggregate_test_100.csv", testdata);
+/// assert!(std::path::PathBuf::from(csvdata).exists());
+/// ```
+pub fn arrow_test_data() -> String {
+    match get_data_dir("ARROW_TEST_DATA", "../../testing/data") {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!("failed to get arrow data dir: {err}"),
+    }
+}
+
+/// Returns the parquet test data directory, which is by default
+/// stored in a git submodule rooted at
+/// `parquet-testing/data`.
+///
+/// The default can be overridden by the optional environment variable
+/// `PARQUET_TEST_DATA`
+///
+/// panics when the directory can not be found.
+///
+/// Example:
+/// ```
+/// let testdata = datafusion_common::test_util::parquet_test_data();
+/// let filename = format!("{}/binary.parquet", testdata);
+/// assert!(std::path::PathBuf::from(filename).exists());
+/// ```
+pub fn parquet_test_data() -> String {
+    match get_data_dir("PARQUET_TEST_DATA", "../../parquet-testing/data") {
+        Ok(pb) => pb.display().to_string(),
+        Err(err) => panic!("failed to get parquet data dir: {err}"),
+    }
+}
+
+/// Returns a directory path for finding test data.
+///
+/// udf_env: name of an environment variable
+///
+/// submodule_dir: fallback path (relative to CARGO_MANIFEST_DIR)
+///
+///  Returns either:
+/// The path referred to in `udf_env` if that variable is set and refers to a directory
+/// The submodule_data directory relative to CARGO_MANIFEST_PATH
+pub fn get_data_dir(
+    udf_env: &str,
+    submodule_data: &str,
+) -> Result<PathBuf, Box<dyn Error>> {
+    // Try user defined env.
+    if let Ok(dir) = std::env::var(udf_env) {
+        let trimmed = dir.trim().to_string();
+        if !trimmed.is_empty() {
+            let pb = PathBuf::from(trimmed);
+            if pb.is_dir() {
+                return Ok(pb);
+            } else {
+                return Err(format!(
+                    "the data dir `{}` defined by env {} not found",
+                    pb.display(),
+                    udf_env
+                )
+                .into());
+            }
+        }
+    }
+
+    // The env is undefined or its value is trimmed to empty, let's try default dir.
+
+    // env "CARGO_MANIFEST_DIR" is "the directory containing the manifest of your package",
+    // set by `cargo run` or `cargo test`, see:
+    // https://doc.rust-lang.org/cargo/reference/environment-variables.html
+    let dir = env!("CARGO_MANIFEST_DIR");
+
+    let pb = PathBuf::from(dir).join(submodule_data);
+    if pb.is_dir() {
+        Ok(pb)
+    } else {
+        Err(format!(
+            "env `{}` is undefined or has empty value, and the pre-defined data dir `{}` not found\n\
+             HINT: try running `git submodule update --init`",
+            udf_env,
+            pb.display(),
+        ).into())
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::env;
+
+    #[test]
+    fn test_data_dir() {
+        let udf_env = "get_data_dir";
+        let cwd = env::current_dir().unwrap();
+
+        let existing_pb = cwd.join("..");
+        let existing = existing_pb.display().to_string();
+        let existing_str = existing.as_str();
+
+        let non_existing = cwd.join("non-existing-dir").display().to_string();
+        let non_existing_str = non_existing.as_str();
+
+        env::set_var(udf_env, non_existing_str);
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_err());
+
+        env::set_var(udf_env, "");
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+
+        env::set_var(udf_env, " ");
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+
+        env::set_var(udf_env, existing_str);
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+
+        env::remove_var(udf_env);
+        let res = get_data_dir(udf_env, non_existing_str);
+        assert!(res.is_err());
+
+        let res = get_data_dir(udf_env, existing_str);
+        assert!(res.is_ok());
+        assert_eq!(res.unwrap(), existing_pb);
+    }
+
+    #[test]
+    fn test_happy() {
+        let res = arrow_test_data();
+        assert!(PathBuf::from(res).is_dir());
+
+        let res = parquet_test_data();
+        assert!(PathBuf::from(res).is_dir());
+    }
+}
diff --git a/datafusion/core/src/datasource/physical_plan/csv.rs b/datafusion/core/src/datasource/physical_plan/csv.rs
@@ -654,9 +654,9 @@ mod tests {
     use crate::datasource::physical_plan::chunked_store::ChunkedStore;
     use crate::prelude::*;
     use crate::test::{partitioned_csv_config, partitioned_file_groups};
-    use crate::test_util::{aggr_test_schema_with_missing_col, arrow_test_data};
     use crate::{scalar::ScalarValue, test_util::aggr_test_schema};
     use arrow::datatypes::*;
+    use datafusion_common::test_util::arrow_test_data;
     use futures::StreamExt;
     use object_store::local::LocalFileSystem;
     use rstest::*;
@@ -1249,4 +1249,20 @@ mod tests {
             }
         }
     }
+
+    /// Get the schema for the aggregate_test_* csv files with an additional filed not present in the files.
+    fn aggr_test_schema_with_missing_col() -> SchemaRef {
+        let fields =
+            Fields::from_iter(aggr_test_schema().fields().iter().cloned().chain(
+                std::iter::once(Arc::new(Field::new(
+                    "missing_col",
+                    DataType::Int64,
+                    true,
+                ))),
+            ));
+
+        let schema = Schema::new(fields);
+
+        Arc::new(schema)
+    }
 }