Skip to content

Commit

Permalink
Drop Raster Rows with all NULL Bands (#7654)
Browse files Browse the repository at this point in the history
* Add new CopyParams COPY FROM WITH option "raster_drop_if_all_null" (bool)
* Add a test (manually-edited uncompressed GeoTIFF with last pixel NULL)

If set to true, any raster pixel where ALL the data bands are NULL will be dropped on import.
Note that this requires the file to have a valid NoData value for all its data band types.

Signed-off-by: Misiu Godfrey <misiu.godfrey@kraken.mapd.com>
  • Loading branch information
simoneves authored and misiugodfrey committed Aug 26, 2024
1 parent e4bcbf0 commit 5832188
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 9 deletions.
7 changes: 5 additions & 2 deletions ImportExport/CopyParams.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ struct CopyParams {
bool raster_point_compute_angle;
std::string raster_import_dimensions;
std::string add_metadata_columns;
bool raster_drop_if_all_null;
// odbc parameters
std::string sql_select;
std::string sql_order_by;
Expand Down Expand Up @@ -138,7 +139,8 @@ struct CopyParams {
, raster_point_type(RasterPointType::kAuto)
, raster_scanlines_per_thread(32)
, raster_point_transform(RasterPointTransform::kAuto)
, raster_point_compute_angle{false} {}
, raster_point_compute_angle{false}
, raster_drop_if_all_null{false} {}

CopyParams(char d, const std::string& n, char l, size_t b, size_t retries, size_t wait)
: delimiter(d)
Expand Down Expand Up @@ -172,7 +174,8 @@ struct CopyParams {
, raster_point_type(RasterPointType::kAuto)
, raster_scanlines_per_thread(32)
, raster_point_transform(RasterPointTransform::kAuto)
, raster_point_compute_angle{false} {}
, raster_point_compute_angle{false}
, raster_drop_if_all_null{false} {}
};

} // namespace import_export
90 changes: 83 additions & 7 deletions ImportExport/Importer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5844,6 +5844,25 @@ ImportStatus Importer::importGDALRaster(

bool read_block_failed = false;

// prepare to store which band values in which rows are null
// (one bit per band per row, indexed as row * num_bands + band; the bitset
// stays empty unless raster_drop_if_all_null is set, so it must only be
// touched under that option)
boost::dynamic_bitset<> row_band_nulls;
if (copy_params.raster_drop_if_all_null) {
  // widen before multiplying so the bit count cannot wrap in narrower
  // integer arithmetic
  row_band_nulls.resize(static_cast<size_t>(num_elems) * num_bands);
}

// Records that the value of `band` in `row` was NULL.
// Bits are laid out as row * num_bands + band. Callers must only invoke this
// when raster_drop_if_all_null is set, since the bitset is sized only then.
auto set_row_band_null = [&](const int row, const uint32_t band) {
  // widen before multiplying so the index cannot wrap in 32-bit arithmetic
  auto const bit_index = (static_cast<size_t>(row) * num_bands) + band;
  row_band_nulls.set(bit_index);
};
// Returns true if every band value recorded for `row` was NULL, i.e. all
// num_bands bits starting at row * num_bands are set.
// Must only be called when raster_drop_if_all_null is set, since the bitset
// is sized only then.
auto all_row_bands_null = [&](const int row) -> bool {
  // widen before multiplying so the index cannot wrap in 32-bit arithmetic
  auto const first_bit_index = static_cast<size_t>(row) * num_bands;
  auto const end_bit_index = first_bit_index + num_bands;
  for (auto i = first_bit_index; i < end_bit_index; i++) {
    if (!row_band_nulls.test(i)) {
      // one non-null band is enough to keep the row; stop scanning
      return false;
    }
  }
  return true;
};

// for each band/column
for (uint32_t band_idx = 0; band_idx < num_bands; band_idx++) {
// the corresponding column
Expand Down Expand Up @@ -5884,6 +5903,9 @@ ImportStatus Importer::importGDALRaster(
if (null_value_valid && value == static_cast<int16_t>(null_value)) {
td.is_null = true;
td.val.int_val = NULL_SMALLINT;
if (copy_params.raster_drop_if_all_null) {
set_row_band_null(idx, band_idx);
}
} else {
td.is_null = false;
td.val.int_val = static_cast<int64_t>(value);
Expand All @@ -5899,6 +5921,9 @@ ImportStatus Importer::importGDALRaster(
if (null_value_valid && value == static_cast<int32_t>(null_value)) {
td.is_null = true;
td.val.int_val = NULL_INT;
if (copy_params.raster_drop_if_all_null) {
set_row_band_null(idx, band_idx);
}
} else {
td.is_null = false;
td.val.int_val = static_cast<int64_t>(value);
Expand All @@ -5914,6 +5939,9 @@ ImportStatus Importer::importGDALRaster(
if (null_value_valid && value == static_cast<uint32_t>(null_value)) {
td.is_null = true;
td.val.int_val = NULL_INT;
if (copy_params.raster_drop_if_all_null) {
set_row_band_null(idx, band_idx);
}
} else {
td.is_null = false;
td.val.int_val = static_cast<int64_t>(value);
Expand All @@ -5928,6 +5956,9 @@ ImportStatus Importer::importGDALRaster(
if (null_value_valid && value == static_cast<float>(null_value)) {
td.is_null = true;
td.val.real_val = NULL_FLOAT;
if (copy_params.raster_drop_if_all_null) {
set_row_band_null(idx, band_idx);
}
} else {
td.is_null = false;
td.val.real_val = static_cast<double>(value);
Expand All @@ -5942,6 +5973,9 @@ ImportStatus Importer::importGDALRaster(
if (null_value_valid && value == null_value) {
td.is_null = true;
td.val.real_val = NULL_DOUBLE;
if (copy_params.raster_drop_if_all_null) {
set_row_band_null(idx, band_idx);
}
} else {
td.is_null = false;
td.val.real_val = value;
Expand All @@ -5964,7 +5998,9 @@ ImportStatus Importer::importGDALRaster(
for (auto& col_buffer : import_buffers) {
col_buffer->clear();
}
thread_import_status.rows_rejected += num_elems;
thread_import_status.rows_estimated = 0;
thread_import_status.rows_completed = 0;
thread_import_status.rows_rejected = num_elems;
} else {
// metadata columns?
for (auto const& mci : metadata_column_infos) {
Expand All @@ -5975,8 +6011,50 @@ ImportStatus Importer::importGDALRaster(
}
col_idx++;
}
thread_import_status.rows_estimated = num_elems;
thread_import_status.rows_completed = num_elems;

// drop rows where all band columns are null?
int num_dropped_as_all_null = 0;
if (copy_params.raster_drop_if_all_null) {
// capture rows where ALL the band values (only) were NULL
// count rows first (implies two passes on the bitset but
// still quicker than building the row set if not needed,
// in the case where ALL rows are to be dropped)
for (int row = 0; row < num_elems; row++) {
if (all_row_bands_null(row)) {
num_dropped_as_all_null++;
}
}
// delete those rows from ALL column buffers (including coords and metadata)
if (num_dropped_as_all_null == num_elems) {
// all rows need dropping, just clear (fast)
for (auto& col_buffer : import_buffers) {
col_buffer->clear();
}
} else if (num_dropped_as_all_null > 0) {
// drop "bad" rows selectively (slower)
// build row set to drop
BadRowsTracker bad_rows_tracker;
for (int row = 0; row < num_elems; row++) {
if (all_row_bands_null(row)) {
bad_rows_tracker.rows.emplace(static_cast<int64_t>(row));
}
}
// then delete rows
for (auto& col_buffer : import_buffers) {
auto const* cd = col_buffer->getColumnDesc();
CHECK(cd);
auto const col_type = cd->columnType.get_type();
col_buffer->del_values(col_type, &bad_rows_tracker);
}
}
}

// final count
CHECK_LE(num_dropped_as_all_null, num_elems);
auto const actual_num_elems = num_elems - num_dropped_as_all_null;
thread_import_status.rows_estimated = actual_num_elems;
thread_import_status.rows_completed = actual_num_elems;
thread_import_status.rows_rejected = 0;
}

// done
Expand Down Expand Up @@ -6005,15 +6083,13 @@ ImportStatus Importer::importGDALRaster(
VLOG(1) << "Raster Importer: scanlines_in_block: " << scanlines_in_block
<< ", block_max_scanlines_per_thread: " << block_max_scanlines_per_thread;

std::vector<size_t> rows_per_thread;
auto block_wall_timer = timer_start();
// run max_threads scanlines at once
for (size_t thread_id = 0; thread_id < max_threads; thread_id++) {
const int y_start = block_y + thread_id * block_max_scanlines_per_thread;
if (y_start < band_size_y) {
const int y_end = std::min(y_start + block_max_scanlines_per_thread, band_size_y);
if (y_start < y_end) {
rows_per_thread.emplace_back((y_end - y_start) * band_size_x);
futures.emplace_back(
std::async(std::launch::async, import_rows, thread_id, y_start, y_end));
}
Expand All @@ -6036,8 +6112,8 @@ ImportStatus Importer::importGDALRaster(
// fashion so we can simultaneously read the next batch of data
auto thread_load_timer = timer_start();
// only try to load this thread's data if valid
if (import_status.rows_rejected == 0) {
load(import_buffers_vec[thread_idx], rows_per_thread[thread_idx], session_info);
if (import_status.rows_completed > 0) {
load(import_buffers_vec[thread_idx], import_status.rows_completed, session_info);
}
load_s += TIMER_STOP(thread_load_timer);
++thread_idx;
Expand Down
9 changes: 9 additions & 0 deletions Parser/ParserNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1561,6 +1561,15 @@ void parse_copy_params(const std::list<std::unique_ptr<NameValueAssign>>& option
if (bool_from_string_literal(str_literal)) {
copy_params.raster_point_compute_angle = true;
}
} else if (boost::iequals(*p->get_name(), "raster_drop_if_all_null")) {
const StringLiteral* str_literal =
dynamic_cast<const StringLiteral*>(p->get_value());
if (str_literal == nullptr) {
throw std::runtime_error("'raster_drop_if_all_null' option must be a boolean.");
}
if (bool_from_string_literal(str_literal)) {
copy_params.raster_drop_if_all_null = true;
}
} else if (boost::iequals(*p->get_name(), "sql_order_by")) {
if (auto str_literal = dynamic_cast<const StringLiteral*>(p->get_value())) {
copy_params.sql_order_by = *str_literal->get_stringval();
Expand Down
Binary file not shown.
9 changes: 9 additions & 0 deletions Tests/ImportExportTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5366,6 +5366,8 @@ TEST_F(BasicRasterImporterTest, HDF5ImageMultiThreaded) {

static constexpr const char* kPNG = "beach.png";
static constexpr const char* kGeoTIFF = "USGS_1m_x30y441_OH_Columbus_2019_small.tif";
static constexpr const char* kGeoTIFFLastPixelNull =
"USGS_1m_x30y441_OH_Columbus_2019_small_last_pixel_null.tif";
static constexpr const char* kGeoTIFFTruncated =
"USGS_1m_x30y441_OH_Columbus_2019_small_truncated.tif";
static constexpr const char* kGeoTIFFDir = "geotif/";
Expand Down Expand Up @@ -5801,6 +5803,13 @@ TEST_F(RasterImportTest, ImportGeoTIFFTest) {
{{-83.222766892364277, 39.818764365787992, 287.54092407226562}}));
}

// Imports kGeoTIFFLastPixelNull (per its name, a GeoTIFF whose last pixel is
// NULL) with raster_drop_if_all_null enabled and checks the resulting row
// count.
// NOTE(review): 39999 is presumably the file's total pixel count minus the
// single all-NULL pixel — confirm against the test file's dimensions.
TEST_F(RasterImportTest, ImportGeoTIFFDropNullsTest) {
ASSERT_NO_THROW(importTestCommon(kGeoTIFFLastPixelNull,
", raster_drop_if_all_null='true'",
"SELECT COUNT(*) FROM raster;",
{{39999L}}));
}

TEST_F(RasterImportTest, ImportGeoTIFFTruncatedTest) {
ASSERT_NO_THROW(importTestCommon(kGeoTIFFTruncated,
", max_reject=1000000",
Expand Down
2 changes: 2 additions & 0 deletions ThriftHandler/DBHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4042,6 +4042,7 @@ import_export::CopyParams DBHandler::thrift_to_copyparams(const TCopyParams& cp)
copy_params.add_metadata_columns = cp.add_metadata_columns;
copy_params.trim_spaces = cp.trim_spaces;
copy_params.geo_validate_geometry = cp.geo_validate_geometry;
copy_params.raster_drop_if_all_null = cp.raster_drop_if_all_null;
return copy_params;
}

Expand Down Expand Up @@ -4174,6 +4175,7 @@ TCopyParams DBHandler::copyparams_to_thrift(const import_export::CopyParams& cp)
copy_params.add_metadata_columns = cp.add_metadata_columns;
copy_params.trim_spaces = cp.trim_spaces;
copy_params.geo_validate_geometry = cp.geo_validate_geometry;
copy_params.raster_drop_if_all_null = cp.raster_drop_if_all_null;
return copy_params;
}

Expand Down
1 change: 1 addition & 0 deletions heavy.thrift
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ struct TCopyParams {
40: string add_metadata_columns;
41: bool trim_spaces=true;
42: bool geo_validate_geometry=false;
43: bool raster_drop_if_all_null=false;
}

struct TCreateParams {
Expand Down

0 comments on commit 5832188

Please sign in to comment.