Skip to content
This repository was archived by the owner on Oct 21, 2024. It is now read-only.

Commit d6b6f87

Browse files
committed
ARROW-7691: [C++] Check non-scalar Flatbuffers fields are not null
We're discussing whether to make those fields required in the schema definitions (which would make validation automatic by the flatbuffers generated verifier), but in the meantime we can check those fields manually. This should fix a bunch of issues detected by OSS-Fuzz. Closes apache#6293 from pitrou/ARROW-7691-check-fb-fields-not-null and squashes the following commits: 02478a6 <Antoine Pitrou> Use a function rather than a macro e6d3d88 <Antoine Pitrou> ARROW-7691: Check non-scalar Flatbuffers fields are not null Authored-by: Antoine Pitrou <antoine@python.org> Signed-off-by: Antoine Pitrou <antoine@python.org>
1 parent 0326ea3 commit d6b6f87

File tree

3 files changed

+46
-77
lines changed

3 files changed

+46
-77
lines changed

cpp/src/arrow/ipc/metadata_internal.cc

Lines changed: 18 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -286,11 +286,7 @@ Status ConcreteTypeFromFlatbuffer(flatbuf::Type type, const void* type_data,
286286
case flatbuf::Type::Timestamp: {
287287
auto ts_type = static_cast<const flatbuf::Timestamp*>(type_data);
288288
TimeUnit::type unit = FromFlatbufferUnit(ts_type->unit());
289-
if (ts_type->timezone() != 0 && ts_type->timezone()->Length() > 0) {
290-
*out = timestamp(unit, ts_type->timezone()->str());
291-
} else {
292-
*out = timestamp(unit);
293-
}
289+
*out = timestamp(unit, StringFromFlatbuffers(ts_type->timezone()));
294290
return Status::OK();
295291
}
296292
case flatbuf::Type::Duration: {
@@ -369,10 +365,7 @@ static Status TypeFromFlatbuffer(const flatbuf::Field* field,
369365
const KeyValueMetadata* field_metadata,
370366
std::shared_ptr<DataType>* out) {
371367
auto type_data = field->type();
372-
if (type_data == nullptr) {
373-
return Status::IOError(
374-
"Type-pointer in custom metadata of flatbuffer-encoded Field is null.");
375-
}
368+
CHECK_FLATBUFFERS_NOT_NULL(type_data, "Field.type");
376369
RETURN_NOT_OK(ConcreteTypeFromFlatbuffer(field->type_type(), type_data, children, out));
377370

378371
// Look for extension metadata in custom_metadata field
@@ -474,14 +467,8 @@ Status KeyValueMetadataFromFlatbuffer(const KVVector* fb_metadata,
474467

475468
metadata->reserve(fb_metadata->size());
476469
for (const auto& pair : *fb_metadata) {
477-
if (pair->key() == nullptr) {
478-
return Status::IOError(
479-
"Key-pointer in custom metadata of flatbuffer-encoded Schema is null.");
480-
}
481-
if (pair->value() == nullptr) {
482-
return Status::IOError(
483-
"Value-pointer in custom metadata of flatbuffer-encoded Schema is null.");
484-
}
470+
CHECK_FLATBUFFERS_NOT_NULL(pair->key(), "custom_metadata.key");
471+
CHECK_FLATBUFFERS_NOT_NULL(pair->value(), "custom_metadata.value");
485472
metadata->Append(pair->key()->str(), pair->value()->str());
486473
}
487474

@@ -776,16 +763,16 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona
776763

777764
// Reconstruct the data type
778765
auto children = field->children();
779-
if (children == nullptr) {
780-
return Status::IOError("Children-pointer of flatbuffer-encoded Field is null.");
781-
}
766+
CHECK_FLATBUFFERS_NOT_NULL(children, "Field.children");
782767
std::vector<std::shared_ptr<Field>> child_fields(children->size());
783768
for (int i = 0; i < static_cast<int>(children->size()); ++i) {
784769
RETURN_NOT_OK(
785770
FieldFromFlatbuffer(children->Get(i), dictionary_memo, &child_fields[i]));
786771
}
787772
RETURN_NOT_OK(TypeFromFlatbuffer(field, child_fields, metadata.get(), &type));
788773

774+
auto field_name = StringFromFlatbuffers(field->name());
775+
789776
const flatbuf::DictionaryEncoding* encoding = field->dictionary();
790777

791778
if (encoding != nullptr) {
@@ -794,22 +781,14 @@ Status FieldFromFlatbuffer(const flatbuf::Field* field, DictionaryMemo* dictiona
794781
// dictionary_memo
795782
std::shared_ptr<DataType> index_type;
796783
auto int_data = encoding->indexType();
797-
if (int_data == nullptr) {
798-
return Status::IOError(
799-
"indexType-pointer in custom metadata of flatbuffer-encoded DictionaryEncoding "
800-
"is null.");
801-
}
784+
CHECK_FLATBUFFERS_NOT_NULL(int_data, "DictionaryEncoding.indexType");
802785
RETURN_NOT_OK(IntFromFlatbuffer(int_data, &index_type));
803786
ARROW_ASSIGN_OR_RAISE(type,
804787
DictionaryType::Make(index_type, type, encoding->isOrdered()));
805-
*out = ::arrow::field(field->name()->str(), type, field->nullable(), metadata);
788+
*out = ::arrow::field(field_name, type, field->nullable(), metadata);
806789
RETURN_NOT_OK(dictionary_memo->AddField(encoding->id(), *out));
807790
} else {
808-
auto name = field->name();
809-
if (name == nullptr) {
810-
return Status::IOError("Name-pointer of flatbuffer-encoded Field is null.");
811-
}
812-
*out = ::arrow::field(name->str(), type, field->nullable(), metadata);
791+
*out = ::arrow::field(field_name, type, field->nullable(), metadata);
813792
}
814793
return Status::OK();
815794
}
@@ -1183,17 +1162,15 @@ Status WriteFileFooter(const Schema& schema, const std::vector<FileBlock>& dicti
11831162
Status GetSchema(const void* opaque_schema, DictionaryMemo* dictionary_memo,
11841163
std::shared_ptr<Schema>* out) {
11851164
auto schema = static_cast<const flatbuf::Schema*>(opaque_schema);
1186-
if (schema->fields() == nullptr) {
1187-
return Status::IOError("Fields-pointer of flatbuffer-encoded Schema is null.");
1188-
}
1165+
CHECK_FLATBUFFERS_NOT_NULL(schema, "schema");
1166+
CHECK_FLATBUFFERS_NOT_NULL(schema->fields(), "Schema.fields");
11891167
int num_fields = static_cast<int>(schema->fields()->size());
11901168

11911169
std::vector<std::shared_ptr<Field>> fields(num_fields);
11921170
for (int i = 0; i < num_fields; ++i) {
11931171
const flatbuf::Field* field = schema->fields()->Get(i);
1194-
if (field == nullptr) {
1195-
return Status::IOError("Field-pointer of flatbuffer-encoded Schema is null.");
1196-
}
1172+
// XXX I don't think this check is necessary (AP)
1173+
CHECK_FLATBUFFERS_NOT_NULL(field, "DictionaryEncoding.indexType");
11971174
RETURN_NOT_OK(FieldFromFlatbuffer(field, dictionary_memo, &fields[i]));
11981175
}
11991176

@@ -1225,12 +1202,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
12251202
auto dim = tensor->shape()->Get(i);
12261203

12271204
shape->push_back(dim->size());
1228-
auto fb_name = dim->name();
1229-
if (fb_name == 0) {
1230-
dim_names->push_back("");
1231-
} else {
1232-
dim_names->push_back(fb_name->str());
1233-
}
1205+
dim_names->push_back(StringFromFlatbuffers(dim->name()));
12341206
}
12351207

12361208
if (tensor->strides() && tensor->strides()->size() > 0) {
@@ -1239,11 +1211,7 @@ Status GetTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>* type
12391211
}
12401212
}
12411213

1242-
auto type_data = tensor->type();
1243-
if (type_data == nullptr) {
1244-
return Status::IOError(
1245-
"Type-pointer in custom metadata of flatbuffer-encoded Tensor is null.");
1246-
}
1214+
auto type_data = tensor->type(); // Required
12471215
return ConcreteTypeFromFlatbuffer(tensor->type_type(), type_data, {}, type);
12481216
}
12491217

@@ -1283,12 +1251,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>
12831251
}
12841252

12851253
if (dim_names) {
1286-
auto fb_name = dim->name();
1287-
if (fb_name == 0) {
1288-
dim_names->push_back("");
1289-
} else {
1290-
dim_names->push_back(fb_name->str());
1291-
}
1254+
dim_names->push_back(StringFromFlatbuffers(dim->name()));
12921255
}
12931256
}
12941257
}
@@ -1324,11 +1287,7 @@ Status GetSparseTensorMetadata(const Buffer& metadata, std::shared_ptr<DataType>
13241287
}
13251288
}
13261289

1327-
auto type_data = sparse_tensor->type();
1328-
if (type_data == nullptr) {
1329-
return Status::IOError(
1330-
"Type-pointer in custom metadata of flatbuffer-encoded SparseTensor is null.");
1331-
}
1290+
auto type_data = sparse_tensor->type(); // Required
13321291
if (type) {
13331292
return ConcreteTypeFromFlatbuffer(sparse_tensor->type_type(), type_data, {}, type);
13341293
} else {

cpp/src/arrow/ipc/metadata_internal.h

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,20 +31,16 @@
3131
#include "arrow/buffer.h"
3232
#include "arrow/ipc/dictionary.h" // IYWU pragma: keep
3333
#include "arrow/ipc/message.h"
34-
#include "arrow/memory_pool.h"
3534
#include "arrow/sparse_tensor.h"
3635
#include "arrow/status.h"
36+
#include "arrow/type_fwd.h"
37+
#include "arrow/util/macros.h"
3738

3839
#include "generated/Message_generated.h"
3940
#include "generated/Schema_generated.h"
4041

4142
namespace arrow {
4243

43-
class DataType;
44-
class Schema;
45-
class Tensor;
46-
class SparseTensor;
47-
4844
namespace flatbuf = org::apache::arrow::flatbuf;
4945

5046
namespace io {
@@ -92,6 +88,23 @@ struct FileBlock {
9288
int64_t body_length;
9389
};
9490

91+
// Low-level utilities to help with reading Flatbuffers data.
92+
93+
#define CHECK_FLATBUFFERS_NOT_NULL(fb_value, name) \
94+
if ((fb_value) == NULLPTR) { \
95+
return Status::IOError("Unexpected null field ", name, \
96+
" in flatbuffer-encoded metadata"); \
97+
}
98+
99+
template <typename T>
100+
inline uint32_t FlatBuffersVectorSize(const flatbuffers::Vector<T>* vec) {
101+
return (vec == NULLPTR) ? 0 : vec->size();
102+
}
103+
104+
inline std::string StringFromFlatbuffers(const flatbuffers::String* s) {
105+
return (s == NULLPTR) ? "" : s->str();
106+
}
107+
95108
// Read interface classes. We do not fully deserialize the flatbuffers so that
96109
// individual fields metadata can be retrieved from very large schema without
97110
//

cpp/src/arrow/ipc/reader.cc

Lines changed: 9 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -102,10 +102,7 @@ class IpcComponentSource {
102102

103103
Status GetBuffer(int buffer_index, std::shared_ptr<Buffer>* out) {
104104
auto buffers = metadata_->buffers();
105-
if (buffers == nullptr) {
106-
return Status::IOError(
107-
"Buffers-pointer of flatbuffer-encoded RecordBatch is null.");
108-
}
105+
CHECK_FLATBUFFERS_NOT_NULL(buffers, "RecordBatch.buffers");
109106
if (buffer_index >= static_cast<int>(buffers->size())) {
110107
return Status::IOError("buffer_index out of range.");
111108
}
@@ -127,9 +124,7 @@ class IpcComponentSource {
127124

128125
Status GetFieldMetadata(int field_index, ArrayData* out) {
129126
auto nodes = metadata_->nodes();
130-
if (nodes == nullptr) {
131-
return Status::IOError("Nodes-pointer of flatbuffer-encoded Table is null.");
132-
}
127+
CHECK_FLATBUFFERS_NOT_NULL(nodes, "Table.nodes");
133128
// pop off a field
134129
if (field_index >= static_cast<int>(nodes->size())) {
135130
return Status::Invalid("Ran out of field metadata, likely malformed");
@@ -441,6 +436,7 @@ Status ReadDictionary(const Buffer& metadata, DictionaryMemo* dictionary_memo,
441436
// The dictionary is embedded in a record batch with a single column
442437
std::shared_ptr<RecordBatch> batch;
443438
auto batch_meta = dictionary_batch->data();
439+
CHECK_FLATBUFFERS_NOT_NULL(batch_meta, "DictionaryBatch.data");
444440
RETURN_NOT_OK(ReadRecordBatch(batch_meta, ::arrow::schema({value_field}),
445441
dictionary_memo, options, file, &batch));
446442
if (batch->num_columns() != 1) {
@@ -475,9 +471,6 @@ class RecordBatchStreamReader::RecordBatchStreamReaderImpl {
475471
}
476472
CHECK_MESSAGE_TYPE(Message::SCHEMA, message->type());
477473
CHECK_HAS_NO_BODY(*message);
478-
if (message->header() == nullptr) {
479-
return Status::IOError("Header-pointer of flatbuffer-encoded Message is null.");
480-
}
481474
return internal::GetSchema(message->header(), &dictionary_memo_, &schema_);
482475
}
483476

@@ -663,9 +656,13 @@ class RecordBatchFileReader::RecordBatchFileReaderImpl {
663656
return Status::OK();
664657
}
665658

666-
int num_dictionaries() const { return footer_->dictionaries()->size(); }
659+
int num_dictionaries() const {
660+
return static_cast<int>(internal::FlatBuffersVectorSize(footer_->dictionaries()));
661+
}
667662

668-
int num_record_batches() const { return footer_->recordBatches()->size(); }
663+
int num_record_batches() const {
664+
return static_cast<int>(internal::FlatBuffersVectorSize(footer_->recordBatches()));
665+
}
669666

670667
MetadataVersion version() const {
671668
return internal::GetMetadataVersion(footer_->version());

0 commit comments

Comments
 (0)