-
Notifications
You must be signed in to change notification settings - Fork 4k
Description
Git hash of Arrow version tested:
e8270d7
Steps to reproduce:
Build Arrow and Parquet statically, and install to a prefix directory, ~/arrow/cpp/release/prefix:
cd
git clone https://github.com/apache/arrow.git
cd arrow/cpp/
mkdir release
cd release
cmake -DARROW_PARQUET=ON -DARROW_BUILD_SHARED=OFF -DARROW_BUILD_STATIC=ON -DCMAKE_CXX_STANDARD=17 ..
make
mkdir prefix
make DESTDIR=prefix install
Build the following main file:
#include <arrow/api.h>
#include <arrow/io/api.h>
#include <parquet/exception.h>
#include <parquet/file_reader.h>
#include <parquet/file_writer.h>
#include <parquet/metadata.h>
#include <iostream>
#include <memory>
#include <string>
#include <utility>

int main(int, char**) {
std::shared_ptr<arrow::io::FileOutputStream> output;
std::string tempfile_path{"/tmp/arrowfooterbug.parquet"};
PARQUET_ASSIGN_OR_THROW(output, arrow::io::FileOutputStream::Open(tempfile_path));
parquet::schema::NodeVector fields;
fields.push_back(
parquet::schema::PrimitiveNode::Make("Test",
parquet::Repetition::REQUIRED,
parquet::Type::INT64,
parquet::ConvertedType::UINT_64));
static std::shared_ptr<parquet::schema::GroupNode> schema{
std::static_pointer_cast<parquet::schema::GroupNode>(
parquet::schema::GroupNode::Make("schema", parquet::Repetition::REQUIRED,
fields))};
std::shared_ptr<parquet::WriterProperties> properties{
parquet::WriterProperties::Builder{}.build()};
std::shared_ptr<parquet::ArrowWriterProperties> arrow_properties{
parquet::ArrowWriterProperties::Builder{}.build()};
std::unique_ptr<::parquet::ParquetFileWriter> writer{
::parquet::ParquetFileWriter::Open(output, schema, properties)};
writer->Close();
std::size_t writer_size{writer->metadata()->size()};
std::size_t reader_size{
parquet::ParquetFileReader::OpenFile(tempfile_path, false)->metadata()->size()};
std::cout << writer_size << ' ' << reader_size << '\n';
}
Using the following CMakeLists.txt:
project("arrowfooterbug")
set(ARROW_HOME "/home/enolan/arrow/cpp/release/prefix/usr/local")
find_package(Arrow CONFIG PATHS "/home/enolan/arrow/cpp/release/prefix/usr/local/lib/cmake/arrow" NO_DEFAULT_PATH REQUIRED)
find_package(Parquet CONFIG PATHS "/home/enolan/arrow/cpp/release/prefix/usr/local/lib/cmake/arrow" NO_DEFAULT_PATH REQUIRED)
set(CMAKE_CXX_FLAGS "-std=c++17")
add_executable(main main.cpp)
target_link_libraries(main parquet_static arrow_static)
Expected behavior:
writer_size matches reader_size (output is 89 89)
Actual behavior:
writer_size is always zero (output is 0 89)
Note that the interface of ParquetFileWriter::metadata() is specified as:
/// Returns the file metadata, only available after calling Close().
const std::shared_ptr<FileMetaData> metadata() const;
And FileMetaData::size() is specified as:
/// \brief Size of the original thrift encoded metadata footer.
uint32_t size() const;
So you would expect to be able to recover the size of the footer from the metadata after invoking Close().
Environment: OS: Ubuntu 21.04
Linux ed-ubuntu-pc 5.11.0-34-generic #36-Ubuntu SMP Thu Aug 26 19:22:09 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux
Compiler: gcc version 9.3.0 (Ubuntu 9.3.0-23ubuntu2)
Reporter: Edward Nolan
Note: This issue was originally created as ARROW-14521. Please see the migration documentation for further details.