[Feature] Support reading Parquet-format data from HDFS using libhdfs3 (apache#5686)

Add a new library so the Backend can read data from HDFS without a broker. This patch includes libhdfs3.a, which can read files stored on HDFS, and makes it possible to read Parquet data from HDFS directly. On this basis, more file formats on HDFS and other metadata sources can be supported in the future.
1 parent a1fa392, commit 29a3fa1.
Showing 11 changed files with 508 additions and 1 deletion.
@@ -0,0 +1,195 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "exec/hdfs_file_reader.h"

#include <sys/stat.h>
#include <unistd.h>

#include "common/logging.h"

namespace doris {

HdfsFileReader::HdfsFileReader(THdfsParams hdfs_params,
                               const std::string& path, int64_t start_offset)
        : _hdfs_params(hdfs_params), _path(path), _current_offset(start_offset),
          _file_size(-1), _hdfs_fs(nullptr), _hdfs_file(nullptr) {
    std::stringstream namenode_ss;
    namenode_ss << "hdfs://" << _hdfs_params.host << ":" << _hdfs_params.port;
    _namenode = namenode_ss.str();
}

HdfsFileReader::~HdfsFileReader() {
    close();
}

Status HdfsFileReader::connect() {
    hdfsBuilder* hdfs_builder = hdfsNewBuilder();
    hdfsBuilderSetNameNode(hdfs_builder, _namenode.c_str());
    // set hdfs user
    if (_hdfs_params.__isset.user) {
        hdfsBuilderSetUserName(hdfs_builder, _hdfs_params.user.c_str());
    }
    // set kerberos conf
    if (_hdfs_params.__isset.kerb_principal) {
        hdfsBuilderSetPrincipal(hdfs_builder, _hdfs_params.kerb_principal.c_str());
    }
    if (_hdfs_params.__isset.kerb_ticket_cache_path) {
        hdfsBuilderSetKerbTicketCachePath(hdfs_builder, _hdfs_params.kerb_ticket_cache_path.c_str());
    }
    // set token
    if (_hdfs_params.__isset.token) {
        hdfsBuilderSetToken(hdfs_builder, _hdfs_params.token.c_str());
    }
    // set other conf
    if (_hdfs_params.__isset.hdfs_conf) {
        for (const THdfsConf& conf : _hdfs_params.hdfs_conf) {
            hdfsBuilderConfSetStr(hdfs_builder, conf.key.c_str(), conf.value.c_str());
        }
    }
    _hdfs_fs = hdfsBuilderConnect(hdfs_builder);
    if (_hdfs_fs == nullptr) {
        std::stringstream ss;
        ss << "connect failed. " << _namenode;
        return Status::InternalError(ss.str());
    }
    return Status::OK();
}

Status HdfsFileReader::open() {
    if (!closed()) {
        close();
    }
    RETURN_IF_ERROR(connect());
    _hdfs_file = hdfsOpenFile(_hdfs_fs, _path.c_str(), O_RDONLY, 0, 0, 0);
    if (_hdfs_file == nullptr) {
        std::stringstream ss;
        ss << "open file failed. " << _namenode << _path;
        return Status::InternalError(ss.str());
    }
    LOG(INFO) << "open file. " << _namenode << _path;
    return seek(_current_offset);
}

void HdfsFileReader::close() {
    if (!closed()) {
        if (_hdfs_file != nullptr && _hdfs_fs != nullptr) {
            std::stringstream ss;
            ss << "close hdfs file: " << _namenode << _path;
            LOG(INFO) << ss.str();
            // If the hdfs file was valid, the memory associated with it will
            // be freed at the end of this call, even if there was an I/O error.
            hdfsCloseFile(_hdfs_fs, _hdfs_file);
        }
        if (_hdfs_fs != nullptr) {
            // Even if there is an error, the resources associated with the hdfsFS will be freed.
            hdfsDisconnect(_hdfs_fs);
        }
    }
    _hdfs_file = nullptr;
    _hdfs_fs = nullptr;
}

bool HdfsFileReader::closed() {
    return _hdfs_file == nullptr || _hdfs_fs == nullptr;
}

// Read all bytes
Status HdfsFileReader::read_one_message(std::unique_ptr<uint8_t[]>* buf, size_t* length) {
    int64_t file_size = size() - _current_offset;
    if (file_size <= 0) {
        buf->reset();
        *length = 0;
        return Status::OK();
    }
    bool eof;
    *length = file_size;
    buf->reset(new uint8_t[file_size]);
    read(buf->get(), length, &eof);
    return Status::OK();
}

Status HdfsFileReader::read(uint8_t* buf, size_t* buf_len, bool* eof) {
    readat(_current_offset, (int64_t)*buf_len, (int64_t*)buf_len, buf);
    if (*buf_len == 0) {
        *eof = true;
    } else {
        *eof = false;
    }
    return Status::OK();
}

Status HdfsFileReader::readat(int64_t position, int64_t nbytes, int64_t* bytes_read, void* out) {
    if (position != _current_offset) {
        int ret = hdfsSeek(_hdfs_fs, _hdfs_file, position);
        if (ret != 0) { // check fseek return value
            std::stringstream ss;
            ss << "hdfsSeek failed. " << _namenode << _path;
            return Status::InternalError(ss.str());
        }
    }

    *bytes_read = hdfsRead(_hdfs_fs, _hdfs_file, out, nbytes);
    if (*bytes_read < 0) {
        std::stringstream ss;
        ss << "Read hdfs file failed. " << _namenode << _path;
        return Status::InternalError(ss.str());
    }
    _current_offset += *bytes_read; // save offset with file
    return Status::OK();
}

int64_t HdfsFileReader::size() {
    if (_file_size == -1) {
        bool need_init_fs = false;
        if (_hdfs_fs == nullptr) {
            need_init_fs = true;
            if (!connect().ok()) {
                return -1;
            }
        }
        hdfsFileInfo* file_info = hdfsGetPathInfo(_hdfs_fs, _path.c_str());
        if (file_info == nullptr) {
            LOG(WARNING) << "get path info failed: " << _namenode << _path;
            close();
            return -1;
        }
        _file_size = file_info->mSize;
        hdfsFreeFileInfo(file_info, 1);
        if (need_init_fs) {
            close();
        }
    }
    return _file_size;
}

Status HdfsFileReader::seek(int64_t position) {
    int res = hdfsSeek(_hdfs_fs, _hdfs_file, position);
    if (res != 0) {
        char err_buf[64];
        std::stringstream ss;
        ss << "Seek to offset failed. offset=" << position
           << ", error=" << strerror_r(errno, err_buf, 64);
        return Status::InternalError(ss.str());
    }
    return Status::OK();
}

Status HdfsFileReader::tell(int64_t* position) {
    *position = _current_offset;
    return Status::OK();
}

} // namespace doris
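One detail worth noting in connect(): every key/value pair in hdfs_conf is forwarded verbatim to hdfsBuilderConfSetStr(), so HDFS client settings can be passed through from the frontend without backend code changes. A hypothetical snippet populating such a pair is shown below (the key is just an example of a standard HDFS client option; the value is illustrative):

    // Hypothetical: pass an arbitrary HDFS client setting through to libhdfs3.
    doris::THdfsConf conf;
    conf.key = "dfs.client.read.shortcircuit";
    conf.value = "false";
    params.hdfs_conf.push_back(conf);
    params.__isset.hdfs_conf = true; // mark the thrift optional field as set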
@@ -0,0 +1,61 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <hdfs/hdfs.h>

#include "exec/file_reader.h"

#include "gen_cpp/PlanNodes_types.h"

namespace doris {

class HdfsFileReader : public FileReader {
public:
    HdfsFileReader(THdfsParams hdfs_params, const std::string& path, int64_t start_offset);
    virtual ~HdfsFileReader();

    virtual Status open() override;

    // Reads content into 'buf'; 'buf_len' is the maximum size of the buffer.
    // Returns OK on success, with 'buf_len' set to the number of bytes read.
    // When the end of the file is reached, 'eof' is set to true and 'buf_len'
    // is set to zero.
    virtual Status read(uint8_t* buf, size_t* buf_len, bool* eof) override;
    virtual Status readat(int64_t position, int64_t nbytes, int64_t* bytes_read,
                          void* out) override;
    virtual Status read_one_message(std::unique_ptr<uint8_t[]>* buf, size_t* length) override;
    virtual int64_t size() override;
    virtual Status seek(int64_t position) override;
    virtual Status tell(int64_t* position) override;
    virtual void close() override;
    virtual bool closed() override;

private:
    Status connect();

private:
    THdfsParams _hdfs_params;
    std::string _namenode;
    std::string _path;
    int64_t _current_offset;
    int64_t _file_size;
    hdfsFS _hdfs_fs;
    hdfsFile _hdfs_file;
};

} // namespace doris
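To illustrate how this reader is meant to be used from a caller's point of view, here is a minimal sketch. It is not part of this diff: the function name read_whole_file, the namenode host, the port, and the HDFS path are placeholders, and the surrounding scan-node wiring is assumed. The caller fills in THdfsParams, constructs an HdfsFileReader, opens it, and pulls the remaining bytes of the file into a buffer:

    // Hypothetical caller-side sketch (not part of this commit).
    #include <memory>

    #include "exec/hdfs_file_reader.h"
    #include "gen_cpp/PlanNodes_types.h"

    doris::Status read_whole_file(std::unique_ptr<uint8_t[]>* data, size_t* len) {
        doris::THdfsParams params;
        params.host = "namenode.example.com"; // used to build "hdfs://host:port"
        params.port = 8020;
        params.__isset.user = true;           // thrift optional-field flag
        params.user = "hdfs";                 // picked up by hdfsBuilderSetUserName()

        // Start reading at offset 0 of the given file (placeholder path).
        doris::HdfsFileReader reader(params, "/path/to/file.parquet", 0);
        doris::Status st = reader.open();     // connects to HDFS and opens the file
        if (!st.ok()) {
            return st;
        }
        // read_one_message() allocates a buffer and reads from the current
        // offset to the end of the file.
        st = reader.read_one_message(data, len);
        reader.close();                       // also invoked by the destructor
        return st;
    }

Note that size() will open and tear down a temporary HDFS connection if it is called before open(), which is why the sketch opens the reader first and only then asks it to read.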