Skip to content

Commit

Permalink
[c++] remove HDFS support (fixes microsoft#6436) (microsoft#6534)
Browse files Browse the repository at this point in the history
  • Loading branch information
jameslamb authored Jul 12, 2024
1 parent da174b8 commit 525f8b4
Show file tree
Hide file tree
Showing 6 changed files with 0 additions and 209 deletions.
20 changes: 0 additions & 20 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ option(USE_MPI "Enable MPI-based distributed learning" OFF)
option(USE_OPENMP "Enable OpenMP" ON)
option(USE_GPU "Enable GPU-accelerated training" OFF)
option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
Expand Down Expand Up @@ -294,21 +293,6 @@ if(USE_CUDA)
endforeach()
endif()

# Optional, deprecated HDFS backend: locate JNI and libhdfs, add the HDFS
# include directory, and define USE_HDFS for the C++ sources.
if(USE_HDFS)
  # message() concatenates multiple strings with no separator, so the text is
  # passed as two quoted pieces (note the trailing space in the first one)
  # instead of relying on a backslash line continuation, which glued
  # "release." and "See" together.
  message(
    DEPRECATION
    "HDFS support in LightGBM is deprecated, and will be removed in a future release. "
    "See https://github.com/microsoft/LightGBM/issues/6436."
  )
  find_package(JNI REQUIRED)
  find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED)
  find_library(HDFS_LIB NAMES hdfs REQUIRED)
  include_directories(${HDFS_INCLUDE_DIR})
  add_definitions(-DUSE_HDFS)
  # Libraries to link when HDFS is enabled.
  # NOTE(review): JAVA_JVM_LIBRARY is presumably set by find_package(JNI) - confirm.
  set(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY})
endif()

include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <xmmintrin.h>
Expand Down Expand Up @@ -647,10 +631,6 @@ if(USE_CUDA)
target_link_libraries(_lightgbm PRIVATE ${histograms})
endif()

# When HDFS is enabled, link the libraries listed in HDFS_CXX_LIBRARIES into
# lightgbm_objs. PUBLIC makes them part of the target's link interface too.
if(USE_HDFS)
target_link_libraries(lightgbm_objs PUBLIC ${HDFS_CXX_LIBRARIES})
endif()

if(WIN32)
if(MINGW OR CYGWIN)
target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi)
Expand Down
5 changes: 0 additions & 5 deletions build-python.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,6 @@
# Compile CUDA version.
# --gpu
# Compile GPU version.
# --hdfs
# Compile HDFS version.
# --integrated-opencl
# Compile integrated OpenCL version.
# --mingw
Expand Down Expand Up @@ -148,9 +146,6 @@ while [ $# -gt 0 ]; do
--gpu)
BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_GPU=ON"
;;
--hdfs)
BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_HDFS=ON"
;;
--integrated-opencl)
BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.__INTEGRATE_OPENCL=ON"
;;
Expand Down
35 changes: 0 additions & 35 deletions docs/Installation-Guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -628,41 +628,6 @@ Windows
The CUDA version is not supported on Windows.
Use the GPU version (``device_type=gpu``) for GPU acceleration on Windows.

Build HDFS Version
~~~~~~~~~~~~~~~~~~

.. warning::
HDFS support in LightGBM is deprecated, and will be removed in a future release.
See https://github.com/microsoft/LightGBM/issues/6436.

The HDFS version of LightGBM was tested on a CDH-5.14.4 cluster.

Linux
^^^^^

On Linux, an HDFS version of LightGBM can be built using **CMake** and **gcc**.

1. Install `CMake`_.

2. Run the following commands:

.. code:: sh
git clone --recursive https://github.com/microsoft/LightGBM
cd LightGBM
cmake -B build -S . -DUSE_HDFS=ON
# if you have installed HDFS to a customized location, you should specify paths to HDFS headers (hdfs.h) and library (libhdfs.so) like the following:
# cmake \
# -DUSE_HDFS=ON \
# -DHDFS_LIB="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/lib64/libhdfs.so" \
# -DHDFS_INCLUDE_DIR="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/include/" \
# ..
cmake --build build -j4
**Note**: glibc >= 2.14 is required.

**Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this).

Build Java Wrapper
~~~~~~~~~~~~~~~~~~

Expand Down
1 change: 0 additions & 1 deletion docs/_static/js/script.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ $(function() {
'#build-mpi-version',
'#build-gpu-version',
'#build-cuda-version',
'#build-hdfs-version',
'#build-java-wrapper',
'#build-c-unit-tests'
];
Expand Down
19 changes: 0 additions & 19 deletions python-package/README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -155,23 +155,6 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply

To use the CUDA version within Python, pass ``{"device": "cuda"}`` respectively in parameters.

Build HDFS Version
~~~~~~~~~~~~~~~~~~

.. warning::
HDFS support in LightGBM is deprecated, and will be removed in a future release.
See https://github.com/microsoft/LightGBM/issues/6436.

.. code:: sh
pip install lightgbm --config-settings=cmake.define.USE_HDFS=ON
All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well.

The **HDFS** library is required; installation details can be found in the `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-hdfs-version>`__.

Note that the installation process of the HDFS version was tested only on **Linux**.

Build with MinGW-w64 on Windows
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -247,8 +230,6 @@ Run ``sh ./build-python.sh install --gpu`` to enable GPU support. All requiremen

Run ``sh ./build-python.sh install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.

Run ``sh ./build-python.sh install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well.

Run ``sh ./build-python.sh install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well.

Run ``sh ./build-python.sh install --time-costs``, if you want to output time costs for different internal routines. All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well.
Expand Down
129 changes: 0 additions & 129 deletions src/io/file_io.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,6 @@
#include <sstream>
#include <unordered_map>

#ifdef USE_HDFS
#include <hdfs.h>
#endif

namespace LightGBM {

struct LocalFile : VirtualFileReader, VirtualFileWriter {
Expand Down Expand Up @@ -56,142 +52,17 @@ struct LocalFile : VirtualFileReader, VirtualFileWriter {
const std::string mode_;
};

const char* kHdfsProto = "hdfs://";

#ifdef USE_HDFS
const size_t kHdfsProtoLength = static_cast<size_t>(strlen(kHdfsProto));

struct HDFSFile : VirtualFileReader, VirtualFileWriter {
HDFSFile(const std::string& filename, int flags)
: filename_(filename), flags_(flags) {}
~HDFSFile() {
if (file_ != NULL) {
hdfsCloseFile(fs_, file_);
}
}

bool Init() {
if (file_ == NULL) {
if (fs_ == NULL) {
fs_ = GetHDFSFileSystem(filename_);
}
if (fs_ != NULL &&
(flags_ == O_WRONLY || 0 == hdfsExists(fs_, filename_.c_str()))) {
file_ = hdfsOpenFile(fs_, filename_.c_str(), flags_, 0, 0, 0);
}
}
return file_ != NULL;
}

bool Exists() const {
if (fs_ == NULL) {
fs_ = GetHDFSFileSystem(filename_);
}
return fs_ != NULL && 0 == hdfsExists(fs_, filename_.c_str());
}

size_t Read(void* data, size_t bytes) const {
return FileOperation<void*>(data, bytes, &hdfsRead);
}

size_t Write(const void* data, size_t bytes) const {
return FileOperation<const void*>(data, bytes, &hdfsWrite);
}

private:
template <typename BufferType>
using fileOp = tSize (*)(hdfsFS, hdfsFile, BufferType, tSize);

template <typename BufferType>
inline size_t FileOperation(BufferType data, size_t bytes,
fileOp<BufferType> op) const {
char* buffer = const_cast<char*>(static_cast<const char*>(data));
size_t remain = bytes;
while (remain != 0) {
size_t nmax = static_cast<size_t>(std::numeric_limits<tSize>::max());
tSize ret = op(fs_, file_, buffer, std::min(nmax, remain));
if (ret > 0) {
size_t n = static_cast<size_t>(ret);
remain -= n;
buffer += n;
} else if (ret == 0) {
break;
} else if (errno != EINTR) {
Log::Fatal("Failed HDFS file operation [%s]", strerror(errno));
}
}
return bytes - remain;
}

static hdfsFS GetHDFSFileSystem(const std::string& uri) {
size_t end = uri.find("/", kHdfsProtoLength);
if (uri.find(kHdfsProto) != 0 || end == std::string::npos) {
Log::Warning("Bad HDFS uri, no namenode found [%s]", uri.c_str());
return NULL;
}
std::string hostport = uri.substr(kHdfsProtoLength, end - kHdfsProtoLength);
if (fs_cache_.count(hostport) == 0) {
fs_cache_[hostport] = MakeHDFSFileSystem(hostport);
}
return fs_cache_[hostport];
}

static hdfsFS MakeHDFSFileSystem(const std::string& hostport) {
std::istringstream iss(hostport);
std::string host;
tPort port = 0;
std::getline(iss, host, ':');
iss >> port;
hdfsFS fs = iss.eof() ? hdfsConnect(host.c_str(), port) : NULL;
if (fs == NULL) {
Log::Warning("Could not connect to HDFS namenode [%s]", hostport.c_str());
}
return fs;
}

mutable hdfsFS fs_ = NULL;
hdfsFile file_ = NULL;
const std::string filename_;
const int flags_;
static std::unordered_map<std::string, hdfsFS> fs_cache_;
};

std::unordered_map<std::string, hdfsFS> HDFSFile::fs_cache_ =
std::unordered_map<std::string, hdfsFS>();

// With USE_HDFS defined, WITH_HDFS(x) expands to its argument unchanged.
#define WITH_HDFS(x) x
#else
// Without USE_HDFS, WITH_HDFS(x) discards its argument and logs a fatal error.
// NOTE(review): every WITH_HDFS use visible in this file sits inside its own
// #ifdef USE_HDFS, so this fallback looks unreachable from here - confirm.
#define WITH_HDFS(x) Log::Fatal("HDFS support is not enabled")
#endif // USE_HDFS

std::unique_ptr<VirtualFileReader> VirtualFileReader::Make(
const std::string& filename) {
#ifdef USE_HDFS
if (0 == filename.find(kHdfsProto)) {
WITH_HDFS(return std::unique_ptr<VirtualFileReader>(
new HDFSFile(filename, O_RDONLY)));
}
#endif
return std::unique_ptr<VirtualFileReader>(new LocalFile(filename, "rb"));
}

std::unique_ptr<VirtualFileWriter> VirtualFileWriter::Make(
const std::string& filename) {
#ifdef USE_HDFS
if (0 == filename.find(kHdfsProto)) {
WITH_HDFS(return std::unique_ptr<VirtualFileWriter>(
new HDFSFile(filename, O_WRONLY)));
}
#endif
return std::unique_ptr<VirtualFileWriter>(new LocalFile(filename, "wb"));
}

bool VirtualFileWriter::Exists(const std::string& filename) {
#ifdef USE_HDFS
if (0 == filename.find(kHdfsProto)) {
WITH_HDFS(HDFSFile file(filename, O_RDONLY); return file.Exists());
}
#endif
LocalFile file(filename, "rb");
return file.Exists();
}
Expand Down

0 comments on commit 525f8b4

Please sign in to comment.