[c++] remove HDFS support (fixes #6436) (#6534)

This commit is contained in:
James Lamb 2024-07-12 09:31:34 -07:00 коммит произвёл GitHub
Родитель da174b8d06
Коммит 525f8b4b80
Не найден ключ, соответствующий данной подписи
Идентификатор ключа GPG: B5690EEEBB952194
6 изменённых файлов: 0 добавлений и 209 удалений

Просмотреть файл

@ -2,7 +2,6 @@ option(USE_MPI "Enable MPI-based distributed learning" OFF)
option(USE_OPENMP "Enable OpenMP" ON)
option(USE_GPU "Enable GPU-accelerated training" OFF)
option(USE_SWIG "Enable SWIG to generate Java API" OFF)
option(USE_HDFS "Enable HDFS support (EXPERIMENTAL)" OFF)
option(USE_TIMETAG "Set to ON to output time costs" OFF)
option(USE_CUDA "Enable CUDA-accelerated training " OFF)
option(USE_DEBUG "Set to ON for Debug mode" OFF)
@ -294,21 +293,6 @@ if(USE_CUDA)
endforeach()
endif()
# Optional HDFS support: warn that it is deprecated, then locate JNI, the
# hdfs.h header and the hdfs library and make them available to the build.
if(USE_HDFS)
# DEPRECATION-mode message pointing users at the tracking issue.
message(
DEPRECATION
"HDFS support in LightGBM is deprecated, and will be removed in a future release.\
See https://github.com/microsoft/LightGBM/issues/6436.
"
)
# All three lookups are REQUIRED, so configuration stops if any is missing.
find_package(JNI REQUIRED)
find_path(HDFS_INCLUDE_DIR hdfs.h REQUIRED)
find_library(HDFS_LIB NAMES hdfs REQUIRED)
include_directories(${HDFS_INCLUDE_DIR})
# Defines the USE_HDFS preprocessor symbol for the compiled sources.
add_definitions(-DUSE_HDFS)
# Libraries to link later: the hdfs library plus the JVM library.
set(HDFS_CXX_LIBRARIES ${HDFS_LIB} ${JAVA_JVM_LIBRARY})
endif()
include(CheckCXXSourceCompiles)
check_cxx_source_compiles("
#include <xmmintrin.h>
@ -647,10 +631,6 @@ if(USE_CUDA)
target_link_libraries(_lightgbm PRIVATE ${histograms})
endif()
# When HDFS support is enabled, link the libraries listed in
# HDFS_CXX_LIBRARIES into lightgbm_objs with PUBLIC visibility.
if(USE_HDFS)
target_link_libraries(lightgbm_objs PUBLIC ${HDFS_CXX_LIBRARIES})
endif()
if(WIN32)
if(MINGW OR CYGWIN)
target_link_libraries(lightgbm_objs PUBLIC ws2_32 iphlpapi)

Просмотреть файл

@ -40,8 +40,6 @@
# Compile CUDA version.
# --gpu
# Compile GPU version.
# --hdfs
# Compile HDFS version.
# --integrated-opencl
# Compile integrated OpenCL version.
# --mingw
@ -148,9 +146,6 @@ while [ $# -gt 0 ]; do
--gpu)
BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_GPU=ON"
;;
# --hdfs: append a config setting that defines the CMake option USE_HDFS=ON.
--hdfs)
BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.USE_HDFS=ON"
;;
--integrated-opencl)
BUILD_ARGS="${BUILD_ARGS} --config-setting=cmake.define.__INTEGRATE_OPENCL=ON"
;;

Просмотреть файл

@ -628,41 +628,6 @@ Windows
The CUDA version is not supported on Windows.
Use the GPU version (``device_type=gpu``) for GPU acceleration on Windows.
Build HDFS Version
~~~~~~~~~~~~~~~~~~
.. warning::
HDFS support in LightGBM is deprecated, and will be removed in a future release.
See https://github.com/microsoft/LightGBM/issues/6436.
The HDFS version of LightGBM was tested on a CDH-5.14.4 cluster.
Linux
^^^^^
On Linux, an HDFS version of LightGBM can be built using **CMake** and **gcc**.
1. Install `CMake`_.
2. Run the following commands:
.. code:: sh
git clone --recursive https://github.com/microsoft/LightGBM
cd LightGBM
cmake -B build -S . -DUSE_HDFS=ON
# if you have installed HDFS to a customized location, you should specify paths to HDFS headers (hdfs.h) and library (libhdfs.so) like the following:
# cmake \
# -DUSE_HDFS=ON \
# -DHDFS_LIB="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/lib64/libhdfs.so" \
# -DHDFS_INCLUDE_DIR="/opt/cloudera/parcels/CDH-5.14.4-1.cdh5.14.4.p0.3/include/" \
# ..
cmake --build build -j4
**Note**: glibc >= 2.14 is required.
**Note**: In some rare cases you may need to install OpenMP runtime library separately (use your package manager and search for ``lib[g|i]omp`` for doing this).
Build Java Wrapper
~~~~~~~~~~~~~~~~~~

1
docs/_static/js/script.js поставляемый
Просмотреть файл

@ -29,7 +29,6 @@ $(function() {
'#build-mpi-version',
'#build-gpu-version',
'#build-cuda-version',
'#build-hdfs-version',
'#build-java-wrapper',
'#build-c-unit-tests'
];

Просмотреть файл

@ -155,23 +155,6 @@ All requirements from `Build from Sources section <#build-from-sources>`__ apply
To use the CUDA version within Python, pass ``{"device": "cuda"}`` in parameters.
Build HDFS Version
~~~~~~~~~~~~~~~~~~
.. warning::
HDFS support in LightGBM is deprecated, and will be removed in a future release.
See https://github.com/microsoft/LightGBM/issues/6436.
.. code:: sh
pip install lightgbm --config-settings=cmake.define.USE_HDFS=ON
All requirements from `Build from Sources section <#build-from-sources>`__ apply for this installation option as well.
The **HDFS** library is needed: installation details can be found in `Installation Guide <https://github.com/microsoft/LightGBM/blob/master/docs/Installation-Guide.rst#build-hdfs-version>`__.
Note that the installation process of the HDFS version was tested only on **Linux**.
Build with MinGW-w64 on Windows
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@ -247,8 +230,6 @@ Run ``sh ./build-python.sh install --gpu`` to enable GPU support. All requiremen
Run ``sh ./build-python.sh install --cuda`` to enable CUDA support. All requirements from `Build CUDA Version section <#build-cuda-version>`__ apply for this installation option as well.
Run ``sh ./build-python.sh install --hdfs`` to enable HDFS support. All requirements from `Build HDFS Version section <#build-hdfs-version>`__ apply for this installation option as well.
Run ``sh ./build-python.sh install --bit32``, if you want to use 32-bit version. All requirements from `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__ apply for this installation option as well.
Run ``sh ./build-python.sh install --time-costs``, if you want to output time costs for different internal routines. All requirements from `Build with Time Costs Output section <#build-with-time-costs-output>`__ apply for this installation option as well.

Просмотреть файл

@ -11,10 +11,6 @@
#include <sstream>
#include <unordered_map>
#ifdef USE_HDFS
#include <hdfs.h>
#endif
namespace LightGBM {
struct LocalFile : VirtualFileReader, VirtualFileWriter {
@ -56,142 +52,17 @@ struct LocalFile : VirtualFileReader, VirtualFileWriter {
const std::string mode_;
};
// URI scheme prefix that identifies a filename as an HDFS path.
const char* kHdfsProto = "hdfs://";
#ifdef USE_HDFS
// Length of kHdfsProto as reported by strlen; used to find where the
// host[:port] part of an HDFS URI begins.
const size_t kHdfsProtoLength = static_cast<size_t>(strlen(kHdfsProto));
struct HDFSFile : VirtualFileReader, VirtualFileWriter {
HDFSFile(const std::string& filename, int flags)
: filename_(filename), flags_(flags) {}
~HDFSFile() {
if (file_ != NULL) {
hdfsCloseFile(fs_, file_);
}
}
bool Init() {
if (file_ == NULL) {
if (fs_ == NULL) {
fs_ = GetHDFSFileSystem(filename_);
}
if (fs_ != NULL &&
(flags_ == O_WRONLY || 0 == hdfsExists(fs_, filename_.c_str()))) {
file_ = hdfsOpenFile(fs_, filename_.c_str(), flags_, 0, 0, 0);
}
}
return file_ != NULL;
}
bool Exists() const {
if (fs_ == NULL) {
fs_ = GetHDFSFileSystem(filename_);
}
return fs_ != NULL && 0 == hdfsExists(fs_, filename_.c_str());
}
size_t Read(void* data, size_t bytes) const {
return FileOperation<void*>(data, bytes, &hdfsRead);
}
size_t Write(const void* data, size_t bytes) const {
return FileOperation<const void*>(data, bytes, &hdfsWrite);
}
private:
template <typename BufferType>
using fileOp = tSize (*)(hdfsFS, hdfsFile, BufferType, tSize);
template <typename BufferType>
inline size_t FileOperation(BufferType data, size_t bytes,
fileOp<BufferType> op) const {
char* buffer = const_cast<char*>(static_cast<const char*>(data));
size_t remain = bytes;
while (remain != 0) {
size_t nmax = static_cast<size_t>(std::numeric_limits<tSize>::max());
tSize ret = op(fs_, file_, buffer, std::min(nmax, remain));
if (ret > 0) {
size_t n = static_cast<size_t>(ret);
remain -= n;
buffer += n;
} else if (ret == 0) {
break;
} else if (errno != EINTR) {
Log::Fatal("Failed HDFS file operation [%s]", strerror(errno));
}
}
return bytes - remain;
}
static hdfsFS GetHDFSFileSystem(const std::string& uri) {
size_t end = uri.find("/", kHdfsProtoLength);
if (uri.find(kHdfsProto) != 0 || end == std::string::npos) {
Log::Warning("Bad HDFS uri, no namenode found [%s]", uri.c_str());
return NULL;
}
std::string hostport = uri.substr(kHdfsProtoLength, end - kHdfsProtoLength);
if (fs_cache_.count(hostport) == 0) {
fs_cache_[hostport] = MakeHDFSFileSystem(hostport);
}
return fs_cache_[hostport];
}
static hdfsFS MakeHDFSFileSystem(const std::string& hostport) {
std::istringstream iss(hostport);
std::string host;
tPort port = 0;
std::getline(iss, host, ':');
iss >> port;
hdfsFS fs = iss.eof() ? hdfsConnect(host.c_str(), port) : NULL;
if (fs == NULL) {
Log::Warning("Could not connect to HDFS namenode [%s]", hostport.c_str());
}
return fs;
}
mutable hdfsFS fs_ = NULL;
hdfsFile file_ = NULL;
const std::string filename_;
const int flags_;
static std::unordered_map<std::string, hdfsFS> fs_cache_;
};
std::unordered_map<std::string, hdfsFS> HDFSFile::fs_cache_ =
std::unordered_map<std::string, hdfsFS>();
// With USE_HDFS defined, WITH_HDFS(x) expands to x unchanged.
#define WITH_HDFS(x) x
#else
// Without USE_HDFS, WITH_HDFS(x) discards x and expands to a fatal log call.
#define WITH_HDFS(x) Log::Fatal("HDFS support is not enabled")
#endif // USE_HDFS
// Builds a reader for `filename`. When compiled with USE_HDFS and the name
// begins with kHdfsProto, an HDFSFile constructed with O_RDONLY is returned;
// in every other case a LocalFile in "rb" mode is returned.
std::unique_ptr<VirtualFileReader> VirtualFileReader::Make(
    const std::string& filename) {
#ifdef USE_HDFS
  // WITH_HDFS(x) is plain x under this guard, so the object is built directly.
  const bool has_hdfs_scheme = (filename.find(kHdfsProto) == 0);
  if (has_hdfs_scheme) {
    return std::unique_ptr<VirtualFileReader>(
        new HDFSFile(filename, O_RDONLY));
  }
#endif
  std::unique_ptr<VirtualFileReader> local_reader(
      new LocalFile(filename, "rb"));
  return local_reader;
}
// Builds a writer for `filename`. When compiled with USE_HDFS and the name
// begins with kHdfsProto, an HDFSFile constructed with O_WRONLY is returned;
// in every other case a LocalFile in "wb" mode is returned.
std::unique_ptr<VirtualFileWriter> VirtualFileWriter::Make(
    const std::string& filename) {
#ifdef USE_HDFS
  // WITH_HDFS(x) is plain x under this guard, so the object is built directly.
  const bool has_hdfs_scheme = (filename.find(kHdfsProto) == 0);
  if (has_hdfs_scheme) {
    return std::unique_ptr<VirtualFileWriter>(
        new HDFSFile(filename, O_WRONLY));
  }
#endif
  std::unique_ptr<VirtualFileWriter> local_writer(
      new LocalFile(filename, "wb"));
  return local_writer;
}
bool VirtualFileWriter::Exists(const std::string& filename) {
#ifdef USE_HDFS
if (0 == filename.find(kHdfsProto)) {
WITH_HDFS(HDFSFile file(filename, O_RDONLY); return file.Exists());
}
#endif
LocalFile file(filename, "rb");
return file.Exists();
}