Merge branch 'master' into nccl-dev

This commit is contained in:
shiyu1994 2024-06-30 22:14:04 +08:00 committed by GitHub
Parents 75afe5e010 e9a6c79807
Commit 1e6e4a1cca
No known key found for this signature
GPG key ID: B5690EEEBB952194
73 changed files: 1420 additions and 765 deletions

View file

@ -1,16 +1,14 @@
version: 4.3.0.99.{build}
version: 4.4.0.99.{build}
image: Visual Studio 2015
platform: x64
configuration: # a trick to construct a build matrix with multiple Python versions
configuration:
- '3.8'
# only build pull requests and
# commits to 'master' or any branch starting with 'release'
# only build on 'master' and pull requests targeting it
branches:
only:
- master
- /^release/
environment:
matrix:
@ -25,12 +23,13 @@ install:
- git submodule update --init --recursive # get `external_libs` folder
- set PATH=C:\mingw-w64\x86_64-8.1.0-posix-seh-rt_v6-rev0\mingw64\bin;%PATH%
- set PYTHON_VERSION=%CONFIGURATION%
- set CONDA_ENV="test-env"
- ps: |
$env:ALLOW_SKIP_ARROW_TESTS = "1"
$env:APPVEYOR = "true"
$env:CMAKE_BUILD_PARALLEL_LEVEL = 4
$env:MINICONDA = "C:\Miniconda3-x64"
$env:PATH = "$env:MINICONDA;$env:MINICONDA\Scripts;$env:PATH"
$env:BUILD_SOURCESDIRECTORY = "$env:APPVEYOR_BUILD_FOLDER"
$env:LGB_VER = (Get-Content $env:APPVEYOR_BUILD_FOLDER\VERSION.txt).trim()
build: false

View file

@ -26,11 +26,12 @@ fi
PY_MINOR_VER=$(python -c "import sys; print(sys.version_info.minor)")
if [ $PY_MINOR_VER -gt 7 ]; then
echo "pydistcheck..."
pip install pydistcheck
pip install 'pydistcheck>=0.7.0'
if { test "${TASK}" = "cuda" || test "${METHOD}" = "wheel"; }; then
pydistcheck \
--inspect \
--ignore 'compiled-objects-have-debug-symbols,distro-too-large-compressed' \
--ignore 'compiled-objects-have-debug-symbols'\
--ignore 'distro-too-large-compressed' \
--max-allowed-size-uncompressed '500M' \
--max-allowed-files 800 \
${DIST_DIR}/* || exit 1

View file

@ -0,0 +1,51 @@
# [description]
#
# Similar to ci-core.txt, but specific to Python 3.8.
#
# Unlike ci-core.txt, this includes a Python version and uses
# `=` and `<=` pins to make solves faster and prevent against
# issues like https://github.com/microsoft/LightGBM/pull/6370.
#
# [usage]
#
# conda create \
# --name test-env \
# --file ./.ci/conda-envs/ci-core-py38.txt
#
# python
python=3.8.*
# direct imports
cffi=1.15.*
dask=2023.5.*
distributed=2023.5.*
joblib=1.4.*
matplotlib-base=3.7.*
numpy=1.24.*
pandas=1.5.*
pyarrow-core=16.1.*
python-graphviz=0.20.*
scikit-learn=1.3.*
scipy=1.10.*
# testing-only dependencies
cloudpickle=3.0.*
pluggy=1.5.*
psutil=5.9.8
pytest=8.2.*
# other recursive dependencies, just
# pinned here to help speed up solves
bokeh=3.1.*
fsspec=2024.5.*
msgpack-python=1.0.*
pluggy=1.5.*
pytz=2024.1
setuptools=69.5.*
snappy=1.2.*
tomli=2.0.*
tornado=6.4.*
wheel=0.43.*
zict=3.0.*
zipp=3.17.*

View file

@ -6,15 +6,12 @@
TRIGGER_PHRASE: Code phrase that triggers workflow.
"""
import json
from os import environ
from sys import argv, exit
from time import sleep
try:
from urllib import request
except ImportError:
import urllib2 as request
from urllib import request
def get_runs(trigger_phrase):

View file

@ -52,6 +52,8 @@ LINTERS_TO_USE <- list(
, "inner_combine" = lintr::inner_combine_linter()
, "is_numeric" = lintr::is_numeric_linter()
, "lengths" = lintr::lengths_linter()
, "length_levels" = lintr::length_levels_linter()
, "length_test" = lintr::length_test_linter()
, "line_length" = lintr::line_length_linter(length = 120L)
, "literal_coercion" = lintr::literal_coercion_linter()
, "matrix" = lintr::matrix_apply_linter()
@ -66,6 +68,7 @@ LINTERS_TO_USE <- list(
, "redundant_equals" = lintr::redundant_equals_linter()
, "regex_subset" = lintr::regex_subset_linter()
, "routine_registration" = lintr::routine_registration_linter()
, "scalar_in" = lintr::scalar_in_linter()
, "semicolon" = lintr::semicolon_linter()
, "seq" = lintr::seq_linter()
, "spaces_inside" = lintr::spaces_inside_linter()

View file

@ -14,15 +14,13 @@ if [[ $OS_NAME == "macos" ]]; then
if [[ $COMPILER == "clang" ]]; then
brew install libomp
if [[ $AZURE == "true" ]]; then
sudo xcode-select -s /Applications/Xcode_11.7.app/Contents/Developer || exit 1
sudo xcode-select -s /Applications/Xcode_13.1.0.app/Contents/Developer || exit 1
fi
else # gcc
# Check https://github.com/actions/runner-images/tree/main/images/macos for available
# versions of Xcode
sudo xcode-select -s /Applications/Xcode_14.3.1.app/Contents/Developer || exit 1
if [[ $TASK != "mpi" ]]; then
brew install gcc
fi
brew install gcc
fi
if [[ $TASK == "mpi" ]]; then
brew install open-mpi
@ -30,10 +28,6 @@ if [[ $OS_NAME == "macos" ]]; then
if [[ $TASK == "swig" ]]; then
brew install swig
fi
curl \
-sL \
-o miniforge.sh \
https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-${ARCH}.sh
else # Linux
if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
# fixes error "unable to initialize frontend: Dialog"
@ -45,35 +39,30 @@ else # Linux
software-properties-common
sudo apt-get install --no-install-recommends -y \
apt-utils \
build-essential \
ca-certificates \
cmake \
curl \
git \
iputils-ping \
jq \
libcurl4 \
libicu-dev \
libssl-dev \
libunwind8 \
locales \
locales-all \
netcat \
unzip \
zip || exit 1
locales-all || exit 1
if [[ $COMPILER == "clang" ]]; then
sudo apt-get install --no-install-recommends -y \
clang \
libomp-dev
elif [[ $COMPILER == "clang-17" ]]; then
sudo apt-get install wget
sudo apt-get install --no-install-recommends -y \
wget
wget -qO- https://apt.llvm.org/llvm-snapshot.gpg.key | sudo tee /etc/apt/trusted.gpg.d/apt.llvm.org.asc
sudo apt-add-repository deb http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main
sudo apt-add-repository deb-src http://apt.llvm.org/jammy/ llvm-toolchain-jammy-17 main
sudo apt-get update
sudo apt-get install -y clang-17
sudo apt-get install --no-install-recommends -y libomp-17-dev
sudo apt-get install -y \
clang-17 \
libomp-17-dev
fi
export LANG="en_US.UTF-8"
@ -144,16 +133,14 @@ else # Linux
apt-get install --no-install-recommends -y \
cmake
fi
if [[ $SETUP_CONDA != "false" ]]; then
curl \
-sL \
-o miniforge.sh \
https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${ARCH}.sh
fi
fi
if [[ "${TASK}" != "r-package" ]] && [[ "${TASK}" != "r-rchk" ]]; then
if [[ $SETUP_CONDA != "false" ]]; then
curl \
-sL \
-o miniforge.sh \
https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-${ARCH}.sh
sh miniforge.sh -b -p $CONDA
fi
conda config --set always_yes yes --set changeps1 no

50
.ci/test-python-latest.sh Executable file
View file

@ -0,0 +1,50 @@
#!/bin/bash
set -e -E -u -o pipefail
# latest versions of lightgbm's dependencies,
# including pre-releases and nightlies
#
# ref: https://github.com/pydata/xarray/blob/31111b3afe44fd6f7dac363264e94186cc5168d2/.github/workflows/upstream-dev-ci.yaml
echo "installing testing dependencies"
python -m pip install \
cloudpickle \
psutil \
pytest
echo "done installing testing dependencies"
echo "installing lightgbm's dependencies"
python -m pip install \
--extra-index-url https://pypi.anaconda.org/scientific-python-nightly-wheels/simple \
--prefer-binary \
--pre \
--upgrade \
'numpy>=2.0.0.dev0' \
'matplotlib>=3.10.0.dev0' \
'pandas>=3.0.0.dev0' \
'scikit-learn>=1.6.dev0' \
'scipy>=1.15.0.dev0'
python -m pip install \
--extra-index-url https://pypi.fury.io/arrow-nightlies/ \
--prefer-binary \
--pre \
--upgrade \
'pyarrow>=17.0.0.dev0'
python -m pip install \
'cffi>=1.15.1'
echo "done installing lightgbm's dependencies"
echo "installing lightgbm"
pip install --no-deps dist/*.whl
echo "done installing lightgbm"
echo "installed package versions:"
pip freeze
echo ""
echo "running tests"
pytest tests/c_api_test/
pytest tests/python_package_test/

View file

@ -3,19 +3,20 @@
set -e -E -u -o pipefail
# oldest versions of dependencies published after
# minimum supported Python version's first release
# minimum supported Python version's first release,
# for which there are wheels compatible with the
# python:{version} image
#
# see https://devguide.python.org/versions/
#
echo "installing lightgbm's dependencies"
pip install \
'cffi==1.15.1' \
'dataclasses' \
'numpy==1.16.6' \
'pandas==0.24.0' \
'numpy==1.19.0' \
'pandas==1.1.3' \
'pyarrow==6.0.1' \
'scikit-learn==0.18.2' \
'scipy==0.19.0' \
'scikit-learn==0.24.0' \
'scipy==1.6.0' \
|| exit 1
echo "done installing lightgbm's dependencies"

View file

@ -3,6 +3,7 @@
set -e -E -o -u pipefail
# defaults
CONDA_ENV="test-env"
IN_UBUNTU_BASE_CONTAINER=${IN_UBUNTU_BASE_CONTAINER:-"false"}
METHOD=${METHOD:-""}
PRODUCES_ARTIFACTS=${PRODUCES_ARTIFACTS:-"false"}
@ -10,6 +11,8 @@ SANITIZERS=${SANITIZERS:-""}
ARCH=$(uname -m)
LGB_VER=$(head -n 1 "${BUILD_DIRECTORY}/VERSION.txt")
if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "gcc" ]]; then
export CXX=g++-11
export CC=gcc-11
@ -26,8 +29,21 @@ if [[ $IN_UBUNTU_BASE_CONTAINER == "true" ]]; then
export LC_ALL="en_US.UTF-8"
fi
# Setting MACOSX_DEPLOYMENT_TARGET prevents CMake from building against too-new
# macOS features, and helps tools like Python build tools determine the appropriate
# wheel compatibility tags.
#
# ref:
# * https://cmake.org/cmake/help/latest/envvar/MACOSX_DEPLOYMENT_TARGET.html
# * https://github.com/scikit-build/scikit-build-core/blob/acb7d0346e4a05bcb47a4ea3939c705ab71e3145/src/scikit_build_core/builder/macos.py#L36
if [[ $ARCH == "x86_64" ]]; then
export MACOSX_DEPLOYMENT_TARGET=10.15
else
export MACOSX_DEPLOYMENT_TARGET=12.0
fi
if [[ "${TASK}" == "r-package" ]] || [[ "${TASK}" == "r-rchk" ]]; then
bash ${BUILD_DIRECTORY}/.ci/test_r_package.sh || exit 1
bash "${BUILD_DIRECTORY}/.ci/test_r_package.sh" || exit 1
exit 0
fi
@ -54,27 +70,31 @@ if [[ $TASK == "if-else" ]]; then
source activate $CONDA_ENV
cmake -B build -S . || exit 1
cmake --build build --target lightgbm -j4 || exit 1
cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp && ../../lightgbm config=predict.conf output_result=origin.pred || exit 1
cd $BUILD_DIRECTORY/tests/cpp_tests && ../../lightgbm config=predict.conf output_result=ifelse.pred && python test.py || exit 1
cd "$BUILD_DIRECTORY/tests/cpp_tests"
../../lightgbm config=train.conf convert_model_language=cpp convert_model=../../src/boosting/gbdt_prediction.cpp
../../lightgbm config=predict.conf output_result=origin.pred
../../lightgbm config=predict.conf output_result=ifelse.pred
python test.py
exit 0
fi
cd "${BUILD_DIRECTORY}"
if [[ $TASK == "swig" ]]; then
cmake -B build -S . -DUSE_SWIG=ON
cmake --build build -j4 || exit 1
if [[ $OS_NAME == "linux" ]] && [[ $COMPILER == "gcc" ]]; then
objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit 1
objdump -T $BUILD_DIRECTORY/lib_lightgbm_swig.so >> $BUILD_DIRECTORY/objdump.log || exit 1
python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit 1
objdump -T ./lib_lightgbm.so > ./objdump.log || exit 1
objdump -T ./lib_lightgbm_swig.so >> ./objdump.log || exit 1
python ./helpers/check_dynamic_dependencies.py ./objdump.log || exit 1
fi
if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
cp $BUILD_DIRECTORY/build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar
cp ./build/lightgbmlib.jar $BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_$OS_NAME.jar
fi
exit 0
fi
if [[ $TASK == "lint" ]]; then
cd ${BUILD_DIRECTORY}
mamba create -q -y -n $CONDA_ENV \
${CONDA_PYTHON_REQUIREMENT} \
'cmakelint>=1.4.2' \
@ -83,19 +103,19 @@ if [[ $TASK == "lint" ]]; then
'mypy>=1.8.0' \
'pre-commit>=3.6.0' \
'pyarrow>=6.0' \
'r-lintr>=3.1'
'r-lintr>=3.1.2'
source activate $CONDA_ENV
echo "Linting Python code"
bash ${BUILD_DIRECTORY}/.ci/lint-python.sh || exit 1
bash ./.ci/lint-python.sh || exit 1
echo "Linting R code"
Rscript ${BUILD_DIRECTORY}/.ci/lint_r_code.R ${BUILD_DIRECTORY} || exit 1
Rscript ./.ci/lint_r_code.R "${BUILD_DIRECTORY}" || exit 1
echo "Linting C++ code"
bash ${BUILD_DIRECTORY}/.ci/lint-cpp.sh || exit 1
bash ./.ci/lint-cpp.sh || exit 1
exit 0
fi
if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then
cd $BUILD_DIRECTORY/docs
cd "${BUILD_DIRECTORY}/docs"
mamba env create \
-n $CONDA_ENV \
--file ./env.yml || exit 1
@ -107,29 +127,32 @@ if [[ $TASK == "check-docs" ]] || [[ $TASK == "check-links" ]]; then
'rstcheck>=6.2.0' || exit 1
source activate $CONDA_ENV
# check reStructuredText formatting
cd $BUILD_DIRECTORY/python-package
cd "${BUILD_DIRECTORY}/python-package"
rstcheck --report-level warning $(find . -type f -name "*.rst") || exit 1
cd $BUILD_DIRECTORY/docs
cd "${BUILD_DIRECTORY}/docs"
rstcheck --report-level warning --ignore-directives=autoclass,autofunction,autosummary,doxygenfile $(find . -type f -name "*.rst") || exit 1
# build docs
make html || exit 1
if [[ $TASK == "check-links" ]]; then
# check docs for broken links
pip install --user linkchecker
pip install linkchecker
linkchecker --config=.linkcheckerrc ./_build/html/*.html || exit 1
exit 0
fi
# check the consistency of parameters' descriptions and other stuff
cp $BUILD_DIRECTORY/docs/Parameters.rst $BUILD_DIRECTORY/docs/Parameters-backup.rst
cp $BUILD_DIRECTORY/src/io/config_auto.cpp $BUILD_DIRECTORY/src/io/config_auto-backup.cpp
python $BUILD_DIRECTORY/helpers/parameter_generator.py || exit 1
diff $BUILD_DIRECTORY/docs/Parameters-backup.rst $BUILD_DIRECTORY/docs/Parameters.rst || exit 1
diff $BUILD_DIRECTORY/src/io/config_auto-backup.cpp $BUILD_DIRECTORY/src/io/config_auto.cpp || exit 1
cd "${BUILD_DIRECTORY}"
cp ./docs/Parameters.rst ./docs/Parameters-backup.rst
cp ./src/io/config_auto.cpp ./src/io/config_auto-backup.cpp
python ./helpers/parameter_generator.py || exit 1
diff ./docs/Parameters-backup.rst ./docs/Parameters.rst || exit 1
diff ./src/io/config_auto-backup.cpp ./src/io/config_auto.cpp || exit 1
exit 0
fi
if [[ $PYTHON_VERSION == "3.7" ]]; then
CONDA_REQUIREMENT_FILES="--file ${BUILD_DIRECTORY}/.ci/conda-envs/ci-core-py37.txt"
elif [[ $PYTHON_VERSION == "3.8" ]]; then
CONDA_REQUIREMENT_FILES="--file ${BUILD_DIRECTORY}/.ci/conda-envs/ci-core-py38.txt"
else
CONDA_REQUIREMENT_FILES="--file ${BUILD_DIRECTORY}/.ci/conda-envs/ci-core.txt"
fi
@ -143,38 +166,21 @@ mamba create \
source activate $CONDA_ENV
cd $BUILD_DIRECTORY
if [[ $OS_NAME == "macos" ]] && [[ $COMPILER == "clang" ]]; then
# fix "OMP: Error #15: Initializing libiomp5.dylib, but found libomp.dylib already initialized." (OpenMP library conflict due to conda's MKL)
for LIBOMP_ALIAS in libgomp.dylib libiomp5.dylib libomp.dylib; do sudo ln -sf "$(brew --cellar libomp)"/*/lib/libomp.dylib $CONDA_PREFIX/lib/$LIBOMP_ALIAS || exit 1; done
fi
cd "${BUILD_DIRECTORY}"
if [[ $TASK == "sdist" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz -v || exit 1
sh ./build-python.sh sdist || exit 1
sh .ci/check_python_dists.sh ./dist || exit 1
pip install ./dist/lightgbm-$LGB_VER.tar.gz -v || exit 1
if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
cp $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz $BUILD_ARTIFACTSTAGINGDIRECTORY || exit 1
cp ./dist/lightgbm-$LGB_VER.tar.gz $BUILD_ARTIFACTSTAGINGDIRECTORY || exit 1
fi
pytest $BUILD_DIRECTORY/tests/python_package_test || exit 1
pytest ./tests/python_package_test || exit 1
exit 0
elif [[ $TASK == "bdist" ]]; then
if [[ $OS_NAME == "macos" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
mv \
./dist/*.whl \
./dist/tmp.whl || exit 1
if [[ $ARCH == "x86_64" ]]; then
PLATFORM="macosx_10_15_x86_64.macosx_11_6_x86_64.macosx_12_5_x86_64"
else
echo "ERROR: macos wheels not supported yet on architecture '${ARCH}'"
exit 1
fi
mv \
./dist/tmp.whl \
dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl || exit 1
sh ./build-python.sh bdist_wheel || exit 1
sh .ci/check_python_dists.sh ./dist || exit 1
if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
cp dist/lightgbm-$LGB_VER-py3-none-macosx*.whl $BUILD_ARTIFACTSTAGINGDIRECTORY || exit 1
fi
@ -184,91 +190,88 @@ elif [[ $TASK == "bdist" ]]; then
else
PLATFORM="manylinux2014_$ARCH"
fi
cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --integrated-opencl || exit 1
sh ./build-python.sh bdist_wheel --integrated-opencl || exit 1
mv \
./dist/*.whl \
./dist/tmp.whl || exit 1
mv \
./dist/tmp.whl \
./dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
sh .ci/check_python_dists.sh ./dist || exit 1
if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
cp dist/lightgbm-$LGB_VER-py3-none-$PLATFORM.whl $BUILD_ARTIFACTSTAGINGDIRECTORY || exit 1
fi
# Make sure we can do both CPU and GPU; see tests/python_package_test/test_dual.py
export LIGHTGBM_TEST_DUAL_CPU_GPU=1
fi
pip install --user $BUILD_DIRECTORY/dist/*.whl || exit 1
pytest $BUILD_DIRECTORY/tests || exit 1
pip install -v ./dist/*.whl || exit 1
pytest ./tests || exit 1
exit 0
fi
if [[ $TASK == "gpu" ]]; then
sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' $BUILD_DIRECTORY/include/LightGBM/config.h
grep -q 'std::string device_type = "gpu"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit 1 # make sure that changes were really done
sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "gpu";/' ./include/LightGBM/config.h
grep -q 'std::string device_type = "gpu"' ./include/LightGBM/config.h || exit 1 # make sure that changes were really done
if [[ $METHOD == "pip" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
sh ./build-python.sh sdist || exit 1
sh .ci/check_python_dists.sh ./dist || exit 1
pip install \
--user \
-v \
--config-settings=cmake.define.USE_GPU=ON \
$BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \
./dist/lightgbm-$LGB_VER.tar.gz \
|| exit 1
pytest $BUILD_DIRECTORY/tests/python_package_test || exit 1
pytest ./tests/python_package_test || exit 1
exit 0
elif [[ $METHOD == "wheel" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --gpu || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit 1
pytest $BUILD_DIRECTORY/tests || exit 1
sh ./build-python.sh bdist_wheel --gpu || exit 1
sh ./.ci/check_python_dists.sh ./dist || exit 1
pip install ./dist/lightgbm-$LGB_VER*.whl -v || exit 1
pytest ./tests || exit 1
exit 0
elif [[ $METHOD == "source" ]]; then
cmake -B build -S . -DUSE_GPU=ON
fi
elif [[ $TASK == "cuda" ]]; then
sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' $BUILD_DIRECTORY/include/LightGBM/config.h
grep -q 'std::string device_type = "cuda"' $BUILD_DIRECTORY/include/LightGBM/config.h || exit 1 # make sure that changes were really done
sed -i'.bak' 's/std::string device_type = "cpu";/std::string device_type = "cuda";/' ./include/LightGBM/config.h
grep -q 'std::string device_type = "cuda"' ./include/LightGBM/config.h || exit 1 # make sure that changes were really done
# by default ``gpu_use_dp=false`` for efficiency. change to ``true`` here for exact results in ci tests
sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' $BUILD_DIRECTORY/include/LightGBM/config.h
grep -q 'gpu_use_dp = true' $BUILD_DIRECTORY/include/LightGBM/config.h || exit 1 # make sure that changes were really done
sed -i'.bak' 's/gpu_use_dp = false;/gpu_use_dp = true;/' ./include/LightGBM/config.h
grep -q 'gpu_use_dp = true' ./include/LightGBM/config.h || exit 1 # make sure that changes were really done
if [[ $METHOD == "pip" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
sh ./build-python.sh sdist || exit 1
sh ./.ci/check_python_dists.sh ./dist || exit 1
pip install \
--user \
-v \
--config-settings=cmake.define.USE_CUDA=ON \
$BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \
./dist/lightgbm-$LGB_VER.tar.gz \
|| exit 1
pytest $BUILD_DIRECTORY/tests/python_package_test || exit 1
pytest ./tests/python_package_test || exit 1
exit 0
elif [[ $METHOD == "wheel" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --cuda || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit 1
pytest $BUILD_DIRECTORY/tests || exit 1
sh ./build-python.sh bdist_wheel --cuda || exit 1
sh ./.ci/check_python_dists.sh ./dist || exit 1
pip install ./dist/lightgbm-$LGB_VER*.whl -v || exit 1
pytest ./tests || exit 1
exit 0
elif [[ $METHOD == "source" ]]; then
cmake -B build -S . -DUSE_CUDA=ON
fi
elif [[ $TASK == "mpi" ]]; then
if [[ $METHOD == "pip" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh sdist || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
sh ./build-python.sh sdist || exit 1
sh ./.ci/check_python_dists.sh ./dist || exit 1
pip install \
--user \
-v \
--config-settings=cmake.define.USE_MPI=ON \
$BUILD_DIRECTORY/dist/lightgbm-$LGB_VER.tar.gz \
./dist/lightgbm-$LGB_VER.tar.gz \
|| exit 1
pytest $BUILD_DIRECTORY/tests/python_package_test || exit 1
pytest ./tests/python_package_test || exit 1
exit 0
elif [[ $METHOD == "wheel" ]]; then
cd $BUILD_DIRECTORY && sh ./build-python.sh bdist_wheel --mpi || exit 1
sh $BUILD_DIRECTORY/.ci/check_python_dists.sh $BUILD_DIRECTORY/dist || exit 1
pip install --user $BUILD_DIRECTORY/dist/lightgbm-$LGB_VER*.whl -v || exit 1
pytest $BUILD_DIRECTORY/tests || exit 1
sh ./build-python.sh bdist_wheel --mpi || exit 1
sh ./.ci/check_python_dists.sh ./dist || exit 1
pip install ./dist/lightgbm-$LGB_VER*.whl -v || exit 1
pytest ./tests || exit 1
exit 0
elif [[ $METHOD == "source" ]]; then
cmake -B build -S . -DUSE_MPI=ON -DUSE_DEBUG=ON
@ -279,22 +282,22 @@ fi
cmake --build build --target _lightgbm -j4 || exit 1
cd $BUILD_DIRECTORY && sh ./build-python.sh install --precompile --user || exit 1
pytest $BUILD_DIRECTORY/tests || exit 1
sh ./build-python.sh install --precompile || exit 1
pytest ./tests || exit 1
if [[ $TASK == "regular" ]]; then
if [[ $PRODUCES_ARTIFACTS == "true" ]]; then
if [[ $OS_NAME == "macos" ]]; then
cp $BUILD_DIRECTORY/lib_lightgbm.dylib $BUILD_ARTIFACTSTAGINGDIRECTORY/lib_lightgbm.dylib
cp ./lib_lightgbm.dylib $BUILD_ARTIFACTSTAGINGDIRECTORY/lib_lightgbm.dylib
else
if [[ $COMPILER == "gcc" ]]; then
objdump -T $BUILD_DIRECTORY/lib_lightgbm.so > $BUILD_DIRECTORY/objdump.log || exit 1
python $BUILD_DIRECTORY/helpers/check_dynamic_dependencies.py $BUILD_DIRECTORY/objdump.log || exit 1
objdump -T ./lib_lightgbm.so > ./objdump.log || exit 1
python ./helpers/check_dynamic_dependencies.py ./objdump.log || exit 1
fi
cp $BUILD_DIRECTORY/lib_lightgbm.so $BUILD_ARTIFACTSTAGINGDIRECTORY/lib_lightgbm.so
cp ./lib_lightgbm.so $BUILD_ARTIFACTSTAGINGDIRECTORY/lib_lightgbm.so
fi
fi
cd $BUILD_DIRECTORY/examples/python-guide
cd "$BUILD_DIRECTORY/examples/python-guide"
sed -i'.bak' '/import lightgbm as lgb/a\
import matplotlib\
matplotlib.use\(\"Agg\"\)\
@ -306,7 +309,7 @@ matplotlib.use\(\"Agg\"\)\
'ipywidgets>=8.1.2' \
'notebook>=7.1.2'
for f in *.py **/*.py; do python $f || exit 1; done # run all examples
cd $BUILD_DIRECTORY/examples/python-guide/notebooks
cd "$BUILD_DIRECTORY/examples/python-guide/notebooks"
sed -i'.bak' 's/INTERACTIVE = False/assert False, \\"Interactive mode disabled\\"/' interactive_plot_example.ipynb
jupyter nbconvert --ExecutePreprocessor.timeout=180 --to notebook --execute --inplace *.ipynb || exit 1 # run all notebooks

View file

@ -106,10 +106,10 @@ if [[ $OS_NAME == "macos" ]]; then
-target / || exit 1
fi
# fix for issue where CRAN was not returning {lattice} when using R 3.6
# fix for issue where CRAN was not returning {lattice} and {evaluate} when using R 3.6
# "Warning: dependency lattice is not available"
if [[ "${R_MAJOR_VERSION}" == "3" ]]; then
Rscript --vanilla -e "install.packages('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', repos = NULL, lib = '${R_LIB_PATH}')"
Rscript --vanilla -e "install.packages(c('https://cran.r-project.org/src/contrib/Archive/lattice/lattice_0.20-41.tar.gz', 'https://cran.r-project.org/src/contrib/Archive/evaluate/evaluate_0.23.tar.gz'), repos = NULL, lib = '${R_LIB_PATH}')"
else
# {Matrix} needs {lattice}, so this needs to run before manually installing {Matrix}.
# This should be unnecessary on R >=4.4.0
@ -136,7 +136,7 @@ if [[ $OS_NAME == "macos" ]]; then
fi
Rscript --vanilla -e "options(install.packages.compile.from.source = '${compile_from_source}'); install.packages(${packages}, repos = '${CRAN_MIRROR}', lib = '${R_LIB_PATH}', dependencies = c('Depends', 'Imports', 'LinkingTo'), Ncpus = parallel::detectCores())" || exit 1
cd ${BUILD_DIRECTORY}
cd "${BUILD_DIRECTORY}"
PKG_TARBALL="lightgbm_*.tar.gz"
LOG_FILE_NAME="lightgbm.Rcheck/00check.log"
@ -147,7 +147,7 @@ elif [[ $R_BUILD_TYPE == "cran" ]]; then
# on Linux, we recreate configure in CI to test if
# a change in a PR has changed configure.ac
if [[ $OS_NAME == "linux" ]]; then
${BUILD_DIRECTORY}/R-package/recreate-configure.sh
./R-package/recreate-configure.sh
num_files_changed=$(
git diff --name-only | wc -l

View file

@ -72,10 +72,14 @@ bytes_possibly_lost=$(
| tr -d ","
)
echo "valgrind found ${bytes_possibly_lost} bytes possibly lost"
if [[ ${bytes_possibly_lost} -gt 1056 ]]; then
if [[ ${bytes_possibly_lost} -gt 1104 ]]; then
exit 1
fi
# ensure 'grep --count' doesn't cause failures
set +e
echo "checking for invalid reads"
invalid_reads=$(
cat ${VALGRIND_LOGS_FILE} \
| grep --count -i "Invalid read"
@ -85,6 +89,7 @@ if [[ ${invalid_reads} -gt 0 ]]; then
exit 1
fi
echo "checking for invalid writes"
invalid_writes=$(
cat ${VALGRIND_LOGS_FILE} \
| grep --count -i "Invalid write"
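For context (a sketch, not part of this diff): the 'set +e' guard added above exists because 'grep --count' exits with status 1 when it finds zero matches, which would abort a script running under 'set -e'. A minimal illustration of the pattern, using a hypothetical empty log file:
# not part of the diff: demonstrates why 'set +e' wraps the grep counts above
set -e
touch empty.log                                      # hypothetical log file with no matches
set +e                                               # grep --count exits 1 when it finds 0 matches
matches=$(grep --count -i "Invalid read" empty.log)
set -e
echo "found ${matches} matches"                      # prints 'found 0 matches' instead of aborting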

View file

@ -6,14 +6,11 @@ function Check-Output {
}
}
# unify environment variable for Azure DevOps and AppVeyor
if (Test-Path env:APPVEYOR) {
$env:APPVEYOR = "true"
$env:ALLOW_SKIP_ARROW_TESTS = "1"
}
$env:CONDA_ENV = "test-env"
$env:LGB_VER = (Get-Content $env:BUILD_SOURCESDIRECTORY\VERSION.txt).trim()
if ($env:TASK -eq "r-package") {
& $env:BUILD_SOURCESDIRECTORY\.ci\test_r_package_windows.ps1 ; Check-Output $?
& .\.ci\test_r_package_windows.ps1 ; Check-Output $?
Exit 0
}
@ -34,7 +31,7 @@ if ($env:TASK -eq "swig") {
cmake -B build -S . -A x64 -DUSE_SWIG=ON ; Check-Output $?
cmake --build build --target ALL_BUILD --config Release ; Check-Output $?
if ($env:AZURE -eq "true") {
cp $env:BUILD_SOURCESDIRECTORY/build/lightgbmlib.jar $env:BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_win.jar ; Check-Output $?
cp ./build/lightgbmlib.jar $env:BUILD_ARTIFACTSTAGINGDIRECTORY/lightgbmlib_win.jar ; Check-Output $?
}
Exit 0
}
@ -43,16 +40,12 @@ if ($env:TASK -eq "swig") {
conda init powershell
conda activate
conda config --set always_yes yes --set changeps1 no
# ref:
# * https://stackoverflow.com/a/62897729/3986677
# * https://github.com/microsoft/LightGBM/issues/5899
conda install "brotlipy>=0.7"
conda update -q -y conda
conda update -q -y conda "python=$env:PYTHON_VERSION[build=*cpython]"
if ($env:PYTHON_VERSION -eq "3.7") {
$env:CONDA_REQUIREMENT_FILE = "$env:BUILD_SOURCESDIRECTORY/.ci/conda-envs/ci-core-py37.txt"
} elseif ($env:PYTHON_VERSION -eq "3.8") {
$env:CONDA_REQUIREMENT_FILE = "$env:BUILD_SOURCESDIRECTORY/.ci/conda-envs/ci-core-py38.txt"
} else {
$env:CONDA_REQUIREMENT_FILE = "$env:BUILD_SOURCESDIRECTORY/.ci/conda-envs/ci-core.txt"
}
@ -67,18 +60,17 @@ if ($env:TASK -ne "bdist") {
conda activate $env:CONDA_ENV
}
cd $env:BUILD_SOURCESDIRECTORY
if ($env:TASK -eq "regular") {
cmake -B build -S . -A x64 ; Check-Output $?
cmake --build build --target ALL_BUILD --config Release ; Check-Output $?
cd $env:BUILD_SOURCESDIRECTORY
sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install --precompile ; Check-Output $?
cp $env:BUILD_SOURCESDIRECTORY/Release/lib_lightgbm.dll $env:BUILD_ARTIFACTSTAGINGDIRECTORY
cp $env:BUILD_SOURCESDIRECTORY/Release/lightgbm.exe $env:BUILD_ARTIFACTSTAGINGDIRECTORY
sh ./build-python.sh install --precompile ; Check-Output $?
cp ./Release/lib_lightgbm.dll $env:BUILD_ARTIFACTSTAGINGDIRECTORY
cp ./Release/lightgbm.exe $env:BUILD_ARTIFACTSTAGINGDIRECTORY
}
elseif ($env:TASK -eq "sdist") {
cd $env:BUILD_SOURCESDIRECTORY
sh $env:BUILD_SOURCESDIRECTORY/build-python.sh sdist ; Check-Output $?
sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/dist ; Check-Output $?
sh ./build-python.sh sdist ; Check-Output $?
sh ./.ci/check_python_dists.sh ./dist ; Check-Output $?
cd dist; pip install @(Get-ChildItem *.gz) -v ; Check-Output $?
}
elseif ($env:TASK -eq "bdist") {
@ -92,17 +84,15 @@ elseif ($env:TASK -eq "bdist") {
Get-ItemProperty -Path Registry::HKEY_LOCAL_MACHINE\SOFTWARE\Khronos\OpenCL\Vendors
conda activate $env:CONDA_ENV
cd $env:BUILD_SOURCESDIRECTORY
sh "build-python.sh" bdist_wheel --integrated-opencl ; Check-Output $?
sh $env:BUILD_SOURCESDIRECTORY/.ci/check_python_dists.sh $env:BUILD_SOURCESDIRECTORY/dist ; Check-Output $?
cd dist; pip install --user @(Get-ChildItem *py3-none-win_amd64.whl) ; Check-Output $?
sh ./.ci/check_python_dists.sh ./dist ; Check-Output $?
cd dist; pip install @(Get-ChildItem *py3-none-win_amd64.whl) ; Check-Output $?
cp @(Get-ChildItem *py3-none-win_amd64.whl) $env:BUILD_ARTIFACTSTAGINGDIRECTORY
} elseif (($env:APPVEYOR -eq "true") -and ($env:TASK -eq "python")) {
cd $env:BUILD_SOURCESDIRECTORY
if ($env:COMPILER -eq "MINGW") {
sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install --user --mingw ; Check-Output $?
sh ./build-python.sh install --mingw ; Check-Output $?
} else {
sh $env:BUILD_SOURCESDIRECTORY/build-python.sh install --user; Check-Output $?
sh ./build-python.sh install; Check-Output $?
}
}

140
.github/workflows/cuda.yml vendored
View file

@ -7,54 +7,41 @@ on:
pull_request:
branches:
- master
- release/*
# Run manually by clicking a button in the UI
workflow_dispatch:
inputs:
restart_docker:
description: 'Restart nvidia-docker on the runner before building?'
required: true
type: boolean
default: false
# automatically cancel in-progress builds if another commit is pushed
concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true
env:
github_actions: 'true'
os_name: linux
conda_env: test-env
jobs:
test:
name: ${{ matrix.task }} ${{ matrix.cuda_version }} ${{ matrix.method }} (linux, ${{ matrix.compiler }}, Python ${{ matrix.python_version }})
# Optionally reinstall + restart docker on the runner before building.
# This is safe as long as only 1 of these jobs runs at a time.
restart-docker:
name: set up docker
runs-on: [self-hosted, linux]
timeout-minutes: 60
strategy:
fail-fast: false
matrix:
include:
- method: wheel
compiler: gcc
python_version: "3.11"
cuda_version: "11.8.0"
task: cuda
- method: source
compiler: gcc
python_version: "3.9"
cuda_version: "12.2.0"
task: cuda
- method: pip
compiler: clang
python_version: "3.10"
cuda_version: "11.8.0"
task: cuda
timeout-minutes: 30
steps:
- name: Setup or update software on host machine
if: ${{ inputs.restart_docker }}
run: |
# install core packages
sudo apt-get update
sudo apt-get install --no-install-recommends -y \
apt-transport-https \
ca-certificates \
curl \
git \
gnupg-agent \
lsb-release \
software-properties-common
# set up nvidia-docker
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add -
sudo add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $(lsb_release -cs) stable" -y
curl -sL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
@ -67,43 +54,76 @@ jobs:
nvidia-docker2
sudo chmod a+rw /var/run/docker.sock
sudo systemctl restart docker
- name: Remove old folder with repository
run: sudo rm -rf $GITHUB_WORKSPACE
- name: mark job successful
run: |
exit 0
test:
name: ${{ matrix.task }} ${{ matrix.cuda_version }} ${{ matrix.method }} (${{ matrix.linux_version }}, ${{ matrix.compiler }}, Python ${{ matrix.python_version }})
runs-on: [self-hosted, linux]
needs: [restart-docker]
container:
image: nvcr.io/nvidia/cuda:${{ matrix.cuda_version }}-devel-${{ matrix.linux_version }}
env:
CMAKE_BUILD_PARALLEL_LEVEL: 4
COMPILER: ${{ matrix.compiler }}
CONDA: /tmp/miniforge
DEBIAN_FRONTEND: noninteractive
METHOD: ${{ matrix.method }}
OS_NAME: linux
PYTHON_VERSION: ${{ matrix.python_version }}
TASK: ${{ matrix.task }}
SKBUILD_STRICT_CONFIG: true
options: --gpus all
timeout-minutes: 30
strategy:
fail-fast: false
matrix:
include:
- method: wheel
compiler: gcc
python_version: "3.10"
cuda_version: "11.8.0"
linux_version: "ubuntu20.04"
task: cuda
- method: source
compiler: gcc
python_version: "3.12"
cuda_version: "12.2.0"
linux_version: "ubuntu22.04"
task: cuda
- method: pip
compiler: clang
python_version: "3.11"
cuda_version: "11.8.0"
linux_version: "ubuntu20.04"
task: cuda
steps:
- name: Install latest git
run: |
apt-get update
apt-get install --no-install-recommends -y \
ca-certificates \
software-properties-common
add-apt-repository ppa:git-core/ppa -y
apt-get update
apt-get install --no-install-recommends -y \
git
- name: Checkout repository
uses: actions/checkout@v1
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true
- name: Setup and run tests
run: |
export ROOT_DOCKER_FOLDER=/LightGBM
cat > docker.env <<EOF
GITHUB_ACTIONS=${{ env.github_actions }}
OS_NAME=${{ env.os_name }}
COMPILER=${{ matrix.compiler }}
TASK=${{ matrix.task }}
METHOD=${{ matrix.method }}
CONDA_ENV=${{ env.conda_env }}
PYTHON_VERSION=${{ matrix.python_version }}
BUILD_DIRECTORY=$ROOT_DOCKER_FOLDER
LGB_VER=$(head -n 1 VERSION.txt)
EOF
cat > docker-script.sh <<EOF
export CONDA=\$HOME/miniforge
export PATH=\$CONDA/bin:\$PATH
nvidia-smi
$ROOT_DOCKER_FOLDER/.ci/setup.sh || exit 1
$ROOT_DOCKER_FOLDER/.ci/test.sh || exit 1
EOF
cuda_version="${{ matrix.cuda_version }}"
cuda_major=${cuda_version%%.*}
docker_img="nvcr.io/nvidia/cuda:${cuda_version}-devel"
if [[ ${cuda_major} -eq 11 ]]; then
docker_img="${docker_img}-ubuntu18.04"
elif [[ ${cuda_major} -ge 12 ]]; then
docker_img="${docker_img}-ubuntu20.04"
fi
docker run --env-file docker.env -v "$GITHUB_WORKSPACE":"$ROOT_DOCKER_FOLDER" --rm --gpus all "$docker_img" /bin/bash $ROOT_DOCKER_FOLDER/docker-script.sh
export BUILD_DIRECTORY="$GITHUB_WORKSPACE"
export PATH=$CONDA/bin:$PATH
# check GPU usage
nvidia-smi
# build and test
$GITHUB_WORKSPACE/.ci/setup.sh
$GITHUB_WORKSPACE/.ci/test.sh
all-cuda-jobs-successful:
if: always()
runs-on: ubuntu-latest

7
.github/workflows/linkchecker.yml vendored
View file

@ -8,10 +8,9 @@ on:
- cron: '0 8 * * *'
env:
CONDA_ENV: test-env
GITHUB_ACTIONS: 'true'
COMPILER: gcc
OS_NAME: 'linux'
PYTHON_VERSION: '3.11'
PYTHON_VERSION: '3.12'
TASK: 'check-links'
jobs:
@ -20,7 +19,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: false

3
.github/workflows/optional_checks.yml vendored
View file

@ -4,7 +4,6 @@ on:
pull_request:
branches:
- master
- release/*
jobs:
all-optional-checks-successful:
@ -12,7 +11,7 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: false

80
.github/workflows/python_package.yml vendored
View file

@ -7,7 +7,6 @@ on:
pull_request:
branches:
- master
- release/*
# automatically cancel in-progress builds if another commit is pushed
concurrency:
@ -15,8 +14,8 @@ concurrency:
cancel-in-progress: true
env:
CONDA_ENV: test-env
GITHUB_ACTIONS: 'true'
CMAKE_BUILD_PARALLEL_LEVEL: 4
SKBUILD_STRICT_CONFIG: true
jobs:
test:
@ -29,33 +28,37 @@ jobs:
include:
- os: macos-13
task: regular
python_version: '3.9'
- os: macos-13
task: sdist
python_version: '3.10'
- os: macos-13
task: sdist
python_version: '3.11'
- os: macos-13
task: bdist
python_version: '3.7'
python_version: '3.8'
- os: macos-13
task: if-else
python_version: '3.9'
- os: macos-14
task: bdist
method: wheel
python_version: '3.10'
# We're currently skipping MPI jobs on macOS, see https://github.com/microsoft/LightGBM/pull/6425
# for further details.
# - os: macos-13
# task: mpi
# method: source
# python_version: '3.10'
# - os: macos-13
# task: mpi
# method: pip
# python_version: '3.11'
# - os: macos-13
# task: mpi
# method: pip
# python_version: '3.12'
# - os: macos-13
# task: mpi
# method: wheel
# python_version: '3.8'
# python_version: '3.9'
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true
@ -65,7 +68,11 @@ jobs:
export TASK="${{ matrix.task }}"
export METHOD="${{ matrix.method }}"
export PYTHON_VERSION="${{ matrix.python_version }}"
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
if [[ "${{ matrix.os }}" == "macos-14" ]]; then
# use clang when creating macOS release artifacts
export COMPILER="clang"
export OS_NAME="macos"
elif [[ "${{ matrix.os }}" == "macos-13" ]]; then
export COMPILER="gcc"
export OS_NAME="macos"
elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
@ -73,18 +80,23 @@ jobs:
export OS_NAME="linux"
fi
export BUILD_DIRECTORY="$GITHUB_WORKSPACE"
export LGB_VER=$(head -n 1 VERSION.txt)
export CONDA=${HOME}/miniforge
export PATH=${CONDA}/bin:${PATH}
$GITHUB_WORKSPACE/.ci/setup.sh || exit 1
$GITHUB_WORKSPACE/.ci/test.sh || exit 1
test-oldest-versions:
name: Python - oldest supported versions (ubuntu-latest)
- name: upload wheels
if: ${{ matrix.method == 'wheel' && matrix.os == 'macos-14' }}
uses: actions/upload-artifact@v4
with:
name: macosx-arm64-wheel
path: dist/*.whl
test-latest-versions:
name: Python - latest versions (ubuntu-latest)
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true
@ -92,6 +104,7 @@ jobs:
run: |
docker run \
--rm \
--env CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
-v $(pwd):/opt/lgb-build \
-w /opt/lgb-build \
lightgbm/vsts-agent:manylinux_2_28_x86_64 \
@ -102,12 +115,39 @@ jobs:
--rm \
-v $(pwd):/opt/lgb-build \
-w /opt/lgb-build \
python:3.6 \
python:3.11 \
/bin/bash ./.ci/test-python-latest.sh
test-oldest-versions:
name: Python - oldest supported versions (ubuntu-latest)
runs-on: ubuntu-latest
timeout-minutes: 60
steps:
- name: Checkout repository
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true
- name: Create wheel
run: |
docker run \
--rm \
--env CMAKE_BUILD_PARALLEL_LEVEL=${{ env.CMAKE_BUILD_PARALLEL_LEVEL }} \
-v $(pwd):/opt/lgb-build \
-w /opt/lgb-build \
lightgbm/vsts-agent:manylinux_2_28_x86_64 \
/bin/bash -c 'PATH=/opt/miniforge/bin:$PATH sh ./build-python.sh bdist_wheel --nomp'
- name: Test compatibility
run: |
docker run \
--rm \
-v $(pwd):/opt/lgb-build \
-w /opt/lgb-build \
python:3.7 \
/bin/bash ./.ci/test-python-oldest.sh
all-python-package-jobs-successful:
if: always()
runs-on: ubuntu-latest
needs: [test, test-oldest-versions]
needs: [test, test-latest-versions, test-oldest-versions]
steps:
- name: Note that all tests succeeded
uses: re-actors/alls-green@v1.2.2

2
.github/workflows/r_configure.yml vendored
View file

@ -21,7 +21,7 @@ jobs:
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true

8
.github/workflows/r_package.yml vendored
View file

@ -7,7 +7,6 @@ on:
pull_request:
branches:
- master
- release/*
# automatically cancel in-progress builds if another commit is pushed
concurrency:
@ -15,6 +14,7 @@ concurrency:
cancel-in-progress: true
env:
CMAKE_BUILD_PARALLEL_LEVEL: 4
# hack to get around this:
# https://stat.ethz.ch/pipermail/r-package-devel/2020q3/005930.html
_R_CHECK_SYSTEM_CLOCK_: 0
@ -189,7 +189,6 @@ jobs:
run: |
export TASK="${{ matrix.task }}"
export COMPILER="${{ matrix.compiler }}"
export GITHUB_ACTIONS="true"
if [[ "${{ matrix.os }}" == "macos-13" ]]; then
export OS_NAME="macos"
elif [[ "${{ matrix.os }}" == "ubuntu-latest" ]]; then
@ -216,7 +215,6 @@ jobs:
$env:R_VERSION = "${{ matrix.r_version }}"
$env:R_BUILD_TYPE = "${{ matrix.build_type }}"
$env:COMPILER = "${{ matrix.compiler }}"
$env:GITHUB_ACTIONS = "true"
$env:TASK = "${{ matrix.task }}"
& "$env:GITHUB_WORKSPACE/.ci/test_windows.ps1"
test-r-sanitizers:
@ -237,7 +235,7 @@ jobs:
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true
@ -280,7 +278,7 @@ jobs:
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true

2
.github/workflows/r_valgrind.yml vendored
View file

@ -24,7 +24,7 @@ jobs:
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true

9
.github/workflows/static_analysis.yml vendored
View file

@ -9,7 +9,6 @@ on:
pull_request:
branches:
- master
- release/*
# automatically cancel in-progress builds if another commit is pushed
concurrency:
@ -18,10 +17,8 @@ concurrency:
env:
COMPILER: 'gcc'
CONDA_ENV: test-env
GITHUB_ACTIONS: 'true'
OS_NAME: 'linux'
PYTHON_VERSION: '3.11'
PYTHON_VERSION: '3.12'
jobs:
test:
@ -36,7 +33,7 @@ jobs:
- task: check-docs
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: false
@ -59,7 +56,7 @@ jobs:
run: |
git config --global --add safe.directory "${GITHUB_WORKSPACE}"
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: true

2
.github/workflows/triggering_comments.yml vendored
View file

@ -12,7 +12,7 @@ jobs:
SECRETS_WORKFLOW: ${{ secrets.WORKFLOW }}
steps:
- name: Checkout repository
uses: actions/checkout@v3
uses: actions/checkout@v4
with:
fetch-depth: 5
submodules: false

2
.gitignore vendored
View file

@ -405,7 +405,7 @@ python-package/lightgbm/VERSION.txt
# R build artefacts
**/autom4te.cache/
conftest*
R-package/conftest*
R-package/config.status
!R-package/data/agaricus.test.rda
!R-package/data/agaricus.train.rda

View file

@ -1,5 +1,6 @@
# coding: utf-8
"""Script for generating files with NuGet package metadata."""
import datetime
import sys
from pathlib import Path

View file

@ -13,7 +13,7 @@ exclude: |
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.5.0
rev: v4.6.0
hooks:
- id: end-of-file-fixer
- id: trailing-whitespace
@ -25,7 +25,7 @@ repos:
args: ["--settings-path", "python-package/pyproject.toml"]
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.2.1
rev: v0.4.7
hooks:
# Run the linter.
- id: ruff

View file

@ -7,15 +7,15 @@ trigger:
- v*
pr:
- master
- release/*
variables:
AZURE: 'true'
PYTHON_VERSION: '3.11'
CONDA_ENV: test-env
CMAKE_BUILD_PARALLEL_LEVEL: 4
PYTHON_VERSION: '3.12'
runCodesignValidationInjection: false
skipComponentGovernanceDetection: true
DOTNET_CLI_TELEMETRY_OPTOUT: true
DOTNET_SKIP_FIRST_TIME_EXPERIENCE: true
SKBUILD_STRICT_CONFIG: true
resources:
# The __work/ directory, where Azure DevOps writes the source files, needs to be read-write because
# LightGBM's CI jobs write files in the source directory.
@ -61,19 +61,19 @@ jobs:
matrix:
regular:
TASK: regular
PYTHON_VERSION: '3.9'
PYTHON_VERSION: '3.10'
sdist:
TASK: sdist
PYTHON_VERSION: '3.7'
PYTHON_VERSION: '3.8'
bdist:
TASK: bdist
PYTHON_VERSION: '3.8'
PYTHON_VERSION: '3.9'
inference:
TASK: if-else
mpi_source:
TASK: mpi
METHOD: source
PYTHON_VERSION: '3.8'
PYTHON_VERSION: '3.9'
gpu_source:
TASK: gpu
METHOD: source
@ -82,7 +82,6 @@ jobs:
steps:
- script: |
echo "##vso[task.setvariable variable=BUILD_DIRECTORY]$BUILD_SOURCESDIRECTORY"
echo "##vso[task.setvariable variable=LGB_VER]$(head -n 1 VERSION.txt)"
echo "##vso[task.prependpath]/usr/lib64/openmpi/bin"
echo "##vso[task.prependpath]$CONDA/bin"
displayName: 'Set variables'
@ -127,7 +126,7 @@ jobs:
TASK: sdist
bdist:
TASK: bdist
PYTHON_VERSION: '3.9'
PYTHON_VERSION: '3.10'
inference:
TASK: if-else
mpi_source:
@ -136,30 +135,29 @@ jobs:
mpi_pip:
TASK: mpi
METHOD: pip
PYTHON_VERSION: '3.10'
PYTHON_VERSION: '3.11'
mpi_wheel:
TASK: mpi
METHOD: wheel
PYTHON_VERSION: '3.8'
PYTHON_VERSION: '3.9'
gpu_source:
TASK: gpu
METHOD: source
PYTHON_VERSION: '3.10'
PYTHON_VERSION: '3.11'
gpu_pip:
TASK: gpu
METHOD: pip
PYTHON_VERSION: '3.9'
PYTHON_VERSION: '3.10'
gpu_wheel:
TASK: gpu
METHOD: wheel
PYTHON_VERSION: '3.8'
PYTHON_VERSION: '3.9'
cpp_tests:
TASK: cpp-tests
METHOD: with-sanitizers
steps:
- script: |
echo "##vso[task.setvariable variable=BUILD_DIRECTORY]$BUILD_SOURCESDIRECTORY"
echo "##vso[task.setvariable variable=LGB_VER]$(head -n 1 VERSION.txt)"
CONDA=$HOME/miniforge
echo "##vso[task.setvariable variable=CONDA]$CONDA"
echo "##vso[task.prependpath]$CONDA/bin"
@ -188,8 +186,8 @@ jobs:
- job: QEMU_multiarch
###########################################
variables:
BUILD_DIRECTORY: /LightGBM
COMPILER: gcc
OS_NAME: 'linux'
PRODUCES_ARTIFACTS: 'true'
pool:
vmImage: ubuntu-22.04
@ -215,26 +213,12 @@ jobs:
git clean -d -f -x
displayName: 'Clean source directory'
- script: |
export ROOT_DOCKER_FOLDER=/LightGBM
cat > docker.env <<EOF
AZURE=$AZURE
OS_NAME=$OS_NAME
COMPILER=$COMPILER
TASK=$TASK
METHOD=$METHOD
CONDA_ENV=$CONDA_ENV
PYTHON_VERSION=$PYTHON_VERSION
BUILD_DIRECTORY=$ROOT_DOCKER_FOLDER
LGB_VER=$(head -n 1 VERSION.txt)
PRODUCES_ARTIFACTS=$PRODUCES_ARTIFACTS
BUILD_ARTIFACTSTAGINGDIRECTORY=$BUILD_ARTIFACTSTAGINGDIRECTORY
EOF
cat > docker-script.sh <<EOF
export CONDA=\$HOME/miniforge
export PATH=\$CONDA/bin:/opt/rh/llvm-toolset-7.0/root/usr/bin:\$PATH
export LD_LIBRARY_PATH=/opt/rh/llvm-toolset-7.0/root/usr/lib64:\$LD_LIBRARY_PATH
$ROOT_DOCKER_FOLDER/.ci/setup.sh || exit 1
$ROOT_DOCKER_FOLDER/.ci/test.sh || exit 1
\$BUILD_DIRECTORY/.ci/setup.sh || exit 1
\$BUILD_DIRECTORY/.ci/test.sh || exit 1
EOF
IMAGE_URI="lightgbm/vsts-agent:manylinux2014_aarch64"
docker pull "${IMAGE_URI}" || exit 1
@ -243,11 +227,19 @@ jobs:
docker run \
--platform "${PLATFORM}" \
--rm \
--env-file docker.env \
-v "$(Build.SourcesDirectory)":"$ROOT_DOCKER_FOLDER" \
--env AZURE=true \
--env BUILD_ARTIFACTSTAGINGDIRECTORY=$BUILD_ARTIFACTSTAGINGDIRECTORY \
--env BUILD_DIRECTORY=$BUILD_DIRECTORY \
--env COMPILER=$COMPILER \
--env METHOD=$METHOD \
--env OS_NAME=linux \
--env PRODUCES_ARTIFACTS=$PRODUCES_ARTIFACTS \
--env PYTHON_VERSION=$PYTHON_VERSION \
--env TASK=$TASK \
-v "$(Build.SourcesDirectory)":"$BUILD_DIRECTORY" \
-v "$(Build.ArtifactStagingDirectory)":"$(Build.ArtifactStagingDirectory)" \
"${IMAGE_URI}" \
/bin/bash $ROOT_DOCKER_FOLDER/docker-script.sh
/bin/bash $BUILD_DIRECTORY/docker-script.sh
displayName: 'Setup and run tests'
- task: PublishBuildArtifacts@1
condition: and(succeeded(), in(variables['TASK'], 'bdist'), not(startsWith(variables['Build.SourceBranch'], 'refs/pull/')))
@ -263,7 +255,7 @@ jobs:
OS_NAME: 'macos'
PRODUCES_ARTIFACTS: 'true'
pool:
vmImage: 'macOS-11'
vmImage: 'macOS-12'
strategy:
matrix:
regular:
@ -283,7 +275,6 @@ jobs:
steps:
- script: |
echo "##vso[task.setvariable variable=BUILD_DIRECTORY]$BUILD_SOURCESDIRECTORY"
echo "##vso[task.setvariable variable=LGB_VER]$(head -n 1 VERSION.txt)"
CONDA=$AGENT_HOMEDIRECTORY/miniforge
echo "##vso[task.setvariable variable=CONDA]$CONDA"
echo "##vso[task.prependpath]$CONDA/bin"

View file

@ -25,6 +25,14 @@ option(__INTEGRATE_OPENCL "Set to ON if building LightGBM with the OpenCL ICD Lo
cmake_minimum_required(VERSION 3.18)
# If using Visual Studio generators, always target v10.x of the Windows SDK.
# Doing this avoids lookups that could fall back to very old versions, e.g. by finding
# outdated registry entries.
# ref: https://cmake.org/cmake/help/latest/variable/CMAKE_VS_WINDOWS_TARGET_PLATFORM_VERSION.html
if(CMAKE_GENERATOR MATCHES "Visual Studio")
set(CMAKE_SYSTEM_VERSION 10.0 CACHE INTERNAL "target Windows SDK version" FORCE)
endif()
project(lightgbm LANGUAGES C CXX)
if(BUILD_CPP_TEST)
@ -704,6 +712,83 @@ if(__BUILD_FOR_PYTHON)
set(CMAKE_INSTALL_PREFIX "lightgbm")
endif()
# The macOS linker puts an absolute path to linked libraries in lib_lightgbm.dylib.
# This block overrides that information for LightGBM's OpenMP dependency, to allow
# finding that library in more places.
#
# This reduces the risk of runtime issues resulting from multiple libomp.dylib being loaded.
#
if(APPLE AND USE_OPENMP)
# store path to libomp found at build time in a variable
get_target_property(
OpenMP_LIBRARY_LOCATION
OpenMP::OpenMP_CXX
INTERFACE_LINK_LIBRARIES
)
# get just the filename of that path
# (to deal with the possibility that it might be 'libomp.dylib' or 'libgomp.dylib' or 'libiomp.dylib')
get_filename_component(
OpenMP_LIBRARY_NAME
${OpenMP_LIBRARY_LOCATION}
NAME
)
# get directory of that path
get_filename_component(
OpenMP_LIBRARY_DIR
${OpenMP_LIBRARY_LOCATION}
DIRECTORY
)
# get exact name of the library in a variable
get_target_property(
__LIB_LIGHTGBM_OUTPUT_NAME
_lightgbm
OUTPUT_NAME
)
if(NOT __LIB_LIGHTGBM_OUTPUT_NAME)
set(__LIB_LIGHTGBM_OUTPUT_NAME "lib_lightgbm")
endif()
if(CMAKE_SHARED_LIBRARY_SUFFIX_CXX)
set(
__LIB_LIGHTGBM_FILENAME "${__LIB_LIGHTGBM_OUTPUT_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX_CXX}"
CACHE INTERNAL "lightgbm shared library filename"
)
else()
set(
__LIB_LIGHTGBM_FILENAME "${__LIB_LIGHTGBM_OUTPUT_NAME}.dylib"
CACHE INTERNAL "lightgbm shared library filename"
)
endif()
# Override the absolute path to OpenMP with a relative one using @rpath.
#
# This also ensures that if a libomp.dylib has already been loaded, it'll just use that.
add_custom_command(
TARGET _lightgbm
POST_BUILD
COMMAND
install_name_tool
-change
${OpenMP_LIBRARY_LOCATION}
"@rpath/${OpenMP_LIBRARY_NAME}"
"${__LIB_LIGHTGBM_FILENAME}"
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
COMMENT "Replacing hard-coded OpenMP install_name with '@rpath/${OpenMP_LIBRARY_NAME}'..."
)
# add RPATH entries to ensure the loader looks in the following, in the following order:
#
# - /opt/homebrew/opt/libomp/lib (where 'brew install' / 'brew link' puts libomp.dylib)
# - ${OpenMP_LIBRARY_DIR} (wherever find_package(OpenMP) found OpenMP at build time)
#
set_target_properties(
_lightgbm
PROPERTIES
BUILD_WITH_INSTALL_RPATH TRUE
INSTALL_RPATH "/opt/homebrew/opt/libomp/lib;${OpenMP_LIBRARY_DIR}"
INSTALL_RPATH_USE_LINK_PATH FALSE
)
endif()
install(
TARGETS _lightgbm
RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
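For context (a sketch, not part of this diff): one plausible way to check the effect of the install_name_tool/RPATH block above on a locally built lib_lightgbm.dylib, assuming macOS with the standard developer tools and the library in the current directory:
# not part of the diff: inspect the built library's load commands
otool -L lib_lightgbm.dylib | grep -i omp          # expect '@rpath/libomp.dylib ...' rather than an absolute path
otool -l lib_lightgbm.dylib | grep -A 2 LC_RPATH   # expect '/opt/homebrew/opt/libomp/lib' among the rpath entries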

View file

@ -1309,6 +1309,8 @@ lgb.load <- function(filename = NULL, model_str = NULL) {
#' For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
#' means "save the fifth, sixth, and seventh tree"
#'
#' \emph{New in version 4.4.0}
#'
#' @return lgb.Booster
#'
#' @examples
@ -1373,6 +1375,8 @@ lgb.save <- function(
#' For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
#' means "dump the fifth, sixth, and seventh tree"
#'
#' \emph{New in version 4.4.0}
#'
#' @return json format of model
#'
#' @examples

View file

@ -170,7 +170,12 @@ Dataset <- R6::R6Class(
# Check if more categorical features were output over the feature space
data_is_not_filename <- !is.character(private$raw_data)
if (data_is_not_filename && max(private$categorical_feature) > ncol(private$raw_data)) {
if (
data_is_not_filename
&& !is.null(private$raw_data)
&& is.null(private$used_indices)
&& max(private$categorical_feature) > ncol(private$raw_data)
) {
stop(
"lgb.Dataset.construct: supplied a too large value in categorical_feature: "
, max(private$categorical_feature)
@ -1049,6 +1054,9 @@ dimnames.lgb.Dataset <- function(x) {
#' @title Slice a dataset
#' @description Get a new \code{lgb.Dataset} containing the specified rows of
#' original \code{lgb.Dataset} object
#'
#' \emph{Renamed from} \code{slice()} \emph{in 4.4.0}
#'
#' @param dataset Object of class \code{lgb.Dataset}
#' @param idxset an integer vector of indices of rows needed
#' @return constructed sub dataset

View file

@ -6,6 +6,9 @@
#' @param start_iteration Index (1-based) of the first boosting round to include in the output.
#' For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
#' means "return information about the fifth, sixth, and seventh trees".
#'
#' \emph{New in version 4.4.0}
#'
#' @return
#' A \code{data.table} with detailed information about model trees' nodes and leafs.
#'

View file

@ -59,68 +59,66 @@
}
# [description]
#
# Besides applying checks, this function
#
# 1. turns feature *names* into 1-based integer positions, then
# 2. adds an extra list element with skipped features, then
# 3. turns 1-based integer positions into 0-based positions, and finally
# 4. collapses the values of each list element into a string like "[0, 1]".
#
.check_interaction_constraints <- function(interaction_constraints, column_names) {
# Convert interaction constraints to feature numbers
string_constraints <- list()
if (!is.null(interaction_constraints)) {
if (!methods::is(interaction_constraints, "list")) {
stop("interaction_constraints must be a list")
}
constraint_is_character_or_numeric <- sapply(
X = interaction_constraints
, FUN = function(x) {
return(is.character(x) || is.numeric(x))
}
)
if (!all(constraint_is_character_or_numeric)) {
stop("every element in interaction_constraints must be a character vector or numeric vector")
}
for (constraint in interaction_constraints) {
# Check for character name
if (is.character(constraint)) {
constraint_indices <- as.integer(match(constraint, column_names) - 1L)
# Provided indices, but some indices are not existing?
if (sum(is.na(constraint_indices)) > 0L) {
stop(
"supplied an unknown feature in interaction_constraints "
, sQuote(constraint[is.na(constraint_indices)])
)
}
} else {
# Check that constraint indices are at most number of features
if (max(constraint) > length(column_names)) {
stop(
"supplied a too large value in interaction_constraints: "
, max(constraint)
, " but only "
, length(column_names)
, " features"
)
}
# Store indices as [0, n-1] indexed instead of [1, n] indexed
constraint_indices <- as.integer(constraint - 1L)
}
# Convert constraint to string
constraint_string <- paste0("[", paste0(constraint_indices, collapse = ","), "]")
string_constraints <- append(string_constraints, constraint_string)
}
if (is.null(interaction_constraints)) {
return(list())
}
if (!identical(class(interaction_constraints), "list")) {
stop("interaction_constraints must be a list")
}
return(string_constraints)
column_indices <- seq_along(column_names)
# Convert feature names to 1-based integer positions and apply checks
for (j in seq_along(interaction_constraints)) {
constraint <- interaction_constraints[[j]]
if (is.character(constraint)) {
constraint_indices <- match(constraint, column_names)
} else if (is.numeric(constraint)) {
constraint_indices <- as.integer(constraint)
} else {
stop("every element in interaction_constraints must be a character vector or numeric vector")
}
# Features outside range?
bad <- !(constraint_indices %in% column_indices)
if (any(bad)) {
stop(
"unknown feature(s) in interaction_constraints: "
, toString(sQuote(constraint[bad], q = "'"))
)
}
interaction_constraints[[j]] <- constraint_indices
}
# Add missing features as new interaction set
remaining_indices <- setdiff(
column_indices, sort(unique(unlist(interaction_constraints)))
)
if (length(remaining_indices) > 0L) {
interaction_constraints <- c(
interaction_constraints, list(remaining_indices)
)
}
# Turn indices 0-based and convert to string
for (j in seq_along(interaction_constraints)) {
interaction_constraints[[j]] <- paste0(
"[", paste0(interaction_constraints[[j]] - 1L, collapse = ","), "]"
)
}
return(interaction_constraints)
}

18
R-package/configure vendored
View file

@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated by GNU Autoconf 2.71 for lightgbm 4.3.0.99.
# Generated by GNU Autoconf 2.71 for lightgbm 4.4.0.99.
#
#
# Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation,
@ -607,8 +607,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='lightgbm'
PACKAGE_TARNAME='lightgbm'
PACKAGE_VERSION='4.3.0.99'
PACKAGE_STRING='lightgbm 4.3.0.99'
PACKAGE_VERSION='4.4.0.99'
PACKAGE_STRING='lightgbm 4.4.0.99'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@ -1211,7 +1211,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
\`configure' configures lightgbm 4.3.0.99 to adapt to many kinds of systems.
\`configure' configures lightgbm 4.4.0.99 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@ -1273,7 +1273,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
short | recursive ) echo "Configuration of lightgbm 4.3.0.99:";;
short | recursive ) echo "Configuration of lightgbm 4.4.0.99:";;
esac
cat <<\_ACEOF
@ -1341,7 +1341,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
lightgbm configure 4.3.0.99
lightgbm configure 4.4.0.99
generated by GNU Autoconf 2.71
Copyright (C) 2021 Free Software Foundation, Inc.
@ -1378,7 +1378,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
It was created by lightgbm $as_me 4.3.0.99, which was
It was created by lightgbm $as_me 4.4.0.99, which was
generated by GNU Autoconf 2.71. Invocation command line was
$ $0$ac_configure_args_raw
@ -2454,7 +2454,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
This file was extended by lightgbm $as_me 4.3.0.99, which was
This file was extended by lightgbm $as_me 4.4.0.99, which was
generated by GNU Autoconf 2.71. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@ -2509,7 +2509,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config='$ac_cs_config_escaped'
ac_cs_version="\\
lightgbm config.status 4.3.0.99
lightgbm config.status 4.4.0.99
configured by $0, generated by GNU Autoconf 2.71,
with options \\"\$ac_cs_config\\"

Просмотреть файл

@ -1,5 +1,15 @@
# CRAN Submission History
## v4.4.0 - Submission 1 - (June 14, 2024)
### CRAN response
Accepted to CRAN
### Maintainer Notes
This was a standard release of `{lightgbm}`, not intended to fix any particular R-specific issues.
## v4.3.0 - Submission 1 - (January 18, 2024)
### CRAN response

Просмотреть файл

@ -12,8 +12,10 @@ lgb.dump(booster, num_iteration = NULL, start_iteration = 1L)
\item{num_iteration}{Number of iterations to be dumped. NULL or <= 0 means use best iteration}
\item{start_iteration}{Index (1-based) of the first boosting round to dump.
For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
means "dump the fifth, sixth, and seventh tree"}
For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
means "dump the fifth, sixth, and seventh tree"
\emph{New in version 4.4.0}}
}
\value{
json format of model

Просмотреть файл

@ -12,8 +12,10 @@ lgb.model.dt.tree(model, num_iteration = NULL, start_iteration = 1L)
\item{num_iteration}{Number of iterations to include. NULL or <= 0 means use best iteration.}
\item{start_iteration}{Index (1-based) of the first boosting round to include in the output.
For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
means "return information about the fifth, sixth, and seventh trees".}
For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
means "return information about the fifth, sixth, and seventh trees".
\emph{New in version 4.4.0}}
}
\value{
A \code{data.table} with detailed information about model trees' nodes and leafs.

Просмотреть файл

@ -14,8 +14,10 @@ lgb.save(booster, filename, num_iteration = NULL, start_iteration = 1L)
\item{num_iteration}{Number of iterations to save, NULL or <= 0 means use best iteration}
\item{start_iteration}{Index (1-based) of the first boosting round to save.
For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
means "save the fifth, sixth, and seventh tree"}
For example, passing \code{start_iteration=5, num_iteration=3} for a regression model
means "save the fifth, sixth, and seventh tree"
\emph{New in version 4.4.0}}
}
\value{
lgb.Booster

Просмотреть файл

@ -17,6 +17,8 @@ constructed sub dataset
\description{
Get a new \code{lgb.Dataset} containing the specified rows of
original \code{lgb.Dataset} object
\emph{Renamed from} \code{slice()} \emph{in 4.4.0}
}
\examples{
\donttest{

Просмотреть файл

@ -11,6 +11,7 @@
#include <LightGBM/utils/text_reader.h>
#include <R_ext/Rdynload.h>
#include <R_ext/Altrep.h>
#define R_NO_REMAP
#define R_USE_C99_IN_CXX
@ -24,6 +25,150 @@
#include <utility>
#include <vector>
#include <algorithm>
#include <type_traits>
R_altrep_class_t lgb_altrepped_char_vec;
R_altrep_class_t lgb_altrepped_int_arr;
R_altrep_class_t lgb_altrepped_dbl_arr;
template <class T>
void delete_cpp_array(SEXP R_ptr) {
T *ptr_to_cpp_obj = static_cast<T*>(R_ExternalPtrAddr(R_ptr));
delete[] ptr_to_cpp_obj;
R_ClearExternalPtr(R_ptr);
}
void delete_cpp_char_vec(SEXP R_ptr) {
std::vector<char> *ptr_to_cpp_obj = static_cast<std::vector<char>*>(R_ExternalPtrAddr(R_ptr));
delete ptr_to_cpp_obj;
R_ClearExternalPtr(R_ptr);
}
// Note: MSVC has issues with Altrep classes, so they are disabled for it.
// See: https://github.com/microsoft/LightGBM/pull/6213#issuecomment-2111025768
#ifdef _MSC_VER
# define LGB_NO_ALTREP
#endif
#ifndef LGB_NO_ALTREP
SEXP make_altrepped_raw_vec(void *void_ptr) {
std::unique_ptr<std::vector<char>> *ptr_to_cpp_vec = static_cast<std::unique_ptr<std::vector<char>>*>(void_ptr);
SEXP R_ptr = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP R_raw = Rf_protect(R_new_altrep(lgb_altrepped_char_vec, R_NilValue, R_NilValue));
R_SetExternalPtrAddr(R_ptr, ptr_to_cpp_vec->get());
R_RegisterCFinalizerEx(R_ptr, delete_cpp_char_vec, TRUE);
ptr_to_cpp_vec->release();
R_set_altrep_data1(R_raw, R_ptr);
Rf_unprotect(2);
return R_raw;
}
#else
SEXP make_r_raw_vec(void *void_ptr) {
std::unique_ptr<std::vector<char>> *ptr_to_cpp_vec = static_cast<std::unique_ptr<std::vector<char>>*>(void_ptr);
R_xlen_t len = ptr_to_cpp_vec->get()->size();
SEXP out = Rf_protect(Rf_allocVector(RAWSXP, len));
std::copy(ptr_to_cpp_vec->get()->begin(), ptr_to_cpp_vec->get()->end(), reinterpret_cast<char*>(RAW(out)));
Rf_unprotect(1);
return out;
}
#define make_altrepped_raw_vec make_r_raw_vec
#endif
std::vector<char>* get_ptr_from_altrepped_raw(SEXP R_raw) {
return static_cast<std::vector<char>*>(R_ExternalPtrAddr(R_altrep_data1(R_raw)));
}
R_xlen_t get_altrepped_raw_len(SEXP R_raw) {
return get_ptr_from_altrepped_raw(R_raw)->size();
}
const void* get_altrepped_raw_dataptr_or_null(SEXP R_raw) {
return get_ptr_from_altrepped_raw(R_raw)->data();
}
void* get_altrepped_raw_dataptr(SEXP R_raw, Rboolean writeable) {
return get_ptr_from_altrepped_raw(R_raw)->data();
}
#ifndef LGB_NO_ALTREP
template <class T>
R_altrep_class_t get_altrep_class_for_type() {
if (std::is_same<T, double>::value) {
return lgb_altrepped_dbl_arr;
} else {
return lgb_altrepped_int_arr;
}
}
#else
template <class T>
SEXPTYPE get_sexptype_class_for_type() {
if (std::is_same<T, double>::value) {
return REALSXP;
} else {
return INTSXP;
}
}
template <class T>
T* get_r_vec_ptr(SEXP x) {
if (std::is_same<T, double>::value) {
return static_cast<T*>(static_cast<void*>(REAL(x)));
} else {
return static_cast<T*>(static_cast<void*>(INTEGER(x)));
}
}
#endif
template <class T>
struct arr_and_len {
T *arr;
int64_t len;
};
#ifndef LGB_NO_ALTREP
template <class T>
SEXP make_altrepped_vec_from_arr(void *void_ptr) {
T *arr = static_cast<arr_and_len<T>*>(void_ptr)->arr;
uint64_t len = static_cast<arr_and_len<T>*>(void_ptr)->len;
SEXP R_ptr = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP R_len = Rf_protect(Rf_allocVector(REALSXP, 1));
SEXP R_vec = Rf_protect(R_new_altrep(get_altrep_class_for_type<T>(), R_NilValue, R_NilValue));
REAL(R_len)[0] = static_cast<double>(len);
R_SetExternalPtrAddr(R_ptr, arr);
R_RegisterCFinalizerEx(R_ptr, delete_cpp_array<T>, TRUE);
R_set_altrep_data1(R_vec, R_ptr);
R_set_altrep_data2(R_vec, R_len);
Rf_unprotect(3);
return R_vec;
}
#else
template <class T>
SEXP make_R_vec_from_arr(void *void_ptr) {
T *arr = static_cast<arr_and_len<T>*>(void_ptr)->arr;
uint64_t len = static_cast<arr_and_len<T>*>(void_ptr)->len;
SEXP out = Rf_protect(Rf_allocVector(get_sexptype_class_for_type<T>(), len));
std::copy(arr, arr + len, get_r_vec_ptr<T>(out));
Rf_unprotect(1);
return out;
}
#define make_altrepped_vec_from_arr make_R_vec_from_arr
#endif
R_xlen_t get_altrepped_vec_len(SEXP R_vec) {
return static_cast<R_xlen_t>(Rf_asReal(R_altrep_data2(R_vec)));
}
const void* get_altrepped_vec_dataptr_or_null(SEXP R_vec) {
return R_ExternalPtrAddr(R_altrep_data1(R_vec));
}
void* get_altrepped_vec_dataptr(SEXP R_vec, Rboolean writeable) {
return R_ExternalPtrAddr(R_altrep_data1(R_vec));
}
#define COL_MAJOR (0)
@ -143,18 +288,18 @@ SEXP LGBM_DatasetCreateFromFile_R(SEXP filename,
SEXP parameters,
SEXP reference) {
R_API_BEGIN();
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
DatasetHandle handle = nullptr;
DatasetHandle ref = nullptr;
if (!Rf_isNull(reference)) {
ref = R_ExternalPtrAddr(reference);
}
const char* filename_ptr = CHAR(PROTECT(Rf_asChar(filename)));
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
const char* filename_ptr = CHAR(Rf_protect(Rf_asChar(filename)));
const char* parameters_ptr = CHAR(Rf_protect(Rf_asChar(parameters)));
CHECK_CALL(LGBM_DatasetCreateFromFile(filename_ptr, parameters_ptr, ref, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DatasetFinalizer, TRUE);
UNPROTECT(3);
Rf_unprotect(3);
return ret;
R_API_END();
}
@ -168,14 +313,14 @@ SEXP LGBM_DatasetCreateFromCSC_R(SEXP indptr,
SEXP parameters,
SEXP reference) {
R_API_BEGIN();
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
const int* p_indptr = INTEGER(indptr);
const int* p_indices = INTEGER(indices);
const double* p_data = REAL(data);
int64_t nindptr = static_cast<int64_t>(Rf_asInteger(num_indptr));
int64_t ndata = static_cast<int64_t>(Rf_asInteger(nelem));
int64_t nrow = static_cast<int64_t>(Rf_asInteger(num_row));
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
const char* parameters_ptr = CHAR(Rf_protect(Rf_asChar(parameters)));
DatasetHandle handle = nullptr;
DatasetHandle ref = nullptr;
if (!Rf_isNull(reference)) {
@ -186,7 +331,7 @@ SEXP LGBM_DatasetCreateFromCSC_R(SEXP indptr,
nrow, parameters_ptr, ref, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DatasetFinalizer, TRUE);
UNPROTECT(2);
Rf_unprotect(2);
return ret;
R_API_END();
}
@ -197,11 +342,11 @@ SEXP LGBM_DatasetCreateFromMat_R(SEXP data,
SEXP parameters,
SEXP reference) {
R_API_BEGIN();
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
int32_t nrow = static_cast<int32_t>(Rf_asInteger(num_row));
int32_t ncol = static_cast<int32_t>(Rf_asInteger(num_col));
double* p_mat = REAL(data);
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
const char* parameters_ptr = CHAR(Rf_protect(Rf_asChar(parameters)));
DatasetHandle handle = nullptr;
DatasetHandle ref = nullptr;
if (!Rf_isNull(reference)) {
@ -211,7 +356,7 @@ SEXP LGBM_DatasetCreateFromMat_R(SEXP data,
parameters_ptr, ref, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DatasetFinalizer, TRUE);
UNPROTECT(2);
Rf_unprotect(2);
return ret;
R_API_END();
}
@ -222,7 +367,7 @@ SEXP LGBM_DatasetGetSubset_R(SEXP handle,
SEXP parameters) {
R_API_BEGIN();
_AssertDatasetHandleNotNull(handle);
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
int32_t len = static_cast<int32_t>(Rf_asInteger(len_used_row_indices));
std::unique_ptr<int32_t[]> idxvec(new int32_t[len]);
// convert from one-based to zero-based index
@ -233,14 +378,14 @@ SEXP LGBM_DatasetGetSubset_R(SEXP handle,
for (int32_t i = 0; i < len; ++i) {
idxvec[i] = static_cast<int32_t>(used_row_indices_[i] - 1);
}
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
const char* parameters_ptr = CHAR(Rf_protect(Rf_asChar(parameters)));
DatasetHandle res = nullptr;
CHECK_CALL(LGBM_DatasetGetSubset(R_ExternalPtrAddr(handle),
idxvec.get(), len, parameters_ptr,
&res));
R_SetExternalPtrAddr(ret, res);
R_RegisterCFinalizerEx(ret, _DatasetFinalizer, TRUE);
UNPROTECT(2);
Rf_unprotect(2);
return ret;
R_API_END();
}
@ -249,7 +394,7 @@ SEXP LGBM_DatasetSetFeatureNames_R(SEXP handle,
SEXP feature_names) {
R_API_BEGIN();
_AssertDatasetHandleNotNull(handle);
auto vec_names = Split(CHAR(PROTECT(Rf_asChar(feature_names))), '\t');
auto vec_names = Split(CHAR(Rf_protect(Rf_asChar(feature_names))), '\t');
int len = static_cast<int>(vec_names.size());
std::unique_ptr<const char*[]> vec_sptr(new const char*[len]);
for (int i = 0; i < len; ++i) {
@ -257,13 +402,13 @@ SEXP LGBM_DatasetSetFeatureNames_R(SEXP handle,
}
CHECK_CALL(LGBM_DatasetSetFeatureNames(R_ExternalPtrAddr(handle),
vec_sptr.get(), len));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
SEXP LGBM_DatasetGetFeatureNames_R(SEXP handle) {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
_AssertDatasetHandleNotNull(handle);
SEXP feature_names;
@ -301,11 +446,11 @@ SEXP LGBM_DatasetGetFeatureNames_R(SEXP handle) {
ptr_names.data()));
}
CHECK_EQ(len, out_len);
feature_names = PROTECT(safe_R_string(static_cast<R_xlen_t>(len), &cont_token));
feature_names = Rf_protect(safe_R_string(static_cast<R_xlen_t>(len), &cont_token));
for (int i = 0; i < len; ++i) {
SET_STRING_ELT(feature_names, i, safe_R_mkChar(ptr_names[i], &cont_token));
}
UNPROTECT(2);
Rf_unprotect(2);
return feature_names;
R_API_END();
}
@ -314,10 +459,10 @@ SEXP LGBM_DatasetSaveBinary_R(SEXP handle,
SEXP filename) {
R_API_BEGIN();
_AssertDatasetHandleNotNull(handle);
const char* filename_ptr = CHAR(PROTECT(Rf_asChar(filename)));
const char* filename_ptr = CHAR(Rf_protect(Rf_asChar(filename)));
CHECK_CALL(LGBM_DatasetSaveBinary(R_ExternalPtrAddr(handle),
filename_ptr));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -339,7 +484,7 @@ SEXP LGBM_DatasetSetField_R(SEXP handle,
R_API_BEGIN();
_AssertDatasetHandleNotNull(handle);
int len = Rf_asInteger(num_element);
const char* name = CHAR(PROTECT(Rf_asChar(field_name)));
const char* name = CHAR(Rf_protect(Rf_asChar(field_name)));
if (!strcmp("group", name) || !strcmp("query", name)) {
CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, INTEGER(field_data), len, C_API_DTYPE_INT32));
} else if (!strcmp("init_score", name)) {
@ -349,7 +494,7 @@ SEXP LGBM_DatasetSetField_R(SEXP handle,
std::copy(REAL(field_data), REAL(field_data) + len, vec.get());
CHECK_CALL(LGBM_DatasetSetField(R_ExternalPtrAddr(handle), name, vec.get(), len, C_API_DTYPE_FLOAT32));
}
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -359,7 +504,7 @@ SEXP LGBM_DatasetGetField_R(SEXP handle,
SEXP field_data) {
R_API_BEGIN();
_AssertDatasetHandleNotNull(handle);
const char* name = CHAR(PROTECT(Rf_asChar(field_name)));
const char* name = CHAR(Rf_protect(Rf_asChar(field_name)));
int out_len = 0;
int out_type = 0;
const void* res;
@ -381,7 +526,7 @@ SEXP LGBM_DatasetGetField_R(SEXP handle,
auto p_data = reinterpret_cast<const float*>(res);
std::copy(p_data, p_data + out_len, REAL(field_data));
}
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -391,7 +536,7 @@ SEXP LGBM_DatasetGetFieldSize_R(SEXP handle,
SEXP out) {
R_API_BEGIN();
_AssertDatasetHandleNotNull(handle);
const char* name = CHAR(PROTECT(Rf_asChar(field_name)));
const char* name = CHAR(Rf_protect(Rf_asChar(field_name)));
int out_len = 0;
int out_type = 0;
const void* res;
@ -400,7 +545,7 @@ SEXP LGBM_DatasetGetFieldSize_R(SEXP handle,
out_len -= 1;
}
INTEGER(out)[0] = out_len;
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -408,10 +553,10 @@ SEXP LGBM_DatasetGetFieldSize_R(SEXP handle,
SEXP LGBM_DatasetUpdateParamChecking_R(SEXP old_params,
SEXP new_params) {
R_API_BEGIN();
const char* old_params_ptr = CHAR(PROTECT(Rf_asChar(old_params)));
const char* new_params_ptr = CHAR(PROTECT(Rf_asChar(new_params)));
const char* old_params_ptr = CHAR(Rf_protect(Rf_asChar(old_params)));
const char* new_params_ptr = CHAR(Rf_protect(Rf_asChar(new_params)));
CHECK_CALL(LGBM_DatasetUpdateParamChecking(old_params_ptr, new_params_ptr));
UNPROTECT(2);
Rf_unprotect(2);
return R_NilValue;
R_API_END();
}
@ -468,34 +613,34 @@ SEXP LGBM_BoosterCreate_R(SEXP train_data,
SEXP parameters) {
R_API_BEGIN();
_AssertDatasetHandleNotNull(train_data);
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
const char* parameters_ptr = CHAR(Rf_protect(Rf_asChar(parameters)));
BoosterHandle handle = nullptr;
CHECK_CALL(LGBM_BoosterCreate(R_ExternalPtrAddr(train_data), parameters_ptr, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(2);
Rf_unprotect(2);
return ret;
R_API_END();
}
SEXP LGBM_BoosterCreateFromModelfile_R(SEXP filename) {
R_API_BEGIN();
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
int out_num_iterations = 0;
const char* filename_ptr = CHAR(PROTECT(Rf_asChar(filename)));
const char* filename_ptr = CHAR(Rf_protect(Rf_asChar(filename)));
BoosterHandle handle = nullptr;
CHECK_CALL(LGBM_BoosterCreateFromModelfile(filename_ptr, &out_num_iterations, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(2);
Rf_unprotect(2);
return ret;
R_API_END();
}
SEXP LGBM_BoosterLoadModelFromString_R(SEXP model_str) {
R_API_BEGIN();
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
SEXP temp = NULL;
int n_protected = 1;
int out_num_iterations = 0;
@ -510,7 +655,7 @@ SEXP LGBM_BoosterLoadModelFromString_R(SEXP model_str) {
break;
}
case STRSXP: {
temp = PROTECT(STRING_ELT(model_str, 0));
temp = Rf_protect(STRING_ELT(model_str, 0));
n_protected++;
model_str_ptr = reinterpret_cast<const char*>(CHAR(temp));
}
@ -519,7 +664,7 @@ SEXP LGBM_BoosterLoadModelFromString_R(SEXP model_str) {
CHECK_CALL(LGBM_BoosterLoadModelFromString(model_str_ptr, &out_num_iterations, &handle));
R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _BoosterFinalizer, TRUE);
UNPROTECT(n_protected);
Rf_unprotect(n_protected);
return ret;
R_API_END();
}
@ -558,9 +703,9 @@ SEXP LGBM_BoosterResetParameter_R(SEXP handle,
SEXP parameters) {
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
const char* parameters_ptr = CHAR(PROTECT(Rf_asChar(parameters)));
const char* parameters_ptr = CHAR(Rf_protect(Rf_asChar(parameters)));
CHECK_CALL(LGBM_BoosterResetParameter(R_ExternalPtrAddr(handle), parameters_ptr));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -650,7 +795,7 @@ SEXP LGBM_BoosterGetLowerBoundValue_R(SEXP handle,
}
SEXP LGBM_BoosterGetEvalNames_R(SEXP handle) {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
SEXP eval_names;
@ -689,11 +834,11 @@ SEXP LGBM_BoosterGetEvalNames_R(SEXP handle) {
ptr_names.data()));
}
CHECK_EQ(out_len, len);
eval_names = PROTECT(safe_R_string(static_cast<R_xlen_t>(len), &cont_token));
eval_names = Rf_protect(safe_R_string(static_cast<R_xlen_t>(len), &cont_token));
for (int i = 0; i < len; ++i) {
SET_STRING_ELT(eval_names, i, safe_R_mkChar(ptr_names[i], &cont_token));
}
UNPROTECT(2);
Rf_unprotect(2);
return eval_names;
R_API_END();
}
@ -763,14 +908,14 @@ SEXP LGBM_BoosterPredictForFile_R(SEXP handle,
SEXP result_filename) {
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
const char* data_filename_ptr = CHAR(PROTECT(Rf_asChar(data_filename)));
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
const char* result_filename_ptr = CHAR(PROTECT(Rf_asChar(result_filename)));
const char* data_filename_ptr = CHAR(Rf_protect(Rf_asChar(data_filename)));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
const char* result_filename_ptr = CHAR(Rf_protect(Rf_asChar(result_filename)));
int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
CHECK_CALL(LGBM_BoosterPredictForFile(R_ExternalPtrAddr(handle), data_filename_ptr,
Rf_asInteger(data_has_header), pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), parameter_ptr,
result_filename_ptr));
UNPROTECT(3);
Rf_unprotect(3);
return R_NilValue;
R_API_END();
}
@ -819,12 +964,12 @@ SEXP LGBM_BoosterPredictForCSC_R(SEXP handle,
int64_t nrow = static_cast<int64_t>(Rf_asInteger(num_row));
double* ptr_ret = REAL(out_result);
int64_t out_len;
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
CHECK_CALL(LGBM_BoosterPredictForCSC(R_ExternalPtrAddr(handle),
p_indptr, C_API_DTYPE_INT32, p_indices,
p_data, C_API_DTYPE_FLOAT64, nindptr, ndata,
nrow, pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), parameter_ptr, &out_len, ptr_ret));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -844,7 +989,7 @@ SEXP LGBM_BoosterPredictForCSR_R(SEXP handle,
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
int64_t out_len;
CHECK_CALL(LGBM_BoosterPredictForCSR(R_ExternalPtrAddr(handle),
INTEGER(indptr), C_API_DTYPE_INT32, INTEGER(indices),
@ -852,7 +997,7 @@ SEXP LGBM_BoosterPredictForCSR_R(SEXP handle,
Rf_xlength(indptr), Rf_xlength(data), Rf_asInteger(ncols),
pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration),
parameter_ptr, &out_len, REAL(out_result)));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -871,7 +1016,7 @@ SEXP LGBM_BoosterPredictForCSRSingleRow_R(SEXP handle,
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
int nnz = static_cast<int>(Rf_xlength(data));
const int indptr[] = {0, nnz};
int64_t out_len;
@ -881,7 +1026,7 @@ SEXP LGBM_BoosterPredictForCSRSingleRow_R(SEXP handle,
2, nnz, Rf_asInteger(ncols),
pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration),
parameter_ptr, &out_len, REAL(out_result)));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -901,8 +1046,8 @@ SEXP LGBM_BoosterPredictForCSRSingleRowFastInit_R(SEXP handle,
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
FastConfigHandle out_fastConfig;
CHECK_CALL(LGBM_BoosterPredictForCSRSingleRowFastInit(R_ExternalPtrAddr(handle),
pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration),
@ -910,7 +1055,7 @@ SEXP LGBM_BoosterPredictForCSRSingleRowFastInit_R(SEXP handle,
parameter_ptr, &out_fastConfig));
R_SetExternalPtrAddr(ret, out_fastConfig);
R_RegisterCFinalizerEx(ret, LGBM_FastConfigFree_wrapped, TRUE);
UNPROTECT(2);
Rf_unprotect(2);
return ret;
R_API_END();
}
@ -950,12 +1095,12 @@ SEXP LGBM_BoosterPredictForMat_R(SEXP handle,
int32_t ncol = static_cast<int32_t>(Rf_asInteger(num_col));
const double* p_mat = REAL(data);
double* ptr_ret = REAL(out_result);
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
int64_t out_len;
CHECK_CALL(LGBM_BoosterPredictForMat(R_ExternalPtrAddr(handle),
p_mat, C_API_DTYPE_FLOAT64, nrow, ncol, COL_MAJOR,
pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), parameter_ptr, &out_len, ptr_ret));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -964,8 +1109,6 @@ struct SparseOutputPointers {
void* indptr;
int32_t* indices;
void* data;
int indptr_type;
int data_type;
SparseOutputPointers(void* indptr, int32_t* indices, void* data)
: indptr(indptr), indices(indices), data(data) {}
};
@ -985,12 +1128,12 @@ SEXP LGBM_BoosterPredictSparseOutput_R(SEXP handle,
SEXP start_iteration,
SEXP num_iteration,
SEXP parameter) {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
const char* out_names[] = {"indptr", "indices", "data", ""};
SEXP out = PROTECT(Rf_mkNamed(VECSXP, out_names));
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
SEXP out = Rf_protect(Rf_mkNamed(VECSXP, out_names));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
int64_t out_len[2];
void *out_indptr;
@ -1015,17 +1158,28 @@ SEXP LGBM_BoosterPredictSparseOutput_R(SEXP handle,
&delete_SparseOutputPointers
};
SEXP out_indptr_R = safe_R_int(out_len[1], &cont_token);
SET_VECTOR_ELT(out, 0, out_indptr_R);
SEXP out_indices_R = safe_R_int(out_len[0], &cont_token);
SET_VECTOR_ELT(out, 1, out_indices_R);
SEXP out_data_R = safe_R_real(out_len[0], &cont_token);
SET_VECTOR_ELT(out, 2, out_data_R);
std::memcpy(INTEGER(out_indptr_R), out_indptr, out_len[1]*sizeof(int));
std::memcpy(INTEGER(out_indices_R), out_indices, out_len[0]*sizeof(int));
std::memcpy(REAL(out_data_R), out_data, out_len[0]*sizeof(double));
arr_and_len<int> indptr_str{static_cast<int*>(out_indptr), out_len[1]};
SET_VECTOR_ELT(
out, 0,
R_UnwindProtect(make_altrepped_vec_from_arr<int>,
static_cast<void*>(&indptr_str), throw_R_memerr, &cont_token, cont_token));
pointers_struct->indptr = nullptr;
UNPROTECT(3);
arr_and_len<int> indices_str{static_cast<int*>(out_indices), out_len[0]};
SET_VECTOR_ELT(
out, 1,
R_UnwindProtect(make_altrepped_vec_from_arr<int>,
static_cast<void*>(&indices_str), throw_R_memerr, &cont_token, cont_token));
pointers_struct->indices = nullptr;
arr_and_len<double> data_str{static_cast<double*>(out_data), out_len[0]};
SET_VECTOR_ELT(
out, 2,
R_UnwindProtect(make_altrepped_vec_from_arr<double>,
static_cast<void*>(&data_str), throw_R_memerr, &cont_token, cont_token));
pointers_struct->data = nullptr;
Rf_unprotect(3);
return out;
R_API_END();
}
@ -1042,14 +1196,14 @@ SEXP LGBM_BoosterPredictForMatSingleRow_R(SEXP handle,
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
double* ptr_ret = REAL(out_result);
int64_t out_len;
CHECK_CALL(LGBM_BoosterPredictForMatSingleRow(R_ExternalPtrAddr(handle),
REAL(data), C_API_DTYPE_FLOAT64, Rf_xlength(data), 1,
pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration),
parameter_ptr, &out_len, ptr_ret));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
@ -1065,8 +1219,8 @@ SEXP LGBM_BoosterPredictForMatSingleRowFastInit_R(SEXP handle,
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
int pred_type = GetPredictType(is_rawscore, is_leafidx, is_predcontrib);
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
const char* parameter_ptr = CHAR(PROTECT(Rf_asChar(parameter)));
SEXP ret = Rf_protect(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
const char* parameter_ptr = CHAR(Rf_protect(Rf_asChar(parameter)));
FastConfigHandle out_fastConfig;
CHECK_CALL(LGBM_BoosterPredictForMatSingleRowFastInit(R_ExternalPtrAddr(handle),
pred_type, Rf_asInteger(start_iteration), Rf_asInteger(num_iteration),
@ -1074,7 +1228,7 @@ SEXP LGBM_BoosterPredictForMatSingleRowFastInit_R(SEXP handle,
parameter_ptr, &out_fastConfig));
R_SetExternalPtrAddr(ret, out_fastConfig);
R_RegisterCFinalizerEx(ret, LGBM_FastConfigFree_wrapped, TRUE);
UNPROTECT(2);
Rf_unprotect(2);
return ret;
R_API_END();
}
@ -1097,18 +1251,46 @@ SEXP LGBM_BoosterSaveModel_R(SEXP handle,
SEXP start_iteration) {
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
const char* filename_ptr = CHAR(PROTECT(Rf_asChar(filename)));
const char* filename_ptr = CHAR(Rf_protect(Rf_asChar(filename)));
CHECK_CALL(LGBM_BoosterSaveModel(R_ExternalPtrAddr(handle), Rf_asInteger(start_iteration), Rf_asInteger(num_iteration), Rf_asInteger(feature_importance_type), filename_ptr));
UNPROTECT(1);
Rf_unprotect(1);
return R_NilValue;
R_API_END();
}
// Note: for some reason, MSVC crashes when an error is thrown here
// if the buffer variable is defined as 'std::unique_ptr<std::vector<char>>',
// but not if it is defined as 'std::vector<char>'.
#ifndef _MSC_VER
SEXP LGBM_BoosterSaveModelToString_R(SEXP handle,
SEXP num_iteration,
SEXP feature_importance_type,
SEXP start_iteration) {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
int64_t out_len = 0;
int64_t buf_len = 1024 * 1024;
int num_iter = Rf_asInteger(num_iteration);
int start_iter = Rf_asInteger(start_iteration);
int importance_type = Rf_asInteger(feature_importance_type);
std::unique_ptr<std::vector<char>> inner_char_buf(new std::vector<char>(buf_len));
CHECK_CALL(LGBM_BoosterSaveModelToString(R_ExternalPtrAddr(handle), start_iter, num_iter, importance_type, buf_len, &out_len, inner_char_buf->data()));
inner_char_buf->resize(out_len);
if (out_len > buf_len) {
CHECK_CALL(LGBM_BoosterSaveModelToString(R_ExternalPtrAddr(handle), start_iter, num_iter, importance_type, out_len, &out_len, inner_char_buf->data()));
}
SEXP out = R_UnwindProtect(make_altrepped_raw_vec, &inner_char_buf, throw_R_memerr, &cont_token, cont_token);
Rf_unprotect(1);
return out;
R_API_END();
}
#else
SEXP LGBM_BoosterSaveModelToString_R(SEXP handle,
SEXP num_iteration,
SEXP feature_importance_type,
SEXP start_iteration) {
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
int64_t out_len = 0;
@ -1118,23 +1300,24 @@ SEXP LGBM_BoosterSaveModelToString_R(SEXP handle,
int importance_type = Rf_asInteger(feature_importance_type);
std::vector<char> inner_char_buf(buf_len);
CHECK_CALL(LGBM_BoosterSaveModelToString(R_ExternalPtrAddr(handle), start_iter, num_iter, importance_type, buf_len, &out_len, inner_char_buf.data()));
SEXP model_str = PROTECT(safe_R_raw(out_len, &cont_token));
SEXP model_str = Rf_protect(safe_R_raw(out_len, &cont_token));
// if the model string was larger than the initial buffer, call the function again, writing directly to the R object
if (out_len > buf_len) {
CHECK_CALL(LGBM_BoosterSaveModelToString(R_ExternalPtrAddr(handle), start_iter, num_iter, importance_type, out_len, &out_len, reinterpret_cast<char*>(RAW(model_str))));
} else {
std::copy(inner_char_buf.begin(), inner_char_buf.begin() + out_len, reinterpret_cast<char*>(RAW(model_str)));
}
UNPROTECT(2);
Rf_unprotect(2);
return model_str;
R_API_END();
}
#endif
SEXP LGBM_BoosterDumpModel_R(SEXP handle,
SEXP num_iteration,
SEXP feature_importance_type,
SEXP start_iteration) {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
SEXP model_str;
@ -1150,15 +1333,15 @@ SEXP LGBM_BoosterDumpModel_R(SEXP handle,
inner_char_buf.resize(out_len);
CHECK_CALL(LGBM_BoosterDumpModel(R_ExternalPtrAddr(handle), start_iter, num_iter, importance_type, out_len, &out_len, inner_char_buf.data()));
}
model_str = PROTECT(safe_R_string(static_cast<R_xlen_t>(1), &cont_token));
model_str = Rf_protect(safe_R_string(static_cast<R_xlen_t>(1), &cont_token));
SET_STRING_ELT(model_str, 0, safe_R_mkChar(inner_char_buf.data(), &cont_token));
UNPROTECT(2);
Rf_unprotect(2);
return model_str;
R_API_END();
}
SEXP LGBM_DumpParamAliases_R() {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
SEXP aliases_str;
int64_t out_len = 0;
@ -1170,15 +1353,15 @@ SEXP LGBM_DumpParamAliases_R() {
inner_char_buf.resize(out_len);
CHECK_CALL(LGBM_DumpParamAliases(out_len, &out_len, inner_char_buf.data()));
}
aliases_str = PROTECT(safe_R_string(static_cast<R_xlen_t>(1), &cont_token));
aliases_str = Rf_protect(safe_R_string(static_cast<R_xlen_t>(1), &cont_token));
SET_STRING_ELT(aliases_str, 0, safe_R_mkChar(inner_char_buf.data(), &cont_token));
UNPROTECT(2);
Rf_unprotect(2);
return aliases_str;
R_API_END();
}
SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) {
SEXP cont_token = PROTECT(R_MakeUnwindCont());
SEXP cont_token = Rf_protect(R_MakeUnwindCont());
R_API_BEGIN();
_AssertBoosterHandleNotNull(handle);
SEXP params_str;
@ -1191,9 +1374,9 @@ SEXP LGBM_BoosterGetLoadedParam_R(SEXP handle) {
inner_char_buf.resize(out_len);
CHECK_CALL(LGBM_BoosterGetLoadedParam(R_ExternalPtrAddr(handle), out_len, &out_len, inner_char_buf.data()));
}
params_str = PROTECT(safe_R_string(static_cast<R_xlen_t>(1), &cont_token));
params_str = Rf_protect(safe_R_string(static_cast<R_xlen_t>(1), &cont_token));
SET_STRING_ELT(params_str, 0, safe_R_mkChar(inner_char_buf.data(), &cont_token));
UNPROTECT(2);
Rf_unprotect(2);
return params_str;
R_API_END();
}
@ -1281,4 +1464,21 @@ LIGHTGBM_C_EXPORT void R_init_lightgbm(DllInfo *dll);
void R_init_lightgbm(DllInfo *dll) {
R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
R_useDynamicSymbols(dll, FALSE);
#ifndef LGB_NO_ALTREP
lgb_altrepped_char_vec = R_make_altraw_class("lgb_altrepped_char_vec", "lightgbm", dll);
R_set_altrep_Length_method(lgb_altrepped_char_vec, get_altrepped_raw_len);
R_set_altvec_Dataptr_method(lgb_altrepped_char_vec, get_altrepped_raw_dataptr);
R_set_altvec_Dataptr_or_null_method(lgb_altrepped_char_vec, get_altrepped_raw_dataptr_or_null);
lgb_altrepped_int_arr = R_make_altinteger_class("lgb_altrepped_int_arr", "lightgbm", dll);
R_set_altrep_Length_method(lgb_altrepped_int_arr, get_altrepped_vec_len);
R_set_altvec_Dataptr_method(lgb_altrepped_int_arr, get_altrepped_vec_dataptr);
R_set_altvec_Dataptr_or_null_method(lgb_altrepped_int_arr, get_altrepped_vec_dataptr_or_null);
lgb_altrepped_dbl_arr = R_make_altreal_class("lgb_altrepped_dbl_arr", "lightgbm", dll);
R_set_altrep_Length_method(lgb_altrepped_dbl_arr, get_altrepped_vec_len);
R_set_altvec_Dataptr_method(lgb_altrepped_dbl_arr, get_altrepped_vec_dataptr);
R_set_altvec_Dataptr_or_null_method(lgb_altrepped_dbl_arr, get_altrepped_vec_dataptr_or_null);
#endif
}

Просмотреть файл

@ -2776,14 +2776,12 @@ test_that(paste0("lgb.train() throws an informative error if the members of inte
test_that("lgb.train() throws an informative error if interaction_constraints contains a too large index", {
dtrain <- lgb.Dataset(train$data, label = train$label)
params <- list(objective = "regression",
interaction_constraints = list(c(1L, length(colnames(train$data)) + 1L), 3L))
expect_error({
bst <- lightgbm(
data = dtrain
, params = params
, nrounds = 2L
)
}, "supplied a too large value in interaction_constraints")
interaction_constraints = list(c(1L, ncol(train$data) + 1L:2L), 3L))
expect_error(
lightgbm(data = dtrain, params = params, nrounds = 2L)
, "unknown feature(s) in interaction_constraints: '127', '128'"
, fixed = TRUE
)
})
test_that(paste0("lgb.train() gives same result when interaction_constraints is specified as a list of ",
@ -2876,6 +2874,37 @@ test_that(paste0("lgb.train() gives same results when using interaction_constrai
})
test_that("Interaction constraints add missing features correctly as new group", {
dtrain <- lgb.Dataset(
train$data[, 1L:6L] # Pick only some columns
, label = train$label
, params = list(num_threads = .LGB_MAX_THREADS)
)
list_of_constraints <- list(
list(3L, 1L:2L)
, list("cap-shape=convex", c("cap-shape=bell", "cap-shape=conical"))
)
for (constraints in list_of_constraints) {
params <- list(
objective = "regression"
, interaction_constraints = constraints
, verbose = .LGB_VERBOSITY
, num_threads = .LGB_MAX_THREADS
)
bst <- lightgbm(data = dtrain, params = params, nrounds = 10L)
expected_list <- list("[2]", "[0,1]", "[3,4,5]")
expect_equal(bst$params$interaction_constraints, expected_list)
expected_string <- "[interaction_constraints: [2],[0,1],[3,4,5]]"
expect_true(
grepl(expected_string, bst$save_model_to_string(), fixed = TRUE)
)
}
})
.generate_trainset_for_monotone_constraints_tests <- function(x3_to_categorical) {
n_samples <- 3000L
x1_positively_correlated_with_y <- runif(n = n_samples, min = 0.0, max = 1.0)

Просмотреть файл

@ -440,6 +440,35 @@ test_that("lgb.Dataset: should be able to run lgb.cv() immediately after using l
expect_true(methods::is(bst, "lgb.CVBooster"))
})
test_that("lgb.Dataset: should be able to be used in lgb.cv() when constructed with categorical feature indices", {
data("mtcars")
y <- mtcars$mpg
x <- as.matrix(mtcars[, -1L])
categorical_feature <- which(names(mtcars) %in% c("cyl", "vs", "am", "gear", "carb")) - 1L
dtrain <- lgb.Dataset(
data = x
, label = y
, categorical_feature = categorical_feature
, free_raw_data = TRUE
, params = list(num_threads = .LGB_MAX_THREADS)
)
# constructing the Dataset frees the raw data
dtrain$construct()
params <- list(
objective = "regression"
, num_leaves = 2L
, verbose = .LGB_VERBOSITY
, num_threads = .LGB_MAX_THREADS
)
# cv should reuse the same categorical features without checking the indices
bst <- lgb.cv(params = params, data = dtrain, stratified = FALSE, nrounds = 1L)
expect_equal(
unlist(bst$boosters[[1L]]$booster$params$categorical_feature)
, categorical_feature - 1L # 0-based
)
})
test_that("lgb.Dataset: should be able to use and retrieve long feature names", {
# set one feature to a value longer than the default buffer size used
# in LGBM_DatasetGetFeatureNames_R
@ -621,3 +650,12 @@ test_that("lgb.Dataset can be constructed with categorical features and without
lgb.Dataset(raw_mat, categorical_feature = 2L)$construct()
}, regexp = "supplied a too large value in categorical_feature: 2 but only 1 features")
})
test_that("lgb.Dataset.slice fails with a categorical feature index greater than the number of features", {
data <- matrix(runif(100L), nrow = 50L, ncol = 2L)
ds <- lgb.Dataset(data = data, categorical_feature = 3L)
subset <- ds$slice(1L:20L)
expect_error({
subset$construct()
}, regexp = "supplied a too large value in categorical_feature: 3 but only 2 features")
})

Просмотреть файл

@ -174,7 +174,7 @@ test_that("Loading a Booster from a text file works", {
, bagging_freq = 1L
, boost_from_average = FALSE
, categorical_feature = c(1L, 2L)
, interaction_constraints = list(c(1L, 2L), 1L)
, interaction_constraints = list(1L:2L, 3L, 4L:ncol(train$data))
, feature_contri = rep(0.5, ncol(train$data))
, metric = c("mape", "average_precision")
, learning_rate = 1.0

Просмотреть файл

@ -147,3 +147,21 @@ test_that(".equal_or_both_null produces expected results", {
expect_false(.equal_or_both_null(10.0, 1L))
expect_true(.equal_or_both_null(0L, 0L))
})
test_that(".check_interaction_constraints() adds skipped features", {
ref <- letters[1L:5L]
ic_num <- list(1L, c(2L, 3L))
ic_char <- list("a", c("b", "c"))
expected <- list("[0]", "[1,2]", "[3,4]")
ic_checked_num <- .check_interaction_constraints(
interaction_constraints = ic_num, column_names = ref
)
ic_checked_char <- .check_interaction_constraints(
interaction_constraints = ic_char, column_names = ref
)
expect_equal(ic_checked_num, expected)
expect_equal(ic_checked_char, expected)
})

Просмотреть файл

@ -133,7 +133,7 @@ Support
-------
- Ask a question [on Stack Overflow with the `lightgbm` tag](https://stackoverflow.com/questions/ask?tags=lightgbm), we monitor this for new questions.
- Open **bug reports** and **feature requests** (not questions) on [GitHub issues](https://github.com/microsoft/LightGBM/issues).
- Open **bug reports** and **feature requests** on [GitHub issues](https://github.com/microsoft/LightGBM/issues).
How to Contribute
-----------------
@ -156,8 +156,6 @@ Qi Meng, Guolin Ke, Taifeng Wang, Wei Chen, Qiwei Ye, Zhi-Ming Ma, Tie-Yan Liu.
Huan Zhang, Si Si and Cho-Jui Hsieh. "[GPU Acceleration for Large-scale Tree Boosting](https://arxiv.org/abs/1706.08359)". SysML Conference, 2018.
**Note**: If you use LightGBM in your GitHub projects, please add `lightgbm` in the `requirements.txt`.
License
-------

Просмотреть файл

@ -1 +1 @@
4.3.0.99
4.4.0.99

Просмотреть файл

@ -149,7 +149,7 @@ and copy memory as required by creating new processes instead of forking (or, us
Cloud platform container services may cause LightGBM to hang, if they use Linux fork to run multiple containers on a
single instance. For example, LightGBM hangs in AWS Batch array jobs, which `use the ECS agent
<https://aws.amazon.com/batch/faqs/#Features>`__ to manage multiple running jobs. Setting ``nthreads=1`` mitigates the issue.
<https://aws.amazon.com/batch/faqs>`__ to manage multiple running jobs. Setting ``nthreads=1`` mitigates the issue.
12. Why is early stopping not enabled by default in LightGBM?
-------------------------------------------------------------
@ -321,7 +321,7 @@ We are doing our best to provide universal wheels which have high running speed
However, sometimes it's just impossible to guarantee the possibility of usage of LightGBM in any specific environment (see `Microsoft/LightGBM#1743 <https://github.com/microsoft/LightGBM/issues/1743>`__).
Therefore, the first thing you should try in case of segfaults is **compiling from the source** using ``pip install --no-binary lightgbm lightgbm``.
For the OS-specific prerequisites see `this guide <https://github.com/microsoft/LightGBM/blob/master/python-package/README.rst#user-content-build-from-sources>`__.
For the OS-specific prerequisites see https://github.com/microsoft/LightGBM/blob/master/python-package/README.rst.
Also, feel free to post a new issue in our GitHub repository. We always look at each case individually and try to find a root cause.

Просмотреть файл

@ -602,9 +602,9 @@ And open an issue in GitHub `here`_ with that log.
.. _Boost: https://www.boost.org/users/history/
.. _Prebuilt Boost x86_64: https://www.rpmfind.net/linux/fedora/linux/releases/38/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.78.0-4.fc38.noarch.rpm
.. _Prebuilt Boost x86_64: https://www.rpmfind.net/linux/fedora/linux/releases/40/Everything/x86_64/os/Packages/m/mingw64-boost-static-1.78.0-9.fc40.noarch.rpm
.. _Prebuilt Boost i686: https://www.rpmfind.net/linux/fedora/linux/releases/38/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.78.0-4.fc38.noarch.rpm
.. _Prebuilt Boost i686: https://www.rpmfind.net/linux/fedora/linux/releases/40/Everything/x86_64/os/Packages/m/mingw32-boost-static-1.78.0-9.fc40.noarch.rpm
.. _7zip: https://www.7-zip.org/download.html

Просмотреть файл

@ -22,7 +22,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
1. ``num_leaves``. This is the main parameter to control the complexity of the tree model.
Theoretically, we can set ``num_leaves = 2^(max_depth)`` to obtain the same number of leaves as depth-wise tree.
However, this simple conversion is not good in practice.
The reason is that a leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
A leaf-wise tree is typically much deeper than a depth-wise tree for a fixed number of leaves. Unconstrained depth can induce over-fitting.
Thus, when trying to tune the ``num_leaves``, we should let it be smaller than ``2^(max_depth)``.
For example, when the ``max_depth=7`` the depth-wise tree can get good accuracy,
but setting ``num_leaves`` to ``127`` may cause over-fitting, and setting it to ``70`` or ``80`` may get better accuracy than depth-wise.
@ -33,6 +33,7 @@ To get good results using a leaf-wise tree, these are some important parameters:
In practice, setting it to hundreds or thousands is enough for a large dataset.
3. ``max_depth``. You also can use ``max_depth`` to limit the tree depth explicitly.
If you set ``max_depth``, also explicitly set ``num_leaves`` to some value ``<= 2^max_depth``.
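For example, a minimal sketch on synthetic data (the concrete values are illustrative, not tuned recommendations):

.. code:: python

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(seed=0)
    X = rng.uniform(size=(10_000, 20))
    y = rng.uniform(size=(10_000,))
    train_data = lgb.Dataset(X, label=y)

    # max_depth=7 allows up to 2^7 = 128 leaves; keeping num_leaves well below
    # that bound (here 70) limits model complexity and the risk of over-fitting
    params = {
        "objective": "regression",
        "max_depth": 7,
        "num_leaves": 70,
        "min_data_in_leaf": 100,
    }
    bst = lgb.train(params, train_data, num_boost_round=50)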
For Faster Speed
----------------

Просмотреть файл

@ -414,6 +414,8 @@ Learning Control Parameters
- when early stopping is used (i.e. ``early_stopping_round > 0``), require the early stopping metric to improve by at least this delta to be considered an improvement
- *New in 4.4.0*
- ``first_metric_only`` :raw-html:`<a id="first_metric_only" title="Permalink to this parameter" href="#first_metric_only">&#x1F517;&#xFE0E;</a>`, default = ``false``, type = bool
- LightGBM allows you to provide multiple evaluation metrics. Set this to ``true``, if you want to use only the first metric for early stopping
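A minimal sketch of how these two parameters can be combined (the values and the synthetic data are illustrative only):

.. code:: python

    import numpy as np
    import lightgbm as lgb

    rng = np.random.default_rng(seed=0)
    X = rng.uniform(size=(1_000, 10))
    y = rng.uniform(size=(1_000,))
    dtrain = lgb.Dataset(X[:800], label=y[:800])
    dvalid = lgb.Dataset(X[800:], label=y[800:], reference=dtrain)

    # stop if the first metric (l2) has not improved by at least 1e-3
    # during the last 20 rounds
    params = {
        "objective": "regression",
        "metric": ["l2", "l1"],
        "early_stopping_round": 20,
        "early_stopping_min_delta": 1e-3,
        "first_metric_only": True,
    }
    bst = lgb.train(params, dtrain, num_boost_round=200, valid_sets=[dvalid])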

Просмотреть файл

@ -59,8 +59,9 @@ Many of the examples in this page use functionality from ``numpy``. To run the e
.. code:: python
data = np.random.rand(500, 10) # 500 entities, each contains 10 features
label = np.random.randint(2, size=500) # binary target
rng = np.random.default_rng()
data = rng.uniform(size=(500, 10)) # 500 entities, each contains 10 features
label = rng.integers(low=0, high=2, size=(500, )) # binary target
train_data = lgb.Dataset(data, label=label)
**To load a scipy.sparse.csr\_matrix array into Dataset:**
@ -139,7 +140,8 @@ It doesn't need to convert to one-hot encoding, and is much faster than one-hot
.. code:: python
w = np.random.rand(500, )
rng = np.random.default_rng()
w = rng.uniform(size=(500, ))
train_data = lgb.Dataset(data, label=label, weight=w)
or
@ -147,7 +149,8 @@ or
.. code:: python
train_data = lgb.Dataset(data, label=label)
w = np.random.rand(500, )
rng = np.random.default_rng()
w = rng.uniform(size=(500, ))
train_data.set_weight(w)
And you can use ``Dataset.set_init_score()`` to set initial score, and ``Dataset.set_group()`` to set group/query data for ranking tasks.
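For example (a minimal sketch reusing ``train_data`` from the snippets above; the init scores and group sizes are arbitrary):

.. code:: python

    rng = np.random.default_rng()
    train_data.set_init_score(rng.uniform(size=(500, )))

    # 5 query groups of 100 rows each; group sizes must sum to the number of rows
    train_data.set_group([100, 100, 100, 100, 100])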
@ -249,7 +252,8 @@ A model that has been trained or loaded can perform predictions on datasets:
.. code:: python
# 7 entities, each contains 10 features
data = np.random.rand(7, 10)
rng = np.random.default_rng()
data = rng.uniform(size=(7, 10))
ypred = bst.predict(data)
If early stopping is enabled during training, you can get predictions from the best iteration with ``bst.best_iteration``:

Просмотреть файл

@ -17,6 +17,7 @@
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute.
"""Sphinx configuration file."""
import datetime
import os
import sys

Просмотреть файл

@ -22,15 +22,15 @@ import lightgbm as lgb
#################
# Simulate some binary data with a single categorical and
# single continuous predictor
np.random.seed(0)
rng = np.random.default_rng(seed=0)
N = 1000
X = pd.DataFrame({"continuous": range(N), "categorical": np.repeat([0, 1, 2, 3, 4], N / 5)})
CATEGORICAL_EFFECTS = [-1, -1, -2, -2, 2]
LINEAR_TERM = np.array(
[-0.5 + 0.01 * X["continuous"][k] + CATEGORICAL_EFFECTS[X["categorical"][k]] for k in range(X.shape[0])]
) + np.random.normal(0, 1, X.shape[0])
) + rng.normal(loc=0, scale=1, size=X.shape[0])
TRUE_PROB = expit(LINEAR_TERM)
Y = np.random.binomial(1, TRUE_PROB, size=N)
Y = rng.binomial(n=1, p=TRUE_PROB, size=N)
DATA = {
"X": X,
"probability_labels": TRUE_PROB,
@ -65,10 +65,9 @@ def experiment(objective, label_type, data):
result : dict
Experiment summary stats.
"""
np.random.seed(0)
nrounds = 5
lgb_data = data[f"lgb_with_{label_type}_labels"]
params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1}
params = {"objective": objective, "feature_fraction": 1, "bagging_fraction": 1, "verbose": -1, "seed": 123}
time_zero = time.time()
gbm = lgb.train(params, lgb_data, num_boost_round=nrounds)
y_fitted = gbm.predict(data["X"])

Просмотреть файл

@ -12,6 +12,7 @@ Version history for these symbols can be found at the following:
* GLIBCXX: https://gcc.gnu.org/onlinedocs/libstdc++/manual/abi.html
* OMP/GOMP: https://github.com/gcc-mirror/gcc/blob/master/libgomp/libgomp.map
"""
import re
import sys
from pathlib import Path

Просмотреть файл

@ -6,6 +6,7 @@ with list of all parameters, aliases table and other routines
along with parameters description in LightGBM/docs/Parameters.rst file
from the information in LightGBM/include/LightGBM/config.h file.
"""
import re
from collections import defaultdict
from pathlib import Path

Просмотреть файл

@ -396,6 +396,7 @@ struct Config {
// check = >=0.0
// desc = when early stopping is used (i.e. ``early_stopping_round > 0``), require the early stopping metric to improve by at least this delta to be considered an improvement
// desc = *New in 4.4.0*
double early_stopping_min_delta = 0.0;
// desc = LightGBM allows you to provide multiple evaluation metrics. Set this to ``true``, if you want to use only the first metric for early stopping
@ -1146,7 +1147,7 @@ struct Config {
static const std::string DumpAliases();
private:
void CheckParamConflict();
void CheckParamConflict(const std::unordered_map<std::string, std::string>& params);
void GetMembersFromString(const std::unordered_map<std::string, std::string>& params);
std::string SaveMembersToString() const;
void GetAucMuWeights();

Просмотреть файл

@ -1,6 +0,0 @@
PMML Generator
==============
The old Python convert script was removed because it cannot support the new format of categorical features.
Please refer to https://github.com/jpmml/jpmml-lightgbm.

Просмотреть файл

@ -11,8 +11,6 @@ Preparation
32-bit Python is not supported. Please install 64-bit version. If you have a strong need to install with 32-bit Python, refer to `Build 32-bit Version with 32-bit Python section <#build-32-bit-version-with-32-bit-python>`__.
`setuptools <https://pypi.org/project/setuptools>`_ is needed.
Install from `PyPI <https://pypi.org/project/lightgbm>`_
''''''''''''''''''''''''''''''''''''''''''''''''''''''''
@ -299,10 +297,6 @@ Refer to the walk through examples in `Python guide folder <https://github.com/m
Development Guide
-----------------
The code style of Python-package follows `PEP 8 <https://www.python.org/dev/peps/pep-0008/>`_.
The package's documentation strings (docstrings) are written in the `numpydoc style <https://numpydoc.readthedocs.io/en/latest/format.html>`_.
To check that a contribution to the package matches its style expectations, run the following from the root of the repo.
.. code:: sh

Просмотреть файл

@ -3,6 +3,7 @@
Contributors: https://github.com/microsoft/LightGBM/graphs/contributors.
"""
from pathlib import Path
from .basic import Booster, Dataset, Sequence, register_logger

Просмотреть файл

@ -1,5 +1,6 @@
# coding: utf-8
"""Wrapper for C API of LightGBM."""
import abc
import ctypes
import inspect
@ -355,10 +356,10 @@ def _list_to_1d_numpy(
array = data.ravel()
return _cast_numpy_array_to_dtype(array, dtype)
elif _is_1d_list(data):
return np.array(data, dtype=dtype, copy=False)
return np.asarray(data, dtype=dtype)
elif isinstance(data, pd_Series):
_check_for_bad_pandas_dtypes(data.to_frame().dtypes)
return np.array(data, dtype=dtype, copy=False) # SparseArray should be supported as well
return np.asarray(data, dtype=dtype) # SparseArray should be supported as well
else:
raise TypeError(
f"Wrong type({type(data).__name__}) for {name}.\n" "It should be list, numpy 1-D array or pandas Series"
@ -556,7 +557,8 @@ class LightGBMError(Exception):
# DeprecationWarning is not shown by default, so let's create our own with higher level
class LGBMDeprecationWarning(UserWarning):
# ref: https://peps.python.org/pep-0565/#additional-use-case-for-futurewarning
class LGBMDeprecationWarning(FutureWarning):
"""Custom deprecation warning."""
pass
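# Illustrative only (not part of this module): because the class now derives
# from FutureWarning, deprecation messages emitted like the hypothetical call
# below are shown to end users by default (unlike DeprecationWarning):
#
#   warnings.warn("'foo_param' is deprecated, use 'bar_param'", LGBMDeprecationWarning, stacklevel=2)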
@ -726,7 +728,7 @@ def _convert_from_sliced_object(data: np.ndarray) -> np.ndarray:
def _c_float_array(data: np.ndarray) -> Tuple[_ctypes_float_ptr, int, np.ndarray]:
"""Get pointer of float numpy array / list."""
if _is_1d_list(data):
data = np.array(data, copy=False)
data = np.asarray(data)
if _is_numpy_1d_array(data):
data = _convert_from_sliced_object(data)
assert data.flags.c_contiguous
@ -747,7 +749,7 @@ def _c_float_array(data: np.ndarray) -> Tuple[_ctypes_float_ptr, int, np.ndarray
def _c_int_array(data: np.ndarray) -> Tuple[_ctypes_int_ptr, int, np.ndarray]:
"""Get pointer of int numpy array / list."""
if _is_1d_list(data):
data = np.array(data, copy=False)
data = np.asarray(data)
if _is_numpy_1d_array(data):
data = _convert_from_sliced_object(data)
assert data.flags.c_contiguous
@ -1268,7 +1270,7 @@ class _InnerPredictor:
preds: Optional[np.ndarray],
) -> Tuple[np.ndarray, int]:
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = _c_float_array(data)
@ -2283,9 +2285,9 @@ class Dataset:
self._handle = ctypes.c_void_p()
if mat.dtype == np.float32 or mat.dtype == np.float64:
data = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
data = np.array(mat.reshape(mat.size), dtype=np.float32)
data = np.asarray(mat.reshape(mat.size), dtype=np.float32)
ptr_data, type_ptr_data, _ = _c_float_array(data)
_safe_call(
@ -2330,7 +2332,7 @@ class Dataset:
nrow[i] = mat.shape[0]
if mat.dtype == np.float32 or mat.dtype == np.float64:
mats[i] = np.array(mat.reshape(mat.size), dtype=mat.dtype, copy=False)
mats[i] = np.asarray(mat.reshape(mat.size), dtype=mat.dtype)
else: # change non-float data to float data, need to copy
mats[i] = np.array(mat.reshape(mat.size), dtype=np.float32)

Просмотреть файл

@ -1,5 +1,6 @@
# coding: utf-8
"""Callbacks library."""
from collections import OrderedDict
from dataclasses import dataclass
from functools import partial

Просмотреть файл

@ -37,18 +37,6 @@ except ImportError:
concat = None
"""numpy"""
try:
from numpy.random import Generator as np_random_Generator
except ImportError:
class np_random_Generator: # type: ignore
"""Dummy class for np.random.Generator."""
def __init__(self, *args: Any, **kwargs: Any):
pass
"""matplotlib"""
try:
import matplotlib # noqa: F401

Просмотреть файл

@ -6,6 +6,7 @@ dask.Array and dask.DataFrame collections.
It is based on dask-lightgbm, which was based on dask-xgboost.
"""
import operator
import socket
from collections import defaultdict

Просмотреть файл

@ -1,5 +1,6 @@
# coding: utf-8
"""Library with training routines of LightGBM."""
import copy
import json
import warnings
@ -511,7 +512,7 @@ def _make_n_folds(
if hasattr(folds, "split"):
group_info = full_data.get_group()
if group_info is not None:
group_info = np.array(group_info, dtype=np.int32, copy=False)
group_info = np.asarray(group_info, dtype=np.int32)
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
else:
flatted_group = np.zeros(num_data, dtype=np.int32)
@ -525,7 +526,7 @@ def _make_n_folds(
if not SKLEARN_INSTALLED:
raise LightGBMError("scikit-learn is required for ranking cv")
# ranking task, split according to groups
group_info = np.array(full_data.get_group(), dtype=np.int32, copy=False)
group_info = np.asarray(full_data.get_group(), dtype=np.int32)
flatted_group = np.repeat(range(len(group_info)), repeats=group_info)
group_kfold = _LGBMGroupKFold(n_splits=nfold)
folds = group_kfold.split(X=np.empty(num_data), groups=flatted_group)

Просмотреть файл

@ -1,5 +1,6 @@
# coding: utf-8
"""Find the path to LightGBM dynamic library files."""
from pathlib import Path
from platform import system
from typing import List

Просмотреть файл

@ -1,5 +1,6 @@
# coding: utf-8
"""Plotting library."""
import math
from copy import deepcopy
from io import BytesIO

Просмотреть файл

@ -1,5 +1,6 @@
# coding: utf-8
"""Scikit-learn wrapper interface for LightGBM."""
import copy
from inspect import signature
from pathlib import Path
@ -40,7 +41,6 @@ from .compat import (
_LGBMModelBase,
_LGBMRegressorBase,
dt_DataTable,
np_random_Generator,
pd_DataFrame,
)
from .engine import train
@ -454,6 +454,30 @@ _lgbmmodel_doc_predict = """
"""
def _extract_evaluation_meta_data(
*,
collection: Optional[Union[Dict[Any, Any], List[Any]]],
name: str,
i: int,
) -> Optional[Any]:
"""Try to extract the ith element of one of the ``eval_*`` inputs."""
if collection is None:
return None
elif isinstance(collection, list):
# It's possible, for example, to pass 3 eval sets through `eval_set`,
# but only 1 init_score through `eval_init_score`.
#
# This if-else accounts for that possibility.
if len(collection) > i:
return collection[i]
else:
return None
elif isinstance(collection, dict):
return collection.get(i, None)
else:
raise TypeError(f"{name} should be dict or list")
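A small illustration, with hypothetical inputs, of how this helper tolerates eval_* collections that are shorter than eval_set or keyed by eval-set index:

import numpy as np

weights = [np.ones(100)]  # weights supplied for only the first eval set
_extract_evaluation_meta_data(collection=weights, name="eval_sample_weight", i=0)  # -> weights[0]
_extract_evaluation_meta_data(collection=weights, name="eval_sample_weight", i=1)  # -> None
_extract_evaluation_meta_data(collection={1: np.zeros(50)}, name="eval_init_score", i=1)  # -> the zeros array
_extract_evaluation_meta_data(collection=None, name="eval_group", i=0)  # -> None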
class LGBMModel(_LGBMModelBase):
"""Implementation of the scikit-learn API for LightGBM."""
@ -475,7 +499,7 @@ class LGBMModel(_LGBMModelBase):
colsample_bytree: float = 1.0,
reg_alpha: float = 0.0,
reg_lambda: float = 0.0,
random_state: Optional[Union[int, np.random.RandomState, "np.random.Generator"]] = None,
random_state: Optional[Union[int, np.random.RandomState, np.random.Generator]] = None,
n_jobs: Optional[int] = None,
importance_type: str = "split",
**kwargs: Any,
@ -492,6 +516,7 @@ class LGBMModel(_LGBMModelBase):
Maximum tree leaves for base learners.
max_depth : int, optional (default=-1)
Maximum tree depth for base learners, <=0 means no limit.
If setting this to a positive value, consider also changing ``num_leaves`` to ``<= 2^max_depth``.
learning_rate : float, optional (default=0.1)
Boosting learning rate.
You can use ``callbacks`` parameter of ``fit`` method to shrink/adapt learning rate
@ -738,7 +763,7 @@ class LGBMModel(_LGBMModelBase):
if isinstance(params["random_state"], np.random.RandomState):
params["random_state"] = params["random_state"].randint(np.iinfo(np.int32).max)
elif isinstance(params["random_state"], np_random_Generator):
elif isinstance(params["random_state"], np.random.Generator):
params["random_state"] = int(params["random_state"].integers(np.iinfo(np.int32).max))
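For reference, a short sketch (illustrative values only) of the three random_state forms this branch handles; the Generator path is the one that previously needed the compat shim:

import numpy as np
import lightgbm as lgb

X = np.random.default_rng(0).uniform(size=(100, 3))
y = X[:, 0] + X[:, 1]

lgb.LGBMRegressor(n_estimators=5, random_state=42).fit(X, y)                         # plain int seed
lgb.LGBMRegressor(n_estimators=5, random_state=np.random.RandomState(42)).fit(X, y)  # legacy RandomState
lgb.LGBMRegressor(n_estimators=5, random_state=np.random.default_rng(42)).fit(X, y)  # np.random.Generator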
if self._n_classes > 2:
for alias in _ConfigAliases.get("num_class"):
@ -868,17 +893,6 @@ class LGBMModel(_LGBMModelBase):
valid_sets: List[Dataset] = []
if eval_set is not None:
def _get_meta_data(collection, name, i):
if collection is None:
return None
elif isinstance(collection, list):
return collection[i] if len(collection) > i else None
elif isinstance(collection, dict):
return collection.get(i, None)
else:
raise TypeError(f"{name} should be dict or list")
if isinstance(eval_set, tuple):
eval_set = [eval_set]
for i, valid_data in enumerate(eval_set):
@ -886,8 +900,16 @@ class LGBMModel(_LGBMModelBase):
if valid_data[0] is X and valid_data[1] is y:
valid_set = train_set
else:
valid_weight = _get_meta_data(eval_sample_weight, "eval_sample_weight", i)
valid_class_weight = _get_meta_data(eval_class_weight, "eval_class_weight", i)
valid_weight = _extract_evaluation_meta_data(
collection=eval_sample_weight,
name="eval_sample_weight",
i=i,
)
valid_class_weight = _extract_evaluation_meta_data(
collection=eval_class_weight,
name="eval_class_weight",
i=i,
)
if valid_class_weight is not None:
if isinstance(valid_class_weight, dict) and self._class_map is not None:
valid_class_weight = {self._class_map[k]: v for k, v in valid_class_weight.items()}
@ -896,8 +918,16 @@ class LGBMModel(_LGBMModelBase):
valid_weight = valid_class_sample_weight
else:
valid_weight = np.multiply(valid_weight, valid_class_sample_weight)
valid_init_score = _get_meta_data(eval_init_score, "eval_init_score", i)
valid_group = _get_meta_data(eval_group, "eval_group", i)
valid_init_score = _extract_evaluation_meta_data(
collection=eval_init_score,
name="eval_init_score",
i=i,
)
valid_group = _extract_evaluation_meta_data(
collection=eval_group,
name="eval_group",
i=i,
)
valid_set = Dataset(
data=valid_data[0],
label=valid_data[1],

View file

@ -15,11 +15,11 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Scientific/Engineering :: Artificial Intelligence"
]
dependencies = [
"dataclasses ; python_version < '3.7'",
"numpy",
"numpy>=1.17.0",
"scipy"
]
description = "LightGBM Python Package"
@ -29,8 +29,8 @@ maintainers = [
]
name = "lightgbm"
readme = "README.rst"
requires-python = ">=3.6"
version = "4.3.0.99"
requires-python = ">=3.7"
version = "4.4.0.99"
[project.optional-dependencies]
arrow = [
@ -79,7 +79,7 @@ logging.level = "INFO"
sdist.reproducible = true
wheel.py-api = "py3"
experimental = false
strict-config = true
strict-config = false
minimum-version = "0.9.3"
# end:build-system
@ -156,6 +156,8 @@ select = [
"E",
# pyflakes
"F",
# NumPy-specific rules
"NPY",
# pylint
"PL",
# flake8-return: unnecessary assignment before return

View file

@ -289,14 +289,14 @@ void Config::Set(const std::unordered_map<std::string, std::string>& params) {
}
// check for conflicts
CheckParamConflict();
CheckParamConflict(params);
}
bool CheckMultiClassObjective(const std::string& objective) {
return (objective == std::string("multiclass") || objective == std::string("multiclassova"));
}
void Config::CheckParamConflict() {
void Config::CheckParamConflict(const std::unordered_map<std::string, std::string>& params) {
// check if objective, metric, and num_class match
int num_class_check = num_class;
bool objective_type_multiclass = CheckMultiClassObjective(objective) || (objective == std::string("custom") && num_class_check > 1);
@ -356,14 +356,24 @@ void Config::CheckParamConflict() {
tree_learner.c_str());
}
}
// Check max_depth and num_leaves
if (max_depth > 0) {
// max_depth defaults to -1, so max_depth>0 implies "you explicitly overrode the default"
//
// Changing max_depth while leaving num_leaves at its default (31) can lead to 2 undesirable situations:
//
// * (0 <= max_depth <= 4) it's not possible to produce a tree with 31 leaves
// - this block reduces num_leaves to 2^max_depth
// * (max_depth > 4) 31 leaves is less than a full depth-wise tree, which might lead to underfitting
// - this block warns about that
// ref: https://github.com/microsoft/LightGBM/issues/2898#issuecomment-1002860601
if (max_depth > 0 && (params.count("num_leaves") == 0 || params.at("num_leaves").empty())) {
double full_num_leaves = std::pow(2, max_depth);
if (full_num_leaves > num_leaves
&& num_leaves == kDefaultNumLeaves) {
Log::Warning("Accuracy may be bad since you didn't explicitly set num_leaves OR 2^max_depth > num_leaves."
" (num_leaves=%d).",
num_leaves);
if (full_num_leaves > num_leaves) {
Log::Warning("Provided parameters constrain tree depth (max_depth=%d) without explicitly setting 'num_leaves'. "
"This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<=%.0f) in params. "
"Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity.",
max_depth,
full_num_leaves);
}
if (full_num_leaves < num_leaves) {

View file

@ -274,7 +274,7 @@ Dataset* DatasetLoader::LoadFromFile(const char* filename, int rank, int num_mac
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), rank, num_machines, &num_global_data, &used_data_indices));
// checks whether there's a initial score file when loaded from binary data files
// the intial score file should with suffix ".bin.init"
// the initial score file should have the suffix ".bin.init"
dataset->metadata_.LoadInitialScore(bin_filename);
dataset->device_type_ = config_.device_type;
@ -344,7 +344,7 @@ Dataset* DatasetLoader::LoadFromFileAlignWithOtherDataset(const char* filename,
// load data from binary file
dataset.reset(LoadFromBinFile(filename, bin_filename.c_str(), 0, 1, &num_global_data, &used_data_indices));
// checks whether there's a initial score file when loaded from binary data files
// the intial score file should with suffix ".bin.init"
// the initial score file should have the suffix ".bin.init"
dataset->metadata_.LoadInitialScore(bin_filename);
}
// not need to check validation data

View file

@ -125,7 +125,7 @@ def load_from_mat(filename, reference):
mat = np.loadtxt(str(filename), dtype=np.float64)
label = mat[:, 0].astype(np.float32)
mat = mat[:, 1:]
data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
handle = ctypes.c_void_p()
ref = None
if reference is not None:
@ -203,7 +203,7 @@ def test_booster():
mat = data[:, 1:]
preb = np.empty(mat.shape[0], dtype=np.float64)
num_preb = ctypes.c_int64(0)
data = np.array(mat.reshape(mat.size), dtype=np.float64, copy=False)
data = np.asarray(mat.reshape(mat.size), dtype=np.float64)
LIB.LGBM_BoosterPredictForMat(
booster2,
data.ctypes.data_as(ctypes.POINTER(ctypes.c_double)),

View file

@ -0,0 +1,12 @@
import numpy as np
import pytest
@pytest.fixture(scope="function")
def rng():
return np.random.default_rng()
@pytest.fixture(scope="function")
def rng_fixed_seed():
return np.random.default_rng(seed=42)
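A minimal sketch (hypothetical test names) of how the test modules below consume these fixtures instead of seeding the global np.random state:

def test_uses_fresh_generator(rng):  # pytest injects a new np.random.Generator per test
    X = rng.uniform(size=(50, 2))
    assert X.shape == (50, 2)

def test_uses_fixed_seed(rng_fixed_seed):  # reproducible draws (seed=42) on every run
    draws = rng_fixed_seed.integers(low=0, high=10, size=3)
    assert draws.shape == (3,)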

View file

@ -20,6 +20,10 @@ if os.getenv("ALLOW_SKIP_ARROW_TESTS") == "1":
else:
import pyarrow as pa # type: ignore
assert (
lgb.compat.PYARROW_INSTALLED is True
), "'pyarrow' and its dependencies must be installed to run the arrow tests"
# ----------------------------------------------------------------------------------------------- #
# UTILITIES #
# ----------------------------------------------------------------------------------------------- #

View file

@ -9,7 +9,7 @@ from pathlib import Path
import numpy as np
import pytest
from scipy import sparse
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.datasets import dump_svmlight_file, load_svmlight_file, make_blobs
from sklearn.model_selection import train_test_split
import lightgbm as lgb
@ -136,7 +136,7 @@ def _create_sequence_from_ndarray(data, num_seq, batch_size):
@pytest.mark.parametrize("batch_size", [3, None])
@pytest.mark.parametrize("include_0_and_nan", [False, True])
@pytest.mark.parametrize("num_seq", [1, 3])
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq, rng):
params = {"bin_construct_sample_cnt": sample_count}
nrow = 50
@ -175,7 +175,6 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
# Test for validation set.
# Select some random rows as valid data.
rng = np.random.default_rng() # Pass integer to set seed when needed.
valid_idx = (rng.random(10) * nrow).astype(np.int32)
valid_data = data[valid_idx, :]
valid_X = valid_data[:, :-1]
@ -201,7 +200,7 @@ def test_sequence(tmpdir, sample_count, batch_size, include_0_and_nan, num_seq):
@pytest.mark.parametrize("num_seq", [1, 2])
def test_sequence_get_data(num_seq):
def test_sequence_get_data(num_seq, rng):
nrow = 20
ncol = 11
data = np.arange(nrow * ncol, dtype=np.float64).reshape((nrow, ncol))
@ -212,7 +211,7 @@ def test_sequence_get_data(num_seq):
seq_ds = lgb.Dataset(seqs, label=Y, params=None, free_raw_data=False).construct()
assert seq_ds.get_data() == seqs
used_indices = np.random.choice(np.arange(nrow), nrow // 3, replace=False)
used_indices = rng.choice(a=np.arange(nrow), size=nrow // 3, replace=False)
subset_data = seq_ds.subset(used_indices).construct()
np.testing.assert_array_equal(subset_data.get_data(), X[sorted(used_indices)])
@ -246,8 +245,8 @@ def test_chunked_dataset_linear():
valid_data.construct()
def test_save_dataset_subset_and_load_from_file(tmp_path):
data = np.random.rand(100, 2)
def test_save_dataset_subset_and_load_from_file(tmp_path, rng):
data = rng.standard_normal(size=(100, 2))
params = {"max_bin": 50, "min_data_in_bin": 10}
ds = lgb.Dataset(data, params=params)
ds.subset([1, 2, 3, 5, 8]).save_binary(tmp_path / "subset.bin")
@ -267,18 +266,18 @@ def test_subset_group():
assert subset_group[1] == 9
def test_add_features_throws_if_num_data_unequal():
X1 = np.random.random((100, 1))
X2 = np.random.random((10, 1))
def test_add_features_throws_if_num_data_unequal(rng):
X1 = rng.uniform(size=(100, 1))
X2 = rng.uniform(size=(10, 1))
d1 = lgb.Dataset(X1).construct()
d2 = lgb.Dataset(X2).construct()
with pytest.raises(lgb.basic.LightGBMError):
d1.add_features_from(d2)
def test_add_features_throws_if_datasets_unconstructed():
X1 = np.random.random((100, 1))
X2 = np.random.random((100, 1))
def test_add_features_throws_if_datasets_unconstructed(rng):
X1 = rng.uniform(size=(100, 1))
X2 = rng.uniform(size=(100, 1))
with pytest.raises(ValueError):
d1 = lgb.Dataset(X1)
d2 = lgb.Dataset(X2)
@ -293,8 +292,8 @@ def test_add_features_throws_if_datasets_unconstructed():
d1.add_features_from(d2)
def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
X = np.random.random((100, 5))
def test_add_features_equal_data_on_alternating_used_unused(tmp_path, rng):
X = rng.uniform(size=(100, 5))
X[:, [1, 3]] = 0
names = [f"col_{i}" for i in range(5)]
for j in range(1, 5):
@ -313,8 +312,8 @@ def test_add_features_equal_data_on_alternating_used_unused(tmp_path):
assert dtxt == d1txt
def test_add_features_same_booster_behaviour(tmp_path):
X = np.random.random((100, 5))
def test_add_features_same_booster_behaviour(tmp_path, rng):
X = rng.uniform(size=(100, 5))
X[:, [1, 3]] = 0
names = [f"col_{i}" for i in range(5)]
for j in range(1, 5):
@ -322,7 +321,7 @@ def test_add_features_same_booster_behaviour(tmp_path):
d2 = lgb.Dataset(X[:, j:], feature_name=names[j:]).construct()
d1.add_features_from(d2)
d = lgb.Dataset(X, feature_name=names).construct()
y = np.random.random(100)
y = rng.uniform(size=(100,))
d1.set_label(y)
d.set_label(y)
b1 = lgb.Booster(train_set=d1)
@ -341,11 +340,11 @@ def test_add_features_same_booster_behaviour(tmp_path):
assert dtxt == d1txt
def test_add_features_from_different_sources():
def test_add_features_from_different_sources(rng):
pd = pytest.importorskip("pandas")
n_row = 100
n_col = 5
X = np.random.random((n_row, n_col))
X = rng.uniform(size=(n_row, n_col))
xxs = [X, sparse.csr_matrix(X), pd.DataFrame(X)]
names = [f"col_{i}" for i in range(n_col)]
seq = _create_sequence_from_ndarray(X, 1, 30)
@ -380,9 +379,9 @@ def test_add_features_from_different_sources():
assert d1.feature_name == res_feature_names
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys):
def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_features(capsys, rng):
arr_a = np.zeros((100, 1), dtype=np.float32)
arr_b = np.random.normal(size=(100, 5))
arr_b = rng.uniform(size=(100, 5))
dataset_a = lgb.Dataset(arr_a).construct()
expected_msg = (
@ -402,10 +401,10 @@ def test_add_features_does_not_fail_if_initial_dataset_has_zero_informative_feat
assert dataset_a._handle.value == original_handle
def test_cegb_affects_behavior(tmp_path):
X = np.random.random((100, 5))
def test_cegb_affects_behavior(tmp_path, rng):
X = rng.uniform(size=(100, 5))
X[:, [1, 3]] = 0
y = np.random.random(100)
y = rng.uniform(size=(100,))
names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
@ -433,10 +432,10 @@ def test_cegb_affects_behavior(tmp_path):
assert basetxt != casetxt
def test_cegb_scaling_equalities(tmp_path):
X = np.random.random((100, 5))
def test_cegb_scaling_equalities(tmp_path, rng):
X = rng.uniform(size=(100, 5))
X[:, [1, 3]] = 0
y = np.random.random(100)
y = rng.uniform(size=(100,))
names = [f"col_{i}" for i in range(5)]
ds = lgb.Dataset(X, feature_name=names).construct()
ds.set_label(y)
@ -573,10 +572,10 @@ def test_dataset_construction_overwrites_user_provided_metadata_fields():
np_assert_array_equal(dtrain.get_field("weight"), expected_weight, strict=True)
def test_dataset_construction_with_high_cardinality_categorical_succeeds():
def test_dataset_construction_with_high_cardinality_categorical_succeeds(rng):
pd = pytest.importorskip("pandas")
X = pd.DataFrame({"x1": np.random.randint(0, 5_000, 10_000)})
y = np.random.rand(10_000)
X = pd.DataFrame({"x1": rng.integers(low=0, high=5_000, size=(10_000,))})
y = rng.uniform(size=(10_000,))
ds = lgb.Dataset(X, y, categorical_feature=["x1"])
ds.construct()
assert ds.num_data() == 10_000
@ -663,11 +662,11 @@ def test_choose_param_value_objective(objective_alias):
@pytest.mark.parametrize("collection", ["1d_np", "2d_np", "pd_float", "pd_str", "1d_list", "2d_list"])
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
def test_list_to_1d_numpy(collection, dtype):
def test_list_to_1d_numpy(collection, dtype, rng):
collection2y = {
"1d_np": np.random.rand(10),
"2d_np": np.random.rand(10, 1),
"pd_float": np.random.rand(10),
"1d_np": rng.uniform(size=(10,)),
"2d_np": rng.uniform(size=(10, 1)),
"pd_float": rng.uniform(size=(10,)),
"pd_str": ["a", "b"],
"1d_list": [1] * 10,
"2d_list": [[1], [2]],
@ -696,7 +695,7 @@ def test_list_to_1d_numpy(collection, dtype):
@pytest.mark.parametrize("init_score_type", ["array", "dataframe", "list"])
def test_init_score_for_multiclass_classification(init_score_type):
def test_init_score_for_multiclass_classification(init_score_type, rng):
init_score = [[i * 10 + j for j in range(3)] for i in range(10)]
if init_score_type == "array":
init_score = np.array(init_score)
@ -704,7 +703,7 @@ def test_init_score_for_multiclass_classification(init_score_type):
if not PANDAS_INSTALLED:
pytest.skip("Pandas is not installed.")
init_score = pd_DataFrame(init_score)
data = np.random.rand(10, 2)
data = rng.uniform(size=(10, 2))
ds = lgb.Dataset(data, init_score=init_score).construct()
np.testing.assert_equal(ds.get_field("init_score"), init_score)
np.testing.assert_equal(ds.init_score, init_score)
@ -741,16 +740,20 @@ def test_param_aliases():
def _bad_gradients(preds, _):
return np.random.randn(len(preds) + 1), np.random.rand(len(preds) + 1)
rng = np.random.default_rng()
# "bad" = 1 element too many
size = (len(preds) + 1,)
return rng.standard_normal(size=size), rng.uniform(size=size)
def _good_gradients(preds, _):
return np.random.randn(*preds.shape), np.random.rand(*preds.shape)
rng = np.random.default_rng()
return rng.standard_normal(size=preds.shape), rng.uniform(size=preds.shape)
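For context, a hedged sketch of a well-formed custom objective satisfying the shape contract these helpers exercise (gradient and hessian must each have exactly preds.shape); numpy is already imported as np in this module:

def custom_l2(preds, train_data):
    # gradient / hessian of 0.5 * (preds - label)^2, one value per prediction
    label = train_data.get_label()
    grad = preds - label
    hess = np.ones_like(preds)
    return grad, hess

# usage sketch, assuming `train_ds` is an lgb.Dataset:
# lgb.train({"objective": custom_l2, "verbose": -1}, train_ds, num_boost_round=5)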
def test_custom_objective_safety():
def test_custom_objective_safety(rng):
nrows = 100
X = np.random.randn(nrows, 5)
X = rng.standard_normal(size=(nrows, 5))
y_binary = np.arange(nrows) % 2
classes = [0, 1, 2]
nclass = len(classes)
@ -771,10 +774,13 @@ def test_custom_objective_safety():
@pytest.mark.parametrize("dtype", [np.float32, np.float64])
@pytest.mark.parametrize("feature_name", [["x1", "x2"], "auto"])
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name, rng):
pd = pytest.importorskip("pandas")
X = np.random.rand(10, 2).astype(dtype)
df = pd.DataFrame(X)
X = rng.uniform(size=(10, 2)).astype(dtype)
# copy=False is necessary because starting with pandas 3.0, pd.DataFrame() creates
# a copy of the input numpy array by default
# ref: https://github.com/pandas-dev/pandas/issues/58913
df = pd.DataFrame(X, copy=False)
built_data = lgb.basic._data_from_pandas(
data=df, feature_name=feature_name, categorical_feature="auto", pandas_categorical=None
)[0]
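A rough check (assuming a single float dtype and a numpy-backed frame) of the copy=False behaviour the comment above describes:

import numpy as np
import pandas as pd

X = np.ones((4, 2), dtype=np.float64)
df = pd.DataFrame(X, copy=False)  # explicitly request the no-copy path

# typically True: the frame still aliases X, so it can be handed to LightGBM without a copy
print(np.shares_memory(X, df.to_numpy()))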
@ -784,9 +790,9 @@ def test_no_copy_when_single_float_dtype_dataframe(dtype, feature_name):
@pytest.mark.parametrize("feature_name", [["x1"], [42], "auto"])
@pytest.mark.parametrize("categories", ["seen", "unseen"])
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories):
def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, categories, rng):
pd = pytest.importorskip("pandas")
X = np.random.choice(["a", "b"], 100).reshape(-1, 1)
X = rng.choice(a=["a", "b"], size=(100, 1))
column_name = "a" if feature_name == "auto" else feature_name[0]
df = pd.DataFrame(X.copy(), columns=[column_name], dtype="category")
if categories == "seen":
@ -814,15 +820,15 @@ def test_categorical_code_conversion_doesnt_modify_original_data(feature_name, c
@pytest.mark.parametrize("min_data_in_bin", [2, 10])
def test_feature_num_bin(min_data_in_bin):
def test_feature_num_bin(min_data_in_bin, rng):
X = np.vstack(
[
np.random.rand(100),
rng.uniform(size=(100,)),
np.array([1, 2] * 50),
np.array([0, 1, 2] * 33 + [0]),
np.array([1, 2] * 49 + 2 * [np.nan]),
np.zeros(100),
np.random.choice([0, 1], 100),
rng.choice(a=[0, 1], size=(100,)),
]
).T
n_continuous = X.shape[1] - 1
@ -862,9 +868,9 @@ def test_feature_num_bin(min_data_in_bin):
ds.feature_num_bin(num_features)
def test_feature_num_bin_with_max_bin_by_feature():
X = np.random.rand(100, 3)
max_bin_by_feature = np.random.randint(3, 30, size=X.shape[1])
def test_feature_num_bin_with_max_bin_by_feature(rng):
X = rng.uniform(size=(100, 3))
max_bin_by_feature = rng.integers(low=3, high=30, size=X.shape[1])
ds = lgb.Dataset(X, params={"max_bin_by_feature": max_bin_by_feature}).construct()
actual_num_bins = [ds.feature_num_bin(i) for i in range(X.shape[1])]
np.testing.assert_equal(actual_num_bins, max_bin_by_feature)
@ -882,8 +888,62 @@ def test_set_leaf_output():
np.testing.assert_allclose(bst.predict(X), y_pred + 1)
def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset():
def test_feature_names_are_set_correctly_when_no_feature_names_passed_into_Dataset(rng):
ds = lgb.Dataset(
data=np.random.randn(100, 3),
data=rng.standard_normal(size=(100, 3)),
)
assert ds.construct().feature_name == ["Column_0", "Column_1", "Column_2"]
# NOTE: this intentionally contains values where num_leaves <, ==, and > (2^max_depth)
@pytest.mark.parametrize(("max_depth", "num_leaves"), [(-1, 3), (-1, 50), (5, 3), (5, 31), (5, 32), (8, 3), (8, 31)])
def test_max_depth_warning_is_not_raised_if_num_leaves_is_also_provided(capsys, num_leaves, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
lgb.Booster(
params={
"objective": "binary",
"max_depth": max_depth,
"num_leaves": num_leaves,
"num_iterations": 1,
"verbose": 0,
},
train_set=lgb.Dataset(X, label=y),
)
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
# NOTE: max_depth < 5 is significant here because the default for num_leaves=31. With max_depth=5,
# a full depth-wise tree would have 2^5 = 32 leaves.
@pytest.mark.parametrize("max_depth", [1, 2, 3, 4])
def test_max_depth_warning_is_not_raised_if_max_depth_gt_1_and_lt_5_and_num_leaves_omitted(capsys, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
lgb.Booster(
params={
"objective": "binary",
"max_depth": max_depth,
"num_iterations": 1,
"verbose": 0,
},
train_set=lgb.Dataset(X, label=y),
)
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
@pytest.mark.parametrize("max_depth", [5, 6, 7, 8, 9])
def test_max_depth_warning_is_raised_if_max_depth_gte_5_and_num_leaves_omitted(capsys, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
lgb.Booster(
params={
"objective": "binary",
"max_depth": max_depth,
"num_iterations": 1,
"verbose": 0,
},
train_set=lgb.Dataset(X, label=y),
)
expected_warning = (
f"[LightGBM] [Warning] Provided parameters constrain tree depth (max_depth={max_depth}) without explicitly "
f"setting 'num_leaves'. This can lead to underfitting. To resolve this warning, pass 'num_leaves' (<={2**max_depth}) "
"in params. Alternatively, pass (max_depth=-1) and just use 'num_leaves' to constrain model complexity."
)
assert expected_warning in capsys.readouterr().out
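In user code, either of the following parameter combinations keeps that check quiet (values are only illustrative):

# would warn: max_depth=6 with the default num_leaves=31, and 31 < 2^6 = 64
params_warns = {"objective": "binary", "max_depth": 6}

# option 1: constrain depth and size the leaf budget to match it
params_depth = {"objective": "binary", "max_depth": 6, "num_leaves": 2**6}

# option 2: leave depth unconstrained and control complexity with num_leaves alone
params_leaves = {"objective": "binary", "max_depth": -1, "num_leaves": 31}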

View file

@ -550,7 +550,7 @@ def test_multi_class_error():
@pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_auc_mu():
def test_auc_mu(rng):
# should give same result as binary auc for 2 classes
X, y = load_digits(n_class=10, return_X_y=True)
y_new = np.zeros((len(y)))
@ -578,7 +578,7 @@ def test_auc_mu():
assert results_auc_mu["training"]["auc_mu"][-1] == pytest.approx(0.5)
# test that weighted data gives different auc_mu
lgb_X = lgb.Dataset(X, label=y)
lgb_X_weighted = lgb.Dataset(X, label=y, weight=np.abs(np.random.normal(size=y.shape)))
lgb_X_weighted = lgb.Dataset(X, label=y, weight=np.abs(rng.standard_normal(size=y.shape)))
results_unweighted = {}
results_weighted = {}
params = dict(params, num_classes=10, num_leaves=5)
@ -1432,9 +1432,9 @@ def test_feature_name():
assert feature_names == gbm.feature_name()
def test_feature_name_with_non_ascii():
X_train = np.random.normal(size=(100, 4))
y_train = np.random.random(100)
def test_feature_name_with_non_ascii(rng):
X_train = rng.normal(size=(100, 4))
y_train = rng.normal(size=(100,))
# This has non-ascii strings.
feature_names = ["F_零", "F_一", "F_二", "F_三"]
params = {"verbose": -1}
@ -1448,9 +1448,14 @@ def test_feature_name_with_non_ascii():
assert feature_names == gbm2.feature_name()
def test_parameters_are_loaded_from_model_file(tmp_path, capsys):
X = np.hstack([np.random.rand(100, 1), np.random.randint(0, 5, (100, 2))])
y = np.random.rand(100)
def test_parameters_are_loaded_from_model_file(tmp_path, capsys, rng):
X = np.hstack(
[
rng.uniform(size=(100, 1)),
rng.integers(low=0, high=5, size=(100, 2)),
]
)
y = rng.uniform(size=(100,))
ds = lgb.Dataset(X, y)
params = {
"bagging_fraction": 0.8,
@ -1702,29 +1707,29 @@ def test_all_expected_params_are_written_out_to_model_text(tmp_path):
assert param_str in model_txt_from_memory
def test_pandas_categorical():
# why fixed seed?
# sometimes there is no difference how cols are treated (cat or not cat)
def test_pandas_categorical(rng_fixed_seed):
pd = pytest.importorskip("pandas")
np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat)
X = pd.DataFrame(
{
"A": np.random.permutation(["a", "b", "c", "d"] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),
"A": rng_fixed_seed.permutation(["a", "b", "c", "d"] * 75), # str
"B": rng_fixed_seed.permutation([1, 2, 3] * 100), # int
"C": rng_fixed_seed.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": rng_fixed_seed.permutation([True, False] * 150), # bool
"E": pd.Categorical(rng_fixed_seed.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),
}
) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
y = rng_fixed_seed.permutation([0, 1] * 150)
X_test = pd.DataFrame(
{
"A": np.random.permutation(["a", "b", "e"] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(np.random.permutation(["z", "y"] * 30), ordered=True),
"A": rng_fixed_seed.permutation(["a", "b", "e"] * 20), # unseen category
"B": rng_fixed_seed.permutation([1, 3] * 30),
"C": rng_fixed_seed.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": rng_fixed_seed.permutation([True, False] * 30),
"E": pd.Categorical(rng_fixed_seed.permutation(["z", "y"] * 30), ordered=True),
}
)
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype("category")
@ -1786,21 +1791,21 @@ def test_pandas_categorical():
assert gbm7.pandas_categorical == cat_values
def test_pandas_sparse():
def test_pandas_sparse(rng):
pd = pytest.importorskip("pandas")
X = pd.DataFrame(
{
"A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150)),
"A": pd.arrays.SparseArray(rng.permutation([0, 1, 2] * 100)),
"B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.arrays.SparseArray(rng.permutation([True, False] * 150)),
}
)
y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150)))
y = pd.Series(pd.arrays.SparseArray(rng.permutation([0, 1] * 150)))
X_test = pd.DataFrame(
{
"A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30)),
"A": pd.arrays.SparseArray(rng.permutation([0, 2] * 30)),
"B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.arrays.SparseArray(rng.permutation([True, False] * 30)),
}
)
for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
@ -1816,9 +1821,9 @@ def test_pandas_sparse():
np.testing.assert_allclose(pred_sparse, pred_dense)
def test_reference_chain():
X = np.random.normal(size=(100, 2))
y = np.random.normal(size=100)
def test_reference_chain(rng):
X = rng.normal(size=(100, 2))
y = rng.normal(size=(100,))
tmp_dat = lgb.Dataset(X, y)
# take subsets and train
tmp_dat_train = tmp_dat.subset(np.arange(80))
@ -1940,28 +1945,28 @@ def test_contribs_sparse_multiclass():
np.testing.assert_allclose(contribs_csc_array, contribs_dense)
@pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason="not enough RAM")
def test_int32_max_sparse_contribs():
params = {"objective": "binary"}
train_features = np.random.rand(100, 1000)
train_targets = [0] * 50 + [1] * 50
lgb_train = lgb.Dataset(train_features, train_targets)
gbm = lgb.train(params, lgb_train, num_boost_round=2)
csr_input_shape = (3000000, 1000)
test_features = csr_matrix(csr_input_shape)
for i in range(0, csr_input_shape[0], csr_input_shape[0] // 6):
for j in range(0, 1000, 100):
test_features[i, j] = random.random()
y_pred_csr = gbm.predict(test_features, pred_contrib=True)
# Note there is an extra column added to the output for the expected value
csr_output_shape = (csr_input_shape[0], csr_input_shape[1] + 1)
assert y_pred_csr.shape == csr_output_shape
y_pred_csc = gbm.predict(test_features.tocsc(), pred_contrib=True)
# Note output CSC shape should be same as CSR output shape
assert y_pred_csc.shape == csr_output_shape
# @pytest.mark.skipif(psutil.virtual_memory().available / 1024 / 1024 / 1024 < 3, reason="not enough RAM")
# def test_int32_max_sparse_contribs(rng):
# params = {"objective": "binary"}
# train_features = rng.uniform(size=(100, 1000))
# train_targets = [0] * 50 + [1] * 50
# lgb_train = lgb.Dataset(train_features, train_targets)
# gbm = lgb.train(params, lgb_train, num_boost_round=2)
# csr_input_shape = (3000000, 1000)
# test_features = csr_matrix(csr_input_shape)
# for i in range(0, csr_input_shape[0], csr_input_shape[0] // 6):
# for j in range(0, 1000, 100):
# test_features[i, j] = random.random()
# y_pred_csr = gbm.predict(test_features, pred_contrib=True)
# # Note there is an extra column added to the output for the expected value
# csr_output_shape = (csr_input_shape[0], csr_input_shape[1] + 1)
# assert y_pred_csr.shape == csr_output_shape
# y_pred_csc = gbm.predict(test_features.tocsc(), pred_contrib=True)
# # Note output CSC shape should be same as CSR output shape
# assert y_pred_csc.shape == csr_output_shape
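As the commented-out test above notes, predict(..., pred_contrib=True) returns one column per feature plus one extra column for the expected value; a small dense-input sketch of that shape contract:

import numpy as np
import lightgbm as lgb

X = np.random.default_rng(0).uniform(size=(200, 10))
y = (X[:, 0] > 0.5).astype(int)
bst = lgb.train({"objective": "binary", "verbose": -1}, lgb.Dataset(X, label=y), num_boost_round=3)

contribs = bst.predict(X, pred_contrib=True)
assert contribs.shape == (200, 10 + 1)  # n_features + 1, last column is the expected value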
def test_sliced_data():
def test_sliced_data(rng):
def train_and_get_predictions(features, labels):
dataset = lgb.Dataset(features, label=labels)
lgb_params = {
@ -1977,7 +1982,7 @@ def test_sliced_data():
return gbm.predict(features)
num_samples = 100
features = np.random.rand(num_samples, 5)
features = rng.uniform(size=(num_samples, 5))
positive_samples = int(num_samples * 0.25)
labels = np.append(
np.ones(positive_samples, dtype=np.float32), np.zeros(num_samples - positive_samples, dtype=np.float32)
@ -2011,13 +2016,13 @@ def test_sliced_data():
np.testing.assert_allclose(origin_pred, sliced_pred)
def test_init_with_subset():
data = np.random.random((50, 2))
def test_init_with_subset(rng):
data = rng.uniform(size=(50, 2))
y = [1] * 25 + [0] * 25
lgb_train = lgb.Dataset(data, y, free_raw_data=False)
subset_index_1 = np.random.choice(np.arange(50), 30, replace=False)
subset_index_1 = rng.choice(a=np.arange(50), size=30, replace=False)
subset_data_1 = lgb_train.subset(subset_index_1)
subset_index_2 = np.random.choice(np.arange(50), 20, replace=False)
subset_index_2 = rng.choice(a=np.arange(50), size=20, replace=False)
subset_data_2 = lgb_train.subset(subset_index_2)
params = {"objective": "binary", "verbose": -1}
init_gbm = lgb.train(params=params, train_set=subset_data_1, num_boost_round=10, keep_training_booster=True)
@ -2037,9 +2042,9 @@ def test_init_with_subset():
assert subset_data_4.get_data() == "lgb_train_data.bin"
def test_training_on_constructed_subset_without_params():
X = np.random.random((100, 10))
y = np.random.random(100)
def test_training_on_constructed_subset_without_params(rng):
X = rng.uniform(size=(100, 10))
y = rng.uniform(size=(100,))
lgb_data = lgb.Dataset(X, y)
subset_indices = [1, 2, 3, 4]
subset = lgb_data.subset(subset_indices).construct()
@ -2051,9 +2056,10 @@ def test_training_on_constructed_subset_without_params():
def generate_trainset_for_monotone_constraints_tests(x3_to_category=True):
number_of_dpoints = 3000
x1_positively_correlated_with_y = np.random.random(size=number_of_dpoints)
x2_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
x3_negatively_correlated_with_y = np.random.random(size=number_of_dpoints)
rng = np.random.default_rng()
x1_positively_correlated_with_y = rng.uniform(size=number_of_dpoints)
x2_negatively_correlated_with_y = rng.uniform(size=number_of_dpoints)
x3_negatively_correlated_with_y = rng.uniform(size=number_of_dpoints)
x = np.column_stack(
(
x1_positively_correlated_with_y,
@ -2062,8 +2068,8 @@ def generate_trainset_for_monotone_constraints_tests(x3_to_category=True):
)
)
zs = np.random.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
scales = 10.0 * (np.random.random(6) + 0.5)
zs = rng.normal(loc=0.0, scale=0.01, size=number_of_dpoints)
scales = 10.0 * (rng.uniform(size=6) + 0.5)
y = (
scales[0] * x1_positively_correlated_with_y
+ np.sin(scales[1] * np.pi * x1_positively_correlated_with_y)
@ -2265,9 +2271,8 @@ def test_max_bin_by_feature():
assert len(np.unique(est.predict(X))) == 3
def test_small_max_bin():
np.random.seed(0)
y = np.random.choice([0, 1], 100)
def test_small_max_bin(rng_fixed_seed):
y = rng_fixed_seed.choice([0, 1], 100)
x = np.ones((100, 1))
x[:30, 0] = -1
x[60:, 0] = 2
@ -2278,7 +2283,6 @@ def test_small_max_bin():
params["max_bin"] = 3
lgb_x = lgb.Dataset(x, label=y)
lgb.train(params, lgb_x, num_boost_round=5)
np.random.seed() # reset seed
def test_refit():
@ -2293,14 +2297,14 @@ def test_refit():
assert err_pred > new_err_pred
def test_refit_dataset_params():
def test_refit_dataset_params(rng):
# check refit accepts dataset_params
X, y = load_breast_cancer(return_X_y=True)
lgb_train = lgb.Dataset(X, y, init_score=np.zeros(y.size))
train_params = {"objective": "binary", "verbose": -1, "seed": 123}
gbm = lgb.train(train_params, lgb_train, num_boost_round=10)
non_weight_err_pred = log_loss(y, gbm.predict(X))
refit_weight = np.random.rand(y.shape[0])
refit_weight = rng.uniform(size=(y.shape[0],))
dataset_params = {
"max_bin": 260,
"min_data_in_bin": 5,
@ -3011,7 +3015,7 @@ def test_model_size():
@pytest.mark.skipif(
getenv("TASK", "") == "cuda", reason="Skip due to differences in implementation details of CUDA version"
)
def test_get_split_value_histogram():
def test_get_split_value_histogram(rng_fixed_seed):
X, y = make_synthetic_regression()
X = np.repeat(X, 3, axis=0)
y = np.repeat(y, 3, axis=0)
@ -3351,7 +3355,7 @@ def test_binning_same_sign():
assert predicted[1] == pytest.approx(predicted[2])
def test_dataset_update_params():
def test_dataset_update_params(rng):
default_params = {
"max_bin": 100,
"max_bin_by_feature": [20, 10],
@ -3400,8 +3404,8 @@ def test_dataset_update_params():
"linear_tree": True,
"precise_float_parser": False,
}
X = np.random.random((100, 2))
y = np.random.random(100)
X = rng.uniform(size=(100, 2))
y = rng.uniform(size=(100,))
# decreasing without freeing raw data is allowed
lgb_data = lgb.Dataset(X, y, params=default_params, free_raw_data=False).construct()
@ -3443,12 +3447,12 @@ def test_dataset_update_params():
lgb.train(new_params, lgb_data, num_boost_round=3)
def test_dataset_params_with_reference():
def test_dataset_params_with_reference(rng):
default_params = {"max_bin": 100}
X = np.random.random((100, 2))
y = np.random.random(100)
X_val = np.random.random((100, 2))
y_val = np.random.random(100)
X = rng.uniform(size=(100, 2))
y = rng.uniform(size=(100,))
X_val = rng.uniform(size=(100, 2))
y_val = rng.uniform(size=(100,))
lgb_train = lgb.Dataset(X, y, params=default_params, free_raw_data=False).construct()
lgb_val = lgb.Dataset(X_val, y_val, reference=lgb_train, free_raw_data=False).construct()
assert lgb_train.get_params() == default_params
@ -3486,7 +3490,7 @@ def test_path_smoothing():
assert err < err_new
def test_trees_to_dataframe():
def test_trees_to_dataframe(rng):
pytest.importorskip("pandas")
def _imptcs_to_numpy(X, impcts_dict):
@ -3516,7 +3520,7 @@ def test_trees_to_dataframe():
# test edge case with one leaf
X = np.ones((10, 2))
y = np.random.rand(10)
y = rng.uniform(size=(10,))
data = lgb.Dataset(X, label=y)
bst = lgb.train({"objective": "binary", "verbose": -1}, data, num_trees)
tree_df = bst.trees_to_dataframe()
@ -3574,11 +3578,10 @@ def test_interaction_constraints():
)
def test_linear_trees_num_threads():
def test_linear_trees_num_threads(rng_fixed_seed):
# check that number of threads does not affect result
np.random.seed(0)
x = np.arange(0, 1000, 0.1)
y = 2 * x + np.random.normal(0, 0.1, len(x))
y = 2 * x + rng_fixed_seed.normal(loc=0, scale=0.1, size=(len(x),))
x = x[:, np.newaxis]
lgb_train = lgb.Dataset(x, label=y)
params = {"verbose": -1, "objective": "regression", "seed": 0, "linear_tree": True, "num_threads": 2}
@ -3590,11 +3593,10 @@ def test_linear_trees_num_threads():
np.testing.assert_allclose(pred1, pred2)
def test_linear_trees(tmp_path):
def test_linear_trees(tmp_path, rng_fixed_seed):
# check that setting linear_tree=True fits better than ordinary trees when data has linear relationship
np.random.seed(0)
x = np.arange(0, 100, 0.1)
y = 2 * x + np.random.normal(0, 0.1, len(x))
y = 2 * x + rng_fixed_seed.normal(0, 0.1, len(x))
x = x[:, np.newaxis]
lgb_train = lgb.Dataset(x, label=y)
params = {"verbose": -1, "metric": "mse", "seed": 0, "num_leaves": 2}
@ -4099,21 +4101,20 @@ def test_record_evaluation_with_cv(train_metric):
np.testing.assert_allclose(cv_hist[key], eval_result[dataset][f"{metric}-{agg}"])
def test_pandas_with_numpy_regular_dtypes():
def test_pandas_with_numpy_regular_dtypes(rng_fixed_seed):
pd = pytest.importorskip("pandas")
uints = ["uint8", "uint16", "uint32", "uint64"]
ints = ["int8", "int16", "int32", "int64"]
bool_and_floats = ["bool", "float16", "float32", "float64"]
rng = np.random.RandomState(42)
n_samples = 100
# data as float64
df = pd.DataFrame(
{
"x1": rng.randint(0, 2, n_samples),
"x2": rng.randint(1, 3, n_samples),
"x3": 10 * rng.randint(1, 3, n_samples),
"x4": 100 * rng.randint(1, 3, n_samples),
"x1": rng_fixed_seed.integers(low=0, high=2, size=n_samples),
"x2": rng_fixed_seed.integers(low=1, high=3, size=n_samples),
"x3": 10 * rng_fixed_seed.integers(low=1, high=3, size=n_samples),
"x4": 100 * rng_fixed_seed.integers(low=1, high=3, size=n_samples),
}
)
df = df.astype(np.float64)
@ -4139,15 +4140,14 @@ def test_pandas_with_numpy_regular_dtypes():
np.testing.assert_allclose(preds, preds2)
def test_pandas_nullable_dtypes():
def test_pandas_nullable_dtypes(rng_fixed_seed):
pd = pytest.importorskip("pandas")
rng = np.random.RandomState(0)
df = pd.DataFrame(
{
"x1": rng.randint(1, 3, size=100),
"x1": rng_fixed_seed.integers(low=1, high=3, size=100),
"x2": np.linspace(-1, 1, 100),
"x3": pd.arrays.SparseArray(rng.randint(0, 11, size=100)),
"x4": rng.rand(100) < 0.5,
"x3": pd.arrays.SparseArray(rng_fixed_seed.integers(low=0, high=11, size=100)),
"x4": rng_fixed_seed.uniform(size=(100,)) < 0.5,
}
)
# introduce some missing values
@ -4219,7 +4219,7 @@ def test_boost_from_average_with_single_leaf_trees():
assert y.min() <= mean_preds <= y.max()
def test_cegb_split_buffer_clean():
def test_cegb_split_buffer_clean(rng_fixed_seed):
# modified from https://github.com/microsoft/LightGBM/issues/3679#issuecomment-938652811
# and https://github.com/microsoft/LightGBM/pull/5087
# test that the ``splits_per_leaf_`` of CEGB is cleaned before training a new tree
@ -4228,11 +4228,9 @@ def test_cegb_split_buffer_clean():
# Check failed: (best_split_info.left_count) > (0)
R, C = 1000, 100
seed = 29
np.random.seed(seed)
data = np.random.randn(R, C)
data = rng_fixed_seed.standard_normal(size=(R, C))
for i in range(1, C):
data[i] += data[0] * np.random.randn()
data[i] += data[0] * rng_fixed_seed.standard_normal()
N = int(0.8 * len(data))
train_data = data[:N]

View file

@ -340,7 +340,7 @@ def test_grid_search():
assert evals_result == grid.best_estimator_.evals_result_
def test_random_search():
def test_random_search(rng):
X, y = load_iris(return_X_y=True)
y = y.astype(str)  # utilize label encoder at its max power
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
@ -349,8 +349,8 @@ def test_random_search():
params = {"subsample": 0.8, "subsample_freq": 1}
param_dist = {
"boosting_type": ["rf", "gbdt"],
"n_estimators": [np.random.randint(low=3, high=10) for i in range(n_iter)],
"reg_alpha": [np.random.uniform(low=0.01, high=0.06) for i in range(n_iter)],
"n_estimators": rng.integers(low=3, high=10, size=(n_iter,)).tolist(),
"reg_alpha": rng.uniform(low=0.01, high=0.06, size=(n_iter,)).tolist(),
}
fit_params = {"eval_set": [(X_val, y_val)], "eval_metric": constant_metric, "callbacks": [lgb.early_stopping(2)]}
rand = RandomizedSearchCV(
@ -556,29 +556,29 @@ def test_feature_importances_type():
assert importance_split_top1 != importance_gain_top1
def test_pandas_categorical():
# why fixed seed?
# sometimes there is no difference how cols are treated (cat or not cat)
def test_pandas_categorical(rng_fixed_seed):
pd = pytest.importorskip("pandas")
np.random.seed(42) # sometimes there is no difference how cols are treated (cat or not cat)
X = pd.DataFrame(
{
"A": np.random.permutation(["a", "b", "c", "d"] * 75), # str
"B": np.random.permutation([1, 2, 3] * 100), # int
"C": np.random.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": np.random.permutation([True, False] * 150), # bool
"E": pd.Categorical(np.random.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),
"A": rng_fixed_seed.permutation(["a", "b", "c", "d"] * 75), # str
"B": rng_fixed_seed.permutation([1, 2, 3] * 100), # int
"C": rng_fixed_seed.permutation([0.1, 0.2, -0.1, -0.1, 0.2] * 60), # float
"D": rng_fixed_seed.permutation([True, False] * 150), # bool
"E": pd.Categorical(rng_fixed_seed.permutation(["z", "y", "x", "w", "v"] * 60), ordered=True),
}
) # str and ordered categorical
y = np.random.permutation([0, 1] * 150)
y = rng_fixed_seed.permutation([0, 1] * 150)
X_test = pd.DataFrame(
{
"A": np.random.permutation(["a", "b", "e"] * 20), # unseen category
"B": np.random.permutation([1, 3] * 30),
"C": np.random.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": np.random.permutation([True, False] * 30),
"E": pd.Categorical(np.random.permutation(["z", "y"] * 30), ordered=True),
"A": rng_fixed_seed.permutation(["a", "b", "e"] * 20), # unseen category
"B": rng_fixed_seed.permutation([1, 3] * 30),
"C": rng_fixed_seed.permutation([0.1, -0.1, 0.2, 0.2] * 15),
"D": rng_fixed_seed.permutation([True, False] * 30),
"E": pd.Categorical(rng_fixed_seed.permutation(["z", "y"] * 30), ordered=True),
}
)
np.random.seed() # reset seed
cat_cols_actual = ["A", "B", "C", "D"]
cat_cols_to_store = cat_cols_actual + ["E"]
X[cat_cols_actual] = X[cat_cols_actual].astype("category")
@ -620,21 +620,21 @@ def test_pandas_categorical():
assert gbm6.booster_.pandas_categorical == cat_values
def test_pandas_sparse():
def test_pandas_sparse(rng):
pd = pytest.importorskip("pandas")
X = pd.DataFrame(
{
"A": pd.arrays.SparseArray(np.random.permutation([0, 1, 2] * 100)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 150)),
"A": pd.arrays.SparseArray(rng.permutation([0, 1, 2] * 100)),
"B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1, 0.2] * 60)),
"C": pd.arrays.SparseArray(rng.permutation([True, False] * 150)),
}
)
y = pd.Series(pd.arrays.SparseArray(np.random.permutation([0, 1] * 150)))
y = pd.Series(pd.arrays.SparseArray(rng.permutation([0, 1] * 150)))
X_test = pd.DataFrame(
{
"A": pd.arrays.SparseArray(np.random.permutation([0, 2] * 30)),
"B": pd.arrays.SparseArray(np.random.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.arrays.SparseArray(np.random.permutation([True, False] * 30)),
"A": pd.arrays.SparseArray(rng.permutation([0, 2] * 30)),
"B": pd.arrays.SparseArray(rng.permutation([0.0, 0.1, 0.2, -0.1] * 15)),
"C": pd.arrays.SparseArray(rng.permutation([True, False] * 30)),
}
)
for dtype in pd.concat([X.dtypes, X_test.dtypes, pd.Series(y.dtypes)]):
@ -1073,11 +1073,11 @@ def test_multiple_eval_metrics():
assert "binary_logloss" in gbm.evals_result_["training"]
def test_nan_handle():
def test_nan_handle(rng):
nrows = 100
ncols = 10
X = np.random.randn(nrows, ncols)
y = np.random.randn(nrows) + np.full(nrows, 1e30)
X = rng.standard_normal(size=(nrows, ncols))
y = rng.standard_normal(size=(nrows,)) + np.full(nrows, 1e30)
weight = np.zeros(nrows)
params = {"n_estimators": 20, "verbose": -1}
params_fit = {"X": X, "y": y, "sample_weight": weight, "eval_set": (X, y), "callbacks": [lgb.early_stopping(5)]}
@ -1276,6 +1276,20 @@ def test_check_is_fitted():
check_is_fitted(model)
@pytest.mark.parametrize("estimator_class", [lgb.LGBMModel, lgb.LGBMClassifier, lgb.LGBMRegressor, lgb.LGBMRanker])
@pytest.mark.parametrize("max_depth", [3, 4, 5, 8])
def test_max_depth_warning_is_never_raised(capsys, estimator_class, max_depth):
X, y = make_blobs(n_samples=1_000, n_features=1, centers=2)
params = {"n_estimators": 1, "max_depth": max_depth, "verbose": 0}
if estimator_class is lgb.LGBMModel:
estimator_class(**{**params, "objective": "binary"}).fit(X, y)
elif estimator_class is lgb.LGBMRanker:
estimator_class(**params).fit(X, y, group=np.ones(X.shape[0]))
else:
estimator_class(**params).fit(X, y)
assert "Provided parameters constrain tree depth" not in capsys.readouterr().out
@parametrize_with_checks([lgb.LGBMClassifier(), lgb.LGBMRegressor()])
def test_sklearn_integration(estimator, check):
estimator.set_params(min_child_samples=1, min_data_in_bin=1)
@ -1410,13 +1424,13 @@ def test_validate_features(task):
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_Series", "pd_DataFrame"])
@pytest.mark.parametrize("task", ["binary-classification", "multiclass-classification", "regression"])
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task):
def test_classification_and_regression_minimally_work_with_all_all_accepted_data_types(X_type, y_type, task, rng):
if any(t.startswith("pd_") for t in [X_type, y_type]) and not PANDAS_INSTALLED:
pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type]) and not DATATABLE_INSTALLED:
pytest.skip("datatable is not installed")
X, y, g = _create_data(task, n_samples=2_000)
weights = np.abs(np.random.randn(y.shape[0]))
weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
if task == "binary-classification" or task == "regression":
init_score = np.full_like(y, np.mean(y))
@ -1487,13 +1501,13 @@ def test_classification_and_regression_minimally_work_with_all_all_accepted_data
@pytest.mark.parametrize("X_type", ["dt_DataTable", "list2d", "numpy", "scipy_csc", "scipy_csr", "pd_DataFrame"])
@pytest.mark.parametrize("y_type", ["list1d", "numpy", "pd_DataFrame", "pd_Series"])
@pytest.mark.parametrize("g_type", ["list1d_float", "list1d_int", "numpy", "pd_Series"])
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type):
def test_ranking_minimally_works_with_all_all_accepted_data_types(X_type, y_type, g_type, rng):
if any(t.startswith("pd_") for t in [X_type, y_type, g_type]) and not PANDAS_INSTALLED:
pytest.skip("pandas is not installed")
if any(t.startswith("dt_") for t in [X_type, y_type, g_type]) and not DATATABLE_INSTALLED:
pytest.skip("datatable is not installed")
X, y, g = _create_data(task="ranking", n_samples=1_000)
weights = np.abs(np.random.randn(y.shape[0]))
weights = np.abs(rng.standard_normal(size=(y.shape[0],)))
init_score = np.full_like(y, np.mean(y))
X_valid = X * 2