зеркало из https://github.com/microsoft/LightGBM.git
[R-package] factor out {ggplot2} (#3224)
* more changes * factor out ggplot2 * update CI * remove library() * linting * reduce NOTEs on Windows
This commit is contained in:
Родитель
58b49dd8cd
Коммит
9f52282d0b
|
@ -148,7 +148,7 @@ if grep -q -R "WARNING" "$LOG_FILE_NAME"; then
|
|||
exit -1
|
||||
fi
|
||||
|
||||
ALLOWED_CHECK_NOTES=3
|
||||
ALLOWED_CHECK_NOTES=2
|
||||
NUM_CHECK_NOTES=$(
|
||||
cat ${LOG_FILE_NAME} \
|
||||
| grep -e '^Status: .* NOTE.*' \
|
||||
|
|
|
@ -157,7 +157,7 @@ if ($env:COMPILER -ne "MSVC") {
|
|||
$note_str = Get-Content -Path "${LOG_FILE_NAME}" | Select-String -Pattern '.*Status.* NOTE' | Out-String ; Check-Output $?
|
||||
$relevant_line = $note_str -match '(\d+) NOTE'
|
||||
$NUM_CHECK_NOTES = $matches[1]
|
||||
$ALLOWED_CHECK_NOTES = 3
|
||||
$ALLOWED_CHECK_NOTES = 2
|
||||
if ([int]$NUM_CHECK_NOTES -gt $ALLOWED_CHECK_NOTES) {
|
||||
Write-Output "Found ${NUM_CHECK_NOTES} NOTEs from R CMD check. Only ${ALLOWED_CHECK_NOTES} are allowed"
|
||||
Check-Output $False
|
||||
|
|
|
@ -25,7 +25,6 @@ BugReports: https://github.com/Microsoft/LightGBM/issues
|
|||
NeedsCompilation: yes
|
||||
Biarch: false
|
||||
Suggests:
|
||||
ggplot2 (>= 1.0.1),
|
||||
processx,
|
||||
testthat
|
||||
Depends:
|
||||
|
|
|
@ -2,11 +2,81 @@
|
|||
# Obviously, we are in a controlled environment, without issues (real rules).
|
||||
# Do not do this in a real scenario.
|
||||
|
||||
# First, we load our libraries
|
||||
library(lightgbm)
|
||||
library(ggplot2)
|
||||
|
||||
# Second, we load our data
|
||||
# define helper functions for creating plots
|
||||
|
||||
# output of `RColorBrewer::brewer.pal(10, "RdYlGn")`, hardcooded here to avoid a dependency
|
||||
.diverging_palette <- c(
|
||||
"#A50026", "#D73027", "#F46D43", "#FDAE61", "#FEE08B"
|
||||
, "#D9EF8B", "#A6D96A", "#66BD63", "#1A9850", "#006837"
|
||||
)
|
||||
|
||||
.prediction_depth_plot <- function(df) {
|
||||
plot(
|
||||
x = df$X
|
||||
, y = df$Y
|
||||
, type = "p"
|
||||
, main = "Prediction Depth"
|
||||
, xlab = "Leaf Bin"
|
||||
, ylab = "Prediction Probability"
|
||||
, pch = 19L
|
||||
, col = .diverging_palette[df$binned + 1L]
|
||||
)
|
||||
legend(
|
||||
"topright"
|
||||
, title = "bin"
|
||||
, legend = sort(unique(df$binned))
|
||||
, pch = 19L
|
||||
, col = .diverging_palette[sort(unique(df$binned + 1L))]
|
||||
, cex = 0.7
|
||||
)
|
||||
}
|
||||
|
||||
.prediction_depth_spread_plot <- function(df) {
|
||||
plot(
|
||||
x = df$binned
|
||||
, xlim = c(0L, 9L)
|
||||
, y = df$Z
|
||||
, type = "p"
|
||||
, main = "Prediction Depth Spread"
|
||||
, xlab = "Leaf Bin"
|
||||
, ylab = "Logloss"
|
||||
, pch = 19L
|
||||
, col = .diverging_palette[df$binned + 1L]
|
||||
)
|
||||
legend(
|
||||
"topright"
|
||||
, title = "bin"
|
||||
, legend = sort(unique(df$binned))
|
||||
, pch = 19L
|
||||
, col = .diverging_palette[sort(unique(df$binned + 1L))]
|
||||
, cex = 0.7
|
||||
)
|
||||
}
|
||||
|
||||
.depth_density_plot <- function(df) {
|
||||
plot(
|
||||
x = density(df$Y)
|
||||
, xlim = c(min(df$Y), max(df$Y))
|
||||
, type = "p"
|
||||
, main = "Depth Density"
|
||||
, xlab = "Prediction Probability"
|
||||
, ylab = "Bin Density"
|
||||
, pch = 19L
|
||||
, col = .diverging_palette[df$binned + 1L]
|
||||
)
|
||||
legend(
|
||||
"topright"
|
||||
, title = "bin"
|
||||
, legend = sort(unique(df$binned))
|
||||
, pch = 19L
|
||||
, col = .diverging_palette[sort(unique(df$binned + 1L))]
|
||||
, cex = 0.7
|
||||
)
|
||||
}
|
||||
|
||||
# load some data
|
||||
data(agaricus.train, package = "lightgbm")
|
||||
train <- agaricus.train
|
||||
dtrain <- lgb.Dataset(train$data, label = train$label)
|
||||
|
@ -14,7 +84,7 @@ data(agaricus.test, package = "lightgbm")
|
|||
test <- agaricus.test
|
||||
dtest <- lgb.Dataset.create.valid(dtrain, test$data, label = test$label)
|
||||
|
||||
# Third, we setup parameters and we train a model
|
||||
# setup parameters and we train a model
|
||||
params <- list(objective = "regression", metric = "l2")
|
||||
valids <- list(test = dtest)
|
||||
model <- lgb.train(
|
||||
|
@ -59,7 +129,6 @@ new_data$binned <- .bincode(
|
|||
, include.lowest = TRUE
|
||||
)
|
||||
new_data$binned[is.na(new_data$binned)] <- 0L
|
||||
new_data$binned <- as.factor(new_data$binned)
|
||||
|
||||
# We can check the binned content
|
||||
table(new_data$binned)
|
||||
|
@ -67,25 +136,9 @@ table(new_data$binned)
|
|||
# We can plot the binned content
|
||||
# On the second plot, we clearly notice the lower the bin (the lower the leaf value), the higher the loss
|
||||
# On the third plot, it is smooth!
|
||||
ggplot(
|
||||
data = new_data
|
||||
, mapping = aes(x = X, y = Y, color = binned)
|
||||
) + geom_point() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
|
||||
ggplot(
|
||||
data = new_data
|
||||
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
|
||||
) + geom_boxplot() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
|
||||
ggplot(
|
||||
data = new_data
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) + geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
|
||||
.prediction_depth_plot(df = new_data)
|
||||
.prediction_depth_spread_plot(df = new_data)
|
||||
.depth_density_plot(df = new_data)
|
||||
|
||||
# Now, let's show with other parameters
|
||||
model2 <- lgb.train(
|
||||
|
@ -126,7 +179,6 @@ new_data2$binned <- .bincode(
|
|||
, include.lowest = TRUE
|
||||
)
|
||||
new_data2$binned[is.na(new_data2$binned)] <- 0L
|
||||
new_data2$binned <- as.factor(new_data2$binned)
|
||||
|
||||
# We can check the binned content
|
||||
table(new_data2$binned)
|
||||
|
@ -136,25 +188,9 @@ table(new_data2$binned)
|
|||
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules are
|
||||
# real thus it is not an issue
|
||||
# However, if the rules were not true, the loss would explode.
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = X, y = Y, color = binned)
|
||||
) + geom_point() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth", x = "Leaf Bin", y = "Prediction Probability")
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = binned, y = Z, fill = binned, group = binned)
|
||||
) + geom_boxplot() +
|
||||
theme_bw() +
|
||||
labs(title = "Prediction Depth Spread", x = "Leaf Bin", y = "Logloss")
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) + geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
|
||||
.prediction_depth_plot(df = new_data2)
|
||||
.prediction_depth_spread_plot(df = new_data2)
|
||||
.depth_density_plot(df = new_data2)
|
||||
|
||||
# Now, try with very severe overfitting
|
||||
model3 <- lgb.train(
|
||||
|
@ -195,7 +231,6 @@ new_data3$binned <- .bincode(
|
|||
, include.lowest = TRUE
|
||||
)
|
||||
new_data3$binned[is.na(new_data3$binned)] <- 0L
|
||||
new_data3$binned <- as.factor(new_data3$binned)
|
||||
|
||||
# We can check the binned content
|
||||
table(new_data3$binned)
|
||||
|
@ -204,18 +239,7 @@ table(new_data3$binned)
|
|||
# On the third plot, it is clearly not smooth! We are severely overfitting the data, but the rules
|
||||
# are real thus it is not an issue.
|
||||
# However, if the rules were not true, the loss would explode. See the sudden spikes?
|
||||
ggplot(
|
||||
data = new_data3
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) +
|
||||
geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
.depth_density_plot(df = new_data3)
|
||||
|
||||
# Compare with our second model, the difference is severe. This is smooth.
|
||||
ggplot(
|
||||
data = new_data2
|
||||
, mapping = aes(x = Y, y = ..count.., fill = binned)
|
||||
) + geom_density(position = "fill") +
|
||||
theme_bw() +
|
||||
labs(title = "Depth Density", x = "Prediction Probability", y = "Bin Density")
|
||||
.depth_density_plot(df = new_data2)
|
||||
|
|
Загрузка…
Ссылка в новой задаче