From 03979ad44a4064d791871fe727fb7e6d09ee75d0 Mon Sep 17 00:00:00 2001 From: Yu Shi Date: Thu, 21 Apr 2022 05:10:29 +0000 Subject: [PATCH] fix mismatch of number of features in feature_importances --- src/boosting/gbdt_model_text.cpp | 37 ++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/src/boosting/gbdt_model_text.cpp b/src/boosting/gbdt_model_text.cpp index 0e296c1ec..cb7cc62d7 100644 --- a/src/boosting/gbdt_model_text.cpp +++ b/src/boosting/gbdt_model_text.cpp @@ -104,7 +104,16 @@ std::string GBDT::DumpModel(int start_iteration, int num_iteration, int feature_ for (size_t i = 0; i < feature_importances.size(); ++i) { size_t feature_importances_int = static_cast(feature_importances[i]); if (feature_importances_int > 0) { - pairs.emplace_back(feature_importances_int, feature_names_[i]); + Log::Warning("i = %d, feature_names_.size() = %d", i, feature_names_.size()); + if (i < feature_names_.size()) { + pairs.emplace_back(feature_importances_int, feature_names_[i]); + } else { + // with LibSVM format and continual training, the number of features in dataset can be fewer than in the intial model + // in that case FeatureImportance returns with the number of features in the intial model + std::stringstream str_buf; + str_buf << "Column_" << i << "_from_init_model"; + pairs.emplace_back(feature_importances_int, str_buf.str()); + } } } str_buf << '\n' << "\"feature_importances\":" << "{"; @@ -377,7 +386,15 @@ std::string GBDT::SaveModelToString(int start_iteration, int num_iteration, int for (size_t i = 0; i < feature_importances.size(); ++i) { size_t feature_importances_int = static_cast(feature_importances[i]); if (feature_importances_int > 0) { - pairs.emplace_back(feature_importances_int, feature_names_[i]); + if (i < feature_names_.size()) { + pairs.emplace_back(feature_importances_int, feature_names_[i]); + } else { + // with LibSVM format and continual training, the number of features in dataset can be fewer than in the intial model + // in that case FeatureImportance returns with the number of features in the intial model + std::stringstream str_buf; + str_buf << "Column_" << i << "_from_init_model"; + pairs.emplace_back(feature_importances_int, str_buf.str()); + } } } // sort the importance @@ -609,10 +626,14 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + const int real_feature_index = models_[iter]->split_feature(split_idx); #ifdef DEBUG - CHECK_GE(models_[iter]->split_feature(split_idx), 0); + CHECK_GE(real_feature_index, 0); #endif - feature_importances[models_[iter]->split_feature(split_idx)] += 1.0; + if (static_cast(real_feature_index) >= feature_importances.size()) { + feature_importances.resize(real_feature_index + 1); + } + feature_importances[real_feature_index] += 1.0; } } } @@ -620,10 +641,14 @@ std::vector GBDT::FeatureImportance(int num_iteration, int importance_ty for (int iter = 0; iter < num_used_model; ++iter) { for (int split_idx = 0; split_idx < models_[iter]->num_leaves() - 1; ++split_idx) { if (models_[iter]->split_gain(split_idx) > 0) { + const int real_feature_index = models_[iter]->split_feature(split_idx); #ifdef DEBUG - CHECK_GE(models_[iter]->split_feature(split_idx), 0); + CHECK_GE(real_feature_index, 0); #endif - feature_importances[models_[iter]->split_feature(split_idx)] += models_[iter]->split_gain(split_idx); + if (static_cast(real_feature_index) >= feature_importances.size()) { + feature_importances.resize(real_feature_index + 1); + } + feature_importances[real_feature_index] += models_[iter]->split_gain(split_idx); } } }