support init_score for multiclass classification (#62)

support init_score for multiclass classification (#62)
This commit is contained in:
wxchan 2016-11-03 13:49:45 +08:00 коммит произвёл Guolin Ke
Родитель 665c9dbace
Коммит 01ed04dff2
10 изменённых файлов: 121 добавлений и 46 удалений

Просмотреть файл

@ -0,0 +1,33 @@
Multiclass Classification Example
=====================
Here is an example of using LightGBM to run a multiclass classification task.
***You should copy the executable file to this folder first.***
#### Training
For Windows, run the following command in this folder:
```
lightgbm.exe config=train.conf
```
For Linux, run the following command in this folder:
```
./lightgbm config=train.conf
```
#### Prediction
You should finish training first.
For Windows, run the following command in this folder:
```
lightgbm.exe config=predict.conf
```
For Linux, run the following command in this folder:
```
./lightgbm config=predict.conf
```

Просмотреть файл

@ -86,6 +86,7 @@ enum TaskType {
struct IOConfig: public ConfigBase {
public:
int max_bin = 256;
int num_class = 1;
int data_random_seed = 1;
std::string data_filename = "";
std::vector<std::string> valid_data_filenames;

Просмотреть файл

@ -41,14 +41,15 @@ public:
* \brief Initialization will load query-level information, since it is needed for sampling data
* \param data_filename Filename of data
* \param init_score_filename Filename of initial score
* \param is_int_label True if label is int type
* \param num_class Number of classes
*/
void Init(const char* data_filename, const char* init_score_filename);
void Init(const char* data_filename, const char* init_score_filename, const int num_class);
/*!
* \brief Initialize, only load initial score
* \param init_score_filename Filename of initial score
* \param num_class Number of classes
*/
void Init(const char* init_score_filename);
void Init(const char* init_score_filename, const int num_class);
/*!
* \brief Initial with binary memory
* \param memory Pointer to memory
@ -60,10 +61,11 @@ public:
/*!
* \brief Initial work, will allocate space for label, weight(if exists) and query(if exists)
* \param num_data Number of training data
* \param num_class Number of classes
* \param weight_idx Index of weight column, < 0 means it doesn't exist
* \param query_idx Index of query id column, < 0 means it doesn't exist
*/
void Init(data_size_t num_data, int weight_idx, int query_idx);
void Init(data_size_t num_data, int num_class, int weight_idx, int query_idx);
/*!
* \brief Partition label by used indices
@ -167,7 +169,7 @@ public:
* \return Pointer of initial scores
*/
inline const float* init_score() const { return init_score_; }
/*! \brief Load initial scores from file */
void LoadInitialScore();
@ -184,6 +186,8 @@ private:
const char* init_score_filename_;
/*! \brief Number of data */
data_size_t num_data_;
/*! \brief Number of classes */
int num_class_;
/*! \brief Number of weights, used to check correct weight file */
data_size_t num_weights_;
/*! \brief Label data */
@ -234,7 +238,7 @@ public:
};
using PredictFunction =
std::function<double(const std::vector<std::pair<int, double>>&)>;
std::function<std::vector<double>(const std::vector<std::pair<int, double>>&)>;
/*! \brief The main class of data set,
* which is used for training or validation
@ -398,6 +402,8 @@ private:
int num_total_features_;
/*! \brief Number of total data*/
data_size_t num_data_;
/*! \brief Number of classes*/
int num_class_;
/*! \brief Store some label level data*/
Metadata metadata_;
/*! \brief Random generator*/

Просмотреть файл

@ -124,10 +124,17 @@ void Application::LoadData() {
// need to continue train
if (boosting_->NumberOfSubModels() > 0) {
predictor = new Predictor(boosting_, config_.io_config.is_sigmoid, config_.predict_leaf_index, -1);
predict_fun =
[&predictor](const std::vector<std::pair<int, double>>& features) {
return predictor->PredictRawOneLine(features);
};
if (config_.io_config.num_class == 1){
predict_fun =
[&predictor](const std::vector<std::pair<int, double>>& features) {
return predictor->PredictRawOneLine(features);
};
} else {
predict_fun =
[&predictor](const std::vector<std::pair<int, double>>& features) {
return predictor->PredictMulticlassOneLine(features);
};
}
}
// sync up random seed for data partition
if (config_.is_parallel_find_bin) {

Просмотреть файл

@ -61,10 +61,10 @@ public:
* \param features Feature for this record
* \return Prediction result
*/
double PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
std::vector<double> PredictRawOneLine(const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
// get result without sigmoid transformation
return boosting_->PredictRaw(features_[tid], num_used_model_);
return std::vector<double>(1, boosting_->PredictRaw(features_[tid], num_used_model_));
}
/*!
@ -83,10 +83,10 @@ public:
* \param features Feature of this record
* \return Prediction result
*/
double PredictOneLine(const std::vector<std::pair<int, double>>& features) {
std::vector<double> PredictOneLine(const std::vector<std::pair<int, double>>& features) {
const int tid = PutFeatureValuesToBuffer(features);
// get result with sigmoid transform if needed
return boosting_->Predict(features_[tid], num_used_model_);
return std::vector<double>(1, boosting_->Predict(features_[tid], num_used_model_));
}
/*!
@ -136,6 +136,7 @@ public:
if (num_class_ > 1) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features){
std::vector<double> prediction = PredictMulticlassOneLine(features);
Common::Softmax(&prediction);
std::stringstream result_stream_buf;
for (size_t i = 0; i < prediction.size(); ++i){
if (i > 0) {
@ -162,12 +163,12 @@ public:
else {
if (is_simgoid_) {
predict_fun = [this](const std::vector<std::pair<int, double>>& features){
return std::to_string(PredictOneLine(features));
return std::to_string(PredictOneLine(features)[0]);
};
}
else {
predict_fun = [this](const std::vector<std::pair<int, double>>& features){
return std::to_string(PredictRawOneLine(features));
return std::to_string(PredictRawOneLine(features)[0]);
};
}
}

Просмотреть файл

@ -503,7 +503,6 @@ std::vector<double> GBDT::PredictMulticlass(const double* value, int num_used_mo
ret[j] += models_[i * num_class_ + j] -> Predict(value);
}
}
Common::Softmax(&ret);
return ret;
}

Просмотреть файл

@ -27,7 +27,7 @@ public:
const float* init_score = data->metadata().init_score();
// if an initial score exists, training will start from it
if (init_score != nullptr) {
for (data_size_t i = 0; i < num_data_; ++i) {
for (data_size_t i = 0; i < num_data_ * num_class; ++i) {
score_[i] = init_score[i];
}
}

Просмотреть файл

@ -184,6 +184,7 @@ void OverallConfig::CheckParamConflict() {
void IOConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetInt(params, "max_bin", &max_bin);
CHECK(max_bin > 0);
GetInt(params, "num_class", &num_class);
GetInt(params, "data_random_seed", &data_random_seed);
if (!GetString(params, "data", &data_filename)) {
@ -236,7 +237,6 @@ void ObjectiveConfig::Set(const std::unordered_map<std::string, std::string>& pa
void MetricConfig::Set(const std::unordered_map<std::string, std::string>& params) {
GetDouble(params, "sigmoid", &sigmoid);
GetInt(params, "num_class", &num_class);
CHECK(num_class >= 1);
std::string tmp_str = "";
if (GetString(params, "label_gain", &tmp_str)) {
label_gain = Common::StringToDoubleArray(tmp_str, ',');
@ -294,7 +294,6 @@ void BoostingConfig::Set(const std::unordered_map<std::string, std::string>& par
CHECK(output_freq >= 0);
GetBool(params, "is_training_metric", &is_provide_training_metric);
GetInt(params, "num_class", &num_class);
CHECK(num_class >= 1);
}
void GBDTConfig::GetTreeLearnerType(const std::unordered_map<std::string, std::string>& params) {

Просмотреть файл

@ -20,6 +20,8 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
:data_filename_(data_filename), random_(io_config.data_random_seed),
max_bin_(io_config.max_bin), is_enable_sparse_(io_config.is_enable_sparse), predict_fun_(predict_fun) {
num_class_ = io_config.num_class;
CheckCanLoadFromBin();
if (is_loading_from_binfile_ && predict_fun != nullptr) {
Log::Info("Cannot performing initialization of prediction by using binary file, using text file instead");
@ -28,7 +30,7 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
if (!is_loading_from_binfile_) {
// load weight, query information and initialize score
metadata_.Init(data_filename, init_score_filename);
metadata_.Init(data_filename, init_score_filename, num_class_);
// create text reader
text_reader_ = new TextReader<data_size_t>(data_filename, io_config.has_header);
@ -152,7 +154,7 @@ Dataset::Dataset(const char* data_filename, const char* init_score_filename,
}
} else {
// only need to load the initial score; other metadata will be loaded from the binary file
metadata_.Init(init_score_filename);
metadata_.Init(init_score_filename, num_class_);
Log::Info("Loading data set from binary file");
parser_ = nullptr;
text_reader_ = nullptr;
@ -436,7 +438,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers(rank, num_machines, sample_data);
// initialize label
metadata_.Init(num_data_, weight_idx_, group_idx_);
metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromMemory();
} else {
@ -446,7 +448,7 @@ void Dataset::LoadTrainData(int rank, int num_machines, bool is_pre_partition, b
// construct feature bin mappers
ConstructBinMappers(rank, num_machines, sample_data);
// initialize label
metadata_.Init(num_data_, weight_idx_, group_idx_);
metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
// extract features
ExtractFeaturesFromFile();
@ -471,7 +473,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// read data in memory
LoadDataToMemory(0, 1, false);
// initialize label
metadata_.Init(num_data_, weight_idx_, group_idx_);
metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
features_.clear();
// copy feature bin mapper data
for (Feature* feature : train_set->features_) {
@ -487,7 +489,7 @@ void Dataset::LoadValidationData(const Dataset* train_set, bool use_two_round_lo
// Get number of lines of data file
num_data_ = static_cast<data_size_t>(text_reader_->CountLine());
// initialize label
metadata_.Init(num_data_, weight_idx_, group_idx_);
metadata_.Init(num_data_, num_class_, weight_idx_, group_idx_);
features_.clear();
// copy feature bin mapper data
for (Feature* feature : train_set->features_) {
@ -545,7 +547,7 @@ void Dataset::ExtractFeaturesFromMemory() {
}
} else {
// if we need to run prediction with the initial model
float* init_score = new float[num_data_];
float* init_score = new float[num_data_ * num_class_];
#pragma omp parallel for schedule(guided) private(oneline_features) firstprivate(tmp_label)
for (data_size_t i = 0; i < num_data_; ++i) {
const int tid = omp_get_thread_num();
@ -553,7 +555,10 @@ void Dataset::ExtractFeaturesFromMemory() {
// parser
parser_->ParseOneLine(text_reader_->Lines()[i].c_str(), &oneline_features, &tmp_label);
// set initial score
init_score[i] = static_cast<float>(predict_fun_(oneline_features));
std::vector<double> oneline_init_score = predict_fun_(oneline_features);
for (int k = 0; k < num_class_; ++k){
init_score[k * num_data_ + i] = static_cast<float>(oneline_init_score[k]);
}
// set label
metadata_.SetLabelAt(i, static_cast<float>(tmp_label));
// free processed line:
@ -577,7 +582,7 @@ void Dataset::ExtractFeaturesFromMemory() {
}
}
// metadata_ will manage space of init_score
metadata_.SetInitScore(init_score, num_data_);
metadata_.SetInitScore(init_score, num_data_ * num_class_);
delete[] init_score;
}
@ -593,7 +598,7 @@ void Dataset::ExtractFeaturesFromMemory() {
void Dataset::ExtractFeaturesFromFile() {
float* init_score = nullptr;
if (predict_fun_ != nullptr) {
init_score = new float[num_data_];
init_score = new float[num_data_ * num_class_];
}
std::function<void(data_size_t, const std::vector<std::string>&)> process_fun =
[this, &init_score]
@ -608,7 +613,10 @@ void Dataset::ExtractFeaturesFromFile() {
parser_->ParseOneLine(lines[i].c_str(), &oneline_features, &tmp_label);
// set initial score
if (init_score != nullptr) {
init_score[start_idx + i] = static_cast<float>(predict_fun_(oneline_features));
std::vector<double> oneline_init_score = predict_fun_(oneline_features);
for (int k = 0; k < num_class_; ++k){
init_score[k * num_data_ + start_idx + i] = static_cast<float>(oneline_init_score[k]);
}
}
// set label
metadata_.SetLabelAt(start_idx + i, static_cast<float>(tmp_label));
@ -640,7 +648,7 @@ void Dataset::ExtractFeaturesFromFile() {
// metadata_ will manage space of init_score
if (init_score != nullptr) {
metadata_.SetInitScore(init_score, num_data_);
metadata_.SetInitScore(init_score, num_data_ * num_class_);
delete[] init_score;
}

Просмотреть файл

@ -14,9 +14,10 @@ Metadata::Metadata()
}
void Metadata::Init(const char * data_filename, const char* init_score_filename) {
void Metadata::Init(const char * data_filename, const char* init_score_filename, const int num_class) {
data_filename_ = data_filename;
init_score_filename_ = init_score_filename;
num_class_ = num_class;
// for lambdarank, it needs query data for partition data in parallel learning
LoadQueryBoundaries();
LoadWeights();
@ -24,8 +25,9 @@ void Metadata::Init(const char * data_filename, const char* init_score_filename)
LoadInitialScore();
}
void Metadata::Init(const char* init_score_filename) {
void Metadata::Init(const char* init_score_filename, const int num_class) {
init_score_filename_ = init_score_filename;
num_class_ = num_class;
LoadInitialScore();
}
@ -40,8 +42,9 @@ Metadata::~Metadata() {
}
void Metadata::Init(data_size_t num_data, int weight_idx, int query_idx) {
void Metadata::Init(data_size_t num_data, int num_class, int weight_idx, int query_idx) {
num_data_ = num_data;
num_class_ = num_class;
label_ = new float[num_data_];
if (weight_idx >= 0) {
if (weights_ != nullptr) {
@ -200,9 +203,11 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
if (init_score_ != nullptr) {
float* old_scores = init_score_;
num_init_score_ = num_data_;
init_score_ = new float[num_init_score_];
for (size_t i = 0; i < used_data_indices.size(); ++i) {
init_score_[i] = old_scores[used_data_indices[i]];
init_score_ = new float[num_init_score_ * num_class_];
for (int k = 0; k < num_class_; ++k){
for (size_t i = 0; i < used_data_indices.size(); ++i) {
init_score_[k * num_data_ + i] = old_scores[k * num_all_data + used_data_indices[i]];
}
}
delete[] old_scores;
}
@ -214,13 +219,13 @@ void Metadata::CheckOrPartition(data_size_t num_all_data, const std::vector<data
void Metadata::SetInitScore(const float* init_score, data_size_t len) {
if (num_data_ != len) {
Log::Fatal("len of initial score is not same with #data");
if (len != num_data_ * num_class_) {
Log::Fatal("Length of initial score is not same with number of data");
}
if (init_score_ != nullptr) { delete[] init_score_; }
num_init_score_ = num_data_;
init_score_ = new float[num_init_score_];
for (data_size_t i = 0; i < num_init_score_; ++i) {
init_score_ = new float[len];
for (data_size_t i = 0; i < len; ++i) {
init_score_[i] = init_score[i];
}
}
@ -253,11 +258,27 @@ void Metadata::LoadInitialScore() {
Log::Info("Start loading initial scores");
num_init_score_ = static_cast<data_size_t>(reader.Lines().size());
init_score_ = new float[num_init_score_];
init_score_ = new float[num_init_score_ * num_class_];
double tmp = 0.0f;
for (data_size_t i = 0; i < num_init_score_; ++i) {
Common::Atof(reader.Lines()[i].c_str(), &tmp);
init_score_[i] = static_cast<float>(tmp);
if (num_class_ == 1){
for (data_size_t i = 0; i < num_init_score_; ++i) {
Common::Atof(reader.Lines()[i].c_str(), &tmp);
init_score_[i] = static_cast<float>(tmp);
}
} else {
std::vector<std::string> oneline_init_score;
for (data_size_t i = 0; i < num_init_score_; ++i) {
oneline_init_score = Common::Split(reader.Lines()[i].c_str(), '\t');
if (static_cast<int>(oneline_init_score.size()) != num_class_){
Log::Fatal("Invalid initial score file. Redundant or insufficient columns.");
}
for (int k = 0; k < num_class_; ++k) {
Common::Atof(oneline_init_score[k].c_str(), &tmp);
init_score_[k * num_init_score_ + i] = static_cast<float>(tmp);
}
}
}
}