ParameterLoader optimization

2015-11-21 17:18:08 +08:00 · 2015-11-21 17:18:08 +08:00 · 30a756e6fe
--- a/src/data_block.h
+++ b/src/data_block.h
@ -20,7 +20,8 @@ namespace multiverso
        class DataBlock : public multiverso::DataBlockBase
        {
        public:
-            std::vector <int> input_nodes, output_nodes;
+            std::unordered_set <int> input_nodes, output_nodes;
+            std::unordered_set <int> negativesample_pools;
            DataBlock(){}
            ~DataBlock();

--- a/src/distributed_wordembedding.cpp
+++ b/src/distributed_wordembedding.cpp
@ -280,6 +280,16 @@ namespace multiverso

                }
                multiverso::Multiverso::EndClock();
+
+                //Dump input-embedding weight
+                multiverso::Multiverso::BeginClock();
+                ++data_block_count;
+                DataBlock *data_block = new (std::nothrow)DataBlock();
+                assert(data_block != nullptr);
+                data_block->SetType(DataBlockType::Test);
+                PushDataBlock(datablock_queue, data_block);
+                multiverso::Multiverso::EndClock();
+
            }

 			//Dump input-embedding weight
--- a/src/parameter_loader.cpp
+++ b/src/parameter_loader.cpp
@ -38,19 +38,16 @@ namespace multiverso
            //input_nodes,output_nodes
            multiverso::Log::Debug("Rank %d ParameterLoader parse begin %d\n",
                multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
-            WordEmbedding_->PrepareParameter(data, input_nodes, output_nodes);
+            WordEmbedding_->PrepareParameter(data);
            multiverso::Log::Debug("Rank %d ParameterLoader parse end %d\n",
                multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
            //Step 2, Request the parameter
            multiverso::Log::Debug("Rank %d ParameterLoader request begin %d\n",
                multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
-            RequestParameter(data, input_nodes, output_nodes);
+            RequestParameter(data);
            multiverso::Log::Debug("Rank %d ParameterLoader request end %d\n",
                multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
            //Step 3, store the needed parameters in data_block
-            //it will be used to copy parameter from multiverso in trainer
-            data->input_nodes = std::move(input_nodes);
-            data->output_nodes = std::move(output_nodes);
           
            multiverso::Log::Info("Rank %d ParameterLoader finish %d\n",
                multiverso::Multiverso::ProcessRank(), parse_and_request_count_ - 1);
@ -58,9 +55,7 @@ namespace multiverso
            fflush(log_file_);
        }

-        void ParameterLoader::RequestParameter(DataBlock *data_block,
-            std::vector<int>& input_nodes,
-            std::vector<int>& output_nodes) 
+        void ParameterLoader::RequestParameter(DataBlock *data_block) 
        {
            //If the data_block is the last one, we need to dump 
            //the input-embedding weights
@ -68,16 +63,17 @@ namespace multiverso
                RequestTable(kInputEmbeddingTableId);

            RequestRow(kWordCountActualTableId, 0);
-            for (int i = 0; i < input_nodes.size(); ++i)
-                RequestRow(kInputEmbeddingTableId, input_nodes[i]);
-            for (int i = 0; i < output_nodes.size(); ++i)
-                RequestRow(kEmbeddingOutputTableId, output_nodes[i]);
+            for (auto node : data_block->input_nodes)
+                RequestRow(kInputEmbeddingTableId, node);
+            for (auto node : data_block->output_nodes)
+                RequestRow(kEmbeddingOutputTableId, node);
+
            if (option_->use_adagrad)
            {
-                for (int i = 0; i < input_nodes.size(); ++i)
-                    RequestRow(kSumGradient2IETableId, input_nodes[i]);
-                for (int i = 0; i < output_nodes.size(); ++i)
-                    RequestRow(kSumGradient2EOTableId, output_nodes[i]);
+                for (auto node : data_block->input_nodes)
+                    RequestRow(kSumGradient2IETableId, node);
+                for (auto node : data_block->output_nodes)
+                    RequestRow(kSumGradient2EOTableId, node);
            }
        }   
    }
--- a/src/parameter_loader.h
+++ b/src/parameter_loader.h
@ -43,9 +43,7 @@ namespace multiverso
            * \param input_nodes stores the input words'index
            * \param output_nodes stores the output words'index
            */
-            void RequestParameter(DataBlock *data_block,
-                std::vector<int>& input_nodes,
-                std::vector<int>& output_nodes);
+            void RequestParameter(DataBlock *data_block);
            //No copying allowed
            ParameterLoader(const ParameterLoader&);
            void operator=(const ParameterLoader&);
--- a/src/trainer.cpp
+++ b/src/trainer.cpp
@ -48,16 +48,16 @@ namespace multiverso
            //Compute the total number of processes
            if (process_count_ == -1)
                process_count_ = multiverso::Multiverso::TotalProcessCount();
-            //Get the input_nodes and output_nodes from data_block
-            //The input_nodes and output_nodes are stored by ParameterLoader
+           
            DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
-            std::vector<int>& input_nodes = data->input_nodes;
-            std::vector<int>& output_nodes = data->output_nodes;
+            std::vector<int> input_nodes(data->input_nodes.begin(), data->input_nodes.end());
+            std::vector<int> output_nodes(data->output_nodes.begin(), data->output_nodes.end());
            //A trainer only copy or add apart of parameters
            //This trainer should copy or add the parameters according to
            //local_input_nodes and local_output_nodes 
            std::vector<int> local_input_nodes;
            std::vector<int> local_output_nodes;
+
            for (int i = trainer_id_; i < input_nodes.size(); i += option_->thread_cnt)
                local_input_nodes.push_back(input_nodes[i]);
            for (int i = trainer_id_; i < output_nodes.size(); i += option_->thread_cnt)
@ -121,6 +121,7 @@ namespace multiverso
 			{
 				SaveEmbedding(option_->output_file, option_->output_binary);
 			}
+
            if (trainer_id_ == 0)
            {
                fprintf(log_file_, "%lf\n",
--- a/src/word_embedding.cpp
+++ b/src/word_embedding.cpp
@ -49,6 +49,8 @@ namespace multiverso
        void WordEmbedding::Train(DataBlock *data_block, int index_start, int interval,
            int64& word_count, real* hidden_act, real* hidden_err)
        {
+            std::vector <int> negativesample(data_block->negativesample_pools.begin(),
+                data_block->negativesample_pools.end());
            for (int i = index_start; i < data_block->Size(); i += interval)
            {
                int sentence_length;
@ -59,7 +61,7 @@ namespace multiverso
                    word_count_deta, next_random);

                this->Train(sentence, sentence_length,
-                    next_random, hidden_act, hidden_err);
+                    next_random, hidden_act, hidden_err, negativesample);

                word_count += word_count_deta;
            }
@ -77,10 +79,10 @@ namespace multiverso
        }

        void WordEmbedding::Train(int* sentence, int sentence_length,
-            uint64 next_random, real* hidden_act, real* hidden_err)
+            uint64 next_random, real* hidden_act, real* hidden_err, std::vector <int> &negativesample_pools)
        {
            ParseSentence(sentence, sentence_length,
-                next_random, hidden_act, hidden_err, &WordEmbedding::TrainSample);
+                next_random, hidden_act, hidden_err, &WordEmbedding::TrainSample, negativesample_pools);
        }
        //Train with forward direction and get  the input-hidden layer vector
        void WordEmbedding::FeedForward(std::vector<int>& input_nodes, real* hidden_act)
@ -190,37 +192,52 @@ namespace multiverso
            }
        }
        //Parapare the parameter for the datablock
-        void WordEmbedding::PrepareParameter(DataBlock* data_block,
-            std::vector<int>& input_nodes,
-            std::vector<int>& output_nodes)
+        void WordEmbedding::PrepareParameter(DataBlock* data_block)
        {
-            input_nodes_.clear();
-            output_nodes_.clear();
-
            int sentence_length;
-            int64 word_count_deta;
+            int64 word_count_delta;
            int *sentence;
            uint64 next_random;
+            if (option_->hs)
+            {
                for (int i = 0; i < data_block->Size(); ++i)
                {
-                data_block->GetSentence(i, sentence, sentence_length, word_count_deta,
-                    next_random);
-                ParseSentence(sentence, sentence_length, next_random,
-                    nullptr, nullptr, &WordEmbedding::DealPrepareParameter);
-            }
+                    data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);

-            for (auto it = input_nodes_.begin(); it != input_nodes_.end(); it++)
+                    for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
                    {
-                input_nodes.push_back(*it);
-                assert((*it) >= 0);
-                assert((*it) < dictionary_size_);
+                        data_block->input_nodes.insert(sentence[sentence_position]);
                    }
-
-            for (auto it = output_nodes_.begin(); it != output_nodes_.end(); it++)
+                }
+                for (auto input_node : data_block->input_nodes)
                {
-                output_nodes.push_back(*it);
-                assert((*it) >= 0);
-                assert((*it) < dictionary_size_);
+                    auto info = huffmanEncoder_->GetLabelInfo(input_node);
+                    for (int d = 0; d < info->codelen; d++)
+                        data_block->output_nodes.insert(info->point[d]);
+                }
+            }
+            else
+            {
+                for (int i = 0; i < data_block->Size(); ++i)
+                {
+                    data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);
+
+                    for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
+                    {
+                        data_block->input_nodes.insert(sentence[sentence_position]);
+                    }
+                }
+                for (auto input_node : data_block->input_nodes)
+                {
+                    data_block->output_nodes.insert(input_node);
+                }
+                for (int d = 0; d < option_->negative_num * data_block->input_nodes.size(); d++)
+                {
+                    next_random = sampler_->GetNextRandom(next_random);
+                    int target = sampler_->NegativeSampling(next_random);
+                    data_block->output_nodes.insert(target);
+                    data_block->negativesample_pools.insert(target);
+                }
            }
        }
        //Copy the input&ouput nodes
@ -236,7 +253,7 @@ namespace multiverso
        //Parse the sentence and deepen into two branches
        void WordEmbedding::ParseSentence(int* sentence, int sentence_length,
            uint64 next_random, real* hidden_act, real* hidden_err,
-            FunctionType function)
+            FunctionType function, std::vector <int> &negativesample_pools)
        {
            if (sentence_length == 0)
                return;
@ -263,7 +280,7 @@ namespace multiverso
                        input_nodes.clear();
                        output_nodes.clear();
                        Parse(feat + feat_size - 1, 1, sentence[sentence_position],
-                            next_random, input_nodes, output_nodes);
+                            next_random, input_nodes, output_nodes, negativesample_pools);
                        (this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
                    }
                }
@ -273,7 +290,8 @@ namespace multiverso
                    input_nodes.clear();
                    output_nodes.clear();
                    Parse(feat, feat_size, sentence[sentence_position],
-                        next_random, input_nodes, output_nodes);
+                        next_random, input_nodes, output_nodes, negativesample_pools);
+
                    (this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
                }
            }
@ -281,7 +299,7 @@ namespace multiverso
        //Parse the windows's input&output nodes
        inline void WordEmbedding::Parse(int *feat, int feat_cnt, int word_idx,
            uint64 &next_random, std::vector<int>& input_nodes,
-            std::vector<std::pair<int, int> >& output_nodes)
+            std::vector<std::pair<int, int> >& output_nodes, std::vector <int> &negativesample_pools)
        {
            for (int i = 0; i < feat_cnt; ++i)
            {
@ -301,7 +319,8 @@ namespace multiverso
                for (int d = 0; d < option_->negative_num; d++)
                {
                    next_random = sampler_->GetNextRandom(next_random);
-                    int target = sampler_->NegativeSampling(next_random);
+                    int index = (next_random >> 8) % negativesample_pools.size();
+                    int target = negativesample_pools[index];
                    if (target == word_idx) continue;
                    output_nodes.push_back(std::make_pair(target, 0));
                }
--- a/src/word_embedding.h
+++ b/src/word_embedding.h
@ -48,8 +48,7 @@ namespace multiverso
            * \param input_nodes  input_nodes represent the parameter which input_layer includes 
            * \param output_nodes output_nodes represent the parameter which output_layer inclueds
            */
-            void PrepareParameter(DataBlock *data_block,
-                std::vector<int>& input_nodes, std::vector<int>& output_nodes);
+            void PrepareParameter(DataBlock *data_block);
            /*!
            * \brief Update the learning rate
            */
@ -92,7 +91,7 @@ namespace multiverso
            */
            void Parse(int *feat, int feat_cnt, int word_idx, uint64 &next_random,
                std::vector<int>& input_nodes,
-                std::vector<std::pair<int, int> >& output_nodes);
+                std::vector<std::pair<int, int> >& output_nodes, std::vector <int> &negativesample_pools);
            /*!
            * \brief Parse a sentence and deepen into two branchs
            * \one for TrainNN,the other one is for Parameter_parse&request
@ -100,7 +99,7 @@ namespace multiverso
            void ParseSentence(int* sentence, int sentence_length,
                uint64 next_random,
                real* hidden_act, real* hidden_err,
-                FunctionType function);
+                FunctionType function, std::vector <int> &negativesample_pools);
            /*!
            * \brief Get the hidden layer vector
            * \param input_nodes represent the input nodes
@ -139,7 +138,7 @@ namespace multiverso
            * \brief Train the sentence actually
            */
            void Train(int* sentence, int sentence_length,
-                uint64 next_random, real* hidden_act, real* hidden_err);
+                uint64 next_random, real* hidden_act, real* hidden_err, std::vector <int> &negativesample_pools);

            //No copying allowed
            WordEmbedding(const WordEmbedding&);