Merge pull request #5 from Microsoft/revert-4-sherry_Version1

Revert " update distributed word embedding"
This commit is contained in:
Aerosoul 2016-05-26 00:04:28 -05:00
Parent ee43717ab8 e7adf7286e
Commit 8d2ca8c1ad
27 changed files with 2897 additions and 2901 deletions

View file

@@ -1,31 +0,0 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_BLOCK_QUEUE_H_
#define DISTRIBUTED_WORD_EMBEDDING_BLOCK_QUEUE_H_
#include <cstdlib>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>
#include <queue>
#include "data_block.h"
namespace multiverso
{
namespace wordembedding
{
class BlockQueue{
public:
std::queue <DataBlock *> queues;
std::mutex mtx;
std::condition_variable repo_not_empty;
BlockQueue(){}
~BlockQueue(){
std::queue<DataBlock *>().swap(queues);
}
};
}
}
#endif
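BlockQueue is the producer-consumer channel between the data-loading thread and the trainers: the loader pushes finished DataBlocks under mtx and signals repo_not_empty, while a consumer waits on the same condition until a block is available. A minimal sketch of that handshake, assuming only the types above (Produce/Consume are illustrative names; the real producer and consumer are StartLoadDataThread and GetDataFromQueue later in this diff):
// Sketch only: the locking pattern BlockQueue is designed for.
void Produce(BlockQueue* queue, DataBlock* block) {
std::unique_lock<std::mutex> lock(queue->mtx);
queue->queues.push(block); //hand the block to a waiting consumer
queue->repo_not_empty.notify_all(); //wake any trainer blocked in Consume
}
DataBlock* Consume(BlockQueue* queue) {
std::unique_lock<std::mutex> lock(queue->mtx);
while (queue->queues.empty()) //loop guards against spurious wakeups
queue->repo_not_empty.wait(lock);
DataBlock* block = queue->queues.front();
queue->queues.pop();
return block; //lock is released when `lock` goes out of scope
}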

View file

@@ -1,273 +0,0 @@
#include "communicator.h"
namespace multiverso
{
namespace wordembedding
{
template <typename T>
void filler(std::vector<T> &v){
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_real_distribution<float> dis(-1.0, 1.0);
for (size_t i = 0; i < v.size(); ++i)
{
v[i] = dis(gen);
}
}
Communicator::Communicator(Option* option){
option_ = option;
process_id_ = multiverso::MV_Rank();
memory_mamanger_ = new MemoryManager(option_->embeding_size);
}
Communicator::~Communicator(){
ClearParameterTables();
delete memory_mamanger_;
}
void Communicator::PrepareParameterTables(int row_size, int column_size){
worker_input_table_ = new MatrixWorkerTable<real>(row_size, column_size);
worker_output_table_ = new MatrixWorkerTable<real>(row_size, column_size);
server_input_table_ = new MatrixServerTable<real>(row_size, column_size, &filler<real>);
server_output_table_ = new MatrixServerTable<real>(row_size, column_size);
worker_wordcount_table_ = new KVWorkerTable<int, int64>();
server_wordcount_table_ = new KVServerTable<int, int64>();
kv_ = worker_wordcount_table_->raw();
if (option_->use_adagrad){
worker_input_gradient_table_ = new MatrixWorkerTable<real>(row_size, column_size);
worker_output_gradient_table_ = new MatrixWorkerTable<real>(row_size, column_size);
server_input_gradient_table_ = new MatrixServerTable<real>(row_size, column_size);
server_output_gradient_table_ = new MatrixServerTable<real>(row_size, column_size);
}
}
void Communicator::ClearParameterTables(){
delete worker_input_table_;
delete worker_output_table_;
delete server_input_table_;
delete server_output_table_;
if (option_->use_adagrad){
delete worker_input_gradient_table_;
delete worker_output_gradient_table_;
delete server_input_gradient_table_;
delete server_output_gradient_table_;
}
//multiverso::Log::Info("Rank %d Clear Parameter Tables done.\n", process_id_);
}
inline void Communicator::AddRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size){
AddOption add_option;
table_->Add(row_ids, ptrs, size, &add_option);
}
void Communicator::GetWorkerTableRows(std::vector<int> row_nums, std::vector<real*> &blocks, int embeding_size){
worker_input_table_->Get(row_nums, blocks, embeding_size);
}
inline void Communicator::GetRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size){
table_->Get(row_ids, ptrs, size);
}
inline void Communicator::RequestParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks){
switch (table_id){
case kInputEmbeddingTableId:
GetRows(worker_input_table_, nodes, blocks, option_->embeding_size);
SetDataBlockEmbedding(data_block, blocks, nodes, kInputEmbeddingTableId);
break;
case kEmbeddingOutputTableId:
GetRows(worker_output_table_, nodes, blocks, option_->embeding_size);
SetDataBlockEmbedding(data_block, blocks, nodes, kEmbeddingOutputTableId);
break;
case kSumGradient2IETableId:
GetRows(worker_input_gradient_table_, nodes, blocks, option_->embeding_size);
SetDataBlockEmbedding(data_block, blocks, nodes, kSumGradient2IETableId);
break;
case kSumGradient2EOTableId:
GetRows(worker_output_gradient_table_, nodes, blocks, option_->embeding_size);
SetDataBlockEmbedding(data_block, blocks, nodes, kSumGradient2EOTableId);
break;
}
}
inline void Communicator::SetDataBlockEmbedding(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id){
switch (table_id){
case kInputEmbeddingTableId:
for (int i = 0; i < nodes.size(); ++i){
data_block->SetWeightIE(nodes[i], blocks[i]);
}
break;
case kEmbeddingOutputTableId:
for (int i = 0; i < nodes.size(); ++i){
data_block->SetWeightEO(nodes[i], blocks[i]);
}
break;
case kSumGradient2IETableId:
for (int i = 0; i < nodes.size(); ++i){
data_block->SetSumGradient2IE(nodes[i], blocks[i]);
}
break;
case kSumGradient2EOTableId:
for (int i = 0; i < nodes.size(); ++i){
data_block->SetSumGradient2EO(nodes[i], blocks[i]);
}
break;
}
}
void Communicator::RequestParameter(DataBlock *data_block)
{
clock_t start = clock();
std::vector<int> input_nodes(data_block->input_nodes.begin(), data_block->input_nodes.end());
std::vector<int> output_nodes(data_block->output_nodes.begin(), data_block->output_nodes.end());
std::vector<real*> input_blocks;
std::vector<real*> output_blocks;
//Request blocks to store parameters
memory_mamanger_->RequestBlocks(data_block->input_nodes.size(), input_blocks);
memory_mamanger_->RequestBlocks(data_block->output_nodes.size(), output_blocks);
assert(input_blocks.size() == data_block->input_nodes.size());
assert(output_blocks.size() == data_block->output_nodes.size());
RequestParameterByTableId(data_block, kInputEmbeddingTableId, input_nodes, input_blocks);
RequestParameterByTableId(data_block, kEmbeddingOutputTableId, output_nodes, output_blocks);
if (option_->use_adagrad){
std::vector<real*> input_gradient_blocks;
std::vector<real*> output_gradient_blocks;
memory_mamanger_->RequestBlocks(input_nodes.size(), input_gradient_blocks);
memory_mamanger_->RequestBlocks(output_nodes.size(), output_gradient_blocks);
RequestParameterByTableId(data_block, kSumGradient2IETableId, input_nodes, input_gradient_blocks);
RequestParameterByTableId(data_block, kSumGradient2EOTableId, output_nodes, output_gradient_blocks);
}
multiverso::Log::Info("Rank %d Request Parameters time:%lfs\n", process_id_,
(clock() - start) / (double)CLOCKS_PER_SEC);
}
inline void Communicator::GetDeltaLoop(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id, std::vector<real*> &recycle_blocks){
std::function<real*(int)> get_function;
switch (table_id){
case kInputEmbeddingTableId:
get_function = std::bind(&DataBlock::GetWeightIE, data_block, std::placeholders::_1);
break;
case kEmbeddingOutputTableId:
get_function = std::bind(&DataBlock::GetWeightEO, data_block, std::placeholders::_1);
break;
case kSumGradient2IETableId:
get_function = std::bind(&DataBlock::GetSumGradient2IE, data_block, std::placeholders::_1);
break;
case kSumGradient2EOTableId:
get_function = std::bind(&DataBlock::GetSumGradient2EO, data_block, std::placeholders::_1);
break;
}
for (int i = 0; i < nodes.size(); ++i)
{
real* new_row = get_function((nodes[i]));
real* old_row = blocks[i];
assert(new_row != nullptr);
for (int j = 0; j < option_->embeding_size; ++j)
{
old_row[j] = (new_row[j] - old_row[j]) / option_->thread_cnt;
}
recycle_blocks.push_back(new_row);
}
}
void Communicator::AddParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks, std::vector<real*> &recycle_blocks){
switch (table_id){
case kInputEmbeddingTableId:
GetRows(worker_input_table_, nodes, blocks, option_->embeding_size);
GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
AddRows(worker_input_table_, nodes, blocks, option_->embeding_size);
break;
case kEmbeddingOutputTableId:
GetRows(worker_output_table_, nodes, blocks, option_->embeding_size);
GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
AddRows(worker_output_table_, nodes, blocks, option_->embeding_size);
break;
case kSumGradient2IETableId:
GetRows(worker_input_gradient_table_, nodes, blocks, option_->embeding_size);
GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
AddRows(worker_input_gradient_table_, nodes, blocks, option_->embeding_size);
break;
case kSumGradient2EOTableId:
GetRows(worker_output_gradient_table_, nodes, blocks, option_->embeding_size);
GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
AddRows(worker_output_gradient_table_, nodes, blocks, option_->embeding_size);
break;
}
}
//Add delta to local buffer and send it to the parameter server
void Communicator::AddDeltaParameter(DataBlock *data_block)
{
if (data_block == nullptr){
multiverso::Log::Info("Rank %d has null DataBlcok\n", process_id_);
return;
}
clock_t start = clock();
std::vector<real*> blocks;
std::vector<real*> recycle_blocks;
std::vector<int> input_nodes(data_block->input_nodes.begin(), data_block->input_nodes.end());
std::vector<int> output_nodes(data_block->output_nodes.begin(), data_block->output_nodes.end());
std::vector<real*> input_blocks;
std::vector<real*> output_blocks;
//Request blocks to store parameters
memory_mamanger_->RequestBlocks(input_nodes.size(), input_blocks);
memory_mamanger_->RequestBlocks(output_nodes.size(), output_blocks);
assert(input_blocks.size() == input_nodes.size());
assert(output_blocks.size() == output_nodes.size());
AddParameterByTableId(data_block, kInputEmbeddingTableId, input_nodes, input_blocks, recycle_blocks);
AddParameterByTableId(data_block, kEmbeddingOutputTableId, output_nodes, output_blocks, recycle_blocks);
memory_mamanger_->ReturnBlocks(input_blocks);
memory_mamanger_->ReturnBlocks(output_blocks);
if (option_->use_adagrad){
std::vector<real*> input_gradient_blocks;
std::vector<real*> output_gradient_blocks;
memory_mamanger_->RequestBlocks(input_nodes.size(), input_gradient_blocks);
memory_mamanger_->RequestBlocks(output_nodes.size(), output_gradient_blocks);
AddParameterByTableId(data_block, kSumGradient2IETableId, input_nodes, input_gradient_blocks, recycle_blocks);
AddParameterByTableId(data_block, kSumGradient2EOTableId, output_nodes, output_gradient_blocks, recycle_blocks);
memory_mamanger_->ReturnBlocks(input_gradient_blocks);
memory_mamanger_->ReturnBlocks(output_gradient_blocks);
}
memory_mamanger_->ReturnBlocks(recycle_blocks);
multiverso::Log::Info("Rank %d Add Parameters time:%lfs\n", process_id_, (clock() - start) / (double)CLOCKS_PER_SEC);
}
int64 Communicator::GetWordCount(){
worker_wordcount_table_->Get(kWordCountId);
return kv_[kWordCountId];
}
void Communicator::AddWordCount(int word_count_num){
worker_wordcount_table_->Add(kWordCountId, word_count_num);
}
void Communicator::RequestBlocks(int size, std::vector<real*> &blocks){
memory_mamanger_->RequestBlocks(size, blocks);
}
void Communicator::ReturnBlocks(std::vector<real*> &blocks){
memory_mamanger_->ReturnBlocks(blocks);
}
}
}
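The heart of AddDeltaParameter above is a get-delta-add cycle: re-fetch the server's current row, overwrite the local buffer in place with (local - server) / thread_cnt, and push that averaged delta back with Add, so updates from concurrent workers accumulate instead of overwriting one another. A minimal sketch of the per-row arithmetic performed in GetDeltaLoop (ComputeRowDelta is an illustrative name):
// Sketch: old_row starts as the freshly fetched server copy and ends as the
// delta that AddRows sends back; new_row is the locally trained copy.
void ComputeRowDelta(real* old_row, const real* new_row, int embedding_size, int thread_cnt) {
for (int j = 0; j < embedding_size; ++j)
old_row[j] = (new_row[j] - old_row[j]) / thread_cnt; //per-thread averaged delta
}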

View file

@@ -1,69 +0,0 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_COMMUNICATOR_H_
#define DISTRIBUTED_WORD_EMBEDDING_COMMUNICATOR_H_
#include "multiverso/multiverso.h"
#include "multiverso/table/matrix_table.h"
#include "multiverso/table/kv_table.h"
#include "multiverso/updater/updater.h"
#include "memory_manager.h"
#include "block_queue.h"
namespace multiverso
{
namespace wordembedding
{
class Communicator
{
public:
Communicator(Option* option);
~Communicator();
void RequestBlocks(int size, std::vector<real*> &blocks);
void ReturnBlocks(std::vector<real*> &blocks);
void RequestParameter(DataBlock *data_block);
void AddDeltaParameter(DataBlock *data_block);
int64 GetWordCount();
void AddWordCount(int word_count_num);
void GetWorkerTableRows(std::vector<int> row_nums, std::vector<real*> &blocks, int embeding_size);
void PrepareParameterTables(int row_size, int column_size);
private:
Option* option_ = nullptr;
MemoryManager* memory_mamanger_ = nullptr;
int process_id_;
std::unordered_map<int, int64> kv_;
MatrixWorkerTable<real>* worker_input_table_ = nullptr;
MatrixWorkerTable<real>* worker_output_table_ = nullptr;
MatrixServerTable<real>* server_input_table_ = nullptr;
MatrixServerTable<real>* server_output_table_ = nullptr;
MatrixWorkerTable<real>* worker_input_gradient_table_ = nullptr;
MatrixWorkerTable<real>* worker_output_gradient_table_ = nullptr;
MatrixServerTable<real>* server_input_gradient_table_ = nullptr;
MatrixServerTable<real>* server_output_gradient_table_ = nullptr;
KVWorkerTable<int, int64>* worker_wordcount_table_ = nullptr;
KVServerTable<int, int64>* server_wordcount_table_ = nullptr;
void ClearParameterTables();
void GetRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size);
void RequestParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks);
void SetDataBlockEmbedding(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id);
void AddRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size);
void AddParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks, std::vector<real*> &recycle_blocks);
void GetDeltaLoop(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id, std::vector<real*> &recycle_blocks);
};
}
}
#endif

View file

@@ -1,41 +1,35 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_CONSTANT_H_
#define DISTRIBUTED_WORD_EMBEDDING_CONSTANT_H_
#pragma once
/*!
* \file constant.h
* \brief The index of parameter tables and some constant.
*/
#include "multiverso.h"
#include "log.h"
#include <cstdint>
#include "multiverso/multiverso.h"
#include "multiverso/util/log.h"
namespace multiverso
{
namespace wordembedding
{
namespace wordembedding
{
/*! \brief Table ids in use */
const multiverso::integer_t kInputEmbeddingTableId = 0;
const multiverso::integer_t kEmbeddingOutputTableId = 1;
const multiverso::integer_t kWordCountActualTableId = 2;
const multiverso::integer_t kSumGradient2IETableId = 3;
const multiverso::integer_t kSumGradient2EOTableId = 4;
typedef int64_t int64;
typedef uint64_t uint64;
typedef float real;
typedef int64_t int64;
typedef uint64_t uint64;
typedef float real;
const int kInputEmbeddingTableId = 0;
const int kEmbeddingOutputTableId = 1;
const int kSumGradient2IETableId = 2;
const int kSumGradient2EOTableId = 3;
const int kWordCountId = 4;
const int kTableSize = (int)1e8;
//const real kEps = static_cast<real>(1e-10);
const int kMaxWordSize = 901;
const int kMaxCodeLength = 100;
const int kMaxString = 100;
const int kMaxSentenceLength = 1000;
const int kMaxEXP = 6;
const int kExpTableSize = 1000;
const int kMaxExp = 6;
}
const int kTableSize = (int)1e8;
const real kEps = static_cast<real>(1e-10);
const int kMaxWordSize = 901;
const int kMaxCodeLength = 100;
const int kMaxString = 100;
const int kMaxSentenceLength = 1000;
const int kMaxEXP = 6;
}
}
#endif
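kExpTableSize and kMaxExp (kMaxEXP in the old version) are the parameters of the classic word2vec sigmoid lookup table: sigmoid is precomputed on [-kMaxExp, kMaxExp] so the inner training loop never calls exp(). The table itself is not part of this header; a sketch of how such a table is conventionally built and queried (requires <cmath>; names are illustrative):
// Sketch: word2vec-style precomputed sigmoid using the constants above.
real expTable[kExpTableSize];
void InitExpTable() {
for (int i = 0; i < kExpTableSize; ++i) {
real x = (i / (real)kExpTableSize * 2 - 1) * kMaxExp; //map i to [-kMaxExp, kMaxExp)
expTable[i] = (real)(exp(x) / (exp(x) + 1)); //sigmoid(x)
}
}
real FastSigmoid(real f) { //caller must ensure f is inside (-kMaxExp, kMaxExp)
return expTable[(int)((f + kMaxExp) * (kExpTableSize / kMaxExp / 2))];
}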

View file

@@ -2,157 +2,49 @@
namespace multiverso
{
namespace wordembedding
{
DataBlock::~DataBlock()
{
ClearSentences();
ClearParameters();
}
namespace wordembedding
{
DataBlock::~DataBlock()
{
ClearSentences();
}
size_t DataBlock::Size()
{
return sentences_.size();
}
size_t DataBlock::Size()
{
return sentences_.size();
}
//Add a new sentence to the DataBlock
void DataBlock::AddSentence(int *head, int sentence_length,
int64 word_count, uint64 next_random)
{
Sentence sentence(head, sentence_length, word_count, next_random);
sentences_.push_back(sentence);
}
//Add a new sentence to the DataBlock
void DataBlock::AddSentence(int *head, int sentence_length,
int64 word_count, uint64 next_random)
{
Sentence sentence(head, sentence_length, word_count, next_random);
sentences_.push_back(sentence);
}
//Get the information of the index-th sentence
void DataBlock::GetSentence(int index, int* &head,
int &sentence_length, int64 &word_count, uint64 &next_random)
{
if (index >= 0 && index < sentences_.size())
{
sentences_[index].Get(head, sentence_length,
word_count, next_random);
}
else
{
head = nullptr;
sentence_length = 0;
word_count = 0;
next_random = 0;
}
}
//Free the memory of sentences
void DataBlock::ClearSentences()
{
for (int i = 0; i < sentences_.size(); ++i)
delete[] sentences_[i].head;
sentences_.clear();
}
void DataBlock::ClearParameters()
{
delete[] weight_IE_;
delete[] weight_EO_;
if (is_use_adagrad_)
{
delete sum_gradient2_IE_;
delete sum_gradient2_EO_;
}
}
//Set the weight of input-embedding vector
void DataBlock::SetWeightIE(int input_node_id, real* ptr)
{
weight_IE_[input_node_id] = ptr;
}
//Set the weight of output-embedding vector
void DataBlock::SetWeightEO(int output_node_id, real* ptr)
{
weight_EO_[output_node_id] = ptr;
}
//Get the weight of input-embedding vector
real* DataBlock::GetWeightIE(int input_node_id)
{
return weight_IE_[input_node_id];
}
//Get the weight of output-embedding vector
real* DataBlock::GetWeightEO(int output_node_id)
{
return weight_EO_[output_node_id];
}
void DataBlock::SetSumGradient2IE(int input_node_id, real* ptr)
{
sum_gradient2_IE_[input_node_id] = ptr;
}
//Set the weight of SumGradient-output vector
void DataBlock::SetSumGradient2EO(int output_node_id, real* ptr)
{
sum_gradient2_EO_[output_node_id] = ptr;
}
//Get the weight of SumGradient-input vector
real* DataBlock::GetSumGradient2IE(int input_node_id)
{
return sum_gradient2_IE_[input_node_id];
}
//Get the weight of SumGradient-output vector
real* DataBlock::GetSumGradient2EO(int output_node_id)
{
return sum_gradient2_EO_[output_node_id];
}
void DataBlock::MallocMemory(int dictionary_size_, bool is_use_adagrad){
weight_IE_ = new (std::nothrow)real*[dictionary_size_];
assert(weight_IE_ != nullptr);
weight_EO_ = new (std::nothrow)real*[dictionary_size_];
assert(weight_EO_ != nullptr);
is_use_adagrad_ = is_use_adagrad;
if (is_use_adagrad_)
{
sum_gradient2_IE_ = new (std::nothrow)real*[dictionary_size_];
sum_gradient2_EO_ = new (std::nothrow)real*[dictionary_size_];
assert(sum_gradient2_IE_ != nullptr);
assert(sum_gradient2_EO_ != nullptr);
}
}
void DataBlock::PrintDataBlock(int embedding_size){
std::vector<int> input_node_ids(input_nodes.begin(), input_nodes.end());
std::vector<int> output_node_ids(output_nodes.begin(), output_nodes.end());
for (int i = 0; i < input_node_ids.size(); ++i)
{
real* ptr = GetWeightIE(input_node_ids[i]);
for (int j = 0; j < embedding_size; j++){
std::cout << ptr[j] << " ";
}
std::cout << std::endl;
}
for (int i = 0; i < output_node_ids.size(); ++i)
{
real* ptr = GetWeightEO(output_node_ids[i]);
for (int j = 0; j < embedding_size; j++){
std::cout << ptr[j] << " ";
}
std::cout << std::endl;
}
}
void DataBlock::SetLastFlag(){
is_last_one_ = true;
}
bool DataBlock::isLast(){
return is_last_one_;
}
}
//Get the information of the index-th sentence
void DataBlock::GetSentence(int index, int* &head,
int &sentence_length, int64 &word_count, uint64 &next_random)
{
if (index >= 0 && index < sentences_.size())
{
sentences_[index].Get(head, sentence_length,
word_count, next_random);
}
else
{
head = nullptr;
sentence_length = 0;
word_count = 0;
next_random = 0;
}
}
//Free the memory of sentences
void DataBlock::ClearSentences()
{
for (int i = 0; i < sentences_.size(); ++i)
delete [] sentences_[i].head;
sentences_.clear();
}
}
}
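Each Sentence keeps a raw pointer to a heap-allocated word-id buffer plus its bookkeeping, and the DataBlock takes ownership: ClearSentences is what finally delete[]s every head. A short round-trip sketch of the public API above (the word ids are invented):
// Sketch: DataBlock owns `words` after AddSentence; do not free it yourself.
DataBlock block;
int* words = new int[3]{ 12, 7, 42 }; //hypothetical word ids
block.AddSentence(words, 3, /*word_count=*/3, /*next_random=*/1234);
int* head; int length; int64 word_count; uint64 next_random;
block.GetSentence(0, head, length, word_count, next_random);
// head == words and length == 3; an out-of-range index yields head == nullptr.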

View file

@@ -1,131 +1,97 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_DATA_BLOCK_H_
#define DISTRIBUTED_WORD_EMBEDDING_DATA_BLOCK_H_
#pragma once
/*!
* \file data_block.h
* \brief Class DataBlock is to store the necessary data for trainer and param_loader
*/
#include<iostream>
#include "multiverso/multiverso.h"
#include "util.h"
#include "multiverso.h"
#include "huffman_encoder.h"
#include "constant.h"
namespace multiverso
{
namespace wordembedding
{
/*!
* \brief The class DataBlock stores training data for the trainer and param_loader
*/
class DataBlock
{
public:
std::unordered_set <int> input_nodes, output_nodes;
std::unordered_set <int> negativesample_pools;
namespace wordembedding
{
/*!
* \brief The class DataBlock stores training data for the trainer and param_loader
*/
class DataBlock : public multiverso::DataBlockBase
{
public:
std::unordered_set <int> input_nodes, output_nodes;
std::unordered_set <int> negativesample_pools;
DataBlock(){}
~DataBlock();
DataBlock(){}
~DataBlock();
/*!
* \brief Get the number of sentences stored in DataBlock
* \return the number of sentences
*/
size_t Size();
/*!
* \brief Add a new sentence to the DataBlock
* \param sentence the starting address of the sentence
* \param sentence_length the length of the sentence
* \param word_count the number of words when getting the
* sentence from train-file
* \param next_random the seed for getting random number
*/
void AddSentence(int *sentence, int sentence_length,
int64 word_count, uint64 next_random);
/*!
* \brief Get the information of the index-th sentence
* \param index the id of the sentence
* \param sentence the starting address of the sentence
* \param sentence_length the length of the sentence
* \param word_count the number of words when getting the
* sentence from train-file
* \param next_random the seed for getting random number
*/
void GetSentence(int index, int* &sentence,
int &sentence_length, int64 &word_count,
uint64 &next_random);
/*!
* \brief Get the number of sentences stored in DataBlock
* \return the number of sentences
*/
size_t Size();
/*!
* \brief Add a new sentence to the DataBlock
* \param sentence the starting address of the sentence
* \param sentence_length the length of the sentence
* \param word_count the number of words when getting the
* sentence from train-file
* \param next_random the seed for getting random number
*/
void AddSentence(int *sentence, int sentence_length,
int64 word_count, uint64 next_random);
/*!
* \brief Get the information of the index-th sentence
* \param index the id of the sentence
* \param sentence the starting address of the sentence
* \param sentence_length the length of the sentence
* \param word_count the number of words when getting the
* sentence from train-file
* \param next_random the seed for getting random number
*/
void GetSentence(int index, int* &sentence,
int &sentence_length, int64 &word_count,
uint64 &next_random);
/*!
* \brief Release the memory which is used to store sentences
*/
void ClearSentences();
/*!
* \brief Release the memory which is used to store sentences
*/
void ClearSentences();
private:
/*!
* \brief The information of sentences
* head the head address which store the sentence
* length the number of words in the sentence
* word_count the real word count of the sentence
* next_random the random seed
*/
struct Sentence
{
int* head;
int length;
int64 word_count;
uint64 next_random;
Sentence(int *head, int length, int64 word_count,
uint64 next_random) :head(head), length(length),
word_count(word_count), next_random(next_random){}
void ClearParameters();
void Get(int* &local_head, int &sentence_length,
int64 &local_word_count, uint64 &local_next_random)
{
local_head = head;
sentence_length = length;
local_word_count = word_count;
local_next_random = next_random;
}
};
void MallocMemory(int dictionary_size_, bool is_use_adagrad);
/*! \brief Store the information of sentences*/
std::vector <Sentence> sentences_;
void SetWeightIE(int input_node_id, real* ptr);
void SetWeightEO(int output_node_id, real* ptr);
real* GetWeightIE(int input_node_id);
real* GetWeightEO(int output_node_id);
void SetSumGradient2IE(int input_node_id, real* ptr);
void SetSumGradient2EO(int output_node_id, real* ptr);
real* GetSumGradient2IE(int input_node_id);
real* GetSumGradient2EO(int output_node_id);
void PrintDataBlock(int embedding_size);
void SetLastFlag();
bool isLast();
private:
/*!
* \brief The information of sentences
* head the head address which store the sentence
* length the number of words in the sentence
* word_count the real word count of the sentence
* next_random the random seed
*/
struct Sentence
{
int* head;
int length;
int64 word_count;
uint64 next_random;
Sentence(int *head, int length, int64 word_count,
uint64 next_random) :head(head), length(length),
word_count(word_count), next_random(next_random){}
void Get(int* &local_head, int &sentence_length,
int64 &local_word_count, uint64 &local_next_random)
{
local_head = head;
sentence_length = length;
local_word_count = word_count;
local_next_random = next_random;
}
};
/*! \brief Store the information of sentences*/
std::vector <Sentence> sentences_;
real** weight_IE_=nullptr;
real** weight_EO_ = nullptr;
real** sum_gradient2_IE_ = nullptr;
real** sum_gradient2_EO_ = nullptr;
bool is_use_adagrad_ = false;
bool is_last_one_ = false;
// No copying allowed
DataBlock(const DataBlock&);
//void operator=(const DataBlock&);
};
}
}
#endif
// No copying allowed
DataBlock(const DataBlock&);
void operator=(const DataBlock&);
};
}
}

View file

@@ -1,236 +1,227 @@
#include "dictionary.h"
#include <cstring>
namespace multiverso
{
namespace wordembedding
{
Dictionary::Dictionary()
{
combine_ = 0;
Clear();
}
namespace wordembedding
{
Dictionary::Dictionary()
{
combine_ = 0;
Clear();
}
Dictionary::Dictionary(int i)
{
combine_ = i;
Clear();
}
Dictionary::Dictionary(int i)
{
combine_ = i;
Clear();
}
void Dictionary::Clear()
{
word_idx_map_.clear();
word_info_.clear();
word_whitelist_.clear();
}
//Set the white list for the dictionary
void Dictionary::SetWhiteList(const std::vector<std::string>& whitelist)
{
for (unsigned int i = 0; i < whitelist.size(); ++i)
word_whitelist_.insert(whitelist[i]);
}
//Merge the words whose frequency is below the threshold into one infrequent-word entry
void Dictionary::MergeInfrequentWords(int64 threshold)
{
word_idx_map_.clear();
std::vector<WordInfo> tmp_info;
tmp_info.clear();
int infreq_idx = -1;
void Dictionary::Clear()
{
word_idx_map_.clear();
word_info_.clear();
word_whitelist_.clear();
}
//Set the white list for the dictionary
void Dictionary::SetWhiteList(const std::vector<std::string>& whitelist)
{
for (unsigned int i = 0; i < whitelist.size(); ++i)
word_whitelist_.insert(whitelist[i]);
}
//Merge the words whose frequency is below the threshold into one infrequent-word entry
void Dictionary::MergeInfrequentWords(int64 threshold)
{
word_idx_map_.clear();
std::vector<WordInfo> tmp_info;
tmp_info.clear();
int infreq_idx = -1;
for (auto word_info : word_info_)
{
if (word_info.freq >= threshold || word_info.freq == 0
|| word_whitelist_.count(word_info.word))
{
word_idx_map_[word_info.word] = static_cast<int>(tmp_info.size());
tmp_info.push_back(word_info);
}
else
{
if (infreq_idx < 0)
{
WordInfo infreq_word_info;
infreq_word_info.word = "WE_ARE_THE_INFREQUENT_WORDS";
infreq_word_info.freq = 0;
word_idx_map_[infreq_word_info.word] = static_cast<int>(tmp_info.size());
infreq_idx = static_cast<int>(tmp_info.size());
tmp_info.push_back(infreq_word_info);
}
word_idx_map_[word_info.word] = infreq_idx;
tmp_info[infreq_idx].freq += word_info.freq;
}
}
word_info_ = tmp_info;
}
//Remove the words with frequency under min_count
void Dictionary::RemoveWordsLessThan(int64 min_count)
{
word_idx_map_.clear();
std::vector<WordInfo> tmp_info;
tmp_info.clear();
for (auto info : word_info_)
{
if (info.freq >= min_count || info.freq == 0
|| word_whitelist_.count(info.word))
{
word_idx_map_[info.word] = static_cast<int>(tmp_info.size());
tmp_info.push_back(info);
}
}
word_info_ = tmp_info;
}
//Insert the dictionary element
void Dictionary::Insert(const char* word, int64 cnt)
{
auto it = word_idx_map_.find(word);
if (it != word_idx_map_.end())
word_info_[it->second].freq += cnt;
else
{
word_idx_map_[word] = static_cast<int>(word_info_.size());
word_info_.push_back(WordInfo(word, cnt));
}
}
//Load dictionary from file
void Dictionary::LoadFromFile(const char* filename)
{
FILE* fid;
fid = fopen(filename, "r");
for (auto word_info : word_info_)
{
if (word_info.freq >= threshold || word_info.freq == 0
|| word_whitelist_.count(word_info.word))
{
word_idx_map_[word_info.word] = static_cast<int>(tmp_info.size());
tmp_info.push_back(word_info);
}
else
{
if (infreq_idx < 0)
{
WordInfo infreq_word_info;
infreq_word_info.word = "WE_ARE_THE_INFREQUENT_WORDS";
infreq_word_info.freq = 0;
word_idx_map_[infreq_word_info.word] = static_cast<int>(tmp_info.size());
infreq_idx = static_cast<int>(tmp_info.size());
tmp_info.push_back(infreq_word_info);
}
word_idx_map_[word_info.word] = infreq_idx;
tmp_info[infreq_idx].freq += word_info.freq;
}
}
word_info_ = tmp_info;
}
//Remove the words with frequency under min_count
void Dictionary::RemoveWordsLessThan(int64 min_count)
{
word_idx_map_.clear();
std::vector<WordInfo> tmp_info;
tmp_info.clear();
for (auto info : word_info_)
{
if (info.freq >= min_count || info.freq == 0
|| word_whitelist_.count(info.word))
{
word_idx_map_[info.word] = static_cast<int>(tmp_info.size());
tmp_info.push_back(info);
}
}
word_info_ = tmp_info;
}
//Insert the dictionary element
void Dictionary::Insert(const char* word, int64 cnt)
{
auto it = word_idx_map_.find(word);
if (it != word_idx_map_.end())
word_info_[it->second].freq += cnt;
else
{
word_idx_map_[word] = static_cast<int>(word_info_.size());
word_info_.push_back(WordInfo(word, cnt));
}
}
//Load dictionary from file
void Dictionary::LoadFromFile(const char* filename)
{
FILE* fid;
fid=fopen(filename, "r");
if (fid)
{
char sz_label[kMaxWordSize];
if (fid)
{
char sz_label[kMaxWordSize];
while (fscanf(fid, "%900s", sz_label) != EOF) //bounded read: kMaxWordSize == 901
{
int freq;
fscanf(fid, "%d", &freq);
Insert(sz_label, freq);
}
fclose(fid);
}
}
while (fscanf(fid, "%900s", sz_label) != EOF) //bounded read: kMaxWordSize == 901
{
int freq;
fscanf(fid, "%d", &freq);
Insert(sz_label, freq);
}
fclose(fid);
}
}
void Dictionary::LoadTriLetterFromFile(const char* filename,
unsigned int min_cnt, unsigned int letter_count)
{
FILE* fid;
fid = fopen(filename, "r");
if (fid)
{
char sz_label[kMaxWordSize] = { 0 };
//while (fscanf_s(fid, "%s", sz_label, kMaxWordSize) != EOF)
while (fscanf(fid, "%900s", sz_label) != EOF) //bounded read: kMaxWordSize == 901
{
int64 freq;
fscanf(fid, "%lld", &freq);
if (freq < static_cast<int64>(min_cnt)) continue;
void Dictionary::LoadTriLetterFromFile(const char* filename,
unsigned int min_cnt, unsigned int letter_count)
{
FILE* fid;
fid=fopen(filename, "r");
if (fid)
{
char sz_label[kMaxWordSize] = { 0 };
//while (fscanf_s(fid, "%s", sz_label, kMaxWordSize) != EOF)
while (fscanf(fid, "%900s", sz_label) != EOF) //bounded read: kMaxWordSize == 901
{
int64 freq;
fscanf(fid, "%lld", &freq);
if (freq < static_cast<int64>(min_cnt)) continue;
// Construct Tri-letter From word
size_t len = strlen(sz_label);
if (len > kMaxWordSize)
{
/*
multiverso::Log::Info("ignore super long term");
continue;
*/
}
// Construct Tri-letter From word
size_t len = strlen(sz_label);
if (len > kMaxWordSize)
{
multiverso::Log::Info("ignore super long term");
continue;
}
char tri_letters[kMaxWordSize + 2];
tri_letters[0] = '#';
int i = 0;
for (i = 0; i < strlen(sz_label); i++)
{
tri_letters[i + 1] = sz_label[i];
}
char tri_letters[kMaxWordSize + 2];
tri_letters[0] = '#';
int i = 0;
for (i = 0; i < strlen(sz_label); i++)
{
tri_letters[i + 1] = sz_label[i];
}
tri_letters[i + 1] = '#';
tri_letters[i + 2] = 0;
if (combine_) Insert(sz_label, freq);
tri_letters[i + 1] = '#';
tri_letters[i + 2] = 0;
if (combine_) Insert(sz_label, freq);
if (strlen(tri_letters) <= letter_count) {
Insert(tri_letters, freq);
}
else
{
for (i = 0; i <= strlen(tri_letters) - letter_count; ++i)
{
char tri_word[kMaxWordSize];
unsigned int j = 0;
for (j = 0; j < letter_count; j++)
{
tri_word[j] = tri_letters[i + j];
}
tri_word[j] = 0;
Insert(tri_word, freq);
}
}
}
fclose(fid);
}
}
if (strlen(tri_letters) <= letter_count) {
Insert(tri_letters, freq);
}
else
{
for (i = 0; i <= strlen(tri_letters) - letter_count; ++i)
{
char tri_word[kMaxWordSize];
unsigned int j = 0;
for (j = 0; j < letter_count; j++)
{
tri_word[j] = tri_letters[i + j];
}
tri_word[j] = 0;
Insert(tri_word, freq);
}
}
}
fclose(fid);
}
}
//Get the word's index from dictionary
int Dictionary::GetWordIdx(const char* word)
{
auto it = word_idx_map_.find(word);
if (it != word_idx_map_.end())
return it->second;
return -1;
}
//Return the size of the vocabulary
int Dictionary::Size()
{
return static_cast<int>(word_info_.size());
}
//Get the wordinfo from word or index
const WordInfo* Dictionary::GetWordInfo(const char* word)
{
auto it = word_idx_map_.find(word);
if (it != word_idx_map_.end())
return GetWordInfo(it->second);
return NULL;
}
//Get the word's index from dictionary
int Dictionary::GetWordIdx(const char* word)
{
auto it = word_idx_map_.find(word);
if (it != word_idx_map_.end())
return it->second;
return -1;
}
//Return the size of the vocabulary
int Dictionary::Size()
{
return static_cast<int>(word_info_.size());
}
//Get the wordinfo from word or index
const WordInfo* Dictionary::GetWordInfo(const char* word)
{
auto it = word_idx_map_.find(word);
if (it != word_idx_map_.end())
return GetWordInfo(it->second);
return NULL;
}
const WordInfo* Dictionary::GetWordInfo(int word_idx)
{
if (word_idx >= 0 && word_idx < word_info_.size())
return &word_info_[word_idx];
return NULL;
}
const WordInfo* Dictionary::GetWordInfo(int word_idx)
{
if (word_idx >= 0 && word_idx < word_info_.size())
return &word_info_[word_idx];
return NULL;
}
void Dictionary::StartIteration()
{
word_iterator_ = word_info_.begin();
}
//Judge whether the iterator is the end
bool Dictionary::HasMore()
{
return word_iterator_ != word_info_.end();
}
//Get the next Wordinfo
const WordInfo* Dictionary::Next()
{
const WordInfo* entry = &(*word_iterator_);
++word_iterator_;
return entry;
}
void Dictionary::StartIteration()
{
word_iterator_ = word_info_.begin();
}
//Judge whether the iterator is the end
bool Dictionary::HasMore()
{
return word_iterator_ != word_info_.end();
}
//Get the next Wordinfo
const WordInfo* Dictionary::Next()
{
const WordInfo* entry = &(*word_iterator_);
++word_iterator_;
return entry;
}
std::vector<WordInfo>::iterator Dictionary::Begin()
{
return word_info_.begin();
}
std::vector<WordInfo>::iterator Dictionary::End()
{
return word_info_.end();
}
void Dictionary::PrintVocab(){
int i = 0;
for (auto temp = Begin(); temp != End(); ++temp){
std::cout << temp->word << " " << i << std::endl;
i++;
}
}
}
std::vector<WordInfo>::iterator Dictionary::Begin()
{
return word_info_.begin();
}
std::vector<WordInfo>::iterator Dictionary::End()
{
return word_info_.end();
}
}
}
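Dictionary accumulates counts with Insert, prunes with RemoveWordsLessThan or MergeInfrequentWords, and exposes a cursor through StartIteration/HasMore/Next. A minimal build-prune-iterate sketch (the words and counts are invented):
// Sketch: typical Dictionary usage.
Dictionary dict;
dict.Insert("the", 100);
dict.Insert("embedding", 5);
dict.Insert("rare", 1);
dict.RemoveWordsLessThan(2); //drops "rare"; freq == 0 entries are always kept
for (dict.StartIteration(); dict.HasMore(); ) {
const WordInfo* info = dict.Next();
std::cout << info->word << " " << info->freq << std::endl;
}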

View file

@@ -1,100 +1,94 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_DICTIONARY_H_
#define DISTRIBUTED_WORD_EMBEDDING_DICTIONARY_H_
#pragma once
/*!
* \brief Class dictionary stores the vocabulary and its word frequencies
*/
#include <cstring>
#include <unordered_map>
#include <unordered_set>
#include <string>
#include <vector>
#include <iostream>
#include "multiverso/util/log.h"
#include "constant.h"
#include "log.h"
namespace multiverso
{
namespace wordembedding
{ /*!
* \brief struct WordInfo stores the pair of word&freq
*/
struct WordInfo
{
std::string word;
int64 freq;
WordInfo()
{
freq = 0;
word.clear();
}
WordInfo(const std::string& _word, int64 _freq)
{
word = _word;
freq = _freq;
}
};
namespace wordembedding
{ /*!
* \brief struct WordInfo stores the pair of word&freq
*/
struct WordInfo
{
std::string word;
int64 freq;
WordInfo()
{
freq = 0;
word.clear();
}
WordInfo(const std::string& _word, int64 _freq)
{
word = _word;
freq = _freq;
}
};
class Dictionary
{
public:
Dictionary();
Dictionary(int i);
void Clear();
/*!
* \brief Assign value to the set word_whitelist_
*/
void SetWhiteList(const std::vector<std::string>& whitelist);
/*!
* \brief Remove the low-freq word
*/
void RemoveWordsLessThan(int64 min_count);
/*!
* \brief Merge in the frequent words according to threshold
*/
void MergeInfrequentWords(int64 threshold);
/*!
* \brief Insert word-freq pair to the dictionary
* \param word the word string
* \param cnt the word's frequency
*/
void Insert(const char* word, int64 cnt = 1);
/*!
* \brief Load the word-freq pair from file
*/
void LoadFromFile(const char* filename);
void LoadTriLetterFromFile(const char* filename,
unsigned int min_cnt = 1, unsigned int letter_count = 3);
int GetWordIdx(const char* word);
/*!
* \brief Get the index of the word according to the dictionary
*/
const WordInfo* GetWordInfo(const char* word);
const WordInfo* GetWordInfo(int word_idx);
int Size();
void StartIteration();
/*!
* \brief Check whether word_iterator_ has reached the end
*/
bool HasMore();
/*!
* \brief Get the next wordinfo pointer in the vector
*/
const WordInfo* Next();
std::vector<WordInfo>::iterator Begin();
std::vector<WordInfo>::iterator End();
class Dictionary
{
public:
Dictionary();
Dictionary(int i);
void Clear();
/*!
* \brief Assign value to the set word_whitelist_
*/
void SetWhiteList(const std::vector<std::string>& whitelist);
/*!
* \brief Remove the low-freq word
*/
void RemoveWordsLessThan(int64 min_count);
/*!
* \brief Merge in the frequent words according to threshold
*/
void MergeInfrequentWords(int64 threshold);
/*!
* \brief Insert word-freq pair to the dictionary
* \param word the word string
* \param cnt the word's frequency
*/
void Insert(const char* word, int64 cnt = 1);
/*!
* \brief Load the word-freq pair from file
*/
void LoadFromFile(const char* filename);
void LoadTriLetterFromFile(const char* filename,
unsigned int min_cnt = 1, unsigned int letter_count = 3);
int GetWordIdx(const char* word);
/*!
* \brief Get the index of the word according to the dictionary
*/
const WordInfo* GetWordInfo(const char* word);
const WordInfo* GetWordInfo(int word_idx);
int Size();
void StartIteration();
/*!
* \brief Check whether word_iterator_ has reached the end
*/
bool HasMore();
/*!
* \brief Get the next wordinfo pointer in the vector
*/
const WordInfo* Next();
std::vector<WordInfo>::iterator Begin();
std::vector<WordInfo>::iterator End();
void PrintVocab();
private:
int combine_;
std::vector<WordInfo> word_info_;
std::vector<WordInfo>::iterator word_iterator_;
std::unordered_map<std::string, int> word_idx_map_;
std::unordered_set<std::string> word_whitelist_;
};
}
}
#endif
private:
int combine_;
std::vector<WordInfo> word_info_;
std::vector<WordInfo>::iterator word_iterator_;
std::unordered_map<std::string, int> word_idx_map_;
std::unordered_set<std::string> word_whitelist_;
};
}
}

View file

@@ -1,478 +1,467 @@
#include "distributed_wordembedding.h"
#include "distributed_wordembedding.h"
namespace multiverso
{
namespace wordembedding
namespace wordembedding
{
void Distributed_wordembedding::LoadOneBlock(DataBlock *data_block,
Reader *reader, int64 size)
void Distributed_wordembedding::Train(int argc, char *argv[])
{
clock_t start = clock();
//The barrier for trainers
multiverso::Barrier* barrier =
new multiverso::Barrier(option_->thread_cnt);
data_block->ClearSentences();
reader->ResetSize(size);
while (true)
{
int64 word_count = 0;
int *sentence = new (std::nothrow)int[kMaxSentenceLength + 2];
assert(sentence != nullptr);
int sentence_length = reader->GetSentence(sentence, word_count);
if (sentence_length > 0)
{
data_block->AddSentence(sentence, sentence_length,
word_count, (uint64)rand() * 10000 + (uint64)rand());
}
else
{
//Reader reached EOF or has already read data_block->size bytes,
//so reader_->GetSentence returns 0
delete[] sentence;
break;
}
}
MemoryManager* memory_mamanger =
new MemoryManager(option_->embeding_size);
WordEmbedding* WordEmbeddings[2] =
{ new WordEmbedding(option_, huffman_encoder_,
sampler_, dictionary_->Size()),
new WordEmbedding(option_, huffman_encoder_,
sampler_, dictionary_->Size()) };
multiverso::Log::Info("Rank %d LoadOneDataBlockTime:%lfs\n",process_id_,
(clock() - start) / (double)CLOCKS_PER_SEC);
}
//Step 1, Create Multiverso ParameterLoader and Trainers,
//Start Multiverso environment
WordEmbeddings[1]->MallocMemory();
void Distributed_wordembedding::StartLoadDataThread(BlockQueue *block_queue, Reader *reader, int64 file_size){
int data_block_count = 0;
for (int cur_epoch = 0; cur_epoch < option_->epoch; ++cur_epoch)
{
clock_t start_epoch = clock();
reader_->ResetStart();
for (int64 cur = 0; cur < file_size; cur += option_->data_block_size)
{
DataBlock *data_block = new (std::nothrow)DataBlock();
assert(data_block != nullptr);
LoadOneBlock(data_block, reader, option_->data_block_size);
//Prepare option_->thread_cnt trainers for multiverso
std::vector<multiverso::TrainerBase*>trainers;
for (int i = 0; i < option_->thread_cnt; ++i)
{
trainers.push_back(new (std::nothrow)Trainer(i, option_,
barrier, dictionary_, WordEmbeddings[1], memory_mamanger));
assert(trainers[i] != nullptr);
}
//multiverso::Log::Info("Rank %d Load Thread load the %d Data Block\n",process_id_,data_block_count);
data_block_count++;
//Start a thread to collect word_count from every trainers,
//and update the WordEmbeddings[1]->word_count_actual
StartCollectWordcountThread(trainers, WordEmbeddings[1]);
std::unique_lock<std::mutex> lock(block_queue->mtx);
(block_queue->queues).push(data_block);
(block_queue->repo_not_empty).notify_all();
lock.unlock();
}
}
//Prepare ParameterLoader
ParameterLoader *parameter_loader =new (std::nothrow)ParameterLoader(
option_, WordEmbeddings[0]);
assert(parameter_loader != nullptr);
DataBlock *data_block = new (std::nothrow)DataBlock();
assert(data_block != nullptr);
data_block->SetLastFlag();
std::unique_lock<std::mutex> lock(block_queue->mtx);
(block_queue->queues).push(data_block);
(block_queue->repo_not_empty).notify_all();
lock.unlock();
}
//Step 2, prepare the Config for multiverso
multiverso::Config config;
config.max_delay = option_->max_delay;
config.num_servers = option_->num_servers;
config.num_aggregator = option_->num_aggregator;
config.is_pipeline = option_->is_pipeline;
config.lock_option =
static_cast<multiverso::LockOption>(option_->lock_option);
config.num_lock = option_->num_lock;
//Config.server_endpoint_file = std::string(option_->endpoints_file);
DataBlock* Distributed_wordembedding::GetDataFromQueue(BlockQueue *block_queue){
std::unique_lock<std::mutex> lock(block_queue->mtx);
// item buffer is empty, just wait here.
while (block_queue->queues.size() == 0) {
multiverso::Log::Info("Waiting For Loading Data Block...\n");
(block_queue->repo_not_empty).wait(lock);
}
//Step3, Init the environment of multiverso
multiverso::Multiverso::Init(trainers, parameter_loader,
config, &argc, &argv);
DataBlock *temp = block_queue->queues.front();
multiverso::Log::Info("Geting Data Block From Queue...\n");
block_queue->queues.pop();
lock.unlock();
return temp;
}
DataBlock* Distributed_wordembedding::GetBlockAndPrepareParameter(BlockQueue *block_queue_){
DataBlock* data_block = GetDataFromQueue(block_queue_);
if (data_block->Size() == 0){
return data_block;
}
data_block->MallocMemory(dictionary_->Size(), option_);
PrepareData(data_block);
communicator_->RequestParameter(data_block);
GetAllWordCount();
return data_block;
}
void Distributed_wordembedding::GetAllWordCount(){
WordEmbedding_->word_count_actual = communicator_->GetWordCount();
WordEmbedding_->UpdateLearningRate();
//multiverso::Log::Info("Get all word count done.,word count actual is %d\n", WordEmbedding_->word_count_actual);
}
void Distributed_wordembedding::AddDeltaWordCount(){
int64 temp_word_count = communicator_->GetWordCount();
temp_word_count = WordEmbedding_->word_count_actual - temp_word_count;
communicator_->AddWordCount(temp_word_count);
//multiverso::Log::Info("Add word count done.word count delta is %d\n", WordEmbedding_->word_count_actual);
}
void Distributed_wordembedding::StartWordCount()
{
multiverso::Log::Info("Rank %d Start word count thread\n.",process_id_);
int64 total_word_count = 0, sum = 0;
while (is_running_)
{
sum = 0;
for (int i = 0; i < trainers_.size(); ++i)
sum += trainers_[i]->word_count;
if (sum < 10000 + total_word_count)
{
std::chrono::milliseconds dura(20);
std::this_thread::sleep_for(dura);
}
else
{
WordEmbedding_->word_count_actual += sum - total_word_count;
WordEmbedding_->UpdateLearningRate();
total_word_count = sum;
if (!option_->use_adagrad)
{
/*
multiverso::Log::Info("Rank %d Alpha: %lf Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
multiverso::MV_Rank(), WordEmbedding_->learning_rate,
WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
WordEmbedding_->word_count_actual,
total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
*/
}
else
{
/*
multiverso::Log::Info("Rank %d Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
multiverso::MV_Rank(),
WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
WordEmbedding_->word_count_actual,
total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
*/
}
}
}
//Add the left word_count to the WordEmbedding
WordEmbedding_->word_count_actual += sum - total_word_count;
WordEmbedding_->UpdateLearningRate();
}
void Distributed_wordembedding::StartCollectWordcountThread()
{
is_running_ = true;
collect_wordcount_thread_ = std::thread(
&Distributed_wordembedding::StartWordCount, this);
}
void Distributed_wordembedding::StopCollectWordcountThread()
{
is_running_ = false;
collect_wordcount_thread_.join();
}
void Distributed_wordembedding::TrainNeuralNetwork(){
int64 file_size = GetFileSize(option_->train_file);
multiverso::Log::Info("train-file-size:%lld, data_block_size:%lld.\n",
file_size, option_->data_block_size);
block_queue_ = new BlockQueue();
load_data_thread_ = std::thread(&Distributed_wordembedding::StartLoadDataThread, this, block_queue_, reader_, file_size);
WordEmbedding_ = new WordEmbedding(option_, huffman_encoder_,
sampler_, dictionary_->Size());
assert(WordEmbedding_ != nullptr);
for (int i = 0; i < option_->thread_cnt; ++i)
{
trainers_.push_back(new (std::nothrow) Trainer(i, option_, dictionary_, WordEmbedding_));
assert(trainers_[i] != nullptr);
}
StartCollectWordcountThread();
start_ = clock();
int data_block_count = 0;
DataBlock *next_block = nullptr;
DataBlock *data_block = nullptr;
data_block = GetBlockAndPrepareParameter(block_queue_);
if (data_block == nullptr){
multiverso::Log::Info("Please Change the Bigger Block Size.\n");
return;
}
data_block_count++;
int64 all = file_size / option_->data_block_size + 1;
for (int cur_epoch = 0; cur_epoch < option_->epoch; ++cur_epoch)
{
clock_t start_epoch = clock();
for (int64 cur = 0; cur < all; ++cur)
{
clock_t start_block = clock();
if (option_->is_pipeline == false){
#pragma omp parallel for num_threads(option_->thread_cnt)
for (int i = 0; i < option_->thread_cnt; ++i){
trainers_[i]->TrainIteration(data_block);
}
communicator_->AddDeltaParameter(data_block);
delete data_block;
data_block = GetBlockAndPrepareParameter(block_queue_);
data_block_count++;
multiverso::Log::Info("Get the %d Data Block and Request done.\n", data_block_count);
}
else{
#pragma omp parallel num_threads(option_->thread_cnt+1)
{
if (omp_get_thread_num() == option_->thread_cnt){
next_block = GetBlockAndPrepareParameter(block_queue_);
data_block_count++;
}
else{
trainers_[omp_get_thread_num()]->TrainIteration(data_block);
}
}
communicator_->AddDeltaParameter(data_block);
delete data_block;
//If next_block == nullptr then data_block becomes null and we do not run another block
data_block = next_block;
next_block = nullptr;
}
multiverso::Log::Info("Rank %d Dealing one block time:%lfs\n", process_id_,
(clock() - start_block) / (double)CLOCKS_PER_SEC);
}
multiverso::Log::Info("Rank %d Dealing %d epoch time:%lfs\n", process_id_, cur_epoch,
(clock() - start_epoch) / (double)CLOCKS_PER_SEC);
if (process_id_ == 0){
SaveEmbedding(ChangeFileName(option_->output_file, cur_epoch), option_->output_binary);
}
}
multiverso::Log::Info("Rank %d Finish Traning %d Block.\n",process_id_, data_block_count);
StopCollectWordcountThread();
//multiverso::Log::Info("Rank %d stop the word count thread.\n", process_id_);
load_data_thread_.join();
//multiverso::Log::Info("Rank %d stop the load data thread.\n", process_id_);
assert(data_block->isLast() == true);
delete data_block;
delete WordEmbedding_;
delete block_queue_;
for (auto trainer : trainers_)
{
delete trainer;
}
//multiverso::Log::Info("Rank %d delete all pointers.\n",process_id_);
}
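When option_->is_pipeline is set, TrainNeuralNetwork runs thread_cnt + 1 OpenMP threads: the extra thread prefetches the next data block and its parameters while the others train on the current block, and the implicit barrier at the end of the parallel region joins the two. The overlap pattern reduced to a skeleton (a sketch: Fetch and Train stand in for GetBlockAndPrepareParameter and TrainIteration):
// Sketch: overlap training of the current block with prefetch of the next.
DataBlock* PipelineStep(DataBlock* current, int thread_cnt) {
DataBlock* next = nullptr;
#pragma omp parallel num_threads(thread_cnt + 1)
{
if (omp_get_thread_num() == thread_cnt)
next = Fetch(); //one thread requests the next block's parameters
else
Train(current, omp_get_thread_num()); //the rest train on the current block
} //implicit barrier: both training and prefetch have finished
return next; //becomes `current` in the next iteration
}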
const char* Distributed_wordembedding::ChangeFileName(const char *file_path, int iteration){
std::string c_iteration = "_" + std::to_string(iteration);
//Allocate room for the base name, the suffix and the terminating NUL
char *temp = new char[strlen(file_path) + c_iteration.length() + 1];
strcpy(temp, file_path);
return strcat(temp, c_iteration.c_str());
}
void Distributed_wordembedding::SaveEmbedding(const char *file_path, bool is_binary){
clock_t start = clock();
const int batch = 50000;
int epoch = dictionary_->Size() / batch;
int left = dictionary_->Size() % batch;
int base = 0;
std::vector<real*> blocks;
std::vector<int> nodes;
FILE* fid = nullptr;
fid = is_binary ? fopen(file_path, "wb") : fopen(file_path, "wt");
fprintf(fid, "%d %d\n", dictionary_->Size(), option_->embeding_size);
for (int i = 0; i < epoch; ++i){
for (int j = 0; j < batch; ++j){
nodes.push_back(base + j);
}
communicator_->RequestBlocks(batch, blocks);
communicator_->GetWorkerTableRows(nodes, blocks,option_->embeding_size);
WriteToFile(is_binary, blocks,fid);
communicator_->ReturnBlocks(blocks);
blocks.clear();
nodes.clear();
base = (i + 1)*batch;
}
if (left > 0){
for (int j = 0; j <left; ++j){
nodes.push_back(base + j);
}
communicator_->RequestBlocks(left, blocks);
communicator_->GetWorkerTableRows(nodes, blocks, option_->embeding_size);
WriteToFile(is_binary, blocks, fid);
communicator_->ReturnBlocks(blocks);
}
fclose(fid);
multiverso::Log::Info("Rank % dSaving Embedding time:%lfs\n", process_id_,
(clock() - start) / (double)CLOCKS_PER_SEC);
}
void Distributed_wordembedding::WriteToFile(bool is_binary, std::vector<real*> &blocks, FILE* fid){
for (int i = 0; i < blocks.size(); ++i)
{
fprintf(fid, "%s ", dictionary_->GetWordInfo(i)->word.c_str());
for (int j = 0; j < option_->embeding_size; ++j)
{
if (is_binary){
real tmp = blocks[i][j];
fwrite(&tmp, sizeof(real), 1, fid);
}
else{
fprintf(fid, "%lf ", blocks[i][j]);
}
}
fprintf(fid, "\n");
}
}
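SaveEmbedding and WriteToFile emit the standard word2vec layout: a header line with the vocabulary size and embedding dimension, then one word per line followed by its vector. To make the format concrete, a sketch of a reader for the text (is_binary == false) variant, assuming well-formed input (ReadEmbedding is an illustrative name; real code should check every fscanf result):
// Sketch: read back the text embedding file written above.
void ReadEmbedding(const char* file_path) {
FILE* fid = fopen(file_path, "rt");
int vocab_size, embedding_size;
fscanf(fid, "%d %d", &vocab_size, &embedding_size);
char word[kMaxString];
std::vector<real> vec(embedding_size);
for (int i = 0; i < vocab_size; ++i) {
fscanf(fid, "%99s", word); //kMaxString == 100, so cap at 99 chars
for (int j = 0; j < embedding_size; ++j)
fscanf(fid, "%f", &vec[j]); //real is float (see constant.h)
// ... use (word, vec) ...
}
fclose(fid);
}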
void Distributed_wordembedding::PrepareData(DataBlock *data_block){
clock_t start = clock();
WordEmbedding_->PrepareData(data_block);
multiverso::Log::Info("Rank %d Prepare data time:%lfs.\n",process_id_,
(clock() - start) / (double)CLOCKS_PER_SEC);
}
void Distributed_wordembedding::Train(int argc, char *argv[])
{
//fix later
argc = 1;
argv = nullptr;
multiverso::MV_Init(&argc, argv);
multiverso::Log::Info("MV Rank %d Init done.\n",multiverso::MV_Rank());
char log_name[100];
sprintf(log_name, "log%s.txt", g_log_suffix.c_str());
multiverso::Log::ResetLogFile(log_name);
//Mark the node machine number
process_id_ = multiverso::Multiverso::ProcessRank();
//Step 4, prepare the sever/aggregator/cache Table for parametertable(3 or 5)
//and initialize the severtable for inputvector
PrepareMultiversoParameterTables(option_, dictionary_);
MV_Barrier();
multiverso::Log::Info("MV Barrier done.\n");
//Mark the node machine number
process_id_ = multiverso::MV_Rank();
//Step 5, start the Train of NN
TrainNeuralNetwork();
//create worker table and server table
communicator_->PrepareParameterTables(dictionary_->Size(), option_->embeding_size);
//Step6, stop the thread which are collecting word_count,
//and release the resource
StopCollectWordcountThread();
delete barrier;
delete memory_mamanger;
delete WordEmbeddings[0];
delete WordEmbeddings[1];
for (auto trainer : trainers)
{
delete trainer;
}
delete parameter_loader;
multiverso::Multiverso::Close();
}
//start to train
TrainNeuralNetwork();
//The thread to collect word_count from trainers_
void Distributed_wordembedding::StartThread()
{
int64 total_word_count = 0, sum = 0;
while (is_running_)
{
sum = 0;
for (int i = 0; i < trainers_.size(); ++i)
sum += trainers_[i]->word_count;
MV_ShutDown();
multiverso::Log::Info("MV ShutDone done.\n");
}
if (sum < 10000 + total_word_count)
{
std::chrono::milliseconds dura(20);
std::this_thread::sleep_for(dura);
}
else
{
WordEmbedding_->word_count_actual += sum - total_word_count;
WordEmbedding_->UpdateLearningRate();
total_word_count = sum;
void Distributed_wordembedding::Run(int argc, char *argv[])
{
g_log_suffix = GetSystemTime();
srand(static_cast<unsigned int>(time(NULL)));
if (!option_->use_adagrad)
{
multiverso::Log::Info("Rank %d Alpha: %lf Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
multiverso::Multiverso::ProcessRank(), WordEmbedding_->learning_rate,
WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
WordEmbedding_->word_count_actual,
total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
}
else
{
multiverso::Log::Info("Rank %d Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
multiverso::Multiverso::ProcessRank(),
WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
WordEmbedding_->word_count_actual,
total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
}
}
}
option_ = new (std::nothrow)Option();
assert(option_ != nullptr);
//Add the left word_count to the WordEmbedding
WordEmbedding_->word_count_actual += sum - total_word_count;
WordEmbedding_->UpdateLearningRate();
}
dictionary_ = new (std::nothrow)Dictionary();
assert(dictionary_ != nullptr);
//Start a thread to collect the word count from trainers
//The thread can be stopped by StopCollectWordcountThread()
void Distributed_wordembedding::StartCollectWordcountThread(
std::vector<multiverso::TrainerBase*> &trainer_bases, WordEmbedding *WordEmbedding)
{
is_running_ = true;
WordEmbedding_ = WordEmbedding;
for (int i = 0; i < trainer_bases.size(); ++i)
trainers_.push_back(reinterpret_cast<Trainer*>(trainer_bases[i]));
huffman_encoder_ = new (std::nothrow)HuffmanEncoder();
assert(huffman_encoder_ != nullptr);
//Parse argument and store them in option
//Start a thread to collect the actual_word_count
collect_wordcount_thread_ = std::thread(
&Distributed_wordembedding::StartThread, this);
}
if (argc <= 1)
{
option_->PrintUsage();
return;
}
//Stop the thread which is collecting the word_count_actual from trainers
void Distributed_wordembedding::StopCollectWordcountThread()
{
is_running_ = false;
collect_wordcount_thread_.join();
}
option_->ParseArgs(argc, argv);
//Create the three kinds of tables
void Distributed_wordembedding::CreateMultiversoParameterTable(
multiverso::integer_t table_id, multiverso::integer_t rows,
multiverso::integer_t cols, multiverso::Type type,
multiverso::Format default_format)
{
multiverso::Multiverso::AddServerTable(table_id, rows,
cols, type, default_format);
multiverso::Multiverso::AddCacheTable(table_id, rows,
cols, type, default_format, 0);
multiverso::Multiverso::AddAggregatorTable(table_id, rows,
cols, type, default_format, 0);
}
//Read the vocabulary file; create the dictionary
//and huffman_encoder according opt
if ((option_->hs == 1) && (option_->negative_num != 0))
{
multiverso::Log::Fatal("The Hierarchical Softmax and Negative Sampling is indefinite!\n");
exit(0);
}
void Distributed_wordembedding::PrepareMultiversoParameterTables(
Option *opt, Dictionary *dictionary)
{
multiverso::Multiverso::BeginConfig();
int proc_count = multiverso::Multiverso::TotalProcessCount();
option_->total_words = LoadVocab(option_, dictionary_,
huffman_encoder_);
//Create tables, the order of creating tables should arise from 0 continuously
//The elements of tables will be initialized with 0
CreateMultiversoParameterTable(kInputEmbeddingTableId,
dictionary->Size(), opt->embeding_size,
multiverso::Type::Float, multiverso::Format::Dense);
option_->PrintArgs();
CreateMultiversoParameterTable(kEmbeddingOutputTableId,
dictionary->Size(), opt->embeding_size,
multiverso::Type::Float, multiverso::Format::Dense);
sampler_ = new (std::nothrow)Sampler();
assert(sampler_ != nullptr);
if (option_->negative_num)
sampler_->SetNegativeSamplingDistribution(dictionary_);
CreateMultiversoParameterTable(kWordCountActualTableId, 1, 1,
multiverso::Type::LongLong, multiverso::Format::Dense);
char *filename = new (std::nothrow)char[strlen(option_->train_file) + 1];
assert(filename != nullptr);
strcpy(filename, option_->train_file);
reader_ = new (std::nothrow)Reader(dictionary_, option_, sampler_, filename);
assert(reader_ != nullptr);
communicator_ = new (std::nothrow)Communicator(option_);
//Train with multiverso
this->Train(argc, argv);
if (opt->use_adagrad)
{
CreateMultiversoParameterTable(kSumGradient2IETableId,
dictionary->Size(), opt->embeding_size,
multiverso::Type::Float, multiverso::Format::Dense);
CreateMultiversoParameterTable(kSumGradient2EOTableId,
dictionary->Size(), opt->embeding_size,
multiverso::Type::Float, multiverso::Format::Dense);
}
delete option_;
delete dictionary_;
delete huffman_encoder_;
delete sampler_;
delete reader_;
delete communicator_;
}
//Initialize server tables
//Every process will execute the code below, so the initialized
//value should be divided by the number of processes
for (int row = 0; row < dictionary->Size(); ++row)
{
for (int col = 0; col < opt->embeding_size; ++col)
{
multiverso::Multiverso::AddToServer<real>(
kInputEmbeddingTableId, row, col,
static_cast<real>((static_cast<real>(rand())
/ RAND_MAX - 0.5) / opt->embeding_size / proc_count));
}
}
multiverso::Multiverso::EndConfig();
}
//Get the size of the file; it must handle large files
int64 Distributed_wordembedding::GetFileSize(const char *filename)
{
#ifdef _MSC_VER
struct _stat64 info;
_stat64(filename, &info);
return (int64)info.st_size;
#else
struct stat info;
stat(filename, &info);
return(int64)info.st_size;
#endif
}
//Remove the datablocks already dealt with by the parameter loader and trainer
void Distributed_wordembedding::RemoveDoneDataBlock(
std::queue<DataBlock*> &datablock_queue)
{
while (datablock_queue.empty() == false
&& datablock_queue.front()->IsDone())
{
DataBlock *p_data_block = datablock_queue.front();
datablock_queue.pop();
delete p_data_block;
}
}
void Distributed_wordembedding::PushDataBlock(
std::queue<DataBlock*> &datablock_queue, DataBlock* data_block)
{
multiverso::Multiverso::PushDataBlock(data_block);
datablock_queue.push(data_block);
//limit the max size of total datablocks to avoid out of memory
while (static_cast<int64>(datablock_queue.size()) * option_->data_block_size
> option_->max_preload_data_size)
{
std::chrono::milliseconds dura(200);
std::this_thread::sleep_for(dura);
//Remove the datablocks already dealt with by the parameter loader and trainer
RemoveDoneDataBlock(datablock_queue);
}
}
multiverso::Log::Info("Loading vocab time:%lfs\n",
(clock() - start) / (double)CLOCKS_PER_SEC);
void Distributed_wordembedding::TrainNeuralNetwork()
{
std::queue<DataBlock*> datablock_queue;
int data_block_count = 0;
int64 file_size = GetFileSize(option_->train_file);
multiverso::Log::Info("train-file-size:%lld, data_block_size:%lld\n",
file_size, option_->data_block_size);
start_ = clock();
multiverso::Multiverso::BeginTrain();
for (int cur_epoch = 0; cur_epoch < option_->epoch; ++cur_epoch)
{
reader_->ResetStart();
multiverso::Multiverso::BeginClock();
for (int64 cur = 0; cur < file_size; cur += option_->data_block_size)
{
++data_block_count;
DataBlock *data_block = new (std::nothrow)DataBlock();
assert(data_block != nullptr);
//Load the sentences from train file, and store them in data_block
clock_t start = clock();
LoadData(data_block, reader_, option_->data_block_size);
multiverso::Log::Info("LoadOneDataBlockTime:%lfs\n",
(clock() - start) / (double)CLOCKS_PER_SEC);
PushDataBlock(datablock_queue, data_block);
}
multiverso::Multiverso::EndClock();
}
//Push an empty Test datablock so that the input-embedding weights get dumped
multiverso::Multiverso::BeginClock();
++data_block_count;
DataBlock *data_block = new (std::nothrow)DataBlock();
assert(data_block != nullptr);
data_block->SetType(DataBlockType::Test);
PushDataBlock(datablock_queue, data_block);
multiverso::Multiverso::EndClock();
multiverso::Log::Info("Rank %d Pushed %d datablocks\n",
process_id_, data_block_count);
multiverso::Multiverso::EndTrain();
//After EndTrain, all the datablock are done,
//we remove all the datablocks
RemoveDoneDataBlock(datablock_queue);
}
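
PushDataBlock above implements simple backpressure: each block is handed to Multiverso immediately, but the producer sleeps whenever the bytes held by the in-flight queue exceed max_preload_data_size, reclaiming finished blocks while it waits. A self-contained sketch of the same pattern (the Block type and sizes here are illustrative, not from the source):

#include <chrono>
#include <queue>
#include <thread>

struct Block { bool done = false; };  //stands in for DataBlock::IsDone()

void PushWithBackpressure(std::queue<Block*>& queue, Block* block,
    long long block_size, long long max_preload)
{
    queue.push(block);  //in the source the block is also pushed into Multiverso
    //Sleep while the preloaded bytes exceed the cap, reclaiming finished blocks
    while (static_cast<long long>(queue.size()) * block_size > max_preload)
    {
        std::this_thread::sleep_for(std::chrono::milliseconds(200));
        while (!queue.empty() && queue.front()->done)
        {
            delete queue.front();
            queue.pop();
        }
    }
}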
void Distributed_wordembedding::LoadData(DataBlock *data_block,
Reader *reader, int64 size)
{
//Be sure to clear all the sentences
//which were stored in data_block
data_block->ClearSentences();
reader->ResetSize(size);
while (true)
{
int64 word_count = 0;
int *sentence = new (std::nothrow)int[kMaxSentenceLength + 2];
assert(sentence != nullptr);
int sentence_length = reader->GetSentence(sentence, word_count);
if (sentence_length > 0)
{
data_block->AddSentence(sentence, sentence_length,
word_count, (uint64)rand() * 10000 + (uint64)rand());
}
else
{
//The reader reached EOF or has already consumed data_block->size bytes,
//so reader->GetSentence returns 0
delete[] sentence;
break;
}
}
}
void Distributed_wordembedding::Run(int argc, char *argv[])
{
g_log_suffix = GetSystemTime();
srand(static_cast<unsigned int>(time(NULL)));
option_ = new (std::nothrow)Option();
assert(option_ != nullptr);
dictionary_ = new (std::nothrow)Dictionary();
assert(dictionary_ != nullptr);
huffman_encoder_ = new (std::nothrow)HuffmanEncoder();
assert(huffman_encoder_ != nullptr);
//Parse the arguments and store them in option_
if (argc <= 1)
{
option_->PrintUsage();
return;
}
option_->ParseArgs(argc, argv);
//Read the vocabulary file; create the dictionary
//and huffman_encoder according to opt
if ((option_->hs == 1) && (option_->negative_num != 0))
{
multiverso::Log::Fatal("The Hierarchical Softmax and Negative Sampling is indefinite!\n");
exit(0);
}
multiverso::Log::Info("Loading vocabulary ...\n");
option_->total_words = LoadVocab(option_, dictionary_,
huffman_encoder_);
multiverso::Log::Info("Loaded vocabulary\n");
option_->PrintArgs();
sampler_ = new (std::nothrow)Sampler();
assert(sampler_ != nullptr);
if (option_->negative_num)
sampler_->SetNegativeSamplingDistribution(dictionary_);
char *filename = new (std::nothrow)char[strlen(option_->train_file) + 1];
assert(filename != nullptr);
strcpy(filename, option_->train_file);
reader_ = new (std::nothrow)Reader(dictionary_, option_, sampler_, filename);
assert(reader_ != nullptr);
//Train with multiverso
this->Train(argc, argv);
delete reader_;
delete sampler_;
delete huffman_encoder_;
delete dictionary_;
delete option_;
}
//Read a word from train_file into the word buffer
bool Distributed_wordembedding::ReadWord(char *word, FILE *fin)
{
int idx = 0;
char ch;
while (!feof(fin))
{
ch = fgetc(fin);
if (ch == 13) continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
{
if (idx > 0)
{
if (ch == '\n')
ungetc(ch, fin);
break;
}
if (ch == '\n')
{
strcpy(word, (char *)"</s>");
return true;
}
else
{
continue;
}
}
word[idx++] = ch;
if (idx >= kMaxString - 1)
idx--;
}
word[idx] = 0;
return idx > 0;
}
//Read the vocabulary file; create the dictionary
//and huffman_encoder according to opt
int64 Distributed_wordembedding::LoadVocab(Option *opt,
Dictionary *dictionary, HuffmanEncoder *huffman_encoder)
{
int64 total_words = 0;
char word[kMaxString];
FILE* fid = nullptr;
multiverso::Log::Info("vocab_file %s\n", opt->read_vocab_file);
if (opt->read_vocab_file != nullptr && strlen(opt->read_vocab_file) > 0)
{
multiverso::Log::Info("Begin to load vocabulary file [%s] ...\n",
opt->read_vocab_file);
fid = fopen(opt->read_vocab_file, "r");
if (fid == nullptr)
{
multiverso::Log::Fatal("Open vocab_file failed!\n");
exit(1);
}
int word_freq;
while (fscanf(fid, "%s %d", word, &word_freq) != EOF)
{
dictionary->Insert(word, word_freq);
}
}
dictionary->RemoveWordsLessThan(opt->min_count);
multiverso::Log::Info("Dictionary size: %d\n", dictionary->Size());
total_words = 0;
for (int i = 0; i < dictionary->Size(); ++i)
total_words += dictionary->GetWordInfo(i)->freq;
multiverso::Log::Info("Words in Dictionary %I64d\n", total_words);
if (opt->hs)
huffman_encoder->BuildFromTermFrequency(dictionary);
if (fid != nullptr)
fclose(fid);
return total_words;
}
}
}
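
For reference, the fscanf loop in LoadVocab above expects the file passed via -read_vocab to hold one "word frequency" pair per line. A minimal standalone sketch that writes a toy vocabulary in that format (the words and counts are made up for illustration):

#include <cstdio>
#include <map>
#include <string>

int main()
{
    //Hypothetical frequency counts; a real file comes from a corpus word count
    std::map<std::string, int> counts = { {"the", 120}, {"quick", 7}, {"fox", 5} };
    FILE* fid = fopen("vocab.txt", "w");
    if (fid == nullptr) return 1;
    for (const auto& kv : counts)
        fprintf(fid, "%s %d\n", kv.first.c_str(), kv.second);
    fclose(fid);
    return 0;
}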

View file

@ -1,15 +1,14 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_DISTRIBUTED_WORDEMBEDDING_H_
#define DISTRIBUTED_WORD_EMBEDDING_DISTRIBUTED_WORDEMBEDDING_H_
#pragma once
/*!
* file distributed_wordembedding.h
* \brief Class Distributed_wordembedding describes the main framework of Distributed WordEmbedding and some useful functions
*/
#include <vector>
#include <ctime>
#include <stdlib.h>
#include <string.h>
#include <unordered_set>
#include <unordered_map>
#include <iostream>
@ -18,103 +17,107 @@
#include <thread>
#include <mutex>
#include <functional>
#include <omp.h>
#include "multiverso/multiverso.h"
#include <sys/stat.h>
#include "util.h"
#include "multiverso.h"
#include "huffman_encoder.h"
#include "reader.h"
#include "data_block.h"
#include "parameter_loader.h"
#include "trainer.h"
#include "memory_manager.h"
#include "block_queue.h"
#include "communicator.h"
#include "reader.h"
#include "log.h"
#include "constant.h"
namespace multiverso
{
namespace wordembedding
{
extern std::string g_log_suffix;
class Trainer;
class WordEmbedding;
class Communicator;
namespace wordembedding
{
extern std::string g_log_suffix;
class Trainer;
class Distributed_wordembedding
{
public:
Distributed_wordembedding(){}
/*!
* \brief Run Function contains everything
*/
void Run(int argc, char *argv[]);
class Distributed_wordembedding
{
public:
Distributed_wordembedding(){}
/*!
* \brief Run Function contains everything
*/
void Run(int argc, char *argv[]);
private:
clock_t start_;
int process_id_;
Option* option_ = nullptr;
Dictionary* dictionary_ = nullptr;
HuffmanEncoder* huffman_encoder_ = nullptr;
Sampler* sampler_ = nullptr;
Reader* reader_ = nullptr;
WordEmbedding* WordEmbedding_ = nullptr;
BlockQueue *block_queue_ = nullptr;
std::thread load_data_thread_;
std::thread collect_wordcount_thread_;
bool is_running_ = false;
std::vector<Trainer*> trainers_;
Communicator* communicator_;
/*!
* \brief Load Dictionary from the vocabulary_file
* \param opt the model settings
* \param dictionary save the vocabulary and its frequency
* \param huffman_encoder convert dictionary to the huffman_code
*/
int64 LoadVocab(Option *opt, Dictionary *dictionary,
HuffmanEncoder *huffman_encoder);
/*!
* \brief Load data from train_file into the datablock
* \param datablock the datablock which needs to be assigned
* \param reader some useful function for calling
* \param size datablock limit byte size
*/
//void LoadData(DataBlock *data_block, Reader *reader, int64 size);
/*!
* \brief Complete the train task with multiverso
*/
void Train(int argc, char *argv[]);
void TrainNeuralNetwork();
void PrepareData(DataBlock *data_block);
void StartLoadDataThread(BlockQueue *block_queue,
Reader *reader, int64 file_size);
void LoadOneBlock(DataBlock *data_block,
Reader *reader, int64 size);
void StartCollectWordcountThread();
void StopCollectWordcountThread();
void StartWordCount();
void GetAllWordCount();
void AddDeltaWordCount();
DataBlock* GetDataFromQueue(BlockQueue *block_queue);
DataBlock* GetBlockAndPrepareParameter(BlockQueue *block_queue_);
void SaveEmbedding(const char *file_path, bool is_binary);
void WriteToFile(bool is_binary, std::vector<real*> &blocks, FILE* fid);
const char* ChangeFileName(const char *file_path, int iteration);
};
}
private:
clock_t start_;
int process_id_;
Option* option_;
Dictionary* dictionary_;
HuffmanEncoder* huffman_encoder_;
Sampler* sampler_;
Reader* reader_;
std::thread collect_wordcount_thread_;
bool is_running_;
std::vector<Trainer*> trainers_;
WordEmbedding *WordEmbedding_;
/*!
* \brief Create a new thread which is used for
* calculating the speed of word processing.
*/
void StartThread();
void StartCollectWordcountThread(
std::vector<multiverso::TrainerBase*> &trainer, WordEmbedding *WordEmbedding);
void StopCollectWordcountThread();
/*!
* \brief Read the word from the train_file
* \param word word saved by string
* \param fin train_filename
*/
bool ReadWord(char *word, FILE *fin);
/*!
* \brief Load Dictionary from the vocabulary_file
* \param opt the model settings
* \param dictionary save the vocabulary and its frequency
* \param huffman_encoder convert dictionary to the huffman_code
*/
int64 LoadVocab(Option *opt, Dictionary *dictionary,
HuffmanEncoder *huffman_encoder);
/*!
* \brief Get the byte size of the file
*/
int64 GetFileSize(const char *filename);
/*!
* \brief Complete the train task with multiverso
*/
void Train(int argc, char *argv[]);
void TrainNeuralNetwork();
/*!
* \brief Create a new table in the multiverso
*/
void CreateMultiversoParameterTable(multiverso::integer_t table_id,
multiverso::integer_t rows, multiverso::integer_t cols,
multiverso::Type type, multiverso::Format default_format);
/*!
* \brief Push the datablock into the multiverso and datablock_queue
*/
void PushDataBlock(std::queue<DataBlock*> &datablock_queue,
DataBlock* data_block);
/*!
* \brief Prepare parameter table in the multiverso
*/
void PrepareMultiversoParameterTables(Option *opt,
Dictionary *dictionary);
/*!
* \brief Load data from train_file into the datablock
* \param datablock the datablock which needs to be assigned
* \param reader some useful function for calling
* \param size datablock limit byte size
*/
void LoadData(DataBlock *data_block, Reader *reader, int64 size);
/*!
* \brief Remove datablock which is finished by multiverso thread
* \param datablock_queue store the pushed datablocks
*/
void RemoveDoneDataBlock(std::queue<DataBlock*> &datablock_queue);
// No copying allowed
Distributed_wordembedding(const Distributed_wordembedding&);
void operator=(const Distributed_wordembedding&);
};
}
}
#endif

View file

@ -1,287 +1,287 @@
#include <cstring>
#include "huffman_encoder.h"
namespace multiverso
{
namespace wordembedding
{
HuffmanEncoder::HuffmanEncoder()
{
dict_ = nullptr;
//multiverso::Log::Info("ignore super long term");
}
namespace wordembedding
{
HuffmanEncoder::HuffmanEncoder()
{
dict_ = nullptr;
}
//Save the word-huffmancode pair in file
void HuffmanEncoder::Save2File(const char* filename)
{
FILE* fid = fopen(filename, "w");
if (fid)
{
fprintf(fid, "%lld\n", hufflabel_info_.size());
//Save the word-huffmancode pair in file
void HuffmanEncoder::Save2File(const char* filename)
{
FILE* fid = fopen(filename, "w");
if (fid)
{
fprintf(fid, "%lld\n", hufflabel_info_.size());
for (unsigned i = 0; i < hufflabel_info_.size(); ++i)
{
auto info = hufflabel_info_[i];
const auto word = dict_->GetWordInfo(i);
fprintf(fid, "%s %d", word->word.c_str(), info.codelen);
for (unsigned i = 0; i < hufflabel_info_.size(); ++i)
{
auto info = hufflabel_info_[i];
const auto word = dict_->GetWordInfo(i);
fprintf(fid, "%s %d", word->word.c_str(), info.codelen);
for (int j = 0; j < info.codelen; ++j)
fprintf(fid, " %d", info.code[j]);
for (int j = 0; j < info.codelen; ++j)
fprintf(fid, " %d", info.code[j]);
for (int j = 0; j < info.codelen; ++j)
fprintf(fid, " %d", info.point[j]);
for (int j = 0; j < info.codelen; ++j)
fprintf(fid, " %d", info.point[j]);
fprintf(fid, "\n");
}
fprintf(fid, "\n");
}
fclose(fid);
}
else
{
multiverso::Log::Error("file open failed %s", filename);
}
}
//Recover the word-huffmancode pair from file
void HuffmanEncoder::RecoverFromFile(const char* filename)
{
dict_ = new (std::nothrow)Dictionary();
assert(dict_ != nullptr);
FILE* fid;
fid=fopen(filename, "r");
if (fid)
{
int64 vocab_size;
fscanf(fid, "%lld", &vocab_size);
hufflabel_info_.reserve(vocab_size);
hufflabel_info_.clear();
fclose(fid);
}
else
{
//multiverso::Log::Error("file open failed %s", filename);
}
}
int tmp;
char sz_label[kMaxWordSize];
for (int64 i = 0; i < vocab_size; ++i)
{
HuffLabelInfo info;
//Recover the word-huffmancode pair from file
void HuffmanEncoder::RecoverFromFile(const char* filename)
{
dict_ = new (std::nothrow)Dictionary();
assert(dict_ != nullptr);
FILE* fid;
fid = fopen(filename, "r");
if (fid)
{
int64 vocab_size;
fscanf(fid, "%lld", &vocab_size);
hufflabel_info_.reserve(vocab_size);
hufflabel_info_.clear();
//fscanf_s(fid, "%s", sz_label, kMaxWordSize);
fscanf(fid, "%s", sz_label, kMaxWordSize);
dict_->Insert(sz_label);
int tmp;
char sz_label[kMaxWordSize];
for (int64 i = 0; i < vocab_size; ++i)
{
HuffLabelInfo info;
fscanf(fid, "%d", &info.codelen);
//fscanf_s(fid, "%s", sz_label, kMaxWordSize);
fscanf(fid, "%s", sz_label, kMaxWordSize);
dict_->Insert(sz_label);
info.code.clear();
info.point.clear();
fscanf(fid, "%d", &info.codelen);
for (int j = 0; j < info.codelen; ++j)
{
fscanf(fid, "%d", &tmp);
info.code.push_back(tmp);
}
for (int j = 0; j < info.codelen; ++j)
{
fscanf(fid, "%d", &tmp);
info.point.push_back(tmp);
}
info.code.clear();
info.point.clear();
hufflabel_info_.push_back(info);
}
fclose(fid);
}
else
{
multiverso::Log::Error("file open failed %s", filename);
}
}
//Compare the second element of two pairs
bool compare(const std::pair<int, int64>& x,
const std::pair<int, int64>& y)
{
if (x.second == 0) return true;
if (y.second == 0) return false;
return (x.second > y.second);
}
//Build the Huffman tree from the existing dictionary
void HuffmanEncoder::BuildHuffmanTreeFromDict()
{
std::vector<std::pair<int, int64> > ordered_words;
ordered_words.reserve(dict_->Size());
ordered_words.clear();
for (int i = 0; i < dict_->Size(); ++i)
ordered_words.push_back(std::pair<int, int64>(i, dict_->GetWordInfo(i)->freq));
std::sort(ordered_words.begin(), ordered_words.end(), compare);
for (int j = 0; j < info.codelen; ++j)
{
fscanf(fid, "%d", &tmp);
info.code.push_back(tmp);
}
for (int j = 0; j < info.codelen; ++j)
{
fscanf(fid, "%d", &tmp);
info.point.push_back(tmp);
}
unsigned vocab_size = (unsigned)ordered_words.size();
// frequency
int64 *count = new (std::nothrow)int64[vocab_size * 2 + 1];
assert(count != nullptr);
// Huffman code relative to parent node [1,0] of each node
unsigned *binary = new (std::nothrow)unsigned[vocab_size * 2 + 1];
assert(binary != nullptr);
memset(binary, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
hufflabel_info_.push_back(info);
}
fclose(fid);
}
else
{
//multiverso::Log::Error("file open failed %s", filename);
}
}
//Compare the second element of two pairs
bool compare(const std::pair<int, int64>& x,
const std::pair<int, int64>& y)
{
if (x.second == 0) return true;
if (y.second == 0) return false;
return (x.second > y.second);
}
//Build the Huffman tree from the existing dictionary
void HuffmanEncoder::BuildHuffmanTreeFromDict()
{
std::vector<std::pair<int, int64> > ordered_words;
ordered_words.reserve(dict_->Size());
ordered_words.clear();
for (int i = 0; i < dict_->Size(); ++i)
ordered_words.push_back(std::pair<int, int64>(i, dict_->GetWordInfo(i)->freq));
std::sort(ordered_words.begin(), ordered_words.end(), compare);
unsigned *parent_node = new (std::nothrow)unsigned[vocab_size * 2 + 1]; //
assert(parent_node != nullptr);
memset(parent_node, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
unsigned code[kMaxCodeLength], point[kMaxCodeLength];
unsigned vocab_size = (unsigned)ordered_words.size();
// frequency
int64 *count = new (std::nothrow)int64[vocab_size * 2 + 1];
assert(count != nullptr);
// Huffman code relative to parent node [1,0] of each node
unsigned *binary = new (std::nothrow)unsigned[vocab_size * 2 + 1];
assert(binary != nullptr);
memset(binary, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
for (unsigned i = 0; i < vocab_size; ++i)
count[i] = ordered_words[i].second;
for (unsigned i = vocab_size; i < vocab_size * 2; i++)
count[i] = static_cast<int64>(1e15);
int pos1 = vocab_size - 1;
int pos2 = vocab_size;
int min1i, min2i;
for (unsigned i = 0; i < vocab_size - 1; i++)
{
// First, find two smallest nodes 'min1, min2'
assert(pos2 < static_cast<int>(vocab_size) * 2 - 1);
//Find the smallest node
if (pos1 >= 0)
{
if (count[pos1] < count[pos2])
{
min1i = pos1;
pos1--;
}
else
{
min1i = pos2;
pos2++;
}
}
else
{
min1i = pos2;
pos2++;
}
unsigned *parent_node = new (std::nothrow)unsigned[vocab_size * 2 + 1]; //
assert(parent_node != nullptr);
memset(parent_node, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
unsigned code[kMaxCodeLength], point[kMaxCodeLength];
//Find the second smallest node
if (pos1 >= 0)
{
if (count[pos1] < count[pos2])
{
min2i = pos1;
pos1--;
}
else
{
min2i = pos2;
pos2++;
}
}
else
{
min2i = pos2;
pos2++;
}
for (unsigned i = 0; i < vocab_size; ++i)
count[i] = ordered_words[i].second;
for (unsigned i = vocab_size; i < vocab_size * 2; i++)
count[i] = static_cast<int64>(1e15);
int pos1 = vocab_size - 1;
int pos2 = vocab_size;
int min1i, min2i;
for (unsigned i = 0; i < vocab_size - 1; i++)
{
// First, find two smallest nodes 'min1, min2'
assert(pos2 < static_cast<int>(vocab_size)* 2 - 1);
//Find the smallest node
if (pos1 >= 0)
{
if (count[pos1] < count[pos2])
{
min1i = pos1;
pos1--;
}
else
{
min1i = pos2;
pos2++;
}
}
else
{
min1i = pos2;
pos2++;
}
count[vocab_size + i] = count[min1i] + count[min2i];
//Find the second smallest node
if (pos1 >= 0)
{
if (count[pos1] < count[pos2])
{
min2i = pos1;
pos1--;
}
else
{
min2i = pos2;
pos2++;
}
}
else
{
min2i = pos2;
pos2++;
}
assert(min1i >= 0);
assert(min1i < static_cast<int>(vocab_size)* 2 - 1);
assert(min2i >= 0);
assert(min2i < static_cast<int>(vocab_size)* 2 - 1);
parent_node[min1i] = vocab_size + i;
parent_node[min2i] = vocab_size + i;
binary[min2i] = 1;
}
assert(pos1 < 0);
count[vocab_size + i] = count[min1i] + count[min2i];
//Generate the huffman code for each leaf node
hufflabel_info_.clear();
for (unsigned a = 0; a < vocab_size; ++a)
hufflabel_info_.push_back(HuffLabelInfo());
for (unsigned a = 0; a < vocab_size; a++)
{
unsigned b = a, i = 0;
while (1)
{
assert(i < kMaxCodeLength);
code[i] = binary[b];
point[i] = b;
i++;
b = parent_node[b];
if (b == vocab_size * 2 - 2) break;
}
unsigned cur_word = ordered_words[a].first;
assert(min1i >= 0);
assert(min1i < static_cast<int>(vocab_size)* 2 - 1);
assert(min2i >= 0);
assert(min2i < static_cast<int>(vocab_size)* 2 - 1);
parent_node[min1i] = vocab_size + i;
parent_node[min2i] = vocab_size + i;
binary[min2i] = 1;
}
assert(pos1 < 0);
hufflabel_info_[cur_word].codelen = i;
hufflabel_info_[cur_word].point.push_back(vocab_size - 2);
//Generate the huffman code for each leaf node
hufflabel_info_.clear();
for (unsigned a = 0; a < vocab_size; ++a)
hufflabel_info_.push_back(HuffLabelInfo());
for (unsigned a = 0; a < vocab_size; a++)
{
unsigned b = a, i = 0;
while (1)
{
assert(i < kMaxCodeLength);
code[i] = binary[b];
point[i] = b;
i++;
b = parent_node[b];
if (b == vocab_size * 2 - 2) break;
}
unsigned cur_word = ordered_words[a].first;
for (b = 0; b < i; b++)
{
hufflabel_info_[cur_word].code.push_back(code[i - b - 1]);
if (b)
hufflabel_info_[cur_word].point.push_back(point[i - b] - vocab_size);
}
}
hufflabel_info_[cur_word].codelen = i;
hufflabel_info_[cur_word].point.push_back(vocab_size - 2);
delete[] count;
count = nullptr;
delete[] binary;
binary = nullptr;
delete[] parent_node;
parent_node = nullptr;
}
//Firstly get the dictionary from file
void HuffmanEncoder::BuildFromTermFrequency(const char* filename)
{
FILE* fid;
fid=fopen(filename, "r");
if (fid)
{
char sz_label[kMaxWordSize];
dict_ = new (std::nothrow)Dictionary();
assert(dict_ != nullptr);
//while (fscanf_s(fid, "%s", sz_label, kMaxWordSize) != EOF)
while (fscanf(fid,"%s",sz_label) != EOF)
{
HuffLabelInfo info;
int freq;
fscanf(fid, "%d", &freq);
dict_->Insert(sz_label, freq);
}
fclose(fid);
for (b = 0; b < i; b++)
{
hufflabel_info_[cur_word].code.push_back(code[i - b - 1]);
if (b)
hufflabel_info_[cur_word].point.push_back(point[i - b] - vocab_size);
}
}
BuildHuffmanTreeFromDict();
}
else
{
multiverso::Log::Error("file open failed %s", filename);
}
}
delete[] count;
count = nullptr;
delete[] binary;
binary = nullptr;
delete[] parent_node;
parent_node = nullptr;
}
//Firstly get the dictionary from file
void HuffmanEncoder::BuildFromTermFrequency(const char* filename)
{
FILE* fid;
fid = fopen(filename, "r");
if (fid)
{
char sz_label[kMaxWordSize];
dict_ = new (std::nothrow)Dictionary();
assert(dict_ != nullptr);
//while (fscanf_s(fid, "%s", sz_label, kMaxWordSize) != EOF)
while (fscanf(fid, "%s", sz_label) != EOF)
{
HuffLabelInfo info;
int freq;
fscanf(fid, "%d", &freq);
dict_->Insert(sz_label, freq);
}
fclose(fid);
void HuffmanEncoder::BuildFromTermFrequency(Dictionary* dict)
{
dict_ = dict;
BuildHuffmanTreeFromDict();
}
BuildHuffmanTreeFromDict();
}
else
{
//multiverso::Log::Error("file open failed %s", filename);
}
}
int HuffmanEncoder::GetLabelSize()
{
return dict_->Size();
}
//Get the label index
int HuffmanEncoder::GetLabelIdx(const char* label)
{
return dict_->GetWordIdx(label);
}
void HuffmanEncoder::BuildFromTermFrequency(Dictionary* dict)
{
dict_ = dict;
BuildHuffmanTreeFromDict();
}
HuffLabelInfo* HuffmanEncoder::GetLabelInfo(char* label)
{
int idx = GetLabelIdx(label);
if (idx == -1)
return nullptr;
return GetLabelInfo(idx);
}
int HuffmanEncoder::GetLabelSize()
{
return dict_->Size();
}
//Get the label index
int HuffmanEncoder::GetLabelIdx(const char* label)
{
return dict_->GetWordIdx(label);
}
HuffLabelInfo* HuffmanEncoder::GetLabelInfo(char* label)
{
int idx = GetLabelIdx(label);
if (idx == -1)
return nullptr;
return GetLabelInfo(idx);
}
HuffLabelInfo* HuffmanEncoder::GetLabelInfo(int label_idx)
{
if (label_idx == -1) return nullptr;
return &hufflabel_info_[label_idx];
}
//Get the dictionary
Dictionary* HuffmanEncoder::GetDict()
{
return dict_;
}
}
HuffLabelInfo* HuffmanEncoder::GetLabelInfo(int label_idx)
{
if (label_idx == -1) return nullptr;
return &hufflabel_info_[label_idx];
}
//Get the dictionary
Dictionary* HuffmanEncoder::GetDict()
{
return dict_;
}
}
}
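
BuildHuffmanTreeFromDict above uses the classic linear-time construction: with the leaf frequencies sorted so the smallest sit at the end of the array, cursor pos1 scans the leaves backward while pos2 scans the internal nodes, which are created in non-decreasing frequency order, so the two smallest remaining nodes are always found at one of the two cursors and no heap is needed. A self-contained sketch of the same two-cursor construction on a toy frequency array (the frequencies are made up):

#include <cstdio>
#include <vector>

int main()
{
    //Leaf frequencies in descending order, as the builder assumes
    std::vector<long long> count = { 45, 13, 12, 9, 5, 4 };
    size_t n = count.size();
    count.resize(2 * n, 1LL << 60);  //slots for internal nodes, initialized "infinite"
    std::vector<int> parent(2 * n, 0);
    int pos1 = static_cast<int>(n) - 1;  //smallest unused leaf
    int pos2 = static_cast<int>(n);      //smallest unused internal node
    for (size_t i = 0; i < n - 1; ++i)
    {
        int mins[2];
        for (int k = 0; k < 2; ++k)  //pick the two smallest remaining nodes
        {
            if (pos1 >= 0 && count[pos1] < count[pos2]) mins[k] = pos1--;
            else mins[k] = pos2++;
        }
        count[n + i] = count[mins[0]] + count[mins[1]];
        parent[mins[0]] = parent[mins[1]] = static_cast<int>(n + i);
    }
    for (size_t a = 0; a < n; ++a)  //code length of a leaf = its depth
    {
        int len = 0;
        for (int b = static_cast<int>(a); b != static_cast<int>(2 * n - 2); b = parent[b])
            ++len;
        printf("leaf %d (freq %lld): code length %d\n", (int)a, count[a], len);
    }
    return 0;
}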

View file

@ -1,5 +1,5 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_HUFFMAN_ENCODER_H_
#define DISTRIBUTED_WORD_EMBEDDING_HUFFMAN_ENCODER_H_
#pragma once
/*!
* \brief Class HuffmanEncoder stores the Huffman codes of the vocabulary according to the dictionary
*/
@ -11,63 +11,63 @@
#include "dictionary.h"
#include "constant.h"
namespace multiverso
{
namespace wordembedding
{
struct HuffLabelInfo
{ /*!
* \brief Internal node ids in the code path
*/
std::vector<int> point;
/*!
* \brief Huffman code
*/
std::vector<char> code;
int codelen;
HuffLabelInfo()
{
codelen = 0;
point.clear();
code.clear();
}
};
namespace wordembedding
{
struct HuffLabelInfo
{ /*!
* \brief Internal node ids in the code path
*/
std::vector<int> point;
/*!
* \brief Huffman code
*/
std::vector<char> code;
int codelen;
HuffLabelInfo()
{
codelen = 0;
point.clear();
code.clear();
}
};
class HuffmanEncoder
{
public:
HuffmanEncoder();
/*!
* \brief Save the word-huffmancode in the file
*/
void Save2File(const char* filename);
/*!
* \brief Recover the word-huffmancode from the file
*/
void RecoverFromFile(const char* filename);
/*!
* \brief Get the dictionary file and build
* hufflabel_info from the dictionary
*/
void BuildFromTermFrequency(const char* filename);
void BuildFromTermFrequency(Dictionary* dict);
/*!
* \brief Get the label size
*/
int GetLabelSize();
/*!
* \brief Get the label's index
*/
int GetLabelIdx(const char* label);
HuffLabelInfo* GetLabelInfo(char* label);
HuffLabelInfo* GetLabelInfo(int label_idx);
Dictionary* GetDict();
class HuffmanEncoder
{
public:
HuffmanEncoder();
/*!
* \brief Save the word-huffmancode in the file
*/
void Save2File(const char* filename);
/*!
* \brief Recover the word-huffmancode from the file
*/
void RecoverFromFile(const char* filename);
/*!
* \brief Get the dictionary file and build
* hufflabel_info from the dictionary
*/
void BuildFromTermFrequency(const char* filename);
void BuildFromTermFrequency(Dictionary* dict);
/*!
* \brief Get the label size
*/
int GetLabelSize();
/*!
* \brief Get the label's index
*/
int GetLabelIdx(const char* label);
HuffLabelInfo* GetLabelInfo(char* label);
HuffLabelInfo* GetLabelInfo(int label_idx);
Dictionary* GetDict();
private:
void BuildHuffmanTreeFromDict();
std::vector<HuffLabelInfo> hufflabel_info_;
Dictionary* dict_;
};
}
}
#endif
private:
void BuildHuffmanTreeFromDict();
std::vector<HuffLabelInfo> hufflabel_info_;
Dictionary* dict_;
};
}
}

View file

@ -1,46 +1,42 @@
#include <cstring>
#include <cmath>
#include <thread>
#include <string>
#include <iostream>
#include <vector>
#include <fstream>
#include <sstream>
#include <new>
//#include <vld.h>
#include "multiverso/util/log.h"
#include "multiverso/multiverso.h"
#include "distributed_wordembedding.h"
#include "memory_manager.h"
#include "dictionary.h"
#include "huffman_encoder.h"
#include "util.h"
#include "reader.h"
#include "multiverso.h"
#include "barrier.h"
#include "parameter_loader.h"
#include "trainer.h"
#include "word_embedding.h"
using namespace multiverso;
using namespace wordembedding;
int main(int argc, char *argv[])
{
try
{
Distributed_wordembedding *ptr = new (std::nothrow)Distributed_wordembedding();
assert(ptr != nullptr);
ptr->Run(argc, argv);
delete ptr;
}
catch (std::bad_alloc &memExp)
{
multiverso::Log::Info("Something wrong with new() %s\n", memExp.what());
}
catch (...)
{
multiverso::Log::Info("Something wrong with other reason!\n");
}
system("PAUSE");
return 0;
}

View file

@ -2,33 +2,33 @@
namespace multiverso
{
namespace wordembedding
{
MemoryManager::MemoryManager(int block_size)
{
block_size_ = block_size;
}
//Request memory for blocks
void MemoryManager::RequestBlocks(int64 block_number, std::vector<real*>& result)
{
std::unique_lock<std::mutex> lock(mutex_);
for (int64 i = 0; i < block_number; ++i)
{
result.push_back(new (std::nothrow) real[block_size_]);
assert(result[i] != nullptr);
}
}
//Free the memory for blocks
void MemoryManager::ReturnBlocks(std::vector<real*>& blocks)
{
std::unique_lock<std::mutex> lock(mutex_);
for (size_t i = 0; i < blocks.size(); ++i)
delete[] blocks[i];
}
namespace wordembedding
{
MemoryManager::MemoryManager(int block_size)
{
block_size_ = block_size;
}
//Request memory for blocks
void MemoryManager::RequestBlocks(int64 block_number, std::vector<real*>& result)
{
std::unique_lock<std::mutex> lock(mutex_);
for (int64 i = 0; i < block_number; ++i)
{
result.push_back(new (std::nothrow) real[block_size_]);
assert(result[i] != nullptr);
}
}
//Free the memory for blocks
void MemoryManager::ReturnBlocks(std::vector<real*>& blocks)
{
std::unique_lock<std::mutex> lock(mutex_);
for (size_t i = 0; i < blocks.size(); ++i)
delete[] blocks[i];
}
MemoryManager::~MemoryManager()
{
}
}
MemoryManager::~MemoryManager()
{
}
}
}
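
A minimal usage sketch of the block pool above, assuming the project's memory_manager.h, the real typedef, and an embedding size of 300 (the numbers are illustrative):

#include <vector>
#include "memory_manager.h"

void Example()
{
    using namespace multiverso::wordembedding;
    MemoryManager manager(300);  //one block holds 300 reals, one embedding row
    std::vector<real*> blocks;
    manager.RequestBlocks(10, blocks);  //allocate ten embedding-sized buffers
    //... fill blocks[i] with parameters copied from the server tables ...
    manager.ReturnBlocks(blocks);  //release them once the iteration is done
}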

View file

@ -1,48 +1,46 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_MEMORY_MANAGER_H_
#define DISTRIBUTED_WORD_EMBEDDING_MEMORY_MANAGER_H_
#pragma once
/*!
* file memory_manager.h
* \brief Class MemoryManager allocates memory for the local parameters needed by datablock training.
*/
#include <vector>
#include <condition_variable>
#include <cassert>
#include <malloc.h>
#include <cstring>
#include <vector>
#include <condition_variable>
#include "constant.h"
namespace multiverso
{
namespace wordembedding
{
class MemoryManager
{
public:
MemoryManager(int block_size);
/*!
* \brief Create memory for the blocks
* \param block_number the block quantity needed
* \param result the vector of the head address of allocated memory
*/
void RequestBlocks(int64 block_number, std::vector<real*>& result);
/*!
* \brief Delete the blocks memory
* \param blocks the vector of the head address of allocated memory
*/
void ReturnBlocks(std::vector<real*>& blocks);
~MemoryManager();
namespace wordembedding
{
class MemoryManager
{
public:
MemoryManager(int block_size);
/*!
* \brief Create memory for the blocks
* \param block_number the block quantity needed
* \param result the vector of the head address of allocated memory
*/
void RequestBlocks(int64 block_number, std::vector<real*>& result);
/*!
* \brief Delete the blocks memory
* \param blocks the vector of the head address of allocated memory
*/
void ReturnBlocks(std::vector<real*>& blocks);
~MemoryManager();
private:
int64 block_size_;
std::mutex mutex_;
private:
int64 block_size_;
std::mutex mutex_;
// No copying allowed
MemoryManager(const MemoryManager&);
void operator=(const MemoryManager&);
};
}
// No copying allowed
MemoryManager(const MemoryManager&);
void operator=(const MemoryManager&);
};
}
}
#endif

80
src/parameter_loader.cpp Normal file
View file

@ -0,0 +1,80 @@
#include "parameter_loader.h"
namespace multiverso
{
namespace wordembedding
{
ParameterLoader::ParameterLoader(Option *option,
WordEmbedding *WordEmbedding)
{
option_ = option;
WordEmbedding_ = WordEmbedding;
parse_and_request_count_ = 0;
//the log which will store the begin and end time of ParseAndRequest
char log_name[100];
sprintf(log_name, "parameter_loader%s.txt", g_log_suffix.c_str());
log_file_ = fopen(log_name, "w");
}
void ParameterLoader::ParseAndRequest(
multiverso::DataBlockBase *data_block)
{
if (parse_and_request_count_ == 0)
{
start_ = clock();
}
fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
multiverso::Log::Info("Rank %d ParameterLoader begin %d\n",
multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
++parse_and_request_count_;
DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
//Step 1, compute the parameters which will be used when the trainers begin
std::vector<int> input_nodes;
std::vector<int> output_nodes;
//input_nodes,output_nodes
multiverso::Log::Debug("Rank %d ParameterLoader parse begin %d\n",
multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
WordEmbedding_->PrepareParameter(data);
multiverso::Log::Debug("Rank %d ParameterLoader parse end %d\n",
multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
//Step 2, Request the parameter
multiverso::Log::Debug("Rank %d ParameterLoader request begin %d\n",
multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
RequestParameter(data);
multiverso::Log::Debug("Rank %d ParameterLoader request end %d\n",
multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
//Step 3, store the needed parameters in data_block
multiverso::Log::Info("Rank %d ParameterLoader finish %d\n",
multiverso::Multiverso::ProcessRank(), parse_and_request_count_ - 1);
fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
fflush(log_file_);
}
void ParameterLoader::RequestParameter(DataBlock *data_block)
{
//If the data_block is the last one, we need to dump
//the input-embedding weights
if (data_block->Type() == DataBlockType::Test)
RequestTable(kInputEmbeddingTableId);
RequestRow(kWordCountActualTableId, 0);
for (auto node : data_block->input_nodes)
RequestRow(kInputEmbeddingTableId, node);
for (auto node : data_block->output_nodes)
RequestRow(kEmbeddingOutputTableId, node);
if (option_->use_adagrad)
{
for (auto node : data_block->input_nodes)
RequestRow(kSumGradient2IETableId, node);
for (auto node : data_block->output_nodes)
RequestRow(kSumGradient2EOTableId, node);
}
}
}
}

53
src/parameter_loader.h Normal file
View file

@ -0,0 +1,53 @@
#pragma once
/*!
* file parameter_loader.h
* \brief Class ParameterLoader parses the datablock and requests the parameters from the multiverso server
*/
#include "multiverso.h"
#include "data_block.h"
#include "constant.h"
#include "util.h"
#include "huffman_encoder.h"
#include "word_embedding.h"
#include "log.h"
namespace multiverso
{
namespace wordembedding
{
class WordEmbedding;
extern std::string g_log_suffix;
class ParameterLoader : public multiverso::ParameterLoaderBase
{
public:
ParameterLoader(){}
ParameterLoader(Option *option, WordEmbedding *WordEmbedding);
/*!
* \brief Parse the datablock to get the parameter needed
* \param data_block which is pushed in
*/
void ParseAndRequest(multiverso::DataBlockBase* data_block) override;
private:
Option *option_;
WordEmbedding *WordEmbedding_;
int parse_and_request_count_;
clock_t start_;
FILE* log_file_;
/*!
* \brief Request the parameters from multiverso server to local buffer
* \param data_block which is pushed in
* \param input_nodes stores the input words'index
* \param output_nodes stores the output words'index
*/
void RequestParameter(DataBlock *data_block);
//No copying allowed
ParameterLoader(const ParameterLoader&);
void operator=(const ParameterLoader&);
};
}
}

View file

@ -2,115 +2,115 @@
namespace multiverso
{
namespace wordembedding
{
Reader::Reader(Dictionary *dictionary, Option *option,
Sampler *sampler, const char *input_file)
{
dictionary_ = dictionary;
option_ = option;
sampler_ = sampler;
namespace wordembedding
{
Reader::Reader(Dictionary *dictionary, Option *option,
Sampler *sampler, const char *input_file)
{
dictionary_ = dictionary;
option_ = option;
sampler_ = sampler;
stopwords_table_.clear();
if (option_->stopwords)
{
FILE* fid = fopen(option_->sw_file, "r");
if (fid == nullptr)
{
//multiverso::Log::Fatal("Open sw_file failed!\n");
exit(1);
}
while (ReadWord(word_, fid))
{
stopwords_table_.insert(word_);
}
stopwords_table_.clear();
if (option_->stopwords)
{
FILE* fid = fopen(option_->sw_file, "r");
if (fid == nullptr)
{
multiverso::Log::Fatal("Open sw_file failed!\n");
exit(1);
}
while (ReadWord(word_, fid))
{
stopwords_table_.insert(word_);
}
fclose(fid);
}
fclose(fid);
}
file_ = fopen(input_file, "r");
if (file_ == nullptr)
{
//multiverso::Log::Fatal("Open train_file failed!\n");
exit(1);
}
}
file_ = fopen(input_file, "r");
if (file_ == nullptr)
{
multiverso::Log::Fatal("Open train_file failed!\n");
exit(1);
}
}
Reader::~Reader()
{
if (file_ != nullptr)
fclose(file_);
}
//Get sentence by connecting the words extracted
int Reader::GetSentence(int *sentence, int64 &word_count)
{
int length = 0, word_idx;
word_count = 0;
while (1)
{
if (!ReadWord(word_, file_))
break;
word_idx = dictionary_->GetWordIdx(word_);
if (word_idx == -1)
continue;
word_count++;
if (option_->stopwords && stopwords_table_.count(word_))
continue;
if (option_->sample > 0 &&
!sampler_->WordSampling(
dictionary_->GetWordInfo(word_idx)->freq,
option_->total_words, option_->sample))
continue;
sentence[length++] = word_idx;
if (length >= kMaxSentenceLength)
break;
}
Reader::~Reader()
{
if (file_ != nullptr)
fclose(file_);
}
//Get sentence by connecting the words extracted
int Reader::GetSentence(int *sentence, int64 &word_count)
{
int length = 0, word_idx;
word_count = 0;
while (1)
{
if (!ReadWord(word_, file_))
break;
word_idx = dictionary_->GetWordIdx(word_);
if (word_idx == -1)
continue;
word_count++;
if (option_->stopwords && stopwords_table_.count(word_))
continue;
if (option_->sample > 0 &&
!sampler_->WordSampling(
dictionary_->GetWordInfo(word_idx)->freq,
option_->total_words, option_->sample))
continue;
sentence[length++] = word_idx;
if (length >= kMaxSentenceLength)
break;
}
return length;
}
return length;
}
void Reader::ResetStart()
{
fseek(file_, 0, SEEK_SET);
}
void Reader::ResetStart()
{
fseek(file_, 0, SEEK_SET);
}
void Reader::ResetSize(int64 size)
{
byte_count_ = 0;
byte_size_ = size;
}
//Read words from the file
bool Reader::ReadWord(char *word, FILE *fin)
{
int idx = 0;
char ch;
while (!feof(fin) && byte_count_ < byte_size_)
{
ch = fgetc(fin);
++byte_count_;
if (ch == 13) continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
{
if (idx > 0)
{
if (ch == '\n')
ungetc(ch, fin);
break;
}
if (ch == '\n')
{
strcpy(word, (char *)"</s>");
return true;
}
else continue;
}
word[idx++] = ch;
//Truncate too long words
if (idx >= kMaxString - 1)
idx--;
}
word[idx] = 0;
return idx != 0;
}
}
void Reader::ResetSize(int64 size)
{
byte_count_ = 0;
byte_size_ = size;
}
//Read words from the file
bool Reader::ReadWord(char *word, FILE *fin)
{
int idx = 0;
char ch;
while (!feof(fin) && byte_count_ < byte_size_)
{
ch = fgetc(fin);
++byte_count_;
if (ch == 13) continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
{
if (idx > 0)
{
if (ch == '\n')
ungetc(ch, fin);
break;
}
if (ch == '\n')
{
strcpy(word, (char *)"</s>");
return true;
}
else continue;
}
word[idx++] = ch;
//Truncate too long words
if (idx >= kMaxString - 1)
idx--;
}
word[idx] = 0;
return idx != 0;
}
}
}
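
A sketch of how a datablock loader might drive the Reader above (assuming the project's headers and typedefs; the byte budget is illustrative). ResetSize bounds how many bytes GetSentence may consume, and GetSentence returns 0 once that budget or EOF is hit:

#include "reader.h"

void FillBlock(multiverso::wordembedding::Reader* reader)
{
    using namespace multiverso::wordembedding;
    reader->ResetSize(1000000);  //read at most ~1MB for this block
    int* sentence = new int[kMaxSentenceLength + 2];
    int64 word_count = 0;
    while (true)
    {
        int length = reader->GetSentence(sentence, word_count);
        if (length <= 0) break;  //byte budget exhausted or end of file
        //... hand sentence[0..length) and word_count to the datablock here ...
    }
    delete[] sentence;
}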

View file

@ -1,5 +1,5 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_READER_H_
#define DISTRIBUTED_WORD_EMBEDDING_READER_H_
#pragma once
/*!
* file reader.h
* \brief Class Reader helps the function LoadData fill the datablock
@ -14,42 +14,41 @@
namespace multiverso
{
namespace wordembedding
{
class Reader
{
public:
Reader(Dictionary *dictionary, Option *option,
Sampler *sampler, const char *input_file);
~Reader();
/*!
* \brief Get a sentence from the train_file
* \param sentence save the sentence by the word index according to the dictionary
* \param word_count count the sentence length
*/
int GetSentence(int *sentence, int64 &word_count);
void ResetStart();
void ResetSize(int64 size);
namespace wordembedding
{
class Reader
{
public:
Reader(Dictionary *dictionary, Option *option,
Sampler *sampler, const char *input_file);
~Reader();
/*!
* \brief Get a sentence from the train_file
* \param sentence save the sentence by the word index according to the dictionary
* \param word_count count the sentence length
*/
int GetSentence(int *sentence, int64 &word_count);
void ResetStart();
void ResetSize(int64 size);
private:
const Option *option_;
FILE* file_;
char word_[kMaxString + 1];
Dictionary *dictionary_;
Sampler *sampler_;
int64 byte_count_, byte_size_;
std::unordered_set<std::string> stopwords_table_;
/*!
* \brief Read words from the train_file
* \param word store the extracted word
* \param file represent the train_file pointer
*/
bool ReadWord(char *word, FILE *file);
private:
const Option *option_;
FILE* file_;
char word_[kMaxString + 1];
Dictionary *dictionary_;
Sampler *sampler_;
int64 byte_count_, byte_size_;
std::unordered_set<std::string> stopwords_table_;
/*!
* \brief Read words from the train_file
* \param word store the extracted word
* \param file represent the train_file pointer
*/
bool ReadWord(char *word, FILE *file);
//No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
}
}
#endif
//No copying allowed
Reader(const Reader&);
void operator=(const Reader&);
};
}
}

View file

@ -1,56 +1,315 @@
#include "trainer.h"
namespace multiverso
{
namespace wordembedding
{
Trainer::Trainer(int trainer_id, Option *option,
Dictionary* dictionary, WordEmbedding* WordEmbedding)
{
trainer_id_ = trainer_id;
option_ = option;
word_count = 0;
WordEmbedding_ = WordEmbedding;
dictionary_ = dictionary;
hidden_act_ = (real *)calloc(option_->embeding_size, sizeof(real));
hidden_err_ = (real *)calloc(option_->embeding_size, sizeof(real));
process_count_ = -1;
process_id_ = -1;
namespace wordembedding
{
Trainer::Trainer(int trainer_id, Option *option,
multiverso::Barrier *barrier,
Dictionary* dictionary, WordEmbedding* WordEmbedding,
MemoryManager* memory_mamanger)
{
trainer_id_ = trainer_id;
option_ = option;
word_count = 0;
WordEmbedding_ = WordEmbedding;
barrier_ = barrier;
dictionary_ = dictionary;
memory_mamanger_ = memory_mamanger;
hidden_act_ = (real *)calloc(option_->embeding_size, sizeof(real));
hidden_err_ = (real *)calloc(option_->embeding_size, sizeof(real));
process_count_ = -1;
process_id_ = -1;
assert(hidden_act_ != nullptr);
assert(hidden_err_ != nullptr);
start_ = 0;
train_count_ = 0;
if (trainer_id_ == 0)
{
//The log which records the begin and end time of TrainIteration()
char log_name[100];
sprintf(log_name, "trainer%s.txt", g_log_suffix.c_str());
log_file_ = fopen(log_name, "w");
}
}
assert(hidden_act_ != nullptr);
assert(hidden_err_ != nullptr);
start_ = 0;
train_count_ = 0;
/*
if (trainer_id_ == 0)
{
//The log which records the begin and end time of TrainIteration()
char log_name[100];
sprintf(log_name, "trainer%s.txt", g_log_suffix.c_str());
log_file_ = fopen(log_name, "w");
}
*/
}
void Trainer::TrainIteration(DataBlock *data_block)
{
if (process_id_ == -1)
process_id_ = multiverso::MV_Rank();
void Trainer::TrainIteration(multiverso::DataBlockBase *data_block)
{
if (process_id_ == -1)
process_id_ = multiverso::Multiverso::ProcessRank();
if (data_block == nullptr){
return;
}
if (trainer_id_ == 0)
//Record the starting time of the TrainIteration
fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
clock_t start = clock();
multiverso::Log::Info("Rank %d Train %d Begin TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
++train_count_;
//Compute the total number of processes
if (process_count_ == -1)
process_count_ = multiverso::Multiverso::TotalProcessCount();
DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
std::vector<int> input_nodes(data->input_nodes.begin(), data->input_nodes.end());
std::vector<int> output_nodes(data->output_nodes.begin(), data->output_nodes.end());
//A trainer only copy or add apart of parameters
//This trainer should copy or add the parameters according to
//local_input_nodes and local_output_nodes
std::vector<int> local_input_nodes;
std::vector<int> local_output_nodes;
multiverso::Log::Info("Rank %d Train %d TrainNN Begin TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
for (int i = trainer_id_; i < input_nodes.size(); i += option_->thread_cnt)
local_input_nodes.push_back(input_nodes[i]);
for (int i = trainer_id_; i < output_nodes.size(); i += option_->thread_cnt)
local_output_nodes.push_back(output_nodes[i]);
WordEmbedding_->Train(data_block, trainer_id_, option_->thread_cnt,
word_count, hidden_act_, hidden_err_);
if (trainer_id_ == 0)
{
multiverso::Log::Info("Rank %d input_size=%d, output_size=%d\n",
process_id_, input_nodes.size(), output_nodes.size());
}
multiverso::Log::Info("Rank %d Trainer %d training time:%lfs\n",process_id_,trainer_id_,
(clock() - start) / (double)CLOCKS_PER_SEC);
train_count_++;
}
}
//Step 1, Copy the parameter from multiverso to WordEmbedding_
//One trainer only copy a part of parameters
multiverso::Log::Debug("Rank %d Train %d Copyparameter Begin TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
CopyParameter(local_input_nodes, local_output_nodes);
if (trainer_id_ == 0)
{
multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
WordEmbedding_->word_count_actual = copy_row.At(0);
WordEmbedding_->UpdateLearningRate();
}
multiverso::Log::Debug("Rank %d Train %d Copyparameter end TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
//Wait for all the trainers to finish copying parameter
barrier_->Wait();
//Step 2, After finishing copying parameter,
//Use WordEmbedding_ to train a part of data_block
int64 last_word_count = word_count;
clock_t start = clock();
multiverso::Log::Debug("Rank %d Train %d TrainNN Begin TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
WordEmbedding_->Train(data, trainer_id_, option_->thread_cnt,
word_count, hidden_act_, hidden_err_);
if (word_count > last_word_count)
{
multiverso::Log::Info("TrainNNSpeed: Words/thread/second %lfk\n",
((double)word_count - last_word_count) /
(clock() - start) * (double)CLOCKS_PER_SEC / 1000);
}
multiverso::Log::Debug("Rank %d Train %d TrainNN end TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
//Wait for all the trainers to finish training
barrier_->Wait();
multiverso::Log::Debug("Rank %d Train %d AddDeltaParameter Begin TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
//Step 3, After finishing training, add the delta of parameters to multiverso
AddDeltaParameter(local_input_nodes, local_output_nodes);
if (trainer_id_ == 0)
{
multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
Add<int64>(kWordCountActualTableId, 0, 0, WordEmbedding_->word_count_actual - copy_row.At(0));
}
multiverso::Log::Debug("Rank %d Train %d AddDeltaParameter end TrainIteration%d ...\n",
process_id_, trainer_id_, train_count_);
//If the data_block is the last one,Dump the input-embedding weights
if (data->Type() == DataBlockType::Test && trainer_id_ == 0)
{
SaveEmbedding(option_->output_file, option_->output_binary);
}
if (trainer_id_ == 0)
{
fprintf(log_file_, "%lf\n",
(clock()) / (double)CLOCKS_PER_SEC);
fflush(log_file_);
}
}
void Trainer::CopyRow(real* ptr, multiverso::Row<real>& row, int size)
{
for (int i = 0; i < size; ++i)
ptr[i] = row.At(i);
}
void Trainer::CopyParameter(std::vector<int>& input_nodes,
std::vector<int>& output_nodes)
{
//Compute the number of necessary memory blocks to store parameter
std::vector<real*> blocks;
int current_block = 0;
size_t total_blocks = (input_nodes.size() + output_nodes.size());
if (option_->use_adagrad)
total_blocks *= 2;
//Request blocks to store parameters
memory_mamanger_->RequestBlocks(total_blocks, blocks);
assert(blocks.size() == total_blocks);
if (blocks.size() != total_blocks)
{
multiverso::Log::Error("Rank %d Trainer %d Error to requestBlocks to CopyParameter, allocated_blocks_num=%lld, needed_blocks_num=%lld\n",
multiverso::Multiverso::ProcessRank(), trainer_id_, blocks.size(), total_blocks);
return;
}
//Copy input-embedding weights from multiverso to WordEmbedding
for (int i = 0; i < input_nodes.size(); ++i)
{
real* ptr = blocks[current_block++];
assert(ptr != nullptr);
CopyRow(ptr, GetRow<real>(kInputEmbeddingTableId,
input_nodes[i]), option_->embeding_size);
WordEmbedding_->SetWeightIE(input_nodes[i], ptr);
}
//Copy embedding-output weights from multiverso to WordEmbedding
for (int i = 0; i < output_nodes.size(); ++i)
{
real* ptr = blocks[current_block++];
assert(ptr != nullptr);
CopyRow(ptr, GetRow<real>(kEmbeddingOutputTableId,
output_nodes[i]), option_->embeding_size);
WordEmbedding_->SetWeightEO(output_nodes[i], ptr);
}
if (option_->use_adagrad)
{
//Copy the input-embedding sum of squares of gradients
for (int i = 0; i < input_nodes.size(); ++i)
{
real* ptr = blocks[current_block++];
assert(ptr != nullptr);
CopyRow(ptr, GetRow<real>(kSumGradient2IETableId,
input_nodes[i]), option_->embeding_size);
WordEmbedding_->SetSumGradient2IE(input_nodes[i], ptr);
}
//Copy the embedding-output sum of squares of gradients
for (int i = 0; i < output_nodes.size(); ++i)
{
real* ptr = blocks[current_block++];
assert(ptr != nullptr);
CopyRow(ptr, GetRow<real>(kSumGradient2EOTableId,
output_nodes[i]), option_->embeding_size);
WordEmbedding_->SetSumGradient2EO(output_nodes[i], ptr);
}
}
}
void Trainer::AddRow(real* ptr, int table_id, int row_id, int size)
{
multiverso::Row<real>& row = GetRow<real>(table_id, row_id);
for (int i = 0; i < size; ++i)
{
real delta = (ptr[i] - row.At(i)) / process_count_;
if (fabs(delta) > kEps)
Add<real>(table_id, row_id, i, delta);
}
}
//Add delta to local buffer and send it to the parameter sever
void Trainer::AddDeltaParameter(std::vector<int>& input_nodes,
std::vector<int>& output_nodes)
{
std::vector<real*> blocks;
for (int i = 0; i < input_nodes.size(); ++i)
{
real* ptr = WordEmbedding_->GetWeightIE(input_nodes[i]);
assert(ptr != nullptr);
AddRow(ptr, kInputEmbeddingTableId, input_nodes[i],
option_->embeding_size);
blocks.push_back(ptr);
}
for (int i = 0; i < output_nodes.size(); ++i)
{
real* ptr = WordEmbedding_->GetWeightEO(output_nodes[i]);
assert(ptr != nullptr);
AddRow(ptr, kEmbeddingOutputTableId, output_nodes[i],
option_->embeding_size);
blocks.push_back(ptr);
}
if (option_->use_adagrad)
{
for (int i = 0; i < input_nodes.size(); ++i)
{
real* ptr = WordEmbedding_->GetSumGradient2IE(input_nodes[i]);
assert(ptr != nullptr);
AddRow(ptr, kSumGradient2IETableId, input_nodes[i],
option_->embeding_size);
blocks.push_back(ptr);
}
for (int i = 0; i < output_nodes.size(); ++i)
{
real* ptr = WordEmbedding_->GetSumGradient2EO(output_nodes[i]);
assert(ptr != nullptr);
AddRow(ptr, kSumGradient2EOTableId, output_nodes[i],
option_->embeding_size);
blocks.push_back(ptr);
}
}
//Return all the memory blocks
memory_mamanger_->ReturnBlocks(blocks);
}
void Trainer::SaveEmbedding(const char *file_path, bool is_binary)
{
FILE* fid = nullptr;
if (is_binary)
{
fid = fopen(file_path, "wb");
fprintf(fid, "%d %d\n", dictionary_->Size(),option_->embeding_size);
for (int i = 0; i < dictionary_->Size(); ++i)
{
fprintf(fid, "%s ",
dictionary_->GetWordInfo(i)->word.c_str());
multiverso::Row<real>& embedding = GetRow<real>(
kInputEmbeddingTableId, i);
for (int j = 0; j < option_->embeding_size; ++j)
{
real tmp = embedding.At(j);
fwrite(&tmp, sizeof(real), 1, fid);
}
fprintf(fid, "\n");
}
fclose(fid);
}
else
{
fid = fopen(file_path, "wt");
fprintf(fid, "%d %d\n", dictionary_->Size(), option_->embeding_size);
for (int i = 0; i < dictionary_->Size(); ++i)
{
fprintf(fid, "%s ", dictionary_->GetWordInfo(i)->word.c_str());
multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, i);
for (int j = 0; j < option_->embeding_size; ++j)
fprintf(fid, "%lf ", embedding.At(j));
fprintf(fid, "\n");
}
fclose(fid);
}
}
}
}
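
AddRow above is the heart of the update rule: each process sends only the difference between its locally trained weights and the server copy, divided by the process count, so concurrent processes average their updates instead of summing them, and near-zero deltas are skipped to save traffic. The arithmetic in isolation (the kEps value here is a stand-in; the real constant lives in constant.h):

#include <cmath>
#include <cstdio>

int main()
{
    const float kEps = 1e-10f;   //illustrative threshold
    float server_value = 0.50f;  //parameter fetched from the server row
    float local_value = 0.62f;   //same parameter after local training
    int process_count = 4;       //number of cooperating processes
    float delta = (local_value - server_value) / process_count;
    if (std::fabs(delta) > kEps)
        printf("send delta %f; the server converges to the cross-process average\n", delta);
    return 0;
}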

View file

@ -1,53 +1,80 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_TRAINER_H_
#define DISTRIBUTED_WORD_EMBEDDING_TRAINER_H_
#pragma once
/*!
* file trainer.h
* \brief Class Trainer trains the model in each TrainIteration
*/
#include "multiverso/multiverso.h"
#include "multiverso/updater/updater.h"
#include "multiverso/table/matrix_table.h"
#include <thread>
#include <chrono>
#include "multiverso.h"
#include "data_block.h"
#include "constant.h"
#include "util.h"
#include "huffman_encoder.h"
#include "word_embedding.h"
#include "data_block.h"
#include "memory_manager.h"
#include "barrier.h"
namespace multiverso
{
namespace wordembedding
{
class WordEmbedding;
extern std::string g_log_suffix;
class Trainer{
public:
int64 word_count;
Trainer(int trainer_id, Option *option,
Dictionary* dictionary, WordEmbedding* WordEmbedding);
/*!
* /brief Train one datablock
*/
void TrainIteration(DataBlock * data_block);
namespace wordembedding
{
class WordEmbedding;
extern std::string g_log_suffix;
class Trainer : public multiverso::TrainerBase
{
public:
int64 word_count;
Trainer(int trainer_id, Option *option, Barrier* barrier,
Dictionary* dictionary, WordEmbedding* WordEmbedding,
MemoryManager* memory_mamanger);
/*!
* /brief Train one datablock
*/
void TrainIteration(multiverso::DataBlockBase* data_block) override;
private:
int process_count_;
int process_id_;
int trainer_id_;
Option *option_;
real *hidden_act_, *hidden_err_;
WordEmbedding* WordEmbedding_;
Dictionary* dictionary_;
int train_count_;
clock_t start_, now_;
FILE* log_file_;
private:
int process_count_;
int process_id_;
int trainer_id_;
Option *option_;
real *hidden_act_, *hidden_err_;
WordEmbedding* WordEmbedding_;
multiverso::Barrier *barrier_;
Dictionary* dictionary_;
MemoryManager* memory_mamanger_;
int train_count_;
clock_t start_, now_;
FILE* log_file_;
//No copying allowed
Trainer(const Trainer&);
void operator=(const Trainer&);
};
}
/*!
* \brief Save the input-embedding vectors in file_path
* \param file_path
* \param is_binary, the format of file
* 1 - save the vectors in the binary format,
* 2 - save the vectors in the ascii format
*/
void SaveEmbedding(const char *file_path, bool is_binary);
/*!
* \brief Copy the needed parameter from buffer to blocks
*/
void CopyRow(real* ptr, multiverso::Row<real>& row, int size);
void CopyParameter(std::vector<int>& input_nodes,
std::vector<int>& output_nodes);
/*!
* \brief Add delta to the parameter stored in the
* \buffer and send it to multiverso
*/
void AddRow(real* ptr, int table_id,
int row_id, int size);
void AddDeltaParameter(std::vector<int>& input_nodes,
std::vector<int>& output_nodes);
//No copying allowed
Trainer(const Trainer&);
void operator=(const Trainer&);
};
}
}
#endif

View file

@ -1,256 +1,192 @@
#include <time.h>
#include "util.h"
namespace multiverso
{
namespace wordembedding
{
Option::Option()
{
train_file = nullptr;
read_vocab_file = nullptr;
output_file = nullptr;
sw_file = nullptr;
endpoints_file = "";
hs = true;
negative_num = 0;
output_binary = false;
sample = 0;
cbow = true;
embeding_size = 0;
thread_cnt = 1;
window_size = 5;
min_count = 5;
data_block_size = 1000000;
init_learning_rate = static_cast<real>(0.025);
epoch = 1;
stopwords = false;
is_pipeline = true;
total_words = 0;
max_preload_data_size = 8000000000LL;
use_adagrad = false;
//multiverso config
num_servers = 0;
num_aggregator = 1;
lock_option = 1;
num_lock = 100;
max_delay = 0;
}
//Parse all the local model arguments
void Option::ParseArgs(int argc, char* argv[])
{
for (int i = 1; i < argc; i += 2)
{
if (strcmp(argv[i], "-size") == 0) embeding_size = atoi(argv[i + 1]);
if (strcmp(argv[i], "-train_file") == 0) train_file = argv[i + 1];
if (strcmp(argv[i], "-endpoints_file") == 0) endpoints_file = argv[i + 1];
if (strcmp(argv[i], "-read_vocab") == 0) read_vocab_file = argv[i + 1];
if (strcmp(argv[i], "-binary") == 0) output_binary = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-cbow") == 0) cbow = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-alpha") == 0) init_learning_rate = static_cast<real>(atof(argv[i + 1]));
if (strcmp(argv[i], "-output") == 0) output_file = argv[i + 1];
if (strcmp(argv[i], "-window") == 0) window_size = atoi(argv[i + 1]);
if (strcmp(argv[i], "-sample") == 0) sample = static_cast<real>(atof(argv[i + 1]));
if (strcmp(argv[i], "-hs") == 0) hs = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-data_block_size") == 0) data_block_size = atoll(argv[i + 1]);
if (strcmp(argv[i], "-max_preload_data_size") == 0) max_preload_data_size = atoll(argv[i + 1]);
if (strcmp(argv[i], "-negative") == 0) negative_num = atoi(argv[i + 1]);
if (strcmp(argv[i], "-threads") == 0) thread_cnt = atoi(argv[i + 1]);
if (strcmp(argv[i], "-min_count") == 0) min_count = atoi(argv[i + 1]);
if (strcmp(argv[i], "-epoch") == 0) epoch = atoi(argv[i + 1]);
if (strcmp(argv[i], "-stopwords") == 0) stopwords = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-sw_file") == 0) sw_file = argv[i + 1];
if (strcmp(argv[i], "-use_adagrad") == 0) use_adagrad = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-is_pipeline") == 0) is_pipeline = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-num_servers") == 0) num_servers = atoi(argv[i + 1]);
if (strcmp(argv[i], "-num_aggregator") == 0) num_aggregator = atoi(argv[i + 1]);
if (strcmp(argv[i], "-lock_option") == 0) lock_option = atoi(argv[i + 1]);
if (strcmp(argv[i], "-num_lock") == 0) num_lock = atoi(argv[i + 1]);
if (strcmp(argv[i], "-max_delay") == 0) max_delay = atoi(argv[i + 1]);
namespace wordembedding
{
Option::Option()
{
train_file = NULL;
read_vocab_file = NULL;
output_file = NULL;
sw_file = NULL;
endpoints_file = "";
hs = true;
negative_num = 0;
output_binary = false;
sample = 0;
cbow = true;
embeding_size = 0;
thread_cnt = 1;
window_size = 5;
min_count = 5;
data_block_size = 1000000;
init_learning_rate = static_cast<real>(0.025);
epoch = 1;
stopwords = false;
is_pipeline = true;
total_words = 0;
max_preload_data_size = 8000000000LL;
use_adagrad = false;
//multiverso config
num_servers = 0;
num_aggregator = 1;
lock_option = 1;
num_lock = 100;
max_delay = 0;
}
//Parse all the local model arguments
void Option::ParseArgs(int argc, char* argv[])
{
for (int i = 1; i < argc; i += 2)
{
if (strcmp(argv[i], "-size") == 0) embeding_size = atoi(argv[i + 1]);
if (strcmp(argv[i], "-train_file") == 0) train_file = argv[i + 1];
if (strcmp(argv[i], "-endpoints_file") == 0) endpoints_file = argv[i + 1];
if (strcmp(argv[i], "-read_vocab") == 0) read_vocab_file = argv[i + 1];
if (strcmp(argv[i], "-binary") == 0) output_binary = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-cbow") == 0) cbow = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-alpha") == 0) init_learning_rate = static_cast<real>(atof(argv[i + 1]));
if (strcmp(argv[i], "-output") == 0) output_file = argv[i + 1];
if (strcmp(argv[i], "-window") == 0) window_size = atoi(argv[i + 1]);
if (strcmp(argv[i], "-sample") == 0) sample = static_cast<real>(atof(argv[i + 1]));
if (strcmp(argv[i], "-hs") == 0) hs = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-data_block_size") == 0) data_block_size = atoll(argv[i + 1]);
if (strcmp(argv[i], "-max_preload_data_size") == 0) max_preload_data_size = atoll(argv[i + 1]);
if (strcmp(argv[i], "-negative") == 0) negative_num = atoi(argv[i + 1]);
if (strcmp(argv[i], "-threads") == 0) thread_cnt = atoi(argv[i + 1]);
if (strcmp(argv[i], "-min_count") == 0) min_count = atoi(argv[i + 1]);
if (strcmp(argv[i], "-epoch") == 0) epoch = atoi(argv[i + 1]);
if (strcmp(argv[i], "-stopwords") == 0) stopwords = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-sw_file") == 0) sw_file = argv[i + 1];
if (strcmp(argv[i], "-use_adagrad") == 0) use_adagrad = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-is_pipeline") == 0) is_pipeline = (atoi(argv[i + 1]) != 0);
if (strcmp(argv[i], "-num_servers") == 0) num_servers = atoi(argv[i + 1]);
if (strcmp(argv[i], "-num_aggregator") == 0) num_aggregator = atoi(argv[i + 1]);
if (strcmp(argv[i], "-lock_option") == 0) lock_option = atoi(argv[i + 1]);
if (strcmp(argv[i], "-num_lock") == 0) num_lock = atoi(argv[i + 1]);
if (strcmp(argv[i], "-max_delay") == 0) max_delay = atoi(argv[i + 1]);
}
}
}
}
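ParseArgs consumes argv as flag/value pairs (note the i += 2 stride), so every flag must be followed by exactly one value. A minimal sketch of the same pattern in isolation (hypothetical main with only two of the flags; the i + 1 < argc guard is an addition, the original assumes well-formed input):

#include <cstdio>
#include <cstdlib>
#include <cstring>

int main(int argc, char* argv[])
{
    int size = 0;
    const char* train_file = "";
    // Walk flag/value pairs, as Option::ParseArgs does.
    for (int i = 1; i + 1 < argc; i += 2)
    {
        if (strcmp(argv[i], "-size") == 0) size = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-train_file") == 0) train_file = argv[i + 1];
    }
    printf("size=%d train_file=%s\n", size, train_file);
    return 0;
}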
void Option::PrintUsage()
{
puts("Usage:");
puts("-size: word embedding size, e.g. 300");
puts("-train_file: the training corpus file, e.g.enwik2014");
puts("-read_vocab : the file to read all the vocab counts info");
puts("-binary : 0 or 1, indicates whether to write all the embeddings vectors into binary format");
puts("-cbow : 0 or 1, default 1, whether to use cbow or not");
puts("-alpha : initial learning rate, usually set to 0.025");
puts("-output : the output file to store all the embedding vectors");
puts("-window : the window size");
puts("-sample : the sub - sample size, usually set to 0");
puts("-hs : 0 or 1, default 1, whether to use hierarchical softmax");
puts("-negative : the negative word count in negative sampling, please set it to 0 when - hs = 1");
puts("-threads : the thread number to run in one machine");
puts("-min_count : words with lower frequency than min_count is removed from dictionary");
puts("-epoch : the epoch number");
puts("-stopwords : 0 or 1, whether to avoid training stop words");
puts("-sw_file : the stop words file storing all the stop words, valid when -stopwords = 1");
puts("-use_adagrad : 0 or 1, whether to use adagrad to adjust learnin rate");
puts("-data_block_size : default 1MB, the maximum bytes which a data block will store");
puts("-max_preload_data_size : default 8GB, the maximum data size(bytes) which multiverse_WordEmbedding will preload");
puts("-num_servers : default 0, the parameter of multiverso.Separately, 0 indicates all precesses are servers");
puts("-num_aggregator : default 1, number of aggregation threads in a process");
puts("-max_delay : default 0, the delay bound(max staleness)");
puts("-num_lock : default 100, number of locks in Locked option");
puts("-is_pipeline : 0 or 1, whether to use pipeline");
puts("-lock_option : default 0, Lock option. 0 : the trheads do not write and there is no contention; 1:there is no lock for thread contention; 2:normal lock for thread contention");
puts("-server_endpoint_file : default "", server ZMQ socket endpoint file in MPI - free version");
}
void Option::PrintUsage()
{
puts("Usage:");
puts("-size: word embedding size, e.g. 300");
puts("-train_file: the training corpus file, e.g.enwik2014");
puts("-read_vocab : the file to read all the vocab counts info");
puts("-binary : 0 or 1, indicates whether to write all the embeddings vectors into binary format");
puts("-cbow : 0 or 1, default 1, whether to use cbow or not");
puts("-alpha : initial learning rate, usually set to 0.025");
puts("-output : the output file to store all the embedding vectors");
puts("-window : the window size");
puts("-sample : the sub - sample size, usually set to 0");
puts("-hs : 0 or 1, default 1, whether to use hierarchical softmax");
puts("-negative : the negative word count in negative sampling, please set it to 0 when - hs = 1");
puts("-threads : the thread number to run in one machine");
puts("-min_count : words with lower frequency than min_count is removed from dictionary");
puts("-epoch : the epoch number");
puts("-stopwords : 0 or 1, whether to avoid training stop words");
puts("-sw_file : the stop words file storing all the stop words, valid when -stopwords = 1");
puts("-use_adagrad : 0 or 1, whether to use adagrad to adjust learnin rate");
puts("-data_block_size : default 1MB, the maximum bytes which a data block will store");
puts("-max_preload_data_size : default 8GB, the maximum data size(bytes) which multiverse_WordEmbedding will preload");
puts("-num_servers : default 0, the parameter of multiverso.Separately, 0 indicates all precesses are servers");
puts("-num_aggregator : default 1, number of aggregation threads in a process");
puts("-max_delay : default 0, the delay bound(max staleness)");
puts("-num_lock : default 100, number of locks in Locked option");
puts("-is_pipeline : 0 or 1, whether to use pipeline");
puts("-lock_option : default 0, Lock option. 0 : the trheads do not write and there is no contention; 1:there is no lock for thread contention; 2:normal lock for thread contention");
puts("-server_endpoint_file : default "", server ZMQ socket endpoint file in MPI - free version");
}
void Option::PrintArgs()
{
multiverso::Log::Info("train_file: %s\n", train_file);
multiverso::Log::Info("read_vocab_file: %s\n", read_vocab_file);
multiverso::Log::Info("output_file: %s\n", output_file);
multiverso::Log::Info("sw_file: %s\n", sw_file);
multiverso::Log::Info("hs: %d\n", hs);
multiverso::Log::Info("output_binary: %d\n", output_binary);
multiverso::Log::Info("cbow: %d\n", cbow);
multiverso::Log::Info("stopwords: %d\n", stopwords);
multiverso::Log::Info("use_adagrad: %d\n", use_adagrad);
multiverso::Log::Info("sample: %lf\n", sample);
multiverso::Log::Info("embeding_size: %d\n", embeding_size);
multiverso::Log::Info("thread_cnt: %d\n", thread_cnt);
multiverso::Log::Info("window_size: %d\n", window_size);
multiverso::Log::Info("negative_num: %d\n", negative_num);
multiverso::Log::Info("min_count: %d\n", min_count);
multiverso::Log::Info("epoch: %d\n", epoch);
multiverso::Log::Info("total_words: %lld\n", total_words);
multiverso::Log::Info("max_preload_data_size: %lld\n", max_preload_data_size);
multiverso::Log::Info("init_learning_rate: %lf\n", init_learning_rate);
multiverso::Log::Info("data_block_size: %lld\n", data_block_size);
multiverso::Log::Info("num_servers: %d\n", num_servers);
multiverso::Log::Info("num_aggregator: %d\n", num_aggregator);
multiverso::Log::Info("is_pipeline: %d\n", is_pipeline);
multiverso::Log::Info("lock_option: %d\n", lock_option);
multiverso::Log::Info("num_lock: %d\n", num_lock);
multiverso::Log::Info("max_delay: %d\n", max_delay);
multiverso::Log::Info("endpoints_file: %s\n", endpoints_file);
}
void Option::PrintArgs()
{
multiverso::Log::Info("train_file: %s\n", train_file);
multiverso::Log::Info("read_vocab_file: %s\n", read_vocab_file);
multiverso::Log::Info("output_file: %s\n", output_file);
multiverso::Log::Info("sw_file: %s\n", sw_file);
multiverso::Log::Info("hs: %d\n", hs);
multiverso::Log::Info("output_binary: %d\n", output_binary);
multiverso::Log::Info("cbow: %d\n", cbow);
multiverso::Log::Info("stopwords: %d\n", stopwords);
multiverso::Log::Info("use_adagrad: %d\n", use_adagrad);
multiverso::Log::Info("sample: %lf\n", sample);
multiverso::Log::Info("embeding_size: %d\n", embeding_size);
multiverso::Log::Info("thread_cnt: %d\n", thread_cnt);
multiverso::Log::Info("window_size: %d\n", window_size);
multiverso::Log::Info("negative_num: %d\n", negative_num);
multiverso::Log::Info("min_count: %d\n", min_count);
multiverso::Log::Info("epoch: %d\n", epoch);
multiverso::Log::Info("total_words: %lld\n", total_words);
multiverso::Log::Info("max_preload_data_size: %lld\n", max_preload_data_size);
multiverso::Log::Info("init_learning_rate: %lf\n", init_learning_rate);
multiverso::Log::Info("data_block_size: %lld\n", data_block_size);
multiverso::Log::Info("num_servers: %d\n", num_servers);
multiverso::Log::Info("num_aggregator: %d\n", num_aggregator);
multiverso::Log::Info("is_pipeline: %d\n", is_pipeline);
multiverso::Log::Info("lock_option: %d\n", lock_option);
multiverso::Log::Info("num_lock: %d\n", num_lock);
multiverso::Log::Info("max_delay: %d\n", max_delay);
multiverso::Log::Info("endpoints_file: %s\n", endpoints_file);
}
Sampler::Sampler()
{
table_ = nullptr;
}
//Set the negative-sampling distribution
void Sampler::SetNegativeSamplingDistribution(Dictionary *dictionary)
{
real train_words_pow = 0;
real power = 0.75;
table_ = (int *)malloc(kTableSize * sizeof(int));
for (int i = 0; i < dictionary->Size(); ++i)
train_words_pow += static_cast<real>(pow(dictionary->GetWordInfo(i)->freq, power));
int cur_pos = 0;
real d1 = (real)pow(dictionary->GetWordInfo(cur_pos)->freq, power)
/ (real)train_words_pow;
Sampler::Sampler()
{
table_ = nullptr;
}
//Set the negative-sampling distribution
void Sampler::SetNegativeSamplingDistribution(Dictionary *dictionary)
{
real train_words_pow = 0;
real power = 0.75;
table_ = (int *)malloc(kTableSize * sizeof(int));
for (int i = 0; i < dictionary->Size(); ++i)
train_words_pow += static_cast<real>(pow(dictionary->GetWordInfo(i)->freq, power));
int cur_pos = 0;
real d1 = (real)pow(dictionary->GetWordInfo(cur_pos)->freq, power)
/ (real)train_words_pow;
assert(table_ != nullptr);
for (int i = 0; i < kTableSize; ++i)
{
table_[i] = cur_pos;
if (i > d1 * kTableSize && cur_pos + 1 < dictionary->Size())
{
cur_pos++;
d1 += (real)pow(dictionary->GetWordInfo(cur_pos)->freq, power)
/ (real)train_words_pow;
}
}
}
assert(table_ != nullptr);
for (int i = 0; i < kTableSize; ++i)
{
table_[i] = cur_pos;
if (i > d1 * kTableSize && cur_pos + 1 < dictionary->Size())
{
cur_pos++;
d1 += (real)pow(dictionary->GetWordInfo(cur_pos)->freq, power)
/ (real)train_words_pow;
}
}
}
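The table above gives each word a span proportional to freq^0.75, so NegativeSampling can draw from the smoothed unigram distribution by uniform indexing. A self-contained sketch with toy counts and a tiny table (stand-ins for the dictionary and kTableSize):

#include <cmath>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<double> freq = { 100, 10, 1 };  // toy word frequencies
    const int kTableSize = 16;                  // tiny for illustration
    const double power = 0.75;
    double train_words_pow = 0;
    for (size_t i = 0; i < freq.size(); ++i)
        train_words_pow += std::pow(freq[i], power);
    std::vector<int> table(kTableSize);
    size_t cur_pos = 0;
    double d1 = std::pow(freq[cur_pos], power) / train_words_pow;
    for (int i = 0; i < kTableSize; ++i)
    {
        table[i] = static_cast<int>(cur_pos);
        // Advance once this word's share of the table is filled.
        if (i > d1 * kTableSize && cur_pos + 1 < freq.size())
        {
            ++cur_pos;
            d1 += std::pow(freq[cur_pos], power) / train_words_pow;
        }
    }
    for (int i = 0; i < kTableSize; ++i) printf("%d ", table[i]);
    printf("\n");  // frequent words occupy proportionally more slots
    return 0;
}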
bool Sampler::WordSampling(int64 word_cnt,
int64 train_words, real sample)
{
real ran = (sqrt(word_cnt / (sample * train_words)) + 1) *
(sample * train_words) / word_cnt;
return (ran > ((real)rand() / (RAND_MAX)));
}
//Get the next random
uint64 Sampler::GetNextRandom(uint64 next_random)
{
return next_random * (uint64)25214903917 + 11;
}
bool Sampler::WordSampling(int64 word_cnt,
int64 train_words, real sample)
{
real ran = (sqrt(word_cnt / (sample * train_words)) + 1) *
(sample * train_words) / word_cnt;
return (ran > ((real)rand() / (RAND_MAX)));
}
//Get the next random
uint64 Sampler::GetNextRandom(uint64 next_random)
{
return next_random * (uint64)25214903917 + 11;
}
int Sampler::NegativeSampling(uint64 next_random)
{
return table_[(next_random >> 16) % kTableSize];
}
int Sampler::NegativeSampling(uint64 next_random)
{
return table_[(next_random >> 16) % kTableSize];
}
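GetNextRandom uses the familiar java.util.Random multiplier/increment pair, and NegativeSampling indexes the table with the better-mixed high bits (>> 16). A hedged sketch of drawing a few samples (stand-in table):

#include <cstdint>
#include <cstdio>

int main()
{
    const int kTableSize = 16;
    int table[kTableSize];
    for (int i = 0; i < kTableSize; ++i) table[i] = i % 3;  // stand-in table
    uint64_t next_random = 1;
    for (int i = 0; i < 5; ++i)
    {
        next_random = next_random * 25214903917ULL + 11;    // GetNextRandom
        printf("%d\n", (int)(table[(next_random >> 16) % kTableSize]));
    }
    return 0;
}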
std::string GetSystemTime()
{
time_t t = time(0);
char tmp[128];
strftime(tmp, sizeof(tmp), "%Y%m%d%H%M%S", localtime(&t));
return std::string(tmp);
}
std::string GetSystemTime()
{
time_t t = time(0);
tm timeinfo;
localtime_s(&timeinfo, &t);
char tmp[128];
strftime(tmp, sizeof(tmp), "%Y%m%d%H%M%S", &timeinfo);
return std::string(tmp);
}
//Get the size of the file; it should handle large files
int64 GetFileSize(const char *filename)
{
#ifdef _MSC_VER
struct _stat64 info;
_stat64(filename, &info);
return (int64)info.st_size;
#else
struct stat info;
stat(filename, &info);
return(int64)info.st_size;
#endif
}
//Read a word from train_file into the word array
bool ReadWord(char *word, FILE *fin)
{
int idx = 0;
int ch;
while (!feof(fin))
{
ch = fgetc(fin);
if (ch == EOF) break;
if (ch == 13) continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
{
if (idx > 0)
{
if (ch == '\n')
ungetc(ch, fin);
break;
}
if (ch == '\n')
{
strcpy(word, (char *)"</s>");
return true;
}
else
{
continue;
}
}
word[idx++] = (char)ch;
if (idx >= kMaxString - 1)
idx--;
}
word[idx] = 0;
return idx > 0;
}
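A condensed, self-contained restatement of the tokenizer for experimentation (same behavior: '\r' skipped, a newline re-delivered as the sentence marker "</s>", overlong words truncated); it reads each character into an int and checks EOF explicitly, which avoids the classic feof()/char pitfall. kMaxString = 100 is a stand-in for the project's constant:

#include <cstdio>
#include <cstring>

const int kMaxString = 100;

bool ReadWordDemo(char* word, FILE* fin)
{
    int idx = 0;
    int ch;
    while ((ch = fgetc(fin)) != EOF)
    {
        if (ch == 13) continue;                        // skip '\r'
        if (ch == ' ' || ch == '\t' || ch == '\n')
        {
            if (idx > 0)
            {
                if (ch == '\n') ungetc(ch, fin);       // next call yields "</s>"
                break;
            }
            if (ch == '\n') { strcpy(word, "</s>"); return true; }
            continue;
        }
        word[idx++] = (char)ch;
        if (idx >= kMaxString - 1) idx--;              // truncate overlong words
    }
    word[idx] = 0;
    return idx > 0;
}

int main()
{
    FILE* f = tmpfile();
    fputs("the quick\nfox", f);
    rewind(f);
    char word[kMaxString];
    while (ReadWordDemo(word, f)) printf("[%s]\n", word); // [the][quick][</s>][fox]
    fclose(f);
    return 0;
}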
std::string g_log_suffix;
real* expTable;
void InitExpTable(){
expTable = (real *)malloc((kExpTableSize + 1) * sizeof(real));
for (int i = 0; i < kExpTableSize; i++) {
expTable[i] = exp((i / (real)kExpTableSize * 2 - 1) * kMaxExp); // Precompute the exp() table
expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute sigmoid: e^x / (e^x + 1)
}
}
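InitExpTable tabulates the sigmoid on a uniform grid over (-kMaxExp, kMaxExp); the commented-out lookup in BPOutputLayer further down shows the intended index arithmetic. A standalone sketch, assuming word2vec-style stand-in constants kExpTableSize = 1000 and kMaxExp = 6:

#include <cmath>
#include <cstdio>
#include <cstdlib>

int main()
{
    const int kExpTableSize = 1000;
    const int kMaxExp = 6;
    float* expTable = (float*)malloc((kExpTableSize + 1) * sizeof(float));
    for (int i = 0; i < kExpTableSize; i++)
    {
        // Grid point x in (-kMaxExp, kMaxExp); store sigmoid(x) = e^x / (e^x + 1).
        expTable[i] = (float)exp((i / (float)kExpTableSize * 2 - 1) * kMaxExp);
        expTable[i] = expTable[i] / (expTable[i] + 1);
    }
    float f = 1.5f;  // a dot product known to lie in (-kMaxExp, kMaxExp)
    float approx = expTable[(int)((f + kMaxExp) * (kExpTableSize / kMaxExp / 2))];
    printf("table: %f  exact: %f\n", approx, 1.0 / (1.0 + exp(-f)));
    free(expTable);
    return 0;
}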
}
std::string g_log_suffix;
}
}

View file

@@ -1,5 +1,5 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_UTIL_H_
#define DISTRIBUTED_WORD_EMBEDDING_UTIL_H_
#pragma once
/*!
* \file util.h
* \brief Struct Option stores many general arguments in the model
@@ -10,75 +10,68 @@
#include <random>
#include <cassert>
#include <exception>
#include <sys/stat.h>
#include "constant.h"
#include "dictionary.h"
namespace multiverso
{
namespace wordembedding
{
struct Option
{
const char* train_file;
const char* read_vocab_file;
const char* output_file;
const char* sw_file;
const char* endpoints_file;
bool hs, output_binary, cbow, stopwords;
bool use_adagrad;
bool is_pipeline;
real sample;
int64 data_block_size;
int embeding_size, thread_cnt, window_size, negative_num, min_count, epoch;
int64 total_words;
int64 max_preload_data_size;
real init_learning_rate;
int num_servers, num_aggregator, lock_option, num_lock, max_delay;
namespace wordembedding
{
struct Option
{
const char* train_file;
const char* read_vocab_file;
const char* output_file;
const char* sw_file;
const char* endpoints_file;
bool hs, output_binary, cbow, stopwords;
bool use_adagrad;
bool is_pipeline;
real sample;
int64 data_block_size;
int embeding_size, thread_cnt, window_size, negative_num, min_count, epoch;
int64 total_words;
int64 max_preload_data_size;
real init_learning_rate;
int num_servers, num_aggregator, lock_option, num_lock, max_delay;
Option();
/*!
* \brief Get the model-set arguments from file
*/
void ParseArgs(int argc, char* argv[]);
void PrintArgs();
void PrintUsage();
Option();
/*!
* \brief Get the model-set arguments from file
*/
void ParseArgs(int argc, char* argv[]);
void PrintArgs();
void PrintUsage();
};
};
class Sampler
{
public:
Sampler();
/*!
* \brief Set the negative-sampling distribution for every vocabulary
* \param dictionary the train_file dictionary
*/
void SetNegativeSamplingDistribution(Dictionary *dictionary);
bool WordSampling(int64 word_cnt, int64 train_words, real sample);
/*!
* \brief Get the next random according to the existing random seed
*/
uint64 GetNextRandom(uint64 next_random);
int NegativeSampling(uint64 next_random);
private:
int* table_;
class Sampler
{
public:
Sampler();
/*!
* \brief Set the negative-sampling distribution for every vocabulary
* \param dictionary the train_file dictionary
*/
void SetNegativeSamplingDistribution(Dictionary *dictionary);
bool WordSampling(int64 word_cnt, int64 train_words, real sample);
/*!
* \brief Get the next random according to the existing random seed
*/
uint64 GetNextRandom(uint64 next_random);
int NegativeSampling(uint64 next_random);
//No copying allowed
Sampler(const Sampler&);
void operator=(const Sampler&);
};
private:
int* table_;
std::string GetSystemTime();
int64 GetFileSize(const char *filename);
bool ReadWord(char *word, FILE *fin);
void InitExpTable();
//No copying allowed
Sampler(const Sampler&);
void operator=(const Sampler&);
};
extern std::string g_log_suffix;
extern real* expTable;
}
}
#endif
std::string GetSystemTime();
extern std::string g_log_suffix;
}
}

View file

@@ -2,352 +2,374 @@
namespace multiverso
{
namespace wordembedding
{
WordEmbedding::WordEmbedding(Option* option, HuffmanEncoder* huffmanEncoder,
Sampler* sampler, int dictionary_size)
{
word_count_actual = 0;
option_ = option;
huffmanEncoder_ = huffmanEncoder;
sampler_ = sampler;
dictionary_size_ = dictionary_size;
learning_rate = option_->init_learning_rate;
data_block_ = nullptr;
//InitExpTable();
}
namespace wordembedding
{
WordEmbedding::WordEmbedding(Option* option, HuffmanEncoder* huffmanEncoder,
Sampler* sampler, int dictionary_size)
{
word_count_actual = 0;
option_ = option;
huffmanEncoder_ = huffmanEncoder;
sampler_ = sampler;
dictionary_size_ = dictionary_size;
learning_rate = option_->init_learning_rate;
weight_IE_ = nullptr;
weight_EO_ = nullptr;
sum_gradient2_IE_ = nullptr;
sum_gradient2_EO_ = nullptr;
}
WordEmbedding::~WordEmbedding()
{
}
WordEmbedding::~WordEmbedding()
{
delete [] weight_IE_;
delete [] weight_EO_;
//Train the neural network of WordEmbedding
void WordEmbedding::Train(DataBlock *data_block, int index_start, int interval,
int64& word_count, real* hidden_act, real* hidden_err)
{
data_block_ = data_block;
std::vector <int> negativesample(data_block_->negativesample_pools.begin(),
data_block_->negativesample_pools.end());
if (option_->use_adagrad)
{
delete [] sum_gradient2_IE_;
delete [] sum_gradient2_EO_;
}
}
//Allocate the memory for some private pointers
void WordEmbedding::MallocMemory()
{
weight_IE_ = new (std::nothrow)real*[dictionary_size_];
assert(weight_IE_ != nullptr);
weight_EO_ = new (std::nothrow)real*[dictionary_size_];
assert(weight_EO_ != nullptr);
if (option_->use_adagrad)
{
sum_gradient2_IE_ = new (std::nothrow)real*[dictionary_size_];
sum_gradient2_EO_ = new (std::nothrow)real*[dictionary_size_];
assert(sum_gradient2_IE_ != nullptr);
assert(sum_gradient2_EO_ != nullptr);
}
}
//Train the neural network of WordEmbedding
void WordEmbedding::Train(DataBlock *data_block, int index_start, int interval,
int64& word_count, real* hidden_act, real* hidden_err)
{
std::vector <int> negativesample(data_block->negativesample_pools.begin(),
data_block->negativesample_pools.end());
for (int i = index_start; i < data_block->Size(); i += interval)
{
int sentence_length;
int64 word_count_delta;
int *sentence;
uint64 next_random;
data_block->GetSentence(i, sentence, sentence_length,
word_count_delta, next_random);
for (int i = index_start; i < data_block_->Size(); i += interval)
{
int sentence_length;
int64 word_count_delta;
int *sentence;
uint64 next_random;
data_block_->GetSentence(i, sentence, sentence_length,
word_count_delta, next_random);
this->Train(sentence, sentence_length,
next_random, hidden_act, hidden_err, negativesample);
this->Train(sentence, sentence_length,
next_random, hidden_act, hidden_err, negativesample);
word_count += word_count_delta;
}
}
//Update the learning rate
void WordEmbedding::UpdateLearningRate()
{
if (option_->use_adagrad == false)
{
learning_rate = static_cast<real>(option_->init_learning_rate *
(1 - word_count_actual / ((real)option_->total_words * option_->epoch + 1.0)));
if (learning_rate < option_->init_learning_rate * 0.0001)
learning_rate = static_cast<real>(option_->init_learning_rate * 0.0001);
}
}
word_count += word_count_delta;
}
}
//Update the learning rate
void WordEmbedding::UpdateLearningRate()
{
if (option_->use_adagrad == false)
{
learning_rate = static_cast<real>(option_->init_learning_rate *
(1 - word_count_actual / ((real)option_->total_words * option_->epoch + 1.0)));
if (learning_rate < option_->init_learning_rate * 0.0001)
learning_rate = static_cast<real>(option_->init_learning_rate * 0.0001);
}
}
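In symbols, with $\alpha_0$ = init_learning_rate, $w$ = word_count_actual, $W$ = total_words and $E$ = epoch, the non-adagrad branch above linearly decays the rate with a floor at $10^{-4}\alpha_0$:

$$\alpha = \max\Bigl(10^{-4}\,\alpha_0,\;\alpha_0\Bigl(1 - \frac{w}{W E + 1}\Bigr)\Bigr)$$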
void WordEmbedding::Train(int* sentence, int sentence_length,
uint64 next_random, real* hidden_act, real* hidden_err, std::vector <int> &negativesample_pools)
{
ParseSentence(sentence, sentence_length,
next_random, hidden_act, hidden_err, &WordEmbedding::TrainSample, negativesample_pools);
}
//Train with forward direction and get the input-hidden layer vector
void WordEmbedding::FeedForward(std::vector<int>& input_nodes, real* hidden_act)
{
for (int i = 0; i < input_nodes.size(); ++i)
{
int &node_id = input_nodes[i];
real* input_embedding = weight_IE_[node_id];
for (int j = 0; j < option_->embeding_size; ++j)
hidden_act[j] += input_embedding[j];
}
void WordEmbedding::Train(int* sentence, int sentence_length,
uint64 next_random, real* hidden_act, real* hidden_err, std::vector <int> &negativesample_pools)
{
ParseSentence(sentence, sentence_length,
next_random, hidden_act, hidden_err, &WordEmbedding::TrainSample, negativesample_pools);
}
//Train with forward direction and get the input-hidden layer vector
void WordEmbedding::FeedForward(std::vector<int>& input_nodes, real* hidden_act)
{
for (int i = 0; i < input_nodes.size(); ++i)
{
int &node_id = input_nodes[i];
real* input_embedding = GetWeightIE(node_id);
for (int j = 0; j < option_->embeding_size; ++j){
hidden_act[j] += input_embedding[j];
}
}
if (input_nodes.size() > 1)
{
for (int j = 0; j < option_->embeding_size; ++j)
hidden_act[j] /= input_nodes.size();
}
}
//Train with inverse direction and update the hidden-output
void WordEmbedding::BPOutputLayer(int label, int word_idx,
real* classifier, real* hidden_act, real* hidden_err)
{
assert(classifier != nullptr && hidden_act != nullptr && hidden_err != nullptr);
real f = 0;
//Propagate hidden -> output
for (int j = 0; j < option_->embeding_size; ++j)
f += hidden_act[j] * classifier[j];
f = 1 / (1 + exp(-f));
real error = (1 - label - f);
//Propagate errors output -> hidden
for (int j = 0; j < option_->embeding_size; ++j)
hidden_err[j] += error * classifier[j];
if (input_nodes.size() > 1)
{
for (int j = 0; j < option_->embeding_size; ++j){
hidden_act[j] /= input_nodes.size();
}
}
}
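With the final division in place, FeedForward's hidden_act is simply the mean of the context-word embeddings, i.e. the CBOW hidden layer. The same computation in isolation, with hypothetical toy embeddings:

#include <cstdio>
#include <vector>

int main()
{
    const int embedding_size = 4;
    float weight[3][4] = { {1,0,0,0}, {0,1,0,0}, {0,0,1,0} };  // toy vectors
    std::vector<int> input_nodes = { 0, 1, 2 };
    float hidden_act[embedding_size] = { 0 };
    // Sum the context embeddings, then average them.
    for (size_t i = 0; i < input_nodes.size(); ++i)
        for (int j = 0; j < embedding_size; ++j)
            hidden_act[j] += weight[input_nodes[i]][j];
    if (input_nodes.size() > 1)
        for (int j = 0; j < embedding_size; ++j)
            hidden_act[j] /= input_nodes.size();
    for (int j = 0; j < embedding_size; ++j)
        printf("%g ", hidden_act[j]);  // 0.333333 0.333333 0.333333 0
    printf("\n");
    return 0;
}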
//Train with inverse direction and update the hidden-output
void WordEmbedding::BPOutputLayer(int label, int word_idx,
real* classifier, real* hidden_act, real* hidden_err)
{
assert(classifier != nullptr && hidden_act != nullptr && hidden_err != nullptr);
real f = 0;
if (option_->use_adagrad)
{
real* sum_gradient2_row = sum_gradient2_EO_[word_idx];
assert(sum_gradient2_row != nullptr);
//Learn weights hidden -> output
for (int j = 0; j < option_->embeding_size; ++j)
{
real g = error * hidden_act[j];
sum_gradient2_row[j] += g * g;
if (sum_gradient2_row[j] > 1e-10)
classifier[j] += g * option_->init_learning_rate / sqrt(sum_gradient2_row[j]);
}
}
else
{
//'g' is the gradient multiplied by the learning rate
real g = error * learning_rate;
//Learn weights hidden -> output
for (int j = 0; j < option_->embeding_size; ++j)
classifier[j] += g * hidden_act[j];
}
}
//Propagate hidden -> output
for (int j = 0; j < option_->embeding_size; ++j)
f += hidden_act[j] * classifier[j];
void WordEmbedding::TrainSample(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *local_hidden_act, void *local_hidden_err)
{
real* hidden_act = (real*)local_hidden_act;
real* hidden_err = (real*)local_hidden_err;
assert(hidden_act != nullptr);
assert(hidden_err != nullptr);
memset(hidden_act, 0, option_->embeding_size * sizeof(real));
memset(hidden_err, 0, option_->embeding_size * sizeof(real));
FeedForward(input_nodes, hidden_act);
f = 1 / (1 + exp(-f));
for (int i = 0; i < output_nodes.size(); ++i)
{
int &node_id = output_nodes[i].first;
int &code = output_nodes[i].second;
BPOutputLayer(code, node_id, weight_EO_[node_id],
hidden_act, hidden_err);
}
if (option_->use_adagrad)
{
//Update context embedding
for (int i = 0; i < input_nodes.size(); ++i)
{
int &node_id = input_nodes[i];
real* input_embedding_row = weight_IE_[node_id];
real* sum_gradient2_row = sum_gradient2_IE_[node_id];
assert(input_embedding_row != nullptr && sum_gradient2_row != nullptr);
for (int j = 0; j < option_->embeding_size; ++j)
{
sum_gradient2_row[j] += hidden_err[j] * hidden_err[j];
if (sum_gradient2_row[j] > 1e-10)
input_embedding_row[j] += hidden_err[j] * option_->init_learning_rate / sqrt(sum_gradient2_row[j]);
}
}
}
else
{
for (int j = 0; j < option_->embeding_size; ++j)
hidden_err[j] *= learning_rate;
//Update context embedding
for (int i = 0; i < input_nodes.size(); ++i)
{
int &node_id = input_nodes[i];
real* input_embedding = weight_IE_[node_id];
assert(input_embedding != nullptr);
for (int j = 0; j < option_->embeding_size; ++j)
input_embedding[j] += hidden_err[j];
}
}
}
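The arithmetic inside BPOutputLayer is one logistic-regression SGD step. With label $\ell \in \{0,1\}$ (a Huffman code bit under hs, or 1 for the target and 0 for a negative sample), the code's error term is just (target $-$ prediction) with target $1-\ell$:

$$f = \sigma(\mathbf{h}\cdot\mathbf{w}), \qquad e = (1-\ell) - f, \qquad \mathbf{w} \mathrel{+}= \eta\, e\, \mathbf{h}, \qquad \mathbf{h}_{err} \mathrel{+}= e\, \mathbf{w}$$

where $\mathbf{h}$ is hidden_act, $\mathbf{w}$ is the classifier row, and $\mathbf{h}_{err}$ accumulates the error propagated back to the input embeddings.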
//Prepare the parameters for the datablock
void WordEmbedding::PrepareParameter(DataBlock* data_block)
{
int sentence_length;
int64 word_count_delta;
int *sentence;
uint64 next_random;
if (option_->hs)
{
for (int i = 0; i < data_block->Size(); ++i)
{
data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);
for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
{
data_block->input_nodes.insert(sentence[sentence_position]);
}
}
for (auto input_node : data_block->input_nodes)
{
auto info = huffmanEncoder_->GetLabelInfo(input_node);
for (int d = 0; d < info->codelen; d++)
data_block->output_nodes.insert(info->point[d]);
}
}
else
{
for (int i = 0; i < data_block->Size(); ++i)
{
data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);
for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
{
data_block->input_nodes.insert(sentence[sentence_position]);
}
}
for (auto input_node : data_block->input_nodes)
{
data_block->output_nodes.insert(input_node);
}
for (int d = 0; d < option_->negative_num * data_block->input_nodes.size(); d++)
{
next_random = sampler_->GetNextRandom(next_random);
int target = sampler_->NegativeSampling(next_random);
data_block->output_nodes.insert(target);
data_block->negativesample_pools.insert(target);
}
}
}
//Copy the input&ouput nodes
void WordEmbedding::DealPrepareParameter(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *hidden_act, void *hidden_err)
{
for (int i = 0; i < input_nodes.size(); ++i)
input_nodes_.insert(input_nodes[i]);
for (int i = 0; i < output_nodes.size(); ++i)
output_nodes_.insert(output_nodes[i].first);
}
//Parse the sentence and deepen into two branches
void WordEmbedding::ParseSentence(int* sentence, int sentence_length,
uint64 next_random, real* hidden_act, real* hidden_err,
FunctionType function, std::vector <int> &negativesample_pools)
{
if (sentence_length == 0)
return;
int feat[kMaxSentenceLength + 1];
std::vector<int> input_nodes;
std::vector<std::pair<int, int> > output_nodes;
for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
{
if (sentence[sentence_position] == -1) continue;
next_random = sampler_->GetNextRandom(next_random);
int off = next_random % option_->window_size;
int feat_size = 0;
for (int i = off; i < option_->window_size * 2 + 1 - off; ++i)
if (i != option_->window_size)
{
int c = sentence_position - option_->window_size + i;
if (c < 0 || c >= sentence_length || sentence[c] == -1)
continue;
feat[feat_size++] = sentence[c];
if (!option_->cbow) //train Skip-gram
{
input_nodes.clear();
output_nodes.clear();
Parse(feat + feat_size - 1, 1, sentence[sentence_position],
next_random, input_nodes, output_nodes, negativesample_pools);
(this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
}
}
/*
if (f >-kMaxExp && f < kMaxExp){
f = expTable[(int)((f + kMaxExp) * (kExpTableSize / kMaxExp / 2))];
}
*/
real error = (1 - label - f);
//Propagate errors output -> hidden
for (int j = 0; j < option_->embeding_size; ++j)
hidden_err[j] += error * classifier[j];
if (option_->cbow) //train cbow
{
input_nodes.clear();
output_nodes.clear();
Parse(feat, feat_size, sentence[sentence_position],
next_random, input_nodes, output_nodes, negativesample_pools);
if (option_->use_adagrad)
{
real* sum_gradient2_row = GetSumGradient2EO(word_idx);
assert(sum_gradient2_row != nullptr);
//Learn weights hidden -> output
for (int j = 0; j < option_->embeding_size; ++j)
{
real g = error * hidden_act[j];
sum_gradient2_row[j] += g * g;
if (sum_gradient2_row[j] > 1e-10)
classifier[j] += g * option_->init_learning_rate / sqrt(sum_gradient2_row[j]);
}
}
else
{
//'g' is the gradient multiplied by the learning rate
real g = error * learning_rate;
//Learn weights hidden -> output
for (int j = 0; j < option_->embeding_size; ++j)
classifier[j] += g * hidden_act[j];
}
}
(this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
}
}
}
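ParseSentence is reused for two passes by dispatching through the FunctionType pointer-to-member: training passes &WordEmbedding::TrainSample, while DealPrepareParameter matches the same signature for the parameter-collection pass. A minimal sketch of the mechanism with a hypothetical class:

#include <cstdio>

class Walker
{
public:
    typedef void (Walker::*FunctionType)(int window);
    void Run(FunctionType function)
    {
        for (int w = 0; w < 3; ++w)
            (this->*function)(w);    // same call syntax as in ParseSentence
    }
    void Train(int window)   { printf("train %d\n", window); }
    void Collect(int window) { printf("collect %d\n", window); }
};

int main()
{
    Walker walker;
    walker.Run(&Walker::Train);      // training pass
    walker.Run(&Walker::Collect);    // collection pass
    return 0;
}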
//Parse the window's input & output nodes
inline void WordEmbedding::Parse(int *feat, int feat_cnt, int word_idx,
uint64 &next_random, std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes, std::vector <int> &negativesample_pools)
{
for (int i = 0; i < feat_cnt; ++i)
{
input_nodes.push_back(feat[i]);
}
void WordEmbedding::TrainSample(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *local_hidden_act, void *local_hidden_err)
{
real* hidden_act = (real*)local_hidden_act;
real* hidden_err = (real*)local_hidden_err;
assert(hidden_act != nullptr);
assert(hidden_err != nullptr);
memset(hidden_act, 0, option_->embeding_size * sizeof(real));
memset(hidden_err, 0, option_->embeding_size * sizeof(real));
if (option_->hs)
{
auto info = huffmanEncoder_->GetLabelInfo(word_idx);
for (int d = 0; d < info->codelen; d++)
output_nodes.push_back(std::make_pair(info->point[d], info->code[d]));
}
else
if (option_->negative_num)
{
output_nodes.push_back(std::make_pair(word_idx, 1));
for (int d = 0; d < option_->negative_num; d++)
{
next_random = sampler_->GetNextRandom(next_random);
int index = (next_random >> 8) % negativesample_pools.size();
int target = negativesample_pools[index];
if (target == word_idx) continue;
output_nodes.push_back(std::make_pair(target, 0));
}
}
}
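The negative branch draws negative_num targets from the block's pre-built pool with the LCG, skipping any collision with the positive word (so a sample may yield fewer than negative_num pairs). A standalone sketch with a toy pool:

#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
    std::vector<int> pool = { 7, 3, 9, 3, 5 };  // toy negativesample_pools
    int word_idx = 3;                           // the positive target
    int negative_num = 4;
    uint64_t next_random = 1;
    std::vector<std::pair<int, int> > output_nodes;
    output_nodes.push_back(std::make_pair(word_idx, 1));    // label 1: positive
    for (int d = 0; d < negative_num; d++)
    {
        next_random = next_random * 25214903917ULL + 11;
        int target = pool[(next_random >> 8) % pool.size()];
        if (target == word_idx) continue;                    // skip collisions
        output_nodes.push_back(std::make_pair(target, 0));   // label 0: negative
    }
    for (size_t i = 0; i < output_nodes.size(); ++i)
        printf("(%d,%d) ", output_nodes[i].first, output_nodes[i].second);
    printf("\n");
    return 0;
}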
//Set the weight of input-embedding vector
void WordEmbedding::SetWeightIE(int input_node_id, real* ptr)
{
weight_IE_[input_node_id] = ptr;
}
FeedForward(input_nodes, hidden_act);
//Set the weight of output-embedding vector
void WordEmbedding::SetWeightEO(int output_node_id, real* ptr)
{
weight_EO_[output_node_id] = ptr;
}
//Get the weight of input-embedding vector
real* WordEmbedding::GetWeightIE(int input_node_id)
{
return weight_IE_[input_node_id];
}
//Get the weight of output-embedding vector
real* WordEmbedding::GetWeightEO(int output_node_id)
{
return weight_EO_[output_node_id];
}
for (int i = 0; i < output_nodes.size(); ++i)
{
int &node_id = output_nodes[i].first;
int &code = output_nodes[i].second;
BPOutputLayer(code, node_id, GetWeightEO(node_id),
hidden_act, hidden_err);
}
//Set the weight of SumGradient-input vector
void WordEmbedding::SetSumGradient2IE(int input_node_id, real* ptr)
{
sum_gradient2_IE_[input_node_id] = ptr;
}
if (option_->use_adagrad)
{
//Update context embedding
for (int i = 0; i < input_nodes.size(); ++i)
{
int &node_id = input_nodes[i];
real* input_embedding_row = GetWeightIE(node_id);
real* sum_gradient2_row = GetSumGradient2IE(node_id);
assert(input_embedding_row != nullptr && sum_gradient2_row != nullptr);
for (int j = 0; j < option_->embeding_size; ++j)
{
sum_gradient2_row[j] += hidden_err[j] * hidden_err[j];
if (sum_gradient2_row[j] > 1e-10)
input_embedding_row[j] += hidden_err[j] * option_->init_learning_rate / sqrt(sum_gradient2_row[j]);
}
}
}
else
{
for (int j = 0; j < option_->embeding_size; ++j)
hidden_err[j] *= learning_rate;
//Update context embedding
for (int i = 0; i < input_nodes.size(); ++i)
{
int &node_id = input_nodes[i];
real* input_embedding = GetWeightIE(node_id);
assert(input_embedding != nullptr);
for (int j = 0; j < option_->embeding_size; ++j)
input_embedding[j] += hidden_err[j];
}
}
}
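Both use_adagrad branches above apply the same per-coordinate rule: accumulate the squared gradient $g_j$ into $G_j$ (the sum_gradient2_* rows) and scale the fixed initial rate by its root, skipping coordinates while $G_j \le 10^{-10}$:

$$G_j \mathrel{+}= g_j^{2}, \qquad \theta_j \mathrel{+}= \frac{\eta_0\, g_j}{\sqrt{G_j}}$$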
//Set the weight of SumGradient-output vector
void WordEmbedding::SetSumGradient2EO(int output_node_id, real* ptr)
{
sum_gradient2_EO_[output_node_id] = ptr;
}
//Prepare the data for the datablock
void WordEmbedding::PrepareData(DataBlock* data_block)
{
int sentence_length;
int64 word_count_delta;
int *sentence;
uint64 next_random;
if (option_->hs)
{
for (int i = 0; i < data_block->Size(); ++i)
{
data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);
//Get the weight of SumGradient-input vector
real* WordEmbedding::GetSumGradient2IE(int input_node_id)
{
return sum_gradient2_IE_[input_node_id];
}
for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
{
data_block->input_nodes.insert(sentence[sentence_position]);
}
}
for (auto input_node : data_block->input_nodes)
{
auto info = huffmanEncoder_->GetLabelInfo(input_node);
for (int d = 0; d < info->codelen; d++)
data_block->output_nodes.insert(info->point[d]);
}
}
else
{
for (int i = 0; i < data_block->Size(); ++i)
{
data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);
for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
{
data_block->input_nodes.insert(sentence[sentence_position]);
}
}
for (auto input_node : data_block->input_nodes)
{
data_block->output_nodes.insert(input_node);
}
for (int d = 0; d < option_->negative_num * data_block->input_nodes.size(); d++)
{
next_random = sampler_->GetNextRandom(next_random);
int target = sampler_->NegativeSampling(next_random);
data_block->output_nodes.insert(target);
data_block->negativesample_pools.insert(target);
}
}
}
//Parse the sentence and deepen into two branches
void WordEmbedding::ParseSentence(int* sentence, int sentence_length,
uint64 next_random, real* hidden_act, real* hidden_err,
FunctionType function, std::vector <int> &negativesample_pools)
{
if (sentence_length == 0)
return;
int feat[kMaxSentenceLength + 1];
std::vector<int> input_nodes;
std::vector<std::pair<int, int> > output_nodes;
for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
{
if (sentence[sentence_position] == -1) continue;
next_random = sampler_->GetNextRandom(next_random);
int off = next_random % option_->window_size;
int feat_size = 0;
for (int i = off; i < option_->window_size * 2 + 1 - off; ++i)
if (i != option_->window_size)
{
int c = sentence_position - option_->window_size + i;
if (c < 0 || c >= sentence_length || sentence[c] == -1)
continue;
feat[feat_size++] = sentence[c];
if (!option_->cbow) //train Skip-gram
{
input_nodes.clear();
output_nodes.clear();
Parse(feat + feat_size - 1, 1, sentence[sentence_position],
next_random, input_nodes, output_nodes, negativesample_pools);
(this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
}
}
if (option_->cbow) //train cbow
{
input_nodes.clear();
output_nodes.clear();
Parse(feat, feat_size, sentence[sentence_position],
next_random, input_nodes, output_nodes, negativesample_pools);
(this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
}
}
}
//Parse the window's input & output nodes
inline void WordEmbedding::Parse(int *feat, int feat_cnt, int word_idx,
uint64 &next_random, std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes, std::vector <int> &negativesample_pools)
{
for (int i = 0; i < feat_cnt; ++i)
{
input_nodes.push_back(feat[i]);
}
if (option_->hs)
{
auto info = huffmanEncoder_->GetLabelInfo(word_idx);
for (int d = 0; d < info->codelen; d++)
output_nodes.push_back(std::make_pair(info->point[d], info->code[d]));
}
else
if (option_->negative_num)
{
output_nodes.push_back(std::make_pair(word_idx, 1));
for (int d = 0; d < option_->negative_num; d++)
{
next_random = sampler_->GetNextRandom(next_random);
int index = (next_random >> 8) % negativesample_pools.size();
int target = negativesample_pools[index];
if (target == word_idx) continue;
output_nodes.push_back(std::make_pair(target, 0));
}
}
}
void WordEmbedding::SetWeightIE(int input_node_id, real* ptr)
{
data_block_->SetWeightIE(input_node_id,ptr);
}
void WordEmbedding::SetWeightEO(int output_node_id, real* ptr)
{
data_block_->SetWeightEO(output_node_id,ptr);
}
real* WordEmbedding::GetWeightIE(int input_node_id)
{
return data_block_->GetWeightIE(input_node_id);
}
real* WordEmbedding::GetWeightEO(int output_node_id)
{
return data_block_->GetWeightEO(output_node_id);
}
void WordEmbedding::SetSumGradient2IE(int input_node_id, real* ptr)
{
data_block_->SetSumGradient2IE(input_node_id, ptr);
}
void WordEmbedding::SetSumGradient2EO(int output_node_id, real* ptr)
{
data_block_->SetSumGradient2EO(output_node_id, ptr);
}
real* WordEmbedding::GetSumGradient2IE(int input_node_id)
{
return data_block_->GetSumGradient2IE(input_node_id);
}
real* WordEmbedding::GetSumGradient2EO(int output_node_id)
{
return data_block_->GetSumGradient2EO(output_node_id);
}
}
//Get the weight of SumGradient-output vector
real* WordEmbedding::GetSumGradient2EO(int output_node_id)
{
return sum_gradient2_EO_[output_node_id];
}
}
}

View file

@@ -1,144 +1,148 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_WORD_EMBEDDING_H_
#define DISTRIBUTED_WORD_EMBEDDING_WORD_EMBEDDING_H_
#pragma once
/*!
* \file WordEmbedding.h
* \brief Class WordEmbedding includes some functions and parameters about TrainNN
* \brief Class WordEmbedding includes some functions and parameters about TrainNN
*/
#include <vector>
#include <cstring>
#include "multiverso/multiverso.h"
#include "util.h"
#include "multiverso.h"
#include "huffman_encoder.h"
#include "distributed_wordembedding.h"
#include "constant.h"
#include "data_block.h"
namespace multiverso
{
namespace wordembedding
{
class WordEmbedding
{
public:
real learning_rate;
int64 word_count_actual;
namespace wordembedding
{
class WordEmbedding
{
public:
real learning_rate;
int64 word_count_actual;
WordEmbedding(Option* option, HuffmanEncoder* huffmanEncoder,
Sampler* sampler, int dictionary_size);
~WordEmbedding();
/*!
* \brief Create memory for weight_IE_, weight_EO_, sum_gradient2_IE_, sum_gradient2_EO_
*/
WordEmbedding(Option* option, HuffmanEncoder* huffmanEncoder,
Sampler* sampler, int dictionary_size);
~WordEmbedding();
/*!
* \brief Create memory for weight_IE_, weight_EO_, sum_gradient2_IE_, sum_gradient2_EO_
*/
void MallocMemory();
/*!
* \brief TrainNN
* \param data_block represents the training datablock
* \param index_start the thread's starting index in the sentence vector
* \param interval the total number of threads
* \param word_count counts the words which have been processed by TrainNN
* \param hidden_act hidden layer value
* \param hidden_err hidden layer error
*/
void Train(DataBlock *data_block, int index_start,
int interval, int64& word_count,
real* hidden_act, real* hidden_err);
/*!
* \brief PrepareParameter for the parameterloader thread
* \param data_block datablock for parameterloader to parse
* \param input_nodes input_nodes represent the parameter which input_layer includes
* \param output_nodes output_nodes represent the parameter which output_layer includes
*/
void PrepareParameter(DataBlock *data_block);
/*!
* \brief Update the learning rate
*/
void UpdateLearningRate();
/*!
* \brief Set the input (output)-embedding weight
*/
void SetWeightIE(int input_node_id, real* ptr);
void SetWeightEO(int output_node_id, real* ptr);
/*!
* \brief Set the SumGradient-input (output)
*/
void SetSumGradient2IE(int input_node_id, real* ptr);
void SetSumGradient2EO(int output_node_id, real* ptr);
/*!
* \brief Return the parametertable value
*/
real* GetWeightIE(int input_node_id);
real* GetWeightEO(int output_node_id);
real* GetSumGradient2IE(int input_node_id);
real* GetSumGradient2EO(int output_node_id);
/*!
* \brief TrainNN
* \param data_block represents the training datablock
* \param index_start the thread's starting index in the sentence vector
* \param interval the total number of threads
* \param word_count counts the words which have been processed by TrainNN
* \param hidden_act hidden layer value
* \param hidden_err hidden layer error
*/
void Train(DataBlock *data_block, int index_start,
int interval, int64& word_count,
real* hidden_act, real* hidden_err);
/*!
* \brief PrepareParameter for the parameterloader thread
* \param data_block datablock for parameterloader to parse
* \param input_nodes input_nodes represent the parameter which input_layer includes
* \param output_nodes output_nodes represent the parameter which output_layer includes
*/
void PrepareData(DataBlock *data_block);
/*!
* \brief Update the learning rate
*/
void UpdateLearningRate();
/*!
* \brief Set the input (output)-embedding weight
*/
void SetWeightIE(int input_node_id, real* ptr);
void SetWeightEO(int output_node_id, real* ptr);
private:
Option *option_;
Dictionary *dictionary_;
HuffmanEncoder *huffmanEncoder_;
Sampler *sampler_;
std::unordered_set<int> input_nodes_, output_nodes_;
int dictionary_size_;
real** weight_IE_;
real** weight_EO_;
real** sum_gradient2_IE_;
real** sum_gradient2_EO_;
/*!
* \brief Return the parametertable value
*/
real* GetWeightIE(int input_node_id);
real* GetWeightEO(int output_node_id);
typedef void(WordEmbedding::*FunctionType)(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *hidden_act, void *hidden_err);
/*!
* \brief Parse the needed parameter in a window
*/
void Parse(int *feat, int feat_cnt, int word_idx, uint64 &next_random,
std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes, std::vector <int> &negativesample_pools);
/*!
* \brief Parse a sentence and deepen into two branches,
* one for TrainNN, the other for Parameter_parse&request
*/
void ParseSentence(int* sentence, int sentence_length,
uint64 next_random,
real* hidden_act, real* hidden_err,
FunctionType function, std::vector <int> &negativesample_pools);
/*!
* \brief Get the hidden layer vector
* \param input_nodes represent the input nodes
* \param hidden_act store the hidden layer vector
*/
void FeedForward(std::vector<int>& input_nodes, real* hidden_act);
/*!
* \brief Calculate the hidden_err and update the output-embedding weight
* \param label record the label of every output-embedding vector
* \param word_idx the index of the output-embedding vector
* \param classifier store the output-embedding vector
* \param hidden_act store the hidden layer vector
* \param hidden_err store the hidden error which is used
* to update the input-embedding vector
*/
void BPOutputLayer(int label, int word_idx, real* classifier,
real* hidden_act, real* hidden_err);
/*!
* \brief Copy the input_nodes&output_nodes to WordEmbedding private set
*/
void DealPrepareParameter(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *hidden_act, void *hidden_err);
/*!
* \brief Train a window sample and update the
* \input-embedding&output-embedding vectors
* \param input_nodes represent the input nodes
* \param output_nodes represent the ouput nodes
* \param hidden_act store the hidden layer vector
* \param hidden_err store the hidden layer error
*/
void TrainSample(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *hidden_act, void *hidden_err);
/*!
* \brief Train the sentence actually
*/
void Train(int* sentence, int sentence_length,
uint64 next_random, real* hidden_act, real* hidden_err, std::vector <int> &negativesample_pools);
real* GetSumGradient2IE(int input_node_id);
real* GetSumGradient2EO(int output_node_id);
void SetSumGradient2IE(int input_node_id, real* ptr);
void SetSumGradient2EO(int output_node_id, real* ptr);
private:
Option *option_ = nullptr;
Dictionary *dictionary_ = nullptr;
HuffmanEncoder *huffmanEncoder_ = nullptr;
Sampler *sampler_ = nullptr;
std::unordered_set<int> input_nodes_, output_nodes_;
int dictionary_size_;
DataBlock * data_block_ = nullptr;
typedef void(WordEmbedding::*FunctionType)(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *hidden_act, void *hidden_err);
/*!
* \brief Parse the needed parameter in a window
*/
void Parse(int *feat, int feat_cnt, int word_idx, uint64 &next_random,
std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes, std::vector <int> &negativesample_pools);
/*!
* \brief Parse a sentence and deepen into two branches,
* one for TrainNN, the other for Parameter_parse&request
*/
void ParseSentence(int* sentence, int sentence_length,
uint64 next_random,
real* hidden_act, real* hidden_err,
FunctionType function, std::vector <int> &negativesample_pools);
/*!
* \brief Get the hidden layer vector
* \param input_nodes represent the input nodes
* \param hidden_act store the hidden layer vector
*/
void FeedForward(std::vector<int>& input_nodes, real* hidden_act);
/*!
* \brief Calculate the hidden_err and update the output-embedding weight
* \param label record the label of every output-embedding vector
* \param word_idx the index of the output-embedding vector
* \param classifier store the output-embedding vector
* \param hidden_act store the hidden layer vector
* \param hidden_err store the hidden error which is used
* to update the input-embedding vector
*/
void BPOutputLayer(int label, int word_idx, real* classifier,
real* hidden_act, real* hidden_err);
/*!
* \brief Train a window sample and update the
* \input-embedding&output-embedding vectors
* \param input_nodes represent the input nodes
* \param output_nodes represent the ouput nodes
* \param hidden_act store the hidden layer vector
* \param hidden_err store the hidden layer error
*/
void TrainSample(std::vector<int>& input_nodes,
std::vector<std::pair<int, int> >& output_nodes,
void *hidden_act, void *hidden_err);
/*!
* \brief Train the sentence actually
*/
void Train(int* sentence, int sentence_length,
uint64 next_random, real* hidden_act, real* hidden_err, std::vector <int> &negativesample_pools);
//No copying allowed
WordEmbedding(const WordEmbedding&);
void operator=(const WordEmbedding&);
};
}
//No copying allowed
WordEmbedding(const WordEmbedding&);
void operator=(const WordEmbedding&);
};
}
}
#endif

View file

@@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.21005.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "distributed_word_embedding", "distributed_word_embedding\distributed_word_embedding.vcxproj", "{D1C18C01-40A1-400D-B537-528FE982DC5C}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "distributed_word_embedding", "distributed_word_embedding.vcxproj", "{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -13,14 +13,14 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Debug|Win32.ActiveCfg = Debug|Win32
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Debug|Win32.Build.0 = Debug|Win32
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Debug|x64.ActiveCfg = Debug|x64
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Debug|x64.Build.0 = Debug|x64
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Release|Win32.ActiveCfg = Release|Win32
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Release|Win32.Build.0 = Release|Win32
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Release|x64.ActiveCfg = Release|x64
{D1C18C01-40A1-400D-B537-528FE982DC5C}.Release|x64.Build.0 = Release|x64
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|Win32.ActiveCfg = Debug|Win32
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|Win32.Build.0 = Debug|Win32
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|x64.ActiveCfg = Debug|x64
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|x64.Build.0 = Debug|x64
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|Win32.ActiveCfg = Release|Win32
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|Win32.Build.0 = Release|Win32
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|x64.ActiveCfg = Release|x64
{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View file

@@ -0,0 +1,173 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|Win32">
<Configuration>Debug</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|Win32">
<Configuration>Release</Configuration>
<Platform>Win32</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>distributed_word_embedding</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);..\..\multiverso\include\multiverso</IncludePath>
<LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);..\..\multiverso\windows\x64\Release</LibraryPath>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level3</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level3</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>multiverso.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\src\constant.h" />
<ClInclude Include="..\..\src\data_block.h" />
<ClInclude Include="..\..\src\dictionary.h" />
<ClInclude Include="..\..\src\distributed_wordembedding.h" />
<ClInclude Include="..\..\src\huffman_encoder.h" />
<ClInclude Include="..\..\src\memory_manager.h" />
<ClInclude Include="..\..\src\parameter_loader.h" />
<ClInclude Include="..\..\src\reader.h" />
<ClInclude Include="..\..\src\trainer.h" />
<ClInclude Include="..\..\src\util.h" />
<ClInclude Include="..\..\src\word_embedding.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\data_block.cpp" />
<ClCompile Include="..\..\src\dictionary.cpp" />
<ClCompile Include="..\..\src\distributed_wordembedding.cpp" />
<ClCompile Include="..\..\src\huffman_encoder.cpp" />
<ClCompile Include="..\..\src\main.cpp" />
<ClCompile Include="..\..\src\memory_manager.cpp" />
<ClCompile Include="..\..\src\parameter_loader.cpp" />
<ClCompile Include="..\..\src\reader.cpp" />
<ClCompile Include="..\..\src\trainer.cpp" />
<ClCompile Include="..\..\src\util.cpp" />
<ClCompile Include="..\..\src\word_embedding.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>