Merge pull request #5 from Microsoft/revert-4-sherry_Version1
Revert "update distributed word embedding"
This commit is contained in:
Commit
8d2ca8c1ad
src/block_queue.h
@@ -1,31 +0,0 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_BLOCK_QUEUE_H_
#define DISTRIBUTED_WORD_EMBEDDING_BLOCK_QUEUE_H_

#include <cstdlib>
#include <condition_variable>

#include <iostream>
#include <mutex>
#include <thread>
#include <queue>

#include "data_block.h"

namespace multiverso
{
namespace wordembedding
{
  class BlockQueue{
  public:
    std::queue <DataBlock *> queues;
    std::mutex mtx;
    std::condition_variable repo_not_empty;

    BlockQueue(){}
    ~BlockQueue(){
      std::queue<DataBlock *>().swap(queues);
    }
  };
}
}
#endif
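The BlockQueue above is a minimal producer/consumer channel: a plain std::queue guarded by a mutex, with repo_not_empty signalling waiting consumers. The self-contained sketch below is an editorial illustration (not code from this commit) of the push/pop protocol that the loader and trainer threads in distributed_wordembedding.cc later use with this struct; the wait loop guards against spurious wakeups.

#include <condition_variable>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>

// Stand-in for the project's DataBlock; only here to make the sketch self-contained.
struct DataBlock { int id; };

struct BlockQueue {
  std::queue<DataBlock*> queues;
  std::mutex mtx;
  std::condition_variable repo_not_empty;
};

// Producer: push a block and wake any waiting consumer.
void Push(BlockQueue& q, DataBlock* block) {
  std::unique_lock<std::mutex> lock(q.mtx);
  q.queues.push(block);
  q.repo_not_empty.notify_all();
}

// Consumer: block until a DataBlock is available, then take it.
DataBlock* Pop(BlockQueue& q) {
  std::unique_lock<std::mutex> lock(q.mtx);
  while (q.queues.empty())        // loop guards against spurious wakeups
    q.repo_not_empty.wait(lock);
  DataBlock* front = q.queues.front();
  q.queues.pop();
  return front;
}

int main() {
  BlockQueue q;
  std::thread producer([&] { Push(q, new DataBlock{1}); });
  DataBlock* b = Pop(q);
  std::cout << "got block " << b->id << "\n";
  producer.join();
  delete b;
}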
src/communicator.cc
@@ -1,273 +0,0 @@
#include "communicator.h"

namespace multiverso
{
namespace wordembedding
{
  template <typename T>
  void filler(std::vector<T> &v){
    std::random_device rd;
    std::mt19937 gen(rd());
    std::uniform_real_distribution<float> dis(-1.0, 1.0);

    for (int i = 0; i < v.size(); i++)
    {
      v[i] = dis(gen);
    }
  }

  Communicator::Communicator(Option* option){
    option_ = option;
    process_id_ = multiverso::MV_Rank();
    memory_mamanger_ = new MemoryManager(option_->embeding_size);
  }

  Communicator::~Communicator(){
    ClearParameterTables();
    delete memory_mamanger_;
  }

  void Communicator::PrepareParameterTables(int row_size, int column_size){
    worker_input_table_ = new MatrixWorkerTable<real>(row_size, column_size);
    worker_output_table_ = new MatrixWorkerTable<real>(row_size, column_size);
    server_input_table_ = new MatrixServerTable<real>(row_size, column_size, &filler);
    server_output_table_ = new MatrixServerTable<real>(row_size, column_size);

    worker_wordcount_table_ = new KVWorkerTable<int, int64>();
    server_wordcount_table_ = new KVServerTable<int, int64>();
    kv_ = worker_wordcount_table_->raw();

    if (option_->use_adagrad){
      worker_input_gradient_table_ = new MatrixWorkerTable<real>(row_size, column_size);
      worker_output_gradient_table_ = new MatrixWorkerTable<real>(row_size, column_size);
      server_input_gradient_table_ = new MatrixServerTable<real>(row_size, column_size);
      server_output_gradient_table_ = new MatrixServerTable<real>(row_size, column_size);
    }
  }

  void Communicator::ClearParameterTables(){
    delete worker_input_table_;
    delete worker_output_table_;
    delete server_input_table_;
    delete server_output_table_;

    if (option_->use_adagrad){
      delete worker_input_gradient_table_;
      delete worker_output_gradient_table_;
      delete server_input_gradient_table_;
      delete server_output_gradient_table_;
    }
    //multiverso::Log::Info("Rank %d Clear Parameter Tables done.\n", process_id_);
  }

  inline void Communicator::AddRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size){
    AddOption add_option;
    table_->Add(row_ids, ptrs, size, &add_option);
  }

  void Communicator::GetWorkerTableRows(std::vector<int> row_nums, std::vector<real*> &blocks, int embeding_size){
    worker_input_table_->Get(row_nums, blocks, embeding_size);
  }

  inline void Communicator::GetRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size){
    table_->Get(row_ids, ptrs, size);
  }

  inline void Communicator::RequestParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks){
    switch (table_id){
    case kInputEmbeddingTableId:
      GetRows(worker_input_table_, nodes, blocks, option_->embeding_size);
      SetDataBlockEmbedding(data_block, blocks, nodes, kInputEmbeddingTableId);
      break;
    case kEmbeddingOutputTableId:
      GetRows(worker_output_table_, nodes, blocks, option_->embeding_size);
      SetDataBlockEmbedding(data_block, blocks, nodes, kEmbeddingOutputTableId);
      break;
    case kSumGradient2IETableId:
      GetRows(worker_input_gradient_table_, nodes, blocks, option_->embeding_size);
      SetDataBlockEmbedding(data_block, blocks, nodes, kSumGradient2IETableId);
      break;
    case kSumGradient2EOTableId:
      GetRows(worker_output_gradient_table_, nodes, blocks, option_->embeding_size);
      SetDataBlockEmbedding(data_block, blocks, nodes, kSumGradient2EOTableId);
      break;
    }
  }

  inline void Communicator::SetDataBlockEmbedding(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id){
    switch (table_id){
    case kInputEmbeddingTableId:
      for (int i = 0; i < nodes.size(); ++i){
        data_block->SetWeightIE(nodes[i], blocks[i]);
      }
      break;
    case kEmbeddingOutputTableId:
      for (int i = 0; i < nodes.size(); ++i){
        data_block->SetWeightEO(nodes[i], blocks[i]);
      }
      break;
    case kSumGradient2IETableId:
      for (int i = 0; i < nodes.size(); ++i){
        data_block->SetSumGradient2IE(nodes[i], blocks[i]);
      }
      break;
    case kSumGradient2EOTableId:
      for (int i = 0; i < nodes.size(); ++i){
        data_block->SetSumGradient2EO(nodes[i], blocks[i]);
      }
      break;
    }
  }

  void Communicator::RequestParameter(DataBlock *data_block)
  {
    clock_t start = clock();

    std::vector<int> input_nodes(data_block->input_nodes.begin(), data_block->input_nodes.end());
    std::vector<int> output_nodes(data_block->output_nodes.begin(), data_block->output_nodes.end());
    std::vector<real*> input_blocks;
    std::vector<real*> output_blocks;

    //Request blocks to store parameters
    memory_mamanger_->RequestBlocks(data_block->input_nodes.size(), input_blocks);
    memory_mamanger_->RequestBlocks(data_block->output_nodes.size(), output_blocks);
    assert(input_blocks.size() == data_block->input_nodes.size());
    assert(output_blocks.size() == data_block->output_nodes.size());

    RequestParameterByTableId(data_block, kInputEmbeddingTableId, input_nodes, input_blocks);
    RequestParameterByTableId(data_block, kEmbeddingOutputTableId, output_nodes, output_blocks);

    if (option_->use_adagrad){
      std::vector<real*> input_gradient_blocks;
      std::vector<real*> output_gradient_blocks;

      memory_mamanger_->RequestBlocks(input_nodes.size(), input_gradient_blocks);
      memory_mamanger_->RequestBlocks(output_nodes.size(), output_gradient_blocks);

      RequestParameterByTableId(data_block, kSumGradient2IETableId, input_nodes, input_gradient_blocks);
      RequestParameterByTableId(data_block, kSumGradient2EOTableId, output_nodes, output_gradient_blocks);
    }

    multiverso::Log::Info("Rank %d Request Parameters time:%lfs\n", process_id_,
      (clock() - start) / (double)CLOCKS_PER_SEC);
  }

  inline void Communicator::GetDeltaLoop(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id, std::vector<real*> &recycle_blocks){
    std::function<real*(int)> get_function;
    switch (table_id){
    case kInputEmbeddingTableId:
      get_function = std::bind(&DataBlock::GetWeightIE, data_block, std::placeholders::_1);
      break;
    case kEmbeddingOutputTableId:
      get_function = std::bind(&DataBlock::GetWeightEO, data_block, std::placeholders::_1);
      break;
    case kSumGradient2IETableId:
      get_function = std::bind(&DataBlock::GetSumGradient2IE, data_block, std::placeholders::_1);
      break;
    case kSumGradient2EOTableId:
      get_function = std::bind(&DataBlock::GetSumGradient2EO, data_block, std::placeholders::_1);
      break;
    }

    for (int i = 0; i < nodes.size(); ++i)
    {
      real* new_row = get_function((nodes[i]));
      real* old_row = blocks[i];
      assert(new_row != nullptr);

      for (int j = 0; j < option_->embeding_size; ++j)
      {
        old_row[j] = (new_row[j] - old_row[j]) / option_->thread_cnt;
      }
      recycle_blocks.push_back(new_row);
    }
  }

  void Communicator::AddParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks, std::vector<real*> &recycle_blocks){
    switch (table_id){
    case kInputEmbeddingTableId:
      GetRows(worker_input_table_, nodes, blocks, option_->embeding_size);
      GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
      AddRows(worker_input_table_, nodes, blocks, option_->embeding_size);
      break;
    case kEmbeddingOutputTableId:
      GetRows(worker_output_table_, nodes, blocks, option_->embeding_size);
      GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
      AddRows(worker_output_table_, nodes, blocks, option_->embeding_size);
      break;
    case kSumGradient2IETableId:
      GetRows(worker_input_gradient_table_, nodes, blocks, option_->embeding_size);
      GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
      AddRows(worker_input_gradient_table_, nodes, blocks, option_->embeding_size);
      break;
    case kSumGradient2EOTableId:
      GetRows(worker_output_gradient_table_, nodes, blocks, option_->embeding_size);
      GetDeltaLoop(data_block, blocks, nodes, table_id, recycle_blocks);
      AddRows(worker_output_gradient_table_, nodes, blocks, option_->embeding_size);
      break;
    }
  }

  //Add delta to local buffer and send it to the parameter server
  void Communicator::AddDeltaParameter(DataBlock *data_block)
  {
    if (data_block == nullptr){
      multiverso::Log::Info("Rank %d has null DataBlock\n", process_id_);
      return;
    }

    clock_t start = clock();
    std::vector<real*> blocks;
    std::vector<real*> recycle_blocks;

    std::vector<int> input_nodes(data_block->input_nodes.begin(), data_block->input_nodes.end());
    std::vector<int> output_nodes(data_block->output_nodes.begin(), data_block->output_nodes.end());
    std::vector<real*> input_blocks;
    std::vector<real*> output_blocks;
    //Request blocks to store parameters
    memory_mamanger_->RequestBlocks(input_nodes.size(), input_blocks);
    memory_mamanger_->RequestBlocks(output_nodes.size(), output_blocks);
    assert(input_blocks.size() == input_nodes.size());
    assert(output_blocks.size() == output_nodes.size());

    AddParameterByTableId(data_block, kInputEmbeddingTableId, input_nodes, input_blocks, recycle_blocks);
    AddParameterByTableId(data_block, kEmbeddingOutputTableId, output_nodes, output_blocks, recycle_blocks);

    memory_mamanger_->ReturnBlocks(input_blocks);
    memory_mamanger_->ReturnBlocks(output_blocks);

    if (option_->use_adagrad){
      std::vector<real*> input_gradient_blocks;
      std::vector<real*> output_gradient_blocks;
      memory_mamanger_->RequestBlocks(input_nodes.size(), input_gradient_blocks);
      memory_mamanger_->RequestBlocks(output_nodes.size(), output_gradient_blocks);

      AddParameterByTableId(data_block, kSumGradient2IETableId, input_nodes, input_gradient_blocks, recycle_blocks);
      AddParameterByTableId(data_block, kSumGradient2EOTableId, output_nodes, output_gradient_blocks, recycle_blocks);

      memory_mamanger_->ReturnBlocks(input_gradient_blocks);
      memory_mamanger_->ReturnBlocks(output_gradient_blocks);
    }

    memory_mamanger_->ReturnBlocks(recycle_blocks);
    multiverso::Log::Info("Rank %d Add Parameters time:%lfs\n", process_id_, (clock() - start) / (double)CLOCKS_PER_SEC);
  }

  int64 Communicator::GetWordCount(){
    worker_wordcount_table_->Get(kWordCountId);
    return kv_[kWordCountId];
  }

  void Communicator::AddWordCount(int word_count_num){
    worker_wordcount_table_->Add(kWordCountId, word_count_num);
  }

  void Communicator::RequestBlocks(int size, std::vector<real*> &blocks){
    memory_mamanger_->RequestBlocks(size, blocks);
  }
  void Communicator::ReturnBlocks(std::vector<real*> &blocks){
    memory_mamanger_->ReturnBlocks(blocks);
  }
}
}
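The core of the update path above is GetDeltaLoop: a worker does not write its full row back to the server, it sends the difference between its locally trained row and the snapshot it fetched, divided by thread_cnt. The standalone sketch below (hypothetical helper name, not part of the commit) isolates that arithmetic; dividing by the thread count appears intended to average the contributions of the block's trainer threads.

#include <cassert>
#include <vector>

// Hypothetical stand-alone version of the GetDeltaLoop arithmetic:
// 'snapshot' holds the row as fetched from the server before training;
// 'local' holds the row after this worker's updates. The result
// overwrites 'snapshot', ready to be sent to the server via Add().
void ComputeDelta(std::vector<float>& snapshot,
                  const std::vector<float>& local,
                  int thread_cnt) {
  assert(snapshot.size() == local.size());
  for (size_t j = 0; j < snapshot.size(); ++j)
    snapshot[j] = (local[j] - snapshot[j]) / thread_cnt;
}

int main() {
  std::vector<float> snapshot = {1.0f, 2.0f};  // row fetched at block start
  std::vector<float> local    = {1.5f, 1.0f};  // row after local training
  ComputeDelta(snapshot, local, /*thread_cnt=*/2);
  // snapshot is now {0.25f, -0.5f}: the averaged update the server adds
  // on top of whatever other workers have contributed meanwhile.
  return 0;
}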
src/communicator.h
@@ -1,69 +0,0 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_COMMUNICATOR_H_
#define DISTRIBUTED_WORD_EMBEDDING_COMMUNICATOR_H_

#include "multiverso/multiverso.h"
#include "multiverso/table/matrix_table.h"
#include "multiverso/table/kv_table.h"
#include "multiverso/updater/updater.h"

#include "memory_manager.h"
#include "block_queue.h"

namespace multiverso
{
namespace wordembedding
{
  class Communicator
  {
  public:
    Communicator(Option* option);
    ~Communicator();

    void RequestBlocks(int size, std::vector<real*> &blocks);
    void ReturnBlocks(std::vector<real*> &blocks);

    void RequestParameter(DataBlock *data_block);

    void AddDeltaParameter(DataBlock *data_block);

    int64 GetWordCount();
    void AddWordCount(int word_count_num);

    void GetWorkerTableRows(std::vector<int> row_nums, std::vector<real*> &blocks, int embeding_size);

    void PrepareParameterTables(int row_size, int column_size);

  private:
    Option* option_ = nullptr;
    MemoryManager* memory_mamanger_ = nullptr;
    int process_id_;
    std::unordered_map<int, int64> kv_;

    MatrixWorkerTable<real>* worker_input_table_ = nullptr;
    MatrixWorkerTable<real>* worker_output_table_ = nullptr;
    MatrixServerTable<real>* server_input_table_ = nullptr;
    MatrixServerTable<real>* server_output_table_ = nullptr;

    MatrixWorkerTable<real>* worker_input_gradient_table_ = nullptr;
    MatrixWorkerTable<real>* worker_output_gradient_table_ = nullptr;
    MatrixServerTable<real>* server_input_gradient_table_ = nullptr;
    MatrixServerTable<real>* server_output_gradient_table_ = nullptr;

    KVWorkerTable<int, int64>* worker_wordcount_table_ = nullptr;
    KVServerTable<int, int64>* server_wordcount_table_ = nullptr;

    void ClearParameterTables();

    void GetRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size);
    void RequestParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks);
    void SetDataBlockEmbedding(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id);

    void AddRows(MatrixWorkerTable<real>* table_, std::vector<int> row_ids, std::vector<real *> ptrs, int size);
    void AddParameterByTableId(DataBlock *data_block, int table_id, std::vector<int> &nodes, std::vector<real*> &blocks, std::vector<real*> &recycle_blocks);
    void GetDeltaLoop(DataBlock *data_block, std::vector<real*> &blocks, std::vector<int> &nodes, int table_id, std::vector<real*> &recycle_blocks);
  };
}
}
#endif
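Read together with communicator.cc, this header implies a simple lifecycle: PrepareParameterTables once at startup, then, per data block, RequestParameter before training and AddDeltaParameter afterwards. The sketch below uses stub types (editorial stand-ins, not the project's classes) purely to show that call order.

// Hypothetical driver showing the call order the Communicator API implies.
// Stub types stand in for the real classes so the sketch compiles alone.
#include <iostream>

struct DataBlock {};
struct Communicator {
  void PrepareParameterTables(int rows, int cols) { std::cout << "tables " << rows << "x" << cols << "\n"; }
  void RequestParameter(DataBlock*) { std::cout << "fetch rows for block\n"; }
  void AddDeltaParameter(DataBlock*) { std::cout << "push averaged deltas\n"; }
};

int main() {
  Communicator comm;
  comm.PrepareParameterTables(/*vocab rows=*/10000, /*embedding cols=*/100);
  for (int block = 0; block < 3; ++block) {
    DataBlock data;
    comm.RequestParameter(&data);   // pull fresh parameters for the block's words
    /* ... trainers run on the block here ... */
    comm.AddDeltaParameter(&data);  // send (local - snapshot)/thread_cnt back
  }
}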
src/constant.h
@@ -1,41 +1,35 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_CONSTANT_H_
#define DISTRIBUTED_WORD_EMBEDDING_CONSTANT_H_
#pragma once

/*!
* \file constant.h
* \brief The index of parameter tables and some constant.
*/

#include "multiverso.h"
#include "log.h"
#include <cstdint>

#include "multiverso/multiverso.h"
#include "multiverso/util/log.h"

namespace multiverso
{
namespace wordembedding
{
  /*! \brief Table ids in use*/
  const multiverso::integer_t kInputEmbeddingTableId = 0;
  const multiverso::integer_t kEmbeddingOutputTableId = 1;
  const multiverso::integer_t kWordCountActualTableId = 2;
  const multiverso::integer_t kSumGradient2IETableId = 3;
  const multiverso::integer_t kSumGradient2EOTableId = 4;

  typedef int64_t int64;
  typedef uint64_t uint64;
  typedef float real;

  const int kInputEmbeddingTableId = 0;
  const int kEmbeddingOutputTableId = 1;
  const int kSumGradient2IETableId = 2;
  const int kSumGradient2EOTableId = 3;
  const int kWordCountId = 4;

  const int kTableSize = (int)1e8;
  //const real kEps = static_cast<real>(1e-10);
  const int kMaxWordSize = 901;
  const int kMaxCodeLength = 100;
  const int kMaxString = 100;
  const int kMaxSentenceLength = 1000;
  const int kMaxEXP = 6;

  const int kExpTableSize = 1000;
  const int kMaxExp = 6;

  const int kTableSize = (int)1e8;
  const real kEps = static_cast<real>(1e-10);
  const int kMaxWordSize = 901;
  const int kMaxCodeLength = 100;
  const int kMaxString = 100;
  const int kMaxSentenceLength = 1000;
  const int kMaxEXP = 6;
}
}
#endif
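kExpTableSize and kMaxExp above match the precomputed sigmoid table used by word2vec-style trainers: instead of calling exp() for every dot product, the trainer indexes a 1000-entry table covering (-6, 6). The sketch below is an editorial reconstruction of that standard technique, not code from this commit.

#include <cmath>
#include <cstdio>

const int kExpTableSize = 1000;
const int kMaxExp = 6;

int main() {
  // Precompute sigmoid(x) for x in [-kMaxExp, kMaxExp), word2vec style:
  // table[i] = sigmoid((i / kExpTableSize * 2 - 1) * kMaxExp)
  static float exp_table[kExpTableSize];
  for (int i = 0; i < kExpTableSize; ++i) {
    float e = std::exp((i / (float)kExpTableSize * 2 - 1) * kMaxExp);
    exp_table[i] = e / (e + 1);
  }
  // Lookup during training replaces a std::exp call per dot product:
  float f = 2.5f;  // some inner product, assumed within (-kMaxExp, kMaxExp)
  int idx = (int)((f + kMaxExp) * (kExpTableSize / kMaxExp / 2));
  std::printf("sigmoid(%.2f) ~ %.4f\n", f, exp_table[idx]);
  return 0;
}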
src/data_block.cc
@@ -2,157 +2,49 @@
namespace multiverso
{
namespace wordembedding
{
  DataBlock::~DataBlock()
  {
    ClearSentences();
    ClearParameters();
  }
  DataBlock::~DataBlock()
  {
    ClearSentences();
  }

  size_t DataBlock::Size()
  {
    return sentences_.size();
  }

  //Add a new sentence to the DataBlock
  void DataBlock::AddSentence(int *head, int sentence_length,
    int64 word_count, uint64 next_random)
  {
    Sentence sentence(head, sentence_length, word_count, next_random);
    sentences_.push_back(sentence);
  }

  //Get the information of the index-th sentence
  void DataBlock::GetSentence(int index, int* &head,
    int &sentence_length, int64 &word_count, uint64 &next_random)
  {
    if (index >= 0 && index < sentences_.size())
    {
      sentences_[index].Get(head, sentence_length,
        word_count, next_random);
    }
    else
    {
      head = nullptr;
      sentence_length = 0;
      word_count = 0;
      next_random = 0;
    }
  }
  //Free the memory of sentences
  void DataBlock::ClearSentences()
  {
    for (int i = 0; i < sentences_.size(); ++i)
      delete[] sentences_[i].head;
    sentences_.clear();
  }

  void DataBlock::ClearParameters()
  {
    delete[] weight_IE_;
    delete[] weight_EO_;

    if (is_use_adagrad_)
    {
      delete sum_gradient2_IE_;
      delete sum_gradient2_EO_;
    }
  }

  //Set the weight of input-embedding vector
  void DataBlock::SetWeightIE(int input_node_id, real* ptr)
  {
    weight_IE_[input_node_id] = ptr;
  }

  //Set the weight of output-embedding vector
  void DataBlock::SetWeightEO(int output_node_id, real* ptr)
  {
    weight_EO_[output_node_id] = ptr;
  }
  //Get the weight of input-embedding vector
  real* DataBlock::GetWeightIE(int input_node_id)
  {
    return weight_IE_[input_node_id];
  }
  //Get the weight of output-embedding vector
  real* DataBlock::GetWeightEO(int output_node_id)
  {
    return weight_EO_[output_node_id];
  }

  //Set the weight of SumGradient-input vector
  void DataBlock::SetSumGradient2IE(int input_node_id, real* ptr)
  {
    sum_gradient2_IE_[input_node_id] = ptr;
  }

  //Set the weight of SumGradient-output vector
  void DataBlock::SetSumGradient2EO(int output_node_id, real* ptr)
  {
    sum_gradient2_EO_[output_node_id] = ptr;
  }

  //Get the weight of SumGradient-input vector
  real* DataBlock::GetSumGradient2IE(int input_node_id)
  {
    return sum_gradient2_IE_[input_node_id];
  }

  //Get the weight of SumGradient-output vector
  real* DataBlock::GetSumGradient2EO(int output_node_id)
  {
    return sum_gradient2_EO_[output_node_id];
  }

  void DataBlock::MallocMemory(int dictionary_size_, bool is_use_adagrad){
    weight_IE_ = new (std::nothrow)real*[dictionary_size_];
    assert(weight_IE_ != nullptr);
    weight_EO_ = new (std::nothrow)real*[dictionary_size_];
    assert(weight_EO_ != nullptr);
    is_use_adagrad_ = is_use_adagrad;

    if (is_use_adagrad_)
    {
      sum_gradient2_IE_ = new (std::nothrow)real*[dictionary_size_];
      sum_gradient2_EO_ = new (std::nothrow)real*[dictionary_size_];
      assert(sum_gradient2_IE_ != nullptr);
      assert(sum_gradient2_EO_ != nullptr);
    }
  }

  void DataBlock::PrintDataBlock(int embedding_size){
    std::vector<int> input_nodes(input_nodes.begin(), input_nodes.end());
    std::vector<int> output_nodes(output_nodes.begin(), output_nodes.end());

    for (int i = 0; i < input_nodes.size(); ++i)
    //for (int i = 0; i < 2; ++i)
    {
      real* ptr = GetWeightIE(input_nodes[i]);
      for (int j = 0; j < embedding_size; j++){
        std::cout << ptr[j] << " ";
      }
      std::cout << std::endl;
    }

    for (int i = 0; i < output_nodes.size(); ++i)
    {
      real* ptr = GetWeightEO(output_nodes[i]);
      for (int j = 0; j < embedding_size; j++){
        std::cout << ptr[j] << " ";
      }
      std::cout << std::endl;
    }
  }

  void DataBlock::SetLastFlag(){
    is_last_one_ = true;
  }

  bool DataBlock::isLast(){
    return is_last_one_;
  }
}
}
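The Sentence record used above packs a raw word-id buffer together with its length, word count, and RNG seed, and unpacks everything through by-reference out-parameters. A self-contained sketch (a stand-alone copy of the struct for illustration, not the project's header):

#include <cstdint>
#include <iostream>
#include <vector>

typedef int64_t int64;
typedef uint64_t uint64;

// Minimal stand-alone copy of the Sentence record: it stores a raw pointer
// to the word-id buffer plus bookkeeping, and hands everything back through
// by-reference out-parameters, exactly like DataBlock::GetSentence.
struct Sentence {
  int* head; int length; int64 word_count; uint64 next_random;
  void Get(int*& h, int& len, int64& wc, uint64& nr) {
    h = head; len = length; wc = word_count; nr = next_random;
  }
};

int main() {
  int* words = new int[3]{ 7, 42, 13 };          // word ids from the reader
  std::vector<Sentence> sentences;
  sentences.push_back(Sentence{ words, 3, 3, 12345u });

  int* head = nullptr; int len = 0; int64 wc = 0; uint64 nr = 0;
  sentences[0].Get(head, len, wc, nr);
  std::cout << "first word id: " << head[0] << ", length " << len << "\n";

  delete[] words;                                 // DataBlock does this in ClearSentences
  return 0;
}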
src/data_block.h
@@ -1,131 +1,97 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_DATA_BLOCK_H_
#define DISTRIBUTED_WORD_EMBEDDING_DATA_BLOCK_H_
#pragma once

/*!
* \file data_block.h
* \brief Class DataBlock is to store the necessary data for trainer and param_loader
*/

#include <iostream>

#include "multiverso/multiverso.h"

#include "util.h"
#include "multiverso.h"
#include "huffman_encoder.h"
#include "constant.h"

namespace multiverso
{
namespace wordembedding
{
  /*!
  * \brief The class DataBlock stores train data for trainer and param_loader
  */
  class DataBlock
  class DataBlock : public multiverso::DataBlockBase
  {
  public:
    std::unordered_set <int> input_nodes, output_nodes;
    std::unordered_set <int> negativesample_pools;

    DataBlock(){}
    ~DataBlock();

    /*!
    * \brief Get the number of sentences stored in DataBlock
    * \return the number of sentences
    */
    size_t Size();
    /*!
    * \brief Add a new sentence to the DataBlock
    * \param sentence the starting address of the sentence
    * \param sentence_length the length of the sentence
    * \param word_count the number of words when getting the
    * sentence from train-file
    * \param next_random the seed for getting random number
    */
    void AddSentence(int *sentence, int sentence_length,
      int64 word_count, uint64 next_random);
    /*!
    * \brief Get the information of the index-th sentence
    * \param index the id of the sentence
    * \param sentence the starting address of the sentence
    * \param sentence_length the length of the sentence
    * \param word_count the number of words when getting the
    * sentence from train-file
    * \param next_random the seed for getting random number
    */
    void GetSentence(int index, int* &sentence,
      int &sentence_length, int64 &word_count,
      uint64 &next_random);
    /*!
    * \brief Release the memory which is used to store sentences
    */
    void ClearSentences();

  private:
    /*!
    * \brief The information of sentences
    * head the head address which stores the sentence
    * length the number of words in the sentence
    * word_count the real word count of the sentence
    * next_random the random seed
    */
    struct Sentence
    {
      int* head;
      int length;
      int64 word_count;
      uint64 next_random;
      Sentence(int *head, int length, int64 word_count,
        uint64 next_random) :head(head), length(length),
        word_count(word_count), next_random(next_random){}

      void Get(int* &local_head, int &sentence_length,
        int64 &local_word_count, uint64 &local_next_random)
      {
        local_head = head;
        sentence_length = length;
        local_word_count = word_count;
        local_next_random = next_random;
      }
    };

    void ClearParameters();
    void MallocMemory(int dictionary_size_, bool is_use_adagrad);

    /*! \brief Store the information of sentences*/
    std::vector <Sentence> sentences_;

    void SetWeightIE(int input_node_id, real* ptr);
    void SetWeightEO(int output_node_id, real* ptr);
    real* GetWeightIE(int input_node_id);
    real* GetWeightEO(int output_node_id);

    void SetSumGradient2IE(int input_node_id, real* ptr);
    void SetSumGradient2EO(int output_node_id, real* ptr);
    real* GetSumGradient2IE(int input_node_id);
    real* GetSumGradient2EO(int output_node_id);

    void PrintDataBlock(int embedding_size);

    void SetLastFlag();
    bool isLast();

    real** weight_IE_ = nullptr;
    real** weight_EO_ = nullptr;

    real** sum_gradient2_IE_ = nullptr;
    real** sum_gradient2_EO_ = nullptr;
    bool is_use_adagrad_ = false;
    bool is_last_one_ = false;

    // No copying allowed
    DataBlock(const DataBlock&);
    //void operator=(const DataBlock&);
    void operator=(const DataBlock&);
  };
}
}
#endif
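Note the storage design this header reveals: weight_IE_ and its siblings are real** tables, one pointer per dictionary entry, pointing at rows that are borrowed from MemoryManager blocks rather than owned by the DataBlock. A minimal sketch of that indirection (editorial illustration; variable names mirror the header, the rest is assumed):

#include <cassert>
#include <cstdio>
#include <new>

typedef float real;

int main() {
  const int dictionary_size = 5;   // one row slot per vocabulary entry
  const int embedding_size = 4;    // columns = embedding dimensions

  // An array of row pointers, filled in later with rows borrowed from
  // a memory pool (mirroring MallocMemory + SetWeightIE / GetWeightIE).
  real** weight_IE = new (std::nothrow) real*[dictionary_size]();
  assert(weight_IE != nullptr);

  real row[embedding_size] = { 0.1f, 0.2f, 0.3f, 0.4f };
  weight_IE[2] = row;              // SetWeightIE(2, row)
  real* fetched = weight_IE[2];    // GetWeightIE(2)
  std::printf("weight_IE[2][1] = %.1f\n", fetched[1]);

  delete[] weight_IE;              // the rows themselves are returned to the pool separately
  return 0;
}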
src/dictionary.cc
@@ -1,236 +1,227 @@
#include "dictionary.h"
#include <cstring>

namespace multiverso
{
namespace wordembedding
{
  Dictionary::Dictionary()
  {
    combine_ = 0;
    Clear();
  }

  Dictionary::Dictionary(int i)
  {
    combine_ = i;
    Clear();
  }

  void Dictionary::Clear()
  {
    word_idx_map_.clear();
    word_info_.clear();
    word_whitelist_.clear();
  }
  //Set the white list for the dictionary
  void Dictionary::SetWhiteList(const std::vector<std::string>& whitelist)
  {
    for (unsigned int i = 0; i < whitelist.size(); ++i)
      word_whitelist_.insert(whitelist[i]);
  }
  //Merge in the word_info which has the frequency over-threshold
  void Dictionary::MergeInfrequentWords(int64 threshold)
  {
    word_idx_map_.clear();
    std::vector<WordInfo> tmp_info;
    tmp_info.clear();
    int infreq_idx = -1;

    for (auto word_info : word_info_)
    {
      if (word_info.freq >= threshold || word_info.freq == 0
        || word_whitelist_.count(word_info.word))
      {
        word_idx_map_[word_info.word] = static_cast<int>(tmp_info.size());
        tmp_info.push_back(word_info);
      }
      else
      {
        if (infreq_idx < 0)
        {
          WordInfo infreq_word_info;
          infreq_word_info.word = "WE_ARE_THE_INFREQUENT_WORDS";
          infreq_word_info.freq = 0;
          word_idx_map_[infreq_word_info.word] = static_cast<int>(tmp_info.size());
          infreq_idx = static_cast<int>(tmp_info.size());
          tmp_info.push_back(infreq_word_info);
        }
        word_idx_map_[word_info.word] = infreq_idx;
        tmp_info[infreq_idx].freq += word_info.freq;
      }
    }
    word_info_ = tmp_info;
  }
  //Remove the words with frequency under min_count
  void Dictionary::RemoveWordsLessThan(int64 min_count)
  {
    word_idx_map_.clear();
    std::vector<WordInfo> tmp_info;
    tmp_info.clear();
    for (auto info : word_info_)
    {
      if (info.freq >= min_count || info.freq == 0
        || word_whitelist_.count(info.word))
      {
        word_idx_map_[info.word] = static_cast<int>(tmp_info.size());
        tmp_info.push_back(info);
      }
    }
    word_info_ = tmp_info;
  }
  //Insert the dictionary element
  void Dictionary::Insert(const char* word, int64 cnt)
  {
    auto it = word_idx_map_.find(word);
    if (it != word_idx_map_.end())
      word_info_[it->second].freq += cnt;
    else
    {
      word_idx_map_[word] = static_cast<int>(word_info_.size());
      word_info_.push_back(WordInfo(word, cnt));
    }
  }
  //Load dictionary from file
  void Dictionary::LoadFromFile(const char* filename)
  {
    FILE* fid;
    fid = fopen(filename, "r");

    if (fid)
    {
      char sz_label[kMaxWordSize];

      //while ((fid, "%s", sz_label, kMaxWordSize) != EOF)
      while (fscanf(fid, "%s", sz_label, kMaxWordSize) != EOF)
      {
        int freq;
        fscanf(fid, "%d", &freq);
        Insert(sz_label, freq);
      }
      fclose(fid);
    }
  }

  void Dictionary::LoadTriLetterFromFile(const char* filename,
    unsigned int min_cnt, unsigned int letter_count)
  {
    FILE* fid;
    fid = fopen(filename, "r");
    if (fid)
    {
      char sz_label[kMaxWordSize] = { 0 };
      //while (fscanf_s(fid, "%s", sz_label, kMaxWordSize) != EOF)
      while (fscanf(fid, "%s", sz_label, kMaxWordSize) != EOF)
      {
        int64 freq;
        fscanf(fid, "%lld", &freq);
        if (freq < static_cast<int64>(min_cnt)) continue;

        // Construct Tri-letter From word
        size_t len = strlen(sz_label);
        if (len > kMaxWordSize)
        {
          /*
          multiverso::Log::Info("ignore super long term");
          continue;
          */
          multiverso::Log::Info("ignore super long term");
          continue;
        }

        char tri_letters[kMaxWordSize + 2];
        tri_letters[0] = '#';
        int i = 0;
        for (i = 0; i < strlen(sz_label); i++)
        {
          tri_letters[i + 1] = sz_label[i];
        }

        tri_letters[i + 1] = '#';
        tri_letters[i + 2] = 0;
        if (combine_) Insert(sz_label, freq);

        if (strlen(tri_letters) <= letter_count) {
          Insert(tri_letters, freq);
        }
        else
        {
          for (i = 0; i <= strlen(tri_letters) - letter_count; ++i)
          {
            char tri_word[kMaxWordSize];
            unsigned int j = 0;
            for (j = 0; j < letter_count; j++)
            {
              tri_word[j] = tri_letters[i + j];
            }
            tri_word[j] = 0;
            Insert(tri_word, freq);
          }
        }
      }
      fclose(fid);
    }
  }

  //Get the word's index from dictionary
  int Dictionary::GetWordIdx(const char* word)
  {
    auto it = word_idx_map_.find(word);
    if (it != word_idx_map_.end())
      return it->second;
    return -1;
  }
  //Return the size of the vocabulary
  int Dictionary::Size()
  {
    return static_cast<int>(word_info_.size());
  }
  //Get the wordinfo from word or index
  const WordInfo* Dictionary::GetWordInfo(const char* word)
  {
    auto it = word_idx_map_.find(word);
    if (it != word_idx_map_.end())
      return GetWordInfo(it->second);
    return NULL;
  }

  const WordInfo* Dictionary::GetWordInfo(int word_idx)
  {
    if (word_idx >= 0 && word_idx < word_info_.size())
      return &word_info_[word_idx];
    return NULL;
  }

  void Dictionary::StartIteration()
  {
    word_iterator_ = word_info_.begin();
  }
  //Judge whether the iterator is at the end
  bool Dictionary::HasMore()
  {
    return word_iterator_ != word_info_.end();
  }
  //Get the next Wordinfo
  const WordInfo* Dictionary::Next()
  {
    const WordInfo* entry = &(*word_iterator_);
    ++word_iterator_;
    return entry;
  }

  std::vector<WordInfo>::iterator Dictionary::Begin()
  {
    return word_info_.begin();
  }
  std::vector<WordInfo>::iterator Dictionary::End()
  {
    return word_info_.end();
  }

  void Dictionary::PrintVocab(){
    int i = 0;
    for (auto temp = Begin(); temp != End(); ++temp){
      std::cout << temp->word << " " << i << std::endl;
      i++;
    }
  }
}
}
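LoadTriLetterFromFile wraps each word in '#' sentinels and then inserts every letter_count-gram of the padded string, falling back to the whole padded word when it is short. A compact restatement of that loop on one example word, for illustration only (helper buffers and names are editorial):

#include <cstdio>
#include <cstring>

int main() {
  // "cat" -> "#cat#" -> trigrams "#ca", "cat", "at#"
  const char* word = "cat";
  const unsigned int letter_count = 3;

  char padded[64];
  std::snprintf(padded, sizeof(padded), "#%s#", word);

  size_t n = std::strlen(padded);
  if (n <= letter_count) {
    std::printf("%s\n", padded);             // short word: keep it whole
  } else {
    for (size_t i = 0; i + letter_count <= n; ++i) {
      char gram[8];
      std::memcpy(gram, padded + i, letter_count);
      gram[letter_count] = '\0';
      std::printf("%s\n", gram);             // one dictionary entry per gram
    }
  }
  return 0;
}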
src/dictionary.h
@@ -1,100 +1,94 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_DICTIONARY_H_
#define DISTRIBUTED_WORD_EMBEDDING_DICTIONARY_H_
#pragma once

/*!
* \brief Class dictionary stores the vocabulary and its frequency
*/
#include <cstring>

#include <unordered_map>
#include <unordered_set>
#include <string>
#include <vector>
#include <iostream>

#include "multiverso/util/log.h"

#include "constant.h"
#include "log.h"

namespace multiverso
{
namespace wordembedding
{ /*!
  * \brief struct WordInfo stores the pair of word&freq
  */
  struct WordInfo
  {
    std::string word;
    int64 freq;
    WordInfo()
    {
      freq = 0;
      word.clear();
    }
    WordInfo(const std::string& _word, int64 _freq)
    {
      word = _word;
      freq = _freq;
    }
  };

  class Dictionary
  {
  public:
    Dictionary();
    Dictionary(int i);
    void Clear();
    /*!
    * \brief Assign value to the set word_whitelist_
    */
    void SetWhiteList(const std::vector<std::string>& whitelist);
    /*!
    * \brief Remove the low-freq words
    */
    void RemoveWordsLessThan(int64 min_count);
    /*!
    * \brief Merge the infrequent words according to threshold
    */
    void MergeInfrequentWords(int64 threshold);
    /*!
    * \brief Insert word-freq pair to the dictionary
    * \param word the word string
    * \param cnt the word's frequency
    */
    void Insert(const char* word, int64 cnt = 1);
    /*!
    * \brief Load the word-freq pairs from file
    */
    void LoadFromFile(const char* filename);
    void LoadTriLetterFromFile(const char* filename,
      unsigned int min_cnt = 1, unsigned int letter_count = 3);
    int GetWordIdx(const char* word);
    /*!
    * \brief Get the index of the word according to the dictionary
    */
    const WordInfo* GetWordInfo(const char* word);
    const WordInfo* GetWordInfo(int word_idx);
    int Size();
    void StartIteration();
    /*!
    * \brief Judge whether the word_iterator_ is at the end
    */
    bool HasMore();
    /*!
    * \brief Get the next wordinfo pointer in the vector
    */
    const WordInfo* Next();
    std::vector<WordInfo>::iterator Begin();
    std::vector<WordInfo>::iterator End();

    void PrintVocab();

  private:
    int combine_;
    std::vector<WordInfo> word_info_;
    std::vector<WordInfo>::iterator word_iterator_;
    std::unordered_map<std::string, int> word_idx_map_;
    std::unordered_set<std::string> word_whitelist_;
  };
}
}
#endif
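For orientation, a short editorial driver that exercises the Dictionary interface declared above; it assumes the project's dictionary.h and dictionary.cc are available to compile against.

#include "dictionary.h"
#include <iostream>

using multiverso::wordembedding::Dictionary;
using multiverso::wordembedding::WordInfo;

int main() {
  Dictionary dict;
  dict.Insert("the", 120);
  dict.Insert("cat", 3);
  dict.Insert("the");                 // default cnt = 1, freq becomes 121
  dict.RemoveWordsLessThan(5);        // drops "cat" (freq 3 < 5)

  std::cout << "vocab size: " << dict.Size() << "\n";
  dict.StartIteration();
  while (dict.HasMore()) {
    const WordInfo* info = dict.Next();
    std::cout << info->word << " " << info->freq << "\n";
  }
  return 0;
}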
@ -1,478 +1,467 @@
|
|||
#include "distributed_wordembedding.h"
|
||||
|
||||
#include "distributed_wordembedding.h"
|
||||
namespace multiverso
|
||||
{
|
||||
namespace wordembedding
|
||||
namespace wordembedding
|
||||
{
|
||||
void Distributed_wordembedding::LoadOneBlock(DataBlock *data_block,
|
||||
Reader *reader, int64 size)
|
||||
void Distributed_wordembedding::Train(int argc, char *argv[])
|
||||
{
|
||||
clock_t start = clock();
|
||||
//The barrier for trainers
|
||||
multiverso::Barrier* barrier =
|
||||
new multiverso::Barrier(option_->thread_cnt);
|
||||
|
||||
data_block->ClearSentences();
|
||||
reader->ResetSize(size);
|
||||
while (true)
|
||||
{
|
||||
int64 word_count = 0;
|
||||
int *sentence = new (std::nothrow)int[kMaxSentenceLength + 2];
|
||||
assert(sentence != nullptr);
|
||||
int sentence_length = reader->GetSentence(sentence, word_count);
|
||||
if (sentence_length > 0)
|
||||
{
|
||||
data_block->AddSentence(sentence, sentence_length,
|
||||
word_count, (uint64)rand() * 10000 + (uint64)rand());
|
||||
}
|
||||
else
|
||||
{
|
||||
//Reader read eof or has read data_block->size bytes before,
|
||||
//reader_->GetSentence will return 0
|
||||
delete[] sentence;
|
||||
break;
|
||||
}
|
||||
}
|
||||
MemoryManager* memory_mamanger =
|
||||
new MemoryManager(option_->embeding_size);
|
||||
WordEmbedding* WordEmbeddings[2] =
|
||||
{ new WordEmbedding(option_, huffman_encoder_,
|
||||
sampler_, dictionary_->Size()),
|
||||
new WordEmbedding(option_, huffman_encoder_,
|
||||
sampler_, dictionary_->Size()) };
|
||||
|
||||
multiverso::Log::Info("Rank %d LoadOneDataBlockTime:%lfs\n",process_id_,
|
||||
(clock() - start) / (double)CLOCKS_PER_SEC);
|
||||
}
|
||||
//Step 1, Create Multiverso ParameterLoader and Trainers,
|
||||
//Start Multiverso environment
|
||||
WordEmbeddings[1]->MallocMemory();
|
||||
|
||||
void Distributed_wordembedding::StartLoadDataThread(BlockQueue *block_queue, Reader *reader, int64 file_size){
|
||||
int data_block_count = 0;
|
||||
for (int cur_epoch = 0; cur_epoch < option_->epoch; ++cur_epoch)
|
||||
{
|
||||
clock_t start_epoch = clock();
|
||||
reader_->ResetStart();
|
||||
for (int64 cur = 0; cur < file_size; cur += option_->data_block_size)
|
||||
{
|
||||
DataBlock *data_block = new (std::nothrow)DataBlock();
|
||||
assert(data_block != nullptr);
|
||||
LoadOneBlock(data_block, reader, option_->data_block_size);
|
||||
//Prepare option_->thread_cnt trainers for multiverso
|
||||
std::vector<multiverso::TrainerBase*>trainers;
|
||||
for (int i = 0; i < option_->thread_cnt; ++i)
|
||||
{
|
||||
trainers.push_back(new (std::nothrow)Trainer(i, option_,
|
||||
barrier, dictionary_, WordEmbeddings[1], memory_mamanger));
|
||||
assert(trainers[i] != nullptr);
|
||||
}
|
||||
|
||||
//multiverso::Log::Info("Rank %d Load Thread load the %d Data Block\n",process_id_,data_block_count);
|
||||
data_block_count++;
|
||||
//Start a thread to collect word_count from every trainers,
|
||||
//and update the WordEmbeddings[1]->word_count_actual
|
||||
StartCollectWordcountThread(trainers, WordEmbeddings[1]);
|
||||
|
||||
std::unique_lock<std::mutex> lock(block_queue->mtx);
|
||||
(block_queue->queues).push(data_block);
|
||||
(block_queue->repo_not_empty).notify_all();
|
||||
lock.unlock();
|
||||
}
|
||||
}
|
||||
//Prepare ParameterLoader
|
||||
ParameterLoader *parameter_loader =new (std::nothrow)ParameterLoader(
|
||||
option_, WordEmbeddings[0]);
|
||||
assert(parameter_loader != nullptr);
|
||||
|
||||
DataBlock *data_block = new (std::nothrow)DataBlock();
|
||||
assert(data_block != nullptr);
|
||||
data_block->SetLastFlag();
|
||||
std::unique_lock<std::mutex> lock(block_queue->mtx);
|
||||
(block_queue->queues).push(data_block);
|
||||
(block_queue->repo_not_empty).notify_all();
|
||||
lock.unlock();
|
||||
}
|
||||
//Step 2, prepare the Config for multiverso
|
||||
multiverso::Config config;
|
||||
config.max_delay = option_->max_delay;
|
||||
config.num_servers = option_->num_servers;
|
||||
config.num_aggregator = option_->num_aggregator;
|
||||
config.is_pipeline = option_->is_pipeline;
|
||||
config.lock_option =
|
||||
static_cast<multiverso::LockOption>(option_->lock_option);
|
||||
config.num_lock = option_->num_lock;
|
||||
//Config.server_endpoint_file = std::string(option_->endpoints_file);
|
||||
|
||||
DataBlock* Distributed_wordembedding::GetDataFromQueue(BlockQueue *block_queue){
|
||||
std::unique_lock<std::mutex> lock(block_queue->mtx);
|
||||
// item buffer is empty, just wait here.
|
||||
while (block_queue->queues.size() == 0) {
|
||||
multiverso::Log::Info("Waiting For Loading Data Block...\n");
|
||||
(block_queue->repo_not_empty).wait(lock);
|
||||
}
|
||||
//Step3, Init the environment of multiverso
|
||||
multiverso::Multiverso::Init(trainers, parameter_loader,
|
||||
config, &argc, &argv);
|
||||
|
||||
DataBlock *temp = block_queue->queues.front();
|
||||
multiverso::Log::Info("Geting Data Block From Queue...\n");
|
||||
block_queue->queues.pop();
|
||||
lock.unlock();
|
||||
return temp;
|
||||
}
|
||||
|
||||
DataBlock* Distributed_wordembedding::GetBlockAndPrepareParameter(BlockQueue *block_queue_){
|
||||
DataBlock* data_block = GetDataFromQueue(block_queue_);
|
||||
if (data_block->Size() == 0){
|
||||
return data_block;
|
||||
}
|
||||
data_block->MallocMemory(dictionary_->Size(), option_);
|
||||
PrepareData(data_block);
|
||||
communicator_->RequestParameter(data_block);
|
||||
GetAllWordCount();
|
||||
return data_block;
|
||||
}
|
||||
|
||||
void Distributed_wordembedding::GetAllWordCount(){
|
||||
WordEmbedding_->word_count_actual = communicator_->GetWordCount();
|
||||
WordEmbedding_->UpdateLearningRate();
|
||||
//multiverso::Log::Info("Get all word count done.,word count actual is %d\n", WordEmbedding_->word_count_actual);
|
||||
}
|
||||
|
||||
void Distributed_wordembedding::AddDeltaWordCount(){
|
||||
int64 temp_word_count = communicator_->GetWordCount();
|
||||
temp_word_count = WordEmbedding_->word_count_actual - temp_word_count;
|
||||
communicator_->AddWordCount(temp_word_count);
|
||||
//multiverso::Log::Info("Add word count done.word count delta is %d\n", WordEmbedding_->word_count_actual);
|
||||
}
|
||||
|
||||
void Distributed_wordembedding::StartWordCount()
|
||||
{
|
||||
multiverso::Log::Info("Rank %d Start word count thread\n.",process_id_);
|
||||
int64 total_word_count = 0, sum = 0;
|
||||
while (is_running_)
|
||||
{
|
||||
sum = 0;
|
||||
for (int i = 0; i < trainers_.size(); ++i)
|
||||
sum += trainers_[i]->word_count;
|
||||
|
||||
if (sum < 10000 + total_word_count)
|
||||
{
|
||||
std::chrono::milliseconds dura(20);
|
||||
std::this_thread::sleep_for(dura);
|
||||
}
|
||||
else
|
||||
{
|
||||
WordEmbedding_->word_count_actual += sum - total_word_count;
|
||||
WordEmbedding_->UpdateLearningRate();
|
||||
total_word_count = sum;
|
||||
if (!option_->use_adagrad)
|
||||
{
|
||||
/*
|
||||
multiverso::Log::Info("Rank %d Alpha: %lf Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
|
||||
multiverso::MV_Rank(), WordEmbedding_->learning_rate,
|
||||
WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
|
||||
WordEmbedding_->word_count_actual,
|
||||
total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
|
||||
*/
|
||||
}
|
||||
else
|
||||
{
|
||||
/*
|
||||
multiverso::Log::Info("Rank %d Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
|
||||
multiverso::MV_Rank(),
|
||||
WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
|
||||
WordEmbedding_->word_count_actual,
|
||||
total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
|
||||
*/
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//Add the left word_count to the WordEmbedding
|
||||
WordEmbedding_->word_count_actual += sum - total_word_count;
|
||||
WordEmbedding_->UpdateLearningRate();
|
||||
}
|
||||
|
||||
void Distributed_wordembedding::StartCollectWordcountThread()
|
||||
{
|
||||
is_running_ = true;
|
||||
collect_wordcount_thread_ = std::thread(
|
||||
&Distributed_wordembedding::StartWordCount, this);
|
||||
}
|
||||
|
||||
void Distributed_wordembedding::StopCollectWordcountThread()
|
||||
{
|
||||
is_running_ = false;
|
||||
collect_wordcount_thread_.join();
|
||||
}
|
||||
|
||||
void Distributed_wordembedding::TrainNeuralNetwork(){
|
||||
int64 file_size = GetFileSize(option_->train_file);
|
||||
multiverso::Log::Info("train-file-size:%lld, data_block_size:%lld.\n",
|
||||
file_size, option_->data_block_size);
|
||||
|
||||
block_queue_ = new BlockQueue();
|
||||
load_data_thread_ = std::thread(&Distributed_wordembedding::StartLoadDataThread, this, block_queue_, reader_, file_size);
|
||||
|
||||
WordEmbedding_ = new WordEmbedding(option_, huffman_encoder_,
|
||||
sampler_, dictionary_->Size());
|
||||
assert(WordEmbedding_ != nullptr);
|
||||
|
||||
for (int i = 0; i < option_->thread_cnt; ++i)
|
||||
{
|
||||
trainers_.push_back(new (std::nothrow) Trainer(i, option_, dictionary_, WordEmbedding_));
|
||||
assert(trainers_[i] != nullptr);
|
||||
}
|
||||
|
||||
StartCollectWordcountThread();
|
||||
|
||||
start_ = clock();
|
||||
int data_block_count = 0;
|
||||
DataBlock *next_block = nullptr;
|
||||
DataBlock *data_block = nullptr;
|
||||
|
||||
data_block = GetBlockAndPrepareParameter(block_queue_);
|
||||
if (data_block == nullptr){
|
||||
multiverso::Log::Info("Please Change the Bigger Block Size.\n");
|
||||
return;
|
||||
}
|
||||
data_block_count++;
|
||||
|
||||
int64 all = file_size / option_->data_block_size + 1;
|
||||
for (int cur_epoch = 0; cur_epoch < option_->epoch; ++cur_epoch)
|
||||
{
|
||||
clock_t start_epoch = clock();
|
||||
for (int64 cur = 0; cur < all; ++cur)
|
||||
{
|
||||
clock_t start_block = clock();
|
||||
|
||||
if (option_->is_pipeline == false){
|
||||
#pragma omp parallel for num_threads(option_->thread_cnt)
|
||||
for (int i = 0; i < option_->thread_cnt; ++i){
|
||||
trainers_[i]->TrainIteration(data_block);
|
||||
}
|
||||
|
||||
communicator_->AddDeltaParameter(data_block);
|
||||
delete data_block;
|
||||
|
||||
data_block = GetBlockAndPrepareParameter(block_queue_);
|
||||
data_block_count++;
|
||||
multiverso::Log::Info("Get the %d Data Block and Request done.\n", data_block_count);
|
||||
}
|
||||
else{
|
||||
#pragma omp parallel num_threads(option_->thread_cnt+1)
|
||||
{
|
||||
if (omp_get_thread_num() == option_->thread_cnt){
|
||||
next_block = GetBlockAndPrepareParameter(block_queue_);
|
||||
data_block_count++;
|
||||
}
|
||||
else{
|
||||
trainers_[omp_get_thread_num()]->TrainIteration(data_block);
|
||||
}
|
||||
}
|
||||
|
||||
communicator_->AddDeltaParameter(data_block);
|
||||
delete data_block;
|
||||
|
||||
//If next_block == nullptr, data_block becomes null and the next block is not run
|
||||
data_block = next_block;
|
||||
next_block = nullptr;
|
||||
}
|
||||
|
||||
multiverso::Log::Info("Rank %d Dealing one block time:%lfs\n", process_id_,
|
||||
(clock() - start_block) / (double)CLOCKS_PER_SEC);
|
||||
}
|
||||
|
||||
multiverso::Log::Info("Rank %d Dealing %d epoch time:%lfs\n", process_id_, cur_epoch,
|
||||
(clock() - start_epoch) / (double)CLOCKS_PER_SEC);
|
||||
|
||||
if (process_id_ == 0){
|
||||
SaveEmbedding(ChangeFileName(option_->output_file, cur_epoch), option_->output_binary);
|
||||
}
|
||||
}
|
||||
|
||||
multiverso::Log::Info("Rank %d Finish Traning %d Block.\n",process_id_, data_block_count);
|
||||
|
||||
StopCollectWordcountThread();
|
||||
//multiverso::Log::Info("Rank %d stop the word count thread.\n", process_id_);
|
||||
load_data_thread_.join();
|
||||
//multiverso::Log::Info("Rank %d stop the load data thread.\n", process_id_);
|
||||
assert(data_block->isLast() == true);
|
||||
delete data_block;
|
||||
|
||||
delete WordEmbedding_;
|
||||
delete block_queue_;
|
||||
for (auto trainer : trainers_)
|
||||
{
|
||||
delete trainer;
|
||||
}
|
||||
//multiverso::Log::Info("Rank %d delete all pointers.\n",process_id_);
|
||||
}
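	//In pipeline mode the block loop runs option_->thread_cnt + 1 OMP threads:
	//threads 0..thread_cnt-1 train on the current block while the extra thread
	//prefetches the next block and its parameters, overlapping communication
	//with computation. In non-pipeline mode the same steps run sequentially.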
|
||||
|
||||
const char* Distributed_wordembedding::ChangeFileName(const char *file_path, int iteration){
	std::string c_iteration = "_" + std::to_string(iteration);
	//Allocate enough space for the original path plus the "_<iteration>" suffix
	char *temp = new char[strlen(file_path) + c_iteration.size() + 1];
	strcpy(temp, file_path);
	return strcat(temp, c_iteration.c_str());
}
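//Note: the returned buffer is allocated with new[] and never freed by the
//caller (SaveEmbedding receives it as a const char*), so each epoch leaks
//one filename-sized buffer.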
|
||||
|
||||
void Distributed_wordembedding::SaveEmbedding(const char *file_path, bool is_binary){
|
||||
clock_t start = clock();
|
||||
|
||||
const int batch = 50000;
|
||||
int epoch = dictionary_->Size() / batch;
|
||||
int left = dictionary_->Size() % batch;
|
||||
int base = 0;
|
||||
std::vector<real*> blocks;
|
||||
std::vector<int> nodes;
|
||||
|
||||
FILE* fid = nullptr;
|
||||
fid = is_binary ? fopen(file_path, "wb") : fopen(file_path, "wt");
|
||||
fprintf(fid, "%d %d\n", dictionary_->Size(), option_->embeding_size);
|
||||
|
||||
for (int i = 0; i < epoch; ++i){
|
||||
for (int j = 0; j < batch; ++j){
|
||||
nodes.push_back(base + j);
|
||||
}
|
||||
|
||||
communicator_->RequestBlocks(batch, blocks);
|
||||
communicator_->GetWorkerTableRows(nodes, blocks,option_->embeding_size);
|
||||
WriteToFile(is_binary, blocks,fid);
|
||||
communicator_->ReturnBlocks(blocks);
|
||||
|
||||
blocks.clear();
|
||||
nodes.clear();
|
||||
base = (i + 1)*batch;
|
||||
}
|
||||
|
||||
if (left > 0){
|
||||
for (int j = 0; j <left; ++j){
|
||||
nodes.push_back(base + j);
|
||||
}
|
||||
communicator_->RequestBlocks(left, blocks);
|
||||
communicator_->GetWorkerTableRows(nodes, blocks, option_->embeding_size);
|
||||
WriteToFile(is_binary, blocks, fid);
|
||||
communicator_->ReturnBlocks(blocks);
|
||||
}
|
||||
|
||||
fclose(fid);
|
||||
multiverso::Log::Info("Rank % dSaving Embedding time:%lfs\n", process_id_,
|
||||
(clock() - start) / (double)CLOCKS_PER_SEC);
|
||||
}
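	//Rows are fetched and written in batches of 50000 so that only
	//batch * embeding_size floats are resident at once instead of the whole
	//vocabulary-sized embedding table.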
|
||||
|
||||
void Distributed_wordembedding::WriteToFile(bool is_binary, std::vector<real*> &blocks, FILE* fid){
|
||||
for (int i = 0; i < blocks.size(); ++i)
|
||||
{
|
||||
fprintf(fid, "%s ", dictionary_->GetWordInfo(i)->word.c_str());
|
||||
for (int j = 0; j < option_->embeding_size; ++j)
|
||||
{
|
||||
if (is_binary){
|
||||
real tmp = blocks[i][j];
|
||||
fwrite(&tmp, sizeof(real), 1, fid);
|
||||
}
|
||||
else{
|
||||
fprintf(fid, "%lf ", blocks[i][j]);
|
||||
}
|
||||
}
|
||||
fprintf(fid, "\n");
|
||||
}
|
||||
}
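	//The output follows the word2vec format: a "vocab_size dimension" header
	//line, then one word per record followed by its vector, stored as raw
	//floats in binary mode or as formatted text otherwise. Note that the
	//dictionary is indexed here with the batch-local index i, while the
	//vectors in blocks correspond to global rows base + i, so batches after
	//the first appear to be labelled with the wrong words.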
|
||||
|
||||
void Distributed_wordembedding::PrepareData(DataBlock *data_block){
|
||||
clock_t start = clock();
|
||||
WordEmbedding_->PrepareData(data_block);
|
||||
multiverso::Log::Info("Rank %d Prepare data time:%lfs.\n",process_id_,
|
||||
(clock() - start) / (double)CLOCKS_PER_SEC);
|
||||
}
|
||||
|
||||
void Distributed_wordembedding::Train(int argc, char *argv[])
{
	//fix later
	argc = 1;
	argv = nullptr;
	multiverso::MV_Init(&argc, argv);
	multiverso::Log::Info("MV Rank %d Init done.\n", multiverso::MV_Rank());
	char log_name[100];
	sprintf(log_name, "log%s.txt", g_log_suffix.c_str());
	multiverso::Log::ResetLogFile(log_name);

	MV_Barrier();
	multiverso::Log::Info("MV Barrier done.\n");
	//Mark the node machine number
	process_id_ = multiverso::MV_Rank();

	//create worker table and server table
	communicator_->PrepareParameterTables(dictionary_->Size(), option_->embeding_size);

	//start to train
	TrainNeuralNetwork();

	MV_ShutDown();
	multiverso::Log::Info("MV ShutDown done.\n");
}

void Distributed_wordembedding::Train(int argc, char *argv[])
{
	//Mark the node machine number
	process_id_ = multiverso::Multiverso::ProcessRank();
	//Step 4, prepare the server/aggregator/cache Table for the parameter tables (3 or 5)
	//and initialize the server table for the input vectors
	PrepareMultiversoParameterTables(option_, dictionary_);

	//Step 5, start the Train of NN
	TrainNeuralNetwork();

	//Step 6, stop the thread which is collecting word_count,
	//and release the resources
	StopCollectWordcountThread();
	delete barrier;
	delete memory_mamanger;
	delete WordEmbeddings[0];
	delete WordEmbeddings[1];
	for (auto trainer : trainers)
	{
		delete trainer;
	}
	delete parameter_loader;
	multiverso::Multiverso::Close();
}

//The thread to collect word_count from trainers_
void Distributed_wordembedding::StartThread()
{
	int64 total_word_count = 0, sum = 0;
	while (is_running_)
	{
		sum = 0;
		for (int i = 0; i < trainers_.size(); ++i)
			sum += trainers_[i]->word_count;

		if (sum < 10000 + total_word_count)
		{
			std::chrono::milliseconds dura(20);
			std::this_thread::sleep_for(dura);
		}
		else
		{
			WordEmbedding_->word_count_actual += sum - total_word_count;
			WordEmbedding_->UpdateLearningRate();
			total_word_count = sum;

			if (!option_->use_adagrad)
			{
				multiverso::Log::Info("Rank %d Alpha: %lf Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
					multiverso::Multiverso::ProcessRank(), WordEmbedding_->learning_rate,
					WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
					WordEmbedding_->word_count_actual,
					total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
			}
			else
			{
				multiverso::Log::Info("Rank %d Progress: %.2lf%% WordCountActual: %lld Words/thread/second %lfk\n",
					multiverso::Multiverso::ProcessRank(),
					WordEmbedding_->word_count_actual / ((double)option_->total_words * option_->epoch + 1) * 100,
					WordEmbedding_->word_count_actual,
					total_word_count / ((double)option_->thread_cnt * (clock() - start_) / CLOCKS_PER_SEC * 1000.0));
			}
		}
	}

	//Add the left word_count to the WordEmbedding
	WordEmbedding_->word_count_actual += sum - total_word_count;
	WordEmbedding_->UpdateLearningRate();
}

//Start a thread to collect the word count from trainers
//The thread can be stopped by StopCollectWordcountThread()
void Distributed_wordembedding::StartCollectWordcountThread(
	std::vector<multiverso::TrainerBase*> &trainer_bases, WordEmbedding *WordEmbedding)
{
	is_running_ = true;
	WordEmbedding_ = WordEmbedding;
	for (int i = 0; i < trainer_bases.size(); ++i)
		trainers_.push_back(reinterpret_cast<Trainer*>(trainer_bases[i]));

	//Start a thread to collect the actual_word_count
	collect_wordcount_thread_ = std::thread(
		&Distributed_wordembedding::StartThread, this);
}

//Stop the thread which is collecting the word_count_actual from trainers
void Distributed_wordembedding::StopCollectWordcountThread()
{
	is_running_ = false;
	collect_wordcount_thread_.join();
}

void Distributed_wordembedding::Run(int argc, char *argv[])
{
	g_log_suffix = GetSystemTime();
	srand(static_cast<unsigned int>(time(NULL)));

	option_ = new (std::nothrow)Option();
	assert(option_ != nullptr);

	dictionary_ = new (std::nothrow)Dictionary();
	assert(dictionary_ != nullptr);

	huffman_encoder_ = new (std::nothrow)HuffmanEncoder();
	assert(huffman_encoder_ != nullptr);

	//Parse arguments and store them in option
	if (argc <= 1)
	{
		option_->PrintUsage();
		return;
	}
	option_->ParseArgs(argc, argv);

	//Read the vocabulary file; create the dictionary
	//and huffman_encoder according to opt
	if ((option_->hs == 1) && (option_->negative_num != 0))
	{
		multiverso::Log::Fatal("Hierarchical Softmax and Negative Sampling cannot be enabled at the same time!\n");
		exit(0);
	}

	option_->total_words = LoadVocab(option_, dictionary_,
		huffman_encoder_);

	option_->PrintArgs();

	sampler_ = new (std::nothrow)Sampler();
	assert(sampler_ != nullptr);
	if (option_->negative_num)
		sampler_->SetNegativeSamplingDistribution(dictionary_);

	char *filename = new (std::nothrow)char[strlen(option_->train_file) + 1];
	assert(filename != nullptr);
	strcpy(filename, option_->train_file);
	reader_ = new (std::nothrow)Reader(dictionary_, option_, sampler_, filename);
	assert(reader_ != nullptr);
	communicator_ = new (std::nothrow)Communicator(option_);

	//Train with multiverso
	this->Train(argc, argv);

	delete option_;
	delete dictionary_;
	delete huffman_encoder_;
	delete sampler_;
	delete reader_;
	delete communicator_;
}

//Get the size of the file; it must handle large files
int64 Distributed_wordembedding::GetFileSize(const char *filename)
{
#ifdef _MSC_VER
	struct _stat64 info;
	_stat64(filename, &info);
	return (int64)info.st_size;
#else
	struct stat info;
	stat(filename, &info);
	return (int64)info.st_size;
#endif
}

//Read the vocabulary file; create the dictionary
//and huffman_encoder according to opt
int64 Distributed_wordembedding::LoadVocab(Option *opt,
	Dictionary *dictionary, HuffmanEncoder *huffman_encoder)
{
	int64 total_words = 0;
	char word[kMaxString];
	FILE* fid = nullptr;
	clock_t start = clock();
	multiverso::Log::Info("vocab_file %s\n", opt->read_vocab_file);

	if (opt->read_vocab_file != nullptr && strlen(opt->read_vocab_file) > 0)
	{
		multiverso::Log::Info("Begin to load vocabulary file [%s] ...\n",
			opt->read_vocab_file);
		fid = fopen(opt->read_vocab_file, "r");
		if (fid == nullptr)
		{
			multiverso::Log::Fatal("Open vocab_file failed!\n");
			exit(1);
		}
		int word_freq;
		while (fscanf(fid, "%s %d", word, &word_freq) != EOF)
		{
			dictionary->Insert(word, word_freq);
		}
	}

	dictionary->RemoveWordsLessThan(opt->min_count);
	multiverso::Log::Info("Dictionary size: %d\n", dictionary->Size());

	total_words = 0;
	for (int i = 0; i < dictionary->Size(); ++i)
		total_words += dictionary->GetWordInfo(i)->freq;
	multiverso::Log::Info("Words in Dictionary %I64d\n", total_words);

	multiverso::Log::Info("Loading vocab time:%lfs\n",
		(clock() - start) / (double)CLOCKS_PER_SEC);

	if (opt->hs)
		huffman_encoder->BuildFromTermFrequency(dictionary);
	if (fid != nullptr)
		fclose(fid);

	return total_words;
}

//Create the three kinds of tables
void Distributed_wordembedding::CreateMultiversoParameterTable(
	multiverso::integer_t table_id, multiverso::integer_t rows,
	multiverso::integer_t cols, multiverso::Type type,
	multiverso::Format default_format)
{
	multiverso::Multiverso::AddServerTable(table_id, rows,
		cols, type, default_format);
	multiverso::Multiverso::AddCacheTable(table_id, rows,
		cols, type, default_format, 0);
	multiverso::Multiverso::AddAggregatorTable(table_id, rows,
		cols, type, default_format, 0);
}

void Distributed_wordembedding::PrepareMultiversoParameterTables(
	Option *opt, Dictionary *dictionary)
{
	multiverso::Multiverso::BeginConfig();
	int proc_count = multiverso::Multiverso::TotalProcessCount();

	//Create tables, the order of creating tables should arise from 0 continuously
	//The elements of tables will be initialized with 0
	CreateMultiversoParameterTable(kInputEmbeddingTableId,
		dictionary->Size(), opt->embeding_size,
		multiverso::Type::Float, multiverso::Format::Dense);
	CreateMultiversoParameterTable(kEmbeddingOutputTableId,
		dictionary->Size(), opt->embeding_size,
		multiverso::Type::Float, multiverso::Format::Dense);
	CreateMultiversoParameterTable(kWordCountActualTableId, 1, 1,
		multiverso::Type::LongLong, multiverso::Format::Dense);

	if (opt->use_adagrad)
	{
		CreateMultiversoParameterTable(kSumGradient2IETableId,
			dictionary->Size(), opt->embeding_size,
			multiverso::Type::Float, multiverso::Format::Dense);
		CreateMultiversoParameterTable(kSumGradient2EOTableId,
			dictionary->Size(), opt->embeding_size,
			multiverso::Type::Float, multiverso::Format::Dense);
	}

	//Initialize server tables
	//Every process will execute the code below, so the initialized
	//value should be divided by the number of processes
	for (int row = 0; row < dictionary->Size(); ++row)
	{
		for (int col = 0; col < opt->embeding_size; ++col)
		{
			multiverso::Multiverso::AddToServer<real>(
				kInputEmbeddingTableId, row, col,
				static_cast<real>((static_cast<real>(rand())
				/ RAND_MAX - 0.5) / opt->embeding_size / proc_count));
		}
	}

	multiverso::Multiverso::EndConfig();
}

//Remove the datablocks which have been dealt with by the parameterloader and trainers
void Distributed_wordembedding::RemoveDoneDataBlock(
	std::queue<DataBlock*> &datablock_queue)
{
	while (datablock_queue.empty() == false
		&& datablock_queue.front()->IsDone())
	{
		DataBlock *p_data_block = datablock_queue.front();
		datablock_queue.pop();
		delete p_data_block;
	}
}

void Distributed_wordembedding::PushDataBlock(
	std::queue<DataBlock*> &datablock_queue, DataBlock* data_block)
{
	multiverso::Multiverso::PushDataBlock(data_block);

	datablock_queue.push(data_block);
	//limit the max size of total datablocks to avoid out of memory
	while (static_cast<int64>(datablock_queue.size()) * option_->data_block_size
		> option_->max_preload_data_size)
	{
		std::chrono::milliseconds dura(200);
		std::this_thread::sleep_for(dura);
		//Remove the datablocks which have been dealt with by the parameterloader and trainers
		RemoveDoneDataBlock(datablock_queue);
	}
}

void Distributed_wordembedding::TrainNeuralNetwork()
{
	std::queue<DataBlock*> datablock_queue;
	int data_block_count = 0;
	int64 file_size = GetFileSize(option_->train_file);
	multiverso::Log::Info("train-file-size:%lld, data_block_size:%lld\n",
		file_size, option_->data_block_size);
	start_ = clock();
	multiverso::Multiverso::BeginTrain();
	for (int cur_epoch = 0; cur_epoch < option_->epoch; ++cur_epoch)
	{
		reader_->ResetStart();
		multiverso::Multiverso::BeginClock();
		for (int64 cur = 0; cur < file_size; cur += option_->data_block_size)
		{
			++data_block_count;
			DataBlock *data_block = new (std::nothrow)DataBlock();
			assert(data_block != nullptr);
			//Load the sentences from the train file, and store them in data_block
			clock_t start = clock();
			LoadData(data_block, reader_, option_->data_block_size);
			multiverso::Log::Info("LoadOneDataBlockTime:%lfs\n",
				(clock() - start) / (double)CLOCKS_PER_SEC);
			PushDataBlock(datablock_queue, data_block);
		}
		multiverso::Multiverso::EndClock();
	}

	//Dump input-embedding weight
	multiverso::Multiverso::BeginClock();
	++data_block_count;
	DataBlock *data_block = new (std::nothrow)DataBlock();
	assert(data_block != nullptr);
	data_block->SetType(DataBlockType::Test);
	PushDataBlock(datablock_queue, data_block);
	multiverso::Multiverso::EndClock();

	multiverso::Log::Info("Rank %d Pushed %d datablocks\n",
		process_id_, data_block_count);

	multiverso::Multiverso::EndTrain();

	//After EndTrain, all the datablocks are done,
	//so we remove all the datablocks
	RemoveDoneDataBlock(datablock_queue);
}
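//In this variant the pipeline is driven by the multiverso framework: each
//pushed block flows through the parameter loader and the trainers, and
//PushDataBlock blocks once the preloaded data exceeds
//option_->max_preload_data_size, throttling the reader to match training.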
|
||||
|
||||
void Distributed_wordembedding::LoadData(DataBlock *data_block,
|
||||
Reader *reader, int64 size)
|
||||
{
|
||||
//Be sure to clear all the sentences
|
||||
//which were stored in data_block
|
||||
data_block->ClearSentences();
|
||||
reader->ResetSize(size);
|
||||
while (true)
|
||||
{
|
||||
int64 word_count = 0;
|
||||
int *sentence = new (std::nothrow)int[kMaxSentenceLength + 2];
|
||||
assert(sentence != nullptr);
|
||||
int sentence_length = reader->GetSentence(sentence, word_count);
|
||||
if (sentence_length > 0)
|
||||
{
|
||||
data_block->AddSentence(sentence, sentence_length,
|
||||
word_count, (uint64)rand() * 10000 + (uint64)rand());
|
||||
}
|
||||
else
|
||||
{
|
||||
//Reader read eof or has read data_block->size bytes before,
|
||||
//reader_->GetSentence will return 0
|
||||
delete[] sentence;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
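	//Each sentence buffer (kMaxSentenceLength + 2 ints) is handed off to the
	//DataBlock, which takes ownership; only the final buffer that signalled
	//end-of-block is freed here.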
|
||||
|
||||
void Distributed_wordembedding::Run(int argc, char *argv[])
|
||||
{
|
||||
g_log_suffix = GetSystemTime();
|
||||
srand(static_cast<unsigned int>(time(NULL)));
|
||||
option_ = new (std::nothrow)Option();
|
||||
assert(option_ != nullptr);
|
||||
dictionary_ = new (std::nothrow)Dictionary();
|
||||
assert(dictionary_ != nullptr);
|
||||
huffman_encoder_ = new (std::nothrow)HuffmanEncoder();
|
||||
assert(huffman_encoder_ != nullptr);
|
||||
//Parse argument and store them in option
|
||||
|
||||
if (argc <= 1)
|
||||
{
|
||||
option_->PrintUsage();
|
||||
return;
|
||||
}
|
||||
|
||||
option_->ParseArgs(argc, argv);
|
||||
//Read the vocabulary file; create the dictionary
|
||||
//and huffman_encoder according opt
|
||||
|
||||
if ((option_->hs == 1) && (option_->negative_num != 0))
|
||||
{
|
||||
multiverso::Log::Fatal("The Hierarchical Softmax and Negative Sampling is indefinite!\n");
|
||||
exit(0);
|
||||
}
|
||||
|
||||
multiverso::Log::Info("Loading vocabulary ...\n");
|
||||
option_->total_words = LoadVocab(option_, dictionary_,
|
||||
huffman_encoder_);
|
||||
multiverso::Log::Info("Loaded vocabulary\n");
|
||||
|
||||
option_->PrintArgs();
|
||||
|
||||
sampler_ = new (std::nothrow)Sampler();
|
||||
assert(sampler_ != nullptr);
|
||||
if (option_->negative_num)
|
||||
sampler_->SetNegativeSamplingDistribution(dictionary_);
|
||||
|
||||
char *filename = new (std::nothrow)char[strlen(option_->train_file) + 1];
|
||||
assert(filename != nullptr);
|
||||
strcpy(filename, option_->train_file);
|
||||
reader_ = new (std::nothrow)Reader(dictionary_, option_, sampler_, filename);
|
||||
assert(reader_ != nullptr);
|
||||
//Train with multiverso
|
||||
this->Train(argc, argv);
|
||||
|
||||
delete reader_;
|
||||
delete sampler_;
|
||||
delete huffman_encoder_;
|
||||
delete dictionary_;
|
||||
delete option_;
|
||||
}
|
||||
|
||||
//Read a single word from train_file into the word buffer
|
||||
bool Distributed_wordembedding::ReadWord(char *word, FILE *fin)
|
||||
{
|
||||
int idx = 0;
|
||||
char ch;
|
||||
while (!feof(fin))
|
||||
{
|
||||
ch = fgetc(fin);
|
||||
if (ch == 13) continue;
|
||||
if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
|
||||
{
|
||||
if (idx > 0)
|
||||
{
|
||||
if (ch == '\n')
|
||||
ungetc(ch, fin);
|
||||
break;
|
||||
}
|
||||
|
||||
if (ch == '\n')
|
||||
{
|
||||
strcpy(word, (char *)"</s>");
|
||||
return true;
|
||||
}
|
||||
else
|
||||
{
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
word[idx++] = ch;
|
||||
if (idx >= kMaxString - 1)
|
||||
idx--;
|
||||
}
|
||||
|
||||
word[idx] = 0;
|
||||
return idx > 0;
|
||||
}
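	//Carriage returns (ch == 13) are skipped so Windows line endings do not
	//corrupt tokens; a newline is mapped to the sentence-boundary token "</s>",
	//and words longer than kMaxString - 1 are truncated in place.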
|
||||
|
||||
|
||||
//Read the vocabulary file; create the dictionary
|
||||
//and huffman_encoder according opt
|
||||
int64 Distributed_wordembedding::LoadVocab(Option *opt,
|
||||
Dictionary *dictionary, HuffmanEncoder *huffman_encoder)
|
||||
{
|
||||
int64 total_words = 0;
|
||||
char word[kMaxString];
|
||||
FILE* fid = nullptr;
|
||||
multiverso::Log::Info("vocab_file %s\n", opt->read_vocab_file);
|
||||
if (opt->read_vocab_file != nullptr && strlen(opt->read_vocab_file) > 0)
|
||||
{
|
||||
multiverso::Log::Info("Begin to load vocabulary file [%s] ...\n",
|
||||
opt->read_vocab_file);
|
||||
fid = fopen(opt->read_vocab_file, "r");
|
||||
if (fid == nullptr)
|
||||
{
|
||||
multiverso::Log::Fatal("Open vocab_file failed!\n");
|
||||
exit(1);
|
||||
}
|
||||
int word_freq;
|
||||
while (fscanf(fid, "%s %d", word, &word_freq) != EOF)
|
||||
{
|
||||
dictionary->Insert(word, word_freq);
|
||||
}
|
||||
}
|
||||
|
||||
dictionary->RemoveWordsLessThan(opt->min_count);
|
||||
multiverso::Log::Info("Dictionary size: %d\n", dictionary->Size());
|
||||
total_words = 0;
|
||||
for (int i = 0; i < dictionary->Size(); ++i)
|
||||
total_words += dictionary->GetWordInfo(i)->freq;
|
||||
multiverso::Log::Info("Words in Dictionary %I64d\n", total_words);
|
||||
if (opt->hs)
|
||||
huffman_encoder->BuildFromTermFrequency(dictionary);
|
||||
if (fid != nullptr)
|
||||
fclose(fid);
|
||||
|
||||
return total_words;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,15 +1,14 @@
|
|||
#ifndef DISTRIBUTED_WORD_EMBEDDING_DISTRIBUTED_WORDEMBEDDING_H_
|
||||
#define DISTRIBUTED_WORD_EMBEDDING_DISTRIBUTED_WORDEMBEDDING_H_
|
||||
#pragma once
|
||||
|
||||
/*!
|
||||
* file distributed_wordembedding.h
|
||||
* \brief Class Distributed_wordembedding describes the main frame of Distributed WordEmbedding and some useful functions
|
||||
* \brief Class Distributed_wordembedding describes the main frame of Distributed WordEmbedding and some useful functions
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <ctime>
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#include <unordered_set>
|
||||
#include <unordered_map>
|
||||
#include <iostream>
|
||||
|
@ -18,103 +17,107 @@
|
|||
#include <thread>
|
||||
#include <mutex>
|
||||
#include <functional>
|
||||
#include <omp.h>
|
||||
|
||||
#include "multiverso/multiverso.h"
|
||||
#include <sys/stat.h>
|
||||
|
||||
#include "util.h"
|
||||
#include "multiverso.h"
|
||||
#include "huffman_encoder.h"
|
||||
#include "reader.h"
|
||||
#include "data_block.h"
|
||||
#include "parameter_loader.h"
|
||||
#include "trainer.h"
|
||||
#include "memory_manager.h"
|
||||
#include "block_queue.h"
|
||||
#include "communicator.h"
|
||||
#include "reader.h"
|
||||
#include "log.h"
|
||||
#include "constant.h"
|
||||
|
||||
namespace multiverso
|
||||
{
|
||||
namespace wordembedding
|
||||
{
|
||||
extern std::string g_log_suffix;
|
||||
class Trainer;
|
||||
class WordEmbedding;
|
||||
class Communicator;
|
||||
namespace wordembedding
|
||||
{
|
||||
extern std::string g_log_suffix;
|
||||
class Trainer;
|
||||
class Distributed_wordembedding
|
||||
{
|
||||
public:
|
||||
Distributed_wordembedding(){}
|
||||
/*!
|
||||
* \brief Run Function contains everything
|
||||
*/
|
||||
void Run(int argc, char *argv[]);
|
||||
|
||||
class Distributed_wordembedding
|
||||
{
|
||||
public:
|
||||
Distributed_wordembedding(){}
|
||||
/*!
|
||||
* \brief Run Function contains everything
|
||||
*/
|
||||
void Run(int argc, char *argv[]);
|
||||
|
||||
private:
|
||||
clock_t start_;
|
||||
int process_id_;
|
||||
Option* option_ = nullptr;
|
||||
Dictionary* dictionary_ = nullptr;
|
||||
HuffmanEncoder* huffman_encoder_ = nullptr;
|
||||
Sampler* sampler_ = nullptr;
|
||||
Reader* reader_ = nullptr;
|
||||
WordEmbedding* WordEmbedding_ = nullptr;
|
||||
BlockQueue *block_queue_ = nullptr;
|
||||
std::thread load_data_thread_;
|
||||
std::thread collect_wordcount_thread_;
|
||||
bool is_running_ = false;
|
||||
std::vector<Trainer*> trainers_;
|
||||
Communicator* communicator_;
|
||||
|
||||
/*!
|
||||
* \brief Load Dictionary from the vocabulary_file
|
||||
* \param opt the model's option settings
|
||||
* \param dictionary save the vocabulary and its frequency
|
||||
* \param huffman_encoder convert dictionary to the huffman_code
|
||||
*/
|
||||
int64 LoadVocab(Option *opt, Dictionary *dictionary,
|
||||
HuffmanEncoder *huffman_encoder);
|
||||
|
||||
/*!
|
||||
* \brief Load data from train_file into the datablock
|
||||
* \param datablock the datablock which needs to be assigned
|
||||
* \param reader some useful function for calling
|
||||
* \param size datablock limit byte size
|
||||
*/
|
||||
//void LoadData(DataBlock *data_block, Reader *reader, int64 size);
|
||||
|
||||
/*!
|
||||
* \brief Complete the train task with multiverso
|
||||
*/
|
||||
void Train(int argc, char *argv[]);
|
||||
void TrainNeuralNetwork();
|
||||
|
||||
void PrepareData(DataBlock *data_block);
|
||||
|
||||
void StartLoadDataThread(BlockQueue *block_queue,
|
||||
Reader *reader, int64 file_size);
|
||||
|
||||
void LoadOneBlock(DataBlock *data_block,
|
||||
Reader *reader, int64 size);
|
||||
|
||||
void StartCollectWordcountThread();
|
||||
|
||||
void StopCollectWordcountThread();
|
||||
|
||||
void StartWordCount();
|
||||
|
||||
void GetAllWordCount();
|
||||
|
||||
void AddDeltaWordCount();
|
||||
|
||||
DataBlock* GetDataFromQueue(BlockQueue *block_queue);
|
||||
|
||||
DataBlock* GetBlockAndPrepareParameter(BlockQueue *block_queue_);
|
||||
|
||||
void SaveEmbedding(const char *file_path, bool is_binary);
|
||||
|
||||
void WriteToFile(bool is_binary, std::vector<real*> &blocks, FILE* fid);
|
||||
|
||||
const char* ChangeFileName(const char *file_path, int iteration);
|
||||
};
|
||||
}
|
||||
private:
|
||||
clock_t start_;
|
||||
int process_id_;
|
||||
Option* option_;
|
||||
Dictionary* dictionary_;
|
||||
HuffmanEncoder* huffman_encoder_;
|
||||
Sampler* sampler_;
|
||||
Reader* reader_;
|
||||
std::thread collect_wordcount_thread_;
|
||||
bool is_running_;
|
||||
std::vector<Trainer*> trainers_;
|
||||
WordEmbedding *WordEmbedding_;
|
||||
/*!
|
||||
* \brief Create a new thread which is used for
|
||||
* calculating the speed of word processing.
|
||||
*/
|
||||
void StartThread();
|
||||
void StartCollectWordcountThread(
|
||||
std::vector<multiverso::TrainerBase*> &trainer, WordEmbedding *WordEmbedding);
|
||||
void StopCollectWordcountThread();
|
||||
/*!
|
||||
* \brief Read the word from the train_file
|
||||
* \param word word saved by string
|
||||
* \param fin train_filename
|
||||
*/
|
||||
bool ReadWord(char *word, FILE *fin);
|
||||
/*!
|
||||
* \brief Load Dictionary from the vocabulary_file
|
||||
* \param opt the model's option settings
|
||||
* \param dictionary save the vocabulary and its frequency
|
||||
* \param huffman_encoder convert dictionary to the huffman_code
|
||||
*/
|
||||
int64 LoadVocab(Option *opt, Dictionary *dictionary,
|
||||
HuffmanEncoder *huffman_encoder);
|
||||
/*!
|
||||
* \brief Get the byte size of the file
|
||||
*/
|
||||
int64 GetFileSize(const char *filename);
|
||||
/*!
|
||||
* \brief Complete the train task with multiverso
|
||||
*/
|
||||
void Train(int argc, char *argv[]);
|
||||
void TrainNeuralNetwork();
|
||||
/*!
|
||||
* \brief Create a new table in the multiverso
|
||||
*/
|
||||
void CreateMultiversoParameterTable(multiverso::integer_t table_id,
|
||||
multiverso::integer_t rows, multiverso::integer_t cols,
|
||||
multiverso::Type type, multiverso::Format default_format);
|
||||
/*!
|
||||
* \brief Push the datablock into the multiverso and datablock_queue
|
||||
*/
|
||||
void PushDataBlock(std::queue<DataBlock*> &datablock_queue,
|
||||
DataBlock* data_block);
|
||||
/*!
|
||||
* \brief Prepare parameter table in the multiverso
|
||||
*/
|
||||
void PrepareMultiversoParameterTables(Option *opt,
|
||||
Dictionary *dictionary);
|
||||
/*!
|
||||
* \brief Load data from train_file into the datablock
|
||||
* \param datablock the datablock which needs to be assigned
|
||||
* \param reader some useful function for calling
|
||||
* \param size datablock limit byte size
|
||||
*/
|
||||
void LoadData(DataBlock *data_block, Reader *reader, int64 size);
|
||||
/*!
|
||||
* \brief Remove datablock which is finished by multiverso thread
|
||||
* \param datablock_queue store the pushed datablocks
|
||||
*/
|
||||
void RemoveDoneDataBlock(std::queue<DataBlock*> &datablock_queue);
|
||||
// No copying allowed
|
||||
Distributed_wordembedding(const Distributed_wordembedding&);
|
||||
void operator=(const Distributed_wordembedding&);
|
||||
};
|
||||
}
|
||||
}
|
||||
#endif
|
|
@ -1,287 +1,287 @@
|
|||
#include <cstring>
|
||||
#include "huffman_encoder.h"
|
||||
|
||||
namespace multiverso
|
||||
{
|
||||
namespace wordembedding
{
	HuffmanEncoder::HuffmanEncoder()
	{
		dict_ = nullptr;
	}

	//Save the word-huffmancode pair in file
	void HuffmanEncoder::Save2File(const char* filename)
	{
		FILE* fid = fopen(filename, "w");
		if (fid)
		{
			fprintf(fid, "%lld\n", hufflabel_info_.size());
			for (unsigned i = 0; i < hufflabel_info_.size(); ++i)
			{
				auto info = hufflabel_info_[i];
				const auto word = dict_->GetWordInfo(i);
				fprintf(fid, "%s %d", word->word.c_str(), info.codelen);
				for (int j = 0; j < info.codelen; ++j)
					fprintf(fid, " %d", info.code[j]);
				for (int j = 0; j < info.codelen; ++j)
					fprintf(fid, " %d", info.point[j]);
				fprintf(fid, "\n");
			}
			fclose(fid);
		}
		else
		{
			multiverso::Log::Error("file open failed %s", filename);
		}
	}

	//Recover the word-huffmancode pair from file
	void HuffmanEncoder::RecoverFromFile(const char* filename)
	{
		dict_ = new (std::nothrow)Dictionary();
		assert(dict_ != nullptr);
		FILE* fid;
		fid = fopen(filename, "r");
		if (fid)
		{
			int64 vocab_size;
			fscanf(fid, "%lld", &vocab_size);
			hufflabel_info_.reserve(vocab_size);
			hufflabel_info_.clear();

			int tmp;
			char sz_label[kMaxWordSize];
			for (int64 i = 0; i < vocab_size; ++i)
			{
				HuffLabelInfo info;

				//fscanf_s(fid, "%s", sz_label, kMaxWordSize);
				fscanf(fid, "%s", sz_label);
				dict_->Insert(sz_label);

				fscanf(fid, "%d", &info.codelen);

				info.code.clear();
				info.point.clear();

				for (int j = 0; j < info.codelen; ++j)
				{
					fscanf(fid, "%d", &tmp);
					info.code.push_back(tmp);
				}
				for (int j = 0; j < info.codelen; ++j)
				{
					fscanf(fid, "%d", &tmp);
					info.point.push_back(tmp);
				}

				hufflabel_info_.push_back(info);
			}
			fclose(fid);
		}
		else
		{
			multiverso::Log::Error("file open failed %s", filename);
		}
	}

	//Compare the second element of two pairs
	bool compare(const std::pair<int, int64>& x,
		const std::pair<int, int64>& y)
	{
		if (x.second == 0) return true;
		if (y.second == 0) return false;
		return (x.second > y.second);
	}

	//Build huffman tree from the existing dictionary
	void HuffmanEncoder::BuildHuffmanTreeFromDict()
	{
		std::vector<std::pair<int, int64> > ordered_words;
		ordered_words.reserve(dict_->Size());
		ordered_words.clear();
		for (int i = 0; i < dict_->Size(); ++i)
			ordered_words.push_back(std::pair<int, int64>(i, dict_->GetWordInfo(i)->freq));
		std::sort(ordered_words.begin(), ordered_words.end(), compare);

		unsigned vocab_size = (unsigned)ordered_words.size();
		// frequency of each node
		int64 *count = new (std::nothrow)int64[vocab_size * 2 + 1];
		assert(count != nullptr);
		// Huffman code relative to parent node [1,0] of each node
		unsigned *binary = new (std::nothrow)unsigned[vocab_size * 2 + 1];
		assert(binary != nullptr);
		memset(binary, 0, sizeof(unsigned)* (vocab_size * 2 + 1));

		unsigned *parent_node = new (std::nothrow)unsigned[vocab_size * 2 + 1];
		assert(parent_node != nullptr);
		memset(parent_node, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
		unsigned code[kMaxCodeLength], point[kMaxCodeLength];

		for (unsigned i = 0; i < vocab_size; ++i)
			count[i] = ordered_words[i].second;
		for (unsigned i = vocab_size; i < vocab_size * 2; i++)
			count[i] = static_cast<int64>(1e15);
		int pos1 = vocab_size - 1;
		int pos2 = vocab_size;
		int min1i, min2i;
		for (unsigned i = 0; i < vocab_size - 1; i++)
		{
			// First, find two smallest nodes 'min1, min2'
			assert(pos2 < static_cast<int>(vocab_size) * 2 - 1);
			//Find the smallest node
			if (pos1 >= 0)
			{
				if (count[pos1] < count[pos2])
				{
					min1i = pos1;
					pos1--;
				}
				else
				{
					min1i = pos2;
					pos2++;
				}
			}
			else
			{
				min1i = pos2;
				pos2++;
			}

			//Find the second smallest node
			if (pos1 >= 0)
			{
				if (count[pos1] < count[pos2])
				{
					min2i = pos1;
					pos1--;
				}
				else
				{
					min2i = pos2;
					pos2++;
				}
			}
			else
			{
				min2i = pos2;
				pos2++;
			}

			count[vocab_size + i] = count[min1i] + count[min2i];

			assert(min1i >= 0);
			assert(min1i < static_cast<int>(vocab_size) * 2 - 1);
			assert(min2i >= 0);
			assert(min2i < static_cast<int>(vocab_size) * 2 - 1);
			parent_node[min1i] = vocab_size + i;
			parent_node[min2i] = vocab_size + i;
			binary[min2i] = 1;
		}
		assert(pos1 < 0);

		//Generate the huffman code for each leaf node
		hufflabel_info_.clear();
		for (unsigned a = 0; a < vocab_size; ++a)
			hufflabel_info_.push_back(HuffLabelInfo());
		for (unsigned a = 0; a < vocab_size; a++)
		{
			unsigned b = a, i = 0;
			while (1)
			{
				assert(i < kMaxCodeLength);
				code[i] = binary[b];
				point[i] = b;
				i++;
				b = parent_node[b];
				if (b == vocab_size * 2 - 2) break;
			}
			unsigned cur_word = ordered_words[a].first;

			hufflabel_info_[cur_word].codelen = i;
			hufflabel_info_[cur_word].point.push_back(vocab_size - 2);

			for (b = 0; b < i; b++)
			{
				hufflabel_info_[cur_word].code.push_back(code[i - b - 1]);
				if (b)
					hufflabel_info_[cur_word].point.push_back(point[i - b] - vocab_size);
			}
		}

		delete[] count;
		count = nullptr;
		delete[] binary;
		binary = nullptr;
		delete[] parent_node;
		parent_node = nullptr;
	}

	//Firstly get the dictionary from file
	void HuffmanEncoder::BuildFromTermFrequency(const char* filename)
	{
		FILE* fid;
		fid = fopen(filename, "r");
		if (fid)
		{
			char sz_label[kMaxWordSize];
			dict_ = new (std::nothrow)Dictionary();
			assert(dict_ != nullptr);
			//while (fscanf_s(fid, "%s", sz_label, kMaxWordSize) != EOF)
			while (fscanf(fid, "%s", sz_label) != EOF)
			{
				HuffLabelInfo info;
				int freq;
				fscanf(fid, "%d", &freq);
				dict_->Insert(sz_label, freq);
			}
			fclose(fid);

			BuildHuffmanTreeFromDict();
		}
		else
		{
			multiverso::Log::Error("file open failed %s", filename);
		}
	}

	void HuffmanEncoder::BuildFromTermFrequency(Dictionary* dict)
	{
		dict_ = dict;
		BuildHuffmanTreeFromDict();
	}

	int HuffmanEncoder::GetLabelSize()
	{
		return dict_->Size();
	}

	//Get the label index
	int HuffmanEncoder::GetLabelIdx(const char* label)
	{
		return dict_->GetWordIdx(label);
	}

	HuffLabelInfo* HuffmanEncoder::GetLabelInfo(char* label)
	{
		int idx = GetLabelIdx(label);
		if (idx == -1)
			return nullptr;
		return GetLabelInfo(idx);
	}

	HuffLabelInfo* HuffmanEncoder::GetLabelInfo(int label_idx)
	{
		if (label_idx == -1) return nullptr;
		return &hufflabel_info_[label_idx];
	}

	//Get the dictionary
	Dictionary* HuffmanEncoder::GetDict()
	{
		return dict_;
	}
}
}
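//Because ordered_words is sorted by descending frequency, the tree is built
//with the classic two-pointer merge: pos1 walks the sorted leaves and pos2
//walks the already-merged internal nodes, so each of the vocab_size - 1
//merges finds its two smallest nodes in O(1) and the whole build is linear
//after the initial sort.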
|
|
@ -1,5 +1,5 @@
|
|||
#ifndef DISTRIBUTED_WORD_EMBEDDING_HUFFMAN_ENCODER_H_
|
||||
#define DISTRIBUTED_WORD_EMBEDDING_HUFFMAN_ENCODER_H_
|
||||
#pragma once
|
||||
|
||||
/*!
|
||||
* \brief Class Huffman_encoder stores the huffman_encode of the vocabulary according to the dictionary
|
||||
*/
|
||||
|
@ -11,63 +11,63 @@
|
|||
#include "dictionary.h"
|
||||
#include "constant.h"
|
||||
|
||||
|
||||
namespace multiverso
|
||||
{
|
||||
namespace wordembedding
{
	struct HuffLabelInfo
	{	/*!
		* \brief Internal node ids in the code path
		*/
		std::vector<int> point;
		/*!
		* \brief Huffman code
		*/
		std::vector<char> code;
		int codelen;
		HuffLabelInfo()
		{
			codelen = 0;
			point.clear();
			code.clear();
		}
	};

	class HuffmanEncoder
	{
	public:
		HuffmanEncoder();
		/*!
		* \brief Save the word-huffmancode in the file
		*/
		void Save2File(const char* filename);
		/*!
		* \brief Recover the word-huffmancode from the file
		*/
		void RecoverFromFile(const char* filename);
		/*!
		* \brief Get the dictionary file and build
		* hufflabel_info from the dictionary
		*/
		void BuildFromTermFrequency(const char* filename);
		void BuildFromTermFrequency(Dictionary* dict);
		/*!
		* \brief Get the label size
		*/
		int GetLabelSize();
		/*!
		* \brief Get the label's index
		*/
		int GetLabelIdx(const char* label);
		HuffLabelInfo* GetLabelInfo(char* label);
		HuffLabelInfo* GetLabelInfo(int label_idx);
		Dictionary* GetDict();

	private:
		void BuildHuffmanTreeFromDict();
		std::vector<HuffLabelInfo> hufflabel_info_;
		Dictionary* dict_;
	};
}
}
#endif
|
src/main.cpp
|
@ -1,46 +1,42 @@
|
|||
#include <cstring>
|
||||
#include <cmath>
|
||||
|
||||
#include <thread>
|
||||
#include <string>
|
||||
#include <iostream>
|
||||
#include <cstring>
|
||||
#include <cmath>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
#include <sstream>
|
||||
#include <new>
|
||||
//#include <vld.h>
|
||||
|
||||
#include "multiverso/util/log.h"
|
||||
#include "multiverso/multiverso.h"
|
||||
#include "distributed_wordembedding.h"
|
||||
#include "memory_manager.h"
|
||||
|
||||
#include "dictionary.h"
|
||||
#include "huffman_encoder.h"
|
||||
#include "util.h"
|
||||
#include "reader.h"
|
||||
#include "multiverso.h"
|
||||
#include "barrier.h"
|
||||
#include "distributed_wordembedding.h"
|
||||
#include "parameter_loader.h"
|
||||
#include "trainer.h"
|
||||
#include "word_embedding.h"
|
||||
#include "memory_manager.h"
|
||||
|
||||
using namespace multiverso;
|
||||
using namespace wordembedding;
|
||||
|
||||
int main(int argc, char *argv[])
{
	try
	{
		Distributed_wordembedding *ptr = new (std::nothrow)Distributed_wordembedding();
		assert(ptr != nullptr);
		ptr->Run(argc, argv);
		delete ptr;
	}
	catch (std::bad_alloc &memExp)
	{
		multiverso::Log::Info("Something wrong with new() %s\n", memExp.what());
	}
	catch (...)
	{
		multiverso::Log::Info("Something wrong with other reason!\n");
	}
	system("PAUSE");
	return 0;
}

int main(int argc, char *argv[])
{
	try
	{
		Distributed_wordembedding *ptr = new (std::nothrow)Distributed_wordembedding();
		assert(ptr != nullptr);
		ptr->Run(argc, argv);
	}
	catch (std::bad_alloc &memExp)
	{
		multiverso::Log::Info("Something wrong with new() %s\n", memExp.what());
	}
	catch (...)
	{
		multiverso::Log::Info("Something wrong with other reason!\n");
	}
	return 0;
}
|
||||
|
||||
|
|
|
@ -2,33 +2,33 @@
|
|||
|
||||
namespace multiverso
|
||||
{
|
||||
namespace wordembedding
{
	MemoryManager::MemoryManager(int block_size)
	{
		block_size_ = block_size;
	}
	//Request memory for blocks
	void MemoryManager::RequestBlocks(int64 block_number, std::vector<real*>& result)
	{
		std::unique_lock<std::mutex> lock(mutex_);
		for (int64 i = 0; i < block_number; ++i)
		{
			result.push_back(new (std::nothrow) real[block_size_]);
			assert(result[i] != nullptr);
		}
	}
	//Free the memory for blocks
	void MemoryManager::ReturnBlocks(std::vector<real*>& blocks)
	{
		std::unique_lock<std::mutex> lock(mutex_);
		for (size_t i = 0; i < blocks.size(); ++i)
			delete[] blocks[i];
	}

	MemoryManager::~MemoryManager()
	{
	}
}
}
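//RequestBlocks/ReturnBlocks serialize on mutex_ because several threads can
//request and return embedding buffers concurrently; the manager keeps no
//free list, it simply wraps new[]/delete[] of block_size_ reals.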
|
|
@ -1,48 +1,46 @@
|
|||
#ifndef DISTRIBUTED_WORD_EMBEDDING_MEMORY_MANAGER_H_
|
||||
#define DISTRIBUTED_WORD_EMBEDDING_MEMORY_MANAGER_H_
|
||||
#pragma once
|
||||
|
||||
/*!
|
||||
* file memory_manager.h
|
||||
* \brief Class MemoryManager creates and allocates memory for the local parameter which is needed by the datablock training.
|
||||
*/
|
||||
|
||||
#include <vector>
|
||||
#include <condition_variable>
|
||||
#include <cassert>
|
||||
#include <malloc.h>
|
||||
#include <cstring>
|
||||
|
||||
#include <vector>
|
||||
#include <condition_variable>
|
||||
|
||||
#include "constant.h"
|
||||
|
||||
namespace multiverso
|
||||
{
|
||||
namespace wordembedding
{
	class MemoryManager
	{
	public:
		MemoryManager(int block_size);
		/*!
		* \brief Create memory for the blocks
		* \param block_number the block quantity needed
		* \param result the vector of the head address of allocated memory
		*/
		void RequestBlocks(int64 block_number, std::vector<real*>& result);
		/*!
		* \brief Delete the blocks memory
		* \param blocks the vector of the head address of allocated memory
		*/
		void ReturnBlocks(std::vector<real*>& blocks);
		~MemoryManager();

	private:
		int64 block_size_;
		std::mutex mutex_;

		// No copying allowed
		MemoryManager(const MemoryManager&);
		void operator=(const MemoryManager&);
	};
}
}
|
||||
#endif
|
||||
|
|

@@ -0,0 +1,80 @@
#include "parameter_loader.h"

namespace multiverso
{
namespace wordembedding
{
	ParameterLoader::ParameterLoader(Option *option,
		WordEmbedding *WordEmbedding)
	{
		option_ = option;
		WordEmbedding_ = WordEmbedding;

		parse_and_request_count_ = 0;

		//The log which stores the begin and end times of ParseAndRequest
		char log_name[100];
		sprintf(log_name, "parameter_loader%s.txt", g_log_suffix.c_str());
		log_file_ = fopen(log_name, "w");
	}

	void ParameterLoader::ParseAndRequest(
		multiverso::DataBlockBase *data_block)
	{
		if (parse_and_request_count_ == 0)
		{
			start_ = clock();
		}

		fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
		multiverso::Log::Info("Rank %d ParameterLoader begin %d\n",
			multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
		++parse_and_request_count_;

		DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
		//Step 1, compute the parameters which will be used when the trainers begin
		std::vector<int> input_nodes;
		std::vector<int> output_nodes;
		//input_nodes, output_nodes
		multiverso::Log::Debug("Rank %d ParameterLoader parse begin %d\n",
			multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
		WordEmbedding_->PrepareParameter(data);
		multiverso::Log::Debug("Rank %d ParameterLoader parse end %d\n",
			multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
		//Step 2, request the parameters
		multiverso::Log::Debug("Rank %d ParameterLoader request begin %d\n",
			multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
		RequestParameter(data);
		multiverso::Log::Debug("Rank %d ParameterLoader request end %d\n",
			multiverso::Multiverso::ProcessRank(), parse_and_request_count_);
		//Step 3, store the needed parameters in data_block

		multiverso::Log::Info("Rank %d ParameterLoader finish %d\n",
			multiverso::Multiverso::ProcessRank(), parse_and_request_count_ - 1);
		fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
		fflush(log_file_);
	}

	void ParameterLoader::RequestParameter(DataBlock *data_block)
	{
		//If the data_block is the last one, we need to dump
		//the input-embedding weights
		if (data_block->Type() == DataBlockType::Test)
			RequestTable(kInputEmbeddingTableId);

		RequestRow(kWordCountActualTableId, 0);
		for (auto node : data_block->input_nodes)
			RequestRow(kInputEmbeddingTableId, node);
		for (auto node : data_block->output_nodes)
			RequestRow(kEmbeddingOutputTableId, node);

		if (option_->use_adagrad)
		{
			for (auto node : data_block->input_nodes)
				RequestRow(kSumGradient2IETableId, node);
			for (auto node : data_block->output_nodes)
				RequestRow(kSumGradient2EOTableId, node);
		}
	}
}
}
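The loader exists so that parameter requests for the next data block can overlap with training on the current one. Purely for illustration (not part of this commit), the producer/consumer shape of that overlap looks like the following with standard threads and a toy queue; the real pipeline is driven by the multiverso scheduler, not by this code.

#include <condition_variable>
#include <mutex>
#include <queue>
#include <thread>

struct ToyBlock { /* sentences plus the parameter rows requested for them */ };

std::queue<ToyBlock*> ready;        // blocks whose parameters have arrived
std::mutex mtx;
std::condition_variable not_empty;

void LoaderThread()
{
    // parse the block, request its rows, then hand it to the trainers
    ToyBlock* block = new ToyBlock();
    {
        std::lock_guard<std::mutex> lock(mtx);
        ready.push(block);
    }
    not_empty.notify_one();
}

void TrainerThread()
{
    std::unique_lock<std::mutex> lock(mtx);
    not_empty.wait(lock, []{ return !ready.empty(); });
    ToyBlock* block = ready.front();
    ready.pop();
    lock.unlock();
    // train on this block while the loader prepares the next one
    delete block;
}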

@@ -0,0 +1,53 @@
#pragma once

/*!
* file parameter_loader.h
* \brief Class ParameterLoader parses the data block and requests the parameters from the multiverso server
*/

#include "multiverso.h"
#include "data_block.h"
#include "constant.h"
#include "util.h"
#include "huffman_encoder.h"
#include "word_embedding.h"
#include "log.h"

namespace multiverso
{
namespace wordembedding
{
	class WordEmbedding;
	extern std::string g_log_suffix;

	class ParameterLoader : public multiverso::ParameterLoaderBase
	{
	public:
		ParameterLoader(){}
		ParameterLoader(Option *option, WordEmbedding *WordEmbedding);
		/*!
		* \brief Parse the data block to get the parameters needed
		* \param data_block which is pushed in
		*/
		void ParseAndRequest(multiverso::DataBlockBase* data_block) override;

	private:
		Option *option_;
		WordEmbedding *WordEmbedding_;
		int parse_and_request_count_;
		clock_t start_;
		FILE* log_file_;
		/*!
		* \brief Request the parameters from the multiverso server to the local buffer
		* \param data_block which is pushed in
		* \param input_nodes stores the input words' indexes
		* \param output_nodes stores the output words' indexes
		*/
		void RequestParameter(DataBlock *data_block);
		//No copying allowed
		ParameterLoader(const ParameterLoader&);
		void operator=(const ParameterLoader&);
	};
}
}

208 src/reader.cpp

@@ -2,115 +2,115 @@

namespace multiverso
{
namespace wordembedding
{
	Reader::Reader(Dictionary *dictionary, Option *option,
		Sampler *sampler, const char *input_file)
	{
		dictionary_ = dictionary;
		option_ = option;
		sampler_ = sampler;

		stopwords_table_.clear();
		if (option_->stopwords)
		{
			FILE* fid = fopen(option_->sw_file, "r");
			if (fid == nullptr)
			{
-				//multiverso::Log::Fatal("Open sw_file failed!\n");
+				multiverso::Log::Fatal("Open sw_file failed!\n");
				exit(1);
			}
			while (ReadWord(word_, fid))
			{
				stopwords_table_.insert(word_);
			}

			fclose(fid);
		}

		file_ = fopen(input_file, "r");
		if (file_ == nullptr)
		{
-			//multiverso::Log::Fatal("Open train_file failed!\n");
+			multiverso::Log::Fatal("Open train_file failed!\n");
			exit(1);
		}
	}

	Reader::~Reader()
	{
		if (file_ != nullptr)
			fclose(file_);
	}
	//Get a sentence by connecting the extracted words
	int Reader::GetSentence(int *sentence, int64 &word_count)
	{
		int length = 0, word_idx;
		word_count = 0;
		while (1)
		{
			if (!ReadWord(word_, file_))
				break;
			word_idx = dictionary_->GetWordIdx(word_);
			if (word_idx == -1)
				continue;
			word_count++;
			if (option_->stopwords && stopwords_table_.count(word_))
				continue;
			if (option_->sample > 0 &&
				!sampler_->WordSampling(
				dictionary_->GetWordInfo(word_idx)->freq,
				option_->total_words, option_->sample))
				continue;
			sentence[length++] = word_idx;
			if (length >= kMaxSentenceLength)
				break;
		}

		return length;
	}

	void Reader::ResetStart()
	{
		fseek(file_, 0, SEEK_SET);
	}

	void Reader::ResetSize(int64 size)
	{
		byte_count_ = 0;
		byte_size_ = size;
	}
	//Read words from the file
	bool Reader::ReadWord(char *word, FILE *fin)
	{
		int idx = 0;
		char ch;
		while (!feof(fin) && byte_count_ < byte_size_)
		{
			ch = fgetc(fin);
			++byte_count_;
			if (ch == 13) continue;
			if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
			{
				if (idx > 0)
				{
					if (ch == '\n')
						ungetc(ch, fin);
					break;
				}
				if (ch == '\n')
				{
					strcpy(word, (char *)"</s>");
					return true;
				}
				else continue;
			}
			word[idx++] = ch;
			//Truncate words that are too long
			if (idx >= kMaxString - 1)
				idx--;
		}
		word[idx] = 0;
		return idx != 0;
	}
}
}
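For illustration (not part of this commit), a trimmed, self-contained copy of the tokenizer above shows what it produces for a short buffer; the byte_count_ bookkeeping is omitted and EOF is checked on the read itself.

#include <cstdio>
#include <cstring>

// Simplified copy of Reader::ReadWord without the byte_count_ bookkeeping
static bool ReadWordSketch(char* word, FILE* fin)
{
    int idx = 0;
    int ch;
    while ((ch = fgetc(fin)) != EOF)
    {
        if (ch == 13) continue;                   // skip '\r'
        if (ch == ' ' || ch == '\t' || ch == '\n')
        {
            if (idx > 0)
            {
                if (ch == '\n') ungetc(ch, fin);  // re-deliver '\n' as "</s>"
                break;
            }
            if (ch == '\n') { strcpy(word, "</s>"); return true; }
            continue;
        }
        word[idx++] = (char)ch;
        if (idx >= 99) idx--;                     // truncate overly long words
    }
    word[idx] = 0;
    return idx != 0;
}

int main()
{
    FILE* f = tmpfile();
    fputs("the quick\r fox\n", f);
    rewind(f);
    char word[100];
    while (ReadWordSketch(word, f))
        printf("[%s]\n", word);  // prints [the] [quick] [fox] [</s>]
    fclose(f);
    return 0;
}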
77 src/reader.h

@@ -1,5 +1,5 @@
-#ifndef DISTRIBUTED_WORD_EMBEDDING_READER_H_
-#define DISTRIBUTED_WORD_EMBEDDING_READER_H_
+#pragma once

/*!
* file reader.h
* \brief Class Reader helps the function LoadData to fill the data block

@@ -14,42 +14,41 @@

namespace multiverso
{
namespace wordembedding
{
	class Reader
	{
	public:
		Reader(Dictionary *dictionary, Option *option,
			Sampler *sampler, const char *input_file);
		~Reader();
		/*!
		* \brief Get a sentence from the train_file
		* \param sentence saves the sentence as word indexes according to the dictionary
		* \param word_count counts the sentence length
		*/
		int GetSentence(int *sentence, int64 &word_count);
		void ResetStart();
		void ResetSize(int64 size);

	private:
		const Option *option_;
		FILE* file_;
		char word_[kMaxString + 1];
		Dictionary *dictionary_;
		Sampler *sampler_;
		int64 byte_count_, byte_size_;
		std::unordered_set<std::string> stopwords_table_;
		/*!
		* \brief Read words from the train_file
		* \param word stores the extracted word
		* \param file the train_file pointer
		*/
		bool ReadWord(char *word, FILE *file);

		//No copying allowed
		Reader(const Reader&);
		void operator=(const Reader&);
	};
}
}
-#endif
349 src/trainer.cpp

@@ -1,56 +1,315 @@
#include "trainer.h"
namespace multiverso
{
namespace wordembedding
{
-	Trainer::Trainer(int trainer_id, Option *option,
-		Dictionary* dictionary, WordEmbedding* WordEmbedding)
+	Trainer::Trainer(int trainer_id, Option *option,
+		multiverso::Barrier *barrier,
+		Dictionary* dictionary, WordEmbedding* WordEmbedding,
+		MemoryManager* memory_mamanger)
	{
		trainer_id_ = trainer_id;
		option_ = option;
		word_count = 0;
		WordEmbedding_ = WordEmbedding;
+		barrier_ = barrier;
		dictionary_ = dictionary;
+		memory_mamanger_ = memory_mamanger;
		hidden_act_ = (real *)calloc(option_->embeding_size, sizeof(real));
		hidden_err_ = (real *)calloc(option_->embeding_size, sizeof(real));
		process_count_ = -1;
		process_id_ = -1;
		assert(hidden_act_ != nullptr);
		assert(hidden_err_ != nullptr);
		start_ = 0;
		train_count_ = 0;
-		if (trainer_id_ == 0)
-		{
-			//The log which records the begin and end time of TrainIteration()
-			char log_name[100];
-			sprintf(log_name, "trainer%s.txt", g_log_suffix.c_str());
-			log_file_ = fopen(log_name, "w");
-		}
+		/*
+		if (trainer_id_ == 0)
+		{
+			//The log which records the begin and end time of TrainIteration()
+			char log_name[100];
+			sprintf(log_name, "trainer%s.txt", g_log_suffix.c_str());
+			log_file_ = fopen(log_name, "w");
+		}
+		*/
	}

-	void Trainer::TrainIteration(DataBlock *data_block)
-	{
-		if (process_id_ == -1)
-			process_id_ = multiverso::MV_Rank();
-
-		if (data_block == nullptr){
-			return;
-		}
-		clock_t start = clock();
-		multiverso::Log::Info("Rank %d Train %d Begin TrainIteration%d ...\n",
-			process_id_, trainer_id_, train_count_);
-		++train_count_;
-		multiverso::Log::Info("Rank %d Train %d TrainNN Begin TrainIteration%d ...\n",
-			process_id_, trainer_id_, train_count_);
-		WordEmbedding_->Train(data_block, trainer_id_, option_->thread_cnt,
-			word_count, hidden_act_, hidden_err_);
-		multiverso::Log::Info("Rank %d Trainer %d training time:%lfs\n", process_id_, trainer_id_,
-			(clock() - start) / (double)CLOCKS_PER_SEC);
-		train_count_++;
-	}

+	void Trainer::TrainIteration(multiverso::DataBlockBase *data_block)
+	{
+		if (process_id_ == -1)
+			process_id_ = multiverso::Multiverso::ProcessRank();
+		if (trainer_id_ == 0)
+			//Record the starting time of the TrainIteration
+			fprintf(log_file_, "%lf\n", (clock()) / (double)CLOCKS_PER_SEC);
+		//Compute the total number of processes
+		if (process_count_ == -1)
+			process_count_ = multiverso::Multiverso::TotalProcessCount();
+
+		DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
+		std::vector<int> input_nodes(data->input_nodes.begin(), data->input_nodes.end());
+		std::vector<int> output_nodes(data->output_nodes.begin(), data->output_nodes.end());
+		//A trainer only copies or adds a part of the parameters.
+		//This trainer should copy or add the parameters according to
+		//local_input_nodes and local_output_nodes
+		std::vector<int> local_input_nodes;
+		std::vector<int> local_output_nodes;
+		for (int i = trainer_id_; i < input_nodes.size(); i += option_->thread_cnt)
+			local_input_nodes.push_back(input_nodes[i]);
+		for (int i = trainer_id_; i < output_nodes.size(); i += option_->thread_cnt)
+			local_output_nodes.push_back(output_nodes[i]);
+
+		if (trainer_id_ == 0)
+		{
+			multiverso::Log::Info("Rank %d input_size=%d, output_size=%d\n",
+				process_id_, input_nodes.size(), output_nodes.size());
+		}
+
+		//Step 1, copy the parameters from multiverso to WordEmbedding_.
+		//One trainer only copies a part of the parameters
+		multiverso::Log::Debug("Rank %d Train %d Copyparameter Begin TrainIteration%d ...\n",
+			process_id_, trainer_id_, train_count_);
+		CopyParameter(local_input_nodes, local_output_nodes);
+		if (trainer_id_ == 0)
+		{
+			multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
+			WordEmbedding_->word_count_actual = copy_row.At(0);
+			WordEmbedding_->UpdateLearningRate();
+		}
+		multiverso::Log::Debug("Rank %d Train %d Copyparameter end TrainIteration%d ...\n",
+			process_id_, trainer_id_, train_count_);
+		//Wait for all the trainers to finish copying the parameters
+		barrier_->Wait();
+
+		//Step 2, after finishing copying the parameters,
+		//use WordEmbedding_ to train a part of the data_block
+		int64 last_word_count = word_count;
+		clock_t start = clock();
+		multiverso::Log::Debug("Rank %d Train %d TrainNN Begin TrainIteration%d ...\n",
+			process_id_, trainer_id_, train_count_);
+		WordEmbedding_->Train(data, trainer_id_, option_->thread_cnt,
+			word_count, hidden_act_, hidden_err_);
+		if (word_count > last_word_count)
+		{
+			multiverso::Log::Info("TrainNNSpeed: Words/thread/second %lfk\n",
+				((double)word_count - last_word_count) /
+				(clock() - start) * (double)CLOCKS_PER_SEC / 1000);
+		}
+		multiverso::Log::Debug("Rank %d Train %d TrainNN end TrainIteration%d ...\n",
+			process_id_, trainer_id_, train_count_);
+		//Wait for all the trainers to finish training
+		barrier_->Wait();
+		multiverso::Log::Debug("Rank %d Train %d AddDeltaParameter Begin TrainIteration%d ...\n",
+			process_id_, trainer_id_, train_count_);
+		//Step 3, after finishing training, add the delta of the parameters to multiverso
+		AddDeltaParameter(local_input_nodes, local_output_nodes);
+		if (trainer_id_ == 0)
+		{
+			multiverso::Row<int64> &copy_row = GetRow<int64>(kWordCountActualTableId, 0);
+			Add<int64>(kWordCountActualTableId, 0, 0, WordEmbedding_->word_count_actual - copy_row.At(0));
+		}
+		multiverso::Log::Debug("Rank %d Train %d AddDeltaParameter end TrainIteration%d ...\n",
+			process_id_, trainer_id_, train_count_);
+
+		//If the data_block is the last one, dump the input-embedding weights
+		if (data->Type() == DataBlockType::Test && trainer_id_ == 0)
+		{
+			SaveEmbedding(option_->output_file, option_->output_binary);
+		}
+
+		if (trainer_id_ == 0)
+		{
+			fprintf(log_file_, "%lf\n",
+				(clock()) / (double)CLOCKS_PER_SEC);
+			fflush(log_file_);
+		}
+	}
+
+	void Trainer::CopyRow(real* ptr, multiverso::Row<real>& row, int size)
+	{
+		for (int i = 0; i < size; ++i)
+			ptr[i] = row.At(i);
+	}
+
+	void Trainer::CopyParameter(std::vector<int>& input_nodes,
+		std::vector<int>& output_nodes)
+	{
+		//Compute the number of memory blocks needed to store the parameters
+		std::vector<real*> blocks;
+		int current_block = 0;
+		size_t total_blocks = (input_nodes.size() + output_nodes.size());
+		if (option_->use_adagrad)
+			total_blocks *= 2;
+
+		//Request blocks to store the parameters
+		memory_mamanger_->RequestBlocks(total_blocks, blocks);
+		assert(blocks.size() == total_blocks);
+		if (blocks.size() != total_blocks)
+		{
+			multiverso::Log::Error("Rank %d Trainer %d failed to request blocks in CopyParameter, allocated_blocks_num=%lld, needed_blocks_num=%lld\n",
+				multiverso::Multiverso::ProcessRank(), trainer_id_, blocks.size(), total_blocks);
+			return;
+		}
+
+		//Copy the input-embedding weights from multiverso to WordEmbedding
+		for (int i = 0; i < input_nodes.size(); ++i)
+		{
+			real* ptr = blocks[current_block++];
+			assert(ptr != nullptr);
+			CopyRow(ptr, GetRow<real>(kInputEmbeddingTableId,
+				input_nodes[i]), option_->embeding_size);
+
+			WordEmbedding_->SetWeightIE(input_nodes[i], ptr);
+		}
+
+		//Copy the embedding-output weights from multiverso to WordEmbedding
+		for (int i = 0; i < output_nodes.size(); ++i)
+		{
+			real* ptr = blocks[current_block++];
+			assert(ptr != nullptr);
+			CopyRow(ptr, GetRow<real>(kEmbeddingOutputTableId,
+				output_nodes[i]), option_->embeding_size);
+
+			WordEmbedding_->SetWeightEO(output_nodes[i], ptr);
+		}
+
+		if (option_->use_adagrad)
+		{
+			//Copy the input-embedding sum of squares of the gradient
+			for (int i = 0; i < input_nodes.size(); ++i)
+			{
+				real* ptr = blocks[current_block++];
+				assert(ptr != nullptr);
+				CopyRow(ptr, GetRow<real>(kSumGradient2IETableId,
+					input_nodes[i]), option_->embeding_size);
+
+				WordEmbedding_->SetSumGradient2IE(input_nodes[i], ptr);
+			}
+
+			//Copy the embedding-output sum of squares of the gradient
+			for (int i = 0; i < output_nodes.size(); ++i)
+			{
+				real* ptr = blocks[current_block++];
+				assert(ptr != nullptr);
+				CopyRow(ptr, GetRow<real>(kSumGradient2EOTableId,
+					output_nodes[i]), option_->embeding_size);
+
+				WordEmbedding_->SetSumGradient2EO(output_nodes[i], ptr);
+			}
+		}
+	}
+
+	void Trainer::AddRow(real* ptr, int table_id, int row_id, int size)
+	{
+		multiverso::Row<real>& row = GetRow<real>(table_id, row_id);
+		for (int i = 0; i < size; ++i)
+		{
+			real delta = (ptr[i] - row.At(i)) / process_count_;
+			if (fabs(delta) > kEps)
+				Add<real>(table_id, row_id, i, delta);
+		}
+	}
+
+	//Add the deltas to the local buffer and send them to the parameter server
+	void Trainer::AddDeltaParameter(std::vector<int>& input_nodes,
+		std::vector<int>& output_nodes)
+	{
+		std::vector<real*> blocks;
+		for (int i = 0; i < input_nodes.size(); ++i)
+		{
+			real* ptr = WordEmbedding_->GetWeightIE(input_nodes[i]);
+			assert(ptr != nullptr);
+			AddRow(ptr, kInputEmbeddingTableId, input_nodes[i],
+				option_->embeding_size);
+
+			blocks.push_back(ptr);
+		}
+
+		for (int i = 0; i < output_nodes.size(); ++i)
+		{
+			real* ptr = WordEmbedding_->GetWeightEO(output_nodes[i]);
+			assert(ptr != nullptr);
+			AddRow(ptr, kEmbeddingOutputTableId, output_nodes[i],
+				option_->embeding_size);
+			blocks.push_back(ptr);
+		}
+
+		if (option_->use_adagrad)
+		{
+			for (int i = 0; i < input_nodes.size(); ++i)
+			{
+				real* ptr = WordEmbedding_->GetSumGradient2IE(input_nodes[i]);
+				assert(ptr != nullptr);
+				AddRow(ptr, kSumGradient2IETableId, input_nodes[i],
+					option_->embeding_size);
+				blocks.push_back(ptr);
+			}
+
+			for (int i = 0; i < output_nodes.size(); ++i)
+			{
+				real* ptr = WordEmbedding_->GetSumGradient2EO(output_nodes[i]);
+				assert(ptr != nullptr);
+				AddRow(ptr, kSumGradient2EOTableId, output_nodes[i],
+					option_->embeding_size);
+				blocks.push_back(ptr);
+			}
+		}
+
+		//Return all the memory blocks
+		memory_mamanger_->ReturnBlocks(blocks);
+	}
+
+	void Trainer::SaveEmbedding(const char *file_path, bool is_binary)
+	{
+		FILE* fid = nullptr;
+		if (is_binary)
+		{
+			fid = fopen(file_path, "wb");
+			fprintf(fid, "%d %d\n", dictionary_->Size(), option_->embeding_size);
+			for (int i = 0; i < dictionary_->Size(); ++i)
+			{
+				fprintf(fid, "%s ",
+					dictionary_->GetWordInfo(i)->word.c_str());
+
+				multiverso::Row<real>& embedding = GetRow<real>(
+					kInputEmbeddingTableId, i);
+
+				for (int j = 0; j < option_->embeding_size; ++j)
+				{
+					real tmp = embedding.At(j);
+					fwrite(&tmp, sizeof(real), 1, fid);
+				}
+
+				fprintf(fid, "\n");
+			}
+
+			fclose(fid);
+		}
+		else
+		{
+			fid = fopen(file_path, "wt");
+			fprintf(fid, "%d %d\n", dictionary_->Size(), option_->embeding_size);
+			for (int i = 0; i < dictionary_->Size(); ++i)
+			{
+				fprintf(fid, "%s ", dictionary_->GetWordInfo(i)->word.c_str());
+				multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, i);
+
+				for (int j = 0; j < option_->embeding_size; ++j)
+					fprintf(fid, "%lf ", embedding.At(j));
+
+				fprintf(fid, "\n");
+			}
+
+			fclose(fid);
+		}
+	}
}
}
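A note on the delta rule in AddRow above: each process pushes (local - server) / process_count, so that once every process has contributed, the server row moves to the average of the local copies. Illustrative only (not part of this commit), and simplified in that all processes are shown diffing against the same fetched baseline and applying their adds in sequence:

#include <cmath>
#include <cstdio>

int main()
{
    const double kEpsSketch = 1e-10;   // stands in for kEps from constant.h
    double server = 0.50;              // value each process fetched at copy time
    double local[3] = {0.56, 0.44, 0.53};
    int process_count = 3;
    for (int p = 0; p < process_count; ++p)
    {
        double delta = (local[p] - server) / process_count;
        if (std::fabs(delta) > kEpsSketch)
            server += delta;           // plays the role of Add<real>(table, row, col, delta)
    }
    std::printf("server=%f\n", server);  // 0.51, the mean of the three local rows
    return 0;
}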

@@ -1,53 +1,80 @@
-#ifndef DISTRIBUTED_WORD_EMBEDDING_TRAINER_H_
-#define DISTRIBUTED_WORD_EMBEDDING_TRAINER_H_
+#pragma once

/*!
* file trainer.h
* \brief Class Trainer trains the model in every train iteration
*/

-#include "multiverso/multiverso.h"
-#include "multiverso/updater/updater.h"
-#include "multiverso/table/matrix_table.h"
+#include <thread>
+#include <chrono>
+
+#include "multiverso.h"
+#include "data_block.h"
+#include "constant.h"
+#include "util.h"
+#include "huffman_encoder.h"
+#include "word_embedding.h"
+#include "data_block.h"
+#include "memory_manager.h"
+#include "barrier.h"

namespace multiverso
{
namespace wordembedding
{
	class WordEmbedding;
	extern std::string g_log_suffix;
-	class Trainer{
+	class Trainer : public multiverso::TrainerBase
+	{
	public:
		int64 word_count;
-		Trainer(int trainer_id, Option *option,
-			Dictionary* dictionary, WordEmbedding* WordEmbedding);
+		Trainer(int trainer_id, Option *option, Barrier* barrier,
+			Dictionary* dictionary, WordEmbedding* WordEmbedding,
+			MemoryManager* memory_mamanger);
		/*!
		* \brief Train one data block
		*/
-		void TrainIteration(DataBlock * data_block);
+		void TrainIteration(multiverso::DataBlockBase* data_block) override;

	private:
		int process_count_;
		int process_id_;
		int trainer_id_;
		Option *option_;
		real *hidden_act_, *hidden_err_;
		WordEmbedding* WordEmbedding_;
+		multiverso::Barrier *barrier_;
		Dictionary* dictionary_;
+		MemoryManager* memory_mamanger_;
		int train_count_;
		clock_t start_, now_;
		FILE* log_file_;

+		/*!
+		* \brief Save the input-embedding vectors in file_path
+		* \param file_path
+		* \param is_binary the format of the file:
+		* 1 - save the vectors in binary format,
+		* 2 - save the vectors in ASCII format
+		*/
+		void SaveEmbedding(const char *file_path, bool is_binary);
+		/*!
+		* \brief Copy the needed parameters from the buffer to blocks
+		*/
+		void CopyRow(real* ptr, multiverso::Row<real>& row, int size);
+		void CopyParameter(std::vector<int>& input_nodes,
+			std::vector<int>& output_nodes);
+		/*!
+		* \brief Add the delta to the parameters stored in the
+		* buffer and send it to multiverso
+		*/
+		void AddRow(real* ptr, int table_id,
+			int row_id, int size);
+		void AddDeltaParameter(std::vector<int>& input_nodes,
+			std::vector<int>& output_nodes);

		//No copying allowed
		Trainer(const Trainer&);
		void operator=(const Trainer&);
	};
}
}
-#endif
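The two barrier_->Wait() calls in TrainIteration split every iteration into three lockstep phases across trainer threads: copy, train, add-delta. Illustrative only (not the commit's code, which uses multiverso::Barrier): the same phase structure sketched with std::barrier, which requires C++20.

#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main()
{
    const int trainers = 4;
    std::barrier sync(trainers);
    std::vector<std::thread> pool;
    for (int id = 0; id < trainers; ++id)
        pool.emplace_back([&sync, id]{
            // Step 1: each trainer copies its share of the parameter rows
            std::printf("trainer %d copy\n", id);
            sync.arrive_and_wait();   // all copies finish before any training starts
            // Step 2: train on an interleaved slice of the data block
            std::printf("trainer %d train\n", id);
            sync.arrive_and_wait();   // all training finishes before deltas are pushed
            // Step 3: push the parameter deltas back to the server
            std::printf("trainer %d add delta\n", id);
        });
    for (auto& t : pool) t.join();
    return 0;
}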
422 src/util.cpp

@@ -1,256 +1,192 @@
#include <time.h>
#include "util.h"

namespace multiverso
{
namespace wordembedding
{
	Option::Option()
	{
-		train_file = nullptr;
-		read_vocab_file = nullptr;
-		output_file = nullptr;
-		sw_file = nullptr;
+		train_file = NULL;
+		read_vocab_file = NULL;
+		output_file = NULL;
+		sw_file = NULL;
		endpoints_file = "";
		hs = true;
		negative_num = 0;
		output_binary = false;
		sample = 0;
		cbow = true;
		embeding_size = 0;
		thread_cnt = 1;
		window_size = 5;
		min_count = 5;
		data_block_size = 1000000;
		init_learning_rate = static_cast<real>(0.025);
		epoch = 1;
		stopwords = false;
		is_pipeline = true;
		total_words = 0;
		max_preload_data_size = 8000000000LL;
		use_adagrad = false;
		//multiverso config
		num_servers = 0;
		num_aggregator = 1;
		lock_option = 1;
		num_lock = 100;
		max_delay = 0;
	}
	//Parse all the local model arguments
	void Option::ParseArgs(int argc, char* argv[])
	{
		for (int i = 1; i < argc; i += 2)
		{
			if (strcmp(argv[i], "-size") == 0) embeding_size = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-train_file") == 0) train_file = argv[i + 1];
			if (strcmp(argv[i], "-endpoints_file") == 0) endpoints_file = argv[i + 1];
			if (strcmp(argv[i], "-read_vocab") == 0) read_vocab_file = argv[i + 1];
			if (strcmp(argv[i], "-binary") == 0) output_binary = (atoi(argv[i + 1]) != 0);
			if (strcmp(argv[i], "-cbow") == 0) cbow = (atoi(argv[i + 1]) != 0);
			if (strcmp(argv[i], "-alpha") == 0) init_learning_rate = static_cast<real>(atof(argv[i + 1]));
			if (strcmp(argv[i], "-output") == 0) output_file = argv[i + 1];
			if (strcmp(argv[i], "-window") == 0) window_size = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-sample") == 0) sample = static_cast<real>(atof(argv[i + 1]));
			if (strcmp(argv[i], "-hs") == 0) hs = (atoi(argv[i + 1]) != 0);
			if (strcmp(argv[i], "-data_block_size") == 0) data_block_size = atoll(argv[i + 1]);
			if (strcmp(argv[i], "-max_preload_data_size") == 0) max_preload_data_size = atoll(argv[i + 1]);
			if (strcmp(argv[i], "-negative") == 0) negative_num = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-threads") == 0) thread_cnt = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-min_count") == 0) min_count = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-epoch") == 0) epoch = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-stopwords") == 0) stopwords = (atoi(argv[i + 1]) != 0);
			if (strcmp(argv[i], "-sw_file") == 0) sw_file = argv[i + 1];
			if (strcmp(argv[i], "-use_adagrad") == 0) use_adagrad = (atoi(argv[i + 1]) != 0);
			if (strcmp(argv[i], "-is_pipeline") == 0) is_pipeline = (atoi(argv[i + 1]) != 0);
			if (strcmp(argv[i], "-num_servers") == 0) num_servers = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-num_aggregator") == 0) num_aggregator = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-lock_option") == 0) lock_option = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-num_lock") == 0) num_lock = atoi(argv[i + 1]);
			if (strcmp(argv[i], "-max_delay") == 0) max_delay = atoi(argv[i + 1]);
		}
	}

	void Option::PrintUsage()
	{
		puts("Usage:");
		puts("-size: word embedding size, e.g. 300");
		puts("-train_file: the training corpus file, e.g. enwik2014");
		puts("-read_vocab : the file to read all the vocab counts info");
		puts("-binary : 0 or 1, indicates whether to write all the embedding vectors in binary format");
		puts("-cbow : 0 or 1, default 1, whether to use cbow or not");
		puts("-alpha : initial learning rate, usually set to 0.025");
		puts("-output : the output file to store all the embedding vectors");
		puts("-window : the window size");
		puts("-sample : the sub-sample size, usually set to 0");
		puts("-hs : 0 or 1, default 1, whether to use hierarchical softmax");
		puts("-negative : the negative word count in negative sampling, please set it to 0 when -hs = 1");
		puts("-threads : the thread number to run in one machine");
		puts("-min_count : words with lower frequency than min_count are removed from the dictionary");
		puts("-epoch : the epoch number");
		puts("-stopwords : 0 or 1, whether to avoid training stop words");
		puts("-sw_file : the stop words file storing all the stop words, valid when -stopwords = 1");
		puts("-use_adagrad : 0 or 1, whether to use adagrad to adjust the learning rate");
		puts("-data_block_size : default 1MB, the maximum bytes which a data block will store");
		puts("-max_preload_data_size : default 8GB, the maximum data size(bytes) which multiverse_WordEmbedding will preload");
		puts("-num_servers : default 0, the parameter of multiverso. Separately, 0 indicates all processes are servers");
		puts("-num_aggregator : default 1, number of aggregation threads in a process");
		puts("-max_delay : default 0, the delay bound(max staleness)");
		puts("-num_lock : default 100, number of locks in the Locked option");
		puts("-is_pipeline : 0 or 1, whether to use the pipeline");
		puts("-lock_option : default 0, lock option. 0 : the threads do not write and there is no contention; 1 : there is no lock for thread contention; 2 : normal lock for thread contention");
		puts("-server_endpoint_file : default "", server ZMQ socket endpoint file in the MPI-free version");
	}

	void Option::PrintArgs()
	{
		multiverso::Log::Info("train_file: %s\n", train_file);
		multiverso::Log::Info("read_vocab_file: %s\n", read_vocab_file);
		multiverso::Log::Info("output_file: %s\n", output_file);
		multiverso::Log::Info("sw_file: %s\n", sw_file);
		multiverso::Log::Info("hs: %d\n", hs);
		multiverso::Log::Info("output_binary: %d\n", output_binary);
		multiverso::Log::Info("cbow: %d\n", cbow);
		multiverso::Log::Info("stopwords: %d\n", stopwords);
		multiverso::Log::Info("use_adagrad: %d\n", use_adagrad);
		multiverso::Log::Info("sample: %lf\n", sample);
		multiverso::Log::Info("embeding_size: %d\n", embeding_size);
		multiverso::Log::Info("thread_cnt: %d\n", thread_cnt);
		multiverso::Log::Info("window_size: %d\n", window_size);
		multiverso::Log::Info("negative_num: %d\n", negative_num);
		multiverso::Log::Info("min_count: %d\n", min_count);
		multiverso::Log::Info("epoch: %d\n", epoch);
		multiverso::Log::Info("total_words: %lld\n", total_words);
		multiverso::Log::Info("max_preload_data_size: %lld\n", max_preload_data_size);
		multiverso::Log::Info("init_learning_rate: %lf\n", init_learning_rate);
		multiverso::Log::Info("data_block_size: %lld\n", data_block_size);
		multiverso::Log::Info("num_servers: %d\n", num_servers);
		multiverso::Log::Info("num_aggregator: %d\n", num_aggregator);
		multiverso::Log::Info("is_pipeline: %d\n", is_pipeline);
		multiverso::Log::Info("lock_option: %d\n", lock_option);
		multiverso::Log::Info("num_lock: %d\n", num_lock);
		multiverso::Log::Info("max_delay: %d\n", max_delay);
		multiverso::Log::Info("endpoints_file: %s\n", endpoints_file);
	}

	Sampler::Sampler()
	{
		table_ = nullptr;
	}
	//Set the negative-sampling distribution
	void Sampler::SetNegativeSamplingDistribution(Dictionary *dictionary)
	{
		real train_words_pow = 0;
		real power = 0.75;
		table_ = (int *)malloc(kTableSize * sizeof(int));
		for (int i = 0; i < dictionary->Size(); ++i)
			train_words_pow += static_cast<real>(pow(dictionary->GetWordInfo(i)->freq, power));
		int cur_pos = 0;
		real d1 = (real)pow(dictionary->GetWordInfo(cur_pos)->freq, power)
			/ (real)train_words_pow;

		assert(table_ != nullptr);
		for (int i = 0; i < kTableSize; ++i)
		{
			table_[i] = cur_pos;
			if (i > d1 * kTableSize && cur_pos + 1 < dictionary->Size())
			{
				cur_pos++;
				d1 += (real)pow(dictionary->GetWordInfo(cur_pos)->freq, power)
					/ (real)train_words_pow;
			}
		}
	}

	bool Sampler::WordSampling(int64 word_cnt,
		int64 train_words, real sample)
	{
		real ran = (sqrt(word_cnt / (sample * train_words)) + 1) *
			(sample * train_words) / word_cnt;
		return (ran > ((real)rand() / (RAND_MAX)));
	}
	//Get the next random number
	uint64 Sampler::GetNextRandom(uint64 next_random)
	{
		return next_random * (uint64)25214903917 + 11;
	}

	int Sampler::NegativeSampling(uint64 next_random)
	{
		return table_[(next_random >> 16) % kTableSize];
	}

-	std::string GetSystemTime()
-	{
-		time_t t = time(0);
-		char tmp[128];
-		strftime(tmp, sizeof(tmp), "%Y%m%d%H%M%S", localtime(&t));
-		return std::string(tmp);
-	}
+	std::string GetSystemTime()
+	{
+		time_t t = time(0);
+		tm timeinfo;
+		localtime_s(&timeinfo, &t);
+		char tmp[128];
+		strftime(tmp, sizeof(tmp), "%Y%m%d%H%M%S", &timeinfo);
+		return std::string(tmp);
+	}

-	//Get the size of filename; it should deal with large files
-	int64 GetFileSize(const char *filename)
-	{
-	#ifdef _MSC_VER
-		struct _stat64 info;
-		_stat64(filename, &info);
-		return (int64)info.st_size;
-	#else
-		struct stat info;
-		stat(filename, &info);
-		return (int64)info.st_size;
-	#endif
-	}
-
-	//Read a word from train_file into the word array
-	bool ReadWord(char *word, FILE *fin)
-	{
-		int idx = 0;
-		char ch;
-		while (!feof(fin))
-		{
-			ch = fgetc(fin);
-			if (ch == 13) continue;
-			if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
-			{
-				if (idx > 0)
-				{
-					if (ch == '\n')
-						ungetc(ch, fin);
-					break;
-				}
-
-				if (ch == '\n')
-				{
-					strcpy(word, (char *)"</s>");
-					return true;
-				}
-				else
-				{
-					continue;
-				}
-			}
-
-			word[idx++] = ch;
-			if (idx >= kMaxString - 1)
-				idx--;
-		}
-
-		word[idx] = 0;
-		return idx > 0;
-	}
-
-	std::string g_log_suffix;
-	real* expTable;
-
-	void InitExpTable(){
-		expTable = (real *)malloc((kExpTableSize + 1) * sizeof(real));
-		for (int i = 0; i < kExpTableSize; i++) {
-			expTable[i] = exp((i / (real)kExpTableSize * 2 - 1) * kMaxExp); // Precompute the exp() table
-			expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
-		}
-	}
+	std::string g_log_suffix;
}
}
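SetNegativeSamplingDistribution fills table_ so that each word owns a number of slots proportional to its frequency raised to the 0.75 power; sampling a uniformly random slot then draws negatives from that smoothed unigram distribution. Illustrative only (not part of this commit): the same fill loop on a 3-word toy vocabulary with a table of 20 slots instead of kTableSize.

#include <cmath>
#include <cstdio>

int main()
{
    const int kToyTableSize = 20;
    const double freq[3] = {4, 2, 1};  // toy word frequencies
    const double power = 0.75;
    double norm = 0;
    for (int i = 0; i < 3; ++i) norm += std::pow(freq[i], power);

    int table[kToyTableSize];
    int cur = 0;
    double d1 = std::pow(freq[cur], power) / norm;
    for (int i = 0; i < kToyTableSize; ++i)
    {
        table[i] = cur;
        if (i > d1 * kToyTableSize && cur + 1 < 3)
        {
            ++cur;
            d1 += std::pow(freq[cur], power) / norm;
        }
    }
    for (int i = 0; i < kToyTableSize; ++i) std::printf("%d ", table[i]);
    std::printf("\n");
    // words 0, 1, 2 get 12, 6 and 2 slots here, close to the
    // freq^0.75 / sum(freq^0.75) proportions of 0.51, 0.31, 0.18
    return 0;
}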
115 src/util.h

@@ -1,5 +1,5 @@
-#ifndef DISTRIBUTED_WORD_EMBEDDING_UTIL_H_
-#define DISTRIBUTED_WORD_EMBEDDING_UTIL_H_
+#pragma once

/*!
* file util.h
* \brief Struct Option stores many general arguments of the model

@@ -10,75 +10,68 @@
#include <random>
#include <cassert>
#include <exception>
-#include <sys/stat.h>

#include "constant.h"
#include "dictionary.h"

namespace multiverso
{
namespace wordembedding
{
	struct Option
	{
		const char* train_file;
		const char* read_vocab_file;
		const char* output_file;
		const char* sw_file;
		const char* endpoints_file;
		bool hs, output_binary, cbow, stopwords;
		bool use_adagrad;
		bool is_pipeline;
		real sample;
		int64 data_block_size;
		int embeding_size, thread_cnt, window_size, negative_num, min_count, epoch;
		int64 total_words;
		int64 max_preload_data_size;
		real init_learning_rate;
		int num_servers, num_aggregator, lock_option, num_lock, max_delay;

		Option();
		/*!
		* \brief Get the model-set arguments from the command line
		*/
		void ParseArgs(int argc, char* argv[]);
		void PrintArgs();
		void PrintUsage();
	};

	class Sampler
	{
	public:
		Sampler();
		/*!
		* \brief Set the negative-sampling distribution for every vocabulary word
		* \param dictionary the train_file dictionary
		*/
		void SetNegativeSamplingDistribution(Dictionary *dictionary);
		bool WordSampling(int64 word_cnt, int64 train_words, real sample);
		/*!
		* \brief Get the next random number from the existing random seed
		*/
		uint64 GetNextRandom(uint64 next_random);
		int NegativeSampling(uint64 next_random);

	private:
		int* table_;

		//No copying allowed
		Sampler(const Sampler&);
		void operator=(const Sampler&);
	};

	std::string GetSystemTime();
-	int64 GetFileSize(const char *filename);
-	bool ReadWord(char *word, FILE *fin);
-	void InitExpTable();
	extern std::string g_log_suffix;
-	extern real* expTable;
}
}
-#endif
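WordSampling implements word2vec-style subsampling: a word with count c in a corpus of T words survives with probability (sqrt(c / (sample * T)) + 1) * (sample * T) / c, which stays at 1 for rare words and shrinks for very frequent ones. Illustrative only (not part of this commit), a worked evaluation of that probability:

#include <cmath>
#include <cstdio>

static double KeepProbability(double word_cnt, double train_words, double sample)
{
    double threshold = sample * train_words;  // the count at which discarding starts
    double ran = (std::sqrt(word_cnt / threshold) + 1) * threshold / word_cnt;
    return ran < 1 ? ran : 1;                 // probabilities cap at 1
}

int main()
{
    double total = 1e8, sample = 1e-3;
    std::printf("%f\n", KeepProbability(1e7, total, sample));  // ~0.11 for a very frequent word
    std::printf("%f\n", KeepProbability(1e5, total, sample));  // 1.0: a rarer word is always kept
    return 0;
}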
|
@ -2,352 +2,374 @@
|
|||
|
||||
namespace multiverso
|
||||
{
|
||||
namespace wordembedding
|
||||
{
|
||||
WordEmbedding::WordEmbedding(Option* option, HuffmanEncoder* huffmanEncoder,
|
||||
Sampler* sampler, int dictionary_size)
|
||||
{
|
||||
word_count_actual = 0;
|
||||
option_ = option;
|
||||
huffmanEncoder_ = huffmanEncoder;
|
||||
sampler_ = sampler;
|
||||
dictionary_size_ = dictionary_size;
|
||||
learning_rate = option_->init_learning_rate;
|
||||
data_block_ = nullptr;
|
||||
//InitExpTable();
|
||||
}
|
||||
namespace wordembedding
|
||||
{
|
||||
WordEmbedding::WordEmbedding(Option* option, HuffmanEncoder* huffmanEncoder,
|
||||
Sampler* sampler, int dictionary_size)
|
||||
{
|
||||
word_count_actual = 0;
|
||||
option_ = option;
|
||||
huffmanEncoder_ = huffmanEncoder;
|
||||
sampler_ = sampler;
|
||||
dictionary_size_ = dictionary_size;
|
||||
learning_rate = option_->init_learning_rate;
|
||||
weight_IE_ = nullptr;
|
||||
weight_EO_ = nullptr;
|
||||
sum_gradient2_IE_ = nullptr;
|
||||
sum_gradient2_EO_ = nullptr;
|
||||
}
|
||||
|
||||
WordEmbedding::~WordEmbedding()
|
||||
{
|
||||
}
|
||||
WordEmbedding::~WordEmbedding()
|
||||
{
|
||||
delete [] weight_IE_;
|
||||
delete [] weight_EO_;
|
||||
|
||||
//Train neural networks of WordEmbedding
|
||||
void WordEmbedding::Train(DataBlock *data_block, int index_start, int interval,
|
||||
int64& word_count, real* hidden_act, real* hidden_err)
|
||||
{
|
||||
data_block_ = data_block;
|
||||
std::vector <int> negativesample(data_block_->negativesample_pools.begin(),
|
||||
data_block_->negativesample_pools.end());
|
||||
if (option_->use_adagrad)
|
||||
{
|
||||
delete [] sum_gradient2_IE_;
|
||||
delete [] sum_gradient2_EO_;
|
||||
}
|
||||
}
|
||||
//Allocate the memory for some private pointers
|
||||
void WordEmbedding::MallocMemory()
|
||||
{
|
||||
weight_IE_ = new (std::nothrow)real*[dictionary_size_];
|
||||
assert(weight_IE_ != nullptr);
|
||||
weight_EO_ = new (std::nothrow)real*[dictionary_size_];
|
||||
assert(weight_EO_ != nullptr);
|
||||
if (option_->use_adagrad)
|
||||
{
|
||||
sum_gradient2_IE_ = new (std::nothrow)real*[dictionary_size_];
|
||||
sum_gradient2_EO_ = new (std::nothrow)real*[dictionary_size_];
|
||||
assert(sum_gradient2_IE_ != nullptr);
|
||||
assert(sum_gradient2_EO_ != nullptr);
|
||||
}
|
||||
}
|
||||
//Train neural networks of WordEmbedding
|
||||
void WordEmbedding::Train(DataBlock *data_block, int index_start, int interval,
|
||||
int64& word_count, real* hidden_act, real* hidden_err)
|
||||
{
|
||||
std::vector <int> negativesample(data_block->negativesample_pools.begin(),
|
||||
data_block->negativesample_pools.end());
|
||||
for (int i = index_start; i < data_block->Size(); i += interval)
|
||||
{
|
||||
int sentence_length;
|
||||
int64 word_count_deta;
|
||||
int *sentence;
|
||||
uint64 next_random;
|
||||
data_block->GetSentence(i, sentence, sentence_length,
|
||||
word_count_deta, next_random);
|
||||
|
||||
for (int i = index_start; i < data_block_->Size(); i += interval)
|
||||
{
|
||||
int sentence_length;
|
||||
int64 word_count_deta;
|
||||
int *sentence;
|
||||
uint64 next_random;
|
||||
data_block_->GetSentence(i, sentence, sentence_length,
|
||||
word_count_deta, next_random);
|
||||
this->Train(sentence, sentence_length,
|
||||
next_random, hidden_act, hidden_err, negativesample);
|
||||
|
||||
this->Train(sentence, sentence_length,
|
||||
next_random, hidden_act, hidden_err, negativesample);
|
||||
word_count += word_count_deta;
|
||||
}
|
||||
}
    //Update the learning rate
    void WordEmbedding::UpdateLearningRate()
    {
        if (option_->use_adagrad == false)
        {
            learning_rate = static_cast<real>(option_->init_learning_rate *
                (1 - word_count_actual / ((real)option_->total_words * option_->epoch + 1.0)));
            if (learning_rate < option_->init_learning_rate * 0.0001)
                learning_rate = static_cast<real>(option_->init_learning_rate * 0.0001);
        }
    }
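
    /* Worked illustration (not in the original source): with init_learning_rate =
       0.025 and total_words * epoch = 1e9, after word_count_actual = 5e8 processed
       words the rate becomes 0.025 * (1 - 5e8 / (1e9 + 1.0)) ~= 0.0125, and the
       floor clamps it at 0.025 * 0.0001 = 2.5e-6 near the end of training. */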
    //Train the sentence actually
    void WordEmbedding::Train(int* sentence, int sentence_length,
        uint64 next_random, real* hidden_act, real* hidden_err, std::vector<int> &negativesample_pools)
    {
        ParseSentence(sentence, sentence_length,
            next_random, hidden_act, hidden_err, &WordEmbedding::TrainSample, negativesample_pools);
    }
    //Train with forward direction and get the input-hidden layer vector
    void WordEmbedding::FeedForward(std::vector<int>& input_nodes, real* hidden_act)
    {
        for (int i = 0; i < input_nodes.size(); ++i)
        {
            int &node_id = input_nodes[i];
            real* input_embedding = GetWeightIE(node_id);
            for (int j = 0; j < option_->embeding_size; ++j)
                hidden_act[j] += input_embedding[j];
        }

        if (input_nodes.size() > 1)
        {
            for (int j = 0; j < option_->embeding_size; ++j)
                hidden_act[j] /= input_nodes.size();
        }
    }
    //Train with inverse direction and update the hidden-output weights
    void WordEmbedding::BPOutputLayer(int label, int word_idx,
        real* classifier, real* hidden_act, real* hidden_err)
    {
        assert(classifier != nullptr && hidden_act != nullptr && hidden_err != nullptr);
        real f = 0;
        //Propagate hidden -> output
        for (int j = 0; j < option_->embeding_size; ++j)
            f += hidden_act[j] * classifier[j];

        /*
        if (f > -kMaxExp && f < kMaxExp){
            f = expTable[(int)((f + kMaxExp) * (kExpTableSize / kMaxExp / 2))];
        }
        */
        f = 1 / (1 + exp(-f));
        real error = (1 - label - f);
        //Propagate errors output -> hidden
        for (int j = 0; j < option_->embeding_size; ++j)
            hidden_err[j] += error * classifier[j];

        if (option_->use_adagrad)
        {
            real* sum_gradient2_row = GetSumGradient2EO(word_idx);
            assert(sum_gradient2_row != nullptr);
            //Learn weights hidden -> output
            for (int j = 0; j < option_->embeding_size; ++j)
            {
                real g = error * hidden_act[j];
                sum_gradient2_row[j] += g * g;
                if (sum_gradient2_row[j] > 1e-10)
                    classifier[j] += g * option_->init_learning_rate / sqrt(sum_gradient2_row[j]);
            }
        }
        else
        {
            //'g' is the gradient multiplied by the learning rate
            real g = error * learning_rate;
            //Learn weights hidden -> output
            for (int j = 0; j < option_->embeding_size; ++j)
                classifier[j] += g * hidden_act[j];
        }
    }
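
    /* Worked illustration (not in the original source): for a true context pair
       (label 0 under the convention used by Parse further below) with sigmoid
       output f = 0.731, error = (1 - 0 - 0.731) = 0.269, so classifier and
       hidden_err both move to push f toward 1; for a sampled noise pair
       (label 1), error = -f pushes f toward 0. The same (1 - label - f) form
       serves hierarchical softmax, where label is the 0/1 Huffman branch code. */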
    void WordEmbedding::TrainSample(std::vector<int>& input_nodes,
        std::vector<std::pair<int, int> >& output_nodes,
        void *local_hidden_act, void *local_hidden_err)
    {
        real* hidden_act = (real*)local_hidden_act;
        real* hidden_err = (real*)local_hidden_err;
        assert(hidden_act != nullptr);
        assert(hidden_err != nullptr);
        memset(hidden_act, 0, option_->embeding_size * sizeof(real));
        memset(hidden_err, 0, option_->embeding_size * sizeof(real));
        FeedForward(input_nodes, hidden_act);

        for (int i = 0; i < output_nodes.size(); ++i)
        {
            int &node_id = output_nodes[i].first;
            int &code = output_nodes[i].second;
            BPOutputLayer(code, node_id, GetWeightEO(node_id),
                hidden_act, hidden_err);
        }

        if (option_->use_adagrad)
        {
            //Update context embedding
            for (int i = 0; i < input_nodes.size(); ++i)
            {
                int &node_id = input_nodes[i];
                real* input_embedding_row = GetWeightIE(node_id);
                real* sum_gradient2_row = GetSumGradient2IE(node_id);
                assert(input_embedding_row != nullptr && sum_gradient2_row != nullptr);
                for (int j = 0; j < option_->embeding_size; ++j)
                {
                    sum_gradient2_row[j] += hidden_err[j] * hidden_err[j];
                    if (sum_gradient2_row[j] > 1e-10)
                        input_embedding_row[j] += hidden_err[j] * option_->init_learning_rate / sqrt(sum_gradient2_row[j]);
                }
            }
        }
        else
        {
            for (int j = 0; j < option_->embeding_size; ++j)
                hidden_err[j] *= learning_rate;
            //Update context embedding
            for (int i = 0; i < input_nodes.size(); ++i)
            {
                int &node_id = input_nodes[i];
                real* input_embedding = GetWeightIE(node_id);
                assert(input_embedding != nullptr);
                for (int j = 0; j < option_->embeding_size; ++j)
                    input_embedding[j] += hidden_err[j];
            }
        }
    }
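
    /* Dispatch sketch (not in the original source): ParseSentence receives a
       FunctionType member-function pointer, so the same window-parsing loop
       drives both training and parameter gathering:
       FunctionType fn = &WordEmbedding::TrainSample;  // or &WordEmbedding::DealPrepareParameter
       (this->*fn)(input_nodes, output_nodes, hidden_act, hidden_err);
    */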
    //Prepare the parameters for the datablock
    void WordEmbedding::PrepareParameter(DataBlock* data_block)
    {
        int sentence_length;
        int64 word_count_delta;
        int *sentence;
        uint64 next_random;
        if (option_->hs)
        {
            for (int i = 0; i < data_block->Size(); ++i)
            {
                data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);

                for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
                {
                    data_block->input_nodes.insert(sentence[sentence_position]);
                }
            }
            for (auto input_node : data_block->input_nodes)
            {
                auto info = huffmanEncoder_->GetLabelInfo(input_node);
                for (int d = 0; d < info->codelen; d++)
                    data_block->output_nodes.insert(info->point[d]);
            }
        }
        else
        {
            for (int i = 0; i < data_block->Size(); ++i)
            {
                data_block->GetSentence(i, sentence, sentence_length, word_count_delta, next_random);

                for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
                {
                    data_block->input_nodes.insert(sentence[sentence_position]);
                }
            }
            for (auto input_node : data_block->input_nodes)
            {
                data_block->output_nodes.insert(input_node);
            }
            for (int d = 0; d < option_->negative_num * data_block->input_nodes.size(); d++)
            {
                next_random = sampler_->GetNextRandom(next_random);
                int target = sampler_->NegativeSampling(next_random);
                data_block->output_nodes.insert(target);
                data_block->negativesample_pools.insert(target);
            }
        }
    }
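
    /* Illustration (not in the original source): under hierarchical softmax,
       GetLabelInfo(word) is assumed to return the word's Huffman path, where
       info->point[d] lists the internal tree nodes (the output rows this block
       must fetch from the server) and info->code[d] the 0/1 branch taken at
       each of those nodes. */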
    //Copy the input&output nodes
    void WordEmbedding::DealPrepareParameter(std::vector<int>& input_nodes,
        std::vector<std::pair<int, int> >& output_nodes,
        void *hidden_act, void *hidden_err)
    {
        for (int i = 0; i < input_nodes.size(); ++i)
            input_nodes_.insert(input_nodes[i]);
        for (int i = 0; i < output_nodes.size(); ++i)
            output_nodes_.insert(output_nodes[i].first);
    }
    //Parse the sentence and deepen into two branches
    void WordEmbedding::ParseSentence(int* sentence, int sentence_length,
        uint64 next_random, real* hidden_act, real* hidden_err,
        FunctionType function, std::vector<int> &negativesample_pools)
    {
        if (sentence_length == 0)
            return;

        int feat[kMaxSentenceLength + 1];
        std::vector<int> input_nodes;
        std::vector<std::pair<int, int> > output_nodes;
        for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
        {
            if (sentence[sentence_position] == -1) continue;
            next_random = sampler_->GetNextRandom(next_random);
            int off = next_random % option_->window_size;
            int feat_size = 0;
            for (int i = off; i < option_->window_size * 2 + 1 - off; ++i)
                if (i != option_->window_size)
                {
                    int c = sentence_position - option_->window_size + i;
                    if (c < 0 || c >= sentence_length || sentence[c] == -1)
                        continue;

                    feat[feat_size++] = sentence[c];
                    if (!option_->cbow) //train Skip-gram
                    {
                        input_nodes.clear();
                        output_nodes.clear();
                        Parse(feat + feat_size - 1, 1, sentence[sentence_position],
                            next_random, input_nodes, output_nodes, negativesample_pools);
                        (this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
                    }
                }

            if (option_->cbow) //train CBOW
            {
                input_nodes.clear();
                output_nodes.clear();
                Parse(feat, feat_size, sentence[sentence_position],
                    next_random, input_nodes, output_nodes, negativesample_pools);

                (this->*function)(input_nodes, output_nodes, hidden_act, hidden_err);
            }
        }
    }
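
    /* Worked illustration (not in the original source): "int off = next_random %
       option_->window_size;" implements word2vec's dynamic window. With
       window_size = 5, off is uniform in {0,...,4}, the loop scans i in
       [off, 10 - off], and the effective radius becomes 5 - off: distance-1
       neighbors are always used, while distance-5 neighbors are used only
       when off == 0, so nearer context words are sampled more often. */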
    //Parse the window's input&output nodes
    inline void WordEmbedding::Parse(int *feat, int feat_cnt, int word_idx,
        uint64 &next_random, std::vector<int>& input_nodes,
        std::vector<std::pair<int, int> >& output_nodes, std::vector<int> &negativesample_pools)
    {
        for (int i = 0; i < feat_cnt; ++i)
        {
            input_nodes.push_back(feat[i]);
        }

        if (option_->hs)
        {
            auto info = huffmanEncoder_->GetLabelInfo(word_idx);
            for (int d = 0; d < info->codelen; d++)
                output_nodes.push_back(std::make_pair(info->point[d], info->code[d]));
        }
        else if (option_->negative_num)
        {
            //The positive target carries label 0 here, because BPOutputLayer's
            //gradient is (1 - label - f); negative samples carry label 1.
            output_nodes.push_back(std::make_pair(word_idx, 0));
            for (int d = 0; d < option_->negative_num; d++)
            {
                next_random = sampler_->GetNextRandom(next_random);
                int index = (next_random >> 8) % negativesample_pools.size();
                int target = negativesample_pools[index];
                if (target == word_idx) continue;
                output_nodes.push_back(std::make_pair(target, 1));
            }
        }
    }
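
    /* Sketch (not in the original source): negatives are drawn from the pool
       that PrepareParameter filled via sampler_->NegativeSampling.
       GetNextRandom is assumed to be a word2vec-style linear congruential
       step; the constants below are illustrative, not necessarily this
       project's values:
       next_random = next_random * (uint64)25214903917 + 11;          // LCG step
       int index = (next_random >> 8) % negativesample_pools.size();  // drop the weak low bits
    */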
    //Set the weight of input-embedding vector
    void WordEmbedding::SetWeightIE(int input_node_id, real* ptr)
    {
        weight_IE_[input_node_id] = ptr;
    }

    //Set the weight of output-embedding vector
    void WordEmbedding::SetWeightEO(int output_node_id, real* ptr)
    {
        weight_EO_[output_node_id] = ptr;
    }

    //Get the weight of input-embedding vector
    real* WordEmbedding::GetWeightIE(int input_node_id)
    {
        return weight_IE_[input_node_id];
    }

    //Get the weight of output-embedding vector
    real* WordEmbedding::GetWeightEO(int output_node_id)
    {
        return weight_EO_[output_node_id];
    }
    //Set the weight of SumGradient-input vector
    void WordEmbedding::SetSumGradient2IE(int input_node_id, real* ptr)
    {
        sum_gradient2_IE_[input_node_id] = ptr;
    }
    //Set the weight of SumGradient-output vector
    void WordEmbedding::SetSumGradient2EO(int output_node_id, real* ptr)
    {
        sum_gradient2_EO_[output_node_id] = ptr;
    }
    //Get the weight of SumGradient-input vector
    real* WordEmbedding::GetSumGradient2IE(int input_node_id)
    {
        return sum_gradient2_IE_[input_node_id];
    }
    //Get the weight of SumGradient-output vector
    real* WordEmbedding::GetSumGradient2EO(int output_node_id)
    {
        return sum_gradient2_EO_[output_node_id];
    }
}
}
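
/* End-to-end usage sketch (illustration only, not part of this commit),
   based on the public interface above; the multiverso table wiring and the
   buffer setup are assumptions:
   WordEmbedding we(option, huffman_encoder, sampler, dictionary_size);
   we.MallocMemory();
   we.PrepareParameter(block);        // gather the rows this block touches
   // ...fetch those rows from the parameter server, attach via SetWeightIE/EO...
   int64 words = 0;
   we.Train(block, trainer_id, trainer_count, words, hidden_act, hidden_err);
   we.word_count_actual += words;
   we.UpdateLearningRate();           // linear decay unless AdaGrad is on
*/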
@@ -1,144 +1,148 @@
#ifndef DISTRIBUTED_WORD_EMBEDDING_WORD_EMBEDDING_H_
#define DISTRIBUTED_WORD_EMBEDDING_WORD_EMBEDDING_H_
#pragma once

/*!
* \file word_embedding.h
* \brief Class WordEmbedding includes some functions and parameters about TrainNN
*/

#include <vector>
#include <cstring>

#include "util.h"
#include "multiverso.h"
#include "huffman_encoder.h"
#include "distributed_wordembedding.h"
#include "constant.h"
#include "data_block.h"

namespace multiverso
{
    namespace wordembedding
    {
        class WordEmbedding
        {
        public:
            real learning_rate;
            int64 word_count_actual;

            WordEmbedding(Option* option, HuffmanEncoder* huffmanEncoder,
                Sampler* sampler, int dictionary_size);
            ~WordEmbedding();
            /*!
            * \brief Create memory for weight_IE_ weight_EO_ sum_gradient2_IE_ sum_gradient2_EO_
            */
            void MallocMemory();
            /*!
            * \brief TrainNN
            * \param data_block represents the training datablock
            * \param index_start the thread's starting index in the sentence vector
            * \param interval the total number of threads
            * \param word_count counts the words which have been processed by TrainNN
            * \param hidden_act hidden layer value
            * \param hidden_err hidden layer error
            */
            void Train(DataBlock *data_block, int index_start,
                int interval, int64& word_count,
                real* hidden_act, real* hidden_err);
            /*!
            * \brief PrepareParameter for the parameter loader thread
            * \param data_block datablock for the parameter loader to parse
            * \param input_nodes represent the parameters which the input layer includes
            * \param output_nodes represent the parameters which the output layer includes
            */
            void PrepareParameter(DataBlock *data_block);
            /*!
            * \brief Update the learning rate
            */
            void UpdateLearningRate();
            /*!
            * \brief Set the input(output)-embedding weight
            */
            void SetWeightIE(int input_node_id, real* ptr);
            void SetWeightEO(int output_node_id, real* ptr);
            /*!
            * \brief Set the SumGradient-input(output)
            */
            void SetSumGradient2IE(int input_node_id, real* ptr);
            void SetSumGradient2EO(int output_node_id, real* ptr);
            /*!
            * \brief Return the parametertable value
            */
            real* GetWeightIE(int input_node_id);
            real* GetWeightEO(int output_node_id);
            real* GetSumGradient2IE(int input_node_id);
            real* GetSumGradient2EO(int output_node_id);

        private:
            Option *option_;
            Dictionary *dictionary_;
            HuffmanEncoder *huffmanEncoder_;
            Sampler *sampler_;
            std::unordered_set<int> input_nodes_, output_nodes_;
            int dictionary_size_;
            real** weight_IE_;
            real** weight_EO_;
            real** sum_gradient2_IE_;
            real** sum_gradient2_EO_;

            typedef void(WordEmbedding::*FunctionType)(std::vector<int>& input_nodes,
                std::vector<std::pair<int, int> >& output_nodes,
                void *hidden_act, void *hidden_err);
            /*!
            * \brief Parse the needed parameters in a window
            */
            void Parse(int *feat, int feat_cnt, int word_idx, uint64 &next_random,
                std::vector<int>& input_nodes,
                std::vector<std::pair<int, int> >& output_nodes, std::vector<int> &negativesample_pools);
            /*!
            * \brief Parse a sentence and deepen into two branches
            * one for TrainNN, the other for parameter parse&request
            */
            void ParseSentence(int* sentence, int sentence_length,
                uint64 next_random,
                real* hidden_act, real* hidden_err,
                FunctionType function, std::vector<int> &negativesample_pools);
            /*!
            * \brief Get the hidden layer vector
            * \param input_nodes represent the input nodes
            * \param hidden_act stores the hidden layer vector
            */
            void FeedForward(std::vector<int>& input_nodes, real* hidden_act);
            /*!
            * \brief Calculate the hidden_err and update the output-embedding weight
            * \param label records the label of every output-embedding vector
            * \param word_idx the index of the output-embedding vector
            * \param classifier stores the output-embedding vector
            * \param hidden_act stores the hidden layer vector
            * \param hidden_err stores the hidden error which is used
            * to update the input-embedding vector
            */
            void BPOutputLayer(int label, int word_idx, real* classifier,
                real* hidden_act, real* hidden_err);
            /*!
            * \brief Copy the input_nodes&output_nodes to WordEmbedding's private sets
            */
            void DealPrepareParameter(std::vector<int>& input_nodes,
                std::vector<std::pair<int, int> >& output_nodes,
                void *hidden_act, void *hidden_err);
            /*!
            * \brief Train a window sample and update the
            * input-embedding&output-embedding vectors
            * \param input_nodes represent the input nodes
            * \param output_nodes represent the output nodes
            * \param hidden_act stores the hidden layer vector
            * \param hidden_err stores the hidden layer error
            */
            void TrainSample(std::vector<int>& input_nodes,
                std::vector<std::pair<int, int> >& output_nodes,
                void *hidden_act, void *hidden_err);
            /*!
            * \brief Train the sentence actually
            */
            void Train(int* sentence, int sentence_length,
                uint64 next_random, real* hidden_act, real* hidden_err, std::vector<int> &negativesample_pools);

            //No copying allowed
            WordEmbedding(const WordEmbedding&);
            void operator=(const WordEmbedding&);
        };
    }
}
#endif
@@ -3,7 +3,7 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 2013
VisualStudioVersion = 12.0.21005.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "distributed_word_embedding", "distributed_word_embedding.vcxproj", "{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}"
EndProject
Global
	GlobalSection(SolutionConfigurationPlatforms) = preSolution
@@ -13,14 +13,14 @@ Global
		Release|x64 = Release|x64
	EndGlobalSection
	GlobalSection(ProjectConfigurationPlatforms) = postSolution
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|Win32.ActiveCfg = Debug|Win32
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|Win32.Build.0 = Debug|Win32
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|x64.ActiveCfg = Debug|x64
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Debug|x64.Build.0 = Debug|x64
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|Win32.ActiveCfg = Release|Win32
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|Win32.Build.0 = Release|Win32
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|x64.ActiveCfg = Release|x64
		{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}.Release|x64.Build.0 = Release|x64
	EndGlobalSection
	GlobalSection(SolutionProperties) = preSolution
		HideSolutionNode = FALSE
@@ -0,0 +1,173 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|Win32">
      <Configuration>Debug</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|Win32">
      <Configuration>Release</Configuration>
      <Platform>Win32</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{8CB94C32-EA45-43B1-9DE0-B59EB6BBB2AE}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <RootNamespace>distributed_word_embedding</RootNamespace>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <LinkIncremental>false</LinkIncremental>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <IncludePath>$(VC_IncludePath);$(WindowsSDK_IncludePath);..\..\multiverso\include\multiverso</IncludePath>
    <LibraryPath>$(VC_LibraryPath_x64);$(WindowsSDK_LibraryPath_x64);..\..\multiverso\windows\x64\Release</LibraryPath>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <WarningLevel>Level3</WarningLevel>
      <Optimization>Disabled</Optimization>
      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level3</WarningLevel>
      <PrecompiledHeader>
      </PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>multiverso.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClInclude Include="..\..\src\constant.h" />
    <ClInclude Include="..\..\src\data_block.h" />
    <ClInclude Include="..\..\src\dictionary.h" />
    <ClInclude Include="..\..\src\distributed_wordembedding.h" />
    <ClInclude Include="..\..\src\huffman_encoder.h" />
    <ClInclude Include="..\..\src\memory_manager.h" />
    <ClInclude Include="..\..\src\parameter_loader.h" />
    <ClInclude Include="..\..\src\reader.h" />
    <ClInclude Include="..\..\src\trainer.h" />
    <ClInclude Include="..\..\src\util.h" />
    <ClInclude Include="..\..\src\word_embedding.h" />
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="..\..\src\data_block.cpp" />
    <ClCompile Include="..\..\src\dictionary.cpp" />
    <ClCompile Include="..\..\src\distributed_wordembedding.cpp" />
    <ClCompile Include="..\..\src\huffman_encoder.cpp" />
    <ClCompile Include="..\..\src\main.cpp" />
    <ClCompile Include="..\..\src\memory_manager.cpp" />
    <ClCompile Include="..\..\src\parameter_loader.cpp" />
    <ClCompile Include="..\..\src\reader.cpp" />
    <ClCompile Include="..\..\src\trainer.cpp" />
    <ClCompile Include="..\..\src\util.cpp" />
    <ClCompile Include="..\..\src\word_embedding.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>