init commit
This commit is contained in:
Parent e1ec448d22
Commit 82d261863c
@@ -0,0 +1,3 @@
bin/
multiverso
src/*.o
@@ -0,0 +1,45 @@
PROJECT := $(shell readlink -f $(dir $(lastword $(MAKEFILE_LIST))))

CXX = g++
CXXFLAGS = -O3 \
    -std=c++11 \
    -Wall \
    -Wno-sign-compare \
    -fno-omit-frame-pointer

MULTIVERSO_DIR = $(PROJECT)/multiverso
MULTIVERSO_INC = $(MULTIVERSO_DIR)/include/multiverso
MULTIVERSO_LIB = $(MULTIVERSO_DIR)/lib
THIRD_PARTY_LIB = $(MULTIVERSO_DIR)/third_party/lib

INC_FLAGS = -I$(MULTIVERSO_INC)
LD_FLAGS = -L$(MULTIVERSO_LIB) -lmultiverso
LD_FLAGS += -L$(THIRD_PARTY_LIB) -lzmq -lmpi -lmpl

WORD_EMBEDDING_HEADERS = $(shell find $(PROJECT)/src -type f -name "*.h")
WORD_EMBEDDING_SRC = $(shell find $(PROJECT)/src -type f -name "*.cpp")
WORD_EMBEDDING_OBJ = $(WORD_EMBEDDING_SRC:.cpp=.o)

BIN_DIR = $(PROJECT)/bin
WORD_EMBEDDING = $(BIN_DIR)/multisense_word_embedding

all: path \
    multisense_word_embedding

path: $(BIN_DIR)

$(BIN_DIR):
	mkdir -p $@

$(WORD_EMBEDDING): $(WORD_EMBEDDING_OBJ)
	$(CXX) $(WORD_EMBEDDING_OBJ) $(CXXFLAGS) $(INC_FLAGS) $(LD_FLAGS) -o $@

$(WORD_EMBEDDING_OBJ): %.o: %.cpp $(WORD_EMBEDDING_HEADERS) $(MULTIVERSO_INC)
	$(CXX) $(CXXFLAGS) $(INC_FLAGS) -c $< -o $@

multisense_word_embedding: path $(WORD_EMBEDDING)

clean:
	rm -rf $(BIN_DIR) $(WORD_EMBEDDING_OBJ)

.PHONY: all path multisense_word_embedding clean
@@ -0,0 +1,12 @@
# build word_embedding

git clone https://github.com/msraai/multiverso

cd multiverso
cd third_party
sh install.sh
cd ..
make -j4 all

cd ..
make -j4
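If both builds succeed, the Makefile above places the final binary under bin/. A quick sanity check (assuming a POSIX shell):

ls bin/multisense_word_embedding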
@@ -0,0 +1,60 @@
#include "DataBlock.h"

size_t DataBlock::Size()
{
    return m_sentences.size();
}

void DataBlock::Add(int *head, int sentence_length, int64_t word_count, uint64_t next_random)
{
    Sentence sentence(head, sentence_length, word_count, next_random);
    m_sentences.push_back(sentence);
}

void DataBlock::UpdateNextRandom()
{
    for (int i = 0; i < m_sentences.size(); ++i)
        m_sentences[i].next_random *= (uint64_t)rand();
}

void DataBlock::Get(int index, int* &head, int &sentence_length, int64_t &word_count, uint64_t &next_random)
{
    if (index >= 0 && index < m_sentences.size())
    {
        m_sentences[index].Get(head, sentence_length, word_count, next_random);
    }
    else
    {
        head = nullptr;
        sentence_length = 0;
        word_count = 0;
        next_random = 0;
    }
}

void DataBlock::ReleaseSentences()
{
    for (int i = 0; i < m_sentences.size(); ++i)
        delete[] m_sentences[i].head; // sentences are allocated with new[]
    m_sentences.clear();
}

void DataBlock::AddTable(int table_id)
{
    m_tables.push_back(table_id);
}

std::vector<int> & DataBlock::GetTables()
{
    return m_tables;
}

void DataBlock::SetEpochId(const int epoch_id)
{
    m_epoch_id = epoch_id;
}

int DataBlock::GetEpochId()
{
    return m_epoch_id;
}
@@ -0,0 +1,76 @@
#pragma once

/*!
 * \file DataBlock.h
 * \brief Defines class DataBlock to store the necessary data for trainer and param_loader
 * \author
 * - v-fetia
 */
#include "Util.h"
#include <multiverso.h>
#include "HuffmanEncoder.h"

/*!
 * \brief The class DataBlock stores the training data for trainer and param_loader
 */
class DataBlock : public multiverso::DataBlockBase
{
public:
    /*!
     * \brief Get the number of sentences stored in DataBlock
     * \return the number of sentences
     */
    size_t Size();
    /*!
     * \brief Add a new sentence to the DataBlock
     * \param sentence the starting address of the sentence
     * \param sentence_length the length of the sentence
     * \param word_count the number of words read from the train file to produce the sentence
     * \param next_random the seed for generating random numbers
     */
    void Add(int *sentence, int sentence_length, int64_t word_count, uint64_t next_random);
    /*!
     * \brief Get the information of the index-th sentence
     * \param index the id of the sentence
     * \param sentence the starting address of the sentence
     * \param sentence_length the number of words in the sentence
     * \param word_count the number of words read from the train file to produce the sentence
     * \param next_random the seed for generating random numbers
     */
    void Get(int index, int* &sentence, int &sentence_length, int64_t &word_count, uint64_t &next_random);

    void UpdateNextRandom();

    void AddTable(int table_id);

    std::vector<int> & GetTables();

    void ReleaseSentences();

    int GetEpochId();

    void SetEpochId(const int epoch_id);

private:
    struct Sentence
    {
        int* head;
        int length;
        int64_t word_count;
        uint64_t next_random;
        Sentence(int *head, int length, int64_t word_count, uint64_t next_random)
            :head(head), length(length), word_count(word_count), next_random(next_random){}
        void Get(int* &local_head, int &sentence_length, int64_t &local_word_count, uint64_t &local_next_random)
        {
            local_head = head;
            sentence_length = length;
            local_word_count = word_count;
            local_next_random = next_random;
        }
    };

    std::vector<int> m_tables;
    std::vector<Sentence> m_sentences;
    int m_epoch_id;
};
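A minimal usage sketch of the DataBlock API above (the word indices and seed are made up):

DataBlock block;
int *words = new int[3];             // word indices from a Dictionary
words[0] = 12; words[1] = 7; words[2] = 3;
block.Add(words, 3, 3, 20161015ULL); // sentence, length, word_count, next_random

int *head; int length; int64_t count; uint64_t seed;
block.Get(0, head, length, count, seed); // head == words, length == 3
block.ReleaseSentences();                // frees every sentence with delete[]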
@@ -0,0 +1,210 @@
#include "Dictionary.h"

Dictionary::Dictionary()
{
    combine = 0;
    Clear();
}

Dictionary::Dictionary(int i)
{
    combine = i;
    Clear();
}

void Dictionary::Clear()
{
    m_word_idx_map.clear();
    m_word_info.clear();
    m_word_whitelist.clear();
}

void Dictionary::SetWhiteList(const std::vector<std::string>& whitelist)
{
    for (unsigned int i = 0; i < whitelist.size(); ++i)
        m_word_whitelist.insert(whitelist[i]);
}

void Dictionary::MergeInfrequentWords(int64_t threshold)
{
    m_word_idx_map.clear();
    std::vector<WordInfo> tmp_info;
    tmp_info.clear();
    int infreq_idx = -1;

    for (auto& word_info : m_word_info)
    {
        if (word_info.freq >= threshold || word_info.freq == 0 || m_word_whitelist.count(word_info.word))
        {
            m_word_idx_map[word_info.word] = static_cast<int>(tmp_info.size());
            tmp_info.push_back(word_info);
        }
        else
        {
            if (infreq_idx < 0)
            {
                WordInfo infreq_word_info;
                infreq_word_info.word = "WE_ARE_THE_INFREQUENT_WORDS";
                infreq_word_info.freq = 0;
                m_word_idx_map[infreq_word_info.word] = static_cast<int>(tmp_info.size());
                infreq_idx = static_cast<int>(tmp_info.size());
                tmp_info.push_back(infreq_word_info);
            }
            m_word_idx_map[word_info.word] = infreq_idx;
            tmp_info[infreq_idx].freq += word_info.freq;
        }
    }
    m_word_info = tmp_info;
}

void Dictionary::RemoveWordsLessThan(int64_t min_count)
{
    m_word_idx_map.clear();
    std::vector<WordInfo> tmp_info;
    tmp_info.clear();
    for (auto& info : m_word_info)
    {
        if (info.freq >= min_count || info.freq == 0 || m_word_whitelist.count(info.word))
        {
            m_word_idx_map[info.word] = static_cast<int>(tmp_info.size());
            tmp_info.push_back(info);
        }
    }
    m_word_info = tmp_info;
}

void Dictionary::Insert(const char* word, int64_t cnt)
{
    const auto& it = m_word_idx_map.find(word);
    if (it != m_word_idx_map.end())
        m_word_info[it->second].freq += cnt;
    else
    {
        m_word_idx_map[word] = static_cast<int>(m_word_info.size());
        m_word_info.push_back(WordInfo(word, cnt));
    }
}

void Dictionary::LoadFromFile(const char* filename)
{
    FILE* fid = fopen(filename, "r");

    if (fid)
    {
        char sz_label[MAX_WORD_SIZE];

        while (fscanf(fid, "%s", sz_label) != EOF)
        {
            int freq;
            fscanf(fid, "%d", &freq);
            Insert(sz_label, freq);
        }
        fclose(fid);
    }
}

void Dictionary::LoadTriLetterFromFile(const char* filename, unsigned int min_cnt, unsigned int letter_count)
{
    FILE* fid = fopen(filename, "r");
    if (fid)
    {
        char sz_label[MAX_WORD_SIZE];
        while (fscanf(fid, "%s", sz_label) != EOF)
        {
            int freq;
            fscanf(fid, "%d", &freq);
            if (static_cast<unsigned int>(freq) < min_cnt) continue;

            // Construct tri-letters from the word
            size_t len = strlen(sz_label);
            if (len >= MAX_WORD_SIZE)
            {
                printf("ignore super long term\n");
                continue;
            }

            char tri_letters[MAX_WORD_SIZE + 2];
            tri_letters[0] = '#';
            int i = 0;
            for (i = 0; i < strlen(sz_label); i++)
            {
                tri_letters[i + 1] = sz_label[i];
            }

            tri_letters[i + 1] = '#';
            tri_letters[i + 2] = 0;
            if (combine) Insert(sz_label, freq);

            if (strlen(tri_letters) <= letter_count)
            {
                Insert(tri_letters, freq);
            }
            else
            {
                for (i = 0; i <= strlen(tri_letters) - letter_count; ++i)
                {
                    char tri_word[MAX_WORD_SIZE];
                    unsigned int j = 0;
                    for (j = 0; j < letter_count; j++)
                    {
                        tri_word[j] = tri_letters[i + j];
                    }
                    tri_word[j] = 0;
                    Insert(tri_word, freq);
                }
            }
        }
        fclose(fid);
    }
}

int Dictionary::GetWordIdx(const char* word)
{
    const auto& it = m_word_idx_map.find(word);
    if (it != m_word_idx_map.end())
        return it->second;
    return -1;
}

int Dictionary::Size()
{
    return static_cast<int>(m_word_info.size());
}

const WordInfo* Dictionary::GetWordInfo(const char* word)
{
    const auto& it = m_word_idx_map.find(word);
    if (it != m_word_idx_map.end())
        return GetWordInfo(it->second);
    return NULL;
}

const WordInfo* Dictionary::GetWordInfo(int word_idx)
{
    if (word_idx >= 0 && word_idx < m_word_info.size())
        return &m_word_info[word_idx];
    return NULL;
}

void Dictionary::StartIteration()
{
    m_word_iterator = m_word_info.begin();
}

bool Dictionary::HasMore()
{
    return m_word_iterator != m_word_info.end();
}

const WordInfo* Dictionary::Next()
{
    const WordInfo* entry = &(*m_word_iterator);
    ++m_word_iterator;
    return entry;
}

std::vector<WordInfo>::iterator Dictionary::Begin()
{
    return m_word_info.begin();
}

std::vector<WordInfo>::iterator Dictionary::End()
{
    return m_word_info.end();
}
@@ -0,0 +1,55 @@
#pragma once

#include <string>
#include <vector>
#include <unordered_map>
#include <unordered_set>
#include "Util.h"

const int MAX_WORD_SIZE = 901;

struct WordInfo
{
    std::string word;
    int64_t freq;
    WordInfo()
    {
        freq = 0;
        word.clear();
    }
    WordInfo(const std::string& _word, int64_t _freq)
    {
        word = _word;
        freq = _freq;
    }
};

class Dictionary
{
public:
    Dictionary();
    Dictionary(int i);
    void Clear();
    void SetWhiteList(const std::vector<std::string>& whitelist);
    void RemoveWordsLessThan(int64_t min_count);
    void MergeInfrequentWords(int64_t threshold);
    void Insert(const char* word, int64_t cnt = 1);
    void LoadFromFile(const char* filename);
    void LoadTriLetterFromFile(const char* filename, unsigned int min_cnt = 1, unsigned int letter_count = 3);
    int GetWordIdx(const char* word);
    const WordInfo* GetWordInfo(const char* word);
    const WordInfo* GetWordInfo(int word_idx);
    int Size();
    void StartIteration();
    bool HasMore();
    const WordInfo* Next();
    std::vector<WordInfo>::iterator Begin();
    std::vector<WordInfo>::iterator End();

private:
    int combine;
    std::vector<WordInfo> m_word_info;
    std::vector<WordInfo>::iterator m_word_iterator;
    std::unordered_map<std::string, int> m_word_idx_map;
    std::unordered_set<std::string> m_word_whitelist;
};
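A minimal sketch of how the Dictionary above is used elsewhere in this commit (the word counts are made up):

Dictionary dict;
dict.Insert("the", 120);   // accumulate counts word by word
dict.Insert("cat", 3);
dict.Insert("the");        // default cnt = 1, so freq("the") is now 121
dict.RemoveWordsLessThan(5);            // drops "cat"
int idx = dict.GetWordIdx("the");       // dense index, or -1 if absent
const WordInfo* info = dict.GetWordInfo(idx); // info->word == "the", info->freq == 121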
@@ -0,0 +1,266 @@
#include "HuffmanEncoder.h"
#include <algorithm>
#include <assert.h>
#include <cstring>

HuffmanEncoder::HuffmanEncoder()
{
    m_dict = NULL;
}

void HuffmanEncoder::Save2File(const char* filename)
{
    FILE* fid = fopen(filename, "w");
    if (fid)
    {
        fprintf(fid, "%lld\n", static_cast<long long>(m_hufflabel_info.size()));

        for (unsigned i = 0; i < m_hufflabel_info.size(); ++i)
        {
            const auto& info = m_hufflabel_info[i];
            const auto& word = m_dict->GetWordInfo(i);
            fprintf(fid, "%s %d", word->word.c_str(), info.codelen);

            for (int j = 0; j < info.codelen; ++j)
                fprintf(fid, " %d", info.code[j]);

            for (int j = 0; j < info.codelen; ++j)
                fprintf(fid, " %d", info.point[j]);

            fprintf(fid, "\n");
        }

        fclose(fid);
    }
    else
    {
        printf("file open failed %s", filename);
    }
}

void HuffmanEncoder::RecoverFromFile(const char* filename)
{
    m_dict = new Dictionary();
    FILE* fid = fopen(filename, "r");
    if (fid)
    {
        int vocab_size;
        fscanf(fid, "%d", &vocab_size);
        m_hufflabel_info.reserve(vocab_size);
        m_hufflabel_info.clear();

        int tmp;
        char sz_label[MAX_WORD_SIZE];
        for (int i = 0; i < vocab_size; ++i)
        {
            HuffLabelInfo info;

            fscanf(fid, "%s", sz_label);
            m_dict->Insert(sz_label);

            fscanf(fid, "%d", &info.codelen);

            info.code.clear();
            info.point.clear();

            for (int j = 0; j < info.codelen; ++j)
            {
                fscanf(fid, "%d", &tmp);
                info.code.push_back(tmp);
            }
            for (int j = 0; j < info.codelen; ++j)
            {
                fscanf(fid, "%d", &tmp);
                info.point.push_back(tmp);
            }

            m_hufflabel_info.push_back(info);
        }
        fclose(fid);
    }
    else
    {
        printf("file open failed %s", filename);
    }
}

bool compare(const std::pair<int, int64_t>& x, const std::pair<int, int64_t>& y)
{
    // Zero-frequency entries sort first, the rest in descending frequency.
    // (Both branches must agree so that the comparator is a strict weak ordering.)
    if ((x.second == 0) != (y.second == 0)) return x.second == 0;
    return (x.second > y.second);
}

void HuffmanEncoder::BuildHuffmanTreeFromDict()
{
    std::vector<std::pair<int, int64_t> > ordered_words;
    ordered_words.reserve(m_dict->Size());
    ordered_words.clear();
    for (unsigned i = 0; i < static_cast<unsigned>(m_dict->Size()); ++i)
        ordered_words.push_back(std::pair<int, int64_t>(i, m_dict->GetWordInfo(i)->freq));
    std::sort(ordered_words.begin(), ordered_words.end(), compare);

    unsigned vocab_size = (unsigned)ordered_words.size();
    int64_t *count = new int64_t[vocab_size * 2 + 1]; // frequency of each node
    unsigned *binary = new unsigned[vocab_size * 2 + 1]; // Huffman code (0 or 1) of each node relative to its parent
    memset(binary, 0, sizeof(unsigned) * (vocab_size * 2 + 1));

    unsigned *parent_node = new unsigned[vocab_size * 2 + 1]; // parent node index of each node
    memset(parent_node, 0, sizeof(unsigned) * (vocab_size * 2 + 1));
    unsigned code[MAX_CODE_LENGTH], point[MAX_CODE_LENGTH];

    for (unsigned i = 0; i < vocab_size; ++i)
        count[i] = ordered_words[i].second;
    for (unsigned i = vocab_size; i < vocab_size * 2; i++)
        count[i] = static_cast<int64_t>(1e15);
    int pos1 = vocab_size - 1;
    int pos2 = vocab_size;
    int min1i, min2i;
    for (unsigned i = 0; i < vocab_size - 1; i++)
    {
        // First, find the two smallest nodes 'min1, min2'
        assert(pos2 < vocab_size * 2 - 1);
        // find the smallest node
        if (pos1 >= 0)
        {
            if (count[pos1] < count[pos2])
            {
                min1i = pos1;
                pos1--;
            }
            else
            {
                min1i = pos2;
                pos2++;
            }
        }
        else
        {
            min1i = pos2;
            pos2++;
        }

        // find the second smallest node
        if (pos1 >= 0)
        {
            if (count[pos1] < count[pos2])
            {
                min2i = pos1;
                pos1--;
            }
            else
            {
                min2i = pos2;
                pos2++;
            }
        }
        else
        {
            min2i = pos2;
            pos2++;
        }

        count[vocab_size + i] = count[min1i] + count[min2i];

        assert(min1i >= 0 && min1i < vocab_size * 2 - 1 && min2i >= 0 && min2i < vocab_size * 2 - 1);
        parent_node[min1i] = vocab_size + i;
        parent_node[min2i] = vocab_size + i;
        binary[min2i] = 1;
    }
    assert(pos1 < 0);

    // generate the Huffman code for each leaf node
    m_hufflabel_info.clear();
    for (unsigned a = 0; a < vocab_size; ++a)
        m_hufflabel_info.push_back(HuffLabelInfo());
    for (unsigned a = 0; a < vocab_size; a++)
    {
        unsigned b = a, i = 0;
        while (1)
        {
            assert(i < MAX_CODE_LENGTH);
            code[i] = binary[b];
            point[i] = b;
            i++;
            b = parent_node[b];
            if (b == vocab_size * 2 - 2) break;
        }
        unsigned cur_word = ordered_words[a].first;

        m_hufflabel_info[cur_word].codelen = i;
        m_hufflabel_info[cur_word].point.push_back(vocab_size - 2);

        for (b = 0; b < i; b++)
        {
            m_hufflabel_info[cur_word].code.push_back(code[i - b - 1]);
            if (b)
                m_hufflabel_info[cur_word].point.push_back(point[i - b] - vocab_size);
        }
    }

    delete[] count;
    count = nullptr;
    delete[] binary;
    binary = nullptr;
    delete[] parent_node;
    parent_node = nullptr;
}
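// Worked example of the construction above (hypothetical frequencies):
// for a vocabulary {a:5, b:2, c:1}, the two merges are (c,b) -> 3 and
// (3,a) -> 8, so the root is internal node vocab_size*2-2 and the codes are
//   a -> "1"   (one step from the root)
//   b -> "01"  (via the merged node)
//   c -> "00"
// Frequent words get short codes, which keeps the average codelen small and
// makes the hierarchical-softmax updates in the trainer cheap.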

void HuffmanEncoder::BuildFromTermFrequency(const char* filename)
{
    FILE* fid = fopen(filename, "r");
    if (fid)
    {
        char sz_label[MAX_WORD_SIZE];
        m_dict = new Dictionary();

        while (fscanf(fid, "%s", sz_label) != EOF)
        {
            int freq;
            fscanf(fid, "%d", &freq);
            m_dict->Insert(sz_label, freq);
        }
        fclose(fid);

        BuildHuffmanTreeFromDict();
    }
    else
    {
        printf("file open failed %s", filename);
    }
}

void HuffmanEncoder::BuildFromTermFrequency(Dictionary* dict)
{
    m_dict = dict;
    BuildHuffmanTreeFromDict();
}

int HuffmanEncoder::GetLabelSize()
{
    return m_dict->Size();
}

int HuffmanEncoder::GetLabelIdx(const char* label)
{
    return m_dict->GetWordIdx(label);
}

HuffLabelInfo* HuffmanEncoder::GetLabelInfo(char* label)
{
    int idx = GetLabelIdx(label);
    if (idx == -1)
        return NULL;
    return GetLabelInfo(idx);
}

HuffLabelInfo* HuffmanEncoder::GetLabelInfo(int label_idx)
{
    if (label_idx == -1) return NULL;
    return &m_hufflabel_info[label_idx];
}

Dictionary* HuffmanEncoder::GetDict()
{
    return m_dict;
}
@@ -0,0 +1,40 @@
#pragma once

#include "Dictionary.h"

const int MAX_CODE_LENGTH = 100;

struct HuffLabelInfo
{
    std::vector<int> point; // internal node ids along the code path
    std::vector<char> code; // Huffman code
    int codelen;
    HuffLabelInfo()
    {
        codelen = 0;
        point.clear();
        code.clear();
    }
};

class HuffmanEncoder
{
public:
    HuffmanEncoder();

    void Save2File(const char* filename);
    void RecoverFromFile(const char* filename);
    void BuildFromTermFrequency(const char* filename);
    void BuildFromTermFrequency(Dictionary* dict);

    int GetLabelSize();
    int GetLabelIdx(const char* label);
    HuffLabelInfo* GetLabelInfo(char* label);
    HuffLabelInfo* GetLabelInfo(int label_idx);
    Dictionary* GetDict();

private:
    void BuildHuffmanTreeFromDict();
    std::vector<HuffLabelInfo> m_hufflabel_info;
    Dictionary* m_dict;
};
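A minimal usage sketch of the encoder API above, following how it is wired up in this commit (the word, frequency, and output path are made up):

Dictionary* dict = new Dictionary();
dict->Insert("hello", 42);

HuffmanEncoder encoder;
encoder.BuildFromTermFrequency(dict);   // builds codes over the whole vocabulary

HuffLabelInfo* info = encoder.GetLabelInfo(encoder.GetLabelIdx("hello"));
// info->code holds the 0/1 Huffman code, info->point the internal-node ids
// visited from the root; both have length info->codelen.
encoder.Save2File("huffman.txt");       // hypothetical output path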
@@ -0,0 +1,110 @@
#include "Log.h"

#include <cstdio>
#include <cstdlib>

LogLevel Logger::level_ = LogLevel::Info;
std::FILE* Logger::file_ = nullptr;

Logger::Logger()
{
    level_ = LogLevel::Info;
    file_ = nullptr;
}

Logger::~Logger()
{
    CloseLogFile();
}

void Logger::Reset(std::string filename, LogLevel level)
{
    level_ = level;
    file_ = nullptr;
    ResetLogFile(filename);
}

int Logger::ResetLogFile(std::string filename)
{
    // close the current log file
    CloseLogFile();
    // If a filename is specified, try to open it; messages go only to
    // standard output if the filename is empty or opening fails.
    if (filename.size() > 0)
    {
        file_ = fopen(filename.c_str(), "w");
        if (file_ == nullptr) // failed to open the file
        {
            Printf(LogLevel::Error, "Cannot create log file %s\n",
                filename.c_str());
            return -1;
        }
    }
    return 0;
}

void Logger::ResetLogLevel(LogLevel level)
{
    level_ = level;
}

int Logger::Printf(LogLevel level, const char *format, ...)
{
    // skip messages below the configured level
    if (level < level_)
    {
        return 0;
    }

    std::string level_str = GetLevelStr(level);
    std::string time_str = GetSystemTime();
    va_list val;
    va_start(val, format);
    // write the message to standard output
    printf("[%s] [%s] ", level_str.c_str(), time_str.c_str());
    int ret = vprintf(format, val);
    fflush(stdout);
    // write the message to the log file; a va_list may only be consumed
    // once, so use a fresh copy for the second traversal
    if (file_ != nullptr)
    {
        va_list val_copy;
        va_copy(val_copy, val);
        fprintf(file_, "[%s] [%s] ", level_str.c_str(), time_str.c_str());
        vfprintf(file_, format, val_copy);
        fflush(file_);
        va_end(val_copy);
    }
    va_end(val);

    // If it is a FATAL error, kill the process
    if (LogLevel::Fatal == level)
    {
        CloseLogFile();
        exit(1);
    }

    return ret;
}

void Logger::CloseLogFile()
{
    if (file_ != nullptr)
    {
        fclose(file_);
        file_ = nullptr;
    }
}

std::string Logger::GetSystemTime()
{
    time_t t = time(0);
    char str[64];
    strftime(str, sizeof(str), "%Y-%m-%d %H:%M:%S", localtime(&t));
    return str;
}

std::string Logger::GetLevelStr(LogLevel level)
{
    switch (level)
    {
    case LogLevel::Debug: return "DEBUG";
    case LogLevel::Info: return "INFO";
    case LogLevel::Error: return "ERROR";
    case LogLevel::Fatal: return "FATAL";
    default: return "UNKNOWN";
    }
}
@@ -0,0 +1,81 @@
#pragma once

#include <cstdio>
#include <fstream>
#include <string>
#include <cstdarg>
#include <ctime>

/*!
 * \brief An enumeration type of log message levels.
 * \note The values are ordered: DEBUG < INFO < ERROR < FATAL.
 */
enum class LogLevel : int
{
    Debug = 0,
    Info = 1,
    Error = 2,
    Fatal = 3
};

/*!
 * \brief The class Logger is responsible for writing log messages to
 *        standard output or a log file.
 */
class Logger
{
public:
    /*!
     * \brief Creates an instance of class Logger.
     *
     * By default, the log messages will be written to standard output with
     * a minimal level of INFO. Users are able to further set the log file
     * or log level with the corresponding methods.
     */
    Logger();
    ~Logger();

    /*!
     * \brief Resets the setting of the Logger by specifying the log file
     *        and log level.
     *
     * The log message will be written to both standard output and the file
     * (if created successfully).
     * \param filename Log file name
     * \param level Log level
     */
    static void Reset(std::string filename, LogLevel level = LogLevel::Info);

    /*!
     * \brief Resets the log file.
     * \param filename The new log filename. If it is empty, the Logger
     *        will close the current log file (if it exists).
     */
    static int ResetLogFile(std::string filename);
    /*!
     * \brief Resets the log level.
     * \param level The new log level.
     */
    static void ResetLogLevel(LogLevel level);

    /*!
     * \brief C-style formatted method for writing log messages. A message
     *        has the following format: [LEVEL] [TIME] message
     * \param level The log level of this message.
     * \param format The C format string.
     * \param ... Output items.
     * \return Returns a nonnegative integer on success,
     *         or a negative number on error.
     */
    static int Printf(LogLevel level, const char *format, ...);

private:
    static void CloseLogFile();
    // Returns the current system time as a string.
    static std::string GetSystemTime();
    // Returns the string of a log level.
    static std::string GetLevelStr(LogLevel level);

    static LogLevel level_; // Only messages at or above level_ will be output.
    static std::FILE *file_; // A file pointer to the log file.
};
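A short usage sketch of the Logger API above (the file name and message contents are arbitrary):

Logger::Reset("train.log", LogLevel::Debug); // log to stdout and train.log
Logger::Printf(LogLevel::Info, "epoch %d done, ll = %lf\n", 3, -1.25);
Logger::Printf(LogLevel::Debug, "shown only because the level is Debug\n");
// A Fatal message flushes and closes the log file, then calls exit(1).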
@@ -0,0 +1,124 @@
#include <thread>
#include <string>
#include <iostream>
#include <cstring>
#include <cmath>
#include <vector>
#include <fstream>
#include <sstream>
#include <multiverso.h>
#include <barrier.h>

#include "Dictionary.h"
#include "HuffmanEncoder.h"
#include "Util.h"
#include "Reader.h"
#include "MultiversoSkipGramMixture.h"
#include "ParamLoader.h"
#include "Trainer.h"
#include "SkipGramMixtureNeuralNetwork.h"

bool ReadWord(char *word, FILE *fin)
{
    int idx = 0;
    char ch;
    while (!feof(fin))
    {
        ch = fgetc(fin);
        if (ch == 13) continue; // skip carriage returns
        if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
        {
            if (idx > 0)
            {
                if (ch == '\n')
                    ungetc(ch, fin);
                break;
            }

            if (ch == '\n')
            {
                strcpy(word, (char *)"</s>");
                return true;
            }
            else
            {
                continue;
            }
        }

        word[idx++] = ch;
        if (idx >= MAX_STRING - 1) idx--; // Truncate words that are too long
    }

    word[idx] = 0;
    return idx > 0;
}

// Read the vocabulary file; create the dictionary and huffman_encoder according to opt
int64_t LoadVocab(Option *opt, Dictionary *dictionary, HuffmanEncoder *huffman_encoder)
{
    int64_t total_words = 0;
    char word[MAX_STRING];
    FILE* fid = nullptr;
    printf("vocab_file %s\n", opt->read_vocab_file);
    if (opt->read_vocab_file != nullptr && strlen(opt->read_vocab_file) > 0)
    {
        printf("Begin to load vocabulary file [%s] ...\n", opt->read_vocab_file);
        fid = fopen(opt->read_vocab_file, "r");
        int word_freq;
        while (fscanf(fid, "%s %d", word, &word_freq) != EOF)
        {
            dictionary->Insert(word, word_freq);
        }
    }

    dictionary->RemoveWordsLessThan(opt->min_count);
    printf("Dictionary size: %d\n", dictionary->Size());
    total_words = 0;
    for (int i = 0; i < dictionary->Size(); ++i)
        total_words += dictionary->GetWordInfo(i)->freq;
    printf("Words in Corpus %lld\n", static_cast<long long>(total_words));
    huffman_encoder->BuildFromTermFrequency(dictionary);
    if (fid != nullptr)
        fclose(fid);

    return total_words;
}
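// The vocabulary file parsed above is plain text with one "word frequency"
// pair per whitespace-separated record, e.g. (contents made up):
//
//   the 120034
//   cat 532
//   </s> 98012
//
// Words whose frequency falls below opt->min_count are dropped before the
// Huffman tree is built.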

int main(int argc, char *argv[])
{
    srand(static_cast<unsigned int>(time(NULL)));
    Option *option = new Option();
    Dictionary *dictionary = new Dictionary();
    HuffmanEncoder *huffman_encoder = new HuffmanEncoder();

    // Parse the arguments and store them in option
    option->ParseArgs(argc, argv);
    option->PrintArgs();
    if (!option->CheckArgs())
    {
        printf("Fatal error in arguments\n");
        return -1;
    }
    // Read the vocabulary file; create the dictionary and huffman_encoder according to opt
    printf("Loading vocabulary ...\n");
    option->total_words = LoadVocab(option, dictionary, huffman_encoder);
    printf("Loaded vocabulary\n");
    fflush(stdout);

    Reader *reader = new Reader(dictionary, option);

    MultiversoSkipGramMixture *multiverso_word2vector = new MultiversoSkipGramMixture(option, dictionary, huffman_encoder, reader);

    fflush(stdout);

    multiverso_word2vector->Train(argc, argv);

    delete multiverso_word2vector;
    delete reader;
    delete huffman_encoder;
    delete dictionary;
    delete option;

    return 0;
}
@@ -0,0 +1,271 @@
#include "MultiversoSkipGramMixture.h"
#include <algorithm>

MultiversoSkipGramMixture::MultiversoSkipGramMixture(Option *option, Dictionary *dictionary, HuffmanEncoder *huffman_encoder, Reader *reader)
{
    m_option = option;
    m_dictionary = dictionary;
    m_huffman_encoder = huffman_encoder;
    m_reader = reader;

    InitSenseCntInfo();
}

void MultiversoSkipGramMixture::InitSenseCntInfo()
{
    // First, determine the number of senses for each word according to the configuration parameters top_N and top_ratio
    int threshold = (m_option->top_N ? std::min(m_option->top_N, m_dictionary->Size()) : m_dictionary->Size());
    threshold = static_cast<int>(std::min(static_cast<real>(m_option->top_ratio) * m_dictionary->Size(), static_cast<real>(threshold)));

    m_word_sense_info.total_senses_cnt = threshold * m_option->sense_num_multi + (m_dictionary->Size() - threshold);

    std::pair<int, int64_t>* wordlist = new std::pair<int, int64_t>[m_dictionary->Size() + 10];
    for (int i = 0; i < m_dictionary->Size(); ++i)
        wordlist[i] = std::pair<int, int64_t>(i, m_dictionary->GetWordInfo(i)->freq);

    std::sort(wordlist, wordlist + m_dictionary->Size(), [](std::pair<int, int64_t> a, std::pair<int, int64_t> b) {
        return a.second > b.second;
    });

    m_word_sense_info.word_sense_cnts_info.resize(m_dictionary->Size());

    for (int i = 0; i < threshold; ++i)
        m_word_sense_info.word_sense_cnts_info[wordlist[i].first] = m_option->sense_num_multi;
    for (int i = threshold; i < m_dictionary->Size(); ++i)
        m_word_sense_info.word_sense_cnts_info[wordlist[i].first] = 1;
    delete[] wordlist;

    // Then, read the per-word sense counts from the sense file
    if (m_option->sense_file)
    {
        FILE* fid = fopen(m_option->sense_file, "r");
        if (fid)
        {
            char word[1000];
            while (fscanf(fid, "%s", word) != EOF)
            {
                int word_idx = m_dictionary->GetWordIdx(word);
                if (word_idx == -1)
                    continue;
                if (m_word_sense_info.word_sense_cnts_info[word_idx] == 1)
                {
                    m_word_sense_info.word_sense_cnts_info[word_idx] = m_option->sense_num_multi;
                    m_word_sense_info.total_senses_cnt += (m_option->sense_num_multi - 1);
                }
            }
            fclose(fid);
        }
    }

    // At last, point the embedding pointers to the right positions
    m_word_sense_info.p_input_embedding.resize(m_dictionary->Size());
    int cnt = 0;
    m_word_sense_info.multi_senses_words_cnt = 0;

    for (int i = 0; i < m_dictionary->Size(); ++i)
    {
        m_word_sense_info.p_input_embedding[i] = cnt;
        if (m_word_sense_info.word_sense_cnts_info[i] > 1)
            m_word_sense_info.p_wordidx2sense_idx[i] = m_word_sense_info.multi_senses_words_cnt++;
        cnt += m_word_sense_info.word_sense_cnts_info[i];
    }

    printf("Total senses: %d, total multi-sense words: %d\n", m_word_sense_info.total_senses_cnt, m_word_sense_info.multi_senses_words_cnt);
}
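// Worked example of the sizing above (all numbers hypothetical): with a
// 10000-word dictionary, top_N = 2000, top_ratio = 0.05 and
// sense_num_multi = 3, the threshold is min(2000, 0.05 * 10000) = 500, so
// the 500 most frequent words get 3 senses each and the rest get one:
// total_senses_cnt = 500 * 3 + 9500 = 11000 embedding rows, and
// p_input_embedding[w] is the row offset of word w's first sense.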

void MultiversoSkipGramMixture::Train(int argc, char *argv[])
{
    multiverso::Barrier* barrier = new multiverso::Barrier(m_option->thread_cnt);

    printf("Initialized barrier\n");

    SkipGramMixtureNeuralNetwork<real>* word2vector_neural_networks[2] = { new SkipGramMixtureNeuralNetwork<real>(m_option, m_huffman_encoder, &m_word_sense_info, m_dictionary, m_dictionary->Size()),
        new SkipGramMixtureNeuralNetwork<real>(m_option, m_huffman_encoder, &m_word_sense_info, m_dictionary, m_dictionary->Size()) };

    // Create the Multiverso ParameterLoader and Trainers,
    // and start the Multiverso environment
    printf("Initializing Multiverso ...\n");

    fflush(stdout);
    std::vector<multiverso::TrainerBase*> trainers;
    for (int i = 0; i < m_option->thread_cnt; ++i)
    {
        trainers.push_back(new Trainer<real>(i, m_option, (void**)word2vector_neural_networks, barrier, m_dictionary, &m_word_sense_info, m_huffman_encoder));
    }

    ParameterLoader<real> *parameter_loader = new ParameterLoader<real>(m_option, (void**)word2vector_neural_networks, &m_word_sense_info);
    multiverso::Config config;
    config.max_delay = m_option->max_delay;
    config.num_servers = m_option->num_servers;
    config.num_aggregator = m_option->num_aggregator;
    config.lock_option = static_cast<multiverso::LockOption>(m_option->lock_option);
    config.num_lock = m_option->num_lock;
    config.is_pipeline = m_option->pipline;

    fflush(stdout);

    multiverso::Multiverso::Init(trainers, parameter_loader, config, &argc, &argv);

    fflush(stdout);
    multiverso::Log::ResetLogFile("log.txt");
    m_process_id = multiverso::Multiverso::ProcessRank();
    PrepareMultiversoParameterTables(m_option, m_dictionary);

    printf("Start to train ...\n");
    TrainNeuralNetwork();
    printf("Rank %d Finish training\n", m_process_id);

    delete barrier;
    delete word2vector_neural_networks[0];
    delete word2vector_neural_networks[1];
    for (auto &trainer : trainers)
    {
        delete trainer;
    }
    delete parameter_loader;
    multiverso::Multiverso::Close();
}

void MultiversoSkipGramMixture::AddMultiversoParameterTable(multiverso::integer_t table_id, multiverso::integer_t rows,
    multiverso::integer_t cols, multiverso::Type type, multiverso::Format default_format)
{
    multiverso::Multiverso::AddServerTable(table_id, rows, cols, type, default_format);
    multiverso::Multiverso::AddCacheTable(table_id, rows, cols, type, default_format, 0);
    multiverso::Multiverso::AddAggregatorTable(table_id, rows, cols, type, default_format, 0);
}

void MultiversoSkipGramMixture::PrepareMultiversoParameterTables(Option *opt, Dictionary *dictionary)
{
    multiverso::Multiverso::BeginConfig();
    int proc_count = multiverso::Multiverso::TotalProcessCount();

    // create tables
    AddMultiversoParameterTable(kInputEmbeddingTableId, m_word_sense_info.total_senses_cnt, opt->embeding_size, multiverso::Type::Float, multiverso::Format::Dense);
    AddMultiversoParameterTable(kEmbeddingOutputTableId, dictionary->Size(), opt->embeding_size, multiverso::Type::Float, multiverso::Format::Dense);
    AddMultiversoParameterTable(kWordCountActualTableId, 1, 1, multiverso::Type::LongLong, multiverso::Format::Dense);
    AddMultiversoParameterTable(kWordSensePriorTableId, m_word_sense_info.multi_senses_words_cnt, m_option->sense_num_multi, multiverso::Type::Float, multiverso::Format::Dense);

    // initialize the input embeddings
    for (int row = 0; row < m_word_sense_info.total_senses_cnt; ++row)
    {
        for (int col = 0; col < opt->embeding_size; ++col)
        {
            multiverso::Multiverso::AddToServer<real>(kInputEmbeddingTableId, row, col, static_cast<real>((static_cast<real>(rand()) / RAND_MAX - 0.5) / opt->embeding_size / proc_count));
        }
    }

    // initialize the sense priors
    for (int row = 0; row < m_word_sense_info.multi_senses_words_cnt; ++row)
    {
        for (int col = 0; col < opt->sense_num_multi; ++col)
        {
            multiverso::Multiverso::AddToServer<real>(kWordSensePriorTableId, row, col,
                static_cast<real>(m_option->store_multinomial ? 1.0 / m_option->sense_num_multi : log(1.0 / m_option->sense_num_multi)));
        }
    }
    multiverso::Multiverso::EndConfig();
}

// Load the sentences from the train file and store them in data_block
void MultiversoSkipGramMixture::LoadData(DataBlock *data_block, Reader *reader, int64_t size)
{
    data_block->ReleaseSentences();
    while (data_block->Size() < m_option->data_block_size)
    {
        int64_t word_count = 0;
        int *sentence = new (std::nothrow)int[MAX_SENTENCE_LENGTH + 2];
        assert(sentence != nullptr);
        int sentence_length = reader->GetSentence(sentence, word_count);
        if (sentence_length > 0)
        {
            data_block->Add(sentence, sentence_length, word_count, (uint64_t)rand() * 10000 + (uint64_t)rand());
        }
        else
        {
            // The reader reached end of file
            delete[] sentence;
            return;
        }
    }
}

void MultiversoSkipGramMixture::PushDataBlock(
    std::queue<DataBlock*> &datablock_queue, DataBlock* data_block)
{
    multiverso::Multiverso::PushDataBlock(data_block);

    datablock_queue.push(data_block);
    // limit the number of queued data blocks to avoid running out of memory
    while (static_cast<int64_t>(datablock_queue.size()) > m_option->max_preload_blocks_cnt)
    {
        std::chrono::milliseconds dura(200);
        std::this_thread::sleep_for(dura);

        RemoveDoneDataBlock(datablock_queue);
    }
}

// Remove the data blocks which have been handled by the parameter loader and trainers
void MultiversoSkipGramMixture::RemoveDoneDataBlock(std::queue<DataBlock*> &datablock_queue)
{
    while (datablock_queue.empty() == false
        && datablock_queue.front()->IsDone())
    {
        DataBlock *p_data_block = datablock_queue.front();
        datablock_queue.pop();
        delete p_data_block;
    }
}

void MultiversoSkipGramMixture::TrainNeuralNetwork()
{
    std::queue<DataBlock*> datablock_queue;
    int data_block_count = 0;

    multiverso::Multiverso::BeginTrain();

    for (int curr_epoch = 0; curr_epoch < m_option->epoch; ++curr_epoch)
    {
        m_reader->Open(m_option->train_file);
        while (1)
        {
            ++data_block_count;
            DataBlock *data_block = new (std::nothrow)DataBlock();
            assert(data_block != nullptr);
            clock_t start = clock();
            LoadData(data_block, m_reader, m_option->data_block_size);
            if (data_block->Size() <= 0)
            {
                delete data_block;
                break;
            }
            multiverso::Log::Info("Rank %d loaded data block %d in %lfs\n", m_process_id, data_block_count,
                (clock() - start) / (double)CLOCKS_PER_SEC);
            multiverso::Multiverso::BeginClock();
            PushDataBlock(datablock_queue, data_block);
            multiverso::Multiverso::EndClock();
        }

        m_reader->Close();

        multiverso::Multiverso::BeginClock();

        DataBlock *output_data_block = new DataBlock(); // Add a special data block for dumping the model files
        output_data_block->AddTable(kInputEmbeddingTableId);
        output_data_block->AddTable(kEmbeddingOutputTableId);
        output_data_block->AddTable(kWordSensePriorTableId);
        output_data_block->SetEpochId(curr_epoch);

        ++data_block_count;
        multiverso::Multiverso::PushDataBlock(output_data_block);
        multiverso::Multiverso::EndClock();
    }

    multiverso::Log::Info("Rank %d pushed %d blocks\n", multiverso::Multiverso::ProcessRank(), data_block_count);

    multiverso::Multiverso::EndTrain();

    // After EndTrain, all the data blocks are done,
    // so we remove all of them
    RemoveDoneDataBlock(datablock_queue);
}
@@ -0,0 +1,78 @@
#pragma once

#include <vector>
#include <queue>
#include <ctime>
#include <stdlib.h>
#include <string.h>
#include <unordered_set>
#include <unordered_map>
#include <multiverso.h>
#include <log.h>

#include "Util.h"
#include "HuffmanEncoder.h"
#include "DataBlock.h"
#include "ParamLoader.h"
#include "Trainer.h"
#include "Reader.h"

class MultiversoSkipGramMixture
{
public:
    MultiversoSkipGramMixture(Option *option, Dictionary *dictionary, HuffmanEncoder *huffman_encoder, Reader *reader);

    void Train(int argc, char *argv[]);

private:
    int m_process_id;
    Option* m_option;
    Dictionary* m_dictionary;
    HuffmanEncoder* m_huffman_encoder;
    Reader* m_reader;

    WordSenseInfo m_word_sense_info;

    /*!
     * \brief Complete the training task with multiverso
     */
    void TrainNeuralNetwork();

    /*!
     * \brief Create a new parameter table in multiverso
     */
    void AddMultiversoParameterTable(multiverso::integer_t table_id, multiverso::integer_t rows,
        multiverso::integer_t cols, multiverso::Type type, multiverso::Format default_format);

    /*!
     * \brief Prepare the parameter tables in multiverso
     */
    void PrepareMultiversoParameterTables(Option *opt, Dictionary *dictionary);

    /*!
     * \brief Load data from the train file into a data block
     * \param data_block the data block to be filled
     * \param reader the reader used to get sentences
     * \param size the data block size limit
     */
    void LoadData(DataBlock *data_block, Reader *reader, int64_t size);

    /*!
     * \brief Push the data block into multiverso and datablock_queue
     */
    void PushDataBlock(std::queue<DataBlock*> &datablock_queue, DataBlock* data_block);

    /*!
     * \brief Remove the data blocks which have been finished by the multiverso threads
     * \param datablock_queue stores the pushed data blocks
     */
    void RemoveDoneDataBlock(std::queue<DataBlock*> &datablock_queue);

    /*!
     * \brief Init the sense count info for all words
     */
    void InitSenseCntInfo();
};
@@ -0,0 +1,10 @@
#pragma once

#include <multiverso.h>
/*!
 * \brief Defines the indices of the parameter tables.
 */
const multiverso::integer_t kInputEmbeddingTableId = 0;   // Input embedding vector table
const multiverso::integer_t kEmbeddingOutputTableId = 1;  // Huffman tree node embedding vector table
const multiverso::integer_t kWordCountActualTableId = 2;  // Word count table
const multiverso::integer_t kWordSensePriorTableId = 3;   // Sense prior table
@@ -0,0 +1,65 @@
#include "ParamLoader.h"

template<typename T>
ParameterLoader<T>::ParameterLoader(Option *option, void** word2vector_neural_networks, WordSenseInfo* word_sense_info)
{
    m_option = option;
    m_parse_and_request_count = 0;
    m_sgmixture_neural_networks = word2vector_neural_networks;
    m_log_file = fopen("parameter_loader.log", "w");
    m_words_sense_info = word_sense_info;
}

template<typename T>
void ParameterLoader<T>::ParseAndRequest(multiverso::DataBlockBase *data_block)
{
    if (m_parse_and_request_count == 0)
    {
        m_start_time = clock();
    }

    fprintf(m_log_file, "%lf\n", (clock() - m_start_time) / (double)CLOCKS_PER_SEC);
    multiverso::Log::Info("Rank %d ParameterLoader begin %d\n", multiverso::Multiverso::ProcessRank(), m_parse_and_request_count);
    DataBlock *data = reinterpret_cast<DataBlock*>(data_block);

    SkipGramMixtureNeuralNetwork<T>* sg_mixture_neural_network = reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(m_sgmixture_neural_networks[m_parse_and_request_count % 2]);
    ++m_parse_and_request_count;
    data->UpdateNextRandom();
    sg_mixture_neural_network->PrepareParmeter(data);

    std::vector<int>& input_layer_nodes = sg_mixture_neural_network->GetInputLayerNodes();
    std::vector<int>& output_layer_nodes = sg_mixture_neural_network->GetOutputLayerNodes();
    assert(sg_mixture_neural_network->status == 0);
    sg_mixture_neural_network->status = 1;

    for (int i = 0; i < input_layer_nodes.size(); ++i)
    {
        int word_id = input_layer_nodes[i];
        for (int j = 0; j < m_words_sense_info->word_sense_cnts_info[word_id]; ++j)
            RequestRow(kInputEmbeddingTableId, m_words_sense_info->p_input_embedding[word_id] + j);
    }

    for (int i = 0; i < output_layer_nodes.size(); ++i)
        RequestRow(kEmbeddingOutputTableId, output_layer_nodes[i]);

    RequestRow(kWordCountActualTableId, 0);

    for (int i = 0; i < input_layer_nodes.size(); ++i)
    {
        int word_id = input_layer_nodes[i];
        if (m_words_sense_info->word_sense_cnts_info[word_id] > 1)
            RequestRow(kWordSensePriorTableId, m_words_sense_info->p_wordidx2sense_idx[word_id]);
    }

    std::vector<int> & tables = data->GetTables();
    for (int i = 0; i < tables.size(); ++i)
        RequestTable(tables[i]);

    multiverso::Log::Info("Rank %d ParameterLoader finish %d\n", multiverso::Multiverso::ProcessRank(), m_parse_and_request_count - 1);
    fprintf(m_log_file, "%lf\n", (clock() - m_start_time) / (double)CLOCKS_PER_SEC);
    assert(sg_mixture_neural_network->status == 1);
    sg_mixture_neural_network->status = 2;
}

template class ParameterLoader<float>;
template class ParameterLoader<double>;
@@ -0,0 +1,34 @@
#pragma once

#include <multiverso.h>
#include "DataBlock.h"
#include "MultiversoTablesId.h"
#include "Util.h"
#include "HuffmanEncoder.h"
#include "SkipGramMixtureNeuralNetwork.h"
#include "Log.h"

/*!
 * \brief The class ParameterLoader preloads the parameters from the multiverso server
 */
template<typename T>
class ParameterLoader : public multiverso::ParameterLoaderBase
{
public:
    ParameterLoader(Option *opt, void ** word2vector_neural_networks, WordSenseInfo* word_sense_info);
    /*!
     * \brief Request the parameters from the multiverso server according to data_block
     * \param data_block stores the information of the sentences
     */
    void ParseAndRequest(multiverso::DataBlockBase* data_block) override;

private:
    int m_parse_and_request_count;
    Option* m_option;
    clock_t m_start_time;
    WordSenseInfo* m_words_sense_info;
    void ** m_sgmixture_neural_networks;
    FILE* m_log_file;
};
@@ -0,0 +1,85 @@
#include "Reader.h"

Reader::Reader(Dictionary *dictionary, Option *option)
{
    m_dictionary = dictionary;
    m_option = option;

    m_stopwords_table.clear();
    if (m_option->stopwords)
    {
        FILE* fid = fopen(m_option->sw_file, "r");
        while (ReadWord(m_word, fid))
        {
            m_stopwords_table.insert(m_word);
            if (m_dictionary->GetWordIdx(m_word) != -1)
                m_option->total_words -= m_dictionary->GetWordInfo(m_word)->freq;
        }

        fclose(fid);
    }
}

void Reader::Open(const char *input_file)
{
    m_fin = fopen(input_file, "r");
}

void Reader::Close()
{
    fclose(m_fin);
    m_fin = nullptr;
}

int Reader::GetSentence(int *sentence, int64_t &word_count)
{
    int length = 0, word_idx;
    word_count = 0;
    while (1)
    {
        if (!ReadWord(m_word, m_fin))
            break;
        word_idx = m_dictionary->GetWordIdx(m_word);
        if (word_idx == -1)
            continue;
        word_count++;
        if (m_option->stopwords && m_stopwords_table.count(m_word))
            continue;
        sentence[length++] = word_idx;
        if (length >= MAX_SENTENCE_LENGTH)
            break;
    }

    return length;
}

bool Reader::ReadWord(char *word, FILE *fin)
{
    int idx = 0;
    char ch;
    while (!feof(fin))
    {
        ch = fgetc(fin);
        if (ch == 13) continue; // skip carriage returns
        if ((ch == ' ') || (ch == '\t') || (ch == '\n'))
        {
            if (idx > 0)
            {
                if (ch == '\n')
                    ungetc(ch, fin);
                break;
            }
            if (ch == '\n')
            {
                strcpy(word, (char *)"</s>");
                return true;
            }
            else continue;
        }
        word[idx++] = ch;
        if (idx >= MAX_STRING - 1) idx--; // Truncate words that are too long
    }
    word[idx] = 0;
    return idx != 0;
}
@@ -0,0 +1,24 @@
#pragma once

#include "Util.h"
#include "Dictionary.h"
#include <mutex>
#include <unordered_set>

class Reader
{
public:
    Reader(Dictionary *dictionary, Option *option);
    void Open(const char *input_file);
    void Close();
    int GetSentence(int *sentence, int64_t &word_count);

private:
    Option* m_option;
    FILE* m_fin;
    char m_word[MAX_STRING + 1];
    Dictionary *m_dictionary;
    std::unordered_set<std::string> m_stopwords_table;

    bool ReadWord(char *word, FILE *fin);
};
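A minimal sketch of the Reader loop used by LoadData in MultiversoSkipGramMixture.cpp (MAX_SENTENCE_LENGTH and the Option fields come from Util.h; dictionary and option are assumed to be set up as in Main.cpp):

Reader reader(dictionary, option);
reader.Open(option->train_file);

int sentence[MAX_SENTENCE_LENGTH + 2];
int64_t word_count;
int length;
while ((length = reader.GetSentence(sentence, word_count)) > 0)
{
    // sentence[0..length-1] holds dictionary indices, with stopwords and
    // out-of-vocabulary words already filtered out
}
reader.Close();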
@@ -0,0 +1,333 @@
#include "SkipGramMixtureNeuralNetwork.h"

template<typename T>
SkipGramMixtureNeuralNetwork<T>::SkipGramMixtureNeuralNetwork(Option* option, HuffmanEncoder* huffmanEncoder, WordSenseInfo* word_sense_info, Dictionary* dic, int dicSize)
{
    status = 0;
    m_option = option;
    m_huffman_encoder = huffmanEncoder;
    m_word_sense_info = word_sense_info;
    m_dictionary_size = dicSize;
    m_dictionary = dic;

    m_input_embedding_weights_ptr = new T*[m_dictionary_size];
    m_sense_priors_ptr = new T*[m_dictionary_size];
    m_sense_priors_paras_ptr = new T*[m_dictionary_size];

    m_output_embedding_weights_ptr = new T*[m_dictionary_size];
    m_seleted_input_embedding_weights = new bool[m_dictionary_size];
    m_selected_output_embedding_weights = new bool[m_dictionary_size];
    assert(m_input_embedding_weights_ptr != nullptr);
    assert(m_output_embedding_weights_ptr != nullptr);
    assert(m_seleted_input_embedding_weights != nullptr);
    assert(m_selected_output_embedding_weights != nullptr);
    memset(m_seleted_input_embedding_weights, 0, sizeof(bool) * m_dictionary_size);
    memset(m_selected_output_embedding_weights, 0, sizeof(bool) * m_dictionary_size);
}

template<typename T>
SkipGramMixtureNeuralNetwork<T>::~SkipGramMixtureNeuralNetwork()
{
    // these members are arrays, so they must be released with delete[]
    delete[] m_input_embedding_weights_ptr;
    delete[] m_output_embedding_weights_ptr;
    delete[] m_sense_priors_ptr;
    delete[] m_sense_priors_paras_ptr;
    delete[] m_seleted_input_embedding_weights;
    delete[] m_selected_output_embedding_weights;
}

template<typename T>
void SkipGramMixtureNeuralNetwork<T>::Train(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup)
{
    ParseSentence(sentence, sentence_length, gamma, fTable, input_backup, &SkipGramMixtureNeuralNetwork<T>::TrainSample);
}

// The E-step: estimate the posterior multinomial probabilities
template<typename T>
T SkipGramMixtureNeuralNetwork<T>::Estimate_Gamma_m(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* posterior_ll, T* estimation, T* sense_prior, T* f_m)
{
    T* inputEmbedding = m_input_embedding_weights_ptr[word_input];
    T f, log_likelihood = 0;
    for (int sense_idx = 0; sense_idx < m_word_sense_info->word_sense_cnts_info[word_input]; ++sense_idx, inputEmbedding += m_option->embeding_size)
    {
        posterior_ll[sense_idx] = sense_prior[sense_idx] < eps ? MIN_LOG : log(sense_prior[sense_idx]); // posterior log likelihood for each sense

        int64_t fidx = sense_idx * MAX_CODE_LENGTH;

        for (int d = 0; d < output_nodes.size(); ++d, fidx++)
        {
            f = Util::InnerProduct(inputEmbedding, m_output_embedding_weights_ptr[output_nodes[d].first], m_option->embeding_size);
            f = Util::Sigmoid(f);
            f_m[fidx] = f;
            if (output_nodes[d].second) // Huffman code, 0 or 1
                f = 1 - f;
            posterior_ll[sense_idx] += f < eps ? MIN_LOG : log(f);
        }
        log_likelihood += posterior_ll[sense_idx];
    }
    if (m_word_sense_info->word_sense_cnts_info[word_input] == 1)
    {
        estimation[0] = 1;
        return log_likelihood;
    }

    Util::SoftMax(posterior_ll, estimation, m_word_sense_info->word_sense_cnts_info[word_input]);

    return log_likelihood;
}
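// In math form (a sketch of what the loop above computes): for an input word
// w with sense priors \pi_m and Huffman-path nodes (v_d, b_d), the per-sense
// posterior log likelihood is
//   \log p(m, context | w) = \log \pi_m + \sum_d \log p_d, where
//   p_d = \sigma(x_{w,m} \cdot y_{v_d}) if b_d = 0, else 1 - \sigma(x_{w,m} \cdot y_{v_d}),
// with x_{w,m} the sense-m input embedding and y_{v_d} the node embedding.
// The E-step responsibilities are gamma_m = softmax_m(\log p(m, context | w)).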
|
||||
|
||||
template<typename T>
|
||||
//The M Step: update the sense prior probabilities to maximize the Q function
|
||||
void SkipGramMixtureNeuralNetwork<T>::Maximize_Pi(int word_input, T* log_likelihood)
|
||||
{
|
||||
if (m_word_sense_info->word_sense_cnts_info[word_input] == 1)
|
||||
{
|
||||
return;
|
||||
}
|
||||
|
||||
for (int sense_idx = 0; sense_idx < m_word_sense_info->word_sense_cnts_info[word_input]; ++sense_idx)
|
||||
{
|
||||
T new_alpha = log_likelihood[sense_idx];
|
||||
m_sense_priors_paras_ptr[word_input][sense_idx] = m_sense_priors_paras_ptr[word_input][sense_idx] * sense_prior_momentum + new_alpha * (1 - sense_prior_momentum);
|
||||
}
|
||||
|
||||
if (!m_option->store_multinomial)
|
||||
Util::SoftMax(m_sense_priors_paras_ptr[word_input], m_sense_priors_ptr[word_input], m_option->sense_num_multi); //Update the multinomial parameters
|
||||
}
|
||||
|
||||
//The M-step: update the embedding vectors to maximize the Q function
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::UpdateEmbeddings(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* estimation, T* f_m, T* input_backup, UpdateDirection direction)
{
    T g;
    T* output_embedding;
    T* inputEmbedding;
    if (direction == UpdateDirection::UPDATE_INPUT)
        inputEmbedding = m_input_embedding_weights_ptr[word_input];
    else //When updating the output embeddings, read the inputs from the backup taken before the input update
        inputEmbedding = input_backup;
    for (int sense_idx = 0; sense_idx < m_word_sense_info->word_sense_cnts_info[word_input]; ++sense_idx, inputEmbedding += m_option->embeding_size)
    {
        int64_t fidx = sense_idx * MAX_CODE_LENGTH;
        for (int d = 0; d < output_nodes.size(); ++d, ++fidx)
        {
            output_embedding = m_output_embedding_weights_ptr[output_nodes[d].first];
            g = estimation[sense_idx] * (1 - output_nodes[d].second - f_m[fidx]) * learning_rate;
            if (direction == UpdateDirection::UPDATE_INPUT) //Update input embeddings
            {
                for (int j = 0; j < m_option->embeding_size; ++j)
                    inputEmbedding[j] += g * output_embedding[j];
            }
            else //Update output embeddings
            {
                for (int j = 0; j < m_option->embeding_size; ++j)
                    output_embedding[j] += g * inputEmbedding[j];
            }
        }
    }
}


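/*
 * The scalar g above is the standard hierarchical-softmax gradient, weighted
 * by the sense posterior (our own derivation sketch, with code in {0,1}):
 *
 *   L_d = (1 - code_d) * log sigma(x) + code_d * log(1 - sigma(x)),  x = v . u
 *   dL_d/dx = 1 - code_d - sigma(x)
 *
 * so g = gamma_m * (1 - code_d - f) * lr, applied to the input vector v along
 * direction u, and to u along direction v (the backed-up copy of v), exactly
 * as in plain word2vec but scaled by gamma_m.
 */
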
//Train a window sample and update the input embedding & output embedding vectors
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::TrainSample(int input_node, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup)
{
    T* gamma = (T*)v_gamma; //Stores the posterior probabilities of each sense
    T* fTable = (T*)v_fTable; //Stores the sigmoid values of the inner products of input and output embeddings
    T* input_backup = (T*)v_input_backup;

    T posterior_ll[MAX_SENSE_CNT]; //Stores the posterior log likelihood
    T senses[1] = { 1.0 }; //For words with only one sense

    T* sense_prior = m_word_sense_info->word_sense_cnts_info[input_node] == 1 ? senses : (m_option->store_multinomial ? m_sense_priors_paras_ptr[input_node] : m_sense_priors_ptr[input_node]);

    T log_likelihood;

    for (int iter = 0; iter < m_option->EM_iteration; ++iter)
    {
        //Back up the input embeddings before updating them
        memcpy(input_backup, m_input_embedding_weights_ptr[input_node], m_option->embeding_size * m_word_sense_info->word_sense_cnts_info[input_node] * sizeof(T));
        log_likelihood = 0;

        // E-step
        log_likelihood += Estimate_Gamma_m(input_node, output_nodes, posterior_ll, gamma, sense_prior, fTable);

        // M-step
        if (m_option->store_multinomial)
            Maximize_Pi(input_node, gamma);
        else
            Maximize_Pi(input_node, posterior_ll);

        UpdateEmbeddings(input_node, output_nodes, gamma, fTable, input_backup, UpdateDirection::UPDATE_INPUT);
        UpdateEmbeddings(input_node, output_nodes, gamma, fTable, input_backup, UpdateDirection::UPDATE_OUTPUT);
    }
}

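/*
 * One detail worth calling out (our reading of the loop above): each EM
 * iteration re-estimates gamma from the current parameters, updates the
 * priors, and then applies both gradient updates. The input embeddings are
 * snapshotted into input_backup first so that UPDATE_OUTPUT multiplies the
 * gradient by the inputs as they were when fTable was computed, keeping the
 * two half-updates consistent within the iteration.
 */
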
//Collect all the input words and output nodes in the data block
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::PrepareParmeter(DataBlock* data_block)
{
    for (int i = 0; i < m_input_layer_nodes.size(); ++i)
    {
        m_input_embedding_weights_ptr[m_input_layer_nodes[i]] = nullptr;
        m_seleted_input_embedding_weights[m_input_layer_nodes[i]] = false;
    }

    for (int i = 0; i < m_output_layer_nodes.size(); ++i)
    {
        m_output_embedding_weights_ptr[m_output_layer_nodes[i]] = nullptr;
        m_selected_output_embedding_weights[m_output_layer_nodes[i]] = false;
    }

    m_input_layer_nodes.clear();
    m_output_layer_nodes.clear();

    int sentence_length;
    int64_t word_count_delta;
    int* sentence;
    uint64_t next_random;

    for (int i = 0; i < data_block->Size(); ++i)
    {
        data_block->Get(i, sentence, sentence_length, word_count_delta, next_random);
        ParseSentence(sentence, sentence_length, nullptr, nullptr, nullptr, &SkipGramMixtureNeuralNetwork<T>::DealPrepareParameter);
    }
}

//Record the input_nodes & output_nodes into the private sets
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::DealPrepareParameter(int input_node, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup)
{
    AddInputLayerNode(input_node);
    for (int i = 0; i < output_nodes.size(); ++i)
        AddOutputLayerNode(output_nodes[i].first);
}

/*
Parse a sentence and dispatch it into one of two branches:
one for training, the other for parameter parsing & requesting
*/
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::ParseSentence(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup, FunctionType function)
{
    if (sentence_length == 0)
        return;

    int feat[MAX_SENTENCE_LENGTH + 10];
    int input_node;
    std::vector<std::pair<int, int> > output_nodes;
    for (int sentence_position = 0; sentence_position < sentence_length; ++sentence_position)
    {
        if (sentence[sentence_position] == -1) continue;
        int feat_size = 0;

        for (int i = 0; i < m_option->window_size * 2 + 1; ++i)
            if (i != m_option->window_size)
            {
                int c = sentence_position - m_option->window_size + i;
                if (c < 0 || c >= sentence_length || sentence[c] == -1) continue;
                feat[feat_size++] = sentence[c];

                //Begin: train skip-gram on this (context word, center word) pair
                {
                    input_node = feat[feat_size - 1];
                    output_nodes.clear();
                    Parse(input_node, sentence[sentence_position], output_nodes);
                    (this->*function)(input_node, output_nodes, gamma, fTable, input_backup);
                }
            }
    }
}

//Parse the needed parameters in a window: the Huffman path of the output word
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::Parse(int feat, int out_word_idx, std::vector<std::pair<int, int> >& output_nodes)
{
    const auto info = m_huffman_encoder->GetLabelInfo(out_word_idx);
    for (int d = 0; d < info->codelen; d++)
        output_nodes.push_back(std::make_pair(info->point[d], info->code[d]));
}

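/*
 * Illustration (hypothetical numbers): if the output word sits at depth 3 in
 * the Huffman tree with inner-node path {42, 17, 5} and code bits {0, 1, 1},
 * Parse fills output_nodes with {(42,0), (17,1), (5,1)} -- one (node index,
 * branch label) pair per inner node, which is exactly what the E-step and
 * the gradient updates iterate over.
 */
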
template<typename T>
void SkipGramMixtureNeuralNetwork<T>::AddInputLayerNode(int node_id)
{
    if (m_seleted_input_embedding_weights[node_id] == false)
    {
        m_seleted_input_embedding_weights[node_id] = true;
        m_input_layer_nodes.push_back(node_id);
    }
}

template<typename T>
void SkipGramMixtureNeuralNetwork<T>::AddOutputLayerNode(int node_id)
{
    if (m_selected_output_embedding_weights[node_id] == false)
    {
        m_selected_output_embedding_weights[node_id] = true;
        m_output_layer_nodes.push_back(node_id);
    }
}

template<typename T>
std::vector<int>& SkipGramMixtureNeuralNetwork<T>::GetInputLayerNodes()
{
    return m_input_layer_nodes;
}

template<typename T>
std::vector<int>& SkipGramMixtureNeuralNetwork<T>::GetOutputLayerNodes()
{
    return m_output_layer_nodes;
}

template<typename T>
void SkipGramMixtureNeuralNetwork<T>::SetInputEmbeddingWeights(int input_node_id, T* ptr)
{
    m_input_embedding_weights_ptr[input_node_id] = ptr;
}

template<typename T>
void SkipGramMixtureNeuralNetwork<T>::SetOutputEmbeddingWeights(int output_node_id, T* ptr)
{
    m_output_embedding_weights_ptr[output_node_id] = ptr;
}

template <typename T>
void SkipGramMixtureNeuralNetwork<T>::SetSensePriorWeights(int input_node_id, T* ptr)
{
    m_sense_priors_ptr[input_node_id] = ptr;
}

template <typename T>
void SkipGramMixtureNeuralNetwork<T>::SetSensePriorParaWeights(int input_node_id, T* ptr)
{
    m_sense_priors_paras_ptr[input_node_id] = ptr;
}

template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetInputEmbeddingWeights(int input_node_id)
{
    return m_input_embedding_weights_ptr[input_node_id];
}

template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetEmbeddingOutputWeights(int output_node_id)
{
    return m_output_embedding_weights_ptr[output_node_id];
}

template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetSensePriorWeights(int input_node_id)
{
    return m_sense_priors_ptr[input_node_id];
}

template<typename T>
T* SkipGramMixtureNeuralNetwork<T>::GetSensePriorParaWeights(int input_node_id)
{
    return m_sense_priors_paras_ptr[input_node_id];
}

template class SkipGramMixtureNeuralNetwork<float>;
template class SkipGramMixtureNeuralNetwork<double>;

@ -0,0 +1,140 @@
#pragma once

#include <vector>

#include "Util.h"
#include <multiverso.h>
#include "HuffmanEncoder.h"
#include "MultiversoSkipGramMixture.h"
#include <cstring>

enum class UpdateDirection
{
    UPDATE_INPUT,
    UPDATE_OUTPUT
};

template<typename T>
class SkipGramMixtureNeuralNetwork
{
public:
    T learning_rate;
    T sense_prior_momentum;

    int status;
    SkipGramMixtureNeuralNetwork(Option* option, HuffmanEncoder* huffmanEncoder, WordSenseInfo* word_sense_info, Dictionary* dic, int dicSize);
    ~SkipGramMixtureNeuralNetwork();

    void Train(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup);

    /*!
     * \brief Collect all the input words and output nodes in the data block
     */
    void PrepareParmeter(DataBlock *data_block);

    std::vector<int>& GetInputLayerNodes();
    std::vector<int>& GetOutputLayerNodes();

    /*!
     * \brief Set the pointers to those local parameters
     */
    void SetInputEmbeddingWeights(int input_node_id, T* ptr);
    void SetOutputEmbeddingWeights(int output_node_id, T* ptr);
    void SetSensePriorWeights(int input_node_id, T* ptr);
    void SetSensePriorParaWeights(int input_node_id, T* ptr);

    /*!
     * \brief Get the pointers to those locally updated parameters
     */
    T* GetInputEmbeddingWeights(int input_node_id);
    T* GetEmbeddingOutputWeights(int output_node_id);
    T* GetSensePriorWeights(int input_node_id);
    T* GetSensePriorParaWeights(int input_node_id);

private:
    Option *m_option;
    Dictionary *m_dictionary;
    HuffmanEncoder *m_huffman_encoder;
    int m_dictionary_size;

    WordSenseInfo* m_word_sense_info;

    T** m_input_embedding_weights_ptr; //Points to every word's input embedding vectors
    bool *m_seleted_input_embedding_weights;
    T** m_output_embedding_weights_ptr; //Points to every Huffman node's embedding vector
    bool *m_selected_output_embedding_weights;

    T** m_sense_priors_ptr; //Points to the multinomial parameters when store_multinomial is set to zero
    T** m_sense_priors_paras_ptr; //Points to the sense prior parameters: the log of the multinomial when store_multinomial is zero, the multinomial parameters themselves otherwise

    std::vector<int> m_input_layer_nodes;
    std::vector<int> m_output_layer_nodes;

    typedef void(SkipGramMixtureNeuralNetwork<T>::*FunctionType)(int input_node, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup);

    /*!
     * \brief Parse the needed parameters in a window
     */
    void Parse(int feat, int out_word_idx, std::vector<std::pair<int, int> >& output_nodes);

    /*!
     * \brief Parse a sentence and dispatch it into one of two branches:
     * one for training, the other for parameter parsing & requesting
     */
    void ParseSentence(int* sentence, int sentence_length, T* gamma, T* fTable, T* input_backup, FunctionType function);

    /*!
     * \brief Copy the input_nodes & output_nodes to the network's private sets
     */
    void DealPrepareParameter(int input_node, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup);

    /*!
     * \brief Train a window sample and update the
     * input-embedding & output-embedding vectors
     * \param word_input represents the input word
     * \param output_nodes represent the output nodes on the Huffman tree, including the node indexes and path labels
     * \param v_gamma is the temp memory to store the posterior probabilities of each sense
     * \param v_fTable is the temp memory to store the sigmoid values of the inner products of input and output embeddings
     * \param v_input_backup stores the input embedding vectors as a backup
     */
    void TrainSample(int word_input, std::vector<std::pair<int, int> >& output_nodes, void* v_gamma, void* v_fTable, void* v_input_backup);

    /*!
     * \brief The E-step: estimate the posterior multinomial probabilities
     * \param word_input represents the input word
     * \param output_nodes represent the output nodes on the Huffman tree, including the node indexes and path labels
     * \param posterior represents the calculated posterior log likelihood
     * \param estimation represents the calculated gammas (see the paper), that is, the softmax terms of the posterior
     * \param sense_prior represents the parameters of the sense prior probabilities for each polysemous word
     * \param f_m is the temp memory to store the sigmoid values of the inner products of input and output embeddings
     */
    T Estimate_Gamma_m(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* posterior, T* estimation, T* sense_prior, T* f_m);

    /*!
     * \brief The M-step: update the embedding vectors to maximize the Q function
     * \param word_input represents the input word
     * \param output_nodes represent the output nodes on the Huffman tree, including the node indexes and path labels
     * \param estimation represents the calculated gammas (see the paper), that is, the softmax terms of the posterior
     * \param f_m is the temp memory to store the sigmoid values of the inner products of input and output embeddings
     * \param input_backup stores the input embedding vectors as a backup
     * \param direction: update the input vectors or the output vectors
     */
    void UpdateEmbeddings(int word_input, std::vector<std::pair<int, int> >& output_nodes, T* estimation, T* f_m, T* input_backup, UpdateDirection direction);

    /*!
     * \brief The M-step: update the sense prior probabilities to maximize the Q function
     * \param word_input represents the input word
     * \param curr_priors are the closed-form values of the sense priors in this iteration
     */
    void Maximize_Pi(int word_input, T* curr_priors);

    /*!
     * \brief Record the input word so that parameter loading can be performed
     */
    void AddInputLayerNode(int node_id);

    /*!
     * \brief Record the Huffman tree node so that parameter loading can be performed
     */
    void AddOutputLayerNode(int node_id);
};

@ -0,0 +1,445 @@
#include "Trainer.h"
|
||||
|
||||
template<typename T>
|
||||
Trainer<T>::Trainer(int trainer_id, Option *option, void** word2vector_neural_networks, multiverso::Barrier *barrier, Dictionary* dictionary, WordSenseInfo* word_sense_info, HuffmanEncoder* huff_encoder)
|
||||
{
|
||||
m_trainer_id = trainer_id;
|
||||
m_option = option;
|
||||
m_word_count = m_last_word_count = 0;
|
||||
m_sgmixture_neural_networks = word2vector_neural_networks;
|
||||
m_barrier = barrier;
|
||||
m_dictionary = dictionary;
|
||||
m_word_sense_info = word_sense_info;
|
||||
m_huffman_encoder = huff_encoder;
|
||||
|
||||
gamma = (T*)calloc(m_option-> window_size * MAX_SENSE_CNT, sizeof(T));
|
||||
fTable = (T*)calloc(m_option-> window_size * MAX_CODE_LENGTH * MAX_SENSE_CNT, sizeof(T));
|
||||
input_backup = (T*)calloc(m_option->embeding_size * MAX_SENSE_CNT, sizeof(T));
|
||||
|
||||
m_start_time = 0;
|
||||
m_train_count = 0;
|
||||
m_executive_time = 0;
|
||||
if (m_trainer_id == 0)
|
||||
{
|
||||
m_log_file = fopen("trainer.log", "w");
|
||||
}
|
||||
}
|
||||
|
||||
//Train one data block
template<typename T>
void Trainer<T>::TrainIteration(multiverso::DataBlockBase *data_block)
{
    if (m_train_count == 0)
    {
        m_start_time = clock();
        m_process_id = multiverso::Multiverso::ProcessRank();
    }

    printf("Rank %d Begin TrainIteration...%d\n", m_process_id, m_train_count);
    clock_t train_iteration_start = clock();
    fflush(stdout);

    m_process_count = multiverso::Multiverso::TotalProcessCount();

    DataBlock *data = reinterpret_cast<DataBlock*>(data_block);
    SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = reinterpret_cast<SkipGramMixtureNeuralNetwork<T>*>(m_sgmixture_neural_networks[m_train_count % 2]);
    ++m_train_count;
    std::vector<int>& input_layer_nodes = word2vector_neural_network->GetInputLayerNodes();
    std::vector<int>& output_layer_nodes = word2vector_neural_network->GetOutputLayerNodes();
    std::vector<int> local_input_layer_nodes, local_output_layer_nodes;
    assert(word2vector_neural_network->status == 2);
    if (m_trainer_id == 0)
    {
        multiverso::Log::Info("Rank %d input_layer_size=%d, output_layer_size=%d\n", m_process_id, input_layer_nodes.size(), output_layer_nodes.size());
    }

    for (int i = m_trainer_id; i < input_layer_nodes.size(); i += m_option->thread_cnt)
    {
        local_input_layer_nodes.push_back(input_layer_nodes[i]);
    }

    for (int i = m_trainer_id; i < output_layer_nodes.size(); i += m_option->thread_cnt)
    {
        local_output_layer_nodes.push_back(output_layer_nodes[i]);
    }

    CopyParameterFromMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network);

    multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0);
    T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1));
    if (learning_rate < m_option->init_learning_rate * (real)0.0001)
        learning_rate = m_option->init_learning_rate * (real)0.0001;
    word2vector_neural_network->learning_rate = learning_rate;

    //Linearly increase the momentum from init_sense_prior_momentum to 1
    word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum +
        (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1);

    m_barrier->Wait();

    for (int i = m_trainer_id; i < data->Size(); i += m_option->thread_cnt) //i iterates over all sentences
    {
        int sentence_length;
        int64_t word_count_delta;
        int *sentence;
        uint64_t next_random;
        data->Get(i, sentence, sentence_length, word_count_delta, next_random);

        word2vector_neural_network->Train(sentence, sentence_length, gamma, fTable, input_backup);

        m_word_count += word_count_delta;
        if (m_word_count - m_last_word_count > 10000)
        {
            multiverso::Row<int64_t>& word_count_actual_row = GetRow<int64_t>(kWordCountActualTableId, 0);
            Add<int64_t>(kWordCountActualTableId, 0, 0, m_word_count - m_last_word_count);
            m_last_word_count = m_word_count;
            m_now_time = clock();

            if (m_trainer_id % 3 == 0)
            {
                multiverso::Log::Info("Rank %d Trainer %d lr: %.5f Mom: %.4f Progress: %.2f%% Words/thread/sec(total): %.2fk W/t/sec(executive): %.2fk\n",
                    m_process_id, m_trainer_id,
                    word2vector_neural_network->learning_rate, word2vector_neural_network->sense_prior_momentum,
                    word_count_actual_row.At(0) / (real)(m_option->total_words * m_option->epoch + 1) * 100,
                    m_last_word_count / ((real)(m_now_time - m_start_time + 1) / (real)CLOCKS_PER_SEC * 1000),
                    m_last_word_count / ((real)(m_executive_time + clock() - train_iteration_start + 1) / (real)CLOCKS_PER_SEC * 1000));

                fflush(stdout);
            }

            T learning_rate = m_option->init_learning_rate * (1 - word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1));
            if (learning_rate < m_option->init_learning_rate * (real)0.0001)
                learning_rate = m_option->init_learning_rate * (real)0.0001;
            word2vector_neural_network->learning_rate = learning_rate;

            word2vector_neural_network->sense_prior_momentum = m_option->init_sense_prior_momentum + (1 - m_option->init_sense_prior_momentum) * word_count_actual_row.At(0) / (T)(m_option->total_words * m_option->epoch + 1);
        }
    }

    m_barrier->Wait();
    AddParameterToMultiverso(local_input_layer_nodes, local_output_layer_nodes, word2vector_neural_network);

    m_executive_time += clock() - train_iteration_start;

    multiverso::Log::Info("Rank %d Train %d end at %lfs, cost %lfs, total cost %lfs\n",
        m_process_id,
        m_trainer_id, clock() / (double)CLOCKS_PER_SEC,
        (clock() - train_iteration_start) / (double)CLOCKS_PER_SEC,
        m_executive_time / (double)CLOCKS_PER_SEC);
    fflush(stdout);

    if (data->GetTables().size() > 0 && m_trainer_id == 0) //Dump model files
    {
        SaveMultiInputEmbedding(data->GetEpochId());
        SaveOutputEmbedding(data->GetEpochId());
        if (data->GetEpochId() == 0)
            SaveHuffEncoder();

        fprintf(m_log_file, "%d %lf\t %lf\n", data->GetEpochId(), (clock() - m_start_time) / (double)CLOCKS_PER_SEC, m_executive_time / (double)CLOCKS_PER_SEC);
    }

    assert(word2vector_neural_network->status == 2);

    word2vector_neural_network->status = 0;

    multiverso::Log::Info("Rank %d Train %d are leaving training iter with nn status:%d\n", m_process_id, m_trainer_id, word2vector_neural_network->status);
    fflush(stdout);
}

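/*
 * Both schedules above are driven by the globally shared word count (our
 * summary of the code): with progress p = words_processed / (total_words *
 * epoch + 1),
 *
 *   lr       = init_learning_rate * (1 - p), floored at 1e-4 * init_learning_rate
 *   momentum = init_sense_prior_momentum + (1 - init_sense_prior_momentum) * p
 *
 * i.e. the learning rate decays linearly to (almost) zero while the sense
 * prior momentum rises linearly towards 1, freezing the priors late in
 * training.
 */
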
//Copy a size of memory from a source row to dest
template<typename T>
void Trainer<T>::CopyMemory(T* dest, multiverso::Row<T>& source, int size)
{
    for (int i = 0; i < size; ++i)
        dest[i] = source.At(i);
}

//Copy the needed parameters from the multiverso buffer to local blocks
template<typename T>
int Trainer<T>::CopyParameterFromMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* local_word2vector_neural_network)
{
    SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = (SkipGramMixtureNeuralNetwork<T>*)local_word2vector_neural_network;

    //Copy the input embeddings: one row per sense of each word
    for (int i = 0; i < input_layer_nodes.size(); ++i)
    {
        T* ptr = (T*)calloc(m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]] * m_option->embeding_size, sizeof(T));
        int row_id_base = m_word_sense_info->p_input_embedding[input_layer_nodes[i]];
        for (int j = 0, row_id = row_id_base; j < m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]]; ++j, ++row_id)
            CopyMemory(ptr + j * m_option->embeding_size, GetRow<T>(kInputEmbeddingTableId, row_id), m_option->embeding_size);
        word2vector_neural_network->SetInputEmbeddingWeights(input_layer_nodes[i], ptr);
    }

    //Copy the output embeddings
    for (int i = 0; i < output_layer_nodes.size(); ++i)
    {
        T* ptr = (T*)calloc(m_option->embeding_size, sizeof(T));
        CopyMemory(ptr, GetRow<T>(kEmbeddingOutputTableId, output_layer_nodes[i]), m_option->embeding_size);
        for (int j = 0; j < m_option->embeding_size; j += 5) //Spot-check every fifth value for NaN
            if (!Util::ValidF(static_cast<real>(ptr[j])))
            {
                printf("invalid number\n");
                fflush(stdout);
                throw std::runtime_error("Invalid output embeddings");
            }
        word2vector_neural_network->SetOutputEmbeddingWeights(output_layer_nodes[i], ptr);
    }

    //Copy the sense priors
    for (int i = 0; i < input_layer_nodes.size(); ++i)
    {
        if (m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]] > 1)
        {
            T* ptr = (T*)calloc(m_option->sense_num_multi, sizeof(T));
            T* para_ptr = (T*)calloc(m_option->sense_num_multi, sizeof(T));

            CopyMemory(para_ptr, GetRow<T>(kWordSensePriorTableId, m_word_sense_info->p_wordidx2sense_idx[input_layer_nodes[i]]), m_option->sense_num_multi);

            if (!m_option->store_multinomial) //Softmax the parameters to obtain the multinomial
                Util::SoftMax(para_ptr, ptr, m_option->sense_num_multi);
            word2vector_neural_network->SetSensePriorWeights(input_layer_nodes[i], ptr);
            word2vector_neural_network->SetSensePriorParaWeights(input_layer_nodes[i], para_ptr);
        }
    }

    return 0;
}

//Add the delta of a row of local parameters to the parameters stored in the buffer and send it to multiverso
template<typename T>
void Trainer<T>::AddParameterRowToMultiverso(T* ptr, int table_id, int row_id, int size, real momentum)
{
    multiverso::Row<T>& row = GetRow<T>(table_id, row_id);
    for (int i = 0; i < size; ++i)
    {
        T dest = ptr[i] * (1 - momentum) + row.At(i) * momentum;
        T delta = (dest - row.At(i)) / m_process_count;
        Add<T>(table_id, row_id, i, delta);
    }
}

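/*
 * Why divide by m_process_count (our reading): every process computes and
 * sends its own delta against the same shared row, so each contribution is
 * scaled down by the number of processes to make the aggregated update an
 * average rather than a sum. With momentum = 0 (the default, used for the
 * embeddings) this reduces to
 *
 *   delta = (local_value - shared_value) / process_count
 *
 * while a non-zero momentum (used for the sense priors) first blends the
 * local value with the shared one before taking the difference.
 */
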
//Add the deltas of the local parameters to the parameters stored in the buffer and send them to multiverso
template<typename T>
int Trainer<T>::AddParameterToMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* local_word2vector_neural_network)
{
    SkipGramMixtureNeuralNetwork<T>* word2vector_neural_network = (SkipGramMixtureNeuralNetwork<T>*)local_word2vector_neural_network;
    std::vector<T*> blocks; //Collects the locally allocated memory blocks for freeing

    //Add the input embeddings
    for (int i = 0; i < input_layer_nodes.size(); ++i)
    {
        int table_id = kInputEmbeddingTableId;
        int row_id_base = m_word_sense_info->p_input_embedding[input_layer_nodes[i]];
        T* ptr = word2vector_neural_network->GetInputEmbeddingWeights(input_layer_nodes[i]);

        for (int j = 0, row_id = row_id_base; j < m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]]; ++j, ++row_id)
            AddParameterRowToMultiverso(ptr + m_option->embeding_size * j, table_id, row_id, m_option->embeding_size);
        blocks.push_back(ptr);
    }

    //Add the output embeddings
    for (int i = 0; i < output_layer_nodes.size(); ++i)
    {
        int table_id = kEmbeddingOutputTableId;
        int row_id = output_layer_nodes[i];
        T* ptr = word2vector_neural_network->GetEmbeddingOutputWeights(row_id);
        AddParameterRowToMultiverso(ptr, table_id, row_id, m_option->embeding_size);
        blocks.push_back(ptr);
    }

    //Add the sense priors
    for (int i = 0; i < input_layer_nodes.size(); ++i)
    {
        if (m_word_sense_info->word_sense_cnts_info[input_layer_nodes[i]] > 1)
        {
            int table_id = kWordSensePriorTableId;
            int row_id = m_word_sense_info->p_wordidx2sense_idx[input_layer_nodes[i]];

            T* ptr = word2vector_neural_network->GetSensePriorWeights(input_layer_nodes[i]);
            T* para_ptr = word2vector_neural_network->GetSensePriorParaWeights(input_layer_nodes[i]);

            AddParameterRowToMultiverso(para_ptr, table_id, row_id, m_option->sense_num_multi, static_cast<real>(word2vector_neural_network->sense_prior_momentum));

            blocks.push_back(ptr);
            blocks.push_back(para_ptr);
        }
    }

    for (auto& x : blocks)
        free(x);

    return 0;
}

template<typename T>
void Trainer<T>::SaveMultiInputEmbedding(const int epoch_id)
{
    FILE* fid = nullptr;
    T* sense_priors_ptr = (T*)calloc(m_option->sense_num_multi, sizeof(T)); //Allocated as T: CopyMemory fills it with T values

    char outfile[2000];
    if (m_option->output_binary)
    {
        sprintf(outfile, "%s%d", m_option->binary_embedding_file, epoch_id);

        fid = fopen(outfile, "wb");

        fprintf(fid, "%d %d %d\n", m_dictionary->Size(), m_word_sense_info->total_senses_cnt, m_option->embeding_size);
        for (int i = 0; i < m_dictionary->Size(); ++i)
        {
            fprintf(fid, "%s %d ", m_dictionary->GetWordInfo(i)->word.c_str(), m_word_sense_info->word_sense_cnts_info[i]);
            int emb_row_id;
            real emb_tmp;

            if (m_word_sense_info->word_sense_cnts_info[i] > 1)
            {
                CopyMemory(sense_priors_ptr, GetRow<T>(kWordSensePriorTableId, m_word_sense_info->p_wordidx2sense_idx[i]), m_option->sense_num_multi);
                if (!m_option->store_multinomial)
                    Util::SoftMax(sense_priors_ptr, sense_priors_ptr, m_option->sense_num_multi);

                for (int j = 0; j < m_option->sense_num_multi; ++j)
                {
                    real prior_tmp = static_cast<real>(sense_priors_ptr[j]); //Cast to real so the on-disk format is independent of T
                    fwrite(&prior_tmp, sizeof(real), 1, fid);
                    emb_row_id = m_word_sense_info->p_input_embedding[i] + j;
                    multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);
                    for (int k = 0; k < m_option->embeding_size; ++k)
                    {
                        emb_tmp = embedding.At(k);
                        fwrite(&emb_tmp, sizeof(real), 1, fid);
                    }
                }
                fprintf(fid, "\n");
            }
            else
            {
                real prob = 1.0;
                fwrite(&prob, sizeof(real), 1, fid);
                emb_row_id = m_word_sense_info->p_input_embedding[i];
                multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);

                for (int k = 0; k < m_option->embeding_size; ++k)
                {
                    emb_tmp = embedding.At(k);
                    fwrite(&emb_tmp, sizeof(real), 1, fid);
                }
                fprintf(fid, "\n");
            }
        }

        fclose(fid);
    }
    if (m_option->output_binary % 2 == 0)
    {
        sprintf(outfile, "%s%d", m_option->text_embedding_file, epoch_id);

        fid = fopen(outfile, "w");
        fprintf(fid, "%d %d %d\n", m_dictionary->Size(), m_word_sense_info->total_senses_cnt, m_option->embeding_size);
        for (int i = 0; i < m_dictionary->Size(); ++i)
        {
            fprintf(fid, "%s %d\n", m_dictionary->GetWordInfo(i)->word.c_str(), m_word_sense_info->word_sense_cnts_info[i]);

            int emb_row_id;
            real emb_tmp;

            if (m_word_sense_info->word_sense_cnts_info[i] > 1)
            {
                CopyMemory(sense_priors_ptr, GetRow<T>(kWordSensePriorTableId, m_word_sense_info->p_wordidx2sense_idx[i]), m_option->sense_num_multi);

                if (!m_option->store_multinomial)
                    Util::SoftMax(sense_priors_ptr, sense_priors_ptr, m_option->sense_num_multi);

                for (int j = 0; j < m_option->sense_num_multi; ++j)
                {
                    fprintf(fid, "%.4f", sense_priors_ptr[j]);

                    emb_row_id = m_word_sense_info->p_input_embedding[i] + j;
                    multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);
                    for (int k = 0; k < m_option->embeding_size; ++k)
                    {
                        emb_tmp = embedding.At(k);
                        fprintf(fid, " %.3f", emb_tmp);
                    }
                    fprintf(fid, "\n");
                }
            }
            else
            {
                real prob = 1.0;
                fprintf(fid, "%.4f", prob);

                emb_row_id = m_word_sense_info->p_input_embedding[i];
                multiverso::Row<real>& embedding = GetRow<real>(kInputEmbeddingTableId, emb_row_id);
                for (int k = 0; k < m_option->embeding_size; ++k)
                {
                    emb_tmp = embedding.At(k);
                    fprintf(fid, " %.3f", emb_tmp);
                }
                fprintf(fid, "\n");
            }
        }

        fclose(fid);
    }
    free(sense_priors_ptr);
}

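/*
 * On-disk format produced above (as written by the code; word2vec-style
 * mixed text/binary for the binary variant):
 *
 *   header line:  "<vocab_size> <total_senses> <embedding_size>\n"
 *   per word:     "<word> <sense_cnt>" followed, for each sense, by the
 *                 sense prior and then embedding_size values -- raw floats
 *                 in the binary file, "%.4f"/"%.3f" text in the text file.
 */
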
template<typename T>
void Trainer<T>::SaveOutputEmbedding(const int epoch_id)
{
    char outfile[2000];
    if (m_option->output_binary)
    {
        sprintf(outfile, "%s%d", m_option->outputlayer_binary_file, epoch_id);

        FILE* fid = fopen(outfile, "wb");
        fprintf(fid, "%d %d\n", m_dictionary->Size(), m_option->embeding_size);
        for (int i = 0; i < m_dictionary->Size(); ++i)
        {
            multiverso::Row<real>& hs_embedding = GetRow<real>(kEmbeddingOutputTableId, i);
            for (int j = 0; j < m_option->embeding_size; ++j)
            {
                real emb_tmp = hs_embedding.At(j);
                fwrite(&emb_tmp, sizeof(real), 1, fid);
            }
        }
        fclose(fid);
    }
    if (m_option->output_binary % 2 == 0)
    {
        sprintf(outfile, "%s%d", m_option->outputlayer_text_file, epoch_id);

        FILE* fid = fopen(outfile, "w");
        fprintf(fid, "%d %d\n", m_dictionary->Size(), m_option->embeding_size);
        for (int i = 0; i < m_dictionary->Size(); ++i)
        {
            multiverso::Row<real>& hs_embedding = GetRow<real>(kEmbeddingOutputTableId, i);

            for (int j = 0; j < m_option->embeding_size; ++j)
                fprintf(fid, "%.2f ", hs_embedding.At(j));
            fprintf(fid, "\n");
        }
        fclose(fid);
    }
}

template<typename T>
void Trainer<T>::SaveHuffEncoder()
{
    FILE* fid = fopen(m_option->huff_tree_file, "w");
    fprintf(fid, "%d\n", m_dictionary->Size());
    for (int i = 0; i < m_dictionary->Size(); ++i)
    {
        fprintf(fid, "%s", m_dictionary->GetWordInfo(i)->word.c_str());
        const auto info = m_huffman_encoder->GetLabelInfo(i);
        fprintf(fid, " %d", info->codelen);
        for (int j = 0; j < info->codelen; ++j)
            fprintf(fid, " %d", info->code[j]);
        for (int j = 0; j < info->codelen; ++j)
            fprintf(fid, " %d", info->point[j]);
        fprintf(fid, "\n");
    }
    fclose(fid);
}

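/*
 * Example line in the resulting huff_tree_file (hypothetical values): a word
 * "bank" with codelen 3, code bits 0 1 1 and inner-node path 42 17 5 is
 * written as
 *
 *   bank 3 0 1 1 42 17 5
 *
 * after a first line holding the vocabulary size.
 */
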
template class Trainer<float>;
template class Trainer<double>;

@ -0,0 +1,83 @@
#pragma once

#include <thread>
#include <chrono>
#include <multiverso.h>
#include <log.h>
#include <barrier.h>

#include "DataBlock.h"
#include "MultiversoTablesId.h"
#include "Util.h"
#include "HuffmanEncoder.h"
#include "SkipGramMixtureNeuralNetwork.h"


template<typename T>
class Trainer : public multiverso::TrainerBase
{
public:
    Trainer(int trainer_id, Option *option, void** word2vector_neural_networks, multiverso::Barrier* barrier, Dictionary* dictionary, WordSenseInfo* word_sense_info, HuffmanEncoder* huff_encoder);

    /*!
     * \brief Train one data block
     */
    void TrainIteration(multiverso::DataBlockBase* data_block) override;

private:
    int m_process_id;
    int m_trainer_id;
    int m_train_count; //Number of data blocks this trainer has processed
    int m_process_count; //Number of machines

    Option *m_option;
    WordSenseInfo* m_word_sense_info;
    HuffmanEncoder* m_huffman_encoder;

    int64_t m_word_count, m_last_word_count;

    T *gamma, *fTable, *input_backup; //Temp memory to store intermediate results of the EM algorithm

    clock_t m_start_time, m_now_time, m_executive_time;
    void** m_sgmixture_neural_networks;
    multiverso::Barrier *m_barrier;
    Dictionary* m_dictionary;
    FILE* m_log_file;

    /*!
     * \brief Save the multi-sense input-embedding vectors
     * \param epoch_id the embedding vectors after epoch epoch_id are dumped
     */
    void SaveMultiInputEmbedding(const int epoch_id);

    /*!
     * \brief Save the output embedding vectors, i.e. the embeddings of the Huffman tree nodes
     * \param epoch_id the embedding vectors after epoch epoch_id are dumped
     */
    void SaveOutputEmbedding(const int epoch_id);

    /*!
     * \brief Save the Huffman tree structure
     */
    void SaveHuffEncoder();

    /*!
     * \brief Copy the needed parameters from the buffer to local blocks
     */
    void CopyMemory(T* dest, multiverso::Row<T>& source, int size);
    int CopyParameterFromMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* word2vector_neural_networks);

    /*!
     * \brief Add the deltas to the parameters stored in the
     * buffer and send them to multiverso
     */
    int AddParameterToMultiverso(std::vector<int>& input_layer_nodes, std::vector<int>& output_layer_nodes, void* word2vector_neural_networks);

    /*!
     * \brief Add the delta of a row of local parameters to the parameters stored in the
     * buffer and send it to multiverso
     * \param momentum new_value = old_value * momentum + current_value * (1 - momentum). Set to non-zero when updating the sense priors
     */
    void AddParameterRowToMultiverso(T* ptr, int table_id, int row_id, int size, real momentum = 0);
};

@ -0,0 +1,177 @@
#include "Util.h"
|
||||
|
||||
Option::Option()
|
||||
{
|
||||
train_file = NULL;
|
||||
read_vocab_file = NULL;
|
||||
binary_embedding_file = NULL;
|
||||
text_embedding_file = NULL;
|
||||
|
||||
sw_file = NULL;
|
||||
output_binary = 2;
|
||||
embeding_size = 0;
|
||||
thread_cnt = 1;
|
||||
window_size = 5;
|
||||
min_count = 5;
|
||||
data_block_size = 100;
|
||||
init_learning_rate = static_cast<real>(0.025);
|
||||
epoch = 1;
|
||||
stopwords = false;
|
||||
total_words = 0;
|
||||
|
||||
//multisense config
|
||||
store_multinomial = false;
|
||||
EM_iteration = 1;
|
||||
top_N = 0;
|
||||
top_ratio = static_cast<real>(0.1);
|
||||
sense_num_multi = 1;
|
||||
init_sense_prior_momentum = static_cast<real>(0.1);
|
||||
sense_file = NULL;
|
||||
huff_tree_file = NULL;
|
||||
outputlayer_binary_file = NULL;
|
||||
outputlayer_text_file = NULL;
|
||||
|
||||
// multiverso config
|
||||
num_servers = 0;
|
||||
num_aggregator = 1;
|
||||
lock_option = 1;
|
||||
num_lock = 100;
|
||||
max_delay = 0;
|
||||
}
|
||||
|
||||
void Option::ParseArgs(int argc, char* argv[])
{
    for (int i = 1; i < argc; i += 2)
    {
        if (strcmp(argv[i], "-size") == 0) embeding_size = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-train_file") == 0) train_file = argv[i + 1];
        if (strcmp(argv[i], "-vocab_file") == 0) read_vocab_file = argv[i + 1];
        if (strcmp(argv[i], "-binary") == 0) output_binary = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-init_learning_rate") == 0) init_learning_rate = static_cast<real>(atof(argv[i + 1]));
        if (strcmp(argv[i], "-binary_embedding_file") == 0) binary_embedding_file = argv[i + 1];
        if (strcmp(argv[i], "-text_embedding_file") == 0) text_embedding_file = argv[i + 1];
        if (strcmp(argv[i], "-window") == 0) window_size = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-data_block_size") == 0) data_block_size = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-threads") == 0) thread_cnt = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-min_count") == 0) min_count = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-epoch") == 0) epoch = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-stopwords") == 0) stopwords = atoi(argv[i + 1]) != 0;
        if (strcmp(argv[i], "-sw_file") == 0) sw_file = argv[i + 1];
        if (strcmp(argv[i], "-num_servers") == 0) num_servers = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-num_aggregator") == 0) num_aggregator = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-lock_option") == 0) lock_option = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-num_lock") == 0) num_lock = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-max_delay") == 0) max_delay = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-max_preload_size") == 0) max_preload_blocks_cnt = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-is_pipline") == 0) pipline = atoi(argv[i + 1]) != 0;

        if (strcmp(argv[i], "-sense_num_multi") == 0) sense_num_multi = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-momentum") == 0) init_sense_prior_momentum = static_cast<real>(atof(argv[i + 1]));
        if (strcmp(argv[i], "-EM_iteration") == 0) EM_iteration = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-store_multinomial") == 0) store_multinomial = atoi(argv[i + 1]) != 0;
        if (strcmp(argv[i], "-top_n") == 0) top_N = atoi(argv[i + 1]);
        if (strcmp(argv[i], "-top_ratio") == 0) top_ratio = static_cast<real>(atof(argv[i + 1]));
        if (strcmp(argv[i], "-read_sense") == 0) sense_file = argv[i + 1];
        if (strcmp(argv[i], "-huff_tree_file") == 0) huff_tree_file = argv[i + 1];
        if (strcmp(argv[i], "-outputlayer_binary_file") == 0) outputlayer_binary_file = argv[i + 1];
        if (strcmp(argv[i], "-outputlayer_text_file") == 0) outputlayer_text_file = argv[i + 1];
    }
}

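/*
 * Example invocation (hypothetical paths and binary name; every flag below
 * is parsed above; flags come in (name, value) pairs, hence the i += 2
 * stride):
 *
 *   ./multisense_word_embedding -train_file corpus.txt -vocab_file vocab.txt \
 *       -size 50 -threads 4 -epoch 2 -binary 2 \
 *       -binary_embedding_file emb.bin -text_embedding_file emb.txt \
 *       -outputlayer_binary_file out.bin -outputlayer_text_file out.txt \
 *       -huff_tree_file huff.txt -sense_num_multi 3 -top_n 500 -momentum 0.1
 */
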
void Option::PrintArgs()
{
    printf("train_file: %s\n", train_file);
    printf("read_vocab_file: %s\n", read_vocab_file);
    printf("binary_embedding_file: %s\n", binary_embedding_file);
    printf("sw_file: %s\n", sw_file);
    printf("output_binary: %d\n", output_binary);
    printf("stopwords: %d\n", stopwords);
    printf("embeding_size: %d\n", embeding_size);
    printf("thread_cnt: %d\n", thread_cnt);
    printf("window_size: %d\n", window_size);
    printf("min_count: %d\n", min_count);
    printf("epoch: %d\n", epoch);
    printf("total_words: %lld\n", total_words);
    printf("init_learning_rate: %lf\n", init_learning_rate);
    printf("data_block_size: %d\n", data_block_size);
    printf("pre_load_data_blocks: %lld\n", max_preload_blocks_cnt); //int64_t, so %lld rather than %d
    printf("num_servers: %d\n", num_servers);
    printf("num_aggregator: %d\n", num_aggregator);
    printf("lock_option: %d\n", lock_option);
    printf("num_lock: %d\n", num_lock);
    printf("max_delay: %d\n", max_delay);
    printf("is_pipline: %d\n", pipline);
    printf("top_ratio: %lf\n", top_ratio);
    printf("top_N: %d\n", top_N);
    printf("store_multinomial: %d\n", store_multinomial);
}

//Check whether the user-defined arguments are valid
bool Option::CheckArgs()
{
    if (!Util::IsFileExist(train_file))
    {
        printf("Train corpus does not exist\n");
        return false;
    }

    if (!Util::IsFileExist(read_vocab_file))
    {
        printf("Vocab file does not exist\n");
        return false;
    }

    if (output_binary && (binary_embedding_file == NULL || outputlayer_binary_file == NULL))
    {
        printf("Binary output file name not specified\n");
        return false;
    }

    if (output_binary % 2 == 0 && (text_embedding_file == NULL || outputlayer_text_file == NULL))
    {
        printf("Text output file name not specified\n");
        return false;
    }

    if (huff_tree_file == NULL)
    {
        printf("Huffman tree file name not specified\n");
        return false;
    }

    if (stopwords && !Util::IsFileExist(sw_file))
    {
        printf("Stop words file does not exist\n");
        return false;
    }

    if (init_sense_prior_momentum < -eps || init_sense_prior_momentum >= 1)
    {
        printf("Init momentum %.4f out of range, must lie between 0.0 and 1.0\n", init_sense_prior_momentum);
        return false;
    }

    if (top_ratio < -eps || top_ratio >= 1)
    {
        printf("Top ratio %.4f out of range, must lie between 0.0 and 1.0\n", top_ratio);
        return false;
    }

    if (sense_num_multi > MAX_SENSE_CNT)
    {
        printf("Sense number is too big, the maximum value is %d\n", MAX_SENSE_CNT);
        return false;
    }

    if (fabs(static_cast<real>(max_delay)) > eps)
    {
        printf("Warning: better set max_delay to 0!\n");
    }

    return true;
}

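/*
 * ValidF below looks odd but is a deliberate NaN check (our reading): for any
 * ordinary value exactly one of f < 1 and f >= 1 holds, so the function
 * returns true; for NaN every comparison is false, so it returns false.
 * Equivalent to !std::isnan(f) without relying on <cmath> helpers.
 */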
bool Util::ValidF(const real &f)
{
    return f < 1 || f >= 1;
}

@ -0,0 +1,114 @@
#pragma once

#include <fstream>
#include <cstdlib>
#include <cstring>
#include <cmath>
#include <vector>
#include <random>
#include <cassert>
#include <exception>
#include <algorithm>
#include <unordered_map>
#include <cstdint>

typedef float real;

#define MAX_STRING 100
#define MAX_SENTENCE_LENGTH 2000
#define MAX_EXP 6
#define MAX_SENSE_CNT 50
#define MIN_LOG -15

const int table_size = (int)1e8;
const real eps = (real)1e-8;

struct WordSenseInfo
{
    std::vector<int> p_input_embedding; //Maps a word to its base row index in the table kInputEmbeddingTableId
    std::unordered_map<int, int> p_wordidx2sense_idx; //Maps a word's index to its row index in the table kWordSensePriorTableId

    std::vector<int> word_sense_cnts_info; //Records every word's sense count
    int total_senses_cnt;
    int multi_senses_words_cnt; //Total number of words with multiple senses
};

struct Option
{
    const char* train_file;
    const char* read_vocab_file;
    const char* binary_embedding_file;
    const char* text_embedding_file;
    const char* sw_file;
    int output_binary, stopwords;
    int data_block_size;
    int embeding_size, thread_cnt, window_size, min_count, epoch;
    int64_t total_words;
    real init_learning_rate;
    int num_servers, num_aggregator, lock_option, num_lock, max_delay;
    bool pipline;
    int64_t max_preload_blocks_cnt;

    /*Multi-sense config*/
    int EM_iteration;
    int top_N; //The top_N most frequent words have multiple senses, e.g. 500, 1000, ...
    real top_ratio; //The top_ratio most frequent words have multiple senses, e.g. 0.05, 0.1, ...
    int sense_num_multi; //Default number of senses for the multi-sense words
    real init_sense_prior_momentum; //Initial momentum; the momentum is used in updating the sense priors
    bool store_multinomial; //Use the multinomial parameters directly. If set to false, use the log of the multinomial instead
    const char* sense_file; //The sense file storing the (word, #senses) mapping
    const char* huff_tree_file; //The output file storing the Huffman tree structure
    const char* outputlayer_binary_file; //The output binary file storing all the output embeddings (i.e. the Huffman node embeddings)
    const char* outputlayer_text_file; //The output text file storing all the output embeddings (i.e. the Huffman node embeddings)

    Option();
    void ParseArgs(int argc, char* argv[]);
    void PrintArgs();
    bool CheckArgs();
};


class Util
{
public:
    static void SaveVocab();

    template<typename T>
    static T InnerProduct(T* x, T* y, int length)
    {
        T result = 0;
        for (int i = 0; i < length; ++i)
            result += x[i] * y[i];
        return result;
    }

    static bool ValidF(const real &f);

    template <typename T>
    static T Sigmoid(T f)
    {
        if (f < -MAX_EXP)
            return 0;
        if (f > MAX_EXP)
            return 1;
        return 1 / (1 + exp(-f));
    }

    template <typename T>
    static void SoftMax(T* s, T* result, int size)
    {
        T sum = 0, max_v = s[0];
        for (int j = 1; j < size; ++j)
            max_v = std::max(max_v, s[j]);
        for (int j = 0; j < size; ++j)
            sum += exp(s[j] - max_v);
        for (int j = 0; j < size; ++j)
            result[j] = exp(s[j] - max_v) / sum;
    }
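    /*
     * Note on the max-subtraction in SoftMax above (standard trick, our
     * comment): softmax(s) is invariant under shifting every s[j] by the
     * same constant, and subtracting the maximum keeps every exponent <= 0,
     * so exp() cannot overflow even when the inputs are large log
     * likelihoods. SoftMax is also safe to call in place (s == result),
     * which the model dump code relies on.
     */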
    static bool IsFileExist(const char *fileName)
    {
        std::ifstream infile(fileName);
        return infile.good();
    }
};